Commit e21223b4 authored by davidkep

adjust linearized ADMM algorithms to optimize with factor 1/n

parent ac5d2f87
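
A sketch of the rescaling implied by the hunks below (inferred from the diff, not text from the commit): the elastic-net objective now carries a 1/n factor on the residual sum of squares, so the penalty level is no longer pre-scaled by the number of observations. In LaTeX:

    \min_\beta \tfrac{1}{2}\lVert y - X\beta\rVert_2^2
      + n\lambda\bigl(\alpha\lVert\beta\rVert_1 + \tfrac{1-\alpha}{2}\lVert\beta\rVert_2^2\bigr)
    \quad\longrightarrow\quad
    \min_\beta \tfrac{1}{2n}\lVert y - X\beta\rVert_2^2
      + \lambda\bigl(\alpha\lVert\beta\rVert_1 + \tfrac{1-\alpha}{2}\lVert\beta\rVert_2^2\bigr)

Both problems have the same minimizer; the change only moves the factor n between the loss and the penalty, which is why the constants below pick up or drop a factor of n.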
@@ -67,7 +67,7 @@ struct DataCache {
inline bool AnyViolateKKT(const arma::mat& x, const arma::vec& residuals, const double lambda,
const EnPenalty& penalty) {
// const double lambda_1 = residuals.n_elem * penalty.alpha() * penalty.lambda();
const double lambda_1 = penalty.alpha() * lambda;
const double lambda_1 = penalty.alpha() * lambda * residuals.n_elem;
for (arma::uword j = 0; j < x.n_cols; ++j) {
// const double cutoff = lambda1 * penalty.loadings()[j];
const double inner = arma::dot(x.col(j), residuals);
@@ -87,7 +87,7 @@ inline bool AnyViolateKKT(const arma::mat& x, const arma::vec& residuals, const
inline bool AnyViolateKKT(const arma::mat& x, const arma::vec& residuals, const double lambda,
const AdaptiveEnPenalty& penalty) {
// const double lambda_1 = residuals.n_elem * penalty.alpha() * penalty.lambda();
const double lambda_1 = penalty.alpha() * lambda;
const double lambda_1 = penalty.alpha() * lambda * residuals.n_elem;
for (arma::uword j = 0; j < x.n_cols; ++j) {
const double cutoff = lambda_1 * penalty.loadings()[j];
const double inner = arma::dot(x.col(j), residuals);
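
For context, a sketch of why the threshold gains the factor residuals.n_elem (inferred, not part of the commit; ℓ_j is the penalty loading, identically 1 for the plain EnPenalty): under the 1/n-scaled loss, the KKT condition for the j-th coefficient to stay at zero reads

    \frac{1}{n}\bigl|x_j^\top r\bigr| \le \alpha\,\lambda\,\ell_j
    \quad\Longleftrightarrow\quad
    \bigl|x_j^\top r\bigr| \le n\,\alpha\,\lambda\,\ell_j

so the comparison against the inner product x_j' r uses lambda_1 = alpha * lambda * n, with the callers now passing the raw penalty level (see the call sites further down).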
@@ -209,12 +209,12 @@ class LsProximalOperator {
const double n_obs_dbl = static_cast<double>(data.n_obs());
if (data.n_obs() < data.n_pred()) {
const double expo = std::max(0.5, n_obs_dbl / data.n_pred());
return std::min((data.n_pred() * std::pow(norm_x, expo)) / n_obs_dbl,
norm_x / std::sqrt(data.n_pred() * penalty.lambda()));
return std::min((data.n_pred() * std::pow(norm_x, expo)),
norm_x * n_obs_dbl / std::sqrt(data.n_pred() * penalty.lambda()));
} else {
const double expo = std::max(0.5, data.n_pred() / n_obs_dbl);
return std::min(std::max(1., (data.n_pred() * std::pow(norm_x, expo)) / n_obs_dbl),
norm_x / std::sqrt(n_obs_dbl * penalty.lambda()));
return std::min(std::max(1., (data.n_pred() * std::pow(norm_x, expo))),
norm_x * n_obs_dbl / std::sqrt(n_obs_dbl * penalty.lambda()));
}
}
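
A self-contained sketch of the rescaled heuristic above (function and argument names are illustrative, mirroring the members used in the hunk; both candidates are the pre-change expressions multiplied by the number of observations):

    #include <algorithm>
    #include <cmath>

    // Step-size heuristic after the 1/n rescaling (illustrative sketch).
    double StepSizeHeuristic(const double n_obs, const double n_pred,
                             const double norm_x, const double lambda) {
      if (n_obs < n_pred) {
        const double expo = std::max(0.5, n_obs / n_pred);
        return std::min(n_pred * std::pow(norm_x, expo),
                        norm_x * n_obs / std::sqrt(n_pred * lambda));
      }
      const double expo = std::max(0.5, n_pred / n_obs);
      return std::min(std::max(1., n_pred * std::pow(norm_x, expo)),
                      norm_x * n_obs / std::sqrt(n_obs * lambda));
    }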
@@ -289,11 +289,12 @@ class WeightedLsProximalOperator {
//! @param metrics optional metrics object to collect metrics of the proximal operator
inline arma::vec operator()(const arma::vec& u, const double intercept, const double lambda,
nsoptim::Metrics * const = nullptr) const {
// TODO: Adapt for (1/n) factor!
const auto n = loss_->data().n_obs();
if (loss_->IncludeIntercept()) {
return (u + lambda * (loss_->data().cy() - intercept)) / (1 + lambda);
return (n * u + lambda * weights_ % (loss_->data().cy() - intercept)) /
(n + lambda * weights_);
} else {
return (u + lambda * loss_->data().cy()) / (1 + lambda);
return (n * u + lambda * weights_ % loss_->data().cy()) / (n + lambda * weights_);
}
}
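
For reference, the new update is the elementwise minimizer of the lambda-scaled proximal problem for the 1/n weighted LS loss (a derivation sketch, not part of the commit; μ denotes the intercept, 0 when absent):

    v_i \;=\; \arg\min_t\; \frac{w_i}{2n}\,(y_i - \mu - t)^2 + \frac{1}{2\lambda}\,(t - u_i)^2
        \;=\; \frac{n\,u_i + \lambda\,w_i\,(y_i - \mu)}{n + \lambda\,w_i}

Setting w ≡ 1 recovers the unweighted form (n u + λ(y − μ)) / (n + λ).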
@@ -312,12 +313,12 @@ class WeightedLsProximalOperator {
const double n_obs_weighted = data.n_obs() * loss_->mean_weight();
if (n_obs_weighted < data.n_pred()) {
const double expo = std::max(0.5, n_obs_weighted / data.n_pred());
return std::min((data.n_pred() * std::pow(norm_x, expo)) / n_obs_weighted,
norm_x / std::sqrt(data.n_pred() * penalty.lambda()));
return std::min(data.n_pred() * std::pow(norm_x, expo) / loss_->mean_weight(),
norm_x * data.n_obs() / std::sqrt(data.n_pred() * penalty.lambda()));
} else {
const double expo = std::max(0.5, data.n_pred() / n_obs_weighted);
return std::min(std::max(1., (data.n_pred() * std::pow(norm_x, expo)) / n_obs_weighted),
norm_x / std::sqrt(n_obs_weighted * penalty.lambda()));
return std::min(std::max(1., data.n_pred() * std::pow(norm_x, expo)) / loss_->mean_weight(),
norm_x * data.n_obs() / std::sqrt(n_obs_weighted * penalty.lambda()));
}
}
return 1 / config_.tau;
@@ -333,12 +334,12 @@ class WeightedLsProximalOperator {
const double n_obs_weighted = data.n_obs() * loss_->mean_weight();
if (n_obs_weighted < data.n_pred()) {
const double expo = std::max(0.5, n_obs_weighted / data.n_pred());
return std::min((data.n_pred() * std::pow(norm_x, expo)) / n_obs_weighted,
norm_x / std::sqrt(data.n_pred() * penalty.lambda()));
return std::min((data.n_pred() * std::pow(norm_x, expo)),
norm_x * n_obs_weighted / std::sqrt(data.n_pred() * penalty.lambda()));
} else {
const double expo = std::max(0.5, data.n_pred() / n_obs_weighted);
return std::min(std::max(1., (data.n_pred() * std::pow(norm_x, expo)) / n_obs_weighted),
norm_x / std::sqrt(n_obs_weighted * penalty.lambda()));
return std::min(std::max(1., (data.n_pred() * std::pow(norm_x, expo))),
norm_x * n_obs_weighted / std::sqrt(n_obs_weighted * penalty.lambda()));
}
}
return 1 / config_.tau;
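
The two weighted variants above follow the same heuristic with the effective number of observations n * mean_weight in the role of n. A sketch matching the second hunk (illustrative names; note the first hunk instead divides its first candidate by the mean weight):

    #include <algorithm>
    #include <cmath>

    // Weighted step-size heuristic: n_obs * mean_weight replaces n_obs.
    double WeightedStepSizeHeuristic(const double n_obs, const double mean_weight,
                                     const double n_pred, const double norm_x,
                                     const double lambda) {
      const double n_eff = n_obs * mean_weight;
      if (n_eff < n_pred) {
        const double expo = std::max(0.5, n_eff / n_pred);
        return std::min(n_pred * std::pow(norm_x, expo),
                        norm_x * n_eff / std::sqrt(n_pred * lambda));
      }
      const double expo = std::max(0.5, n_pred / n_eff);
      return std::min(std::max(1., n_pred * std::pow(norm_x, expo)),
                      norm_x * n_eff / std::sqrt(n_eff * lambda));
    }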
@@ -611,7 +612,6 @@ class GenericAdmmLinearOptimizer : public Optimizer<typename ProxOp::LossFunctio
metrics->AddDetail("step_size", step_size_);
int iter = 0;
int second_criterion = 0;
State prev_state;
while (iter++ < max_it) {
@@ -633,20 +633,17 @@ class GenericAdmmLinearOptimizer : public Optimizer<typename ProxOp::LossFunctio
const double v_diff = arma::accu(arma::square(state_.v - prev_state.v));
const double l_diff = arma::accu(arma::square(state_.l - prev_state.l));
const double diff_scaling = arma::norm(state_.v, 2) + arma::norm(state_.l, 2);
// This is the "un-standardized" residual, without the division by the squared step-size.
gap = v_diff + l_diff;
gap = (v_diff + l_diff) * diff_scaling * diff_scaling;
iter_metrics.AddDetail("v_diff", v_diff);
iter_metrics.AddDetail("l_diff", l_diff);
iter_metrics.AddDetail("gap", gap);
if ((gap < conv_tol) || second_criterion > max_it) {
if (gap < conv_tol) {
return FinalizeResult(iter, gap, OptimumStatus::kOk, std::move(metrics));
} else if (gap * gap < conv_tol && (l_diff < gap || v_diff < gap)) {
second_criterion += admm_optimizer::kSecondCriterionMultiplier;
} else {
second_criterion = 0;
}
}
return FinalizeResult(iter, gap, OptimumStatus::kWarning, "ADMM-algorithm did not converge.", std::move(metrics));
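
As the new stopping rule reads (a transcription sketch): the squared change in the iterates is scaled by the squared magnitude of the current iterates before the comparison,

    \mathrm{gap} \;=\; \bigl(\lVert v - v_{\mathrm{prev}}\rVert_2^2 + \lVert l - l_{\mathrm{prev}}\rVert_2^2\bigr)
                       \,\bigl(\lVert v\rVert_2 + \lVert l\rVert_2\bigr)^2 \;<\; \mathrm{conv\_tol}

making the criterion relative to the size of v and l rather than absolute.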
@@ -877,7 +874,6 @@ class AdmmLinearOptimizer : public Optimizer<LossFunction, PenaltyFunction, Coef
const PredictorResponseData& data = loss_->data();
const bool include_intercept = loss_->IncludeIntercept();
const double scaled_lambda = data.n_obs() * penalty_->lambda();
const bool check_empty = admm_optimizer::AllZero(coefs_.beta) || (coefs_.beta.n_elem != data.n_pred());
// Check if the coefficients are correct.
@@ -896,7 +892,7 @@ class AdmmLinearOptimizer : public Optimizer<LossFunction, PenaltyFunction, Coef
// This is the convergence tolerance for the "un-standardized" residual.
const double conv_tol = convergence_tolerance_ * penalty_->alpha() * step_size_ * step_size_;
const auto en_cutoff = DetermineCutoff(IsAdaptiveTag{});
const double en_multiplier = 1 / (1 + norm_x_sq_inv_ * step_size_ * scaled_lambda * (1 - penalty_->alpha()));
const double en_multiplier = 1 / (1 + norm_x_sq_inv_ * step_size_ * penalty_->lambda() * (1 - penalty_->alpha()));
double gap = 0;
@@ -905,8 +901,8 @@ class AdmmLinearOptimizer : public Optimizer<LossFunction, PenaltyFunction, Coef
coefs_.intercept = ComputeIntercept(fitted, IsWeightedTag{});
}
if (check_empty && !admm_optimizer::AnyViolateKKT(data.cx(), EmptyModelResiduals(IsWeightedTag{}), scaled_lambda,
*penalty_)) {
if (check_empty && !admm_optimizer::AnyViolateKKT(data.cx(), EmptyModelResiduals(IsWeightedTag{}),
penalty_->lambda(), *penalty_)) {
// None of the predictors will be activated for the current penalty. Return the current coefficient value.
return FinalizeResult(0, 0, OptimumStatus::kOk, std::move(metrics));
}
@@ -951,9 +947,11 @@ class AdmmLinearOptimizer : public Optimizer<LossFunction, PenaltyFunction, Coef
const double v_diff = arma::accu(arma::square(state_.v - prev_state.v));
const double l_diff = arma::accu(arma::square(state_.l - prev_state.l));
const double diff_scaling = arma::norm(state_.v, 2) + arma::norm(state_.l, 2);
// This is the "un-standardized" residual, without the division by the squared step-size.
gap = v_diff + l_diff;
gap = (v_diff + l_diff) * diff_scaling * diff_scaling;
if ((gap < conv_tol) || second_criterion > max_it) {
return FinalizeResult(iter, gap, OptimumStatus::kOk, std::move(metrics));
} else if (gap * gap < conv_tol && (l_diff < gap || v_diff < gap)) {
@@ -1007,25 +1005,28 @@ class AdmmLinearOptimizer : public Optimizer<LossFunction, PenaltyFunction, Coef
//! Determine the cutoff for the soft-threshold function for non-adaptive penalties
double DetermineCutoff(std::false_type) const noexcept {
return loss_->data().n_obs() * penalty_->alpha() * penalty_->lambda() * step_size_ * norm_x_sq_inv_;
return penalty_->alpha() * penalty_->lambda() * step_size_ * norm_x_sq_inv_;
}
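
Together with en_multiplier from the earlier hunk, the cutoff yields the usual elastic-net proximal map: soft-threshold at the l1 cutoff, then shrink for the ridge part. A sketch under that reading (SoftThreshold and EnUpdate are illustrative helpers, not part of this file):

    #include <algorithm>
    #include <cmath>

    // Elementwise soft-threshold: proximal map of cutoff * |b|.
    double SoftThreshold(const double z, const double cutoff) {
      return std::copysign(std::max(0., std::abs(z) - cutoff), z);
    }

    // Elastic-net coefficient update with the constants used above:
    // cutoff        = alpha * lambda * tau / ||x||^2
    // en_multiplier = 1 / (1 + tau * lambda * (1 - alpha) / ||x||^2)
    double EnUpdate(const double z, const double alpha, const double lambda,
                    const double tau, const double norm_x_sq) {
      const double cutoff = alpha * lambda * tau / norm_x_sq;
      const double mult = 1. / (1. + tau * lambda * (1. - alpha) / norm_x_sq);
      return mult * SoftThreshold(z, cutoff);
    }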
//! Apply the proximal operator for the weighted LS loss with intercept
arma::vec ProximalLs(const arma::vec& v, std::true_type) const {
const auto n = loss_->data().n_obs();
if (loss_->IncludeIntercept()) {
return (v + step_size_ * weights_ % (loss_->data().cy() - coefs_.intercept)) /
(1 + step_size_ * weights_);
return (n * v + step_size_ * weights_ % (loss_->data().cy() - coefs_.intercept)) /
(n + step_size_ * weights_);
} else {
return (v + step_size_ * weights_ % loss_->data().cy()) / (1 + step_size_ * weights_);
return (n * v + step_size_ * weights_ % loss_->data().cy()) / (n + step_size_ * weights_);
}
}
//! Apply the proximal operator for the LS loss with intercept.
arma::vec ProximalLs(const arma::vec& v, std::false_type) const {
const auto n = loss_->data().n_obs();
const double mult_fact = 1 / (n + step_size_);
if (loss_->IncludeIntercept()) {
return (v + step_size_ * (loss_->data().cy() - coefs_.intercept)) / (1 + step_size_);
return mult_fact * n * v + mult_fact * step_size_ * (loss_->data().cy() - coefs_.intercept);
} else {
return (v + step_size_ * loss_->data().cy()) / (1 + step_size_);
return mult_fact * n * v + mult_fact * step_size_ * loss_->data().cy();
}
}
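
Same derivation as for the weighted proximal operator, with all weights equal to one (a sketch, not part of the commit):

    \operatorname{prox}(v)_i \;=\; \frac{n\,v_i + \tau\,(y_i - \mu)}{n + \tau}

The code evaluates this with the common factor mult_fact = 1/(n + τ) pulled out.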
@@ -1051,21 +1052,16 @@ class AdmmLinearOptimizer : public Optimizer<LossFunction, PenaltyFunction, Coef
//! Update the step size based on the predictor matrix and the penalty level.
void UpdateStepSize(std::false_type) noexcept {
if (config_.tau < 0) {
step_size_ = std::sqrt(norm_x_);
const PredictorResponseData& data = loss_->data();
const double n_obs_dbl = static_cast<double>(data.n_obs());
if (data.n_obs() < data.n_pred()) {
const double expo = std::max(0.5, n_obs_dbl / data.n_pred());
step_size_ = std::min((data.n_pred() * std::pow(norm_x_, expo)) / n_obs_dbl,
norm_x_ / std::sqrt(data.n_pred() * penalty_->lambda()));
// step_size_ = std::max(n_obs_dbl / (data.n_pred() * std::pow(norm_x_, expo)),
// std::sqrt(data.n_pred() * penalty_->lambda() / norm_x_));
step_size_ = std::min((data.n_pred() * std::pow(norm_x_, expo)),
norm_x_ * n_obs_dbl / std::sqrt(data.n_pred() * penalty_->lambda()));
} else {
const double expo = std::max(0.5, data.n_pred() / n_obs_dbl);
step_size_ = std::min(std::max(1., (data.n_pred() * std::pow(norm_x_, expo)) / n_obs_dbl),
norm_x_ / std::sqrt(n_obs_dbl * penalty_->lambda()));
// step_size_ = std::max(std::min(1., n_obs_dbl / (data.n_pred() * std::pow(norm_x_, expo))),
// std::sqrt(n_obs_dbl * penalty_->lambda() / norm_x_));
step_size_ = std::min(std::max(1., (data.n_pred() * std::pow(norm_x_, expo))),
norm_x_ * n_obs_dbl / std::sqrt(n_obs_dbl * penalty_->lambda()));
}
} else {
step_size_ = 1 / config_.tau;
@@ -1075,21 +1071,16 @@ class AdmmLinearOptimizer : public Optimizer<LossFunction, PenaltyFunction, Coef
//! Update the step size based on the predictor matrix and the penalty level.
void UpdateStepSize(std::true_type) noexcept {
if (config_.tau < 0) {
step_size_ = std::sqrt(norm_x_);
const PredictorResponseData& data = loss_->data();
const double n_obs_weighted = data.n_obs() * loss_->mean_weight();
if (n_obs_weighted < data.n_pred()) {
const double expo = std::max(0.5, n_obs_weighted / data.n_pred());
step_size_ = std::min((data.n_pred() * std::pow(norm_x_, expo)) / n_obs_weighted,
norm_x_ / std::sqrt(data.n_pred() * penalty_->lambda()));
// step_size_ = std::max(n_obs_weighted / (data.n_pred() * std::pow(norm_x_, expo)),
// std::sqrt(data.n_pred() * penalty_->lambda() / norm_x_));
step_size_ = std::min(data.n_pred() * std::pow(norm_x_, expo) / loss_->mean_weight(),
norm_x_ * data.n_obs() / std::sqrt(data.n_pred() * penalty_->lambda()));
} else {
const double expo = std::max(0.5, data.n_pred() / n_obs_weighted);
step_size_ = std::min(std::max(1., (data.n_pred() * std::pow(norm_x_, expo)) / n_obs_weighted),
norm_x_ / std::sqrt(n_obs_weighted * penalty_->lambda()));
// step_size_ = 1 / std::max(std::min(1., n_obs_weighted / (data.n_pred() * std::pow(norm_x_, expo))),
// std::sqrt(n_obs_weighted * penalty_->lambda() / norm_x_));
step_size_ = std::min(std::max(1., (data.n_pred() * std::pow(norm_x_, expo))) / loss_->mean_weight(),
norm_x_ * data.n_obs() / std::sqrt(n_obs_weighted * penalty_->lambda()));
}
} else {
step_size_ = 1 / config_.tau;
@@ -1311,10 +1302,12 @@ class AdmmVarStepOptimizer : public Optimizer<LossFunction, PenaltyFunction, Coe
coefs_.intercept = 0;
}
if (include_intercept) {
EmptyModelIntercept(IsWeightedTag{});
}
// Check if any of the predictors might be active.
if (check_empty && !admm_optimizer::AnyViolateKKT(data_->cx(), EmptyModelResiduals(IsWeightedTag{}),
scaled_lambda, *penalty_)) {
scaled_lambda / data_->n_obs(), *penalty_)) {
// None of the predictors will be activated for the current penalty. Return the current coefficient value.
return FinalizeResult(0, OptimumStatus::kOk, std::move(metrics));
}
@@ -1376,10 +1369,11 @@ class AdmmVarStepOptimizer : public Optimizer<LossFunction, PenaltyFunction, Coe
UpdateStateL(IsSparseTag{});
const double diff_scaling = arma::norm(state_.v, 2) + state_.tau * arma::norm(state_.l, 2);
state_.gap = arma::accu(state_.tau * state_.tau * arma::square(state_.v - prev_state.v) +
arma::square(state_.l - prev_state.l)) + intercept_change;
if (state_.gap < conv_tol) {
if (state_.gap * diff_scaling * diff_scaling < conv_tol) {
return FinalizeResult(iter, OptimumStatus::kOk, std::move(metrics));
}
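
In the variable-step optimizer the τ-weighting stays inside the raw gap, and the magnitude scaling moves into the comparison (a transcription sketch; Δ_intercept is the intercept_change term):

    \mathrm{gap} \;=\; \tau^2\lVert v - v_{\mathrm{prev}}\rVert_2^2
                       + \lVert l - l_{\mathrm{prev}}\rVert_2^2 + \Delta_{\mathrm{intercept}},
    \qquad
    \mathrm{gap}\cdot\bigl(\lVert v\rVert_2 + \tau\lVert l\rVert_2\bigr)^2 \;<\; \mathrm{conv\_tol}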