diff --git a/inst/include/nsoptim/optimizer/admm.hpp b/inst/include/nsoptim/optimizer/admm.hpp
index 3b43fad24a81118b8b177df6e4a270bbcd9ea22d..bba58eadb6c588ea09e1083f04eeb8a083d4afa3 100644
--- a/inst/include/nsoptim/optimizer/admm.hpp
+++ b/inst/include/nsoptim/optimizer/admm.hpp
@@ -26,15 +26,15 @@ namespace nsoptim {
-//! Configuration options for the ADMM algorithm.
+//! Configuration options for the variable-stepsize ADMM algorithm.
 //!
 //! The members have the following meaning:
 //! max_it ... maximum number of iterations allowed.
-//! tau ... the step size. If negative (the default), use the square L_2 norm of `x` (for linearized ADMM).
+//! tau ... the step size. If negative (the default), use the square L_2 norm of `x`.
 //! tau_lower_mult ... lower bound for the step size, defined as a multiple of `tau`.
 //! tau_adjustment_lower ... lower bound of the step-size adjustment factor.
 //! tau_adjustment_upper ... upper bound of the step-size adjustment factor.
-struct AdmmConfiguration {
+struct AdmmVarStepConfiguration {
   int max_it;
   double tau;
   double tau_lower_mult;
@@ -42,9 +42,20 @@ struct AdmmConfiguration {
   double tau_adjustment_upper;
 };
 
+//! Configuration options for the linearized ADMM algorithm.
+//!
+//! The members have the following meaning:
+//! max_it ... maximum number of iterations allowed.
+struct AdmmLinearConfiguration {
+  int max_it;
+};
+
 namespace admm_optimizer {
-//! Default configuration for ADMM-type algorithms
-constexpr AdmmConfiguration kDefaultAdmmConfiguration = {1000, -1, 0.01, 0.98, 0.999};
+//! Default configuration for the variable-stepsize ADMM algorithm
+constexpr AdmmVarStepConfiguration kDefaultVarStepConfig { 1000, -1, 0.01, 0.98, 0.999 };
+//! Default configuration for the linearized ADMM algorithm
+constexpr AdmmLinearConfiguration kDefaultLinConfig { 1000 };
+
 //! How often does the secondary convergence criterion (relative to the maximum number of iterations) need to be
 //! fulfulled to stop the linearized ADMM early.
 constexpr int kSecondCriterionMultiplier = 10;
@@ -58,6 +69,16 @@ struct DataCache {
   arma::mat chol_xtx;
 };
 
+//! Check whether any of the predictors in `x` violates the KKT conditions for an EN-type problem.
+//! A Ridge penalty never excludes a predictor from the model, so this overload always reports a violation.
+//!
+//! @param x matrix of predictor values.
+//! @param residuals vector of residuals.
+//! @param lambda the lambda value to use (overrides the lambda in the EN penalty).
+//! @param penalty elastic net penalty object.
+inline bool AnyViolateKKT(const arma::mat&, const arma::vec&, const double, const RidgePenalty&) {
+  return true;
+}
+
 //! Check whether any of the predictors in `x` violates the KKT conditions for a EN-type problem.
 //!
 //! @param x matrix of predictor values.
@@ -67,7 +88,7 @@ struct DataCache {
 inline bool AnyViolateKKT(const arma::mat& x, const arma::vec& residuals, const double lambda,
                           const EnPenalty& penalty) {
   // const double lambda_1 = residuals.n_elem * penalty.alpha() * penalty.lambda();
-  const double lambda_1 = penalty.alpha() * lambda;
+  const double lambda_1 = penalty.alpha() * lambda * residuals.n_elem;
   for (arma::uword j = 0; j < x.n_cols; ++j) {
     // const double cutoff = lambda1 * penalty.loadings()[j];
     const double inner = arma::dot(x.col(j), residuals);
@@ -87,7 +108,7 @@ inline bool AnyViolateKKT(const arma::mat& x, const arma::vec& residuals, const double lambda,
                           const AdaptiveEnPenalty& penalty) {
   // const double lambda_1 = residuals.n_elem * penalty.alpha() * penalty.lambda();
-  const double lambda_1 = penalty.alpha() * lambda;
+  const double lambda_1 = penalty.alpha() * lambda * residuals.n_elem;
   for (arma::uword j = 0; j < x.n_cols; ++j) {
     const double cutoff = lambda_1 * penalty.loadings()[j];
     const double inner = arma::dot(x.col(j), residuals);
@@ -143,46 +164,395 @@ inline bool SolveChol(const arma::mat& chol, arma::vec * const b) {
   return true;
 }
+}  // namespace admm_optimizer
+
+//! Proximal operator for the unweighted LS loss.
+class LsProximalOperator {
+ public:
+  using LossFunction = LsLoss;
+
+  //! Initialize the proximal operator with fixed step size `1 / tau`. If `tau` is negative (the default),
+  //! the step size is determined from the data via `StepSize()`.
+  explicit LsProximalOperator(const double tau = -1) noexcept : config_tau_(tau) {}
+
+  //! Set the loss function for the proximal operator.
+  //!
+  //! @param loss the LS-loss for optimization. The object retains only a pointer to the loss, so it is
+  //!             the user's responsibility to not use the operator after the loss is removed!
+  inline void loss(LsLoss* loss) noexcept {
+    loss_ = loss;
+  }
+
+  //! Compute the proximal operator `v` for the given input parameters.
+  //!
+  //! @param u vector at which the proximal operator is evaluated.
+  //! @param v_prev ignored.
+  //! @param intercept the current intercept value, or 0 if the loss does not use an intercept term.
+  //! @param lambda scaling factor of the loss function.
+  //! @param metrics optional metrics object to collect metrics of the proximal operator.
+  inline arma::vec operator()(const arma::vec& u, const arma::vec&, const double intercept, const double lambda,
+                              Metrics * const = nullptr) {
+    return this->operator()(u, intercept, lambda);
+  }
+
+  //! Compute the proximal operator `v` for the given input parameters.
+  //!
+  //! @param u vector at which the proximal operator is evaluated.
+  //! @param intercept the current intercept value, or 0 if the loss does not use an intercept term.
+  //! @param lambda scaling factor of the loss function.
+  //! @param metrics optional metrics object to collect metrics of the proximal operator.
+  inline arma::vec operator()(const arma::vec& u, const double intercept, const double lambda,
+                              Metrics * const = nullptr) {
+    const int n = loss_->data().n_obs();
+    const double mult_fact = 1 / (n + lambda);
+    if (loss_->IncludeIntercept()) {
+      return n * mult_fact * u + lambda * mult_fact * (loss_->data().cy() - intercept);
+    } else {
+      return n * mult_fact * u + lambda * mult_fact * loss_->data().cy();
+    }
+  }
+
+  //! Compute the intercept.
+  inline double ComputeIntercept(const arma::vec& fitted) const noexcept {
+    return arma::mean(loss_->data().cy() - fitted);
+  }
+
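+  // Editorial note (a sketch of the underlying math, not additional API): the two
+  // operator() overloads above return the closed-form minimizer of
+  //
+  //     (1/2) * ||v - u||^2 + (lambda / (2 * n)) * ||y - intercept - v||^2,
+  //
+  // namely v = (n * u + lambda * (y - intercept)) / (n + lambda), which is exactly what
+  // the factorization via `mult_fact = 1 / (n + lambda)` computes term by term.
+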
+  //! Compute the step size for the currently set loss.
+  //!
+  //! @param penalty the current penalty value.
+  //! @param norm_x the L_2 norm of the predictor matrix.
+  //! @return the loss-specific step size.
+  double StepSize(const EnPenalty& penalty, const double norm_x) const {
+    if (config_tau_ < 0) {
+      const PredictorResponseData& data = loss_->data();
+      const double n_obs_dbl = static_cast<double>(data.n_obs());
+      if (data.n_obs() < data.n_pred()) {
+        const double expo = std::max(0.5, n_obs_dbl / data.n_pred());
+        return std::min((data.n_pred() * std::pow(norm_x, expo)),
+                        norm_x * n_obs_dbl / std::sqrt(data.n_pred() * penalty.lambda()));
+      } else {
+        const double expo = std::max(0.5, data.n_pred() / n_obs_dbl);
+        return std::min(std::max(1., (data.n_pred() * std::pow(norm_x, expo))),
+                        norm_x * n_obs_dbl / std::sqrt(n_obs_dbl * penalty.lambda()));
+      }
+    }
+
+    return 1 / config_tau_;
+  }
+
+  //! Compute the step size for the currently set loss.
+  //!
+  //! @param penalty the current penalty value.
+  //! @param norm_x the L_2 norm of the predictor matrix.
+  //! @return the loss-specific step size.
+  double StepSize(const AdaptiveEnPenalty& penalty, const double norm_x) const {
+    if (config_tau_ < 0) {
+      const PredictorResponseData& data = loss_->data();
+      const double n_obs_dbl = static_cast<double>(data.n_obs());
+      if (data.n_obs() < data.n_pred()) {
+        const double expo = std::max(0.5, n_obs_dbl / data.n_pred());
+        return std::min((data.n_pred() * std::pow(norm_x, expo)) / n_obs_dbl,
+                        norm_x / std::sqrt(data.n_pred() * penalty.lambda()));
+      } else {
+        const double expo = std::max(0.5, data.n_pred() / n_obs_dbl);
+        return std::min(std::max(1., (data.n_pred() * std::pow(norm_x, expo)) / n_obs_dbl),
+                        norm_x / std::sqrt(n_obs_dbl * penalty.lambda()));
+      }
+    }
+
+    return 1 / config_tau_;
+  }
+
+  //! Compute the step size for the currently set loss.
+  //!
+  //! @param penalty the current penalty value.
+  //! @param norm_x the L_2 norm of the predictor matrix.
+  //! @return the loss-specific step size.
+  inline double StepSize(const RidgePenalty& penalty, const double norm_x) const {
+    return StepSize(EnPenalty(penalty), norm_x);
+  }
+
+  //! Compute the step size for the currently set loss.
+  //!
+  //! @param penalty the current penalty value.
+  //! @param norm_x the L_2 norm of the predictor matrix.
+  //! @return the loss-specific step size.
+  inline double StepSize(const LassoPenalty& penalty, const double norm_x) const {
+    return StepSize(EnPenalty(penalty), norm_x);
+  }
+
+ private:
+  double config_tau_;
+  LsLoss * loss_;
+};
+
+//! Proximal operator for the weighted LS loss.
+class WeightedLsProximalOperator {
+ public:
+  using LossFunction = WeightedLsLoss;
+
+  //! Initialize the proximal operator with fixed step size `1 / tau`. If `tau` is negative (the default),
+  //! the step size is determined from the data via `StepSize()`.
+  explicit WeightedLsProximalOperator(const double tau = -1) noexcept : config_tau_(tau) {}
+
+  //! Set the loss function for the proximal operator.
+  //!
+  //! @param loss the LS-loss for optimization. The object retains only a pointer to the loss, so it is
+  //!             the user's responsibility to not use the operator after the loss is removed!
+  inline void loss(WeightedLsLoss* loss) noexcept {
+    loss_ = loss;
+    weights_ = loss_->weights();
+  }
+
+  //! Compute the proximal operator `v` for the given input parameters.
+  //!
+  //! @param u vector at which the proximal operator is evaluated.
+  //! @param v_prev ignored.
+  //! @param intercept the current intercept value, or 0 if the loss does not use an intercept term.
+  //! @param lambda scaling factor of the loss function.
+  //! @param metrics optional metrics object to collect metrics of the proximal operator.
+  inline arma::vec operator()(const arma::vec& u, const arma::vec&, const double intercept, const double lambda,
+                              Metrics * const = nullptr) const {
+    return this->operator()(u, intercept, lambda);
+  }
+
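+  // Editorial note: as in `LsProximalOperator`, the `v_prev` argument is accepted only
+  // to satisfy the generic proximal-operator interface and is ignored. The three-argument
+  // overload below applies the elementwise weighted closed form
+  //
+  //     v_i = (n * u_i + lambda * w_i * (y_i - intercept)) / (n + lambda * w_i).
+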
+  //! Compute the proximal operator `v` for the given input parameters.
+  //!
+  //! @param u vector at which the proximal operator is evaluated.
+  //! @param intercept the current intercept value, or 0 if the loss does not use an intercept term.
+  //! @param lambda scaling factor of the loss function.
+  //! @param metrics optional metrics object to collect metrics of the proximal operator.
+  inline arma::vec operator()(const arma::vec& u, const double intercept, const double lambda,
+                              Metrics * const = nullptr) const {
+    const auto n = loss_->data().n_obs();
+    if (loss_->IncludeIntercept()) {
+      return (n * u + lambda * weights_ % (loss_->data().cy() - intercept)) /
+        (n + lambda * weights_);
+    } else {
+      return (n * u + lambda * weights_ % loss_->data().cy()) / (n + lambda * weights_);
+    }
+  }
+
+  //! Compute the intercept.
+  inline double ComputeIntercept(const arma::vec& fitted) const noexcept {
+    return arma::mean((loss_->data().cy() - fitted) % weights_) / loss_->mean_weight();
+  }
+
+  //! Compute the step size for the currently set loss.
+  //!
+  //! @param penalty the current penalty value.
+  //! @param norm_x the L_2 norm of the predictor matrix.
+  //! @return the loss-specific step size.
+  double StepSize(const EnPenalty& penalty, const double norm_x) const {
+    if (config_tau_ < 0) {
+      const PredictorResponseData& data = loss_->data();
+      const double n_obs_weighted = data.n_obs() * loss_->mean_weight();
+      if (n_obs_weighted < data.n_pred()) {
+        const double expo = std::max(0.5, n_obs_weighted / data.n_pred());
+        return std::min(data.n_pred() * std::pow(norm_x, expo) / loss_->mean_weight(),
+                        norm_x * data.n_obs() / std::sqrt(data.n_pred() * penalty.lambda()));
+      } else {
+        const double expo = std::max(0.5, data.n_pred() / n_obs_weighted);
+        return std::min(std::max(1., data.n_pred() * std::pow(norm_x, expo)) / loss_->mean_weight(),
+                        norm_x * data.n_obs() / std::sqrt(n_obs_weighted * penalty.lambda()));
+      }
+    }
+    return 1 / config_tau_;
+  }
+
+  //! Compute the step size for the currently set loss.
+  //!
+  //! @param penalty the current penalty value.
+  //! @param norm_x the L_2 norm of the predictor matrix.
+  //! @return the loss-specific step size.
+  double StepSize(const AdaptiveEnPenalty& penalty, const double norm_x) const {
+    if (config_tau_ < 0) {
+      const PredictorResponseData& data = loss_->data();
+      const double n_obs_weighted = data.n_obs() * loss_->mean_weight();
+      if (n_obs_weighted < data.n_pred()) {
+        const double expo = std::max(0.5, n_obs_weighted / data.n_pred());
+        return std::min((data.n_pred() * std::pow(norm_x, expo)),
+                        norm_x * n_obs_weighted / std::sqrt(data.n_pred() * penalty.lambda()));
+      } else {
+        const double expo = std::max(0.5, data.n_pred() / n_obs_weighted);
+        return std::min(std::max(1., (data.n_pred() * std::pow(norm_x, expo))),
+                        norm_x * n_obs_weighted / std::sqrt(n_obs_weighted * penalty.lambda()));
+      }
+    }
+    return 1 / config_tau_;
+  }
+
+  //! Compute the step size for the currently set loss.
+  //!
+  //! @param penalty the current penalty value.
+  //! @param norm_x the L_2 norm of the predictor matrix.
+  //! @return the loss-specific step size.
+  inline double StepSize(const RidgePenalty& penalty, const double norm_x) const {
+    return StepSize(EnPenalty(penalty), norm_x);
+  }
+
+  //! Compute the step size for the currently set loss.
+  //!
+  //! @param penalty the current penalty value.
+  //! @param norm_x the L_2 norm of the predictor matrix.
+  //! @return the loss-specific step size.
+  inline double StepSize(const LassoPenalty& penalty, const double norm_x) const {
+    return StepSize(EnPenalty(penalty), norm_x);
+  }
+
+ private:
+  double config_tau_;
+  WeightedLsLoss * loss_;
+  arma::vec weights_;
+};
+
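+// Editorial usage sketch (illustrative only: `data`, `penalty`, `u`, and `intercept` are
+// assumed to exist, and the LsLoss constructor shown here is hypothetical):
+//
+//   LsLoss loss(std::make_shared<const PredictorResponseData>(data), true);
+//   LsProximalOperator prox;          // automatic step size (tau < 0)
+//   prox.loss(&loss);                 // the operator keeps only a pointer to `loss`
+//   const double tau = prox.StepSize(penalty, arma::norm(data.cx(), 2));
+//   const arma::vec v = prox(u, intercept, tau);
+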
+namespace admm_optimizer {
+//! Type trait mapping the LsLoss to the LsProximalOperator, the WeightedLsLoss to the WeightedLsProximalOperator,
+//! and any other type to itself.
+template <typename T>
+using ProximalOperator = typename std::conditional<
+  std::is_same<T, LsLoss>::value, LsProximalOperator,
+  typename std::conditional<std::is_same<T, WeightedLsLoss>::value,
+                            WeightedLsProximalOperator, T>::type >::type;
 } // namespace admm_optimizer
+
 //! Compute the EN regression estimate using the alternating direction method of multiplier (ADMM)
-//! with linearization
-template <typename LossFunction, typename PenaltyFunction, typename Coefficients>
-class AdmmLinearOptimizer : public Optimizer<LossFunction, PenaltyFunction, Coefficients> {
+//! with linearization. This optimizer uses the given proximal operator class `ProxOp`.
+//!
+//! A proximal operator needs to implement the following methods:
+//!   void loss(LossFunction* loss) ... change the loss function to `loss`.
+//!   arma::vec operator()(const arma::vec& u, const arma::vec& prev, const double intercept, const double lambda,
+//!                        Metrics * metrics)
+//!     ... get the value of the proximal operator of the function scaled by `lambda`, evaluated at `u`. The argument
+//!         `prev` is the previous value returned by the proximal operator and `intercept` is the current value
+//!         of the intercept, or 0 if the loss does not use an intercept term.
+//!   double ComputeIntercept(const arma::vec& fitted) ... compute the intercept term, given the fitted values.
+//!   double StepSize(const PenaltyFunction& penalty, const double norm_x) ... get the step size required for the
+//!     loss function if the penalty is as given.
+//!
+//! See `LsProximalOperator` and `WeightedLsProximalOperator` for example implementations of the proximal operator.
+template <typename ProxOp, typename PenaltyFunction, typename Coefficients>
+class GenericLinearizedAdmmOptimizer : public Optimizer<typename ProxOp::LossFunction, PenaltyFunction, Coefficients> {
+ public:
+  using ProximalOperator = ProxOp;
+  using LossFunction = typename ProxOp::LossFunction;
+
+ private:
   using Base = Optimizer<LossFunction, PenaltyFunction, Coefficients>;
   using LossFunctionPtr = std::unique_ptr<LossFunction>;
   using PenaltyPtr = std::unique_ptr<PenaltyFunction>;
-  using IsWeightedTag = typename traits::is_weighted<LossFunction>::type;
   using IsAdaptiveTag = typename traits::is_adaptive<PenaltyFunction>::type;
-  using WeightsType = typename std::conditional<traits::is_weighted<LossFunction>::value, arma::vec, double>::type;
 
   static_assert(traits::is_en_penalty<PenaltyFunction>::value, "PenaltyFunction must be an EN-type penalty.");
-  static_assert(traits::is_ls_loss<LossFunction>::value, "LossFunction must be an least-squares-type loss.");
-
   // ADMM state variables
   struct State {
     arma::vec v;
     arma::vec l;
   };
 
+  // Helper-traits to identify constructor arguments.
+  template <typename T>
+  using IsConfiguration = std::is_same<T, AdmmLinearConfiguration>;
+  template <typename T>
+  using IsLossFunction = std::is_same<T, LossFunction>;
+  template <typename T>
+  using IsProximalOperator = std::is_same<T, ProxOp>;
+
  public:
   using Optimum = typename Base::Optimum;
 
-  //! Ininitialize the optimizer using the given (weighted) LS loss function and the Ridge penalty.
+  //! Initialize the ADMM optimizer without setting a loss or penalty function.
+  GenericLinearizedAdmmOptimizer() noexcept
+      : config_(admm_optimizer::kDefaultLinConfig), prox_(), loss_(nullptr), penalty_(nullptr) {}
+
+  //! Initialize the ADMM optimizer with the given loss and penalty functions.
   //!
-  //! @param loss a weighted LS loss function.
-  //! @param penalty the Ridge penalty.
-  explicit AdmmLinearOptimizer(const AdmmConfiguration& config = admm_optimizer::kDefaultAdmmConfiguration) noexcept
-      : config_(config), loss_(nullptr), penalty_(nullptr) {}
+  //! @param loss the loss function object.
+  //! @param penalty the penalty function object.
+  GenericLinearizedAdmmOptimizer(const LossFunction& loss, const PenaltyFunction& penalty) noexcept
+      : config_(admm_optimizer::kDefaultLinConfig), prox_(),
+        loss_(new LossFunction(loss)), penalty_(new PenaltyFunction(penalty)) {}
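+
+  // Editorial usage sketch (the template arguments are illustrative only; see also the
+  // `LinearizedAdmmOptimizer` alias at the end of this file, which picks the proximal
+  // operator matching an LS-type loss):
+  //
+  //   GenericLinearizedAdmmOptimizer<LsProximalOperator, EnPenalty,
+  //                                  RegressionCoefficients<arma::vec>> optim(loss, penalty);
+  //   auto optimum = optim.Optimize();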
-  //! Ininitialize the optimizer using the given (weighted) LS loss function and the Ridge penalty.
+  //! Initialize the ADMM optimizer with the given loss and penalty functions.
   //!
-  //! @param loss a weighted LS loss function.
-  //! @param penalty the Ridge penalty.
-  AdmmLinearOptimizer(const LossFunction& loss, const PenaltyFunction& penalty,
-                      const AdmmConfiguration& config = admm_optimizer::kDefaultAdmmConfiguration) noexcept
-      : config_(config), loss_(new LossFunction(loss)), penalty_(new PenaltyFunction(penalty)) {}
+  //! @param loss the loss function object.
+  //! @param penalty the penalty function object.
+  //! @param prox proximal operator object.
+  //! @param config optional ADMM configuration object.
+  GenericLinearizedAdmmOptimizer(const LossFunction& loss, const PenaltyFunction& penalty, const ProximalOperator& prox,
+                                 const AdmmLinearConfiguration& config = admm_optimizer::kDefaultLinConfig) noexcept
+      : config_(config), prox_(prox), loss_(new LossFunction(loss)), penalty_(new PenaltyFunction(penalty)) {}
+
+  //! Initialize the ADMM optimizer without setting a loss or penalty function.
+  //!
+  //! @param prox_arg_1 first argument to the constructor of the proximal operator.
+  //! @param prox_args... further arguments to the constructor of the proximal operator.
+  template <typename T, typename... Args,
+            typename = typename std::enable_if<!IsConfiguration<T>::value && !IsLossFunction<T>::value, void>::type >
+  explicit GenericLinearizedAdmmOptimizer(T&& prox_arg_1, Args&&... prox_args) noexcept
+      : config_(admm_optimizer::kDefaultLinConfig),
+        prox_(std::forward<T>(prox_arg_1), std::forward<Args>(prox_args)...),
+        loss_(nullptr), penalty_(nullptr) {}
+
+  //! Initialize the ADMM optimizer without setting a loss or penalty function.
+  //!
+  //! @param config ADMM configuration object.
+  //! @param prox_args... arguments to the constructor of the proximal operator.
+  template <typename C, typename... Args,
+            typename = typename std::enable_if<IsConfiguration<C>::value, void>::type >
+  explicit GenericLinearizedAdmmOptimizer(const C& config, Args&&... prox_args) noexcept
+      : config_(config), prox_(std::forward<Args>(prox_args)...), loss_(nullptr), penalty_(nullptr) {}
+
+  //! Initialize the ADMM optimizer without setting a loss or penalty function.
+  //!
+  //! @param prox_arg_1 first argument to the constructor of the proximal operator.
+  //! @param prox_arg_2 second argument to the constructor of the proximal operator.
+  //! @param prox_args... further arguments to the constructor of the proximal operator.
+  template <typename T1, typename T2, typename... Args,
+            typename = typename std::enable_if<!IsConfiguration<T1>::value && !IsLossFunction<T1>::value, void>::type >
+  GenericLinearizedAdmmOptimizer(T1&& prox_arg_1, T2&& prox_arg_2, Args&&... prox_args) noexcept
+      : config_(admm_optimizer::kDefaultLinConfig),
+        prox_(std::forward<T1>(prox_arg_1), std::forward<T2>(prox_arg_2), std::forward<Args>(prox_args)...),
+        loss_(nullptr), penalty_(nullptr) {}
+
+  //! Initialize the ADMM optimizer without setting a loss or penalty function.
+  //!
+  //! @param config ADMM configuration object.
+  //! @param prox_arg_1 first argument to the constructor of the proximal operator.
+  //! @param prox_args... further arguments to the constructor of the proximal operator.
+  template <typename C, typename T1, typename... Args,
+            typename = typename std::enable_if<IsConfiguration<C>::value, void>::type >
+  GenericLinearizedAdmmOptimizer(const C& config, T1&& prox_arg_1, Args&&... prox_args) noexcept
+      : config_(config), prox_(std::forward<T1>(prox_arg_1), std::forward<Args>(prox_args)...),
+        loss_(nullptr), penalty_(nullptr) {}
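+
+  // Editorial note: the enable_if guards on these constructors only steer overload
+  // resolution, detecting whether the leading arguments are a configuration object, a
+  // loss function, or constructor arguments for the proximal operator. For example
+  // (illustrative), `Optimizer(config, tau)` forwards `tau` to the proximal operator's
+  // constructor, while `Optimizer(loss, penalty, config)` copies the loss and penalty.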
+
+  //! Initialize the ADMM optimizer without setting a loss or penalty function.
+  //!
+  //! @param config ADMM configuration object.
+  //! @param prox_arg_1 first argument to the constructor of the proximal operator.
+  //! @param prox_arg_2 second argument to the constructor of the proximal operator.
+  //! @param prox_args... further arguments to the constructor of the proximal operator.
+  template <typename C, typename T1, typename T2, typename... Args,
+            typename = typename std::enable_if<IsConfiguration<C>::value, void>::type >
+  GenericLinearizedAdmmOptimizer(const C& config, T1&& prox_arg_1, T2&& prox_arg_2, Args&&... prox_args) noexcept
+      : config_(config),
+        prox_(std::forward<T1>(prox_arg_1), std::forward<T2>(prox_arg_2), std::forward<Args>(prox_args)...),
+        loss_(nullptr), penalty_(nullptr) {}
+
+  //! Initialize the ADMM optimizer with the given loss and penalty functions.
+  //!
+  //! @param loss the loss function object.
+  //! @param penalty the penalty function object.
+  //! @param prox_arg_1 first argument to the constructor of the proximal operator.
+  //! @param prox_args... further arguments to the constructor of the proximal operator.
+  template <typename L, typename P, typename T1, typename... Args,
+            typename = typename std::enable_if<IsLossFunction<L>::value && !IsConfiguration<T1>::value, void>::type >
+  GenericLinearizedAdmmOptimizer(const L& loss, const P& penalty, T1&& prox_arg_1, Args&&... prox_args) noexcept
+      : config_(admm_optimizer::kDefaultLinConfig),
+        prox_(std::forward<T1>(prox_arg_1), std::forward<Args>(prox_args)...),
+        loss_(new LossFunction(loss)), penalty_(new PenaltyFunction(penalty)) {}
+
+  //! Initialize the ADMM optimizer with the given loss and penalty functions.
+  //!
+  //! @param loss the loss function object.
+  //! @param penalty the penalty function object.
+  //! @param config ADMM configuration object.
+  //! @param prox_args... further arguments to the constructor of the proximal operator.
+  template <typename L, typename P, typename C, typename... Args,
+            typename = typename std::enable_if<IsLossFunction<L>::value && IsConfiguration<C>::value, void>::type >
+  GenericLinearizedAdmmOptimizer(const L& loss, const P& penalty, const C& config, Args&&... prox_args) noexcept
+      : config_(config), prox_(std::forward<Args>(prox_args)...),
+        loss_(new LossFunction(loss)), penalty_(new PenaltyFunction(penalty)) {}
 
   //! Default copy constructor.
   //!
@@ -190,15 +560,15 @@ class AdmmLinearOptimizer : public Optimizer
-    AdmmLinearOptimizer clone(loss_->Clone(), penalty_->Clone());
+    GenericLinearizedAdmmOptimizer clone(loss_->Clone(), penalty_->Clone(), prox_, config_);
     return clone;
   }
 
-  AdmmLinearOptimizer Copy() {
-    return AdmmLinearOptimizer(*this);
+  GenericLinearizedAdmmOptimizer Copy() {
+    return GenericLinearizedAdmmOptimizer(*this);
   }
 
   void Reset() {
@@ -243,6 +613,7 @@ class AdmmLinearOptimizer : public Optimizer
     norm_x_ = arma::norm(loss_->data().cx(), 2);
     norm_x_sq_inv_ = 1 / (norm_x_ * norm_x_);
-    UpdateWeights(IsWeightedTag{});
   }
 
   PenaltyFunction& penalty() const {
@@ -308,10 +678,8 @@ class AdmmLinearOptimizer : public Optimizer
-    state_ = State{
-      ProximalLs(loss_->data().cx() * coefs_.beta, IsWeightedTag{}),
-      arma::zeros(loss_->data().n_obs())
-    };
+    // Reset the state (at least `v`, so that the Optimize function knows that the state needs to be re-initialized).
+    state_.v.reset();
     return Optimize(max_it);
   }
@@ -331,8 +699,7 @@ class AdmmLinearOptimizer : public Optimizer
     const PredictorResponseData& data = loss_->data();
     const bool include_intercept = loss_->IncludeIntercept();
-    const double scaled_lambda = data.n_obs() * penalty_->lambda();
-    const bool check_empty = admm_optimizer::AllZero(coefs_.beta) || (coefs_.beta.n_elem != data.n_pred());
+    // const bool check_empty = admm_optimizer::AllZero(coefs_.beta) || (coefs_.beta.n_elem != data.n_pred());
 
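+    // Editorial sketch of the iterations below (notation approximate; the exact
+    // bookkeeping is in the loop): with step size `tau = step_size_` and `fitted`
+    // holding `X * beta - v`, each pass roughly performs
+    //   beta <- en_multiplier * SoftThreshold(beta - X' * (fitted + l) / ||X||^2, en_cutoff),
+    //   v    <- prox_(X * beta + l, ...)   (the proximal operator of the loss), and
+    //   l    <- l + X * beta - v           (the dual update),
+    // stopping once the combined change `gap` drops below `conv_tol`.
+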
     // Check if the coefficients are correct.
     if (coefs_.beta.n_elem != data.n_pred()) {
@@ -345,30 +712,23 @@ class AdmmLinearOptimizer : public Optimizer
     … penalty_->alpha() * step_size_ * step_size_;
     const auto en_cutoff = DetermineCutoff(IsAdaptiveTag{});
-    const double en_multiplier = 1 / (1 + norm_x_sq_inv_ * step_size_ * scaled_lambda * (1 - penalty_->alpha()));
+    const double en_multiplier = 1 / (1 + norm_x_sq_inv_ * step_size_ * penalty_->lambda() * (1 - penalty_->alpha()));
 
     double gap = 0;
     arma::vec fitted = data.cx() * coefs_.beta;
     if (include_intercept) {
-      coefs_.intercept = ComputeIntercept(fitted, IsWeightedTag{});
-    }
-
-    if (check_empty && !admm_optimizer::AnyViolateKKT(data.cx(), EmptyModelResiduals(IsWeightedTag{}), scaled_lambda,
-                                                      *penalty_)) {
-      // None of the predictors will be activated for the current penalty. Return the current coefficient value.
-      return FinalizeResult(0, 0, OptimumStatus::kOk, std::move(metrics));
+      coefs_.intercept = prox_.ComputeIntercept(fitted);
     }
 
     // Check if the state needs to be re-initialized
     if (state_.v.n_elem != fitted.n_elem) {
-      // This is the ProximalLS function, but inlined...
-      state_.v = ProximalLs(fitted, IsWeightedTag{});
+      state_.v = prox_(fitted, coefs_.intercept, step_size_);
       state_.l.zeros(data.n_obs());
     } else if (old_step_size != step_size_) {
       // Adjust the slack variable for the udpated step size.
@@ -382,13 +742,11 @@ class AdmmLinearOptimizer : public Optimizer
     metrics->AddDetail("step_size", step_size_);
 
     int iter = 0;
-    int second_criterion = 0;
     State prev_state;
     while (iter++ < max_it) {
-      // State prev_state = state_;
-      prev_state.v = state_.v;
-      prev_state.l = state_.l;
+      Metrics& iter_metrics = metrics->CreateSubMetrics("admm-iteration");
+      prev_state = state_;
 
       // remember: fitted is already fitted - state_.v
       coefs_.beta = en_multiplier * SoftThreshold(coefs_.beta, -norm_x_sq_inv_, data.cx().t() * (fitted + state_.l),
                                                   en_cutoff);
@@ -396,27 +754,29 @@ class AdmmLinearOptimizer : public Optimizer
      …
-      if (second_criterion > max_it) {
+      gap = (v_diff + l_diff) * diff_scaling * diff_scaling;
+
+      iter_metrics.AddDetail("v_diff", v_diff);
+      iter_metrics.AddDetail("l_diff", l_diff);
+      iter_metrics.AddDetail("gap", gap);
+
+      if (gap < conv_tol) {
         return FinalizeResult(iter, gap, OptimumStatus::kOk, std::move(metrics));
-      } else if (gap * gap < conv_tol && (l_diff < gap || v_diff < gap)) {
-        second_criterion += admm_optimizer::kSecondCriterionMultiplier;
-      } else {
-        second_criterion = 0;
       }
     }
 
-    return FinalizeResult(iter, gap, OptimumStatus::kWarning, "ADMM-algorithm did not converge.", std::move(metrics));
+    return FinalizeResult(--iter, gap, OptimumStatus::kWarning, "ADMM algorithm did not converge.",
+                          std::move(metrics));
   }
 
  private:
@@ -435,25 +795,6 @@ class AdmmLinearOptimizer : public Optimizer
-  //! Compute the weighted residuals.
-  arma::vec EmptyModelResiduals(std::true_type) const noexcept {
-    if (loss_->IncludeIntercept()) {
-      return weights_ % (loss_->data().cy() - coefs_.intercept);
-    } else {
-      return weights_ % loss_->data().cy();
-    }
-  }
-
-  //! Compute the unweighted residuals.
-  arma::vec EmptyModelResiduals(std::false_type) const noexcept {
-    if (loss_->IncludeIntercept()) {
-      return loss_->data().cy() - coefs_.intercept;
-    } else {
-      return loss_->data().cy();
-    }
-  }
-
   //! Determine the cutoff for the soft-threshold function for adaptive penalties
   arma::vec DetermineCutoff(std::true_type) const noexcept {
     return penalty_->loadings() * DetermineCutoff(std::false_type{});
@@ -461,110 +802,26 @@ class AdmmLinearOptimizer : public Optimizer
   //! Determine the cutoff for the soft-threshold function for non-adaptive penalties
   double DetermineCutoff(std::false_type) const noexcept {
-    return loss_->data().n_obs() * penalty_->alpha() * penalty_->lambda() * step_size_ * norm_x_sq_inv_;
-  }
-
-  //! Apply the proximal operator for the weighted LS loss with intercept
-  arma::vec ProximalLs(const arma::vec& v, std::true_type) const {
-    if (loss_->IncludeIntercept()) {
-      return (v + step_size_ * weights_ % (loss_->data().cy() - coefs_.intercept)) /
-        (1 + step_size_ * weights_);
-    } else {
-      return (v + step_size_ * weights_ % loss_->data().cy()) / (1 + step_size_ * weights_);
-    }
-  }
-
-  //! Apply the proximal operator for the LS loss with intercept.
-  arma::vec ProximalLs(const arma::vec& v, std::false_type) const {
-    if (loss_->IncludeIntercept()) {
-      return (v + step_size_ * (loss_->data().cy() - coefs_.intercept)) / (1 + step_size_);
-    } else {
-      return (v + step_size_ * loss_->data().cy()) / (1 + step_size_);
-    }
-  }
-
-  //! Compute the intercept for weighted LS
-  double ComputeIntercept(const arma::vec& fitted, std::true_type) const {
-    // Manually square the sqrt-weights to take advantage of them being standardized.
-    return arma::mean((loss_->data().cy() - fitted) % weights_) / loss_->mean_weight();
-  }
-
-  //! Compute the intercept for unweighted LS
-  double ComputeIntercept(const arma::vec& fitted, std::false_type) const {
-    return arma::mean(loss_->data().cy() - fitted);
-  }
-
-  //! Update the cached weights.
-  void UpdateWeights(std::true_type) noexcept {
-    weights_ = loss_->weights();
+    return penalty_->alpha() * penalty_->lambda() * step_size_ * norm_x_sq_inv_;
   }
 
-  //! Update the cached weights. For the unweighted LS, this does nothing.
-  void UpdateWeights(std::false_type) const noexcept {}
-
-  //! Update the step size based on the predictor matrix and the penalty level.
-  void UpdateStepSize(std::false_type) noexcept {
-    if (config_.tau < 0) {
-      step_size_ = std::sqrt(norm_x_);
-      const PredictorResponseData& data = loss_->data();
-      const double n_obs_dbl = static_cast<double>(data.n_obs());
-      if (data.n_obs() < data.n_pred()) {
-        const double expo = std::max(0.5, n_obs_dbl / data.n_pred());
-        step_size_ = std::min((data.n_pred() * std::pow(norm_x_, expo)) / n_obs_dbl,
-                              norm_x_ / std::sqrt(data.n_pred() * penalty_->lambda()));
-        // step_size_ = std::max(n_obs_dbl / (data.n_pred() * std::pow(norm_x_, expo)),
-        //                       std::sqrt(data.n_pred() * penalty_->lambda() / norm_x_));
-      } else {
-        const double expo = std::max(0.5, data.n_pred() / n_obs_dbl);
-        step_size_ = std::min(std::max(1., (data.n_pred() * std::pow(norm_x_, expo)) / n_obs_dbl),
-                              norm_x_ / std::sqrt(n_obs_dbl * penalty_->lambda()));
-        // step_size_ = std::max(std::min(1., n_obs_dbl / (data.n_pred() * std::pow(norm_x_, expo))),
-        //                       std::sqrt(n_obs_dbl * penalty_->lambda() / norm_x_));
-      }
-    } else {
-      step_size_ = 1 / config_.tau;
-    }
-  }
-
-  //! Update the step size based on the predictor matrix and the penalty level.
-  void UpdateStepSize(std::true_type) noexcept {
-    if (config_.tau < 0) {
-      step_size_ = std::sqrt(norm_x_);
-      const PredictorResponseData& data = loss_->data();
-      const double n_obs_weighted = data.n_obs() * loss_->mean_weight();
-      if (n_obs_weighted < data.n_pred()) {
-        const double expo = std::max(0.5, n_obs_weighted / data.n_pred());
-        step_size_ = std::min((data.n_pred() * std::pow(norm_x_, expo)) / n_obs_weighted,
-                              norm_x_ / std::sqrt(data.n_pred() * penalty_->lambda()));
-        // step_size_ = std::max(n_obs_weighted / (data.n_pred() * std::pow(norm_x_, expo)),
-        //                       std::sqrt(data.n_pred() * penalty_->lambda() / norm_x_));
-      } else {
-        const double expo = std::max(0.5, data.n_pred() / n_obs_weighted);
-        step_size_ = std::min(std::max(1., (data.n_pred() * std::pow(norm_x_, expo)) / n_obs_weighted),
-                              norm_x_ / std::sqrt(n_obs_weighted * penalty_->lambda()));
-        // step_size_ = 1 / std::max(std::min(1., n_obs_weighted / (data.n_pred() * std::pow(norm_x_, expo))),
-        //                           std::sqrt(n_obs_weighted * penalty_->lambda() / norm_x_));
-      }
-    } else {
-      step_size_ = 1 / config_.tau;
-    }
-  }
-
-  // //! The linearized ADMM algorithm uses a different criterion to determine convergence.
-  // //! To make the precision comparable to other optimizers, the convergence tolerance needs to be adjusted.
-  // constexpr static double kConvergenceToleranceAdjustment = 1e-2;
-  const AdmmConfiguration config_;
+  const AdmmLinearConfiguration config_;
+  ProximalOperator prox_;
   LossFunctionPtr loss_;
   PenaltyPtr penalty_;
   Coefficients coefs_;
   State state_;
-  WeightsType weights_;
   double step_size_;
   double norm_x_;
   double norm_x_sq_inv_;
   double convergence_tolerance_ = 1e-6;
 };
 
+//! Alias to make the GenericLinearizedAdmmOptimizer template more versatile. It accepts either an LS-type loss
+//! function class or a proximal operator.
+template <typename LossFunction, typename PenaltyFunction, typename Coefficients>
+using LinearizedAdmmOptimizer = GenericLinearizedAdmmOptimizer<admm_optimizer::ProximalOperator<LossFunction>,
                                                                PenaltyFunction, Coefficients>;
 
 //! Compute the EN regression estimate using the alternating direction method of multiplier (ADMM)
 //! with variable step-size.
@@ -599,7 +856,7 @@ class AdmmVarStepOptimizer : public Optimizer
     if (check_empty && !admm_optimizer::AnyViolateKKT(data_->cx(), EmptyModelResiduals(IsWeightedTag{}),
-                                                      scaled_lambda, *penalty_)) {
+                                                      scaled_lambda / data_->n_obs(), *penalty_)) {
       // None of the predictors will be activated for the current penalty. Return the current coefficient value.
       return FinalizeResult(0, OptimumStatus::kOk, std::move(metrics));
     }
@@ -830,10 +1089,11 @@ class AdmmVarStepOptimizer : public Optimizer
   using Optimum = typename Base::Optimum;
 
   //! Ininitialize the MM algorithm without loss or penalty function.