Commit 49819291 authored by davidkep

Retain "good" solutions along the ENPY iterations.

parent ff5a2043
@@ -13,7 +13,6 @@ export(mlocscale)
 export(mscale)
 export(pense)
 export(rho_function)
-export(s_algo_options)
 export(tau_size)
 importFrom(Rcpp,evalCpp)
 importFrom(stats,mad)
...
@@ -4,23 +4,27 @@
 #' @param max_it maximum number of PY iterations.
 #' @param eps numerical tolerance to check for convergence.
 #' @param keep_psc_proportion how many observations should be kept based on the Principal Sensitivity Components.
-#' @param keep_residuals_measure how to determine how many observations to keep, based on their residuals.
+#' @param keep_residuals_measure how to determine what observations to keep, based on their residuals.
 #' If `proportion`, a fixed number of observations is kept, while if `threshold`,
-#' only observations with residuals below a threshold are kept.
+#' only observations with residuals below the threshold are kept.
 #' @param keep_residuals_proportion how many observations should be kept based on their residuals.
 #' @param keep_residuals_threshold only observations with (standardized) residuals less than this threshold are kept.
+#' @param retain_best_factor in addition to the candidates from the last iteration, also keep candidates
+#' that are within this factor of the best candidate.
 #'
 #' @return options for the ENPY algorithm.
 #' @export
-enpy_options <- function (max_it = 10, eps = 1e-9, keep_psc_proportion = 0.5,
-                          keep_residuals_measure = c('proportion', 'threshold'),
-                          keep_residuals_proportion = 0.5, keep_residuals_threshold = 2) {
+enpy_options <- function (max_it = 10, eps = 1e-6, keep_psc_proportion = 0.5,
+                          keep_residuals_measure = c('threshold', 'proportion'),
+                          keep_residuals_proportion = 0.5, keep_residuals_threshold = 2,
+                          retain_best_factor = 1.1) {
   list(max_it = as.integer(max_it[[1L]]),
        eps = as.numeric(eps[[1L]]),
        keep_psc_proportion = as.numeric(keep_psc_proportion[[1L]]),
        use_residual_threshold = match.arg(keep_residuals_measure) == 'threshold',
        keep_residuals_proportion = as.numeric(keep_residuals_proportion[[1L]]),
-       keep_residuals_threshold = as.numeric(keep_residuals_threshold[[1L]]))
+       keep_residuals_threshold = as.numeric(keep_residuals_threshold[[1L]]),
+       retain_best_factor = as.numeric(retain_best_factor[[1L]]))
 }

 #' Options for the M-estimation Algorithm
@@ -30,7 +34,7 @@ enpy_options <- function (max_it = 10, eps = 1e-9, keep_psc_proportion = 0.5,
 #'
 #' @return options for the M-estimation algorithm.
 #' @export
-mest_options <- function (max_it = 200, eps = 1e-8) {
+mest_options <- function (max_it = 200, eps = 1e-6) {
   list(max_it = as.integer(max_it[[1L]]),
        eps = as.numeric(eps[[1L]]))
 }
@@ -43,7 +47,7 @@ mest_options <- function (max_it = 200, eps = 1e-8) {
 #' @param eps numerical tolerance to check for convergence.
 #'
 #' @return options for the S-Estimate algorithm.
-s_algo_options <- function (explore_it = 10, max_it = 500, eps = 1e-8,
+s_algo_options <- function (explore_it = 10, max_it = 500, eps = 1e-6,
                             tightening = c('none', 'adaptive', 'exponential'),
                             tightening_steps = 10) {
   list(max_it = as.integer(max_it[[1L]]),
@@ -80,7 +84,7 @@ s_algo_options <- function (explore_it = 10, max_it = 500, eps = 1e-8,
 #' @return options for the ADMM EN algorithm.
 #' @family EN algorithms
 #' @export
-en_admm_options <- function (max_it = 1000, eps = 1e-6, tau, sparse = FALSE,
+en_admm_options <- function (max_it = 1000, eps = 1e-9, tau, sparse = FALSE,
                              admm_type = c('auto', 'linearized',
                                            'var-stepsize'),
                              tau_lower_mult = 0.01, tau_adjustment_lower = 0.98,
@@ -112,7 +116,7 @@ en_admm_options <- function (max_it = 1000, eps = 1e-6, tau, sparse = FALSE,
 #' @return options for the DAL EN algorithm.
 #' @family EN algorithms
 #' @export
-en_dal_options <- function (max_it = 100, max_inner_it = 100, eps = 1e-9, eta_multiplier = 2,
+en_dal_options <- function (max_it = 100, max_inner_it = 100, eps = 1e-6, eta_multiplier = 2,
                             eta_start_conservative = 0.01, eta_start_aggressive = 1,
                             lambda_relchange_aggressive = 0.25) {
   list(algorithm = 'dal',
...
 #' ENPY Initial Estimates
 #'
 #' @export
-enpy_initial_estimates <- function (x, y, alpha, lambdas, bdp = 0.25, cc,
-                                    include_intercept = TRUE,
-                                    enpy_opts = enpy_options(),
-                                    mscale_maxit = 200, mscale_eps = 1e-8,
-                                    en_options) {
+enpy_initial_estimates <- function (x, y, alpha, lambdas, bdp = 0.25, cc, include_intercept = TRUE,
+                                    enpy_opts = enpy_options(), mscale_maxit = 200, mscale_eps = 1e-9, en_options) {
   if (missing(cc)) {
     cc <- .bisquare_consistency_const(bdp)
   }
...
@@ -29,7 +29,7 @@
 #' @export
 pense <- function(x, y, alpha, lambdas, cold_lambdas, penalty_loadings,
                   additional_initial_estimates, include_intercept = TRUE,
-                  max_it = 200, eps = 1e-5, explore_it = 10,
+                  max_it = 200, eps = 1e-6, explore_it = 10,
                   tightening = c('none', 'adaptive', 'exponential'),
                   tightening_steps = 10L, en_algorithm_opts,
                   mest_opts = mest_options(), enpy_opts = enpy_options()) {
...
@@ -4,7 +4,7 @@
 \alias{en_admm_options}
 \title{Options for the ADMM Elastic Net Algorithm}
 \usage{
-en_admm_options(max_it = 1000, eps = 1e-06, tau, sparse = FALSE,
+en_admm_options(max_it = 1000, eps = 1e-09, tau, sparse = FALSE,
   admm_type = c("auto", "linearized", "var-stepsize"),
   tau_lower_mult = 0.01, tau_adjustment_lower = 0.98,
   tau_adjustment_upper = 0.999)
...
@@ -4,7 +4,7 @@
 \alias{en_dal_options}
 \title{Options for the DAL Elastic Net Algorithm}
 \usage{
-en_dal_options(max_it = 100, max_inner_it = 100, eps = 1e-09,
+en_dal_options(max_it = 100, max_inner_it = 100, eps = 1e-06,
   eta_multiplier = 2, eta_start_conservative = 0.01,
   eta_start_aggressive = 1, lambda_relchange_aggressive = 0.25)
 }
...
@@ -5,8 +5,8 @@
 \title{ENPY Initial Estimates}
 \usage{
 enpy_initial_estimates(x, y, alpha, lambdas, bdp = 0.25, cc,
-  include_intercept = TRUE, enpy_opts = enpy_options(), mscale_maxit,
-  mscale_eps, en_options)
+  include_intercept = TRUE, enpy_opts = enpy_options(),
+  mscale_maxit = 200, mscale_eps = 1e-09, en_options)
 }
 \description{
 ENPY Initial Estimates
...
@@ -4,9 +4,10 @@
 \alias{enpy_options}
 \title{Options for the ENPY Algorithm}
 \usage{
-enpy_options(max_it = 10, eps = 1e-09, keep_psc_proportion = 0.5,
-  keep_residuals_measure = c("proportion", "threshold"),
-  keep_residuals_proportion = 0.5, keep_residuals_threshold = 2)
+enpy_options(max_it = 10, eps = 1e-06, keep_psc_proportion = 0.5,
+  keep_residuals_measure = c("threshold", "proportion"),
+  keep_residuals_proportion = 0.5, keep_residuals_threshold = 2,
+  retain_best_factor = 1.1)
 }
 \arguments{
 \item{max_it}{maximum number of PY iterations.}
@@ -15,13 +16,16 @@ enpy_options(max_it = 10, eps = 1e-09, keep_psc_proportion = 0.5,
 \item{keep_psc_proportion}{how many observations should be kept based on the Principal Sensitivity Components.}

-\item{keep_residuals_measure}{how to determine how many observations to keep, based on their residuals.
+\item{keep_residuals_measure}{how to determine what observations to keep, based on their residuals.
 If \code{proportion}, a fixed number of observations is kept, while if \code{threshold},
-only observations with residuals below a threshold are kept.}
+only observations with residuals below the threshold are kept.}

 \item{keep_residuals_proportion}{how many observations should be kept based on their residuals.}

 \item{keep_residuals_threshold}{only observations with (standardized) residuals less than this threshold are kept.}
+
+\item{retain_best_factor}{in addition to the candidates from the last iteration, also keep candidates
+that are within this factor of the best candidate.}
 }
 \value{
 options for the ENPY algorithm.
...
@@ -4,7 +4,7 @@
 \alias{mest_options}
 \title{Options for the M-estimation Algorithm}
 \usage{
-mest_options(max_it = 200, eps = 1e-08)
+mest_options(max_it = 200, eps = 1e-06)
 }
 \arguments{
 \item{max_it}{maximum number of iterations.}
...
@@ -6,7 +6,8 @@
 \usage{
 pense(x, y, alpha, lambdas, cold_lambdas, penalty_loadings,
   additional_initial_estimates, include_intercept = TRUE, max_it = 200,
-  eps = 1e-05, explore_it = 10, en_algorithm_opts,
+  eps = 1e-06, explore_it = 10, tightening = c("none", "adaptive",
+  "exponential"), tightening_steps = 10L, en_algorithm_opts,
   mest_opts = mest_options(), enpy_opts = enpy_options())
 }
 \arguments{
...
@@ -4,7 +4,9 @@
 \alias{s_algo_options}
 \title{Options for the S-Estimate Algorithm}
 \usage{
-s_algo_options(explore_it = 10, max_it = 500, eps = 1e-08)
+s_algo_options(explore_it = 10, max_it = 500, eps = 1e-06,
+  tightening = c("none", "adaptive", "exponential"),
+  tightening_steps = 10)
 }
 \arguments{
 \item{explore_it}{number of iterations to explore potential candidate
...
 CXX_STD = CXX11
 PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) # -flto=thin
-PKG_CPPFLAGS= -D__STDC_LIMIT_MACROS -DHAVE_RCPP -DNSOPTIM_DETAILED_METRICS -DTESTTHAT_DISABLED
+# PKG_CPPFLAGS= -D__STDC_LIMIT_MACROS -DHAVE_RCPP -DNSOPTIM_DETAILED_METRICS -DTESTTHAT_DISABLED
 PKG_CXXFLAGS= -fstrict-aliasing -Wstrict-aliasing
-PKG_OPTFLAGS= -g -O0
+PKG_OPTFLAGS= -g -Os
 # -DNSOPTIM_METRICS_DISABLED -DNSOPTIM_DETAILED_METRICS
-# PKG_CPPFLAGS= -D__STDC_LIMIT_MACROS -DHAVE_RCPP -DTESTTHAT_DISABLED
+PKG_CPPFLAGS= -D__STDC_LIMIT_MACROS -DHAVE_RCPP -DTESTTHAT_DISABLED
...
@@ -9,12 +9,12 @@
 #include <algorithm>
 #include <nsoptim.hpp>

+#include "constants.hpp"
 #include "rcpp_utils.hpp"
 #include "enpy_initest.hpp"

 namespace {
 constexpr int kDefaultCfgMaxIt = 1;  //!< Maximum number of iterations.
-constexpr double kDefaultCfgEps = 1e-9;  //!< Numerical tolerance level to determine convergence.
 constexpr double kKeepPscProportion = 0.5;  //!< Proportion of observations to keep based on PSCs.
 constexpr bool kUseResidualThreshold = false;  //!< Use a fixed threshold instead of a proportion
                                                //!< to screen observations based on their
@@ -23,6 +23,8 @@ constexpr double kKeepResidualsProportion = 0.5;  //!< Proportion of observation
                                                   //!< on the residuals.
 constexpr double kKeepResidualsThreshold = 2;  //!< Fixed threshold to keep observations based
                                                //!< on the residuals.
+constexpr double kRetainBestFactor = 1.1;  //!< Retain not only the candidates from the last iteration,
+                                           //!< but also those that are within this factor of the best candidate.

 inline arma::uword HashUpdate(const arma::uword hash, const arma::uword value) noexcept;
 }  // namespace
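
An aside on the hashing machinery: `HashUpdate` is only declared in this hunk, and `HashIndexVector` is used further down to deduplicate observation subsets. A plausible sketch of how such helpers fit together (this is an assumption for illustration, not the commit's actual definitions) is a boost-style hash combine folded over the index vector:

// Sketch only: one plausible HashUpdate matching the declaration above.
inline arma::uword HashUpdate(const arma::uword hash, const arma::uword value) noexcept {
  // Golden-ratio hash combining, as in boost::hash_combine.
  return hash ^ (value + 0x9e3779b9 + (hash << 6) + (hash >> 2));
}

// Sketch only: fold HashUpdate over the (sorted) indices so that equal
// subsets of observations always produce equal hashes.
inline arma::uword HashIndexVector(const arma::uvec& indices) noexcept {
  arma::uword hash = 0;
  for (const arma::uword index : indices) { hash = HashUpdate(hash, index); }
  return hash;
}

This is what makes the duplicate-subset checks in `EstimateOnSubset` and the residual-filter loop cheap: comparing one word instead of whole index vectors.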
@@ -32,11 +34,12 @@ namespace enpy_initest_internal {
 PyConfiguration ParseConfiguration(const Rcpp::List& config) noexcept {
   return PyConfiguration{
     GetFallback(config, "max_it", kDefaultCfgMaxIt),
-    GetFallback(config, "eps", kDefaultCfgEps),
+    GetFallback(config, "eps", kDefaultConvergenceTolerance),
     GetFallback(config, "keep_psc_proportion", kKeepPscProportion),
     GetFallback(config, "use_residual_threshold", kUseResidualThreshold),
     GetFallback(config, "keep_residuals_proportion", kKeepResidualsProportion),
-    GetFallback(config, "keep_residuals_threshold", kKeepResidualsThreshold)
+    GetFallback(config, "keep_residuals_threshold", kKeepResidualsThreshold),
+    GetFallback(config, "retain_best_factor", kRetainBestFactor)
   };
 }
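
For reference, `GetFallback` (defined in rcpp_utils.hpp; the actual implementation may differ from this sketch) reads a named element from the R options list and falls back to the compiled-in default. A minimal version, assuming only standard Rcpp list semantics:

// Sketch only: read config[name] if present, otherwise return the default.
template <typename T>
T GetFallback(const Rcpp::List& config, const char* name, const T& fallback) {
  if (config.containsElementNamed(name)) {
    return Rcpp::as<T>(config[name]);
  }
  return fallback;
}

With this shape, adding the new `retain_best_factor` option only requires the one extra line in `ParseConfiguration` above, with `kRetainBestFactor` as the default.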
...
@@ -35,6 +35,8 @@ struct PyConfiguration {
                                      //!< their residuals.
   double keep_residuals_proportion;  //!< Proportion of observations to keep based on the residuals.
   double keep_residuals_threshold;  //!< Fixed threshold to keep observations based on the residuals.
+  double retain_best_factor;  //!< Retain not only the candidates from the last iteration, but also those that are
+                              //!< within this factor of the best candidate.
 };

 //! Parse an Rcpp::List into the PyConfiguration structure.
@@ -70,10 +72,12 @@ template<typename T>
 class IndexCompAbsoluteAscending {
  public:
   //! Create a comparator using the given _values_ for comparison.
-  explicit IndexCompAbsoluteAscending(const T& values) noexcept : values_(arma::abs(values)) {}
+  explicit IndexCompAbsoluteAscending(const T& values) noexcept : values_(values) {}

-  bool operator()(const arma::uword a, const arma::uword b) { return std::abs(values_[a]) < std::abs(values_[b]); }
+  bool operator()(const arma::uword a, const arma::uword b) const {
+    return std::abs(values_[a]) < std::abs(values_[b]);
+  }

  private:
-  const arma::vec values_;
+  const T& values_;
 };

 //! Comparator class to sort an _index_ vector based on ascending values of the _values_ vector.
@@ -81,7 +85,7 @@ template<typename T>
 class IndexCompAscending {
  public:
   explicit IndexCompAscending(const T& values) noexcept : values_(values) {}
-  bool operator()(const arma::uword a, const arma::uword b) { return values_[a] < values_[b]; }
+  bool operator()(const arma::uword a, const arma::uword b) const { return values_[a] < values_[b]; }

  private:
   const T& values_;
 };
@@ -91,7 +95,7 @@ template<typename T>
 class IndexCompDescending {
  public:
   explicit IndexCompDescending(const T& values) noexcept : values_(values) {}
-  bool operator()(const arma::uword a, const arma::uword b) { return values_[a] > values_[b]; }
+  bool operator()(const arma::uword a, const arma::uword b) const { return values_[a] > values_[b]; }

  private:
   const T& values_;
 };
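
The updated comparators now store a reference to the values instead of materializing `arma::abs(values)` into a fresh `arma::vec` per comparator, and `operator()` is const-qualified. A standalone usage sketch (made-up PSC values; assumes the comparator definitions above are in scope):

#include <algorithm>
#include <armadillo>
#include <iostream>

int main() {
  const arma::vec psc {0.4, -0.1, 2.5, -0.7, 0.05};
  const arma::uword k = 3;
  arma::uvec indices = arma::regspace<arma::uvec>(0, psc.n_elem - 1);
  // Order the first k indices by ascending |psc|; the tail stays unspecified.
  std::partial_sort(indices.begin(), indices.begin() + k, indices.end(),
                    IndexCompAbsoluteAscending<arma::vec>(psc));
  for (arma::uword i = 0; i < k; ++i) { std::cout << indices[i] << ' '; }  // 4 1 0
  std::cout << '\n';
}

This is exactly the pattern `PYIterations` uses below: partially sort an index vector by a PSC column, then estimate on the head of that ordering.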
@@ -104,18 +108,17 @@ enum class SubsetEstimateResult {

 //! Compute estimator & the objective function value on a subset of the given data.
 //!
-//! @param full_data_loss the loss operating on the full data set.
-//! @param filtered_data_loss the loss operating on the residuals-filtered data.
+//! @param loss the LS loss operating on the **full** data set.
+//! @param subset an index vector (with respect to the full data) of observations to consider.
 //! @param optim the optimizer to compute the estimate.
-//! @param subset a subset of the index vector with indices to consider.
 //! @param candidates list of candidates which will be updated.
 //! @param insert_it iterator pointing to the position where to insert the new candidate. The iterator
 //!                  will be updated to point to the newly inserted element.
-//! @param subset_hashes hashes of all previously examined subsets. The hash of this subset is added to the list.
+//! @param subset_hashes hashes of all previously examined subsets. The hash of the new subset is added to the list.
 //! @return status of the minimum computed on a subset of the data.
-template <typename Optimizer>
-inline SubsetEstimateResult EstimateOnSubset(const nsoptim::LsLoss& filtered_data_loss,
-                                             const arma::subview_col<arma::uword>& subset, Optimizer* optim,
+template <typename Optimizer, typename T>
+inline SubsetEstimateResult EstimateOnSubset(const nsoptim::LsLoss& loss,
+                                             const T& subset, Optimizer* optim,
                                              alias::Optima<Optimizer>* candidates,
                                              typename alias::Optima<Optimizer>::iterator* insert_it,
                                              std::unordered_set<arma::uword>* subset_hashes);
@@ -214,10 +217,12 @@ PyResult<Optimizer> PYIterations(SLoss loss, const typename Optimizer::PenaltyFu
   }

   uvec all_indices = regspace<uvec>(0, data.n_obs() - 1);
+  uvec residuals_keep_ind = all_indices;
   HashSet residual_filtered_data_hashes { HashSequence(all_indices.n_elem - 1) };
+  HashSet subset_candidate_hashes { HashSequence(all_indices.n_elem - 1) };
   auto best_candidate_it = py_result.initial_estimates.begin();
-  std::unique_ptr<nsoptim::LsLoss> filtered_data_loss(new nsoptim::LsLoss(loss.SharedData(), loss.IncludeIntercept()));
-  PscResult<Optimizer> psc_result(Optimum(*filtered_data_loss, penalty));
+  nsoptim::LsLoss ls_loss(loss.SharedData(), loss.IncludeIntercept());
+  PscResult<Optimizer> psc_result(Optimum(ls_loss, penalty));
   arma::mat const * pscs = &full_psc_result.pscs;

   // The initial "best candidate" comes from the PSC and thus has the wrong objective function value.
@@ -228,25 +233,26 @@ PyResult<Optimizer> PYIterations(SLoss loss, const typename Optimizer::PenaltyFu
   // Start the PY iterations.
   int iter = 0;
+  decltype(best_candidate_it) insert_candidate_it;
   while (true) {
-    HashSet candidate_hashes;
     // Based on each PSC, "thin-out" the data in 3 different ways and compute the classical estimator
-    const uword psc_index_shift = std::max<uword>(pyconfig.keep_psc_proportion * filtered_data_loss->data().n_obs(),
-                                                  kMinObs);
-    uvec index_vec = arma::regspace<uvec>(0, pscs->n_rows - 1);
+    const uword subset_size = std::max<uword>(pyconfig.keep_psc_proportion * residuals_keep_ind.n_elem, kMinObs);
+    uvec subset_indices = arma::regspace<uvec>(0, residuals_keep_ind.n_elem - 1);
+    // uvec subset_index_vec = arma::regspace<uvec>(0, pscs->n_rows - 1);

-    iter_metrics->AddDetail("n_obs_psc_filtered", static_cast<int>(psc_index_shift));
+    iter_metrics->AddDetail("n_obs_psc_filtered", static_cast<int>(subset_size));

-    auto insert_candidate_it = best_candidate_it;
+    insert_candidate_it = best_candidate_it;
     for (uword psc_col = 0; psc_col < pscs->n_cols; ++psc_col) {
       auto& psc_metric = iter_metrics->CreateSubMetrics("psc_subset");

       // Sort indices according to ascending absolute values of the PSC
-      std::partial_sort(index_vec.begin(), index_vec.begin() + psc_index_shift, index_vec.end(),
-                        IndexCompAbsoluteAscending<subview_vec>(pscs->col(psc_col)));
+      std::partial_sort(subset_indices.begin(), subset_indices.begin() + subset_size,
+                        subset_indices.end(), IndexCompAbsoluteAscending<subview_vec>(pscs->col(psc_col)));

-      const auto estimate_result_absasc = EstimateOnSubset(*filtered_data_loss, index_vec.head(psc_index_shift),
-                                                           &pyinit_optim, &py_result.initial_estimates,
-                                                           &insert_candidate_it, &candidate_hashes);
+      const auto estimate_result_absasc = EstimateOnSubset(ls_loss,
+                                                           residuals_keep_ind.rows(subset_indices.head(subset_size)),
+                                                           &pyinit_optim, &py_result.initial_estimates,
+                                                           &insert_candidate_it, &subset_candidate_hashes);
       psc_metric.AddDetail("absasc_result", static_cast<int>(estimate_result_absasc));

       if (insert_candidate_it->metrics) {
@@ -255,12 +261,12 @@ PyResult<Optimizer> PYIterations(SLoss loss, const typename Optimizer::PenaltyFu
       }

       // Sort indices according to ascending values of the PSC
-      std::partial_sort(index_vec.begin(), index_vec.begin() + psc_index_shift, index_vec.end(),
-                        IndexCompAscending<subview_vec>(pscs->col(psc_col)));
-
-      const auto estimate_result_asc = EstimateOnSubset(*filtered_data_loss, index_vec.head(psc_index_shift),
-                                                        &pyinit_optim, &py_result.initial_estimates,
-                                                        &insert_candidate_it, &candidate_hashes);
+      std::partial_sort(subset_indices.begin(), subset_indices.begin() + subset_size,
+                        subset_indices.end(), IndexCompAscending<subview_vec>(pscs->col(psc_col)));
+      const auto estimate_result_asc = EstimateOnSubset(ls_loss,
+                                                        residuals_keep_ind.rows(subset_indices.head(subset_size)),
+                                                        &pyinit_optim, &py_result.initial_estimates,
+                                                        &insert_candidate_it, &subset_candidate_hashes);
       psc_metric.AddDetail("asc_result", static_cast<int>(estimate_result_asc));

       if (insert_candidate_it->metrics) {
@@ -269,12 +275,12 @@ PyResult<Optimizer> PYIterations(SLoss loss, const typename Optimizer::PenaltyFu
       }

       // Sort indices according to descending values of the PSC
-      std::partial_sort(index_vec.begin(), index_vec.begin() + psc_index_shift, index_vec.end(),
-                        IndexCompDescending<subview_vec>(pscs->col(psc_col)));
-
-      const auto estimate_result_desc = EstimateOnSubset(*filtered_data_loss, index_vec.head(psc_index_shift),
-                                                         &pyinit_optim, &py_result.initial_estimates,
-                                                         &insert_candidate_it, &candidate_hashes);
+      std::partial_sort(subset_indices.begin(), subset_indices.begin() + subset_size,
+                        subset_indices.end(), IndexCompDescending<subview_vec>(pscs->col(psc_col)));
+      const auto estimate_result_desc = EstimateOnSubset(ls_loss,
+                                                         residuals_keep_ind.rows(subset_indices.head(subset_size)),
+                                                         &pyinit_optim, &py_result.initial_estimates,
+                                                         &insert_candidate_it, &subset_candidate_hashes);
       psc_metric.AddDetail("desc_result", static_cast<int>(estimate_result_desc));

       if (insert_candidate_it->metrics) {
@@ -293,11 +299,11 @@ PyResult<Optimizer> PYIterations(SLoss loss, const typename Optimizer::PenaltyFu
     arma::vec best_candidate_residuals;
     double best_candidate_mscale;

     // Skip the first element since this is the `best_candidate_it` and already evaluated and iterate just after the
     // last inserted element.
-    ++insert_candidate_it;
-    for (auto cand_it = ++py_result.initial_estimates.begin(); cand_it != insert_candidate_it; ++cand_it) {
+    auto end_check_candidate_it = insert_candidate_it;
+    ++end_check_candidate_it;
+    for (auto cand_it = ++py_result.initial_estimates.begin(); cand_it != end_check_candidate_it; ++cand_it) {
       // Compute the S-loss of the candidate
       const arma::vec candidate_residuals = data.cy() - data.cx() * cand_it->coefs.beta - cand_it->coefs.intercept;
       const auto candidate_eval = loss.EvaluateResiduals(candidate_residuals);
@@ -332,33 +338,39 @@ PyResult<Optimizer> PYIterations(SLoss loss, const typename Optimizer::PenaltyFu
     iter_metrics = &py_result.metrics.CreateSubMetrics("py_iteration");

     // Move the new best candidate to the front of the list.
-    std::swap_ranges(new_best_candidate_it, ++new_best_candidate_it, py_result.initial_estimates.begin());
-    best_candidate_it = py_result.initial_estimates.begin();
-    insert_candidate_it = best_candidate_it;
+    if (insert_candidate_it != new_best_candidate_it) {
+      std::iter_swap(new_best_candidate_it, py_result.initial_estimates.begin());
+      best_candidate_it = py_result.initial_estimates.begin();
+    } else {
+      // If the last inserted element is the new best candidate, we need to ensure that `insert_candidate_it`
+      // still points to the last element for this iteration after the swap!
+      insert_candidate_it = py_result.initial_estimates.begin();
+      std::iter_swap(new_best_candidate_it, insert_candidate_it);
+      best_candidate_it = py_result.initial_estimates.begin();
+    }

     // Use the new best solution to thin-out the data based on the residuals and compute new PSCs.
-    const uvec keep_ind = GetResidualKeepIndices(best_candidate_residuals, best_candidate_mscale, pyconfig,
-                                                 &all_indices);
+    residuals_keep_ind = GetResidualKeepIndices(best_candidate_residuals, best_candidate_mscale, pyconfig,
+                                                &all_indices);

     // Store the hashed index vector to avoid unnecessary iterations.
-    const auto hash_insert = residual_filtered_data_hashes.insert(HashIndexVector(keep_ind));
+    const auto hash_insert = residual_filtered_data_hashes.insert(HashIndexVector(residuals_keep_ind));
     if (!hash_insert.second) {
       // The resulting subset of observations was already used before.
-      // Therefore, the next iteration would result in the exact same solution and we can
-      // stop the iterations early.
+      // Therefore, the next iteration would result in duplicate solutions and we can stop.
       iter_metrics->AddMetric("stop", "filtered residuals are duplicates");
       break;
     }

     // This subset of observations was not yet considered. Continue.
-    filtered_data_loss.reset(new nsoptim::LsLoss(std::make_shared<PredictorResponseData>(data.Observations(keep_ind)),
-                                                 loss.IncludeIntercept()));
-    pyinit_optim.loss(*filtered_data_loss);
+    nsoptim::LsLoss filtered_ls_loss(std::make_shared<PredictorResponseData>(data.Observations(residuals_keep_ind)),
+                                     loss.IncludeIntercept());
+    pyinit_optim.loss(filtered_ls_loss);

-    iter_metrics->AddDetail("n_obs", static_cast<int>(keep_ind.n_elem));
+    iter_metrics->AddDetail("n_obs", static_cast<int>(residuals_keep_ind.n_elem));

     // Compute the optimizer and the PSCs on the residual-filtered data
-    psc_result = PrincipalSensitiviyComponents(*filtered_data_loss, pyinit_optim);
+    psc_result = PrincipalSensitiviyComponents(filtered_ls_loss, pyinit_optim);
     if (psc_result.optimum.metrics) {
       iter_metrics->AddSubMetrics(std::move(*psc_result.optimum.metrics));
@@ -396,13 +408,37 @@ PyResult<Optimizer> PYIterations(SLoss loss, const typename Optimizer::PenaltyFu
     // We continue the iterations, so remove all but the best estimate.
     // py_result.initial_estimates.resize(1, *best_candidate_it);
     pscs = &psc_result.pscs;
+    insert_candidate_it = best_candidate_it;
+  }
+
+  // Only retain the estimates from the last iteration plus the estimates that are within 10% of the best solution
+  // (up to at most the number of estimates from the last iteration).
+  // `insert_candidate_it` points to the last element inserted in the last iteration.
+  if (pyconfig.retain_best_factor > 1) {
+    const double objf_value_cutoff = pyconfig.retain_best_factor * best_candidate_it->objf_value;
+    while (true) {
+      const auto current_end = py_result.initial_estimates.end();
+      if (insert_candidate_it == current_end) {
+        break;
+      }
+
+      auto tmp_it = insert_candidate_it;
+      // Search for either the last element or the next element that is "good enough".
+      int removing = 0;
+      while (++tmp_it != current_end && tmp_it->objf_value > objf_value_cutoff) { ++removing; }
+
+      if (removing > 0) {
+        py_result.initial_estimates.erase_after(insert_candidate_it, tmp_it);
+      }
+      insert_candidate_it = tmp_it;
+    }
   }

   return py_result;
 }
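
The pruning loop added above erases runs of "too bad" candidates with `erase_after(pos, last)`, which removes the open range `(pos, last)` and only exists on singly linked lists, so `initial_estimates` is presumably a `std::forward_list`-like container with the best candidate at the front. A standalone analog on plain doubles (hypothetical objective values, default factor 1.1):

#include <forward_list>
#include <iostream>

int main() {
  std::forward_list<double> objf_values {1.00, 1.31, 1.05, 1.62, 1.08};
  const double cutoff = 1.1 * objf_values.front();  // retain_best_factor = 1.1

  auto last_kept = objf_values.begin();  // analog of `insert_candidate_it`
  while (true) {
    if (last_kept == objf_values.end()) { break; }
    auto tmp_it = last_kept;
    int removing = 0;
    // Step past the run of candidates that are worse than the cutoff.
    while (++tmp_it != objf_values.end() && *tmp_it > cutoff) { ++removing; }
    if (removing > 0) {
      // Erase the open range (last_kept, tmp_it): the "too bad" candidates.
      objf_values.erase_after(last_kept, tmp_it);
    }
    last_kept = tmp_it;
  }

  for (const double v : objf_values) { std::cout << v << ' '; }  // 1 1.05 1.08
  std::cout << '\n';
}

Because only elements strictly between `last_kept` and `tmp_it` are erased, both iterators stay valid after each `erase_after`, and `tmp_it` becomes the anchor for the next pass.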
-template <typename Optimizer>
-inline SubsetEstimateResult EstimateOnSubset(const nsoptim::LsLoss& filtered_data_loss,
-                                             const arma::subview_col<arma::uword>& subset, Optimizer* optim,
+template <typename Optimizer, typename T>
+inline SubsetEstimateResult EstimateOnSubset(const nsoptim::LsLoss& loss,
+                                             const T& subset, Optimizer* optim,
                                              alias::Optima<Optimizer>* candidates,
                                              typename alias::Optima<Optimizer>::iterator* insert_it,
                                              std::unordered_set<arma::uword>* subset_hashes) {
@@ -413,8 +449,8 @@ inline SubsetEstimateResult EstimateOnSubset(const nsoptim::LsLoss& filtered_dat
   if (subset_hashes->find(siv_hash) == subset_hashes->end()) {
     // The indices are unique. Compute estimator and store hash to avoid this subset in the future.
     optim->loss(nsoptim::LsLoss(
-      std::make_shared<nsoptim::PredictorResponseData>(filtered_data_loss.data().Observations(sorted_index_vec)),
-      filtered_data_loss.IncludeIntercept()));
+      std::make_shared<nsoptim::PredictorResponseData>(loss.data().Observations(sorted_index_vec)),
+      loss.IncludeIntercept()));
     try {
       // We don't want to visit this subset ever again. Even if there's an error, we assume it's the subset's fault.
       subset_hashes->insert(siv_hash);
...
@@ -8,6 +8,7 @@
 #include "r_robust_utils.hpp"