Commit ff5a2043 authored by davidkep

update

parent 1082eb4b
@@ -43,11 +43,13 @@ mest_options <- function (max_it = 200, eps = 1e-8) {
#' @param eps numerical tolerance to check for convergence.
#'
#' @return options for the S-Estimate algorithm.
#' @export
s_algo_options <- function (explore_it = 10, max_it = 500,
eps = 1e-8) {
s_algo_options <- function (explore_it = 10, max_it = 500, eps = 1e-8,
tightening = c('none', 'adaptive', 'exponential'),
tightening_steps = 10) {
list(max_it = as.integer(max_it[[1L]]),
eps = as.numeric(eps[[1L]]),
tightening = .tightening_id(match.arg(tightening)),
tightening_steps = as.integer(tightening_steps[[1L]]),
cold_explore_it = as.integer(explore_it[[1L]]))
}
@@ -163,3 +165,8 @@ en_ridge_options <- function () {
1L)
}
.tightening_id <- function (tightening) {
switch (tightening, exponential = 1L, adaptive = 2L, 0L)
}
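For illustration, a minimal usage sketch of the extended options (the output structure follows directly from the code above; `.tightening_id()` maps 'none' -> 0L, 'exponential' -> 1L, 'adaptive' -> 2L):

    opts <- s_algo_options(explore_it = 5, tightening = 'adaptive',
                           tightening_steps = 5)
    str(opts)
    #> List of 5
    #>  $ max_it          : int 500
    #>  $ eps             : num 1e-08
    #>  $ tightening      : int 2
    #>  $ tightening_steps: int 5
    #>  $ cold_explore_it : int 5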
@@ -4,7 +4,8 @@
enpy_initial_estimates <- function (x, y, alpha, lambdas, bdp = 0.25, cc,
include_intercept = TRUE,
enpy_opts = enpy_options(),
mscale_maxit, mscale_eps, en_options) {
mscale_maxit = 200, mscale_eps = 1e-8,
en_options) {
if (missing(cc)) {
cc <- .bisquare_consistency_const(bdp)
}
@@ -33,19 +34,17 @@ enpy_initial_estimates <- function (x, y, alpha, lambdas, bdp = 0.25, cc,
en_options <- NULL
}
optional_args <- list()
optional_args <- list(en_options = en_options)
optional_args$en_options <- if (is.null(en_options)) {
if (alpha > 0) {
if (is.null(en_options)) {
optional_args$en_options <- if (alpha > 0) {
en_admm_options()
} else {
en_ridge_options()
}
} else {
en_options
}
if (optional_args$en_options$en_algorithm == 'admm') {
if (optional_args$en_options$algorithm == 'admm') {
optional_args$en_options <- .choose_admm_algorithm(optional_args$en_options,
alpha, x)
}
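Untangled, the new selection logic for the optional EN options reads roughly as follows (a sketch reconstructed from the hunk above, not a verbatim excerpt):

    # If `en_options` is not supplied, pick a default solver based on `alpha`:
    # ADMM for a genuine elastic net penalty, the closed-form ridge solver
    # for a pure ridge penalty (alpha == 0).
    optional_args$en_options <- if (is.null(en_options)) {
      if (alpha > 0) {
        en_admm_options()
      } else {
        en_ridge_options()
      }
    } else {
      en_options
    }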
......
@@ -29,7 +29,9 @@
#' @export
pense <- function(x, y, alpha, lambdas, cold_lambdas, penalty_loadings,
additional_initial_estimates, include_intercept = TRUE,
max_it = 200, eps = 1e-5, explore_it = 10, en_algorithm_opts,
max_it = 200, eps = 1e-5, explore_it = 10,
tightening = c('none', 'adaptive', 'exponential'),
tightening_steps = 10L, en_algorithm_opts,
mest_opts = mest_options(), enpy_opts = enpy_options()) {
optional_args <- list()
@@ -39,19 +41,17 @@ pense <- function(x, y, alpha, lambdas, cold_lambdas, penalty_loadings,
x_dim <- dim(x)
if (length(y) != x_dim[[1L]]) {
stop("Number of observations does not match between `x` and `y`.")
stop("Number of observations in `x` and `y` does not match.")
}
alpha <- as.numeric(alpha[[1L]])
lambdas <- sort(as.numeric(lambdas), decreasing = TRUE)
pense_opts <- list(
max_it = as.integer(max_it[[1L]]),
eps = as.numeric(eps[[1L]]),
cold_explore_it = as.integer(explore_it[[1L]]),
intercept = isTRUE(include_intercept),
mscale = mest_opts
)
pense_opts <- c(
list(mm_options = s_algo_options(explore_it = explore_it, max_it = max_it,
eps = eps, tightening = match.arg(tightening),
tightening_steps = tightening_steps)),
list(intercept = isTRUE(include_intercept), mscale = mest_opts))
if (alpha < sqrt(.Machine$double.eps)) {
alpha <- 0
......
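The S-algorithm settings previously kept at the top level of `pense_opts` are now grouped under `mm_options` via `s_algo_options()`; a hedged sketch of the resulting structure (the contents of `mscale` depend on `mest_options()` and are not shown in full here):

    str(pense_opts)
    #> List of 3
    #>  $ mm_options: List of 5   # max_it, eps, tightening, tightening_steps, cold_explore_it
    #>  $ intercept : logi TRUE
    #>  $ mscale    : List of ...  # from mest_options()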
CXX_STD = CXX11
PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) # -flto=thin
#PKG_CPPFLAGS= -D__STDC_LIMIT_MACROS -DHAVE_RCPP -DNSOPTIM_DETAILED_METRICS -DTESTTHAT_DISABLED
PKG_CPPFLAGS= -D__STDC_LIMIT_MACROS -DHAVE_RCPP -DNSOPTIM_DETAILED_METRICS -DTESTTHAT_DISABLED
PKG_CXXFLAGS= -fstrict-aliasing -Wstrict-aliasing
PKG_OPTFLAGS= -g -Os
PKG_OPTFLAGS= -g -O0
# -DNSOPTIM_METRICS_DISABLED -DNSOPTIM_DETAILED_METRICS
PKG_CPPFLAGS= -D__STDC_LIMIT_MACROS -DHAVE_RCPP -DTESTTHAT_DISABLED
# PKG_CPPFLAGS= -D__STDC_LIMIT_MACROS -DHAVE_RCPP -DTESTTHAT_DISABLED
@@ -15,7 +15,7 @@ namespace pense {
constexpr double kDefaultConvergenceTolerance = 1e-6;
//! The threshold for any numeric value to be considered 0.
constexpr double kNumericZero = 1e-14;
constexpr double kNumericZero = 1e-12;
//! Integer IDs for the supported rho-functions
enum class RhoFunctionType {
......
@@ -129,7 +129,7 @@ inline SubsetEstimateResult EstimateOnSubset(const nsoptim::LsLoss& filtered_dat
//! @param pyconfig configuration object.
template<typename Optimizer>
PyResult<Optimizer> PYIterations(SLoss loss, const typename Optimizer::PenaltyFunction& penalty,
const PscResult<Optimizer>& full_psc_result, const Optimizer& optim,
const PscResult<Optimizer>& full_psc_result, Optimizer optim,
const enpy_initest_internal::PyConfiguration& pyconfig);
} // namespace enpy_initest_internal
@@ -160,6 +160,7 @@ alias::FwdList<PyResult<Optimizer>> PenaYohaiInitialEstimators(
py_initest_res_it, enpy_initest_internal::PYIterations(loss, *penalty_it, psc_result, optim, pyconfig));
} else {
py_initest_res_it = py_initest_results.emplace_after(py_initest_res_it);
py_initest_res_it->metrics.AddMetric("pscs", static_cast<int>(psc_result.pscs.n_cols));
py_initest_res_it->metrics.AddMetric("psc_status", static_cast<int>(psc_result.status));
py_initest_res_it->metrics.AddMetric("psc_message", psc_result.status_message);
py_initest_res_it->metrics.AddDetail("psc_loo_errors", psc_result.loo_errors);
@@ -173,7 +174,7 @@
namespace enpy_initest_internal {
template<typename Optimizer>
PyResult<Optimizer> PYIterations(SLoss loss, const typename Optimizer::PenaltyFunction& penalty,
const PscResult<Optimizer>& full_psc_result, const Optimizer& optim,
const PscResult<Optimizer>& full_psc_result, Optimizer pyinit_optim,
const enpy_initest_internal::PyConfiguration& pyconfig) {
using arma::uvec;
using arma::vec;
@@ -193,11 +194,14 @@ PyResult<Optimizer> PYIterations(SLoss loss, const typename Optimizer::PenaltyFu
// Add details about the PSCs from the full data set.
Metrics* iter_metrics = &py_result.metrics.CreateSubMetrics("full_data");
if (full_psc_result.optimum.metrics) {
iter_metrics->AddSubMetrics(*full_psc_result.optimum.metrics);
}
iter_metrics->AddMetric("psc_status", static_cast<int>(full_psc_result.status));
iter_metrics->AddMetric("pscs", static_cast<int>(full_psc_result.pscs.n_cols));
iter_metrics->AddDetail("n_obs", static_cast<int>(data.n_obs()));
iter_metrics->AddDetail("psc_loo_errors", full_psc_result.loo_errors);
iter_metrics->AddDetail("psc_loo_warnings", full_psc_result.loo_warnings);
iter_metrics->AddDetail("pscs", static_cast<int>(full_psc_result.pscs.n_cols));
if (full_psc_result.status != PscStatus::kOk) {
iter_metrics->AddMetric("psc_message", full_psc_result.status_message);
@@ -213,7 +217,6 @@ PyResult<Optimizer> PYIterations(SLoss loss, const typename Optimizer::PenaltyFu
HashSet residual_filtered_data_hashes { HashSequence(all_indices.n_elem - 1) };
auto best_candidate_it = py_result.initial_estimates.begin();
std::unique_ptr<nsoptim::LsLoss> filtered_data_loss(new nsoptim::LsLoss(loss.SharedData(), loss.IncludeIntercept()));
Optimizer pyinit_optim(optim);
PscResult<Optimizer> psc_result(Optimum(*filtered_data_loss, penalty));
arma::mat const * pscs = &full_psc_result.pscs;
@@ -237,15 +240,19 @@ PyResult<Optimizer> PYIterations(SLoss loss, const typename Optimizer::PenaltyFu
auto insert_candidate_it = best_candidate_it;
for (uword psc_col = 0; psc_col < pscs->n_cols; ++psc_col) {
auto& psc_metric = iter_metrics->CreateSubMetrics("psc_subset");
// Sort indices according to ascending absolute values of the PSC
std::partial_sort(index_vec.begin(), index_vec.begin() + psc_index_shift, index_vec.end(),
IndexCompAbsoluteAscending<subview_vec>(pscs->col(psc_col)));
const auto estimate_result_absasc = EstimateOnSubset(*filtered_data_loss, index_vec.head(psc_index_shift),
&pyinit_optim, &py_result.initial_estimates,
&insert_candidate_it, &candidate_hashes);
iter_metrics->AddDetail("subset_absasc", static_cast<int>(estimate_result_absasc));
psc_metric.AddDetail("absasc_result", static_cast<int>(estimate_result_absasc));
if (insert_candidate_it->metrics) {
psc_metric.AddSubMetrics(std::move(*insert_candidate_it->metrics));
insert_candidate_it->metrics.reset();
}
// Sort indices according to ascending values of the PSC
std::partial_sort(index_vec.begin(), index_vec.begin() + psc_index_shift, index_vec.end(),
@@ -255,7 +262,11 @@ PyResult<Optimizer> PYIterations(SLoss loss, const typename Optimizer::PenaltyFu
&pyinit_optim, &py_result.initial_estimates,
&insert_candidate_it, &candidate_hashes);
iter_metrics->AddDetail("subset_ascending", static_cast<int>(estimate_result_asc));
psc_metric.AddDetail("asc_result", static_cast<int>(estimate_result_asc));
if (insert_candidate_it->metrics) {
psc_metric.AddSubMetrics(std::move(*insert_candidate_it->metrics));
insert_candidate_it->metrics.reset();
}
// Sort indices according to descending values of the PSC
std::partial_sort(index_vec.begin(), index_vec.begin() + psc_index_shift, index_vec.end(),
@@ -265,7 +276,11 @@ PyResult<Optimizer> PYIterations(SLoss loss, const typename Optimizer::PenaltyFu
&pyinit_optim, &py_result.initial_estimates,
&insert_candidate_it, &candidate_hashes);
iter_metrics->AddDetail("subset_descending", static_cast<int>(estimate_result_desc));
psc_metric.AddDetail("desc_result", static_cast<int>(estimate_result_desc));
if (insert_candidate_it->metrics) {
psc_metric.AddSubMetrics(std::move(*insert_candidate_it->metrics));
insert_candidate_it->metrics.reset();
}
}
// Check if we are at the end of our iterations
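As a rough R sketch, each PSC column yields three candidate subsets; `pscs`, `psc_col`, and `subset_size` stand in for the variables above (`subset_size` corresponds to `psc_index_shift`):

    # Take the observations with the smallest absolute, smallest, and
    # largest PSC values, respectively; each subset yields one candidate
    # estimate via the LS-EN optimizer.
    psc <- pscs[, psc_col]
    keep_absasc <- order(abs(psc))[seq_len(subset_size)]
    keep_asc    <- order(psc)[seq_len(subset_size)]
    keep_desc   <- order(psc, decreasing = TRUE)[seq_len(subset_size)]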
@@ -277,13 +292,17 @@ PyResult<Optimizer> PYIterations(SLoss loss, const typename Optimizer::PenaltyFu
auto new_best_candidate_it = py_result.initial_estimates.begin();
arma::vec best_candidate_residuals;
double best_candidate_mscale;
// Skip the first element since this is the `best_candidate_it` and already evaluated.
for (auto cand_it = ++py_result.initial_estimates.begin(), cand_end = py_result.initial_estimates.end();
cand_it != cand_end; ++cand_it) {
// Skip the first element, since this is the `best_candidate_it` and has already been evaluated, and
// iterate only up to the last inserted element.
++insert_candidate_it;
for (auto cand_it = ++py_result.initial_estimates.begin(); cand_it != insert_candidate_it; ++cand_it) {
// Compute the S-loss of the candidate
const arma::vec candidate_residuals = data.cy() - data.cx() * cand_it->coefs.beta - cand_it->coefs.intercept;
const auto candidate_eval = loss.EvaluateResiduals(candidate_residuals);
cand_it->objf_value = candidate_eval.loss + penalty.Evaluate(cand_it->coefs);
if (cand_it->objf_value < new_best_candidate_it->objf_value) {
new_best_candidate_it = cand_it;
best_candidate_residuals = candidate_residuals;
@@ -293,6 +312,7 @@ PyResult<Optimizer> PYIterations(SLoss loss, const typename Optimizer::PenaltyFu
// Check if the best candidate is still the first element, i.e., didn't change.
if (new_best_candidate_it == best_candidate_it) {
iter_metrics->AddMetric("stop", "no change in best candidate");
break;
}
@@ -301,15 +321,17 @@ PyResult<Optimizer> PYIterations(SLoss loss, const typename Optimizer::PenaltyFu
const double rel_diff = std::abs(
loss.Evaluate(best_candidate_it->coefs) + penalty.Evaluate(best_candidate_it->coefs) -
loss.Evaluate(new_best_candidate_it->coefs) - penalty.Evaluate(new_best_candidate_it->coefs));
iter_metrics->AddDetail("rel_diff", rel_diff);
if (rel_diff < pyconfig.eps) {
// The previous best candidate is (almost) identical to the new best candidate. We are done!
iter_metrics->AddMetric("stop", "best candidate is almost identical");
break;
}
iter_metrics->AddDetail("rel_diff", rel_diff);
iter_metrics = &py_result.metrics.CreateSubMetrics("py_iteration");
// Move the new best candidate to the front of the list and remove the rest.
// Move the new best candidate to the front of the list.
std::swap_ranges(new_best_candidate_it, ++new_best_candidate_it, py_result.initial_estimates.begin());
best_candidate_it = py_result.initial_estimates.begin();
insert_candidate_it = best_candidate_it;
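The convergence check compares the penalized S-objective of the previous and the new best candidate; in R pseudo-terms (with a hypothetical `objective()` helper, not part of the package):

    # Stop once consecutive best candidates attain (almost) the same
    # penalized objective value.
    if (abs(objective(best_old) - objective(best_new)) < eps) break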
@@ -324,34 +346,36 @@ PyResult<Optimizer> PYIterations(SLoss loss, const typename Optimizer::PenaltyFu
// The resulting subset of observations was already used before.
// Therefore, the next iteration would result in the exact same solution and we can
// stop the iterations early.
iter_metrics->AddDetail("duplicate", 1);
iter_metrics->AddMetric("stop", "filtered residuals are duplicates");
break;
}
// We continue the iterations, so remove all but the best estimate.
py_result.initial_estimates.resize(1, *best_candidate_it);
// This subset of observations was not yet considered. Continue.
filtered_data_loss.reset(new nsoptim::LsLoss(std::make_shared<PredictorResponseData>(data.Observations(keep_ind)),
loss.IncludeIntercept()));
pyinit_optim.loss(*filtered_data_loss);
iter_metrics->AddDetail("n_obs", static_cast<int>(data.n_obs()));
iter_metrics->AddDetail("n_obs", static_cast<int>(keep_ind.n_elem));
// Compute the optimum and the PSCs on the residual-filtered data
psc_result = PrincipalSensitiviyComponents(*filtered_data_loss, pyinit_optim);
if (psc_result.optimum.metrics) {
iter_metrics->AddSubMetrics(std::move(*psc_result.optimum.metrics));
}
iter_metrics->AddMetric("psc_status", static_cast<int>(psc_result.status));
iter_metrics->AddMetric("pscs", static_cast<int>(psc_result.pscs.n_cols));
iter_metrics->AddDetail("psc_loo_errors", psc_result.loo_errors);
iter_metrics->AddDetail("psc_loo_warnings", psc_result.loo_warnings);
iter_metrics->AddDetail("pscs", static_cast<int>(psc_result.pscs.n_cols));
if (psc_result.status != PscStatus::kOk) {
iter_metrics->AddMetric("psc_message", psc_result.status_message);
}
if (psc_result.status == PscStatus::kError) {
return py_result;
iter_metrics->AddMetric("stop", "could not compute PSCs");
break;
}
// Check if the LS estimate computed on the filtered data is better than the previous best candidate.
@@ -360,15 +384,17 @@ PyResult<Optimizer> PYIterations(SLoss loss, const typename Optimizer::PenaltyFu
// If so, replace the best candidate by this estimate.
py_result.initial_estimates.push_front(std::move(psc_result.optimum));
best_candidate_it = py_result.initial_estimates.begin();
py_result.initial_estimates.erase_after(best_candidate_it);
}
if (psc_result.pscs.n_cols < 2) {
// Either no PSCs or only a single one has been found.
// If only a single PSC is found, all LOO coefficients are identical, probably the 0-vector!
return py_result;
iter_metrics->AddMetric("stop", "single PSC");
break;
}
// We continue the iterations, so remove all but the best estimate.
// py_result.initial_estimates.resize(1, *best_candidate_it);
pscs = &psc_result.pscs;
}
return py_result;
......
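In outline, `PYIterations()` implements the following loop (R pseudo-code with hypothetical helper names, not actual package functions):

    repeat {
      # Three candidate estimates per PSC column (see the subset sketch above).
      candidates <- candidate_estimates(pscs, data)
      best <- candidates[[which.min(vapply(candidates, objective, numeric(1)))]]
      # Stop on no improvement, an (almost) identical objective, a duplicate
      # residual-filtered subset, a PSC failure, or fewer than two PSCs.
      if (stop_criterion(best)) break
      # Otherwise filter observations by the best candidate's residuals and
      # recompute the PSCs on the filtered data.
      pscs <- principal_sensitivity_components(filter_by_residuals(data, best))
    }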
@@ -10,6 +10,7 @@
#include "enpy_psc.hpp"
using arma::mat;
using arma::uword;
using arma::vec;
using arma::uvec;
using arma::eig_sym;
@@ -34,19 +35,25 @@ void FinalizePSC(const mat& sensitivity_matrix, PscResult* psc_result) {
return;
}
// Only use the Eigenvectors with non-zero Eigenvalue.
uvec shed_eigenvectors = find(eigenvalues < kNumericZero);
if (shed_eigenvectors.n_elem == eigenvalues.n_elem) {
// Only use the Eigenvectors with "non-zero" Eigenvalue.
// "Non-zero" means Eigenvalues larger than the numerical tolerance times the largest Eigenvalue.
uword cutoff_index = eigenvalues.n_elem - 1;
if (eigenvalues[cutoff_index] < kNumericZero) {
psc_result->pscs.reset();
psc_result->status = PscStatus::kError;
psc_result->status_message.append("all Eigenvalues are zero");
return;
}
const double cutoff_threshold = kNumericZero * eigenvalues[cutoff_index];
// Determine the index at which the Eigenvalues become too small (the largest one was already checked above, cf. line 42).
while (cutoff_index > 0 && eigenvalues[--cutoff_index] > cutoff_threshold) {}
// Hide the Eigenvectors for zero Eigenvalues
if (shed_eigenvectors.n_elem > 0) {
if (cutoff_index > 0) {
// Eigenvalues are ordered ascending, so we can simply strip all the eigenvectors for the first `k` eigenvalues.
eigenvectors.shed_cols(0, shed_eigenvectors[shed_eigenvectors.n_elem - 1]);
eigenvectors.shed_cols(0, cutoff_index);
}
// Project the sensitivity vectors onto the Eigenvectors (eq. 11-12 in the paper)
psc_result->pscs = sensitivity_matrix * eigenvectors;
......
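A minimal R sketch of the revised cutoff, assuming the eigendecomposition is taken of the cross-product of the sensitivity matrix `R` (cf. eqs. 11-12 in Pena & Yohai, 1999) with eigenvalues in ascending order, as with Armadillo's `eig_sym`:

    keep_pscs <- function(R, numeric_zero = 1e-12) {
      ed <- eigen(crossprod(R), symmetric = TRUE)
      ord <- order(ed$values)                    # ascending, as in eig_sym
      evals <- ed$values[ord]
      evecs <- ed$vectors[, ord, drop = FALSE]
      n <- length(evals)
      if (evals[[n]] < numeric_zero) {
        stop("all Eigenvalues are zero")
      }
      # "Non-zero" is now relative: larger than the tolerance times the
      # largest eigenvalue, instead of a fixed absolute threshold.
      cutoff <- numeric_zero * evals[[n]]
      R %*% evecs[, evals > cutoff, drop = FALSE]  # project: these are the PSCs
    }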
@@ -42,90 +42,6 @@ class PscResult : public enpy_psc_internal::PscResult {
typename T::Optimum optimum;
};
//! Compute the Principal Sensitivity Components as defined in
//! Pena, D. & Yohai, V. (1999). A Fast Procedure for Outlier Diagnostics in Large Regression
//! Problems. JASA.
//!
//! @param loss the S-loss object to compute the PSCs for.
//! @param optim the optimizer to use to compute leave-one-out residuals.
//! @return a matrix of PSCs.
template<typename Optimizer>
PscResult<Optimizer> PrincipalSensitiviyComponents(const nsoptim::LsLoss& loss, const Optimizer& optim) {
using Optimum = typename Optimizer::Optimum;
// Ensure that the Optimizer is using an LS-loss.
static_assert(std::is_same<typename Optimizer::LossFunction, nsoptim::LsLoss>::value,
"Optimizer must use the LS loss function");
// Copy the optimizer. A shallow copy is fine.
PscResult<Optimizer> psc_result(Optimum(loss, optim.penalty()));
const nsoptim::PredictorResponseData& data = loss.data();
Optimizer psc_optimizer(optim);
arma::mat sensitivity_matrix(data.n_obs(), data.n_obs()); // this is matrix R in the paper
// First optimize with respect to the full data set and compute the predictions.
psc_optimizer.loss(loss);
arma::vec full_predictions;
psc_result.optimum = psc_optimizer.Optimize();
full_predictions = data.cx() * psc_result.optimum.coefs.beta + psc_result.optimum.coefs.intercept;
// Now create the LOO data set by removing the first observation.
std::shared_ptr<nsoptim::PredictorResponseData> loo_data =
std::make_shared<nsoptim::PredictorResponseData>(data.TailRows(data.n_obs() - 1));
// This `loo_loss` holds a shared pointer to the LOO data; therefore, the data used by the loss changes
// in accordance with `loo_data`!
// It is nevertheless important to change the loss for the optimizer whenever the data changes to inform the
// optimizer about the changes!
nsoptim::LsLoss loo_loss(loo_data, loss.IncludeIntercept());
// Estimate the regression coefficients with the first observation omitted.
// Set the loss to the loss with the LOO data.
psc_optimizer.loss(loo_loss);
const auto loo_optimum = psc_optimizer.Optimize();
if (loo_optimum.status == nsoptim::OptimumStatus::kError) {
++psc_result.loo_errors;
psc_result.status = PscStatus::kWarning;
psc_result.status_message.append("Can not compute LOO residuals: ");
psc_result.status_message.append(loo_optimum.message);
psc_result.status_message.append("; ");
}
if (loo_optimum.status == nsoptim::OptimumStatus::kWarning) {
++psc_result.loo_warnings;
}
sensitivity_matrix.col(0) = full_predictions - data.cx() * loo_optimum.coefs.beta - loo_optimum.coefs.intercept;
// Now compute the regression coefficients with each of the other observations omitted.
for (arma::uword loo_index = 0; loo_index < data.cx().n_rows - 1; ++loo_index) {
// Omit the next row.
loo_data->x().row(loo_index) = data.cx().row(loo_index);
loo_data->y()[loo_index] = data.cy()[loo_index];
// Set the loss to the loss with the LOO data.
psc_optimizer.loss(loo_loss);
const auto loo_optimum = psc_optimizer.Optimize();
if (loo_optimum.status == nsoptim::OptimumStatus::kError) {
++psc_result.loo_errors;
psc_result.status = PscStatus::kWarning;
psc_result.status_message.append("Can not compute LOO residuals: ");
psc_result.status_message.append(loo_optimum.message);
psc_result.status_message.append("; ");
}
if (loo_optimum.status == nsoptim::OptimumStatus::kWarning) {
++psc_result.loo_warnings;
psc_result.status = PscStatus::kWarning;
}
sensitivity_matrix.col(loo_index + 1) = full_predictions - loo_optimum.coefs.intercept -
data.cx() * loo_optimum.coefs.beta;
}
enpy_psc_internal::FinalizePSC(sensitivity_matrix, &psc_result);
return psc_result;
}
//! Compute the Principal Sensitivity Components as defined in
//! Pena, D. & Yohai, V. (1999). A Fast Procedure for Outlier Diagnostics in Large Regression
//! Problems. JASA.
@@ -138,11 +54,8 @@ PscResult<Optimizer> PrincipalSensitiviyComponents(const nsoptim::LsLoss& loss,
template<typename Optimizer>
alias::FwdList<PscResult<Optimizer>> PrincipalSensitiviyComponents(
const nsoptim::LsLoss& loss, const alias::FwdList<typename Optimizer::PenaltyFunction>& penalties,
const Optimizer& optim) {
// Copy the optimizer. A shallow copy is fine.
Optimizer psc_optimizer) {
const nsoptim::PredictorResponseData& data = loss.data();
Optimizer psc_optimizer(optim);
alias::FwdList<PscResult<Optimizer>> psc_results;
alias::FwdList<arma::mat> sensitivity_matrices; // these are the matrices `R` in the paper, one for each penalty.
@@ -155,6 +68,15 @@ alias::FwdList<PscResult<Optimizer>> PrincipalSensitiviyComponents(
psc_optimizer.penalty(penalty);
psc_result_it = psc_results.emplace_after(psc_result_it, psc_optimizer.Optimize());
// Retain metrics from all LOO fits.
nsoptim::Metrics loo_metrics("loo_fits");
if (psc_result_it->optimum.metrics) {
psc_result_it->optimum.metrics->AddDetail("loo_index", -1);
loo_metrics.AddSubMetrics(std::move(*(psc_result_it->optimum.metrics)));
}
psc_result_it->optimum.metrics.reset(new nsoptim::Metrics(std::move(loo_metrics)));
switch (psc_result_it->optimum.status) {
case nsoptim::OptimumStatus::kError:
++(psc_result_it->loo_errors);
@@ -193,7 +115,12 @@ alias::FwdList<PscResult<Optimizer>> PrincipalSensitiviyComponents(
for (auto&& penalty : penalties) {
if (psc_result_it->status != PscStatus::kError) {
psc_optimizer.penalty(penalty);
const auto loo_optimum = psc_optimizer.Optimize();
auto loo_optimum = psc_optimizer.Optimize();
if (loo_optimum.metrics) {
loo_optimum.metrics->AddDetail("loo_index", 0);
psc_result_it->optimum.metrics->AddSubMetrics(std::move(*loo_optimum.metrics));
}
switch (loo_optimum.status) {
case nsoptim::OptimumStatus::kError:
@@ -230,7 +157,12 @@ alias::FwdList<PscResult<Optimizer>> PrincipalSensitiviyComponents(
for (auto&& penalty : penalties) {
if (psc_result_it->status != PscStatus::kError) {
psc_optimizer.penalty(penalty);
const auto loo_optimum = psc_optimizer.Optimize();
auto loo_optimum = psc_optimizer.Optimize();
if (loo_optimum.metrics) {
loo_optimum.metrics->AddDetail("loo_index", static_cast<int>(loo_index) + 1);
psc_result_it->optimum.metrics->AddSubMetrics(std::move(*loo_optimum.metrics));
}
switch (loo_optimum.status) {
case nsoptim::OptimumStatus::kError:
@@ -266,6 +198,113 @@ alias::FwdList<PscResult<Optimizer>> PrincipalSensitiviyComponents(
return psc_results;
}
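The columns of the sensitivity matrix `R` are differences between full-data and leave-one-out fitted values; a hedged R sketch with hypothetical `fit()`/`predict()` helpers standing in for the penalized LS optimizer:

    # Column i of R holds the change in fitted values when observation i
    # is left out (Pena & Yohai, 1999).
    full_fit  <- fit(x, y)
    full_pred <- predict(full_fit, x)
    R <- vapply(seq_len(nrow(x)), function(i) {
      loo_fit <- fit(x[-i, , drop = FALSE], y[-i])
      full_pred - predict(loo_fit, x)
    }, numeric(nrow(x)))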
//! Compute the Principal Sensitivity Components as defined in
//! Pena, D. & Yohai, V. (1999). A Fast Procedure for Outlier Diagnostics in Large Regression
//! Problems. JASA.
//!
//! @param loss the S-loss object to compute the PSCs for.
//! @param optim the optimizer to use to compute leave-one-out residuals.
//! @return a matrix of PSCs.
template<typename Optimizer>
PscResult<Optimizer> PrincipalSensitiviyComponents(const nsoptim::LsLoss& loss, const Optimizer& optim) {
const alias::FwdList<typename Optimizer::PenaltyFunction> penalties { optim.penalty() };
return PrincipalSensitiviyComponents(loss, penalties, optim).front();
// using Optimum = typename Optimizer::Optimum;
// // Ensure that the Optimizer is using an LS-loss.
// static_assert(std::is_same<typename Optimizer::LossFunction, nsoptim::LsLoss>::value,
// "Optimizer must use the LS loss function");
// PscResult<Optimizer> psc_result(Optimum(loss, psc_optimizer.penalty()));
// const nsoptim::PredictorResponseData& data = loss.data();
// arma::mat sensitivity_matrix(data.n_obs(), data.n_obs()); // this is matrix R in the paper
// // First optimize with respect to the full data set and compute the predictions.
// psc_optimizer.loss(loss);
// arma::vec full_predictions;
// psc_result.optimum = psc_optimizer.Optimize();
// full_predictions = data.cx() * psc_result.optimum.coefs.beta + psc_result.optimum.coefs.intercept;
// // Now create the LOO data set by removing the first observation.
// std::shared_ptr<nsoptim::PredictorResponseData> loo_data =
// std::make_shared<nsoptim::PredictorResponseData>(data.TailRows(data.n_obs() - 1));
// // Retain metrics from all LOO fits.
// nsoptim::Metrics loo_metrics("loo_fits");
// if (psc_result.optimum.metrics) {
// psc_result.optimum.metrics->AddDetail("loo_index", -1);
// loo_metrics.AddSubMetrics(std::move(*psc_result.optimum.metrics));
// }
// // This `loo_loss` holds a shared pointer to the LOO data; therefore, the data used by the loss changes
// // in accordance with `loo_data`!
// // It is nevertheless important to change the loss for the optimizer whenever the data changes to inform the
// // optimizer about the changes!
// nsoptim::LsLoss loo_loss(loo_data, loss.IncludeIntercept());
// // Estimate the regression coefficients with the first observation omitted.
// {
// // Set the loss to the loss with the LOO data.
// psc_optimizer.loss(loo_loss);
// auto loo_optimum = psc_optimizer.Optimize();
// if (loo_optimum.metrics) {
// loo_optimum.metrics->AddDetail("loo_index", 0);
// loo_metrics.AddSubMetrics(std::move(*loo_optimum.metrics));
// }
// if (loo_optimum.status == nsoptim::OptimumStatus::kError) {
// ++psc_result.loo_errors;
// psc_result.status = PscStatus::kWarning;
// psc_result.status_message.append("Can not compute LOO residuals: ");
// psc_result.status_message.append(loo_optimum.message);
// psc_result.status_message.append("; ");
// }
// if (loo_optimum.status == nsoptim::OptimumStatus::kWarning) {
// ++psc_result.loo_warnings;
// }
// sensitivity_matrix.col(0) = full_predictions - data.cx() * loo_optimum.coefs.beta - loo_optimum.coefs.intercept;
// }
// // Now compute the regression coefficients with each of the other observations omitted.
// for (arma::uword loo_index = 0; loo_index < data.cx().n_rows - 1; ++loo_index) {
// // Omit the next row.
// loo_data->x().row(loo_index) = data.cx().row(loo_index);
// loo_data->y()[loo_index] = data.cy()[loo_index];
// // Set the loss to the loss with the LOO data.
// psc_optimizer.loss(loo_loss);
// auto loo_optimum = psc_optimizer.Optimize();
// if (loo_optimum.metrics) {
// loo_optimum.metrics->AddDetail("loo_index", static_cast<int>(loo_index) + 1);