Commit 6452c290 authored by davidkep

initial codebase

parent f82865d8
@@ -31,10 +31,12 @@ Imports:
 LinkingTo:
   nsoptim,
   Rcpp,
-  RcppArmadillo (>= 0.9.100)
+  RcppArmadillo (>= 0.9.100),
+  testthat
 Suggests:
-  testthat,
+  testthat (>= 2.1.0),
   lars
 License: GPL (>= 2)
 NeedsCompilation: yes
-RoxygenNote: 6.0.1
+RoxygenNote: 6.1.1
+Roxygen: list(markdown = TRUE)
# Generated by roxygen2: do not edit by hand
export(consistency_const)
export(en_dal_options)
export(enpy_initial_estimates)
export(enpy_options)
export(mest_options)
export(mloc)
export(mlocscale)
export(mscale)
export(pense)
export(rho_function)
export(s_algo_options)
export(tau_size)
importFrom(Rcpp,evalCpp)
importFrom(stats,mad)
useDynLib(pense, .registration = TRUE)
#' Options for the ENPY Algorithm
#'
#' @param max_it maximum number of PY iterations.
#' @param eps numerical tolerance to check for convergence.
#' @param keep_psc_proportion proportion of observations to keep based on the Principal Sensitivity Components.
#' @param keep_residuals_measure how to determine which observations to keep, based on their residuals.
#' If `proportion`, a fixed proportion of observations is kept; if `threshold`,
#' only observations with residuals below the threshold are kept.
#' @param keep_residuals_proportion proportion of observations to keep based on their residuals.
#' @param keep_residuals_threshold only observations with (standardized) residuals less than this threshold are kept.
#'
#' @return options for the ENPY algorithm.
#' @export
enpy_options <- function (max_it = 10, eps = 1e-9, keep_psc_proportion = 0.5,
keep_residuals_measure = c('proportion', 'threshold'),
keep_residuals_proportion = 0.5, keep_residuals_threshold = 2) {
list(max_it = as.integer(max_it[[1L]]),
eps = as.numeric(eps[[1L]]),
keep_psc_proportion = as.numeric(keep_psc_proportion[[1L]]),
use_residual_threshold = match.arg(keep_residuals_measure) == 'threshold',
keep_residuals_proportion = as.numeric(keep_residuals_proportion[[1L]]),
keep_residuals_threshold = as.numeric(keep_residuals_threshold[[1L]]))
}
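# A minimal usage sketch (kept as a comment so nothing is evaluated at package
# load; the values are illustrative only):
#   opts <- enpy_options(max_it = 20, keep_residuals_measure = 'threshold',
#                        keep_residuals_threshold = 2.5)
#   str(opts)  # a plain list; `use_residual_threshold` is TRUE in this case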
#' Options for the M-estimation Algorithm
#'
#' @param max_it maximum number of iterations.
#' @param eps numerical tolerance to check for convergence.
#'
#' @return options for the M-estimation algorithm.
#' @export
mest_options <- function (max_it = 200, eps = 1e-8) {
list(max_it = as.integer(max_it[[1L]]),
eps = as.numeric(eps[[1L]]))
}
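# The returned list is meant to be passed on, e.g., as the `opts` argument of
# `mscale()` or `mloc()` below; a quick sketch (illustrative values only):
#   str(mest_options(max_it = 500, eps = 1e-10))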
#' Options for the S-Estimate Algorithm
#'
#' @param explore_it number of iterations to explore potential candidate
#' solutions.
#' @param max_it maximum number of iterations.
#' @param eps numerical tolerance to check for convergence.
#'
#' @return options for the S-Estimate algorithm.
#' @export
s_algo_options <- function (explore_it = 10, max_it = 500,
eps = 1e-8) {
list(max_it = as.integer(max_it[[1L]]),
eps = as.numeric(eps[[1L]]),
cold_explore_it = as.integer(explore_it[[1L]]))
}
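# Consumed by `enpy_initial_estimates()`; note that `explore_it` is stored under
# the name `cold_explore_it`. A quick sketch (illustrative value only):
#   str(s_algo_options(explore_it = 20))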
#' Options for the DAL Elastic Net Algorithm
#'
#' @param max_it maximum number of (outer) iterations.
#' @param max_inner_it maximum number of (inner) iterations in each outer iteration.
#' @param eps numerical tolerance to check for convergence.
#' @param eta_multiplier multiplier for the barrier parameter. In each iteration, the barrier must be more restrictive
#' (i.e., the multiplier must be > 1).
#' @param eta_start_conservative conservative initial barrier parameter. This is used if the previous penalty is
#' undefined or too far away.
#' @param eta_start_aggressive aggressive initial barrier parameter. This is used if the previous penalty is close.
#' @param lambda_relchange_aggressive how close the lambda value of the previous penalty term must be to use
#' an aggressive initial barrier parameter (i.e., what constitutes "too far").
#'
#' @return options for the DAL EN algorithm.
#' @family EN algorithms
#' @export
en_dal_options <- function (max_it = 100, max_inner_it = 100, eps = 1e-9, eta_multiplier = 2,
eta_start_conservative = 0.01, eta_start_aggressive = 1,
lambda_relchange_aggressive = 0.25) {
list(en_algorithm = 'dal',
max_it = as.integer(max_it[[1L]]),
max_inner_it = as.integer(max_inner_it[[1L]]),
eps = as.numeric(eps[[1L]]),
eta_start_numerator_conservative = as.numeric(eta_start_conservative[[1L]]),
eta_start_numerator_aggressive = as.numeric(eta_start_aggressive[[1L]]),
lambda_relchange_aggressive = as.numeric(lambda_relchange_aggressive[[1L]]),
eta_multiplier = as.numeric(eta_multiplier[[1L]]))
}
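# Typically passed along as the `en_algorithm_opts` argument of `pense()` (see
# below); the values here are illustrative only:
#   dal_opts <- en_dal_options(max_it = 50, eta_multiplier = 4)
#   dal_opts$en_algorithm  # always 'dal', used for dispatching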
#' Check if the selected EN algorithm can handle the given penalty.
.check_en_algorithm <- function (en_algorithm_opts, alpha) {
if (en_algorithm_opts$en_algorithm == 'dal') {
if(!isTRUE(alpha > 0)) {
warning('The DAL algorithm cannot handle a Ridge penalty. Using the default algorithm as fallback.')
return(FALSE)
}
}
return(TRUE)
}
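# Behaviour sketch (pure R, no compiled code involved): the DAL options are
# rejected for a pure Ridge penalty and accepted otherwise.
#   .check_en_algorithm(en_dal_options(), alpha = 0)    # FALSE, with a warning
#   .check_en_algorithm(en_dal_options(), alpha = 0.5)  # TRUE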
#' ENPY Initial Estimates
#'
#' @export
enpy_initial_estimates <- function (x, y, alpha, lambdas, bdp = 0.25, cc, include_intercept = TRUE,
enpy_opts = enpy_options(), mscale_opts = mscale_options(),
s_algo_opts = s_algo_options(), en_options) {
if (missing(cc)) {
cc <- .bisquare_consistency_const(bdp)
}
penalties <- make_penalties(alpha, lambdas)
s_loss_params <- c(list(mscale = c(list(delta = bdp, cc = cc), mscale_opts),
intercept = include_intercept), s_algo_opts)
if (isTRUE(alpha == 0)) {
return(.Call(C_ridgepy, x, drop(y), penalties, s_loss_params, enpy_opts))
} else if (isTRUE(alpha > 0) && isTRUE(alpha <= 1)) {
if (missing(en_options)) {
en_options <- en_dal_options()
}
return(.Call(C_enpy_dal, x, drop(y), penalties, s_loss_params, enpy_opts, list(dal_options = en_options)))
} else {
stop("`alpha` must be between 0 and 1.")
}
}
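# A usage sketch with simulated data (kept as a comment; requires the compiled
# package and the remaining helpers such as `mscale_options()`; the data and
# tuning values are illustrative only):
#   x <- matrix(rnorm(100 * 10), ncol = 10)
#   y <- rnorm(100)
#   init <- enpy_initial_estimates(x, y, alpha = 0.5, lambdas = c(1, 0.1), bdp = 0.25)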
make_penalties <- function (alpha, lambdas) {
alpha <- as.numeric(alpha[[1L]])
lapply(sort(as.numeric(lambdas), decreasing = TRUE), function (lambda) {
list(alpha = alpha, lambda = lambda)
})
}
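# The lambda values are sorted in decreasing order and paired with alpha, e.g.:
#   make_penalties(0.5, c(0.1, 1))
#   # list(list(alpha = 0.5, lambda = 1), list(alpha = 0.5, lambda = 0.1))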
#' @useDynLib pense, .registration = TRUE
#' @importFrom Rcpp evalCpp
NULL
#' Compute the PENSE Regularization Path
#'
#' @param alpha value for the alpha parameter, the balance between L1 and L2
#' penalization.
#' @param lambdas a vector of positive values for the lambda parameter.
#' @param cold_lambdas a vector of lambda values at which *cold* initial
#' estimates are computed (see [enpy] for details).
#' @param additional_initial_estimates a list of other initial estimates to try.
#' Each list item must contain the value
#' for the regularization parameter `lambda`
#' as well as the `intercept` and `beta`
#' coefficients.
#' @param include_intercept include an intercept in the model.
#' @param max_it maximum number of iterations for the algorithm.
#' @param eps convergence tolerance for the algorithm.
#' @param explore_it number of iterations to explore potential candidate
#' solutions.
#' @param en_algorithm_opts options for the EN algorithm. See [en_dal_options]
#' for details.
#' @param mest_opts options for the M-scale estimation. See [mest_options]
#' for details.
#' @param enpy_opts options for the ENPY initial estimates, created with the
#' [enpy_options] function. See [enpy_initial_estimates] for
#' details.
#' @seealso [pensem] for an M-estimate of regression.
#' @export
pense <- function(x, y, alpha, lambdas, cold_lambdas, penalty_loadings,
additional_initial_estimates, include_intercept = TRUE,
max_it = 1000, eps = 1e-5, explore_it = 10, en_algorithm_opts,
mest_opts = mest_options(), enpy_opts = enpy_options()) {
optional_args <- list()
# Normalize input
y <- as.numeric(y)
x_dim <- dim(x)
if (length(y) != x_dim[[1L]]) {
stop("Number of observations does not match between `x` and `y`.")
}
alpha <- as.numeric(alpha[[1L]])
lambdas <- sort(as.numeric(lambdas), decreasing = TRUE)
pense_opts <- list(
max_it = as.integer(max_it[[1L]]),
eps = as.numeric(eps[[1L]]),
cold_explore_it = as.integer(explore_it[[1L]]),
intercept = isTRUE(include_intercept),
mscale = mest_opts
)
if (alpha < sqrt(.Machine$double.eps)) {
alpha <- 0
}
penalty_loadings <- if (!missing(penalty_loadings)) {
if(alpha == 0) {
stop("Penalty loadings are only supported for alpha > 0.")
} else if (length(penalty_loadings) != x_dim[[2L]]) {
stop("Penalty loadings are not of length `p`.")
}
as.numeric(penalty_loadings)
} else {
NULL
}
# Check EN algorithm
if (!missing(en_algorithm_opts)) {
if (!.check_en_algorithm(en_algorithm_opts, alpha)) {
en_algorithm_opts <- NULL
}
} else {
en_algorithm_opts <- NULL
}
if (is.null(en_algorithm_opts)) {
if (alpha > 0) {
optional_args$en_options <- en_dal_options()
}
} else {
optional_args$en_options <- en_algorithm_opts
}
cold_lambda_inds <- .approx_match(cold_lambdas, lambdas)
if (anyNA(cold_lambda_inds)) {
lambdas <- sort(c(lambdas, cold_lambdas), decreasing = TRUE)
cold_lambda_inds <- .approx_match(cold_lambdas, lambdas)
}
if (missing(additional_initial_estimates)) {
additional_initial_estimates <- list()
}
init_ests <- .make_initest_list(additional_initial_estimates, lambdas)
# Call internal function
.pense_internal(x, y, alpha, lambdas, cold_lambda_inds, init_ests,
penalty_loadings, pense_opts, enpy_opts, optional_args)
}
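# A usage sketch with simulated data (kept as a comment; requires the compiled
# package; the lambda grid and cold lambdas are illustrative only):
#   x <- matrix(rnorm(100 * 10), ncol = 10)
#   y <- rnorm(100)
#   fit <- pense(x, y, alpha = 0.75, lambdas = c(2, 1, 0.5), cold_lambdas = c(2, 0.5))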
#' Make a list of initial estimates
#'
#' @return a list the same length as `lambdas` with a list of initial estimates
#' for each value in `lambdas`.
.make_initest_list <- function (initial_estimates, lambdas) {
if (length(initial_estimates) == 0L) {
return(list())
}
init_est_lambdas <- unlist(lapply(initial_estimates, `[[`, 'lambda'),
use.names = FALSE, recursive = FALSE)
init_est_inds <- .approx_match(init_est_lambdas, lambdas)
if (all(is.na(init_est_inds))) {
return(list())
}
lapply(seq_along(lambdas), function (i) {
matches <- which(i == init_est_inds)
if (length(matches) > 0) {
return(initial_estimates[matches])
}
return(list())
})
}
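# Sketch of the mapping (requires the compiled `C_approx_match` routine): with
#   ests <- list(list(lambda = 1, intercept = 0, beta = rep(0, 10)))
#   .make_initest_list(ests, lambdas = c(2, 1, 0.5))
# the result is a list of length 3 whose second element is `ests[1]`, i.e., a
# list holding the single initial estimate matching lambda = 1.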
#' Perform some final input adjustments and call the internal C++ code.
.pense_internal <- function(x, y, alpha, lambdas, cold_lambda_inds,
additional_initial_estimates,
penalty_loadings = NULL,
pense_opts, enpy_opts, optional_args) {
# Create penalties-list, without sorting the lambda sequence
penalties <- lapply(lambdas, function (l) { list(lambda = l, alpha = alpha) })
if (!is.null(penalty_loadings)) {
optional_args$pen_loadings <- penalty_loadings
}
if (alpha == 0) {
return(.Call('C_pense_ridge_regression', x, y, penalties,
additional_initial_estimates, cold_lambda_inds,
pense_opts, enpy_opts))
}
if (!is.null(optional_args$en_options) &&
isTRUE(optional_args$en_options$en_algorithm == 'dal')) {
return(.Call('C_pense_en_regression_dal', x, y, penalties,
additional_initial_estimates, cold_lambda_inds,
pense_opts, enpy_opts, optional_args))
}
}
#' Compute the Tau-Scale of Centered Values
#'
#' Compute the tau-scale of the values, which are assumed to be already centered (no centering is performed).
#'
#' @param x numeric values.
#' @return the tau scale.
#' @export
tau_size <- function (x) {
# No checks for NA values!
.Call(C_tau_size, as.numeric(x))
}
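# Quick sketch (kept as a comment; requires the compiled package). The values are
# not centered internally, so center them beforehand if needed, e.g.:
#   z <- rnorm(100)
#   tau_size(z - mloc(z, rho = 'bisquare'))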
#' Compute the M-Scale of Centered Values
#'
#' Compute the M-scale of the values, which are assumed to be already centered (no centering is performed).
#'
#' @param x numeric values.
#' @param bdp desired breakdown point (between 0 and 0.5).
#' @param cc cutoff value for the bisquare rho function. By default, chosen
#' for a consistent estimate under the Normal model.
#' @param opts a list of options for the M-scale equation, see [mest_options]
#' for details.
#' @return the m-scale estimate.
#' @export
mscale <- function (x, bdp = 0.25, cc = consistency_const(bdp, 'bisquare'),
opts = mest_options()) {
# No checks for NA values!
opts$delta <- as.numeric(bdp[[1L]])
opts$cc <- as.numeric(cc[[1L]])
.Call(C_mscale, as.numeric(x), opts)
}
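# Quick sketch (kept as a comment; requires the compiled package): the M-scale of
# centered values with the default 25% breakdown point; `cc` is chosen for
# consistency under the Normal model unless supplied:
#   mscale(rnorm(100), bdp = 0.25)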
#' Compute the M-estimate of Location
#'
#' @param x numeric values.
#' @param scale scale of the `x` values.
#' @param rho the rho function to use.
#' @param cc value of the tuning constant for the chosen rho function.
#' By default, chosen to achieve 95\% efficiency under the Normal
#' model.
#' @param opts a list of options for the M-estimating equation, see
#' [mest_options] for details.
#' @return the M-location estimate.
#' @importFrom stats mad
#' @export
mloc <- function (x, scale = mad(x), rho, cc, opts = mest_options()) {
# No checks for NA values!
opts$rho <- rho_function(rho)
if (!missing(cc)) {
opts$cc <- as.numeric(cc[[1L]])
}
.Call(C_mloc, as.numeric(x), scale, opts)
}
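# Quick sketch (kept as a comment; requires the compiled package): a bisquare
# M-location estimate is barely affected by a single large outlier, unlike the mean:
#   z <- c(rnorm(50), 100)
#   mloc(z, rho = 'bisquare')
#   mean(z)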
#' Compute the M-Location and M-Scale
#'
#' Simultaneously estimate the M-location and the M-scale.
#'
#' @param x numeric values.
#' @param bdp desired breakdown point (between 0 and 0.5).
#' @param location_rho the rho function to use for the location estimate.
#' @param scale_cc cutoff value for the bisquare rho function used for the M-scale. By default, chosen
#' for a consistent estimate under the Normal model.
#' @param location_cc cutoff value for the chosen location rho function.
#' @param opts a list of options for the M-estimating equation,
#' see [mest_options] for details.
#' @return a vector with two elements, the M-location and the M-scale estimate.
#' @export
mlocscale <- function (x, bdp = 0.25, location_rho = c('bisquare', 'huber'),
scale_cc = consistency_const(bdp, 'bisquare'),
location_cc, opts = mest_options()) {
# No checks for NA values!
opts$delta <- as.numeric(bdp[[1L]])
opts$cc <- as.numeric(scale_cc[[1L]])
loc_opts <- list(rho = rho_function(location_rho))
if (!missing(location_cc)) {
loc_opts$cc <- as.numeric(location_cc[[1L]])
}
.Call(C_mlocscale, as.numeric(x), opts, loc_opts)
}
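# Quick sketch (kept as a comment; requires the compiled package); returns a
# vector with the M-location and M-scale estimates:
#   mlocscale(c(rnorm(50), 100), bdp = 0.25)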
#' Get the Constant for Consistency for the M-Scale Using the Bisquare Rho Function
#' @param delta desired breakdown point (between 0 and 0.5)
#'
#' @return consistency constant
.bisquare_consistency_const <- function (delta) {
##
## Pre-computed values for some delta values
##
eps <- sqrt(.Machine$double.eps)
if (!isTRUE(delta < 0.5 + eps && delta > -eps)) {
stop("`delta` is outside valid bounds")
}
if (abs(delta - 0.5) < eps) {
return(1.5476450)
} else if (abs(delta - 0.25) < eps) {
return(2.937015)
} else if (abs(delta - 0.1) < eps) {
return(5.182361)
} else if (delta < 0.005) {
return(50) # ~.1% bdp for bisquare
}
integral_interval <- if (delta > 0.1) {
c(1.5, 5.5)
} else {
c(5, 25)
}
# For bisquare we have the closed form solution to the expectation
expectation <- function(cc, delta) {
pnorm.mcc <- 2 * pnorm(-cc)
1/cc^6 * exp(-(cc^2/2)) * (
-cc * (15 - 4 * cc^2 + cc^4) * sqrt(2 / pi) +
3 * (5 - 3 * cc^2 + cc^4) * exp(cc^2/2) * (1 - pnorm.mcc) +
cc^6 * exp(cc^2/2) * pnorm.mcc
) - delta
}
uniroot(expectation, interval = integral_interval, delta)$root
}
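# For the tabulated breakdown points the constant is returned directly; for other
# values it is found numerically via `uniroot()` (pure R, no compiled code), e.g.:
#   .bisquare_consistency_const(0.25)  # 2.937015 (pre-computed)
#   .bisquare_consistency_const(0.3)   # solved numerically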
#' Get the Constant for Consistency for the M-Scale
#'
#' @param delta desired breakdown point (between 0 and 0.5)
#' @param rho the name of the chosen rho function.
#'
#' @return consistency constant
#' @export
consistency_const <- function (delta, rho) {
return(switch(rho_function(rho),
bisquare = .bisquare_consistency_const(delta),
huber = stop("Huber's rho function not supported for scale ",
"estimation!")))
}
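# Quick sketch: only the bisquare rho function is supported for scale estimation;
# requesting Huber's rho raises an error.
#   consistency_const(0.25, 'bisquare')  # 2.937015
#   try(consistency_const(0.25, 'huber'))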
#' List or check available rho functions.
#'
#' @param rho the name of the rho function to check.
#' @return if `rho` is missing returns a vector of rho function names, otherwise
#' the integer representation of the rho function.
#' @export
rho_function <- function (rho) {
available <- c('bisquare', 'huber')
if (missing(rho)) {
return(available)
}
return(match(match.arg(rho, available), available))
}
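# Quick sketch: without an argument the available names are listed, otherwise the
# integer code used internally is returned.
#   rho_function()         # c('bisquare', 'huber')
#   rho_function('huber')  # 2L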
#' Approximate Value Matching
#'
#' @param x,table see [base::match] for details.
#' @param eps numerical tolerance for matching.
#' @return a vector of the same length as `x` with integers giving the position in
#' `table` of the first match if there is a match, or `NA_integer_`
#' otherwise.
.approx_match <- function(x, table,
eps = min(sqrt(.Machine$double.eps),
0.5 * min(x, table))) {
.Call('C_approx_match', as.numeric(x), as.numeric(table),
as.numeric(eps[[1L]]))
}
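# Quick sketch (kept as a comment; requires the compiled `C_approx_match` routine):
# values within `eps` of an entry in `table` are matched, e.g.
#   .approx_match(c(0.1 + 1e-12, 0.4), c(0.3, 0.2, 0.1))
# should give c(3L, NA_integer_).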
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utilities.R
\name{consistency_const}
\alias{consistency_const}
\title{Get the Constant for Consistency for the M-Scale}
\usage{
consistency_const(delta, rho)
}
\arguments{
\item{delta}{desired breakdown point (between 0 and 0.5)}
\item{rho}{the name of the chosen rho function.}
}
\value{
consistency constant
}
\description{
Get the Constant for Consistency for the M-Scale
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utilities.R
\name{.approx_match}
\alias{.approx_match}
\title{Approximate Value Matching}
\usage{
.approx_match(x, table, eps = min(sqrt(.Machine$double.eps), 0.5 * min(x,
table)))
}
\arguments{
\item{x, table}{see \link[base:match]{base::match} for details.}
\item{eps}{numerical tolerance for matching.}
}
\value{
a vector of the same length as \code{x} with integers giving the position in
\code{table} of the first match if there is a match, or \code{NA_integer_}
otherwise.
}
\description{
Approximate Value Matching
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utilities.R
\name{.bisquare_consistency_const}
\alias{.bisquare_consistency_const}
\title{Get the Constant for Consistency for the M-Scale Using the Bisquare Rho Function}
\usage{
.bisquare_consistency_const(delta)
}
\arguments{
\item{delta}{desired breakdown point (between 0 and 0.5)}
}
\value{
consistency constant
}
\description{
Get the Constant for Consistency for the M-Scale Using the Bisquare Rho Function
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/control_options.R
\name{.check_en_algorithm}
\alias{.check_en_algorithm}
\title{Check if the selected EN algorithm can handle the given penalty.}
\usage{
.check_en_algorithm(en_algorithm_opts, alpha)
}
\description{
Check if the selected EN algorithm can handle the given penalty.
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/pense_regression.R
\name{.make_initest_list}
\alias{.make_initest_list}
\title{Make a list of initial estimates}
\usage{
.make_initest_list(initial_estimates, lambdas)
}
\value{
a list the same length as \code{lambdas} with a list of initial estimates
for each value in \code{lambdas}.
}
\description{
Make a list of initial estimates
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/pense_regression.R
\name{.pense_internal}
\alias{.pense_internal}
\title{Perform some final input adjustments and call the internal C++ code.}
\usage{
.pense_internal(x, y, alpha, lambdas, cold_lambda_inds,
additional_initial_estimates, penalty_loadings = NULL, pense_opts,
enpy_opts, optional_args)
}
\description{
Perform some final input adjustments and call the internal C++ code.
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/control_options.R
\name{en_dal_options}
\alias{en_dal_options}
\title{Options for the DAL Elastic Net Algorithm}
\usage{
en_dal_options(max_it = 100, max_inner_it = 100, eps = 1e-09,
eta_multiplier = 2, eta_start_conservative = 0.01,
eta_start_aggressive = 1, lambda_relchange_aggressive = 0.25)
}
\arguments{
\item{max_it}{maximum number of (outer) iterations.}
\item{max_inner_it}{maximum number of (inner) iterations in each outer iteration.}
\item{eps}{numerical tolerance to check for convergence.}
\item{eta_multiplier}{multiplier for the barrier parameter. In each iteration, the barrier must be more restrictive
(i.e., the multiplier must be > 1).}
\item{eta_start_conservative}{conservative initial barrier parameter. This is used if the previous penalty is
undefined or too far away.}
\item{eta_start_aggressive}{aggressive initial barrier parameter. This is used if the previous penalty is close.}
\item{lambda_relchange_aggressive}{how close the lambda value of the previous penalty term must be to use
an aggressive initial barrier parameter (i.e., what constitutes "too far").}
}
\value{
options for the DAL EN algorithm.
}
\description{
Options for the DAL Elastic Net Algorithm
}
\concept{EN algorithms}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/enpy.R
\name{enpy_initial_estimates}
\alias{enpy_initial_estimates}
\title{ENPY Initial Estimates}
\usage{
enpy_initial_estimates(x, y, alpha, lambdas, bdp = 0.25, cc,
include_intercept = TRUE, enpy_opts = enpy_options(),
mscale_opts = mscale_options(), s_algo_opts = s_algo_options(),
en_options)
}
\description{
ENPY Initial Estimates
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/control_options.R
\name{enpy_options}
\alias{enpy_options}
\title{Options for the ENPY Algorithm}
\usage{
enpy_options(max_it = 10, eps = 1e-09, keep_psc_proportion = 0.5,
keep_residuals_measure = c("proportion", "threshold"),
keep_residuals_proportion = 0.5, keep_residuals_threshold = 2)
}
\arguments{
\item{max_it}{maximum number of PY iterations.}
\item{eps}{numerical tolerance to check for convergence.}
\item{keep_psc_proportion}{proportion of observations to keep based on the Principal Sensitivity Components.}
\item{keep_residuals_measure}{how to determine which observations to keep, based on their residuals.
If \code{proportion}, a fixed proportion of observations is kept; if \code{threshold},
only observations with residuals below the threshold are kept.}
\item{keep_residuals_proportion}{proportion of observations to keep based on their residuals.}
\item{keep_residuals_threshold}{only observations with (standardized) residuals less than this threshold are kept.}
}
\value{
options for the ENPY algorithm.
}
\description{
Options for the ENPY Algorithm
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/control_options.R
\name{mest_options}
\alias{mest_options}
\title{Options for the M-estimation Algorithm}
\usage{
mest_options(max_it = 200, eps = 1e-08)
}
\arguments{
\item{max_it}{maximum number of iterations.}
\item{eps}{numerical tolerance to check for convergence.}
}
\value{
options for the M-estimation algorithm.
}
\description{
Options for the M-estimation Algorithm
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utilities.R
\name{mloc}
\alias{mloc}
\title{Compute the M-estimate of Location}
\usage{
mloc(x, scale = mad(x), rho, cc, opts = mest_options())
}
\arguments{
\item{x}{numeric values.}
\item{scale}{scale of the \code{x} values.}
\item{rho}{the rho function to use.}
\item{cc}{value of the tuning constant for the chosen rho function.
By default, chosen to achieve 95\% efficiency under the Normal
model.}
\item{opts}{a list of options for the M-estimating equation, see
\link{mest_options} for details.}
}
\value{
the M-location estimate.
}
\description{
Compute the M-estimate of Location
}