Commit 4ff37305 authored by davidkep's avatar davidkep

add coefs method

parent dd877ad2
......@@ -35,3 +35,4 @@ Suggests:
License: MIT + file LICENSE
NeedsCompilation: yes
RoxygenNote: 7.1.0
Roxygen: list(markdown = TRUE, load = "source")
# Generated by roxygen2: do not edit by hand
S3method(coef,post_lasso)
S3method(coef,thrive)
export(post_lasso)
export(post_lasso_cv)
importClassesFrom(Matrix,dsparseVector)
importFrom(Matrix,drop)
export(thrive)
importFrom(Rcpp,evalCpp)
useDynLib(thrive, .registration = TRUE)
#' Extract Model Coefficients
#'
#' @param object a post-LASSO estimate to extract coefficients from.
#' @param s the number of variables in the model (for \code{what="plasso"}) or the value of the
#' penalty parameter (for \code{what="lasso"}).
#' @param sparse return a sparse vector or a dense (base R) numeric vector
#' @param s the number of variables in the model (for `what = "plasso"`) or the value of the
#' penalty parameter (for `what = "lasso"`).
#' @param sparse return sparse vector or dense (base R) vectors.
#' @param ... currently not used.
#' @return if \code{sparse = FALSE} a numeric vector of size \eqn{p + 1}.
#' Otherwise a sparse vector.
#'
#' @importFrom Matrix drop
#' @importClassesFrom Matrix dsparseVector
#' @return a list of coefficients. Each coefficient is a list with two components: `intercept` and `beta`.
#' If `sparse = FALSE`, `beta` is a numeric vector of size \eqn{p}, otherwise it is a sparse vector.
#'
#' @export
coef.post_lasso <- function(object, s, what = c("plasso", "lasso"), sparse = TRUE, ...) {
coef.post_lasso <- function (object, s, what = c("plasso", "lasso"), sparse = TRUE, ...) {
what <- match.arg(what)
res <- switch(what, plasso = .plasso_coefs(object, s), lasso = .lasso_coefs(object, s))
......@@ -24,7 +21,31 @@ coef.post_lasso <- function(object, s, what = c("plasso", "lasso"), sparse = TRU
return(res)
}
.plasso_coefs <- function(object, nkeep) {
#' Extract Model Coefficients
#'
#' @param object a thrive estimate to extract coefficients from.
#' @param nnz the number of exposures with non-zero coefficient.
#' @param sparse return a sparse vector or a dense (base R) numeric vector
#' @return a list of coefficients. Each coefficient is a list with two components: `intercept` and `beta`.
#' If `sparse = FALSE`, `beta` is a numeric vector of size \eqn{p}, otherwise it is a sparse vector.
#'
#' @export
coef.thrive <- function (object, nnz, sparse = TRUE, ...) {
nvars <- sapply(object$estimates, function (x) length(x$beta@i) )
lapply(nnz, function (nk) {
ret_ind <- which(nk == nvars)
if (length(ret_ind) == 0L) {
warning(sprintf("requested number of variables (%d) not available", nk))
return(list())
}
if (isFALSE(sparse)) {
object$estimates[[ret_ind]]$beta <- as.numeric(object$estimates[[ret_ind]]$beta)
}
return(object$estimates[[ret_ind]])
})
}
.plasso_coefs <- function (object, nkeep) {
nvars <- sapply(object$plasso, function (x) length(x$beta@i) )
unlist(lapply(nkeep, function (nk) {
ret_ind <- match(nk, nvars)
......@@ -36,7 +57,6 @@ coef.post_lasso <- function(object, s, what = c("plasso", "lasso"), sparse = TRU
}), recursive = FALSE, use.names = FALSE)
}
.lasso_coefs <- function(object, lambda) {
lambda_seq <- unlist(lapply(object$lasso, `[[`, 'lambda'))
if (length(lambda_seq) == 0L) {
......
......@@ -118,7 +118,7 @@ post_lasso_cv <- function (x, y, post_keep, cv_k, nlambda = 0, lambda, lambda_mi
length(coef$beta@i)
})
plasso_prederr <- lapply(train_fit$plasso, function (coef) {
y_test - coef$intercept - drop(x_test %*% coef$beta)
y_test - coef$intercept - as.numeric(x_test %*% coef$beta)
})
if (any(duplicated(plasso_nact))) {
......@@ -131,7 +131,6 @@ post_lasso_cv <- function (x, y, post_keep, cv_k, nlambda = 0, lambda, lambda_mi
} else {
plasso_prederr <- matrix(unlist(plasso_prederr), nrow = length(y_test))
}
plasso_prederr[ , match(nact_full_present, plasso_nact)]
} else {
numeric(0L)
......
#' Compute Thrice-Robust Instrumental Variable Estimates
#'
#' Post-LASSO estimates regressing each exposure on the instruments are selected via cross-validation.
#' Compute thrice-robust instrumental variable estimates.
#' The exposures `x` are first de-confounded using instruments `z`, and the response `y` is then regressed on
#' the de-confounded exposures.
#' Estimates for de-confounding the exposures are computed via post-LASSO and the reguarlization parameter
#' is selected via cross-validation.
#'
#'
#' @param x matrix of exposures. Should be properly standardized beforehand.
#' @param y vector of response values.
......@@ -18,6 +23,14 @@
#' @param cv_metric either a string specifying the performance metric to use, or a function to evaluate prediction
#' errors in a single CV replication. If a function, it is called with a single numeric vector of
#' prediction errors and must return a scalar number.
#' @return
#' \item{plasso}{list of 2nd-stage PLASSO regression coefficients.}
#' \item{x_fitted}{the fitted exposures.}
#' \item{cv_perf_plasso}{cross-validated prediction performance of the estimates based on `x_fitted`.}
#' \item{cv_perf_grid}{cross-validated prediction performance of the estimates based on all the considered fitted
#' exposures.}
#' \item{z_coefs}{coefficients for regressing each exposure on the instruments.}
#' @export
thrive <- function (x, y, z, cv_k, model_size_span = 1, model_sizes_x, model_size_y,
intercept = TRUE, cv_repl = 1, cv_metric = c('rmspe', 'mape')) {
y <- drop(y)
......@@ -51,26 +64,26 @@ thrive <- function (x, y, z, cv_k, model_size_span = 1, model_sizes_x, model_siz
}
model_size_y <- if (missing(model_size_y) || is.null(model_size_y)) {
seq_len(x_dim[[2L]])
seq_len(x_dim[[2L]] + 1L) - 1L
} else if (is.numeric(model_size_y)) {
sort(as.integer(model_size_y))
} else {
stop("`model_size_y` must be an integer vector.")
}
model_size_x <- if (missing(model_size_x) || is.null(model_size_x)) {
model_sizes_x <- if (missing(model_sizes_x) || is.null(model_sizes_x)) {
rep.int(list(seq_len(z_dim[[2L]] + 1L) - 1L), x_dim[[2L]])
} else if (is.list(model_size_x)) {
if (length(model_size_x) != x_dim[[2L]]) {
stop("If `model_size_x` is a list, it must contain one integer vector per column in `x`.")
} else if (is.list(model_sizes_x)) {
if (length(model_sizes_x) != x_dim[[2L]]) {
stop("If `model_sizes_x` is a list, it must contain one integer vector per column in `x`.")
}
lapply(model_size_x, function (msx) {
lapply(model_sizes_x, function (msx) {
if (!is.numeric(msx)) {
stop("If `model_size_x` is a list, it must contain one integer vector per column in `x`.")
stop("If `model_sizes_x` is a list, it must contain one integer vector per column in `x`.")
}
sort(as.integer(msx))
})
} else if (is.numeric(model_size_x)) {
} else if (is.numeric(model_sizes_x)) {
rep.int(list(sort(as.integer(model_size_y))), x_dim[[2L]])
} else {
stop("`model_size_y` must be an integer vector or a list of integer vectors.")
......@@ -79,7 +92,7 @@ thrive <- function (x, y, z, cv_k, model_size_span = 1, model_sizes_x, model_siz
# First stage, regress each exposure on all instruments
stage_1 <- lapply(seq_len(x_dim[[2L]]), function (j) {
pr <- post_lasso_cv(x = z, y = x[, j, drop = TRUE], post_keep = model_size_x[[j]], lambda = numeric(0L),
pr <- post_lasso_cv(x = z, y = x[, j, drop = TRUE], post_keep = model_sizes_x[[j]], lambda = numeric(0L),
intercept = intercept, cv_k = cv_k, cv_repl = cv_repl, cv_metric = cv_metric)
best_model <- which.min(pr$cv_perf_plasso$cvm)
......@@ -88,7 +101,7 @@ thrive <- function (x, y, z, cv_k, model_size_span = 1, model_sizes_x, model_siz
coefs <- pr$plasso[model_grid]
fitted <- lapply(coefs, function (coef) {
drop(z %*% coef$beta) + coef$intercept
as.numeric(z %*% coef$beta) + coef$intercept
})
list(coefs = coefs, fitted = fitted)
})
......@@ -98,6 +111,7 @@ thrive <- function (x, y, z, cv_k, model_size_span = 1, model_sizes_x, model_siz
pr <- post_lasso_cv(x = x_fitted, y = y, post_keep = model_size_y, lambda = numeric(0L), intercept = intercept,
cv_k = cv_k, cv_repl = cv_repl, cv_metric = cv_metric)
pr$x_fitted <- x_fitted
pr$cv_perf_plasso$grid_step <- k - model_size_span - 1L
pr$z_coefs <- lapply(stage_1, function (s1) { s1$coefs[[k]] })
return(pr)
})
......@@ -108,7 +122,12 @@ thrive <- function (x, y, z, cv_k, model_size_span = 1, model_sizes_x, model_siz
best_stage_2 <- stage_2[[which.min(stage_2_min_cv)]]
best_stage_2$cv_perf_lasso <- NULL
best_stage_2$lasso <- NULL
best_stage_2$cv_perf_grid <- do.call(rbind, lapply(stage_2, `[[`, 'cv_perf_plasso'))
class(best_stage_2) <- 'thrive'
names(best_stage_2)[[which(names(best_stage_2) == 'plasso')]] <- 'estimates'
return(best_stage_2)
}
......
......@@ -9,16 +9,16 @@
\arguments{
\item{object}{a post-LASSO estimate to extract coefficients from.}
\item{s}{the number of variables in the model (for \code{what="plasso"}) or the value of the
penalty parameter (for \code{what="lasso"}).}
\item{s}{the number of variables in the model (for \code{what = "plasso"}) or the value of the
penalty parameter (for \code{what = "lasso"}).}
\item{sparse}{return a sparse vector or a dense (base R) numeric vector}
\item{sparse}{return sparse vector or dense (base R) vectors.}
\item{...}{currently not used.}
}
\value{
if \code{sparse = FALSE} a numeric vector of size \eqn{p + 1}.
Otherwise a sparse vector.
a list of coefficients. Each coefficient is a list with two components: \code{intercept} and \code{beta}.
If \code{sparse = FALSE}, \code{beta} is a numeric vector of size \eqn{p}, otherwise it is a sparse vector.
}
\description{
Extract Model Coefficients
......
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/coef-methods.R
\name{coef.thrive}
\alias{coef.thrive}
\title{Extract Model Coefficients}
\usage{
\method{coef}{thrive}(object, nnz, sparse = TRUE, ...)
}
\arguments{
\item{object}{a thrive estimate to extract coefficients from.}
\item{nnz}{the number of exposures with non-zero coefficient.}
\item{sparse}{return a sparse vector or a dense (base R) numeric vector}
}
\value{
a list of coefficients. Each coefficient is a list with two components: \code{intercept} and \code{beta}.
If \code{sparse = FALSE}, \code{beta} is a numeric vector of size \eqn{p}, otherwise it is a sparse vector.
}
\description{
Extract Model Coefficients
}
......@@ -34,18 +34,18 @@ post_lasso_cv(
\item{y}{response vector.}
\item{post_keep}{sizes of models for which post-LASSO estimates are to be computed. If missing, post-LASSO estimates
are computed for all possible models, i.e., models of size 1 ... `ncol(x)`.
If `NULL`, no post-LASSO estimates are computed.}
are computed for all possible models, i.e., models of size 1 ... \code{ncol(x)}.
If \code{NULL}, no post-LASSO estimates are computed.}
\item{nlambda}{number of penalization levels for which LASSO estimates are computed.}
\item{lambda}{optional user-supplied sequence of penalization levels. If given and not `NULL`, `nlambda` and
`lambda_min_ratio` are ignored.}
\item{lambda}{optional user-supplied sequence of penalization levels. If given and not \code{NULL}, \code{nlambda} and
\code{lambda_min_ratio} are ignored.}
\item{lambda_min_ratio}{Smallest value of the penalization level as a fraction of the largest level (i.e., the
smallest value for which all coefficients are zero). The default depends on the sample
size relative to the number of variables. If more observations than variables
are available, the default is `1e-3`, otherwise `1e-2`.}
are available, the default is \code{1e-3}, otherwise \code{1e-2}.}
\item{intercept}{should an intercept be estimated?}
......@@ -59,15 +59,15 @@ prediction errors and must return a scalar number.}
}
\value{
\item{lambda}{vector of lambda values.}
\item{lasso}{list of LASSO regression coefficients.}
\item{plasso}{list of PLASSO regression coefficients.}
\item{lasso}{list of LASSO regression coefficients.}
\item{plasso}{list of PLASSO regression coefficients.}
}
\description{
Estimate the LASSO and Post-LASSO (PLASSO) regression coefficients.
Post-LASSO estimates are computed for all models of size in `post_keep`, whereas LASSO estimates are computed
for all penalization levels in `lambda`.
Post-LASSO estimates are computed for all models of size in \code{post_keep}, whereas LASSO estimates are computed
for all penalization levels in \code{lambda}.
[post_lasso_cv()] automatically cross-validates (post)-LASSO estimates.
\code{\link[=post_lasso_cv]{post_lasso_cv()}} automatically cross-validates (post)-LASSO estimates.
}
\details{
Post-LASSO estimates are ordinary least square estimates of the coefficients for variables selected by LASSO.
......
......@@ -29,12 +29,12 @@ thrive(
\item{model_size_span}{span of the grid of model sizes to consider in the second stage. See details.}
\item{model_sizes_x}{model sizes to consider for the exposures. If missing, consider all possible model sizes
for each exposure (i.e., models of size 1 ... `ncol(z)`). If an integer vector,
the same model sizes are considered for every exposure. If a list the same length as `x`
for each exposure (i.e., models of size 1 ... \code{ncol(z)}). If an integer vector,
the same model sizes are considered for every exposure. If a list the same length as \code{x}
columns, different model sizes are considered for each exposure.}
\item{model_size_y}{model sizes to consider for the second stage regression, i.e., for regressing the
response `y` on the de-confounded matrix of exposures.}
response \code{y} on the de-confounded matrix of exposures.}
\item{intercept}{should an intercept be estimated?}
......@@ -44,6 +44,18 @@ response `y` on the de-confounded matrix of exposures.}
errors in a single CV replication. If a function, it is called with a single numeric vector of
prediction errors and must return a scalar number.}
}
\value{
\item{plasso}{list of 2nd-stage PLASSO regression coefficients.}
\item{x_fitted}{the fitted exposures.}
\item{cv_perf_plasso}{cross-validated prediction performance of the estimates based on \code{x_fitted}.}
\item{cv_perf_grid}{cross-validated prediction performance of the estimates based on all the considered fitted
exposures.}
\item{z_coefs}{coefficients for regressing each exposure on the instruments.}
}
\description{
Post-LASSO estimates regressing each exposure on the instruments are selected via cross-validation.
Compute thrice-robust instrumental variable estimates.
The exposures \code{x} are first de-confounded using instruments \code{z}, and the response \code{y} is then regressed on
the de-confounded exposures.
Estimates for de-confounding the exposures are computed via post-LASSO and the reguarlization parameter
is selected via cross-validation.
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment