# multiple linear regression # x ~ Np(mu, var) # y = beta0 + beta1*x1 + beta2*x2 + ... + e # e ~ N(0, sigma) # simulate independent predictors # (n correlated observations on nrow(var) covariates) # n - number of observations to simulate # mu - mean vector of covariates # var - covariance matrix of covariates # return value is a data frame of the newly simulated # independent predictors independent <- function(n, mu, var) { p <- nrow(var) z <- matrix(rnorm(n*p), n, p) x <- t(t(z %*% chol(var)) + mu) return(data.frame(x)) } # simulate dependant data from a normal linear model # formula - linear model formula # beta - linear coefficients # sigma - normal standard deviation # data - data frame containing independent predictors # return value is a model frame with the newly simulated dependent # data (labeled "Y"), as well as the independent "data" dependent <- function(formula, beta, sigma, data) { x <- model.matrix(formula, data) y <- x %*% beta + rnorm(nrow(data), 0, sigma) data[["Y"]] <- y formula <- update.formula(formula, Y ~ .) return(model.frame(formula, data)) } # simulate data and the stepwise procedure simstep <- function(n, mu, var, form_true, beta_true, sigma, form_lower, form_upper) { # simulate independent data ind <- independent(n, mu, var) # simulate dependent data # FIXME, dep is needed by step(), but cannot find it # unless it's assigned like this (may be a bug): dep <<- dependent(form_true, beta_true, sigma, ind) # use forward and backward stepwise regression to find a minimum AIC model step_both <- step(object = lm(update(form_upper, Y ~ .), data=dep), scope = list(upper=form_upper, lower=form_lower), direction="both", trace=0) # extract the terms remaining after stepwise procedure terms_remaining <- attr(terms(formula(step_both)), "term.labels") beta1 <- coef(step_both)["X1"] attr(beta1, 'terms.remaining') <- terms_remaining return(beta1) } # simulate data and the "full-model" procedure simfull <- function(n, mu, var, form_true, beta_true, sigma, form_full) { # simulate independent data ind <- independent(n, mu, var) # simulate dependent data dep <- dependent(form_true, beta_true, sigma, ind) # use "full model" regression fit <- lm(update(form_upper, Y ~ .), data=dep) beta1 <- coef(fit)["X1"] return(beta1) } ##### #let the covariates and model be characterised as follows: mu <- c(0,0,0,0,0) var <- matrix(c(1, 0.95, 0, 0, 0.95, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1), 4,4, byrow=TRUE) # true formula and coefficients; note that the coefficients # for X2 and X4 are zero; we retain a main effect for each # predictor even if the associated coefficient is zero, # to ensure that model.frame outputs each predictor form_true <- ~ 1 + X1 + X2 + X3 + X4 + X1:X3 beta_true <- c(0, 1, 0, 0.3, 0, 0.1) # error standard deviation sigma <- 1.0 # upper and lower formulas for stepwise regression # upper formula has each main effect, and all first order interactions form_upper <- ~ 1 + (X1 + X2 + X3 + X4)^2 form_lower <- ~ 1 + X1 # sample size N <- 100 # number of simulation replicates B <- 5000 # perform stepwise regression B times system.time({ beta1_step <- replicate(B, simstep(N, mu, var, form_true, beta_true, sigma, form_lower, form_upper), simplify=FALSE) # count how often each term remains the stepwise procedure terms_step <- list() for(term in attr(terms(form_upper), "term.labels")) terms_step[term] <- 0 for(beta1 in beta1_step) for(term in attr(beta1, "terms.remaining")) terms_step[[term]] <- terms_step[[term]] + 1 terms_step_num <- as.numeric(terms_step) names(terms_step_num) <- names(terms_step) # show how often (proportion) each predictor is retained # by the stepwise regression method terms_step_num/B # perform full model regression B times beta1_full <- replicate(B, simfull(N, mu, var, form_true, beta_true, sigma, form_upper), simplify=FALSE) }) # system.time() # plot the empirical density of beta1 estimates for both # prodecures beta1_step <- as.numeric(beta1_step) mu_step <- format(mean(beta1_step), nsmall=2, digits=2) sd_step <- format(sd(beta1_step), nsmall=2, digits=2) beta1_full <- as.numeric(beta1_full) mu_full <- format(mean(beta1_full), nsmall=2, digits=2) sd_full <- format(sd(beta1_full), nsmall=2, digits=2) dens_step <- density(beta1_step) dens_full <- density(beta1_full) plot(y=dens_step\$y, x=dens_step\$x, type="l", lty=1, xlab=expression(hat(beta)[1]), ylab="Density") lines(y=dens_full\$y, x=dens_full\$x, type="l", lty=2) legend("topleft", c( paste("Stepwise: ", mu_step, " (", sd_step, ")", sep=""), paste("Full: ", mu_full, " (", sd_full, ")", sep="")), lty=c(1,2), bty="n", cex=0.8)