### Chun's R Notes

#### Getting help

To see the help page for a function, use "?" of help(). For example, `?plot` and `help(plot)` will show the help page for the function plot(). For some special functions, you only can access the help page through help(). For example, `help("[")` tells you the usage of accessing data through index.

At the end of most help pages, there are examples. You can try those examples line by line to see the effects. Or, you can use example() function to go through them all at once. For example, `example(plot)` will go through all the code in the plot() help page. This function often leads to >1 page of plots; to see all of them, you may want to turn on history recording by clicking on the plot window and then go to History -> Recording. Note that history recording eats up memory quickly.

A few commands and packages also have built-in demonstration features. `demo()` will list them. Try out `demo(graphics)`.

#### Some exploratory graphics in EMS Chapter 3

```try <- table(rpois(100, lambda=3))
barplot(try)
barplot(try, horiz=T)
pie(try)
dotchart(try)
```

jitter() is the function that can help with drawing jittering plot.

#### Continuous Distributions

##### Normal distributions with various standard deviations
```curve(dnorm(x, 0, 0.5), -5, 5, xlab="", ylab="Density")
curve(dnorm(x, 0, 1), -5, 5, add=T, col=2)
curve(dnorm(x, 0, 2), -5, 5, add=T, col=3)
title(main=expression(paste("Normal distributions:  ",
N(list(0, sigma^2)))),
sub=expression(paste("If ", italic(X), " ~ N(0, 1), then ",
sigma*italic(X)+mu, " ~ ",
N(list(mu, sigma^2)))))
legend(1.5, 0.8, lty=1, col=1:3, bty="n",
legend=c(expression(sigma == 0.5),
expression(sigma == 1),
expression(sigma == 2)))
```

##### t distributions with various degrees of freedom
```curve(dt(x, 50), -5, 5, xlab="", ylab="Density")
curve(dt(x, 10), -5, 5, add=T, col=2)
curve(dt(x, 5), -5, 5, add=T, col=3)
curve(dt(x, 1), -5, 5, add=T, col=4)
title(main=expression(paste(italic(t), " distributions:  ",
italic(t)[df])))
mtext(expression(paste(italic(t),
" is Cauchy distribution and ",
italic(t)[infinity], " ~ N(0, 1)")),
mtext(expression(paste(E(italic(t)[d]) == 0, ", when d > 1, ",
SD(italic(t)[d]) == sqrt(d/(d-2)),
", when d > 2.")),
legend(1.5, 0.4, lty=1, col=1:4, bty="n",
legend=c("df = 50", "df = 10", "df = 5", "df = 1"))
```

##### Compare t and normal distributions
```TvsNormal = function(df) {
curve(dt(x, df), -5, 5, xlab="", ylab="Density")
# Normal distribution with same peak height
curve(dnorm(x, 0, 1/dt(0, df)/sqrt(2*pi)), -5, 5, add=T, col=2)
# Normal distribution with same variance
# (t1 and t2 don't have finite variance)
if (df > 2)
curve(dnorm(x, 0, sqrt(df/(df-2))), -5, 5, add=T, col=3)

title(main=eval(expression(paste("df = ", df))))
legend(1, dt(0, df), lty=1, col=1:3, bty="n", cex=.6,
legend=c(do.call("expression", list(bquote(t[.(df)]))),
"Normal, same peak height",
"Normal, same variance"))
}
par(mfrow=c(2,2), mar=c(3, 4, 2, 1)+.1)
TvsNormal(1)
TvsNormal(5)
TvsNormal(10)
TvsNormal(50)
```

##### F distributions with various degrees of freedom
```curve(df(x, 1, 10), 0, 5, xlab="", ylab="Density")
curve(df(x, 1, 50), 0, 5, add=T, col=2)
curve(df(x, 5, 10), 0, 5, add=T, col=3)
curve(df(x, 5, 50), 0, 5, add=T, col=4)
title(main=expression(paste(italic(F), " distributions:  ",
italic(F)[list(v1, v2)])),
sub=expression(paste("Note:  ",
E(italic(F)[list(v1,v2)])==v2/(v2-2),
", when v2 > 2. If ",
italic(X), " ~ ", italic(t)[d],
", then ", italic(X)^2, " ~ ",
italic(F)[list(1, d)])))
legend(2, 1.5, lty=1, col=1:4, bty="n",
legend=c("v1 = 1,  v2 = 10", "v1 = 1,  v2 = 50",
"v1 = 5,  v2 = 10", "v1 = 5,  v2 = 50"))
```

##### χ2 distributions with various degrees of freedom
```curve(dchisq(x, 1), 0, 20, xlab="", ylab="Density")
curve(dchisq(x, 2), 0, 20, add=T, col=2)
curve(dchisq(x, 3), 0, 20, add=T, col=3)
curve(dchisq(x, 5), 0, 20, add=T, col=4)
curve(dchisq(x, 10), 0, 20, add=T, col=5)
title(main=expression(paste("Chi-squared distributions:  ",
chi[df]^2)))
mtext(expression(paste(chi[d]^2, " ~ Gamma(d/2, 1/2).  If ",
italic(X), " ~ N(0, 1), then ",
italic(X)^2, " ~ ", chi^2)),
mtext(expression(paste("If ", italic(X), " ~ ", chi[d]^2,
", then ", E(italic(X)) == d, " and ",
SD(italic(X)) == sqrt(2*d))),
legend(5, .8, lty=1, col=1:5, bty="n",
legend=c("df = 1", "df = 2", "df = 3", "df = 5",
"df = 10"))
```

##### Compare standard normal and χ21 distributions
```curve(dnorm(x), -3, 3, xlab="", ylab="Density", ylim=c(0,.8),
lwd=2)
curve(dchisq(x, 1), 0, 3, add=T, col=2, lwd=2)
segments(c(-1/2, 1/2, 0, 1/4), 0, c(-1/2, 1/2, 0, 1/4),
c(rep(dnorm(1/2),2), 10, dchisq(1/4,1)),
col=c(1,1,2,2))
segments(c(-sqrt(2), sqrt(2), 2), 0, c(-sqrt(2), sqrt(2), 2),
c(rep(dnorm(sqrt(2)),2), dchisq(2,1)),
col=c(1,1,2), lty=2)
title(main=expression(paste("Compare N(0, 1) and ", chi^2,
" distributions")),
sub=expression(paste("Note:  If ", italic(X),
" ~ N(0, 1), then ", italic(X)^2,
" ~ ", chi^2)))
legend(-3, .8, lty=1, lwd=2, col=1:2, bty="n",
legend=c("N(0, 1)", expression(chi^2)))
```

##### Exponential distributions with various rates
```curve(dexp(x, 5), 0, 2, xlab="", ylab="Density", col=1)
curve(dexp(x, 2), 0, 2, add=T, col=2)
curve(dexp(x, 1), 0, 2, add=T, col=3)
curve(dexp(x, 0.5), 0, 2, add=T, col=4)
title(main=expression(paste("Exponential distributions:  ",
Exp(beta))))
mtext(expression(paste(Exp(beta), " ~ Gamma", (list(1, beta)),
", where ", beta, " is rate")),
mtext(expression(paste("If ", italic(X), " ~ ", Exp(beta),
", then ", E(italic(X)) == SD(italic(X)),
" = ", 1/beta, ", and ",
a*italic(X), " ~ ", Exp(beta/a))),
legend(1, 5, lty=1, col=1:4, bty="n",
legend=c(expression(beta == 5),
expression(beta == 2),
expression(beta == 1),
expression(paste(beta == 0.5, ", same as ",
chi^2))))
```

##### Gamma distributions with various α's and β = 1
```curve(dgamma(x, 0.5), 0, 10, xlab="", ylab="Density")
curve(dgamma(x, 1), 0, 10, add=T, col=2)
curve(dgamma(x, 1.5), 0, 10, add=T, col=3)
curve(dgamma(x, 5), 0, 10, add=T, col=4)
title(main=expression(paste("Gamma distributions:  Gamma",
(list(alpha, beta)))))
mtext(expression(paste("If ", italic(X), " ~ Gamma",
(list(alpha, 1)), ", then ",
italic(beta*X), " ~ Gamma",
(list(alpha, 1/beta)))),
mtext(expression(paste("If ", italic(X), " ~ Gamma",
(list(alpha, beta)), ", then ",
E(italic(X)) == alpha/beta, " and ",
SD(italic(X)) == sqrt(alpha/beta^2))),
legend(4, 1.5, lty=1, col=1:4, bty="n",
legend=c(expression(list(alpha == 0.5, beta == 1)),
expression(list(alpha == 1.0, ~~~beta == 1)),
expression(list(alpha == 1.5, beta == 1)),
expression(list(alpha == 5.0, ~~~beta == 1))))
```

##### Log-normal distributions
We know that Gamma distributions can be used to model positive-valued data, and that Gamma(α, β) has mean α/β and SD √α /β. Then, when α > 1, mean > SD; when α = 1, mean = SD, and it is an exponential distribution; when α < 1, mean < SD. When α ≤ 1, the mode of the Gamma distribution has to be at zero. (Techinically, this statement is not correct when α < 1 but is good enough for practical purpose.) Because of this, when we have positive-valued data with mean < SD and mode not at zero, the Gamma distribution won't be a good approximation, and we need to find other families of distributions to model the data. Log-normal is one such family of distributions.

When X ~ lognormal(μ, σ), then ln(X) ~ N(μ, σ2). The mean of X is m = exp(μ + 0.5σ2) and the SD is s = m • √[exp(σ2) − 1]. To use a lognormal distribution to model positive-valued data with mean m and SD s, we can solve the above two equations to get σ = √ {ln[(s/m)2 + 1]} and μ = ln(m) − 0.5σ2. For Gamma(α, β), its CV is 1/√α. For lognormal(μ, σ), its CV is √[exp(σ2) − 1].

The following code gives graphic comparisons of log-normal and Gamma distributions:
```m = 2; s = 3
sigma = sqrt(log((s/m)^2+1)); mu = log(m)-.5*sigma^2
curve(dlnorm(x, mu, sigma), 0, 5, col=1, xlab="", ylab="Density")
beta = m/s^2; alpha = beta*m
curve(dgamma(x, alpha, beta), 0, 5, add=T, lty=2, col=1)

m = 2; s = 2
sigma = sqrt(log((s/m)^2+1)); mu = log(m)-.5*sigma^2
curve(dlnorm(x, mu, sigma), 0, 5, add=T, col=2)
beta = m/s^2; alpha = beta*m
curve(dgamma(x, alpha, beta), 0, 5, add=T, lty=2, col=2)

m = 2; s = 1
sigma = sqrt(log((s/m)^2+1)); mu = log(m)-.5*sigma^2
curve(dlnorm(x, mu, sigma), 0, 5, add=T, col=3)
beta = m/s^2; alpha = beta*m
curve(dgamma(x, alpha, beta), 0, 5, add=T, lty=2, col=3)

title(main=c("Compare log-normal and Gamma distributions when",
"modeling positive-valued data with mean m and SD s"))
mtext(expression(paste("For log-normal, ",
sigma == sqrt(ln((s/m)^2+1)),
" and ", mu == ln(m) - 0.5*sigma^2)),
mtext(expression(paste("For Gamma, ", alpha == (m/s)^2,
" and ", beta == m/s^2)),
legend(2, 0.6, lty=rep(1:2,3), col=rep(1:3,each=2), bty="n",
legend=c("Log-normal:  m=2, s=3",
"Gamma:       m=2, s=3",
"Log-normal:  m=2, s=2",
"Gamma:       m=2, s=2",
"Log-normal:  m=2, s=1",
"Gamma:       m=2, s=1"))
```

##### Other positive-valued continuous distributions
Chi-squared, exponential, and gamma are well known families of distributions over positive values. In fact, the first two families are special cases of the gamma family: χ2p ~ Gamma(α = p/2, β = 0.5), Exp(β) ~ Gamma(α = 1, β), where β is rate and 1/β is scale. As a special case, χ22 ~ Exp(β = 0.5).

F and log-normal are two other families of positive-valued distributions. F is related to Beta distribution, and log-normal is related to normal distribution. In fact, one can always define a family of positive-valued distributions by taking the exponential transformation of a family of distributions over the whole real line, such as t and logistic distributions. For example, exponential of logistic(0, 1) is F2, 2. Similarly, one can always define a family of distributions over the whole real line by taking the logarithmic transformation of a family of positive-valued distributions. For example, the Gumbel distribution is the log of an exponential distribution.

There are some other families of positive-valued distributions that are functions of existing families of positive-valued distributions. For examples, if X ~ Exp(β), then X1/γ has the Weibull(γ, β) distribution; if X ~ Exp(1), then (2X)1/2 has the Rayleigh distribution and -log(X) has the Gumbel distribution; if X ~ Gamma(α, β), then 1/X has the inverted gamma distribution; if X ~ Gamma(1.5, 1), then X1/2 has the Maxwell distribution. All these families of distributions were defined with similar thought process: If the data cannot be well modeled using existing, well-understood families of distributions, their transformations may be better modeled by existing distributions.

##### Beta distributions with various α's and β's
```par(mfrow=c(2,2), oma=c(1.5,0,1.5,0), mar=c(3, 4, 1, 1)+.1)
curve(dbeta(x, 0.5, 0.5), 0, 1,
xlab="", ylab="Density", ylim=c(0,5))
curve(dbeta(x, 0.5, 1), 0, 1, add=T, col=2)
curve(dbeta(x, 0.5, 5), 0, 1, add=T, col=3)
legend(0.2, 5, lty=1, col=1:3, bty="n",
legend=c(expression(list(alpha == 0.5, beta == 0.5)),
expression(list(alpha == 0.5, beta == 1)),
expression(list(alpha == 0.5, beta == 5))))

curve(dbeta(x, 1, 0.5), 0, 1,
xlab="", ylab="Density", ylim=c(0,5))
curve(dbeta(x, 1, 1), 0, 1, add=T, col=2)
curve(dbeta(x, 1, 5), 0, 1, add=T, col=3)
legend(0.2, 5, lty=1, col=1:3, bty="n",
legend=c(expression(list(alpha == 1, beta == 0.5)),
expression(list(alpha == 1, beta == 1)),
expression(list(alpha == 1, beta == 5))))

curve(dbeta(x, 3, 0.5), 0, 1,
xlab="", ylab="Density", ylim=c(0,5))
curve(dbeta(x, 3, 1), 0, 1, add=T, col=2)
curve(dbeta(x, 3, 5), 0, 1, add=T, col=3)
legend(0.2, 5, lty=1, col=1:3, bty="n",
legend=c(expression(list(alpha == 3, beta == 0.5)),
expression(list(alpha == 3, beta == 1)),
expression(list(alpha == 3, beta == 5))))

curve(dbeta(x, 3, 5), 0, 1, lwd=2,
xlab="", ylab="Density", ylim=c(0,8))
curve(dbeta(x, 9, 15), 0, 1, add=T, col=2, lwd=2)
curve(dbeta(x, 30, 50), 0, 1, add=T, col=3, lwd=2)
curve(dbeta(x, .3, .5), 0, 1, add=T, col=1, lty=2)
curve(dbeta(x, .6, 1), 0, 1, add=T, col=2, lty=2)
curve(dbeta(x, .9, 1.5), 0, 1, add=T, col=3, lty=2)
abline(v=3/8)
legend(0.4, 8, lwd=rep(2:1,each=3), lty=rep(1:2,each=3),
col=rep(1:3,2), bty="n",
legend=c(expression(list(alpha == 3, beta == 5)),
expression(list(alpha == 9, beta == 15)),
expression(list(alpha == 30, beta == 50)),
expression(list(alpha == .3, beta == 0.5)),
expression(list(alpha == .6, beta == 1)),
expression(list(alpha == .9, beta == 1.5))))

mtext(expression(paste("Beta distributions:  Beta",
(list(alpha, beta)))),
side=3, outer=T)
mtext(expression(paste("Note:  If ", italic(X), " ~ Beta",
(list(alpha, beta)), ", then ",
E(italic(X)) == alpha / (alpha + beta),
" and ", 1 - italic(X), " ~ Beta",
(list(beta, alpha)))),
side=1, outer=T)
```

##### Connections between Beta and other distributions
If X ~ Beta(α 1), then -log(X) ~ Exp(α), where α is rate. In other words, if Y ~ Exp(α), then exp(-Y) ~ Beta(α 1).

If X1 ~ Gamma(r, β) and X2 ~ Gamma(s, β) are independent, then Z1 = X1 + X2 and Z2 = X1/(X1 + X2) are independent, and Z1 ~ Gamma(r + s, β) and Z2 ~ Beta(r, s). As a special case, if X1 ~ χ2p and X2 ~ χ2q are independent of each other, then X1/(X1 + X2) ~ Beta(p/2, q/2). As a consequence, if X ~ Fp, q, then (p/q)X/[1 + (p/q)X] ~ Beta(p/2, q/2). In other words, if Y ~ Beta(p/2, q/2), then [Y/(1 - Y)]/[p/q] = [Y/p]/[(1 - Y)/q] ~ Fp, q.

When p = q = 2, and Y ~ Uniform(0, 1) ~ Beta(1, 1), then Y/(1 - Y) ~ F2, 2. Since the exponential of logistic(0, 1) is F2, 2, we have log[Y/(1 - Y)] ~ logistic(0, 1). Another note: If X1 and X2 are independently distributed as Exp(1), then X1/X2 ~ F2, 2, and log[X1/X2] ~ logistic(0, 1).

If X1, ..., Xn are independent and uniformly distributed over the interval (0, 1), then the k-th smallest number X(k) ~ Beta(k, n - k + 1) and the length of the range X(n) - X(1) ~ Beta(n - 1, 2).

#### Discrete distributions

##### Binomial distributions with various success probabilities
```drawbinomial1 = function(n) {
x = 0:n
a1 = dbinom(x, n, .1)
a3 = dbinom(x, n, .3)
a5 = dbinom(x, n, .5)
a7 = dbinom(x, n, .7)
a9 = dbinom(x, n, .9)
ymax = max(a1, a3, a5, a7, a9)

plot(1:n, 1:n, type="n", xlab="x", ylab="Probability",
xlim=c(-.2, n+.2), ylim=c(0,ymax))
segments(x-.2, 0, x-.2, a1, lwd=2)
segments(x-.1, 0, x-.1, a3, lwd=2, col=2)
segments(x, 0, x, a5, lwd=2, col=3)
segments(x+.1, 0, x+.1, a7, lwd=2, col=4)
segments(x+.2, 0, x+.2, a9, lwd=2, col=5)

title(main=eval(expression(paste("Binomial Distributions with n = ",
n))))
legend(n/2, ymax, lty=1, col=1:5, bty="n",
legend=c("p = 0.1", "p = 0.3", "p = 0.5",
"p = 0.7", "p = 0.9"))
}
drawbinomial1(10)
```

##### Binomial distributions with various numbers of trials
```drawbinomial2 = function(p) {
b10 = dbinom(0:10, 10, p)
b20 = dbinom(0:20, 20, p)
b50 = dbinom(0:50, 50, p)
b100 = dbinom(0:100, 100, p)
b200 = dbinom(0:200, 200, p)
ymax = max(b10, b20, b50, b100, b200)

plot(1, 1, type="n", xlab="x/n", ylab="Probability",
xlim=c(0, 1), ylim=c(0,ymax))
segments((0:10)/10, 0, (0:10)/10, b10, lwd=2)
segments((0:20)/20, 0, (0:20)/20, b20, lwd=2, col=2)
segments((0:50)/50, 0, (0:50)/50, b50, lwd=2, col=3)
segments((0:100)/100, 0, (0:100)/100, b100, lwd=2, col=4)
segments((0:200)/200, 0, (0:200)/200, b200, lwd=2, col=5)

title(main=eval(expression(paste("Binomial Distributions with p = ",
p))))
xlgd = ifelse(p<.5, .6, 0)
legend(xlgd, ymax, lty=1, col=1:5, bty="n",
legend=c("n = 10", "n = 20", "n = 50",
"n = 100", "n = 200"))
}
drawbinomial2(.3)
```

##### Poisson distributions with various λ's
```x = 0:10
plot(x, x, type="n", xlab="x", ylab="Probability",
xlim=c(-.2, 10+.2), ylim=c(0, 0.61))
segments(x-.2, 0, x-.2, dpois(x, .5), lwd=2)
segments(x-.1, 0, x-.1, dpois(x, 1), lwd=2, col=2)
segments(x, 0, x, dpois(x, 3), lwd=2, col=3)
segments(x+.1, 0, x+.1, dpois(x, 5), lwd=2, col=4)

title(main=expression(paste("Poisson Distributions:  ",
Poisson(lambda))),
sub=expression(paste("Note:  ", Poisson(lambda),
" has mean ", lambda,
" and SD ", sqrt(lambda))))
legend(5, 0.6, lty=1, col=1:4, bty="n",
legend=c(expression(lambda == 0.5),
expression(lambda == 1),
expression(lambda == 3),
expression(lambda == 5)))
```

##### Compare Poisson and binomial distributions
```x = 0:20
plot(x, x, type="n", xlab="x", ylab="Probability",
xlim=c(-.2, 20+.2), ylim=c(0, 0.2))
segments(x-.2, 0, x-.2, dpois(x, 5), lwd=2)
segments(x-.1, 0, x-.1, dbinom(x, 100, .05), lwd=1)
segments(x+.1, 0, x+.1, dpois(x, 10), lwd=2, col=2)
segments(x+.2, 0, x+.2, dbinom(x, 100, .1), lwd=1, col=2)

title(main=expression(paste("Compare ", Poisson(lambda),
" and ", Binomial(list(n,p)),
" when ", lambda == np, " is small")))
legend(10, 0.2, lty=1, col=c(1,1,2,2), lwd=c(2,1,2,1), bty="n",
legend=c("Poisson(5)", "Binomial(100, 0.05)",
"Poisson(10)", "Binomial(100, 0.10)"))
```

##### Compare Poisson and negative binomial distributions
Negative binomial distributions are over all non-negative integers, the same as Poisson distributions. While the variance of a Poisson distribution is always the same as its mean, the variance of a negative binomial distribution is bigger than its mean. Thus the negative binomial family is useful to model count data with over-dispersion. In fact, a negative binomial distribution can be viewed as a over-dispersed Poisson distribution; it can be derived as a gamma mixture of Poisson distributions. [Note: On the other hand, the variance of a binomial distribution always is smaller than its mean.]
```x = 0:20
plot(x, x, type="n", xlab="x", ylab="Probability",
xlim=c(-.3, 20+.3), ylim=c(0, 0.2))

m = 5
## Poisson for s2 = m
segments(x-.2, 0, x-.2, dpois(x, m), lwd=2)
## negative binomial for s2 > m
s2 = 10; r=round(m^2/(s2-m)); p=m/s2
print(c(r, p))
segments(x-.1, 0, x-.1, dnbinom(x, r, p), lwd=1)

m = 10
segments(x+.1, 0, x+.1, dpois(x, m), lwd=2, col=2)
s2 = 20; r=m^2/(s2-m); p=m/s2
print(c(r, p))
segments(x+.2, 0, x+.2, dnbinom(x, r, p), lwd=1, col=2)

title(main=expression(paste("Compare ", Poisson(lambda),
" and negative binomial", (list(r,p)))))
legend(7, 0.2, lty=1, col=c(1,1,2,2), lwd=c(2,1,2,1), bty="n",
legend=c("Poisson(5): m = 5, var = 5",
"NBinomial(5, 0.5): m = 5, var = 10",
"Poisson(10): m = 10, var = 10",
"NBinomial(10, 0.5): m = 10, var = 20"))
```

#### Logistic regression

##### Link functions for binary outcome
For binary outcome, three link functions can be used: logit [log(p/(1-p))], probit [Φ-1(p), where Φ is the cdf of the standard normal distribution N(0,1)], and complementary log-log [log(-log(1-p))]. All three functions are one-to-one maps between the probability scale to the whole real line. All are arbitrary to some extent. But it is the easiest to explain parameters when using logit, while the hardest when using probit. This is probably why logistic regression is more popular than the other two. Logit and probit functions lead to symmetric risk functions: the rate of risk increase at p is as fast as the rate of risk decrease at 1 − p. Complementary log-log leads to an asymmetric risk function: the risk increases to 1 faster than it decreases to zero.

```par(mfrow=c(1,2), oma=c(4,0,1.5,0), mar=c(2, 4, 2, 1)+.1)
revbeta = pi/sqrt(6)
cllmedian = (0.577216+log(log(2)))/revbeta

curve(dlogis(x, 0, sqrt(3)/pi), -2, 2, xlab="", ylab="Density",
ylim=c(0,.5), main="Distributions")
curve(revbeta*exp(-exp(x*revbeta-0.577216))*exp(x*revbeta-0.577216),
legend(-2.7, .5, lty=1, col=1:3, bty="n",
legend=c("Logistic",
"Standard normal",
"Gumbel"))

curve(plogis(x, 0, sqrt(3)/pi), -2, 2, xlab="", ylab="Probability",
main="Corresponding CDFs")
segments(-2, 0.5, cllmedian, 0.5)
segments(c(0, cllmedian), 0, c(0, cllmedian), 0.5)
legend(-2.5, 1, lty=1, col=1:3, bty="n",
legend=c("logit", "probit",
"complementary log-log"))

mtext("Links functions in regressions for binary outcome",
side=3, outer=T, cex=1.5)
mtext("Note: All distributions have mean = 0 and SD = 1.",
mtext(paste("Note: If a link function is correct to model the effect",
"of a variable, then the right panel shows the risk as",
"a function of the z-score of the variable."),
```

##### Variation in logistic regressions
Suppose we have a variable x with values ranging from 10 to 20, and a binary outcome variable y. We can fit a logistic regression to find the best fit. The following graph illustrates some potential risk functions.
```curve(plogis((x-15)/2, 0, sqrt(3)/pi), 10, 20, xlab="",
ylab="Probability", main="Logistic Distributions")
curve(plogis((x-15)/2, 0, 1), 10, 20, add=T)
curve(plogis((x-15)/2, 0, .2), 10, 20, add=T)
curve(plogis((x-12)/2, 0, sqrt(3)/pi), 10, 20, add=T)
curve(plogis((x-12)/2, 0, 1), 10, 20, add=T)
curve(plogis((x-12)/2, 0, .2), 10, 20, add=T)
curve(plogis((x-17)/2, 0, sqrt(3)/pi), 10, 20, add=T)
curve(plogis((x-17)/2, 0, 1), 10, 20, add=T)
curve(plogis((x-17)/2, 0, .2), 10, 20, add=T)
```

The following is a simulation to demostrate the variation in fitting logistic regression. Suppose we have 4N observations; N observations have x = 12, 14, 16, and 18, respectively. Under a "real" logistic risk function, we can calculate the risks for each x value, and generate a binary outcome y using binomial sampling. Then we fit a logistic regression to the generated data and draw the fitted risk curve. We repeat this 10 times. The red curve is the "real" underlying curve.
```xx = seq(12,18,2)  # xx takes 12, 14, 16, 18
prob = plogis(xx, 15, 2)  # "real" risks
N = 20  # A total of 4N observations
curve(plogis(x, 15, 2), 10, 20, ylim=c(0,1), type="n", ylab="probability")
for(i in 1:10) {
dd = rbinom(4, N, prob)
data = data.frame(x=c(xx,xx), y=rep(1:0,each=4), w=c(dd, N-dd))
cc = glm(y ~ x, family=binomial, data=data, weights=w)\$coef
curve(plogis(x, -cc/cc, 1/cc), 10, 20, add=T)
}
curve(plogis(x, 15, 2), 10, 20, add=T, col=2, lwd=2)
```

#### Simulations

##### Simulation to demonstrate sampling variation
Suppose the population has 200 numbers and we mimic sampling 10 numbers from the population. For each sample, we calculate the sample mean and sample standard deviation and see how much they differ from the population counterparts and how much variation we have.
```x <- rnorm(200)
hist(x)
a <- matrix(,100,2)
for(i in 1:100) {
y <- sample(x, 10)
a[i, ] <- c(mean(y), sqrt(var(y)))
}
plot(a, main="Variation Due to Sampling (n=10)",
xlab="sample mean", ylab="sample standard deviation")
abline(h=sqrt(var(x)), v=mean(x))
```

##### Simulation to demonstrate the attenuation effect due to measurement errors on the input variable
```runif(20, -2, 2)
plot(x, 2+x, type="n")
abline(2, 1, lwd=2, col=2)
for(i in 1:10) {
y = 2 + x + rnorm(20, 0, 1)
x1 = x + rnorm(20, 0, 1)
lines(x1, lm(y~x1)\$fitted)
}
```

##### Simulation to compare different tests on a 2x2 table generated through two independent binomial sampling
The following program takes five arguments: sample sizes and success probabilities for two binomial distributions and the number of replicates with default value 1000. It simulates sampling from the two binomial distributions and then carrying out seven tests on the resulting 2x2 table. The output of the function is a list of seven sets of p-values. We then can compare the methods by comparing the distributions of their p-values. If you use the same success probabilities for the two distributions, you can compare type I error rates; if you use different success probabilities, you can compare power.
```twobytwo = function(n1, p1, n2, p2, REPL=1000) {
pv1 = pv2 = pv3 = pv4 = pv5 = pv6 = pv7 = NULL
x1 = rbinom(REPL, n1, p1)
x2 = rbinom(REPL, n2, p2)

for(i in 1:REPL) {
xx1 = x1[i]; xx2 = x2[i]
p1 = xx1/n1; p2 = xx2/n2; poverall = (xx1+xx2)/(n1+n2)

## Fisher's exact test
pv1[i] = fisher.test(matrix(c(xx1, n1-xx1, xx2, n2-xx2),2))\$p.value
## Pearson's chi-squared test with continuity correction
pv2[i] = chisq.test(matrix(c(xx1, n1-xx1, xx2, n2-xx2),2),
correct=T)\$p.value
## Pearson's chi-squared test without continuity correction
pv3[i] = chisq.test(matrix(c(xx1, n1-xx1, xx2, n2-xx2),2),
correct=F)\$p.value
## Test on risk difference, using normal approximation
## It is equivalent to Pearson's chi-squred test.
stat4 = abs(p1 - p2)/sqrt(poverall*(1-poverall)*(1/n1 + 1/n2))
pv4[i] = 2 * (1 - pnorm(stat4))
## Test on risk ratio, using normal approximation
stat5 = abs(log(p1/p2))/sqrt(1/xx1 - 1/n1 + 1/xx2 - 1/n2)
pv5[i] = 2 * (1 - pnorm(stat5))
## Test on odds ratio, using normal approximation
## It is equivalent to logistic regression.
stat6 = abs(log(p1*(1-p2)/(p2*(1-p1)))) /
sqrt(1/xx1+1/(n1-xx1)+1/xx2+1/(n2-xx2))
pv6[i] = 2 * (1 - pnorm(stat6))
## Likelihood ratio test
stat7 = 2*(xx1*log(p1) + (n1-xx1)*log(1-p1) +
xx2*log(p2) + (n2-xx2)*log(1-p2) -
(xx1+xx2)*log(poverall) - (n1+n2-xx1-xx2)*log(1-poverall))
pv7[i] = 1-pchisq(stat7, 1)
}

data.frame(pv1=pv1, pv2=pv2, pv3=pv3, pv4=pv4, pv5=pv5, pv6=pv6, pv7=pv7)
}

aa = twobytwo(100, .1, 100, .1, REPL=10000)
plot(aa)
plot(aa, xlim=c(0,.05), ylim=c(0,.05))

par(mfrow=c(2,3))
plot(ecdf(aa\$pv1), xlim=c(0,.05), ylim=c(0,.05)); abline(0,1)
plot(ecdf(aa\$pv2), xlim=c(0,.05), ylim=c(0,.05)); abline(0,1)
plot(ecdf(aa\$pv3), xlim=c(0,.05), ylim=c(0,.05)); abline(0,1)
plot(ecdf(aa\$pv5), xlim=c(0,.05), ylim=c(0,.05)); abline(0,1)
plot(ecdf(aa\$pv6), xlim=c(0,.05), ylim=c(0,.05)); abline(0,1)
plot(ecdf(aa\$pv7), xlim=c(0,.05), ylim=c(0,.05)); abline(0,1)
```

##### Simulation to evaluate Mantel-Haenszel method on stratified binary data
For stratified binary data, we may reasonably think the baseline risk (baseline denoted as A) may be different for different subjects. We also may assume the effect of B vs. A as measured by odds ratio is the same for all subjects. This is a pretty strong assumption to start with. On the basis of this assumption, we use Mantel-Haenszel method to estimate that common odds ratio. We can use simulations to evaluate the performance of Mantel-Haenszel method when the assumption is correct.

Suppose the real distribution of baseline (A) success probabilities is Beta(alpha, beta) and the subject-specific odds ratio, OR, of B vs A is a constant for all subjects. We simulation N pairs of data.
```matched = function(alpha, beta, OR, N, REPL=1000) {
pA = alpha/(alpha + beta)
fn = function(x) (1-1/(1+OR*x/(1-x)))*dbeta(x, alpha, beta)
pB = integrate(fn, 0, 1, subdivisions=1000)\$value
message("Marginal OR of B vs. A is ", pB*(1-pA)/(pA*(1-pB)))

pv1 = pv2 = rd = ornaive = ormh = numeric(REPL)
for(i in 1:REPL) {
## generate data consisting of N subjects
ppA = rbeta(N, alpha, beta)
dataA = rbinom(N, 1, ppA)
ppB = 1-1/(1+OR*ppA/(1-ppA))
dataB = rbinom(N, 1, ppB)
## create a vector of four: (A+, B+), (A-, B+),
## (A+, B-), and (A-, B-).
data = as.numeric(table(1-dataA, 1-dataB))

## estimate of risk difference
rd[i] = data-data
## McNemar's test of risk difference
pv1[i] = 1-pchisq(rd[i]^2/sum(data[2:3]), 1)

## Naive estimate of odds ratio: (pB+ * pA-)/(pB- * pA+)
ornaive[i] = sum(data[1:2])*sum(data[c(2,4)]) /
(sum(data[3:4])*sum(data[c(1,3)]))
## Mantel-Haenszel's estimate of odds ratio
ormh[i] = data/data
## Wald test based on Mantel-Haenszel's estimate of odds ratio
pv2[i] = 2*pnorm(-abs(log(ormh[i]))/sqrt(1/data+1/data))
}

return(data.frame(pv1=pv1, pv2=pv2,
rd=rd, ornaive=ornaive, ormh=ormh))
}

aa = matched(8, 2, 4, 100)
#aa = matched(1, 1, 4, 100)
#aa = matched(8, 2, 2, 100)
#aa = matched(1, 1, 2, 100)
mean(aa\$ornaive, na.rm=T); median(aa\$ornaive, na.rm=T)
mean(aa\$ormh, na.rm=T); median(aa\$ormh, na.rm=T)
```
The results suggest that the Mantel-Haenszel method gives a good estimate, with median of the estimate near to real value. The mean of the estimate tends to be bigger because the distribution of the estimate is skewed to the right.

Topic revision: r44 - 16 Apr 2007, ChunLi

• Biostatistics Webs Copyright © 2013-2020 by the contributing authors. All material on this collaboration platform is the property of the contributing authors.
Ideas, requests, problems regarding Vanderbilt Biostatistics Wiki? Send feedback