########################################################################### # R code to explore the geometric, negative binomial, # gamma, beta, bivariate normal, and lognormal # parametric families of distributions # discrete case: ##### geometric distribution: # for 0 < p < 1, X ~ Geometric( p ) has PMF # f_X ( x ) = p ( 1 - p )^x for x = 0, 1, ... and 0 otherwise # recall that this distribution arises when you're observing # an IID Bernoulli (success/failure) process with success probability p # and X counts the number of failures until the first success p <- 0.9 cbind( 0:10, dgeom( 0:10, p ) ) par( mfrow = c( 2, 1 ) ) plot( 0, 0, xlab = 'x', ylab = 'Probability', type = 'n', xlim = c( -0.5, 3.5 ), ylim = c( 0, 1 ), main = 'Geometric PMF with p = 0.9 (spike plot)' ) for ( i in 0:5 ) { segments( i, 0, i, dgeom( i, p ), lwd = 2, col = 'red' ) } seed <- 1 set.seed( seed ) x.star.0.9 <- rgeom( 10000, 0.9 ) table( x.star.0.9 ) # x.star.0.9 # 0 1 2 3 # 8979 926 88 7 hist( x.star.0.9, prob = T, breaks = ( 0:4 ) - 0.5, xlab = 'x', ylab = 'Probability', ylim = c( 0, 1 ), main = 'Geometric PMF with p = 0.9 (histogram)' ) par( mfrow = c( 3, 1 ) ) hist( x.star.0.9, prob = T, breaks = ( 0:30 ) - 0.5, xlab = 'x', ylab = 'Probability', ylim = c( 0, 1 ), main = 'Geometric PMF with p = 0.9 (histogram)' ) x.star.0.5 <- rgeom( 10000, 0.5 ) table( x.star.0.5 ) # x.star.0.5 # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 # 4999 2461 1222 650 339 174 75 33 20 16 3 4 2 2 hist( x.star.0.5, prob = T, breaks = ( 0:30 ) - 0.5, xlab = 'x', ylab = 'Probability', ylim = c( 0, 1 ), main = 'Geometric PMF with p = 0.5 (histogram)' ) x.star.0.1 <- rgeom( 10000, 0.1 ) table( x.star.0.1 ) hist( x.star.0.1[ x.star.0.1 <= 29 ], prob = T, breaks = ( 0:30 ) - 0.5, xlab = 'x', ylab = 'Probability', ylim = c( 0, 1 ), main = 'Geometric PMF with p = 0.1 (histogram)' ) par( mfrow = c( 1, 1 ) ) ##### negative binomial distribution: # for 0 < p < 1, X ~ NegativeBinomial( n, p ) has PMF # f_X ( x ) = choose( n + x - 1, x ) p^n ( 1 - p )^x # for x = 0, 1, ... and 0 otherwise # recall that this distribution arises when you're observing # an IID Bernoulli (success/failure) process with success probability p # and X counts the number of failures until the nth success # so the Geometric( p ) distribution is a special case # of the Negative Binomial distribution with n = 1 x.star.10.0.9 <- rnbinom( 10000, 10, 0.9 ) table( x.star.10.0.9 ) # x.star.10.0.9 # 0 1 2 3 4 5 6 7 8 # 3547 3467 1961 707 235 60 16 4 3 hist( x.star.10.0.9, prob = T, breaks = ( 0:9 ) - 0.5, xlab = 'x', ylab = 'Probability', ylim = c( 0, 0.4 ), main = 'Negative Binomial PMF with ( n, p ) = ( 10, 0.9 )' ) x.star.10.0.5 <- rnbinom( 10000, 10, 0.5 ) table( x.star.10.0.5 ) hist( x.star.10.0.5, prob = T, breaks = ( 0:38 ) - 0.5, xlab = 'x', ylab = 'Probability', ylim = c( 0, 0.4 ), main = 'Negative Binomial PMF with ( n, p ) = ( 10, 0.5 )' ) set.seed( 1 ) x.star.10.0.1 <- rnbinom( 10000, 10, 0.1 ) table( x.star.10.0.1 ) hist( x.star.10.0.1, prob = T, breaks = ( 0:256 ) - 0.5, xlab = 'x', ylab = 'Probability', ylim = c( 0, 0.4 ), main = 'Negative Binomial PMF with ( n, p ) = ( 10, 0.1 )' ) par( mfrow = c( 3, 1 ) ) hist( x.star.10.0.9, prob = T, breaks = ( 0:256 ) - 0.5, xlab = 'x', ylab = 'Probability', ylim = c( 0, 0.4 ), main = 'Negative Binomial PMF with ( n, p ) = ( 10, 0.9 )' ) hist( x.star.10.0.5, prob = T, breaks = ( 0:256 ) - 0.5, xlab = 'x', ylab = 'Probability', ylim = c( 0, 0.4 ), main = 'Negative Binomial PMF with ( n, p ) = ( 10, 0.5 )' ) hist( x.star.10.0.1, prob = T, breaks = ( 0:256 ) - 0.5, xlab = 'x', ylab = 'Probability', ylim = c( 0, 0.4 ), main = 'Negative Binomial PMF with ( n, p ) = ( 10, 0.1 )' ) par( mfrow = c( 1, 1 ) ) # continuous case: ##### bivariate normal density for galton's father-son height data # install CRAN package 'mvtnorm' if not already installed # install.packages( 'mvtnorm' ) library( mvtnorm ) n.grid <- 100 # fathers had ( mean, sd ) = ( 67, 2.5 ) x.grid <- seq( 67 - 3 * 2.5, 67 + 3 * 2.5, length = n.grid ) # sons had ( mean, sd ) = ( 68, 2.5 ) y.grid <- seq( 68 - 3 * 2.5, 68 + 3 * 2.5, length = n.grid ) # correlation was +0.7 Sigma <- matrix( c( 2.5^2, 0.7 * 2.5 * 2.5, 0.7 * 2.5 * 2.5, 2.5^2 ), 2, 2 ) f.grid <- matrix( NA, n.grid, n.grid ) for ( i in 1:n.grid ) { for ( j in 1:n.grid ) { f.grid[ i, j ] <- dmvnorm( c( x.grid[ i ], y.grid[ j ] ), c( 67, 68 ), Sigma ) } } par( mfrow = c( 1, 1 ) ) contour( x.grid, y.grid, f.grid, nlevels = 50, col = 'red', xlab = 'x', ylab = 'y' ) par( mfrow = c( 2, 1 ) ) contour( x.grid, y.grid, f.grid, nlevels = 50, col = 'red', xlab = 'x', ylab = 'y' ) galton.sample <- rmvnorm( 1000, c( 67, 68 ), Sigma ) plot( galton.sample, xlim = c( 67 - 3 * 2.5, 67 + 3 * 2.5 ), ylim = c( 68 - 3 * 2.5, 68 + 3 * 2.5 ), xlab = 'Height of Father', ylab = 'Height of Son' ) par( mfrow = c( 1, 1 ) ) library( plotly ) f.grid.t <- t( f.grid ) interactive.bivariate.density.perspective.plot <- plot_ly( x = x.grid, y = y.grid, z = f.grid.t ) %>% add_surface( contours = list( z = list( show = T, usecolormap = T, highlightcolor = '#ff0000', project = list( z = T ) ) ) ) %>% layout( title = 'Perspective plot of the bivariate density', scene = list( xaxis = list( title = 'x' ), yaxis = list( title = 'y' ), zaxis = list( title = 'f' ), camera = list( eye = list( x = 1.5, y = 0.5, z = -0.5 ) ) ) ) %>% colorbar( title = 'Bivariate Density' ) interactive.bivariate.density.perspective.plot ##### lognormal distribution: # Normal( 0, 1 ) (X) versus Lognormal( 0, 1 ) (Y = exp( X )) par( mfrow = c( 1, 2 ) ) x.grid <- seq( -3, 3, length = 500 ) plot( x.grid, dnorm( x.grid ), type = 'l', xlab = 'x', ylab ='PDF of X', lwd = 2, col = 'black', main = 'Normal( 0, 1 )' ) y.grid.1 <- seq( 0, 10, length = 500 ) plot( y.grid.1, dlnorm( y.grid.1 ), type = 'l', xlab = 'y', ylab ='PDF of Y', lwd = 2, col = 'red', main = 'Lognormal( 0, 1 )' ) par( mfrow = c( 1, 1 ) ) # Hold original-scale SD at 1, vary original-scale mean from 0 to 1 to 2 par( mfrow = c( 3, 1 ) ) y.grid.2 <- seq( 0, 25, length = 500 ) plot( y.grid.2, dlnorm( y.grid.2 ), type = 'l', xlab = 'y', ylab ='PDF of Y', lwd = 2, col = 'red', main = 'Lognormal( 0, 1 )' ) plot( y.grid.2, dlnorm( y.grid.2, 1, 1 ), type = 'l', xlab = 'y', ylab ='PDF of Y', lwd = 2, col = 'red', main = 'Lognormal( 1, 1 )' ) plot( y.grid.2, dlnorm( y.grid.2, 2, 1 ), type = 'l', xlab = 'y', ylab ='PDF of Y', lwd = 2, col = 'red', main = 'Lognormal( 2, 1 )' ) par( mfrow = c( 1, 1 ) ) # Hold original-scale mean at 0, vary original-scale SD from 1 to 2 to 3 par( mfrow = c( 3, 1 ) ) y.grid.3 <- seq( 0, 4, length = 500 ) plot( y.grid.3, dlnorm( y.grid.3, 0, 1 ), type = 'l', xlab = 'y', ylab ='PDF of Y', lwd = 2, col = 'red', main = 'Lognormal( 0, 1 )' ) plot( y.grid.3, dlnorm( y.grid.3, 0, 2 ), type = 'l', xlab = 'y', ylab ='PDF of Y', lwd = 2, col = 'red', main = 'Lognormal( 0, 2 )' ) plot( y.grid.3, dlnorm( y.grid.3, 0, 3 ), type = 'l', xlab = 'y', ylab ='PDF of Y', lwd = 2, col = 'red', main = 'Lognormal( 0, 3 )' ) par( mfrow = c( 1, 1 ) ) #########################################################################