# # Regression models - review # Exercises # # 1. Explaining weight according to height # # Suppose that only women are involved, and that they all have normal weight # Generating data at random, with a normal distribution # n <- 100 height.w <- rnorm(n,mean=1.70,sd=0.1) weight.h <- (height.w-1.10)*100 + rnorm(n,mean=5,sd=5) # Now fitting linear regression model model.w <- lm( weight.h ~ height.w) summary(model.w) # Compare the estimates with the model used to generate the data # Do they correspond? # # Now make graphs to check the model assumptions and analyse results # First, observed data and fitted values plot(height.w,weight.h) lines(height.w,model.w$fitted.values) # Checking normality assumption hist(model.w$residuals) # Now to check that there are no further variables left to explain the data, # and that the linear assumption gives a reasonable representation of the relation between weight and height, # we examine the plot of predicted values and residuals plot(model.w$fitted.values,model.w$residuals) # As expected from a model describing well the data variability, # the residuals do not show a pattern associated with the fitted values. # # In the model above, what does the intercept mean? # Does it make sense having an intercept in the model? # Compare the fit above with the fit without intercept: model.w2 <- lm( weight.h ~ height.w-1) summary(model.w2) # # Which model fit, model.w or modelw.2, represents the data best? # Have a look at the Multiple R-square # Which one of the two is easier to interpret? # # # Make a graph of observed and fitted models in both cases # # par(mfrow=c(1,2)) plot(height.w,weight.h) lines(height.w,model.w$fitted) plot(height.w,weight.h) lines(height.w,model.w2$fitted) # So, eventhough model.w2 has a larger R-square than model.w, # model.w clearly describes the data better. # # 2. Explaining height according to gender # # First, generate the data - 20 observations for males, then 20 for females height <- c( rnorm( 20 , mean=1.8, sd=0.1), rnorm( 20 , mean=1.7, sd=0.1) ) gender <- c(rep(1,20),rep(2,20)) f.gender <- factor(gender) # # Now fit linear model # First, we use gender as a continuous variable # Here the model includes an intercept by default model1 <- lm( height ~ gender ) summary(model1) # # Now use the factor f.gender as explanatory variable # model2 <- lm( height ~ f.gender ) summary(model2) # # Note that the model fits 1 and 2 are exactly the same, since the data is the same; # however, the parameter estimates for the intercept are different # Check out the anova tables anova(model1) anova(model2) # Because both model1 and model2 only have one variable, the p-value for that # variable obtained with the t-test (see model summary) is exactly the same as the # one obtained with the F-test in the anova table # # What does the intercept represent in model2? # # To fit the model with the factor without the intercept we use model3 <- lm( height ~ f.gender-1) summary(model3) # Note that both levels of the factor are now explicit in the output given by "summary" # # Note once again that, because gender has only two levels, if we transform its values from 1,2 # to 0,1, we obtain exactly the same model fit, including intercept estimate, # as when we use f.gender g1 <- gender-1 model4 <- lm(height ~ g1) summary(model4) # review model2 summary(model2) # # Now let us analyse results for model2 # # Histograms would in principle be interesting to study the dist. of height per level of f.gender, # but in this case there are not enough observations par(mfrow=c(1,2)) hist(height[gender==1]) hist(height[gender==2]) # You may wish to change the titles of the histograms par(mfrow=c(1,2)) hist(height[gender==1],main="Height for males") hist(height[gender==2],main="Height for females") # You may also suppress the axis titles, since they do not add information par(mfrow=c(1,2)) hist(height[gender==1],main="Height for males",xlab="") hist(height[gender==2],main="Height for females",xlab="") # Fixing scale to be the same in both graphs min.h <- min(height)-0.1 max.h <- max(height)+0.1 par(mfrow=c(1,2)) hist(height[gender==1],main="Height for males",xlab="",xlim=c(min.h,max.h)) hist(height[gender==2],main="Height for females",xlab="",xlim=c(min.h,max.h)) # We draw box-plots for each level of gender - they suit this situation better par(mfrow=c(1,2)) boxplot(height[gender==1]) boxplot(height[gender==2]) # adding titles par(mfrow=c(1,2)) boxplot(height[gender==1],main="Height distribution",xlab="Males") boxplot(height[gender==2],main="Height distribution",xlab="Females") # Fixing scale to be the same on both graphs par(mfrow=c(1,2)) boxplot(height[gender==1],main="Height distribution",xlab="Males",ylim=c(min.h,max.h)) boxplot(height[gender==2],main="Height distribution",xlab="Females",ylim=c(min.h,max.h)) # We can see that there is considerable overlap between the two distributions, # and that is why the linear regression model seems unable to find the existing gender effect # In other words, there is more variability of height within each gender, than between genders # # Graphs of predicted values with observed data, and predicted values vs. residuals # plot(gender,height) lines(gender,model2$fitted.values) # It becomes clear from this graph that, when the explanatory variable under study is categorical, # it is more difficult to interpret the graph # # 3. Multivariate linear models # height, weight and gender simultaneously # gender3 <- c(rep(0,30),rep(1,30)) f.gender3 <- factor(gender3) height3 <- rnorm( length(gender3) , mean=(1.7+gender3*0.1), sd=0.1) weight3 <- (height3-1.10)*100 + rnorm(length(height3),mean=5,sd=5) # model3.a <- lm( weight3 ~ gender3 + height3 ) model3.b <- lm( weight3 ~ f.gender3 + height3 ) summary(model3.a) summary(model3.b) # # What does the intercept represent in model3.b? # # Because of the coding and number of levels used for gender, when "gender" is used the same # model fit as with f.gender is yielded # # Now produce diagnostic plots for one of these regression fits # # ANOVA tables are also the same for both model fits anova(model3.a) anova(model3.b) # # # 4. Multivariate linear models # # height as function of gender and age group, the latter with three classes # age3=1 represents 18-50 y.o.; age3=2 represents 51-64 y.o.; age3=3 represents 65+ y.o. # It is a good idea to get used to declaring as factors all variables that are # categorical, even if in this particular case the result would not be affected by # not doing so. So, here we declare both age3 and gender3 as factors. # age3 <- rep(1:3,length(gender3)/3) f.age <- factor(age3) f.gender <- factor(gender3) height4 <- rnorm( length(age3), mean=(1.7+gender3*0.1)*(0.9-age3/40), sd=0.1 ) # model4 <- lm( height4 ~ f.age + f.gender ) summary(model4) anova(model4) # Note that the model summary gives one p-value per factor level, # whereas the ANOVA table gives one p-value for the entire factor # Often what you are interested in testing is the effect of the # factor as a whole, so you wish to use the p-value obtained from the ANOVA table # You may also extract elements of the ANOVA table and store them # in variables - this will be useful when handling one regression per gene! # Check anova(model4)[[1]] anova(model4)[[2]] anova(model4)[[3]] anova(model4)[[4]] anova(model4)[[5]] # The last one extracts the p-values # You may also extract one specific p-value using the order of the variables in the model # For the first variable, in this case f.age anova(model4)[[5]][1] # and for the second, f.gender anova(model4)[[5]][2] # # What does the intercept mean in model4? # # Compare the p-values from the anova table with those from the summary of the fit # What would you say about the p-values corresponding to levels 2 and 3 of f.age # in the model fit summary, and the p-value for f.age given by the anova table? # # Produce diagnostic plots for the regression fit of model4