This data set contains real data, published by Cowles and Davis (1987). It is freely available on the Internet. The following fields are available:
Variable | Description |
---|---|
subject number | All participants are numbered consecutively |
neuroticism | Scale from the Eysenck personality inventory |
extraversion | Scale from the Eysenck personality inventory |
sex | Female or male |
volunteer | Did the subject volunteer for a study? |
What does the willingness to participate in a study depend on?
library(ggplot2)
library(dplyr)
data = read.csv('cowles.csv')
summary(data)
## X neuroticism extraversion sex volunteer
## Min. : 1 Min. : 0.00 Min. : 2.00 female:780 no :824
## 1st Qu.: 356 1st Qu.: 8.00 1st Qu.:10.00 male :641 yes:597
## Median : 711 Median :11.00 Median :13.00
## Mean : 711 Mean :11.47 Mean :12.37
## 3rd Qu.:1066 3rd Qu.:15.00 3rd Qu.:15.00
## Max. :1421 Max. :24.00 Max. :23.00
Convert the dependent variable `volunteer` to `numeric` and subtract one, so that 'no' becomes 0 and 'yes' becomes 1.
data$volunteer = (data$volunteer=='yes')+0 # TRUE/FALSE coerced to 1/0; equivalent to as.numeric(data$volunteer)-1 for a no/yes factor
Plot `volunteer` against each predictor using `stat_summary()`.
# Solutions
#data = data[,!(colnames(data) %in% 'X')] # only needed if you reload the very original data
# One-Way Interactions
ggplot(data,aes(x=sex,y=volunteer))+
stat_summary()
## No summary function supplied, defaulting to `mean_se()`
ggplot(data,aes(x=extraversion,y=volunteer))+
stat_summary()
## No summary function supplied, defaulting to `mean_se()`
## Warning: Removed 1 rows containing missing values (geom_pointrange).
ggplot(data,aes(x=neuroticism,y=volunteer))+
stat_summary()
## No summary function supplied, defaulting to `mean_se()`
## Warning: Removed 1 rows containing missing values (geom_pointrange).
# Two-way interactions with sex
ggplot(data,aes(x=extraversion,y=volunteer,color=sex))+
stat_summary()
## No summary function supplied, defaulting to `mean_se()`
## Warning: Removed 2 rows containing missing values (geom_pointrange).
ggplot(data,aes(x=neuroticism,y=volunteer,color=sex))+
stat_summary()
## No summary function supplied, defaulting to `mean_se()`
## Warning: Removed 2 rows containing missing values (geom_pointrange).
mres = glm(family=binomial,data=data,formula=volunteer~extraversion+neuroticism+sex)
sum(resid(mres, type = "pearson")^2) / df.residual(mres) # should be 1
## [1] 1.003463
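As a rough cross-check (an added sketch, not part of the original solutions), the ratio of residual deviance to residual degrees of freedom should also be close to 1 for a well-specified binomial model:
mres$deviance / df.residual(mres) # deviance-based dispersion estimate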
Store the residuals of the model in `data$resid` and plot them against extraversion and neuroticism. Add `geom_smooth()` and interpret the plot. Are there any non-linearities left to be modeled? What do the plots tell you about the model fit?
data$resid = residuals(mres) # deviance residuals by default
ggplot(data,aes(x=extraversion,y=resid))+
geom_point()+
geom_smooth()
## `geom_smooth()` using method = 'gam'
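The analogous plot against neuroticism (not shown in the original output) can be produced the same way:
ggplot(data,aes(x=neuroticism,y=resid))+
geom_point()+
geom_smooth()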
ra = diff(range(data$extraversion))
print(ra)
## [1] 21
coeffs = broom::tidy(mres)
print(coeffs)
## term estimate std.error statistic p.value
## 1 (Intercept) -1.116496350 0.24905747 -4.4828863 7.364016e-06
## 2 extraversion 0.066325404 0.01425996 4.6511619 3.300700e-06
## 3 neuroticism 0.006361871 0.01135727 0.5601585 5.753713e-01
## 4 sexmale -0.235161439 0.11118454 -2.1150552 3.442524e-02
exp(coeffs$estimate[coeffs$term=="extraversion"]*ra)
## [1] 4.026242
"No, they don't depend because we do not have an interaction modeled yet. The relative risks would depend though!"
## [1] "No, they don't depend because we do not have an interaction modeled yet. The relative risks would depend though!"
#logit <=> log(odds); the neuroticism term is held constant and cancels in the difference
"logit(y(extra=2)) = b0 + b1*2 + b2*is_male"
## [1] "logit(y(extra=2)) = b0 + b1*2 + b2*is_male"
"logit(y(extra=23)) = b0 + b1*23 + b2*is_male"
## [1] "logit(y(extra=23)) = b0 + b1*23 + b2*is_male"
"b0 + b1*23 + b2*is_male - (b0 + b1*2 + b2*is_male)"
## [1] "b0 + b1*23 + b2*is_male - (b0 + b1*2 + b2*is_male)"
"b0 + b1*23 + b2*is_male - b0 - b1*2 - b2*is_male"
## [1] "b0 + b1*23 + b2*is_male - b0 - b1*2 - b2*is_male"
"b1*21"
## [1] "b1*21"
# the odds ratio over the full extraversion range is exp(b1*21)
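Since exp(b1*21) ≈ 4.03, the odds of volunteering are about four times higher at the maximal observed extraversion (23) than at the minimal one (2), for either sex. A minimal numerical check of the claim above, as a sketch: it holds neuroticism at its sample mean, and `invlogit_` and `p_vol` are helper names introduced here.
invlogit_ = function(x) 1/(1 + exp(-x))
p_vol = function(extra, male) {
b = coef(mres)
invlogit_(b["(Intercept)"] + b["extraversion"]*extra +
b["neuroticism"]*mean(data$neuroticism) + b["sexmale"]*male)
}
p_vol(23, male=0) / p_vol(2, male=0) # relative risk for women
p_vol(23, male=1) / p_vol(2, male=1) # relative risk for men
# the odds ratio is identical for both groups; these probability ratios are not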
Compute the model predictions with the `predict` function from R.
data$predict = predict(mres) # predictions on the link (logit) scale
ggplot(data=data,aes(x=extraversion,y=predict))+
geom_point()
# Add some color to explain main effect
ggplot(data=data,aes(x=extraversion,y=predict,colour=sex))+
geom_point()
- Convert the outputs to probabilities through the inv-logit function:
invlogit = function(x) 1/(1 + exp(-x))
# equivalent form: invlogit = function(x) exp(x)/(1 + exp(x))
data$predictProb = invlogit(data$predict)
# Alternative: data$predictProb = predict(mres,type="response")
ggplot(data=data,aes(x=extraversion,y=predictProb,colour=sex))+
geom_point()
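A quick sanity check on the transformation (a small added sketch):
invlogit(0) # 0.5: a logit of zero corresponds to a 50% probability
range(data$predictProb) # all predicted probabilities lie strictly between 0 and 1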
- Predict values for a new data frame using the `predict(..., newdata=XXX)` command. Apply the `invlogit` function only for plotting (you can do `aes(x=XXX,y=invlogit(predict))`):
df.predict = data.frame(expand.grid(neuroticism = seq(XXX),sex=c(XX,XX),extraversion=seq(XXX)))
df.predict$XXX = predict(XXX,XXX)
df.predict = data.frame(expand.grid(neuroticism = seq(0,24),sex=c('male','female'),extraversion=seq(2,23)))
df.predict$predict = predict(mres,newdata = df.predict)
ggplot(df.predict,aes(x=extraversion,y=invlogit(predict)))+
geom_point()
- Combine the predicted data frame with your original data frame. A convenient way is to use a function from the dplyr package:
combined = dplyr::bind_rows(a,b,c,.id="type")
# Note: 'dplyr' is the successor to the 'plyr' package, and the two are partly incompatible.
# If you also use 'plyr', access single dplyr functions via the `package::function` syntax instead of loading the whole package.
combined = dplyr::bind_rows(data,df.predict,.id="type")
ggplot(data=combined,aes(x=extraversion,y=invlogit(predict),group=type,color=type))+
stat_summary()
## No summary function supplied, defaulting to `mean_se()`
## Warning: Removed 1 rows containing missing values (geom_pointrange).
- We want to show that we can extrapolate from seen data: change the range of predicted values for extraversion to 0 to 35 (even though in this example the maximal extraversion score seems to be 23). Plot the predicted values again. This could of course also be used to interpolate between scores (i.e. predict for the non-existing score of 10.5) by reducing the step size in `seq`; see the sketch after the plot below.
df.predict2 = data.frame(expand.grid(neuroticism = seq(0,24),sex=c('male','female'),extraversion=seq(0,35)))
df.predict2$predict = predict(mres,newdata = df.predict2)
ggplot(df.predict2,aes(x=extraversion,y=invlogit(predict)))+
geom_point()
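As mentioned above, interpolation only requires a finer grid. A sketch (`df.predict3` is a name introduced here):
df.predict3 = data.frame(expand.grid(neuroticism = seq(0,24),sex=c('male','female'),extraversion=seq(0,35,by=0.5)))
df.predict3$predict = predict(mres,newdata = df.predict3) # now includes half-point scores such as 10.5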
Now fit a second model that adds an interaction between extraversion and neuroticism, and compare it to the first model with a likelihood-ratio test:
mres2 = glm(family=binomial,data=data,formula=volunteer~extraversion*neuroticism+sex)
anova(mres,mres2,test="LRT")
## Analysis of Deviance Table
##
## Model 1: volunteer ~ extraversion + neuroticism + sex
## Model 2: volunteer ~ extraversion * neuroticism + sex
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 1417 1906.1
## 2 1416 1897.4 1 8.6213 0.003323 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The interaction significantly improves the fit (p ≈ 0.003), so we keep it. Reuse the `df.predict` data frame where you used `expand.grid`, but apply it to the new model with the interaction. Plot `extraversion` against the predictions, but split by color using `aes(..., color=factor(neuroticism))`.
df.predict$predictFull = predict(mres2,newdata=df.predict)
ggplot(df.predict,aes(x=extraversion,
y=invlogit(predictFull),
color=factor(neuroticism)))+
geom_point()
- Who is the most likely candidate to volunteer? Who is the least likely one? What is the relative risk of being very high on extraversion and very low on neuroticism against very high on extraversion and very high on neuroticism? (A sketch answering this follows the plot below.) Compare the model predictions against the raw data using `geom_smooth` and `facet_wrap`:
ggplot(df.predict,aes(x=extraversion,
y=invlogit(predictFull)))+
geom_point()+
geom_path()+
geom_point(data=data,aes(y=volunteer),color='red')+
geom_smooth(data=data,aes(y=volunteer),color='red')+
facet_wrap(~neuroticism)+scale_y_continuous(limits=c(0,1))
## `geom_smooth()` using method = 'loess'
## (Repeated `simpleLoess`/`predLoess` warnings truncated: the loess span is too small and a pseudoinverse was used, because some neuroticism facets contain very few observations.)
## Warning: Removed 100 rows containing missing values (geom_smooth).
This concludes the logistic regression exercise. Look back at the initial question and see whether you learned something!