Chapter 10 Categorical Dependent Variables and Survival Models
10.1 Import Data
#yogurtbasic<-read.table(choose.files(), header=TRUE, sep="\t")
#library(Ecdat)# You need to install package 'Ecdat' for the data 'Yogurt'.
#data(Yogurt) #the data used in this Chapter.
#yogurtdata<-Yogurt
#now we need to modify the dataset
colnames(yogurtdata) = c("id","fy","fd","fh","fw","py","pd","ph","pw","choice")
yogurtdata$yoplait<-(yogurtdata$choice=="yoplait")
yogurtdata$dannon<-(yogurtdata$choice=="dannon")
yogurtdata$hiland<-(yogurtdata$choice=="hiland")
yogurtdata$weight<-(yogurtdata$choice=="weight")
10.2 Chap11Yogurt2013.R
yogurtdata<-read.csv("TXTData/yogurt.dat", header=F, sep=" ")
colnames(yogurtdata) = c("id","yoplait","dannon","weight","hiland","fy","fd","fw","fh","py","pd","pw","ph")
10.3 Table 11.2 Number of Choices
yogurtdata$occasion<-seq(yogurtdata$id)
yogurtdata$TYPE<-1*yogurtdata$yoplait+2*yogurtdata$dannon+3*yogurtdata$weight+4*yogurtdata$hiland
yogurtdata$PRICE<-yogurtdata$py*yogurtdata$yoplait + yogurtdata$pd*yogurtdata$dannon + yogurtdata$pw*yogurtdata$weight + yogurtdata$ph*yogurtdata$hiland
yogurtdata$FEATURE<-yogurtdata$fy*yogurtdata$yoplait + yogurtdata$fd*yogurtdata$dannon + yogurtdata$fw*yogurtdata$weight + yogurtdata$fh*yogurtdata$hiland
table(yogurtdata$TYPE)
1 2 3 4
818 970 553 71
summary(yogurtdata[, c("fy", "fd", "fw", "fh")])[4,]
fy fd fw
"Mean :0.05597 " "Mean :0.03773 " "Mean :0.03773 "
fh
"Mean :0.0369 "
Table 11.2 shows that Yoplait was the most frequently selected (33.9%) type ofyogurt in our sample whereas Hiland was the least frequently selected (2.9%). Yoplait was also the most heavily advertised, appearing in newspaper advertisements 5.6% of the time that the brand was chosen.
10.3.1 Table 11.2 Basic summary statistics for prices
t(summary(yogurtdata[, c("py", "pd", "pw", "ph")]))
py Min. :0.0030 1st Qu.:0.1030 Median :0.1080
pd Min. :0.01900 1st Qu.:0.08100 Median :0.08600
pw Min. :0.00400 1st Qu.:0.07900 Median :0.07900
ph Min. :0.02500 1st Qu.:0.05000 Median :0.05400
py Mean :0.1068 3rd Qu.:0.1150 Max. :0.1930
pd Mean :0.08163 3rd Qu.:0.08600 Max. :0.11100
pw Mean :0.07949 3rd Qu.:0.08600 Max. :0.10400
ph Mean :0.05363 3rd Qu.:0.06100 Max. :0.08600
sd(as.matrix(yogurtdata[, c("py")]))
[1] 0.01906265
sd(as.matrix(yogurtdata[, c("pd")]))
[1] 0.01062886
sd(as.matrix(yogurtdata[, c("pw")]))
[1] 0.007735004
sd(as.matrix(yogurtdata[, c("ph")]))
[1] 0.00805391
Table 11.3 shows that Yoplait was also the most expensive, costing 10.7 cents per ounce, on average. Table 11.3 also shows that there are several prices that were far below the average, suggesting some potential influential observations.
10.3.2 vissualize the data
boxplot(PRICE~TYPE, range=0, data=yogurtdata, boxwex=0.5, border="red", yaxt="n", xaxt="n", ylab="")
axis(2, at=seq(0,0.125, by=0.025), las=1, font=10, cex=0.005, tck=0.01)
axis(1, at=seq(1,4, by=1), font=10, cex=0.005, tck=0.01)
mtext("Price", side=2, adj=-1, line=5, at=0.135, font=10, las=1)
mtext("Type", side=1, adj=0, line=3, at=2.3, font=10)
box()
10.3.3 Note the small relationships among prices
cor(yogurtdata[, c("py", "pd", "pw", "ph")])
py pd pw ph
py 1.00000000 0.03201738 0.1538099 -0.01844819
pd 0.03201738 1.00000000 0.2428201 -0.04349290
pw 0.15380986 0.24282008 1.0000000 -0.02755800
ph -0.01844819 -0.04349290 -0.0275580 1.00000000
plot(pw~pd, data=yogurtdata, yaxt="n", xaxt="n", ylab="", xlab="")
axis(2, at=seq(0.00, 0.20, by=0.01), las=1, font=10, cex=0.005, tck=0.01)
axis(2, at=seq(0.00, 0.20, by=0.002),lab=F, tck=0.005)
axis(1, at=seq(0.01, 0.12, by=0.01), font=10, cex=0.005, tck=0.01)
axis(1, at=seq(0.01, 0.12, by=0.002), lab=F, tck=0.005)
mtext("pw", side=2, line=1, at=0.11, las=1, font=10)
mtext("pd", side=1, line=3, at=0.062, font=10)
10.3.4 More on prices
summary(yogurtdata$PRICE)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00300 0.07900 0.08300 0.08495 0.10300 0.12500
range(yogurtdata$PRICE)
[1] 0.003 0.125
which(yogurtdata$PRICE == min(yogurtdata$PRICE))
[1] 1210 1215 1930 1931 2381
which(yogurtdata$PRICE == max(yogurtdata$PRICE))
[1] 71 952 961 1212 1213 1214 1929 2199
library(nnet)
test <- multinom(TYPE ~ FEATURE+PRICE, data = yogurtdata)
# weights: 16 (9 variable)
initial value 3343.741999
iter 10 value 2587.908201
iter 20 value 2364.679552
iter 30 value 2360.691887
final value 2360.191855
converged
summary(test)
Call:
multinom(formula = TYPE ~ FEATURE + PRICE, data = yogurtdata)
Coefficients:
(Intercept) FEATURE PRICE
2 7.458657 -1.8039258 -80.44352
3 6.883787 -1.7072398 -80.32860
4 8.529886 -0.9595805 -139.53475
Std. Errors:
(Intercept) FEATURE PRICE
2 0.3509758 0.2400558 3.836776
3 0.3756930 0.2673651 4.169266
4 0.4773418 0.3787246 6.632942
Residual Deviance: 4720.384
AIC: 4738.384
10.4 Fitting fixed effects multinomial logit model by the poisson log-linear model
# RESHAPE yogurtdata FROM WIDE FORMAT INTO LONG FORMAT
yogurt<-reshape(yogurtdata, varying=list(c("yoplait","dannon","weight","hiland")), v.names=
"choice", idvar="occasion",timevar="brand", direction="long")
yogurt<-yogurt[order(yogurt$occasion),]
yogurt[1:8,]
id fy fd fw fh py pd pw ph occasion TYPE PRICE FEATURE
1.1 1 0 0 0 0 0.108 0.081 0.079 0.061 1 3 0.079 0
1.2 1 0 0 0 0 0.108 0.081 0.079 0.061 1 3 0.079 0
1.3 1 0 0 0 0 0.108 0.081 0.079 0.061 1 3 0.079 0
1.4 1 0 0 0 0 0.108 0.081 0.079 0.061 1 3 0.079 0
2.1 1 0 0 0 0 0.108 0.098 0.075 0.064 2 2 0.098 0
2.2 1 0 0 0 0 0.108 0.098 0.075 0.064 2 2 0.098 0
2.3 1 0 0 0 0 0.108 0.098 0.075 0.064 2 2 0.098 0
2.4 1 0 0 0 0 0.108 0.098 0.075 0.064 2 2 0.098 0
brand choice
1.1 1 0
1.2 2 0
1.3 3 1
1.4 4 0
2.1 1 0
2.2 2 1
2.3 3 0
2.4 4 0
yogurt$brand<-factor(yogurt$brand)
yogurt$occasion<-factor(yogurt$occasion)
# yogurtloglinear<-glm(choice~brand+occasion+FEATURE+PRICE-1, data=yogurt, family=# poisson(link="log"))
# THE ABOVE GLM INCLUDES THE FIXED EFFECTS OF THE 2412 OCCASIONS, WHICH ARE
# NUISANCE PARAMETERS, THE ESTIMATES ARE NOT OBTAINED SIMPLY BECAUSE THE
# LARGE NUMBER.
# GLM USE ITERATIVELY REWEIGHTED LEAST SQUARES TO ESTIMATE, COMPARED WITH
# GENMOD IN SAS # USING MAXIMUMLIKELIHOOD.
# DROP occasion THE GLM IS ESTIMATABLE
model1 <- glm(choice~brand+FEATURE+PRICE-1, data=yogurt, family=poisson(link="log"))
summary(model1)
Call:
glm(formula = choice ~ brand + FEATURE + PRICE - 1, family = poisson(link = "log"),
data = yogurt)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.89683 -0.82357 -0.67716 0.01585 2.26052
Coefficients:
Estimate Std. Error z value Pr(>|z|)
brand1 -1.081e+00 9.183e-02 -11.78 <2e-16 ***
brand2 -9.109e-01 9.079e-02 -10.03 <2e-16 ***
brand3 -1.473e+00 9.497e-02 -15.51 <2e-16 ***
brand4 -3.526e+00 1.459e-01 -24.16 <2e-16 ***
FEATURE 3.206e-16 8.028e-02 0.00 1
PRICE -1.375e-13 9.840e-01 0.00 1
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 14472.0 on 9648 degrees of freedom
Residual deviance: 5665.9 on 9642 degrees of freedom
AIC: 10502
Number of Fisher Scoring iterations: 6
10.5 Fitting multinomial logit model with random intercepts by the possion-log-linear with random intercepts
library(MASS)
# glmmPQL(choice~feature+price+occasion, data=yogurt, family=poisson(link="log"), random=~1|brand)
# THE ABOVE HAS SIMILAR PROBLEM WHEN INCLUDING occasion AS FIXED EFFECTS
# OTHERWISE IT IS ESTIMATABLE IN R; HOWEVER THE RESULT IS QUITE DIFFERENT FROM # THAT OF SAS
glmmPQL(choice~FEATURE+PRICE, data=yogurt, family=poisson(link="log"), random=~1|brand)
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
Linear mixed-effects model fit by maximum likelihood
Data: yogurt
Log-likelihood: NA
Fixed: choice ~ FEATURE + PRICE
(Intercept) FEATURE PRICE
-1.743787e+00 2.005532e-14 2.982266e-13
Random effects:
Formula: ~1 | brand
(Intercept) Residual
StdDev: 1.040625 0.8639861
Variance function:
Structure: fixed weights
Formula: ~invwt
Number of Observations: 9648
Number of Groups: 4