Xianxiong Ma
March 14, 2020
Case study: the data here come from a prostate cancer study. Although the dataset is small, with only 97 observations on 9 variables, comparing regularization against more traditional techniques on it is enough to get a working grasp of the methods. The Stanford University Medical Center supplied prostate-specific antigen (PSA) data on 97 patients, all of whom underwent radical prostatectomy. The goal is to build a model from the clinical measurements that predicts a patient's post-operative PSA level. PSA is likely to be a useful prognostic indicator of how well a patient recovers after surgery: after the operation, PSA is measured at various intervals and fed into various formulas to judge whether the patient has recovered. A pre-operative predictive model, used together with the post-operative data (not provided here), could therefore help improve prostate cancer care. The data, collected from these 97 men, are stored in a data frame with 10 variables, listed below (the penalty forms behind the regularized models used later are recapped right after this list):
lcavol : log of cancer (tumour) volume
lweight : log of prostate weight
age : patient age in years
lbph : log of the amount of benign prostatic hyperplasia (BPH), non-cancerous enlargement of the prostate
svi : seminal vesicle invasion, an indicator of whether cancer cells have penetrated the prostate wall into the seminal vesicles (1 = yes, 0 = no)
lcp : log of capsular penetration, the extent to which cancer cells have spread beyond the prostate capsule
gleason : the patient's Gleason score, assigned by a pathologist after a biopsy (2 to 10); it grades how abnormal the cancer cells are, with higher scores indicating more dangerous disease
pgg45 : percentage of Gleason patterns 4 or 5
lpsa : log of the PSA value; this is the response variable
train : a logical vector (TRUE or FALSE) used to split the data into training and test sets
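Before loading the data, a brief recap of the methods being compared (this summary is standard material and not part of the original case description). glmnet fits the whole family through a single elastic-net penalty; up to glmnet's internal scaling of the squared-error loss, the objective is

$$
\hat{\beta} = \underset{\beta_0,\,\beta}{\arg\min}\;
\sum_{i=1}^{n}\Big(y_i - \beta_0 - x_i^{\top}\beta\Big)^2
+ \lambda\left[\frac{1-\alpha}{2}\sum_{j=1}^{p}\beta_j^{2}
+ \alpha\sum_{j=1}^{p}\lvert\beta_j\rvert\right]
$$

Setting alpha = 0 gives ridge regression (a pure L2 penalty: coefficients shrink but never reach exactly zero), alpha = 1 gives the LASSO (a pure L1 penalty: coefficients can be zeroed out, so the method also performs feature selection), and values in between give the elastic net. These correspond to the `alpha` and `lambda` arguments tuned in the code below.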
library(ElemStatLearn) #contains the data
library(car) #package to calculate Variance Inflation Factor
library(corrplot) #correlation plots
library(leaps) #best subsets regression
library(glmnet) #allows ridge regression, LASSO and elastic net
library(caret) #this will help identify the appropriate parameters
data(prostate)
str(prostate)
## 'data.frame': 97 obs. of 10 variables:
## $ lcavol : num -0.58 -0.994 -0.511 -1.204 0.751 ...
## $ lweight: num 2.77 3.32 2.69 3.28 3.43 ...
## $ age : int 50 58 74 58 62 50 64 58 47 63 ...
## $ lbph : num -1.39 -1.39 -1.39 -1.39 -1.39 ...
## $ svi : int 0 0 0 0 0 0 0 0 0 0 ...
## $ lcp : num -1.39 -1.39 -1.39 -1.39 -1.39 ...
## $ gleason: int 6 6 7 6 6 6 6 6 6 6 ...
## $ pgg45 : int 0 0 20 0 0 0 0 0 0 0 ...
## $ lpsa : num -0.431 -0.163 -0.163 -0.163 0.372 ...
## $ train : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
### Explore the data and split it into training and test sets
plot(prostate)
plot(prostate$gleason, ylab = "Gleason Score")
table(prostate$gleason)
##
## 6 7 8 9
## 35 56 1 5
boxplot(prostate$lpsa ~ prostate$gleason, xlab = "Gleason Score",
ylab = "Log of PSA")
prostate$gleason <- ifelse(prostate$gleason == 6, 0, 1)
table(prostate$gleason)
##
## 0 1
## 35 62
p.cor = cor(prostate)
corrplot.mixed(p.cor)
train <- subset(prostate, train == TRUE)[, 1:9]
str(train)
## 'data.frame': 67 obs. of 9 variables:
## $ lcavol : num -0.58 -0.994 -0.511 -1.204 0.751 ...
## $ lweight: num 2.77 3.32 2.69 3.28 3.43 ...
## $ age : int 50 58 74 58 62 50 58 65 63 63 ...
## $ lbph : num -1.39 -1.39 -1.39 -1.39 -1.39 ...
## $ svi : int 0 0 0 0 0 0 0 0 0 0 ...
## $ lcp : num -1.39 -1.39 -1.39 -1.39 -1.39 ...
## $ gleason: num 0 0 1 0 0 0 0 0 0 1 ...
## $ pgg45 : int 0 0 20 0 0 0 0 0 0 30 ...
## $ lpsa : num -0.431 -0.163 -0.163 -0.163 0.372 ...
test <- subset(prostate, train == FALSE)[, 1:9]
str(test)
## 'data.frame': 30 obs. of 9 variables:
## $ lcavol : num 0.737 -0.777 0.223 1.206 2.059 ...
## $ lweight: num 3.47 3.54 3.24 3.44 3.5 ...
## $ age : int 64 47 63 57 60 69 68 67 65 54 ...
## $ lbph : num 0.615 -1.386 -1.386 -1.386 1.475 ...
## $ svi : int 0 0 0 0 0 0 0 0 0 0 ...
## $ lcp : num -1.386 -1.386 -1.386 -0.431 1.348 ...
## $ gleason: num 0 0 0 1 1 0 0 1 0 0 ...
## $ pgg45 : int 0 0 0 5 20 0 0 20 0 0 ...
## $ lpsa : num 0.765 1.047 1.047 1.399 1.658 ...
subfit <- regsubsets(lpsa ~ ., data = train)
b.sum <- summary(subfit)
which.min(b.sum$bic)
## [1] 3
plot(b.sum$bic, type = "l", xlab = "# of Features", ylab = "BIC",
main = "BIC score by Feature Inclusion")
plot(subfit, scale = "bic", main = "Best Subset Features")
ols <- lm(lpsa ~ lcavol + lweight + gleason, data = train)
plot(ols$fitted.values, train$lpsa, xlab = "Predicted", ylab = "Actual",
main = "Predicted vs Actual")
pred.subfit = predict(ols, newdata = test)
plot(pred.subfit, test$lpsa, xlab = "Predicted", ylab = "Actual",
main = "Predicted vs Actual")
resid.subfit = test$lpsa - pred.subfit
mean(resid.subfit^2)
## [1] 0.5084126
x <- as.matrix(train[, 1:8])
y <- train[, 9]
ridge <- glmnet(x, y, family = "gaussian", alpha = 0)
print(ridge)
##
## Call: glmnet(x = x, y = y, family = "gaussian", alpha = 0)
##
## Df %Dev Lambda
## 1 8 0.00000 878.90
## 2 8 0.00559 800.80
## 3 8 0.00613 729.70
## 4 8 0.00672 664.80
## 5 8 0.00737 605.80
## 6 8 0.00809 552.00
## 7 8 0.00886 502.90
## 8 8 0.00972 458.20
## 9 8 0.01065 417.50
## 10 8 0.01168 380.40
## 11 8 0.01279 346.60
## 12 8 0.01402 315.90
## 13 8 0.01536 287.80
## 14 8 0.01682 262.20
## 15 8 0.01842 238.90
## 16 8 0.02017 217.70
## 17 8 0.02208 198.40
## 18 8 0.02417 180.70
## 19 8 0.02644 164.70
## 20 8 0.02892 150.10
## 21 8 0.03163 136.70
## 22 8 0.03457 124.60
## 23 8 0.03777 113.50
## 24 8 0.04126 103.40
## 25 8 0.04504 94.24
## 26 8 0.04915 85.87
## 27 8 0.05360 78.24
## 28 8 0.05842 71.29
## 29 8 0.06364 64.96
## 30 8 0.06928 59.19
## 31 8 0.07536 53.93
## 32 8 0.08191 49.14
## 33 8 0.08896 44.77
## 34 8 0.09652 40.79
## 35 8 0.10460 37.17
## 36 8 0.11330 33.87
## 37 8 0.12250 30.86
## 38 8 0.13240 28.12
## 39 8 0.14280 25.62
## 40 8 0.15390 23.34
## 41 8 0.16550 21.27
## 42 8 0.17780 19.38
## 43 8 0.19070 17.66
## 44 8 0.20410 16.09
## 45 8 0.21810 14.66
## 46 8 0.23270 13.36
## 47 8 0.24770 12.17
## 48 8 0.26310 11.09
## 49 8 0.27900 10.10
## 50 8 0.29510 9.21
## 51 8 0.31150 8.39
## 52 8 0.32810 7.64
## 53 8 0.34470 6.96
## 54 8 0.36140 6.35
## 55 8 0.37800 5.78
## 56 8 0.39450 5.27
## 57 8 0.41080 4.80
## 58 8 0.42680 4.37
## 59 8 0.44240 3.99
## 60 8 0.45760 3.63
## 61 8 0.47240 3.31
## 62 8 0.48660 3.02
## 63 8 0.50030 2.75
## 64 8 0.51340 2.50
## 65 8 0.52600 2.28
## 66 8 0.53800 2.08
## 67 8 0.54930 1.89
## 68 8 0.56010 1.73
## 69 8 0.57030 1.57
## 70 8 0.58000 1.43
## 71 8 0.58910 1.30
## 72 8 0.59760 1.19
## 73 8 0.60570 1.08
## 74 8 0.61330 0.99
## 75 8 0.62040 0.90
## 76 8 0.62700 0.82
## 77 8 0.63330 0.75
## 78 8 0.63910 0.68
## 79 8 0.64450 0.62
## 80 8 0.64960 0.56
## 81 8 0.65430 0.51
## 82 8 0.65870 0.47
## 83 8 0.66280 0.43
## 84 8 0.66660 0.39
## 85 8 0.67010 0.35
## 86 8 0.67330 0.32
## 87 8 0.67630 0.29
## 88 8 0.67900 0.27
## 89 8 0.68150 0.24
## 90 8 0.68380 0.22
## 91 8 0.68590 0.20
## 92 8 0.68770 0.18
## 93 8 0.68940 0.17
## 94 8 0.69090 0.15
## 95 8 0.69230 0.14
## 96 8 0.69350 0.13
## 97 8 0.69460 0.12
## 98 8 0.69550 0.11
## 99 8 0.69640 0.10
## 100 8 0.69710 0.09
plot(ridge, label = TRUE)
plot(ridge, xvar = "lambda", label = TRUE)
ridge.coef <- predict(ridge, s = 0.1, type = "coefficients")
ridge.coef
## 9 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 0.130475478
## lcavol 0.457279371
## lweight 0.645792042
## age -0.017356156
## lbph 0.122497573
## svi 0.636779442
## lcp -0.104712451
## gleason 0.346022979
## pgg45 0.004287179
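One caveat: s = 0.1 generally falls between the lambda values actually on the fitted path (the printed values above are rounded), so predict() returns coefficients linearly interpolated between the two nearest path points. If exact coefficients at that lambda are wanted, recent glmnet versions support refitting via exact = TRUE, which requires passing the original data again (an optional sketch, output not shown):
ridge.coef.exact <- coef(ridge, s = 0.1, exact = TRUE, x = x, y = y)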
plot(ridge, xvar = "dev", label = TRUE)
newx <- as.matrix(test[, 1:8])
ridge.y = predict(ridge, newx = newx, type = "response", s = 0.1)
plot(ridge.y, test$lpsa, xlab = "Predicted", ylab = "Actual", main = "Ridge Regression")
ridge.resid <- ridge.y - test$lpsa
mean(ridge.resid^2)
## [1] 0.4783559
lasso <- glmnet(x, y, family = "gaussian", alpha = 1)
print(lasso)
##
## Call: glmnet(x = x, y = y, family = "gaussian", alpha = 1)
##
## Df %Dev Lambda
## 1 0 0.00000 0.87890
## 2 1 0.09126 0.80080
## 3 1 0.16700 0.72970
## 4 1 0.22990 0.66480
## 5 1 0.28220 0.60580
## 6 1 0.32550 0.55200
## 7 1 0.36150 0.50290
## 8 1 0.39140 0.45820
## 9 2 0.42810 0.41750
## 10 2 0.45980 0.38040
## 11 3 0.48770 0.34660
## 12 3 0.51310 0.31590
## 13 4 0.53490 0.28780
## 14 4 0.55570 0.26220
## 15 4 0.57300 0.23890
## 16 4 0.58740 0.21770
## 17 4 0.59930 0.19840
## 18 5 0.61170 0.18070
## 19 5 0.62200 0.16470
## 20 5 0.63050 0.15010
## 21 5 0.63760 0.13670
## 22 5 0.64350 0.12460
## 23 5 0.64840 0.11350
## 24 5 0.65240 0.10340
## 25 6 0.65580 0.09424
## 26 6 0.65870 0.08587
## 27 6 0.66110 0.07824
## 28 6 0.66310 0.07129
## 29 7 0.66630 0.06496
## 30 7 0.66960 0.05919
## 31 7 0.67240 0.05393
## 32 7 0.67460 0.04914
## 33 7 0.67650 0.04477
## 34 8 0.67970 0.04079
## 35 8 0.68340 0.03717
## 36 8 0.68660 0.03387
## 37 8 0.68920 0.03086
## 38 8 0.69130 0.02812
## 39 8 0.69310 0.02562
## 40 8 0.69460 0.02334
## 41 8 0.69580 0.02127
## 42 8 0.69680 0.01938
## 43 8 0.69770 0.01766
## 44 8 0.69840 0.01609
## 45 8 0.69900 0.01466
## 46 8 0.69950 0.01336
## 47 8 0.69990 0.01217
## 48 8 0.70020 0.01109
## 49 8 0.70050 0.01010
## 50 8 0.70070 0.00921
## 51 8 0.70090 0.00839
## 52 8 0.70110 0.00764
## 53 8 0.70120 0.00696
## 54 8 0.70130 0.00635
## 55 8 0.70140 0.00578
## 56 8 0.70150 0.00527
## 57 8 0.70150 0.00480
## 58 8 0.70160 0.00437
## 59 8 0.70160 0.00399
## 60 8 0.70170 0.00363
## 61 8 0.70170 0.00331
## 62 8 0.70170 0.00302
## 63 8 0.70170 0.00275
## 64 8 0.70180 0.00250
## 65 8 0.70180 0.00228
## 66 8 0.70180 0.00208
## 67 8 0.70180 0.00189
## 68 8 0.70180 0.00172
## 69 8 0.70180 0.00157
plot(lasso, xvar = "lambda", label = TRUE)
lasso.coef <- predict(lasso, s = 0.045, type = "coefficients")
lasso.coef
## 9 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -0.1305900670
## lcavol 0.4479592050
## lweight 0.5910476764
## age -0.0073162861
## lbph 0.0974103575
## svi 0.4746790830
## lcp .
## gleason 0.2968768129
## pgg45 0.0009788059
lasso.y <- predict(lasso, newx = newx, type = "response", s = 0.045)
plot(lasso.y, test$lpsa, xlab = "Predicted", ylab = "Actual", main = "LASSO")
lasso.resid <- lasso.y - test$lpsa
mean(lasso.resid^2)
## [1] 0.4437209
grid <- expand.grid(.alpha = seq(0, 1, by = 0.2), .lambda = seq(0,
0.2, by = 0.02))
table(grid)
## .lambda
## .alpha 0 0.02 0.04 0.06 0.08 0.1 0.12 0.14 0.16 0.18 0.2
## 0 1 1 1 1 1 1 1 1 1 1 1
## 0.2 1 1 1 1 1 1 1 1 1 1 1
## 0.4 1 1 1 1 1 1 1 1 1 1 1
## 0.6 1 1 1 1 1 1 1 1 1 1 1
## 0.8 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
head(grid)
## .alpha .lambda
## 1 0.0 0
## 2 0.2 0
## 3 0.4 0
## 4 0.6 0
## 5 0.8 0
## 6 1.0 0
control <- trainControl(method = "LOOCV") #selectionFunction='best'
set.seed(701) #our random seed
enet.train = train(lpsa ~ ., data = train, method = "glmnet", trControl = control,
tuneGrid = grid)
enet.train
## glmnet
##
## 67 samples
## 8 predictor
##
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation
## Summary of sample sizes: 66, 66, 66, 66, 66, 66, ...
## Resampling results across tuning parameters:
##
## alpha lambda RMSE Rsquared MAE
## 0.0 0.00 0.7498311 0.6091434 0.5684548
## 0.0 0.02 0.7498311 0.6091434 0.5684548
## 0.0 0.04 0.7498311 0.6091434 0.5684548
## 0.0 0.06 0.7498311 0.6091434 0.5684548
## 0.0 0.08 0.7498311 0.6091434 0.5684548
## 0.0 0.10 0.7508038 0.6079576 0.5691343
## 0.0 0.12 0.7512303 0.6073484 0.5692774
## 0.0 0.14 0.7518536 0.6066518 0.5694451
## 0.0 0.16 0.7526409 0.6058786 0.5696155
## 0.0 0.18 0.7535502 0.6050559 0.5699154
## 0.0 0.20 0.7545583 0.6041945 0.5709377
## 0.2 0.00 0.7550170 0.6073913 0.5746434
## 0.2 0.02 0.7542648 0.6065770 0.5713633
## 0.2 0.04 0.7550427 0.6045111 0.5738111
## 0.2 0.06 0.7571893 0.6015388 0.5764117
## 0.2 0.08 0.7603394 0.5978544 0.5793641
## 0.2 0.10 0.7642219 0.5936131 0.5846567
## 0.2 0.12 0.7684937 0.5890900 0.5898445
## 0.2 0.14 0.7713716 0.5861699 0.5939008
## 0.2 0.16 0.7714681 0.5864527 0.5939535
## 0.2 0.18 0.7726979 0.5856865 0.5948039
## 0.2 0.20 0.7744041 0.5845281 0.5959671
## 0.4 0.00 0.7551309 0.6072931 0.5746628
## 0.4 0.02 0.7559055 0.6044693 0.5737468
## 0.4 0.04 0.7599608 0.5989266 0.5794120
## 0.4 0.06 0.7667450 0.5911506 0.5875213
## 0.4 0.08 0.7746900 0.5824231 0.5979116
## 0.4 0.10 0.7760605 0.5809784 0.6001763
## 0.4 0.12 0.7784160 0.5788024 0.6024102
## 0.4 0.14 0.7792216 0.5786250 0.6035698
## 0.4 0.16 0.7784433 0.5806388 0.6024696
## 0.4 0.18 0.7779134 0.5828322 0.6014095
## 0.4 0.20 0.7797721 0.5826306 0.6020934
## 0.6 0.00 0.7553016 0.6071317 0.5748038
## 0.6 0.02 0.7579330 0.6020021 0.5766881
## 0.6 0.04 0.7665234 0.5917067 0.5863966
## 0.6 0.06 0.7778600 0.5790948 0.6021629
## 0.6 0.08 0.7801170 0.5765439 0.6049216
## 0.6 0.10 0.7819242 0.5750003 0.6073379
## 0.6 0.12 0.7800315 0.5782365 0.6053700
## 0.6 0.14 0.7813077 0.5785548 0.6059119
## 0.6 0.16 0.7846753 0.5770307 0.6075899
## 0.6 0.18 0.7886388 0.5753101 0.6104210
## 0.6 0.20 0.7931549 0.5734018 0.6140249
## 0.8 0.00 0.7553734 0.6070686 0.5748255
## 0.8 0.02 0.7603679 0.5991567 0.5797001
## 0.8 0.04 0.7747753 0.5827275 0.5975303
## 0.8 0.06 0.7812784 0.5752601 0.6065340
## 0.8 0.08 0.7832103 0.5734172 0.6092662
## 0.8 0.10 0.7811248 0.5769787 0.6071341
## 0.8 0.12 0.7847355 0.5750115 0.6093756
## 0.8 0.14 0.7894184 0.5725728 0.6122536
## 0.8 0.16 0.7951091 0.5696205 0.6158407
## 0.8 0.18 0.8018475 0.5659672 0.6205741
## 0.8 0.20 0.8090352 0.5622252 0.6256726
## 1.0 0.00 0.7554439 0.6070354 0.5749409
## 1.0 0.02 0.7632577 0.5958706 0.5827830
## 1.0 0.04 0.7813519 0.5754986 0.6065914
## 1.0 0.06 0.7843882 0.5718847 0.6103528
## 1.0 0.08 0.7819175 0.5755415 0.6082954
## 1.0 0.10 0.7860004 0.5731009 0.6107923
## 1.0 0.12 0.7921572 0.5692525 0.6146159
## 1.0 0.14 0.7999326 0.5642789 0.6198758
## 1.0 0.16 0.8089248 0.5583637 0.6265797
## 1.0 0.18 0.8185327 0.5521348 0.6343174
## 1.0 0.20 0.8259445 0.5494268 0.6411104
##
## RMSE was used to select the optimal model using the
## smallest value.
## The final values used for the model were alpha = 0 and lambda
## = 0.08.
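Rather than copying alpha = 0 and lambda = 0.08 by hand from the table above, the winning combination can be pulled directly from the caret object; `bestTune` is a standard component of a `train` fit (a small optional sketch):
enet.train$bestTune
best.alpha <- enet.train$bestTune$alpha
best.lambda <- enet.train$bestTune$lambda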
enet <- glmnet(x, y, family = "gaussian", alpha = 0, lambda = 0.08)
enet.coef <- coef(enet, s = 0.08, exact = TRUE)
enet.coef
## 9 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 0.137811097
## lcavol 0.470960525
## lweight 0.652088157
## age -0.018257308
## lbph 0.123608113
## svi 0.648209192
## lcp -0.118214386
## gleason 0.345480799
## pgg45 0.004478267
enet.y <- predict(enet, newx = newx, type = "response", s = 0.08)
plot(enet.y, test$lpsa, xlab = "Predicted", ylab = "Actual", main = "Elastic Net")
enet.resid <- enet.y - test$lpsa
mean(enet.resid^2)
## [1] 0.4795019
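At this point the four models fitted so far can be compared on the same 30-observation test set. The following sketch simply collects the MSE values already computed above into one small table:
data.frame(model = c("best subsets OLS", "ridge", "LASSO", "elastic net"),
           test.MSE = c(mean(resid.subfit^2), mean(ridge.resid^2),
                        mean(lasso.resid^2), mean(enet.resid^2)))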
set.seed(317)
lasso.cv = cv.glmnet(x, y, nfolds = 3)
plot(lasso.cv)
lasso.cv$lambda.min #minimum
## [1] 0.007644054
lasso.cv$lambda.1se #one standard error away
## [1] 0.3158532
coef(lasso.cv, s = "lambda.1se")
## 9 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 1.0746246
## lcavol 0.4176727
## lweight 0.2246438
## age .
## lbph .
## svi 0.0648923
## lcp .
## gleason .
## pgg45 .
lasso.y.cv = predict(lasso.cv, newx = newx, type = "response", s = "lambda.1se")
lasso.cv.resid = lasso.y.cv - test$lpsa
mean(lasso.cv.resid^2)
## [1] 0.5508264
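The 1-SE rule trades a little test-set accuracy for a much sparser model; for comparison, predictions at lambda.min can be scored the same way (an optional sketch, output not shown in the original run):
lasso.y.min <- predict(lasso.cv, newx = newx, type = "response", s = "lambda.min")
mean((lasso.y.min - test$lpsa)^2)
The same machinery carries over to classification. The remaining code switches to the biopsy data from MASS (breast-tumour biopsies labelled benign or malignant) and fits a cross-validated logistic LASSO.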
library(MASS)
biopsy$ID = NULL
names(biopsy) = c("thick", "u.size", "u.shape", "adhsn", "s.size",
"nucl", "chrom", "n.nuc", "mit", "class")
biopsy.v2 <- na.omit(biopsy)
set.seed(123) #random number generator
ind <- sample(2, nrow(biopsy.v2), replace = TRUE, prob = c(0.7, 0.3))
train <- biopsy.v2[ind == 1, ] #the training data set
test <- biopsy.v2[ind == 2, ] #the test data set
x <- as.matrix(train[, 1:9])
y <- train[, 10]
set.seed(3)
fitCV <- cv.glmnet(x, y, family = "binomial", type.measure = "auc",
nfolds = 5)
plot(fitCV)
fitCV$lambda.1se
## [1] 0.1710154
coef(fitCV, s = "lambda.1se")
## 10 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -2.011804541
## thick 0.033203900
## u.size 0.107893817
## u.shape 0.086171949
## adhsn .
## s.size .
## nucl 0.150382121
## chrom .
## n.nuc 0.004832709
## mit .
library(InformationValue)
predCV <- predict(fitCV, newx = as.matrix(test[, 1:9]), s = "lambda.1se",
type = "response")
actuals <- ifelse(test$class == "malignant", 1, 0)
misClassError(actuals, predCV)
## [1] 0.0574
plotROC(actuals, predCV)
predCV.min <- predict(fitCV, newx = as.matrix(test[, 1:9]), s = "lambda.min",
type = "response")
misClassError(actuals, predCV.min)
## [1] 0.0239
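Since cv.glmnet was tuned on AUC, it is natural to also report the test-set AUC for the two lambda choices. Assuming the InformationValue package exposes AUROC() (it is used here alongside plotROC above), a minimal sketch is:
AUROC(actuals, predCV)
AUROC(actuals, predCV.min)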