Advanced Feature Selection Techniques in Linear Models

Xianxiong Ma

March 14, 2020

Case study: the data come from a prostate cancer study. Although the dataset is small, with only 97 observations on 10 variables, comparing regularization against more traditional techniques on it is enough for us to get a solid grasp of these methods. Stanford University Medical Center provided prostate-specific antigen (PSA) data for 97 patients, all of whom underwent radical prostatectomy. Our goal is to build a model from the clinical measurements that predicts a patient's postoperative PSA level. PSA may be a fairly effective prognostic indicator of how well a patient can recover after surgery: following the operation, physicians measure the patient's PSA at various intervals and apply formulas to determine whether the patient has recovered. A preoperative predictive model working in concert with the postoperative data (not provided here) could potentially improve the diagnosis and treatment of prostate cancer. The data collected from the 97 men are stored in a data frame with 10 variables, as follows:

lcavol : log of cancer volume

lweight : log of prostate weight

age : patient age in years

lbph : log of the amount of benign prostatic hyperplasia (BPH), non-cancerous enlargement of the prostate

svi : seminal vesicle invasion, an indicator variable for whether the cancer has penetrated the prostatic wall and invaded the seminal vesicles (1 = yes, 0 = no)

lcp : log of capsular penetration, the degree to which the cancer has spread beyond the prostatic capsule

gleason : the patient's Gleason score (2 to 10), assigned by a pathologist after biopsy and indicating how abnormal the cancer cells are; the higher the score, the more dangerous the cancer

pgg45 : percentage of the tumor with Gleason pattern 4 or 5

lpsa : log of the PSA value; the response variable

train : a logical vector (TRUE or FALSE) marking the training/test split

library(ElemStatLearn)  #contains the data
library(car)  #package to calculate Variance Inflation Factor
library(corrplot)  #correlation plots
library(leaps)  #best subsets regression
library(glmnet)  #allows ridge regression, LASSO and elastic net
library(caret)  #this will help identify the appropriate parameters
data(prostate)
str(prostate)
## 'data.frame':    97 obs. of  10 variables:
##  $ lcavol : num  -0.58 -0.994 -0.511 -1.204 0.751 ...
##  $ lweight: num  2.77 3.32 2.69 3.28 3.43 ...
##  $ age    : int  50 58 74 58 62 50 64 58 47 63 ...
##  $ lbph   : num  -1.39 -1.39 -1.39 -1.39 -1.39 ...
##  $ svi    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lcp    : num  -1.39 -1.39 -1.39 -1.39 -1.39 ...
##  $ gleason: int  6 6 7 6 6 6 6 6 6 6 ...
##  $ pgg45  : int  0 0 20 0 0 0 0 0 0 0 ...
##  $ lpsa   : num  -0.431 -0.163 -0.163 -0.163 0.372 ...
##  $ train  : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
### Explore the data, then split it into training and test sets
plot(prostate)

[Figure: scatterplot matrix of the prostate data]

plot(prostate$gleason, ylab = "Gleason Score")

[Figure: index plot of Gleason Score]

table(prostate$gleason)
## 
##  6  7  8  9 
## 35 56  1  5
boxplot(prostate$lpsa ~ prostate$gleason, xlab = "Gleason Score", 
    ylab = "Log of PSA")

[Figure: boxplot of log PSA by Gleason Score]
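
Together with the table, the boxplot shows that Gleason scores of 8 and 9 are nearly empty categories (one and five observations, respectively), so we collapse the score into an indicator: 0 for a score of 6, and 1 for 7 or above.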

prostate$gleason <- ifelse(prostate$gleason == 6, 0, 1)
table(prostate$gleason)
## 
##  0  1 
## 35 62
p.cor = cor(prostate)
corrplot.mixed(p.cor)

[Figure: mixed corrplot of the prostate correlation matrix]

train <- subset(prostate, train == TRUE)[, 1:9]
str(train)
## 'data.frame':    67 obs. of  9 variables:
##  $ lcavol : num  -0.58 -0.994 -0.511 -1.204 0.751 ...
##  $ lweight: num  2.77 3.32 2.69 3.28 3.43 ...
##  $ age    : int  50 58 74 58 62 50 58 65 63 63 ...
##  $ lbph   : num  -1.39 -1.39 -1.39 -1.39 -1.39 ...
##  $ svi    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lcp    : num  -1.39 -1.39 -1.39 -1.39 -1.39 ...
##  $ gleason: num  0 0 1 0 0 0 0 0 0 1 ...
##  $ pgg45  : int  0 0 20 0 0 0 0 0 0 30 ...
##  $ lpsa   : num  -0.431 -0.163 -0.163 -0.163 0.372 ...
test <- subset(prostate, train == FALSE)[, 1:9]
str(test)
## 'data.frame':    30 obs. of  9 variables:
##  $ lcavol : num  0.737 -0.777 0.223 1.206 2.059 ...
##  $ lweight: num  3.47 3.54 3.24 3.44 3.5 ...
##  $ age    : int  64 47 63 57 60 69 68 67 65 54 ...
##  $ lbph   : num  0.615 -1.386 -1.386 -1.386 1.475 ...
##  $ svi    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lcp    : num  -1.386 -1.386 -1.386 -0.431 1.348 ...
##  $ gleason: num  0 0 0 1 1 0 0 1 0 0 ...
##  $ pgg45  : int  0 0 0 5 20 0 0 20 0 0 ...
##  $ lpsa   : num  0.765 1.047 1.047 1.399 1.658 ...

Best Subsets

We begin with best subsets regression: regsubsets() from the leaps package exhaustively searches the candidate models at each size, and we use the BIC to choose among them.

subfit <- regsubsets(lpsa ~ ., data = train)
b.sum <- summary(subfit)
which.min(b.sum$bic)
## [1] 3
plot(b.sum$bic, type = "l", xlab = "# of Features", ylab = "BIC", 
    main = "BIC score by Feature Inclusion")

[Figure: BIC versus number of features]

plot(subfit, scale = "bic", main = "Best Subset Features")

[Figure: best subset features selected by BIC]

ols <- lm(lpsa ~ lcavol + lweight + gleason, data = train)
plot(ols$fitted.values, train$lpsa, xlab = "Predicted", ylab = "Actual", 
    main = "Predicted vs Actual")

[Figure: predicted versus actual lpsa on the training set]

pred.subfit = predict(ols, newdata = test)
plot(pred.subfit, test$lpsa, xlab = "Predicted", ylab = "Actual", 
    main = "Predicted vs Actual")

[Figure: predicted versus actual lpsa on the test set]

resid.subfit = test$lpsa - pred.subfit
mean(resid.subfit^2)
## [1] 0.5084126
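
This 0.508 test MSE is the benchmark the regularized models below will try to beat. Since we will compute the same quantity for every model, here is a small helper (my own addition, not part of the original script) that could replace the repeated mean() calls:

calc.mse <- function(pred, actual) {
    # mean squared prediction error on held-out data
    mean((actual - pred)^2)
}
calc.mse(pred.subfit, test$lpsa)  # 0.5084126, matching the value above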

Ridge Regression

Ridge regression shrinks the coefficients by adding an L2 penalty, lambda * sum(beta_j^2), to the residual sum of squares; it never drops a predictor outright. Note that glmnet requires a numeric input matrix rather than a data frame, and alpha = 0 specifies the ridge penalty.

x <- as.matrix(train[, 1:8])
y <- train[, 9]
ridge <- glmnet(x, y, family = "gaussian", alpha = 0)
print(ridge)
## 
## Call:  glmnet(x = x, y = y, family = "gaussian", alpha = 0) 
## 
##     Df    %Dev Lambda
## 1    8 0.00000 878.90
## 2    8 0.00559 800.80
## 3    8 0.00613 729.70
## 4    8 0.00672 664.80
## 5    8 0.00737 605.80
## 6    8 0.00809 552.00
## 7    8 0.00886 502.90
## 8    8 0.00972 458.20
## 9    8 0.01065 417.50
## 10   8 0.01168 380.40
## 11   8 0.01279 346.60
## 12   8 0.01402 315.90
## 13   8 0.01536 287.80
## 14   8 0.01682 262.20
## 15   8 0.01842 238.90
## 16   8 0.02017 217.70
## 17   8 0.02208 198.40
## 18   8 0.02417 180.70
## 19   8 0.02644 164.70
## 20   8 0.02892 150.10
## 21   8 0.03163 136.70
## 22   8 0.03457 124.60
## 23   8 0.03777 113.50
## 24   8 0.04126 103.40
## 25   8 0.04504  94.24
## 26   8 0.04915  85.87
## 27   8 0.05360  78.24
## 28   8 0.05842  71.29
## 29   8 0.06364  64.96
## 30   8 0.06928  59.19
## 31   8 0.07536  53.93
## 32   8 0.08191  49.14
## 33   8 0.08896  44.77
## 34   8 0.09652  40.79
## 35   8 0.10460  37.17
## 36   8 0.11330  33.87
## 37   8 0.12250  30.86
## 38   8 0.13240  28.12
## 39   8 0.14280  25.62
## 40   8 0.15390  23.34
## 41   8 0.16550  21.27
## 42   8 0.17780  19.38
## 43   8 0.19070  17.66
## 44   8 0.20410  16.09
## 45   8 0.21810  14.66
## 46   8 0.23270  13.36
## 47   8 0.24770  12.17
## 48   8 0.26310  11.09
## 49   8 0.27900  10.10
## 50   8 0.29510   9.21
## 51   8 0.31150   8.39
## 52   8 0.32810   7.64
## 53   8 0.34470   6.96
## 54   8 0.36140   6.35
## 55   8 0.37800   5.78
## 56   8 0.39450   5.27
## 57   8 0.41080   4.80
## 58   8 0.42680   4.37
## 59   8 0.44240   3.99
## 60   8 0.45760   3.63
## 61   8 0.47240   3.31
## 62   8 0.48660   3.02
## 63   8 0.50030   2.75
## 64   8 0.51340   2.50
## 65   8 0.52600   2.28
## 66   8 0.53800   2.08
## 67   8 0.54930   1.89
## 68   8 0.56010   1.73
## 69   8 0.57030   1.57
## 70   8 0.58000   1.43
## 71   8 0.58910   1.30
## 72   8 0.59760   1.19
## 73   8 0.60570   1.08
## 74   8 0.61330   0.99
## 75   8 0.62040   0.90
## 76   8 0.62700   0.82
## 77   8 0.63330   0.75
## 78   8 0.63910   0.68
## 79   8 0.64450   0.62
## 80   8 0.64960   0.56
## 81   8 0.65430   0.51
## 82   8 0.65870   0.47
## 83   8 0.66280   0.43
## 84   8 0.66660   0.39
## 85   8 0.67010   0.35
## 86   8 0.67330   0.32
## 87   8 0.67630   0.29
## 88   8 0.67900   0.27
## 89   8 0.68150   0.24
## 90   8 0.68380   0.22
## 91   8 0.68590   0.20
## 92   8 0.68770   0.18
## 93   8 0.68940   0.17
## 94   8 0.69090   0.15
## 95   8 0.69230   0.14
## 96   8 0.69350   0.13
## 97   8 0.69460   0.12
## 98   8 0.69550   0.11
## 99   8 0.69640   0.10
## 100  8 0.69710   0.09
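
Look at the Df column: because ridge shrinks coefficients without ever zeroing them out, all 8 predictors remain in the model across the entire path, while the fraction of deviance explained rises toward roughly 0.70 as lambda decreases.
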
plot(ridge, label = TRUE)

[Figure: ridge coefficient paths versus L1 norm]

plot(ridge, xvar = "lambda", label = TRUE)

[Figure: ridge coefficient paths versus log lambda]

ridge.coef <- predict(ridge, s = 0.1, type = "coefficients")
ridge.coef
## 9 x 1 sparse Matrix of class "dgCMatrix"
##                        1
## (Intercept)  0.130475478
## lcavol       0.457279371
## lweight      0.645792042
## age         -0.017356156
## lbph         0.122497573
## svi          0.636779442
## lcp         -0.104712451
## gleason      0.346022979
## pgg45        0.004287179
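
Because s = 0.1 is not one of the lambda values on the fitted path, predict() linearly interpolates between the nearest path points to produce these coefficients. To refit exactly at that lambda instead, recent glmnet versions require the training data to be passed back in; a version-dependent sketch, assuming glmnet >= 2.0:

ridge.coef.exact <- predict(ridge, s = 0.1, type = "coefficients",
    exact = TRUE, x = x, y = y)  # refit at s = 0.1 rather than interpolate
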
plot(ridge, xvar = "dev", label = TRUE)

[Figure: ridge coefficient paths versus fraction of deviance explained]

newx <- as.matrix(test[, 1:8])
ridge.y = predict(ridge, newx = newx, type = "response", s = 0.1)
plot(ridge.y, test$lpsa, xlab = "Predicted", ylab = "Actual", main = "Ridge Regression")

[Figure: ridge predictions versus actual lpsa on the test set]

ridge.resid <- ridge.y - test$lpsa
mean(ridge.resid^2)
## [1] 0.4783559
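
The test MSE of 0.478 is a modest improvement over the 0.508 from best subsets.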

LASSO Regression

The LASSO swaps the L2 penalty for an L1 penalty, lambda * sum(|beta_j|), which can shrink coefficients exactly to zero and so performs feature selection; in glmnet this corresponds to alpha = 1.

lasso <- glmnet(x, y, family = "gaussian", alpha = 1)
print(lasso)
## 
## Call:  glmnet(x = x, y = y, family = "gaussian", alpha = 1) 
## 
##    Df    %Dev  Lambda
## 1   0 0.00000 0.87890
## 2   1 0.09126 0.80080
## 3   1 0.16700 0.72970
## 4   1 0.22990 0.66480
## 5   1 0.28220 0.60580
## 6   1 0.32550 0.55200
## 7   1 0.36150 0.50290
## 8   1 0.39140 0.45820
## 9   2 0.42810 0.41750
## 10  2 0.45980 0.38040
## 11  3 0.48770 0.34660
## 12  3 0.51310 0.31590
## 13  4 0.53490 0.28780
## 14  4 0.55570 0.26220
## 15  4 0.57300 0.23890
## 16  4 0.58740 0.21770
## 17  4 0.59930 0.19840
## 18  5 0.61170 0.18070
## 19  5 0.62200 0.16470
## 20  5 0.63050 0.15010
## 21  5 0.63760 0.13670
## 22  5 0.64350 0.12460
## 23  5 0.64840 0.11350
## 24  5 0.65240 0.10340
## 25  6 0.65580 0.09424
## 26  6 0.65870 0.08587
## 27  6 0.66110 0.07824
## 28  6 0.66310 0.07129
## 29  7 0.66630 0.06496
## 30  7 0.66960 0.05919
## 31  7 0.67240 0.05393
## 32  7 0.67460 0.04914
## 33  7 0.67650 0.04477
## 34  8 0.67970 0.04079
## 35  8 0.68340 0.03717
## 36  8 0.68660 0.03387
## 37  8 0.68920 0.03086
## 38  8 0.69130 0.02812
## 39  8 0.69310 0.02562
## 40  8 0.69460 0.02334
## 41  8 0.69580 0.02127
## 42  8 0.69680 0.01938
## 43  8 0.69770 0.01766
## 44  8 0.69840 0.01609
## 45  8 0.69900 0.01466
## 46  8 0.69950 0.01336
## 47  8 0.69990 0.01217
## 48  8 0.70020 0.01109
## 49  8 0.70050 0.01010
## 50  8 0.70070 0.00921
## 51  8 0.70090 0.00839
## 52  8 0.70110 0.00764
## 53  8 0.70120 0.00696
## 54  8 0.70130 0.00635
## 55  8 0.70140 0.00578
## 56  8 0.70150 0.00527
## 57  8 0.70150 0.00480
## 58  8 0.70160 0.00437
## 59  8 0.70160 0.00399
## 60  8 0.70170 0.00363
## 61  8 0.70170 0.00331
## 62  8 0.70170 0.00302
## 63  8 0.70170 0.00275
## 64  8 0.70180 0.00250
## 65  8 0.70180 0.00228
## 66  8 0.70180 0.00208
## 67  8 0.70180 0.00189
## 68  8 0.70180 0.00172
## 69  8 0.70180 0.00157
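
Unlike ridge, the Df column now changes along the path: features enter one at a time as lambda decreases. The path also stops after 69 steps rather than the default 100, since glmnet halts early once additional lambda values no longer meaningfully improve the deviance explained.
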
plot(lasso, xvar = "lambda", label = TRUE)

[Figure: LASSO coefficient paths versus log lambda]

lasso.coef <- predict(lasso, s = 0.045, type = "coefficients")
lasso.coef
## 9 x 1 sparse Matrix of class "dgCMatrix"
##                         1
## (Intercept) -0.1305900670
## lcavol       0.4479592050
## lweight      0.5910476764
## age         -0.0073162861
## lbph         0.0974103575
## svi          0.4746790830
## lcp          .           
## gleason      0.2968768129
## pgg45        0.0009788059
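
The dot printed for lcp means its coefficient has been shrunk exactly to zero; at s = 0.045 the LASSO keeps seven of the eight features.
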
lasso.y <- predict(lasso, newx = newx, type = "response", s = 0.045)
plot(lasso.y, test$lpsa, xlab = "Predicted", ylab = "Actual", main = "LASSO")

[Figure: LASSO predictions versus actual lpsa on the test set]

lasso.resid <- lasso.y - test$lpsa
mean(lasso.resid^2)
## [1] 0.4437209
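
At 0.444, the LASSO beats both best subsets (0.508) and ridge (0.478). Note, though, that s = 0.045 was chosen by inspecting the coefficient path; a data-driven choice via cross-validation appears below.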

Elastic Net

The elastic net blends the two penalties, with the mixing parameter alpha running from 0 (pure ridge) to 1 (pure LASSO). We let caret search a grid of (alpha, lambda) pairs, scoring each combination by leave-one-out cross-validation on the training set.

grid <- expand.grid(.alpha = seq(0, 1, by = 0.2), .lambda = seq(0, 
    0.2, by = 0.02))
table(grid)
##       .lambda
## .alpha 0 0.02 0.04 0.06 0.08 0.1 0.12 0.14 0.16 0.18 0.2
##    0   1    1    1    1    1   1    1    1    1    1   1
##    0.2 1    1    1    1    1   1    1    1    1    1   1
##    0.4 1    1    1    1    1   1    1    1    1    1   1
##    0.6 1    1    1    1    1   1    1    1    1    1   1
##    0.8 1    1    1    1    1   1    1    1    1    1   1
##    1   1    1    1    1    1   1    1    1    1    1   1
head(grid)
##   .alpha .lambda
## 1    0.0       0
## 2    0.2       0
## 3    0.4       0
## 4    0.6       0
## 5    0.8       0
## 6    1.0       0
control <- trainControl(method = "LOOCV")  #selectionFunction='best'
set.seed(701)  #our random seed
enet.train = train(lpsa ~ ., data = train, method = "glmnet", trControl = control, 
    tuneGrid = grid)
enet.train
## glmnet 
## 
## 67 samples
##  8 predictor
## 
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation 
## Summary of sample sizes: 66, 66, 66, 66, 66, 66, ... 
## Resampling results across tuning parameters:
## 
##   alpha  lambda  RMSE       Rsquared   MAE      
##   0.0    0.00    0.7498311  0.6091434  0.5684548
##   0.0    0.02    0.7498311  0.6091434  0.5684548
##   0.0    0.04    0.7498311  0.6091434  0.5684548
##   0.0    0.06    0.7498311  0.6091434  0.5684548
##   0.0    0.08    0.7498311  0.6091434  0.5684548
##   0.0    0.10    0.7508038  0.6079576  0.5691343
##   0.0    0.12    0.7512303  0.6073484  0.5692774
##   0.0    0.14    0.7518536  0.6066518  0.5694451
##   0.0    0.16    0.7526409  0.6058786  0.5696155
##   0.0    0.18    0.7535502  0.6050559  0.5699154
##   0.0    0.20    0.7545583  0.6041945  0.5709377
##   0.2    0.00    0.7550170  0.6073913  0.5746434
##   0.2    0.02    0.7542648  0.6065770  0.5713633
##   0.2    0.04    0.7550427  0.6045111  0.5738111
##   0.2    0.06    0.7571893  0.6015388  0.5764117
##   0.2    0.08    0.7603394  0.5978544  0.5793641
##   0.2    0.10    0.7642219  0.5936131  0.5846567
##   0.2    0.12    0.7684937  0.5890900  0.5898445
##   0.2    0.14    0.7713716  0.5861699  0.5939008
##   0.2    0.16    0.7714681  0.5864527  0.5939535
##   0.2    0.18    0.7726979  0.5856865  0.5948039
##   0.2    0.20    0.7744041  0.5845281  0.5959671
##   0.4    0.00    0.7551309  0.6072931  0.5746628
##   0.4    0.02    0.7559055  0.6044693  0.5737468
##   0.4    0.04    0.7599608  0.5989266  0.5794120
##   0.4    0.06    0.7667450  0.5911506  0.5875213
##   0.4    0.08    0.7746900  0.5824231  0.5979116
##   0.4    0.10    0.7760605  0.5809784  0.6001763
##   0.4    0.12    0.7784160  0.5788024  0.6024102
##   0.4    0.14    0.7792216  0.5786250  0.6035698
##   0.4    0.16    0.7784433  0.5806388  0.6024696
##   0.4    0.18    0.7779134  0.5828322  0.6014095
##   0.4    0.20    0.7797721  0.5826306  0.6020934
##   0.6    0.00    0.7553016  0.6071317  0.5748038
##   0.6    0.02    0.7579330  0.6020021  0.5766881
##   0.6    0.04    0.7665234  0.5917067  0.5863966
##   0.6    0.06    0.7778600  0.5790948  0.6021629
##   0.6    0.08    0.7801170  0.5765439  0.6049216
##   0.6    0.10    0.7819242  0.5750003  0.6073379
##   0.6    0.12    0.7800315  0.5782365  0.6053700
##   0.6    0.14    0.7813077  0.5785548  0.6059119
##   0.6    0.16    0.7846753  0.5770307  0.6075899
##   0.6    0.18    0.7886388  0.5753101  0.6104210
##   0.6    0.20    0.7931549  0.5734018  0.6140249
##   0.8    0.00    0.7553734  0.6070686  0.5748255
##   0.8    0.02    0.7603679  0.5991567  0.5797001
##   0.8    0.04    0.7747753  0.5827275  0.5975303
##   0.8    0.06    0.7812784  0.5752601  0.6065340
##   0.8    0.08    0.7832103  0.5734172  0.6092662
##   0.8    0.10    0.7811248  0.5769787  0.6071341
##   0.8    0.12    0.7847355  0.5750115  0.6093756
##   0.8    0.14    0.7894184  0.5725728  0.6122536
##   0.8    0.16    0.7951091  0.5696205  0.6158407
##   0.8    0.18    0.8018475  0.5659672  0.6205741
##   0.8    0.20    0.8090352  0.5622252  0.6256726
##   1.0    0.00    0.7554439  0.6070354  0.5749409
##   1.0    0.02    0.7632577  0.5958706  0.5827830
##   1.0    0.04    0.7813519  0.5754986  0.6065914
##   1.0    0.06    0.7843882  0.5718847  0.6103528
##   1.0    0.08    0.7819175  0.5755415  0.6082954
##   1.0    0.10    0.7860004  0.5731009  0.6107923
##   1.0    0.12    0.7921572  0.5692525  0.6146159
##   1.0    0.14    0.7999326  0.5642789  0.6198758
##   1.0    0.16    0.8089248  0.5583637  0.6265797
##   1.0    0.18    0.8185327  0.5521348  0.6343174
##   1.0    0.20    0.8259445  0.5494268  0.6411104
## 
## RMSE was used to select the optimal model using the
##  smallest value.
## The final values used for the model were alpha = 0 and lambda
##  = 0.08.
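
Leave-one-out cross-validation settles on pure ridge, alpha = 0 with lambda = 0.08, so we refit that model with glmnet and score it on the test set.
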
enet <- glmnet(x, y, family = "gaussian", alpha = 0, lambda = 0.08)
enet.coef <- coef(enet, s = 0.08, exact = TRUE)  # s equals the fitted lambda, so no refit is needed
enet.coef
## 9 x 1 sparse Matrix of class "dgCMatrix"
##                        1
## (Intercept)  0.137811097
## lcavol       0.470960525
## lweight      0.652088157
## age         -0.018257308
## lbph         0.123608113
## svi          0.648209192
## lcp         -0.118214386
## gleason      0.345480799
## pgg45        0.004478267
enet.y <- predict(enet, newx = newx, type = "response", s = 0.08)
plot(enet.y, test$lpsa, xlab = "Predicted", ylab = "Actual", main = "Elastic Net")

[Figure: elastic net predictions versus actual lpsa on the test set]

enet.resid <- enet.y - test$lpsa
mean(enet.resid^2)
## [1] 0.4795019
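
Since the tuned model is effectively ridge, the 0.480 test MSE is essentially the earlier ridge result and still trails the LASSO's 0.444.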

Cross-Validation

Rather than picking lambda by eye, cv.glmnet selects it by k-fold cross-validation; with only 67 training observations we use three folds. lambda.min is the value with the smallest cross-validated error, while lambda.1se is the largest lambda whose error is within one standard error of that minimum, trading a little accuracy for a simpler model.

set.seed(317)
lasso.cv = cv.glmnet(x, y, nfolds = 3)
plot(lasso.cv)

[Figure: cv.glmnet cross-validation error versus log lambda]

lasso.cv$lambda.min  #minimum
## [1] 0.007644054
lasso.cv$lambda.1se  #one standard error away
## [1] 0.3158532
coef(lasso.cv, s = "lambda.1se")
## 9 x 1 sparse Matrix of class "dgCMatrix"
##                     1
## (Intercept) 1.0746246
## lcavol      0.4176727
## lweight     0.2246438
## age         .        
## lbph        .        
## svi         0.0648923
## lcp         .        
## gleason     .        
## pgg45       .
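
The more conservative lambda.1se solution keeps just three features: lcavol, lweight, and svi.
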
lasso.y.cv = predict(lasso.cv, newx = newx, type = "response", s = "lambda.1se")
lasso.cv.resid = lasso.y.cv - test$lpsa
mean(lasso.cv.resid^2)
## [1] 0.5508264
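
The sparser model pays for its simplicity with a 0.551 test MSE, the worst of the group. Pulling the test errors together (a convenience table of my own; every object comes from the chunks above):

data.frame(
    model = c("best subsets", "ridge", "LASSO (s = 0.045)",
        "elastic net", "LASSO (lambda.1se)"),
    test.mse = c(mean(resid.subfit^2), mean(ridge.resid^2),
        mean(lasso.resid^2), mean(enet.resid^2), mean(lasso.cv.resid^2)))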

Regularization and Classification

Regularization applies to classification as well. Here we reuse the Wisconsin breast cancer biopsy data from MASS, fit an L1-penalized logistic regression by setting family = "binomial", and tune lambda by cross-validated AUC.

library(MASS)
biopsy$ID = NULL
names(biopsy) = c("thick", "u.size", "u.shape", "adhsn", "s.size", 
    "nucl", "chrom", "n.nuc", "mit", "class")
biopsy.v2 <- na.omit(biopsy)
set.seed(123)  #random number generator
ind <- sample(2, nrow(biopsy.v2), replace = TRUE, prob = c(0.7, 0.3))
train <- biopsy.v2[ind == 1, ]  #the training data set
test <- biopsy.v2[ind == 2, ]  #the test data set

x <- as.matrix(train[, 1:9])
y <- train[, 10]

set.seed(3)
fitCV <- cv.glmnet(x, y, family = "binomial", type.measure = "auc", 
    nfolds = 5)
plot(fitCV)

[Figure: cross-validated AUC versus log lambda]

fitCV$lambda.1se
## [1] 0.1710154
coef(fitCV, s = "lambda.1se")
## 10 x 1 sparse Matrix of class "dgCMatrix"
##                        1
## (Intercept) -2.011804541
## thick        0.033203900
## u.size       0.107893817
## u.shape      0.086171949
## adhsn        .          
## s.size       .          
## nucl         0.150382121
## chrom        .          
## n.nuc        0.004832709
## mit          .
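
At lambda.1se the penalized logistic model keeps five of the nine features: thick, u.size, u.shape, nucl, and n.nuc.
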
library(InformationValue)
predCV <- predict(fitCV, newx = as.matrix(test[, 1:9]), s = "lambda.1se", 
    type = "response")
actuals <- ifelse(test$class == "malignant", 1, 0)
misClassError(actuals, predCV)
## [1] 0.0574
plotROC(actuals, predCV)

[Figure: ROC curve for the lambda.1se model]

predCV.min <- predict(fitCV, newx = as.matrix(test[, 1:9]), s = "lambda.min", 
    type = "response")
misClassError(actuals, predCV.min)
## [1] 0.0239
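
The lambda.min model misclassifies about 2.4% of the test cases versus 5.7% for lambda.1se. To go beyond the raw error rate, InformationValue can also suggest a probability cutoff and tabulate the resulting confusion matrix; a brief sketch, assuming the package's optimalCutoff() and confusionMatrix() helpers:

cutoff <- optimalCutoff(actuals, predCV.min)  # cutoff minimizing misclassification error
confusionMatrix(actuals, predCV.min, threshold = cutoff)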