RWEPA | R - shiny企業實務應用

第5集-shiny小明算命師(上)

Business Data Analytics

大綱

5.1 套件總覽

5.2 Excel檔案匯入

5.3 資料轉換與視覺化

5.4 建立預測模型

5.5 習題-R與小明算命師-shiny實作篇

5.6 結論

5.1 套件總覽

套件 功能
readr 匯入文字檔案,例:‘csv’, ‘tsv’, ‘fwf’
dplyr 資料轉換
ggcorrplot 相關矩陣繪圖
ggplot2 繪圖
randomForest 隨機森林
caret 分類,迴歸
pROC ROC曲線繪圖

載入套件

library(readr) # read_csv
library(dplyr) # mutate_if, mutate, group_by, summarise, select, rename_with
library(ggcorrplot)   # ggcorrplot
library(ggplot2)      # ggplot
library(randomForest) # randomForest, importance, varImpPlot
library(caret)        # trainControl, train
library(pROC)         # plot.roc

5.2 Excel檔案匯入

# 載入人力資源資料集
urls <- "https://raw.githubusercontent.com/rwepa/DataDemo/master/human_resource.csv"
df <- read_csv(urls)
df # A tibble: 14,999 × 10
# A tibble: 14,999 × 10
   last_eva…¹ numbe…² avera…³ time_…⁴ Work_…⁵ satis…⁶  left promo…⁷ role  salary
        <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <dbl>   <dbl> <chr> <chr> 
 1       0.53       2     157       3       0    0.38     1       0 sales low   
 2       0.86       5     262       6       0    0.8      1       0 sales medium
 3       0.88       7     272       4       0    0.11     1       0 sales medium
 4       0.87       5     223       5       0    0.72     1       0 sales low   
 5       0.52       2     159       3       0    0.37     1       0 sales low   
 6       0.5        2     153       3       0    0.41     1       0 sales low   
 7       0.77       6     247       4       0    0.1      1       0 sales low   
 8       0.85       5     259       5       0    0.92     1       0 sales low   
 9       1          5     224       5       0    0.89     1       0 sales low   
10       0.53       2     142       3       0    0.42     1       0 sales low   
# … with 14,989 more rows, and abbreviated variable names ¹​last_evaluation,
#   ²​number_project, ³​average_montly_hours, ⁴​time_spend_company,
#   ⁵​Work_accident, ⁶​satisfaction_level, ⁷​promotion_last_5years

5.3 資料轉換與視覺化

  • 資料結構 str

  • 資料摘要 summary

  • 資料轉換 mutate_if, mutate

  • left群組百分比, left群組的變數平均值

  • 相關係數矩陣圖,left群組盒鬚圖

資料結構

str(df) # left: num數值, role, salary: chr字元
spc_tbl_ [14,999 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ last_evaluation      : num [1:14999] 0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ...
 $ number_project       : num [1:14999] 2 5 7 5 2 2 6 5 5 2 ...
 $ average_montly_hours : num [1:14999] 157 262 272 223 159 153 247 259 224 142 ...
 $ time_spend_company   : num [1:14999] 3 6 4 5 3 3 4 5 5 3 ...
 $ Work_accident        : num [1:14999] 0 0 0 0 0 0 0 0 0 0 ...
 $ satisfaction_level   : num [1:14999] 0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ...
 $ left                 : num [1:14999] 1 1 1 1 1 1 1 1 1 1 ...
 $ promotion_last_5years: num [1:14999] 0 0 0 0 0 0 0 0 0 0 ...
 $ role                 : chr [1:14999] "sales" "sales" "sales" "sales" ...
 $ salary               : chr [1:14999] "low" "medium" "medium" "low" ...
 - attr(*, "spec")=
  .. cols(
  ..   last_evaluation = col_double(),
  ..   number_project = col_double(),
  ..   average_montly_hours = col_double(),
  ..   time_spend_company = col_double(),
  ..   Work_accident = col_double(),
  ..   satisfaction_level = col_double(),
  ..   left = col_double(),
  ..   promotion_last_5years = col_double(),
  ..   role = col_character(),
  ..   salary = col_character()
  .. )
 - attr(*, "problems")=<externalptr> 

資料摘要

summary(df) # 資料沒有NA
 last_evaluation  number_project  average_montly_hours time_spend_company
 Min.   :0.3600   Min.   :2.000   Min.   : 96.0        Min.   : 2.000    
 1st Qu.:0.5600   1st Qu.:3.000   1st Qu.:156.0        1st Qu.: 3.000    
 Median :0.7200   Median :4.000   Median :200.0        Median : 3.000    
 Mean   :0.7161   Mean   :3.803   Mean   :201.1        Mean   : 3.498    
 3rd Qu.:0.8700   3rd Qu.:5.000   3rd Qu.:245.0        3rd Qu.: 4.000    
 Max.   :1.0000   Max.   :7.000   Max.   :310.0        Max.   :10.000    
 Work_accident    satisfaction_level      left        promotion_last_5years
 Min.   :0.0000   Min.   :0.0900     Min.   :0.0000   Min.   :0.00000      
 1st Qu.:0.0000   1st Qu.:0.4400     1st Qu.:0.0000   1st Qu.:0.00000      
 Median :0.0000   Median :0.6400     Median :0.0000   Median :0.00000      
 Mean   :0.1446   Mean   :0.6128     Mean   :0.2381   Mean   :0.02127      
 3rd Qu.:0.0000   3rd Qu.:0.8200     3rd Qu.:0.0000   3rd Qu.:0.00000      
 Max.   :1.0000   Max.   :1.0000     Max.   :1.0000   Max.   :1.00000      
     role              salary         
 Length:14999       Length:14999      
 Class :character   Class :character  
 Mode  :character   Mode  :character  
                                      
                                      
                                      

資料轉換

# 資料轉換
df <- df %>%
  mutate_if(is.character, as.factor) %>% # 轉換 character 為 factor
  mutate(left=factor(left, labels=c("No","Yes")))
str(df)
tibble [14,999 × 10] (S3: tbl_df/tbl/data.frame)
 $ last_evaluation      : num [1:14999] 0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ...
 $ number_project       : num [1:14999] 2 5 7 5 2 2 6 5 5 2 ...
 $ average_montly_hours : num [1:14999] 157 262 272 223 159 153 247 259 224 142 ...
 $ time_spend_company   : num [1:14999] 3 6 4 5 3 3 4 5 5 3 ...
 $ Work_accident        : num [1:14999] 0 0 0 0 0 0 0 0 0 0 ...
 $ satisfaction_level   : num [1:14999] 0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ...
 $ left                 : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
 $ promotion_last_5years: num [1:14999] 0 0 0 0 0 0 0 0 0 0 ...
 $ role                 : Factor w/ 10 levels "accounting","hr",..: 8 8 8 8 8 8 8 8 8 8 ...
 $ salary               : Factor w/ 3 levels "high","low","medium": 2 3 3 2 2 2 2 2 2 2 ...

left群組百分比

df %>%
  group_by(left) %>% # 建立依left為群組
  summarise(Count = n()) %>% # 建立群組總計Count
  mutate(Freq = round(Count / sum(Count), 2)) #建立群組百分比Freq
# A tibble: 2 × 3
  left  Count  Freq
  <fct> <int> <dbl>
1 No    11428  0.76
2 Yes    3571  0.24

left群組的變數平均值

df %>%
  select(-c(role, salary)) %>% # 刪除 role, salary 二個欄位
  group_by(left) %>% # 依left建立群組
  summarise(across(everything(), list(mean))) %>% # 計算各群組的平均值
  rename_with(~ tolower(gsub("_1", "_mean", .x, fixed = TRUE))) # 更改欄位名稱
# A tibble: 2 × 8
  left  last_evaluation_mean number_pr…¹ avera…² time_…³ work_…⁴ satis…⁵ promo…⁶
  <fct>                <dbl>       <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
1 No                   0.715        3.79    199.    3.38  0.175    0.667 0.0263 
2 Yes                  0.718        3.86    207.    3.88  0.0473   0.440 0.00532
# … with abbreviated variable names ¹​number_project_mean,
#   ²​average_montly_hours_mean, ³​time_spend_company_mean, ⁴​work_accident_mean,
#   ⁵​satisfaction_level_mean, ⁶​promotion_last_5years_mean

相關係數

corr <- cor(as.data.frame(df %>% select(-c(role, salary, left))))
corr
                      last_evaluation number_project average_montly_hours
last_evaluation           1.000000000    0.349332589          0.339741800
number_project            0.349332589    1.000000000          0.417210634
average_montly_hours      0.339741800    0.417210634          1.000000000
time_spend_company        0.131590722    0.196785891          0.127754910
Work_accident            -0.007104289   -0.004740548         -0.010142888
satisfaction_level        0.105021214   -0.142969586         -0.020048113
promotion_last_5years    -0.008683768   -0.006063958         -0.003544414
                      time_spend_company Work_accident satisfaction_level
last_evaluation              0.131590722  -0.007104289         0.10502121
number_project               0.196785891  -0.004740548        -0.14296959
average_montly_hours         0.127754910  -0.010142888        -0.02004811
time_spend_company           1.000000000   0.002120418        -0.10086607
Work_accident                0.002120418   1.000000000         0.05869724
satisfaction_level          -0.100866073   0.058697241         1.00000000
promotion_last_5years        0.067432925   0.039245435         0.02560519
                      promotion_last_5years
last_evaluation                -0.008683768
number_project                 -0.006063958
average_montly_hours           -0.003544414
time_spend_company              0.067432925
Work_accident                   0.039245435
satisfaction_level              0.025605186
promotion_last_5years           1.000000000

相關係數矩陣圖

ggcorrplot(corr, lab = TRUE, colors=c("sienna1","white","deepskyblue3")) +
  ggtitle("人力資源相關係數矩陣圖") +
  theme(plot.title = element_text(hjust = 0.5))

員工工作滿意度-left群組盒鬚圖

ggplot(aes(y = satisfaction_level, x = left), data = df) + 
  geom_boxplot() +
  ggtitle("員工工作滿意度-left群組盒鬚圖") +
  theme(plot.title = element_text(hjust = 0.5))

整體vs.離職員工平均工作滿意度

# 整體員工的平均工作滿意度
emp_population_satisfaction <- mean(df$satisfaction_level)

# 離職員工的平均工作滿意度
left_pop <- subset(df, left == "Yes")
emp_turnover_satisfaction <- mean(left_pop$satisfaction_level)

print(paste(c('整體員工的工作滿意度: ', 
              round(emp_population_satisfaction, 4)*100, '%'), 
            collapse =""))
[1] "整體員工的工作滿意度: 61.28%"
print(paste(c('離職員工的工作滿意度: ', 
              round(emp_turnover_satisfaction, 4)*100, '%'), 
            collapse =""))
[1] "離職員工的工作滿意度: 44.01%"

T檢定-整體vs.離職平均工作滿意度

# T檢定-比較整體員工與離職員工的平均工作滿意度
# H0: 離職員工的平均工作滿意度等於整體員工的平均工作滿意度
# H1: 離職員工的平均工作滿意度不等於整體員工的平均工作滿意度
ttest_satisfaction <- t.test(x=left_pop$satisfaction_level, mu=emp_population_satisfaction)
ttest_satisfaction

    One Sample t-test

data:  left_pop$satisfaction_level
t = -39.109, df = 3570, p-value < 2.2e-16
alternative hypothesis: true mean is not equal to 0.6128335
95 percent confidence interval:
 0.4314385 0.4487576
sample estimates:
mean of x 
 0.440098 
if (ttest_satisfaction$p.value <= 0.05) {
  print("Rejet H0")
} else {
  print("Do not reject H0")
}
[1] "Rejet H0"

5.4 建立預測模型

  • 特徵重要性繪圖

  • 建立訓練集(70%),測試集(30%)

  • 建立監督式學習模型

特徵重要性

# 使用隨機森林(random forest)
df_rf <- randomForest(left~., data=df)

# 特徵重要性(feature importance)
importance(df_rf)
                      MeanDecreaseGini
last_evaluation             661.578598
number_project              976.199414
average_montly_hours        759.250818
time_spend_company         1002.005265
Work_accident                28.940414
satisfaction_level         1848.913213
promotion_last_5years         5.032504
role                         81.938004
salary                       42.301898

特徵重要性繪圖

# 特徵重要性繪圖(feature importance plot)
varImpPlot(df_rf, pch = 16, main="人力資源特徵重要性繪圖")

建立訓練集(70%),測試集(30%)

set.seed(168)
trainIndex <- createDataPartition(df$left, p = 0.7, list = FALSE)
train <- df[trainIndex,]
head(train, n=3)
# A tibble: 3 × 10
  last_eval…¹ numbe…² avera…³ time_…⁴ Work_…⁵ satis…⁶ left  promo…⁷ role  salary
        <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <fct>   <dbl> <fct> <fct> 
1        0.53       2     157       3       0    0.38 Yes         0 sales low   
2        0.86       5     262       6       0    0.8  Yes         0 sales medium
3        0.88       7     272       4       0    0.11 Yes         0 sales medium
# … with abbreviated variable names ¹​last_evaluation, ²​number_project,
#   ³​average_montly_hours, ⁴​time_spend_company, ⁵​Work_accident,
#   ⁶​satisfaction_level, ⁷​promotion_last_5years
test <- df[-trainIndex,] # 使用負號
head(test, n=3)
# A tibble: 3 × 10
  last_eval…¹ numbe…² avera…³ time_…⁴ Work_…⁵ satis…⁶ left  promo…⁷ role  salary
        <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <fct>   <dbl> <fct> <fct> 
1        0.85       5     259       5       0    0.92 Yes         0 sales low   
2        0.81       6     305       4       0    0.11 Yes         0 sales low   
3        0.92       4     234       5       0    0.84 Yes         0 sales low   
# … with abbreviated variable names ¹​last_evaluation, ²​number_project,
#   ³​average_montly_hours, ⁴​time_spend_company, ⁵​Work_accident,
#   ⁶​satisfaction_level, ⁷​promotion_last_5years

建立監督式學習模型

set.seed(168)

fitControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  summaryFunction = twoClassSummary, 
  classProbs = TRUE,
  savePredictions = TRUE)

model <- function(usedmethod, data_train, data_test) {
  myformula <- as.formula("left ~ .")
  tmp <- train(myformula, # 使用 train 進行模型訓練
               data = data_train, 
               trControl = fitControl, 
               method = usedmethod)
  
  mymodel <- list(trainmodel=tmp, # <技巧>使用list回傳模型結果與正確率
                  accuracy=mean(data_test$left == predict(tmp, data_test)))
  return(mymodel)
}

方法1: 廣義線性模型(GLM)

model_glm <- model("glm", train, test)

# 顯示 summary, accuracy
# summary(model_glm$trainmodel)
model_glm$accuracy  # 0.7879529
[1] 0.7879529
# Plot roc curve
plot.roc(model_glm$trainmodel$pred$obs,
         model_glm$trainmodel$pred$Yes,
         print.auc=TRUE,
         print.auc.x=0.2, 
         print.auc.y=0.7,
         grid=TRUE,
         col=2)
legend("bottomright", 
       legend=c("glm", "cart", "knn"),
       col=c(2:4), 
       lwd=2)

ROC曲線

方法2: CART決策樹

# 方法2: CART決策樹
model_tree <- model("rpart", train, test)

# 顯示 accuracy
# summary(model_tree$trainmodel)
model_tree$accuracy # 0.9113136

# Plot roc curve
plot.roc(model_tree$trainmodel$pred$obs,
         model_tree$trainmodel$pred$Yes,
         print.auc=TRUE,
         add=TRUE,
         col=3,
         print.auc.x=0.3, 
         print.auc.y=0.6)

方法3: KNN(K-近鄰演算法 K-Nearest Neighbors)

# 方法3: KNN
model_knn <- model("knn", train, test)

# 顯示 accuracy
model_knn$accuracy # 0.9404312

# Plot roc curve
plot.roc(model_knn$trainmodel$pred$obs,
         model_knn$trainmodel$pred$Yes,
         print.auc=TRUE,
         add=TRUE,
         col=4,
         print.auc.x=0.4, 
         print.auc.y=0.5)

ROC曲線(GLM, CART, KNN)

5.5 習題-R與小明算命師-shiny實作篇

  • 實作shiny人力資源儀表板

  • 建立資料摘要,結構,繪圖,監督式學習預測

  • 自動產生報表結果,內容包括監督式學習模型,模型視覺化,混淆矩陣,ROC曲線,預測正確率等

5.6 結論

  • 熟悉 readr, dplyr, ggcorrplot, ggplot2, randomForest, caret, pROC 套件

  • 理解第4章R與小明算命師的資料APC方法

  • 第5章習題-R與小明算命師(下) …期待最終回【第6集】

THANK YOU😃