"media", "media", "media", "dem", "tech", "media", "algo", "tech", "dem", "algo", NA))
overlap <- c(rep("yes", 4), "no", "no", "yes", "no", "yes", "yes", "no", "yes", "yes", "yes", "no", "yes", rep("no", 5), "yes", "yes", 'yes',
rep("no", 4), "yes", rep("no", 4), "yes", "yes", rep("no", 8), "yes", "yes", "no", "no", "yes", "yes")
vid_hiring_lasso_tidy_2 <- ggplot(vid_hiring_lasso_tidy %>% filter(term != "(Intercept)")) +
aes(x = estimate, y = reorder(term, - estimate)) +
geom_point(aes(shape = category), size = 1.7) +
geom_vline(xintercept = 0, size = 0.2) +
theme_bw() +
theme(panel.border = element_blank(),
plot.title = element_text(hjust = 0.5),
axis.text.y = element_text(colour= rev(ifelse(overlap == "yes", "grey30", "black")))) +
ggtitle("Predictors of thinking that mining videos is fair") +
xlab("Magnitude and sign of lasso coefficient") +
ylab("Predictor") +
scale_shape_discrete(name = "Variable type",
labels = c("Algorithms", "Demographics", "Soc media", "Politics",
"Technology", "Other")) +
scale_y_discrete(labels = rev(c("Fair finance score", "Fair risk score", "Acceptable risk score", "Acceptable soc media differentially experiment sentiments",
"Effective finance score", "Tech corps should be less regulated", "Soc media content makes feel connected",
"Male", "Users lots control over Newsfeed", "Rel attendance once or twice a month",
"Very conservative", "Acceptable soc media use data advertise",
"Acceptable finance score", "$75,000 + family income", "Little content soc media about gun control/violence",
"South", "Effective risk score", "Unions have too much power", "Rel attendance more than once a week",
"65+ years old", "Married or living with a partner", "Acceptable soc media differentially remind vote",
"Tech corps have right amount of power", "Tech corps more ethical than others", "Easy for soc media to figure rel based data",
"Soc media content makes angry", "Soc media content mix of supportive and mean", "High school or less",
"Has adjusted Fb privacy last year", "Users no control over Newsfeed content", "Has liked, shared or comment influence Newsfeed",
"West", "Tech corps often fail anticipate soc impact", "Has deleted Fb app phone last year",
"Tech corps have more bad than good soc impact", "Few Fb posts relevant/interesting",
"Soc media doesn't highlight issues that would't get attention",
"Programs are biased", "Not likely soc media intentionally censor", "Not easy for soc media to figure rel based data",
"Indicated want to see on Newsfeed", "Other religion", "Can't trust tech corps do what's right",
"Soc media contents don't provide accurate picture society", "Ineffective finance score", "Tech corps don't protect data",
"50-64 years old", "Unfair risk score"))) +
labs(caption="Only coefficients different than zero displayed. Intercept not displayed.
Model fitted excluding effectiveness of response as predictor.") # 670 x 550
pdf("vid_hiring_lasso_tidy_2.pdf")
print(vid_hiring_lasso_tidy_2)
dev.off()
cv_hiring_lasso <- glmnet(fair_cv_hiring_bin ~ ., data = data_cv_hiring_train, alpha = 1, lambda = 0.01900601, family = "binomial")
cv_hiring_lasso_tidy <- tidy(cv_hiring_lasso, return_zeros = FALSE)
cv_hiring_lasso_tidy <- cv_hiring_lasso_tidy %>%
arrange(estimate) %>%
add_column(category = c("algo", "pol", "algo", "algo", "media", "media", "media", "algo", "media", "algo", "media", "media", "tech", "tech",
"tech", "pol", "tech", "media", "tech", "algo", "media", "dem", "tech", "pol", "media", "tech", "tech", "algo", "tech",
NA))
overlap <- c("yes", "no", rep("yes", 6), "no", "no", "yes", "yes", "yes", rep("no", 8), "yes", "no", "yes", "yes", "no", "no", "yes", "yes")
cv_hiring_lasso_tidy_2 <- ggplot(cv_hiring_lasso_tidy %>% filter(term != "(Intercept)")) +
aes(x = estimate, y = reorder(term, - estimate)) +
geom_point(aes(shape = category), size = 1.7) +
geom_vline(xintercept = 0, size = 0.2) +
theme_bw() +
theme(panel.border = element_blank(),
plot.title = element_text(hjust = 0.5),
axis.text.y = element_text(colour= rev(ifelse(overlap == "yes", "grey30", "black")))) +
ggtitle("Predictors of thinking that mining resumes is fair") +
xlab("Magnitude and sign of lasso coefficient") +
ylab("Predictor") +
scale_shape_discrete(name = "Variable type",
labels = c("Algorithms", "Demographics", "Soc media", "Politics",
"Technology", "Other")) +
scale_y_discrete(labels = rev(c("Acceptable finance score", "Advertisers don't have enough power", "Effective risk score", "Fair risk score",
"Acceptable soc media differentially experiment sentiments", "Acceptable soc media differentially change site",
"Acceptable soc media use data advertise", "Acceptable risk score", "No response soc media content accurate picture society",
"Fair finance score", "Hasn't changed privacy/ad preferences influence Newsfeed",
"Hasn't liked, shared or commented influence Newsfeed", "Tech corps protect data", "Programs are unbiased",
"Tech corps should be less regulated", "Energy industry has about right amount power",
"Tech corps more good than bad pers impact", "Soc media content makes feel depressed",
"Tech corps should be more regulated", "Ineffective finance score", "Users no control over content Newsfeed", "Non-white",
"Programs are biased", "Advertisers too much power and influence",  "Soc media content makes angry",
"Tech corps more bad than good pers impact", "No response how much tech corps regulated", "Ineffective risk score",
"Tech corps have more bad than good soc impact"))) +
labs(caption="Only coefficients different than zero displayed. Intercept not displayed.
Model fitted excluding effectiveness of response as predictor.") # 670 x 550
pdf("cv_hiring_lasso_tidy_2.pdf")
print(cv_hiring_lasso_tidy_2)
dev.off()
citation()
citation()
citation(tidyverse)
citation(haven)
citation(glmnet)
citation(glmnetUtils)
citation(randomForest)
citation(caret)
citation(gbm)
citation(bartMachine)
citation(broom)
library(tidyverse)
library(haven)
library(glmnet)
library(glmnetUtils)
library(randomForest)
library(caret)
library(gbm)
library(bartMachine)
library(broom)
citation(tidyverse)
citation(haven)
citation(glmnet)
citation(glmnetUtils)
citation(randomForest)
citation(caret)
citation(gbm)
citation(bartMachine)
citation(broom)
library(tidyverse)
citation(tidyverse)
citation("tidyverse")
citation("tidyverse")
citation("haven")
citation("glmnet")
citation("glmnetUtils")
citation("randomForest")
citation("caret")
citation("gbm")
citation("bartMachine")
citation("broom")
# Model selection
# Setting parameters ------------------------------------------------------
options(java.parameters = "-Xmx54g")
# Loading packages --------------------------------------------------------
library(tidyverse)
library(glmnet) # masks 'expand', 'pack', and 'unpack' from tidyr
library(glmnetUtils)
library(randomForest) # masks 'combine' from 'dplyr' and 'margin' from ggplot2
library(caret) # masks 'lift' from purrr
library(gbm)
library(bartMachine)
# Reading data ------------------------------------------------------------
data_cred_score_train <- readRDS("data/processed/data_cred_score_train.rds") %>%
select(-c(acceptable_cred_score))
data_risk_score_train <- readRDS("data/processed/data_risk_score_train.rds") %>%
select(-c(acceptable_risk_score))
data_vid_hiring_train <- readRDS("data/processed/data_vid_hiring_train.rds") %>%
select(-c(acceptable_vid_hiring))
data_cv_hiring_train <- readRDS("data/processed/data_cv_hiring_train.rds") %>%
select(-c(acceptable_cv_hiring))
data_cred_score_test <- readRDS("data/processed/data_cred_score_test.rds") %>%
select(-c(acceptable_cred_score))
data_risk_score_test <- readRDS("data/processed/data_risk_score_test.rds") %>%
select(-c(acceptable_risk_score))
data_vid_hiring_test <- readRDS("data/processed/data_vid_hiring_test.rds") %>%
select(-c(acceptable_vid_hiring))
data_cv_hiring_test <- readRDS("data/processed/data_cv_hiring_test.rds") %>%
select(-c(acceptable_cv_hiring))
# Setting parameters ------------------------------------------------------
set.seed(2620)
fit_control <- trainControl(method = "none", classProbs = TRUE)
# Helper function ---------------------------------------------------------
#' @name misclass_rate
misclass_rate <- function(outcome, model, dataset){
p_hat <- predict(model, newdata = dataset, type = "response")
class_hat <- if_else(p_hat > 0.5, 1, 0)
misclass_rate <- mean(class_hat != outcome)
misclass_rate
}
#' @name misclass_rate_rf
misclass_rate_rf <- function(outcome, model, dataset){
class_hat <- predict(model, newdata = dataset, type = "response")
misclass_rate <- mean(class_hat != outcome)
misclass_rate
}
#' @name misclass_rate_bt
misclass_rate_bt <- function(outcome, model, dataset){
p_hat <- predict(model, newdata = dataset, type = "prob")
class_hat <- if_else(p_hat[1] > 0.5, 0, 1)
misclass_rate <- mean(class_hat != outcome)
misclass_rate
}
#' @name misclass_rate_bart
misclass_rate_bart <- function(outcome, model, x_test, y_test){
predictions <- bart_predict_for_test_data(model, x_test, y_test)
class_hat <- predictions$y_hat
misclass_rate <- mean(class_hat != y_test)
misclass_rate
}
# Logistic regressions ----------------------------------------------------
cred_score_logit <- glm(fair_cred_score_bin ~ ., data = data_cred_score_train, family = binomial(link = "logit"))
cred_score_logit_mr <- misclass_rate(data_cred_score_test$fair_cred_score_bin, cred_score_logit, data_cred_score_test)
risk_score_logit <- glm(fair_risk_score_bin ~ ., data = data_risk_score_train, family = binomial(link = "logit"))
risk_score_logit_mr <- misclass_rate(data_risk_score_test$fair_risk_score_bin, risk_score_logit, data_risk_score_test)
vid_hiring_logit <- glm(fair_vid_hiring_bin ~ ., data = data_vid_hiring_train, family = binomial(link = "logit"))
vid_hiring_logit_mr <- misclass_rate(data_vid_hiring_test$fair_vid_hiring_bin, vid_hiring_logit, data_vid_hiring_test)
cv_hiring_logit <- glm(fair_cv_hiring_bin ~ ., data = data_cv_hiring_train, family = binomial(link = "logit"))
cv_hiring_logit_mr <- misclass_rate(data_cv_hiring_test$fair_cv_hiring_bin, cv_hiring_logit, data_cv_hiring_test)
# Regularized regressions -------------------------------------------------
cred_score_ridge <- glmnet(fair_cred_score_bin ~ ., data = data_cred_score_train, alpha = 0, lambda = 0.1846425, family = "binomial")
cred_score_ridge_mr <- misclass_rate(data_cred_score_test$fair_cred_score_bin, cred_score_ridge, data_cred_score_test)
cred_score_lasso <- glmnet(fair_cred_score_bin ~ ., data = data_cred_score_train, alpha = 1, lambda = 0.01397735, family = "binomial")
cred_score_lasso_mr <- misclass_rate(data_cred_score_test$fair_cred_score_bin, cred_score_lasso, data_cred_score_test)
risk_score_ridge <- glmnet(fair_risk_score_bin ~ ., data = data_risk_score_train, alpha = 0, lambda = 0.09221979, family = "binomial")
risk_score_ridge_mr <- misclass_rate(data_risk_score_test$fair_risk_score_bin, risk_score_ridge, data_risk_score_test)
risk_score_lasso <- glmnet(fair_risk_score_bin ~ ., data = data_risk_score_train, alpha = 1, lambda = 0.01569086, family = "binomial")
risk_score_lasso_mr <- misclass_rate(data_risk_score_test$fair_risk_score_bin, risk_score_lasso, data_risk_score_test)
vid_hiring_ridge <- glmnet(fair_vid_hiring_bin ~ ., data = data_vid_hiring_train, alpha = 0, lambda = 0.1217383, family = "binomial")
vid_hiring_ridge_mr <- misclass_rate(data_vid_hiring_test$fair_vid_hiring_bin, vid_hiring_ridge, data_vid_hiring_test)
vid_hiring_lasso <- glmnet(fair_vid_hiring_bin ~ ., data = data_vid_hiring_train, alpha = 1, lambda = 0.01452274, family = "binomial")
vid_hiring_lasso_mr <- misclass_rate(data_vid_hiring_test$fair_vid_hiring_bin, vid_hiring_lasso, data_vid_hiring_test)
cv_hiring_ridge <- glmnet(fair_cv_hiring_bin ~ ., data = data_cv_hiring_train, alpha = 0, lambda = 0.105956, family = "binomial")
cv_hiring_ridge_mr <- misclass_rate(data_cv_hiring_test$fair_cv_hiring_bin, cv_hiring_ridge, data_cv_hiring_test)
cv_hiring_lasso <- glmnet(fair_cv_hiring_bin ~ ., data = data_cv_hiring_train, alpha = 1, lambda = 0.01519515, family = "binomial")
cv_hiring_lasso_mr <- misclass_rate(data_cv_hiring_test$fair_cv_hiring_bin, cv_hiring_lasso, data_cv_hiring_test)
# Random forests ----------------------------------------------------------
cred_score_rf <- randomForest(fair_cred_score_bin ~ ., data = data_cred_score_train, nodesize = 10, mtry = 30)
cred_score_rf_mr <- misclass_rate_rf(data_cred_score_test$fair_cred_score_bin, cred_score_rf, data_cred_score_test)
risk_score_rf <- randomForest(fair_risk_score_bin ~ ., data = data_risk_score_train, nodesize = 10, mtry = 33)
risk_score_rf_mr <- misclass_rate_rf(data_risk_score_test$fair_risk_score_bin, risk_score_rf, data_risk_score_test)
vid_hiring_rf <- randomForest(fair_vid_hiring_bin ~ ., data = data_vid_hiring_train, nodesize = 10, mtry = 54)
vid_hiring_rf_mr <- misclass_rate_rf(data_vid_hiring_test$fair_vid_hiring_bin, vid_hiring_rf, data_vid_hiring_test)
cv_hiring_rf <- randomForest(fair_cv_hiring_bin ~ ., data = data_cv_hiring_train, nodesize = 10, mtry = 53)
cv_hiring_rf_mr <- misclass_rate_rf(data_cv_hiring_test$fair_cv_hiring_bin, cv_hiring_rf, data_cv_hiring_test)
# Boosted trees -----------------------------------------------------------
cred_score_bt <- train(fair_cred_score_bin ~ ., data = data_cred_score_train %>% mutate(fair_cred_score_bin = factor(fair_cred_score_bin,
labels = make.names(levels(fair_cred_score_bin)))),
method = "gbm",
trControl = fit_control,
verbose = FALSE,
tuneGrid = data.frame(interaction.depth = 1,
n.trees = 1100,
shrinkage = 0.012,
n.minobsinnode = 10),
metric = "Accuracy")
cred_score_bt_mr <- misclass_rate_bt(data_cred_score_test$fair_cred_score_bin, cred_score_bt, data_cred_score_test)
risk_score_bt <- train(fair_risk_score_bin ~ ., data = data_risk_score_train %>% mutate(fair_risk_score_bin = factor(fair_risk_score_bin,
labels = make.names(levels(fair_risk_score_bin)))),
method = "gbm",
trControl = fit_control,
verbose = FALSE,
tuneGrid = data.frame(interaction.depth = 1,
n.trees = 100,
shrinkage = 0.001,
n.minobsinnode = 10),
metric = "Accuracy")
risk_score_bt_mr <- misclass_rate_bt(data_risk_score_test$fair_risk_score_bin, risk_score_bt, data_risk_score_test)
vid_hiring_bt <- train(fair_vid_hiring_bin ~ ., data = data_vid_hiring_train %>% mutate(fair_vid_hiring_bin = factor(fair_vid_hiring_bin,
labels = make.names(levels(fair_vid_hiring_bin)))),
method = "gbm",
trControl = fit_control,
verbose = FALSE,
tuneGrid = data.frame(interaction.depth = 1,
n.trees = 650,
shrinkage = 0.001,
n.minobsinnode = 10),
metric = "Accuracy")
vid_hiring_bt_mr <- misclass_rate_bt(data_vid_hiring_test$fair_vid_hiring_bin, vid_hiring_bt, data_vid_hiring_test)
cv_hiring_bt <- train(fair_cv_hiring_bin ~ ., data = data_cv_hiring_train %>% mutate(fair_cv_hiring_bin = factor(fair_cv_hiring_bin,
labels = make.names(levels(fair_cv_hiring_bin)))),
method = "gbm",
trControl = fit_control,
verbose = FALSE,
tuneGrid = data.frame(interaction.depth = 1,
n.trees = 500,
shrinkage = 0.023,
n.minobsinnode = 10),
metric = "Accuracy")
cv_hiring_bt_mr <- misclass_rate_bt(data_cv_hiring_test$fair_cv_hiring_bin, cv_hiring_bt, data_cv_hiring_test)
# BART --------------------------------------------------------------------
cred_score_y <- data_cred_score_train$fair_cred_score_bin
cred_score_x <- data_cred_score_train
cred_score_x$fair_cred_score_bin <- NULL
risk_score_y <- data_risk_score_train$fair_risk_score_bin
risk_score_x <- data_risk_score_train
risk_score_x$fair_risk_score_bin <- NULL
vid_hiring_y <- data_vid_hiring_train$fair_vid_hiring_bin
vid_hiring_x <- data_vid_hiring_train
vid_hiring_x$fair_vid_hiring_bin <- NULL
cv_hiring_y <- data_cv_hiring_train$fair_cv_hiring_bin
cv_hiring_x <- data_cv_hiring_train
cv_hiring_x$fair_cv_hiring_bin <- NULL
cred_score_y_test <- data_cred_score_test$fair_cred_score_bin
cred_score_x_test <- data_cred_score_test
cred_score_x_test$fair_cred_score_bin <- NULL
risk_score_y_test <- data_risk_score_test$fair_risk_score_bin
risk_score_x_test <- data_risk_score_test
risk_score_x_test$fair_risk_score_bin <- NULL
vid_hiring_y_test <- data_vid_hiring_test$fair_vid_hiring_bin
vid_hiring_x_test <- data_vid_hiring_test
vid_hiring_x_test$fair_vid_hiring_bin <- NULL
cv_hiring_y_test <- data_cv_hiring_test$fair_cv_hiring_bin
cv_hiring_x_test <- data_cv_hiring_test
cv_hiring_x_test$fair_cv_hiring_bin <- NULL
cred_score_bart <- bartMachine(data.frame(cred_score_x), cred_score_y, k = 2, num_trees = 200)
cred_score_bart_mr <- misclass_rate_bart(data_cred_score_test$fair_cred_score_bin, cred_score_bart, cred_score_x_test, cred_score_y_test)
risk_score_bart <- bartMachine(data.frame(risk_score_x), risk_score_y, k = 5, num_trees = 50)
risk_score_bart_mr <- misclass_rate_bart(data_risk_score_test$fair_risk_score_bin, risk_score_bart, risk_score_x_test, risk_score_y_test)
vid_hiring_bart <- bartMachine(data.frame(vid_hiring_x), vid_hiring_y, k = 3, num_trees = 200)
vid_hiring_bart_mr <- misclass_rate_bart(data_vid_hiring_test$fair_vid_hiring_bin, vid_hiring_bart, vid_hiring_x_test, vid_hiring_y_test)
cv_hiring_bart <- bartMachine(data.frame(cv_hiring_x), cv_hiring_y, k = 5, num_trees = 50)
cv_hiring_bart_mr <- misclass_rate_bart(data_cv_hiring_test$fair_cv_hiring_bin, cv_hiring_bart, cv_hiring_x_test, cv_hiring_y_test)
# Visual model comparison -------------------------------------------------
misclass_rates <- tibble(models = rep(c("Logistic", "Ridge", "Lasso", "Random forest", "Boosted trees", "BART"), times=4),
response = c(rep("Credit score", times=6), rep("Risk score", times=6), rep("Video hiring", times=6), rep("Resume hiring", times=6)),
misclass_rate = c(cred_score_logit_mr, cred_score_ridge_mr, cred_score_lasso_mr, cred_score_rf_mr, cred_score_bt_mr, cred_score_bart_mr,
risk_score_logit_mr, risk_score_ridge_mr, risk_score_lasso_mr, risk_score_rf_mr, risk_score_bt_mr, risk_score_bart_mr,
vid_hiring_logit_mr, vid_hiring_ridge_mr, vid_hiring_lasso_mr, vid_hiring_rf_mr, vid_hiring_bt_mr, vid_hiring_bart_mr,
cv_hiring_logit_mr, cv_hiring_ridge_mr, cv_hiring_lasso_mr, cv_hiring_rf_mr, cv_hiring_bt_mr, cv_hiring_bart_mr))
saveRDS(misclass_rates, file = "misclass_rates.rds")
misclass_rates <- ggplot(misclass_rates) +
aes(x=misclass_rate, y=models) +
geom_point() +
facet_wrap(~response) +
theme_bw() +
theme(panel.border = element_blank(),
plot.title = element_text(hjust = 0.5)) +
ggtitle("Test misclassification rates across models and responses") +
xlab("Misclassification rate") +
ylab("Model") # 632 x 421
pdf("misclass_rates.pdf")
print(misclass_rates)
dev.off()
table(data_cred_score_test$fair_cred_score_bin)/nrow(data_cred_score_test) # 0.2759382
table(data_risk_score_test$fair_risk_score_bin)/nrow(data_risk_score_test) # 0.4847162
table(data_vid_hiring_test$fair_vid_hiring_bin)/nrow(data_vid_hiring_test) # 0.3535792
table(data_cv_hiring_test$fair_cv_hiring_bin)/nrow(data_cv_hiring_test) # 0.4635762
# Model selection
# Setting parameters ------------------------------------------------------
options(java.parameters = "-Xmx54g")
# Loading packages --------------------------------------------------------
library(tidyverse)
library(glmnet) # masks 'expand', 'pack', and 'unpack' from tidyr
library(glmnetUtils)
library(randomForest) # masks 'combine' from 'dplyr' and 'margin' from ggplot2
library(caret) # masks 'lift' from purrr
library(gbm)
library(bartMachine)
# Reading data ------------------------------------------------------------
data_cred_score_train <- readRDS("data/processed/data_cred_score_train.rds") %>%
select(-c(effective_cred_score_bin, acceptable_cred_score))
data_risk_score_train <- readRDS("data/processed/data_risk_score_train.rds") %>%
select(-c(effective_risk_score_bin, acceptable_risk_score))
data_vid_hiring_train <- readRDS("data/processed/data_vid_hiring_train.rds") %>%
select(-c(effective_vid_hiring_bin, acceptable_vid_hiring))
data_cv_hiring_train <- readRDS("data/processed/data_cv_hiring_train.rds") %>%
select(-c(effective_cv_hiring_bin, acceptable_cv_hiring))
data_cred_score_test <- readRDS("data/processed/data_cred_score_test.rds") %>%
select(-c(effective_cred_score_bin, acceptable_cred_score))
data_risk_score_test <- readRDS("data/processed/data_risk_score_test.rds") %>%
select(-c(effective_risk_score_bin, acceptable_risk_score))
data_vid_hiring_test <- readRDS("data/processed/data_vid_hiring_test.rds") %>%
select(-c(effective_vid_hiring_bin, acceptable_vid_hiring))
data_cv_hiring_test <- readRDS("data/processed/data_cv_hiring_test.rds") %>%
select(-c(effective_cv_hiring_bin, acceptable_cv_hiring))
# Setting parameters ------------------------------------------------------
set.seed(2620)
fit_control <- trainControl(method = "none", classProbs = TRUE)
# Helper function ---------------------------------------------------------
#' @name misclass_rate
misclass_rate <- function(outcome, model, dataset){
p_hat <- predict(model, newdata = dataset, type = "response")
class_hat <- if_else(p_hat > 0.5, 1, 0)
misclass_rate <- mean(class_hat != outcome)
misclass_rate
}
#' @name misclass_rate_rf
misclass_rate_rf <- function(outcome, model, dataset){
class_hat <- predict(model, newdata = dataset, type = "response")
misclass_rate <- mean(class_hat != outcome)
misclass_rate
}
#' @name misclass_rate_bt
misclass_rate_bt <- function(outcome, model, dataset){
p_hat <- predict(model, newdata = dataset, type = "prob")
class_hat <- if_else(p_hat[1] > 0.5, 0, 1)
misclass_rate <- mean(class_hat != outcome)
misclass_rate
}
#' @name misclass_rate_bart
misclass_rate_bart <- function(outcome, model, x_test, y_test){
predictions <- bart_predict_for_test_data(model, x_test, y_test)
class_hat <- predictions$y_hat
misclass_rate <- mean(class_hat != y_test)
misclass_rate
}
# Logistic regressions ----------------------------------------------------
cred_score_logit <- glm(fair_cred_score_bin ~ ., data = data_cred_score_train, family = binomial(link = "logit"))
cred_score_logit_mr <- misclass_rate(data_cred_score_test$fair_cred_score_bin, cred_score_logit, data_cred_score_test)
risk_score_logit <- glm(fair_risk_score_bin ~ ., data = data_risk_score_train, family = binomial(link = "logit"))
risk_score_logit_mr <- misclass_rate(data_risk_score_test$fair_risk_score_bin, risk_score_logit, data_risk_score_test)
vid_hiring_logit <- glm(fair_vid_hiring_bin ~ ., data = data_vid_hiring_train, family = binomial(link = "logit"))
vid_hiring_logit_mr <- misclass_rate(data_vid_hiring_test$fair_vid_hiring_bin, vid_hiring_logit, data_vid_hiring_test)
cv_hiring_logit <- glm(fair_cv_hiring_bin ~ ., data = data_cv_hiring_train, family = binomial(link = "logit"))
cv_hiring_logit_mr <- misclass_rate(data_cv_hiring_test$fair_cv_hiring_bin, cv_hiring_logit, data_cv_hiring_test)
# Regularized regressions -------------------------------------------------
cred_score_ridge <- glmnet(fair_cred_score_bin ~ ., data = data_cred_score_train, alpha = 0, lambda = 0.4247572, family = "binomial")
cred_score_ridge_mr <- misclass_rate(data_cred_score_test$fair_cred_score_bin, cred_score_ridge, data_cred_score_test)
cred_score_lasso <- glmnet(fair_cred_score_bin ~ ., data = data_cred_score_train, alpha = 1, lambda = 0.01543331, family = "binomial")
cred_score_lasso_mr <- misclass_rate(data_cred_score_test$fair_cred_score_bin, cred_score_lasso, data_cred_score_test)
risk_score_ridge <- glmnet(fair_risk_score_bin ~ ., data = data_risk_score_train, alpha = 0, lambda = 0.3696913, family = "binomial")
risk_score_ridge_mr <- misclass_rate(data_risk_score_test$fair_risk_score_bin, risk_score_ridge, data_risk_score_test)
risk_score_lasso <- glmnet(fair_risk_score_bin ~ ., data = data_risk_score_train, alpha = 1, lambda = 0.01573379, family = "binomial")
risk_score_lasso_mr <- misclass_rate(data_risk_score_test$fair_risk_score_bin, risk_score_lasso, data_risk_score_test)
vid_hiring_ridge <- glmnet(fair_vid_hiring_bin ~ ., data = data_vid_hiring_train, alpha = 0, lambda = 0.3217642, family = "binomial")
vid_hiring_ridge_mr <- misclass_rate(data_vid_hiring_test$fair_vid_hiring_bin, vid_hiring_ridge, data_vid_hiring_test)
vid_hiring_lasso <- glmnet(fair_vid_hiring_bin ~ ., data = data_vid_hiring_train, alpha = 1, lambda = 0.01493088, family = "binomial")
vid_hiring_lasso_mr <- misclass_rate(data_vid_hiring_test$fair_vid_hiring_bin, vid_hiring_lasso, data_vid_hiring_test)
cv_hiring_ridge <- glmnet(fair_cv_hiring_bin ~ ., data = data_cv_hiring_train, alpha = 0, lambda = 0.560717, family = "binomial")
cv_hiring_ridge_mr <- misclass_rate(data_cv_hiring_test$fair_cv_hiring_bin, cv_hiring_ridge, data_cv_hiring_test)
cv_hiring_lasso <- glmnet(fair_cv_hiring_bin ~ ., data = data_cv_hiring_train, alpha = 1, lambda = 0.01900601, family = "binomial")
cv_hiring_lasso_mr <- misclass_rate(data_cv_hiring_test$fair_cv_hiring_bin, cv_hiring_lasso, data_cv_hiring_test)
# Random forests ----------------------------------------------------------
cred_score_rf <- randomForest(fair_cred_score_bin ~ ., data = data_cred_score_train, nodesize = 10, mtry = 85)
cred_score_rf_mr <- misclass_rate_rf(data_cred_score_test$fair_cred_score_bin, cred_score_rf, data_cred_score_test)
risk_score_rf <- randomForest(fair_risk_score_bin ~ ., data = data_risk_score_train, nodesize = 10, mtry = 38)
risk_score_rf_mr <- misclass_rate_rf(data_risk_score_test$fair_risk_score_bin, risk_score_rf, data_risk_score_test)
vid_hiring_rf <- randomForest(fair_vid_hiring_bin ~ ., data = data_vid_hiring_train, nodesize = 10, mtry = 45)
vid_hiring_rf_mr <- misclass_rate_rf(data_vid_hiring_test$fair_vid_hiring_bin, vid_hiring_rf, data_vid_hiring_test)
cv_hiring_rf <- randomForest(fair_cv_hiring_bin ~ ., data = data_cv_hiring_train, nodesize = 10, mtry = 24)
cv_hiring_rf_mr <- misclass_rate_rf(data_cv_hiring_test$fair_cv_hiring_bin, cv_hiring_rf, data_cv_hiring_test)
# Boosted trees -----------------------------------------------------------
cred_score_bt <- train(fair_cred_score_bin ~ ., data = data_cred_score_train %>% mutate(fair_cred_score_bin = factor(fair_cred_score_bin,
labels = make.names(levels(fair_cred_score_bin)))),
method = "gbm",
trControl = fit_control,
verbose = FALSE,
tuneGrid = data.frame(interaction.depth = 1,
n.trees = 550,
shrinkage = 0.045,
n.minobsinnode = 10),
metric = "Accuracy")
cred_score_bt_mr <- misclass_rate_bt(data_cred_score_test$fair_cred_score_bin, cred_score_bt, data_cred_score_test)
risk_score_bt <- train(fair_risk_score_bin ~ ., data = data_risk_score_train %>% mutate(fair_risk_score_bin = factor(fair_risk_score_bin,
labels = make.names(levels(fair_risk_score_bin)))),
method = "gbm",
trControl = fit_control,
verbose = FALSE,
tuneGrid = data.frame(interaction.depth = 1,
n.trees = 750,
shrinkage = 0.001,
n.minobsinnode = 10),
metric = "Accuracy")
risk_score_bt_mr <- misclass_rate_bt(data_risk_score_test$fair_risk_score_bin, risk_score_bt, data_risk_score_test)
vid_hiring_bt <- train(fair_vid_hiring_bin ~ ., data = data_vid_hiring_train %>% mutate(fair_vid_hiring_bin = factor(fair_vid_hiring_bin,
labels = make.names(levels(fair_vid_hiring_bin)))),
method = "gbm",
trControl = fit_control,
verbose = FALSE,
tuneGrid = data.frame(interaction.depth = 1,
n.trees = 150,
shrinkage = 0.089,
n.minobsinnode = 10),
metric = "Accuracy")
vid_hiring_bt_mr <- misclass_rate_bt(data_vid_hiring_test$fair_vid_hiring_bin, vid_hiring_bt, data_vid_hiring_test)
cv_hiring_bt <- train(fair_cv_hiring_bin ~ ., data = data_cv_hiring_train %>% mutate(fair_cv_hiring_bin = factor(fair_cv_hiring_bin,
labels = make.names(levels(fair_cv_hiring_bin)))),
method = "gbm",
trControl = fit_control,
verbose = FALSE,
tuneGrid = data.frame(interaction.depth = 1,
n.trees = 1000,
shrinkage = 0.012,
n.minobsinnode = 10),
metric = "Accuracy")
cv_hiring_bt_mr <- misclass_rate_bt(data_cv_hiring_test$fair_cv_hiring_bin, cv_hiring_bt, data_cv_hiring_test)
# BART --------------------------------------------------------------------
cred_score_y <- data_cred_score_train$fair_cred_score_bin
cred_score_x <- data_cred_score_train
cred_score_x$fair_cred_score_bin <- NULL
risk_score_y <- data_risk_score_train$fair_risk_score_bin
risk_score_x <- data_risk_score_train
risk_score_x$fair_risk_score_bin <- NULL
vid_hiring_y <- data_vid_hiring_train$fair_vid_hiring_bin
vid_hiring_x <- data_vid_hiring_train
vid_hiring_x$fair_vid_hiring_bin <- NULL
cv_hiring_y <- data_cv_hiring_train$fair_cv_hiring_bin
cv_hiring_x <- data_cv_hiring_train
cv_hiring_x$fair_cv_hiring_bin <- NULL
cred_score_y_test <- data_cred_score_test$fair_cred_score_bin
cred_score_x_test <- data_cred_score_test
cred_score_x_test$fair_cred_score_bin <- NULL
risk_score_y_test <- data_risk_score_test$fair_risk_score_bin
risk_score_x_test <- data_risk_score_test
risk_score_x_test$fair_risk_score_bin <- NULL
vid_hiring_y_test <- data_vid_hiring_test$fair_vid_hiring_bin
vid_hiring_x_test <- data_vid_hiring_test
vid_hiring_x_test$fair_vid_hiring_bin <- NULL
cv_hiring_y_test <- data_cv_hiring_test$fair_cv_hiring_bin
cv_hiring_x_test <- data_cv_hiring_test
cv_hiring_x_test$fair_cv_hiring_bin <- NULL
cred_score_bart <- bartMachine(data.frame(cred_score_x), cred_score_y, k = 2, num_trees = 200)
cred_score_bart_mr <- misclass_rate_bart(data_cred_score_test$fair_cred_score_bin, cred_score_bart, cred_score_x_test, cred_score_y_test)
risk_score_bart <- bartMachine(data.frame(risk_score_x), risk_score_y, k = 5, num_trees = 50)
risk_score_bart_mr <- misclass_rate_bart(data_risk_score_test$fair_risk_score_bin, risk_score_bart, risk_score_x_test, risk_score_y_test)
vid_hiring_bart <- bartMachine(data.frame(vid_hiring_x), vid_hiring_y, k = 5, num_trees = 200)
vid_hiring_bart_mr <- misclass_rate_bart(data_vid_hiring_test$fair_vid_hiring_bin, vid_hiring_bart, vid_hiring_x_test, vid_hiring_y_test)
cv_hiring_bart <- bartMachine(data.frame(cv_hiring_x), cv_hiring_y, k = 5, num_trees = 50)
cv_hiring_bart_mr <- misclass_rate_bart(data_cv_hiring_test$fair_cv_hiring_bin, cv_hiring_bart, cv_hiring_x_test, cv_hiring_y_test)
# Visual model comparison -------------------------------------------------
misclass_rates <- tibble(models = rep(c("Logistic", "Ridge", "Lasso", "Random forest", "Boosted trees", "BART"), times=4),
response = c(rep("Credit score", times=6), rep("Risk score", times=6), rep("Video hiring", times=6), rep("Resume hiring", times=6)),
misclass_rate = c(cred_score_logit_mr, cred_score_ridge_mr, cred_score_lasso_mr, cred_score_rf_mr, cred_score_bt_mr, cred_score_bart_mr,
risk_score_logit_mr, risk_score_ridge_mr, risk_score_lasso_mr, risk_score_rf_mr, risk_score_bt_mr, risk_score_bart_mr,
vid_hiring_logit_mr, vid_hiring_ridge_mr, vid_hiring_lasso_mr, vid_hiring_rf_mr, vid_hiring_bt_mr, vid_hiring_bart_mr,
cv_hiring_logit_mr, cv_hiring_ridge_mr, cv_hiring_lasso_mr, cv_hiring_rf_mr, cv_hiring_bt_mr, cv_hiring_bart_mr))
saveRDS(misclass_rates, file = "misclass_rates.rds")
misclass_rates_2 <- ggplot(misclass_rates) +
aes(x=misclass_rate, y=models) +
geom_point() +
facet_wrap(~response) +
theme_bw() +
theme(panel.border = element_blank(),
plot.title = element_text(hjust = 0.5)) +
ggtitle("Test misclassification rates across models and responses") +
xlab("Misclassification rate") +
ylab("Model") # 632 x 421
pdf("misclass_rates_2.pdf")
print(misclass_rates_2)
dev.off()
table(data_cred_score_test$fair_cred_score_bin)/nrow(data_cred_score_test) # 0.2759382
table(data_risk_score_test$fair_risk_score_bin)/nrow(data_risk_score_test) # 0.4847162
table(data_vid_hiring_test$fair_vid_hiring_bin)/nrow(data_vid_hiring_test) # 0.3535792
table(data_cv_hiring_test$fair_cv_hiring_bin)/nrow(data_cv_hiring_test) # 0.4635762
