## Tested on R version 4.1.2 (2021-11-01)

###########################################################
## Libraries; not sure we need all of them for this pseudo-code,
## but here they are. textir is certainly a must to fit MNIR,
## slam for handling the dtms, the rest are peripheral.
##
## To install a library, use install.packages("packagename"), e.g.
## install.packages("textir").
###########################################################
library(stringr)
library(tibble)
library(dplyr)
library(lfe)
library(Matrix)
library(stargazer)
library(textir)
library(readr)
library(tidytext)
library(tm)
library(slam)
library(parallel)

####################################################################
## Auxiliary functions
####################################################################

## Winsorize a numeric vector at the fudge_me and 1-fudge_me quantiles
winsorize_dg <- function(x, fudge_me=0.01){
  y <- x
  y_top <- quantile(y, 1-fudge_me, na.rm=TRUE)
  y_bot <- quantile(y, fudge_me, na.rm=TRUE)
  y[y>y_top] <- y_top
  y[y<y_bot] <- y_bot
  y
}

## Helper assumed from the comments below ("normalize sentiment scores to SD=1"):
## rescale a numeric vector to have standard deviation 1. Note this masks base::norm().
norm <- function(x){
  x / sd(x, na.rm=TRUE)
}

###########################################################
## Dictionary-based ML sentiment, main specification.
## Assumes meta_public (call-level metadata with LM_pos, LM_neg,
## WC_tot, return_kaggle, FF_49, year_quarter, date_call), the
## document-term matrices dtm_unigrams / dtm_bigrams, and the
## (term, coefficient) dictionaries unigrams_dictionary /
## bigrams_dictionary were built or loaded in the earlier parts
## of the script.
###########################################################
ML_uni_pos_main_spec <- unigrams_dictionary[unigrams_dictionary[,2]>0,1]
ML_uni_neg_main_spec <- unigrams_dictionary[unigrams_dictionary[,2]<0,1]
ML_bi_pos_main_spec  <- bigrams_dictionary[bigrams_dictionary[,2]>0,1]
ML_bi_neg_main_spec  <- bigrams_dictionary[bigrams_dictionary[,2]<0,1]

meta_public$ML_uni_pos <- slam::row_sums(dtm_unigrams[,colnames(dtm_unigrams) %in% ML_uni_pos_main_spec]) / meta_public$WC_tot
meta_public$ML_uni_neg <- slam::row_sums(dtm_unigrams[,colnames(dtm_unigrams) %in% ML_uni_neg_main_spec]) / meta_public$WC_tot
meta_public$ML_bi_pos  <- slam::row_sums(dtm_bigrams[,colnames(dtm_bigrams) %in% ML_bi_pos_main_spec]) / meta_public$WC_tot
meta_public$ML_bi_neg  <- slam::row_sums(dtm_bigrams[,colnames(dtm_bigrams) %in% ML_bi_neg_main_spec]) / meta_public$WC_tot

## In the paper we winsorize within the regression sample; here we winsorize
## over the whole sample, which does not matter much.
## Also normalize sentiment scores to SD=1.
meta_public$LM_pos <- winsorize_dg(meta_public$LM_pos)
meta_public$LM_neg <- winsorize_dg(meta_public$LM_neg)
meta_public$LM_pos <- norm(meta_public$LM_pos)
meta_public$LM_neg <- norm(meta_public$LM_neg)

meta_public$ML_uni_pos <- winsorize_dg(meta_public$ML_uni_pos)
meta_public$ML_uni_neg <- winsorize_dg(meta_public$ML_uni_neg)
meta_public$ML_uni_pos <- norm(meta_public$ML_uni_pos)
meta_public$ML_uni_neg <- norm(meta_public$ML_uni_neg)

meta_public$ML_bi_pos <- winsorize_dg(meta_public$ML_bi_pos)
meta_public$ML_bi_neg <- winsorize_dg(meta_public$ML_bi_neg)
meta_public$ML_bi_pos <- norm(meta_public$ML_bi_pos)
meta_public$ML_bi_neg <- norm(meta_public$ML_bi_neg)
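## Optional sanity check (a minimal sketch; assumes meta_public is a data.frame
## with the columns created above): look at the score distributions and the
## correlation between the LM and ML sentiment measures.
sent_cols <- c("LM_pos", "LM_neg", "ML_uni_pos", "ML_uni_neg", "ML_bi_pos", "ML_bi_neg")
summary(meta_public[, sent_cols])
cor(meta_public[, sent_cols], use = "pairwise.complete.obs")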
###############################################################
## PART 3.
## Run regressions on the 2015-2019 data.
## This should be close to Table 2 in the paper.
###############################################################

###################### Define time stamps
time_year  <- as.numeric(gsub("^(\\d{4}).+", "\\1", meta_public$date_call))
time_month <- as.numeric(gsub("^(\\d{4})-(\\d{2})-(\\d{2})", "\\2", meta_public$date_call))
time_day   <- as.numeric(gsub("^(\\d{4})-(\\d{2})-(\\d{2})", "\\3", meta_public$date_call))
time_dg <- time_year + (time_month-1)/12 + time_day/365

## TRUE for calls before 2015 (the estimation sample); regressions below use the complement
limit_sample <- time_dg < 2015

## Run regressions
reg <- list()  ## To save regression output
reg[[1]] <- felm(return_kaggle ~ LM_pos + LM_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])
reg[[2]] <- felm(return_kaggle ~ ML_uni_pos + ML_uni_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])
reg[[3]] <- felm(return_kaggle ~ ML_bi_pos + ML_bi_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])
reg[[4]] <- felm(return_kaggle ~ LM_pos + LM_neg + ML_uni_pos + ML_uni_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])
reg[[5]] <- felm(return_kaggle ~ LM_pos + LM_neg + ML_bi_pos + ML_bi_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])

stargazer(reg, keep.stat=c("n", "adj.rsq"), digits = 3, digits.extra = 0,
          align = TRUE, no.space = TRUE, report = "vc*t", type = "text")

######################################################
## PART 4.
## ML scoring using the plain-English word lists created on 2006-2014 data.
######################################################
meta_public$ML_uni_pos <- slam::row_sums(dtm_unigrams[,colnames(dtm_unigrams) %in% ML_pos_unigrams_2014]) / meta_public$WC_tot
meta_public$ML_uni_neg <- slam::row_sums(dtm_unigrams[,colnames(dtm_unigrams) %in% ML_neg_unigrams_2014]) / meta_public$WC_tot
meta_public$ML_bi_pos  <- slam::row_sums(dtm_bigrams[,colnames(dtm_bigrams) %in% ML_pos_bigrams_2014]) / meta_public$WC_tot
meta_public$ML_bi_neg  <- slam::row_sums(dtm_bigrams[,colnames(dtm_bigrams) %in% ML_neg_bigrams_2014]) / meta_public$WC_tot

## Winsorizing/normalizing
meta_public$LM_pos <- winsorize_dg(meta_public$LM_pos)
meta_public$LM_neg <- winsorize_dg(meta_public$LM_neg)
meta_public$LM_pos <- norm(meta_public$LM_pos)
meta_public$LM_neg <- norm(meta_public$LM_neg)

meta_public$ML_uni_pos <- winsorize_dg(meta_public$ML_uni_pos)
meta_public$ML_uni_neg <- winsorize_dg(meta_public$ML_uni_neg)
meta_public$ML_uni_pos <- norm(meta_public$ML_uni_pos)
meta_public$ML_uni_neg <- norm(meta_public$ML_uni_neg)

meta_public$ML_bi_pos <- winsorize_dg(meta_public$ML_bi_pos)
meta_public$ML_bi_neg <- winsorize_dg(meta_public$ML_bi_neg)
meta_public$ML_bi_pos <- norm(meta_public$ML_bi_pos)
meta_public$ML_bi_neg <- norm(meta_public$ML_bi_neg)
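## Optional coverage check (a minimal sketch; assumes the *_2014 objects are character
## vectors of terms): how many dtm columns match each plain-English word list.
sapply(list(pos_uni = ML_pos_unigrams_2014, neg_uni = ML_neg_unigrams_2014),
       function(v) sum(colnames(dtm_unigrams) %in% v))
sapply(list(pos_bi = ML_pos_bigrams_2014, neg_bi = ML_neg_bigrams_2014),
       function(v) sum(colnames(dtm_bigrams) %in% v))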
###############################################################
## PART 5.
## Run regressions on the 2015-2019 data.
## This should be close to Panel A, Table 10 in the paper.
###############################################################
reg <- list()  ## To save regression output

## Table 1, Model 2
reg[[1]] <- felm(return_kaggle ~ LM_pos + LM_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])
reg[[2]] <- felm(return_kaggle ~ ML_uni_pos + ML_uni_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])
reg[[3]] <- felm(return_kaggle ~ ML_bi_pos + ML_bi_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])
reg[[4]] <- felm(return_kaggle ~ LM_pos + LM_neg + ML_uni_pos + ML_uni_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])
reg[[5]] <- felm(return_kaggle ~ LM_pos + LM_neg + ML_bi_pos + ML_bi_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])

stargazer(reg, keep.stat=c("n", "adj.rsq"), digits = 3, digits.extra = 0,
          align = TRUE, no.space = TRUE, report = "vc*t", type = "text")

######################################################
## PART 6.
## Estimating MNIR for unigrams.
######################################################
limit_sample_kaggle <- limit_sample & !is.na(meta_public$return_kaggle)

## Drop terms that never appear in the estimation sample
counts.words <- slam::col_sums(dtm_unigrams[limit_sample_kaggle,])
dtm_new <- dtm_unigrams[,counts.words>0]

time1 <- Sys.time()
nr.clusters <- 6  # number of clusters used for the MNIR implementation
cl <- makeCluster(nr.clusters)

## MNIR fit
fits <- dmr(cl, meta_public$return_kaggle[limit_sample_kaggle],
            dtm_new[limit_sample_kaggle,],
            bins=NULL, gamma=0, nlambda=10, verb=2)

## Extract MNIR coefficients (term loadings on the return covariate)
mnir.coef <- sort(coef(fits)[2,])

## End cluster
stopCluster(cl)
time2 <- Sys.time()
time2-time1  ## 15 minutes

## Since this takes more than a minute, save the output
file.out <- paste("fits.dtm.unigrams.main", ".RData", sep="")
save(fits, mnir.coef, file=file.out)

######################################################
## PART 7.
## Estimating MNIR for bigrams.
######################################################
counts.words <- slam::col_sums(dtm_bigrams[limit_sample_kaggle,])
dtm_new <- dtm_bigrams[,counts.words>0]

time1 <- Sys.time()
nr.clusters <- 6  # number of clusters used for the MNIR implementation
cl <- makeCluster(nr.clusters)

## MNIR fit
fits <- dmr(cl, meta_public$return_kaggle[limit_sample_kaggle],
            dtm_new[limit_sample_kaggle,],
            bins=NULL, gamma=0, nlambda=10, verb=2)

## Extract MNIR coefficients
mnir.coef <- sort(coef(fits)[2,])

## End cluster
stopCluster(cl)
time2 <- Sys.time()
time2-time1  ## 14 minutes

## Since this takes more than a minute, save the output
file.out <- paste("fits.dtm.bigrams.main", ".RData", sep="")
save(fits, mnir.coef, file=file.out)
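## Optional: peek at the terms with the largest MNIR loadings from the fit just saved.
## mnir.coef is sorted in ascending order above, so head() shows the most negative
## terms and tail() the most positive ones.
head(mnir.coef, 20)
tail(mnir.coef, 20)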
#####################################################################
## PART 8.
## Create sentiment scores from the MNIR fits and do the OOS estimation as before.
#####################################################################

## Unigrams
load('fits.dtm.unigrams.main.RData')
meta_public$ML_uni_pos <- slam::row_sums(dtm_unigrams[,colnames(dtm_unigrams) %in% names(mnir.coef)[mnir.coef>0]]) / meta_public$WC_tot
meta_public$ML_uni_neg <- slam::row_sums(dtm_unigrams[,colnames(dtm_unigrams) %in% names(mnir.coef)[mnir.coef<0]]) / meta_public$WC_tot

## Bigrams
load('fits.dtm.bigrams.main.RData')
meta_public$ML_bi_pos <- slam::row_sums(dtm_bigrams[,colnames(dtm_bigrams) %in% names(mnir.coef)[mnir.coef>0]]) / meta_public$WC_tot
meta_public$ML_bi_neg <- slam::row_sums(dtm_bigrams[,colnames(dtm_bigrams) %in% names(mnir.coef)[mnir.coef<0]]) / meta_public$WC_tot

## Normalize/winsorize
for(cn in c("ML_bi_pos","ML_bi_neg","ML_uni_pos","ML_uni_neg")) {
  meta_public[[cn]] <- norm(meta_public[[cn]])
  meta_public[[cn]] <- winsorize_dg(meta_public[[cn]])
}

#########################################################
## Main specification.
## Subset of Table 1 and Table 2.
#########################################################
reg <- list()  ## To save regression output

## Table 1, Model 2
reg[[1]] <- felm(return_kaggle ~ LM_pos + LM_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])
reg[[2]] <- felm(return_kaggle ~ ML_uni_pos + ML_uni_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])
reg[[3]] <- felm(return_kaggle ~ ML_bi_pos + ML_bi_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])
reg[[4]] <- felm(return_kaggle ~ LM_pos + LM_neg + ML_uni_pos + ML_uni_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])
reg[[5]] <- felm(return_kaggle ~ LM_pos + LM_neg + ML_bi_pos + ML_bi_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])

stargazer(reg, keep.stat=c("n", "adj.rsq"), digits = 3, digits.extra = 0,
          align = TRUE, no.space = TRUE, report = "vc*t", type = "text")
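## Optional: write the same table to a LaTeX file instead of printing text, and record
## the session for reproducibility. The output file name below is just a placeholder.
stargazer(reg, keep.stat=c("n", "adj.rsq"), digits = 3, digits.extra = 0,
          align = TRUE, no.space = TRUE, report = "vc*t", type = "latex",
          out = "table_mnir_main.tex")
sessionInfo()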