## Tested on R version 4.1.2 (2021-11-01)

###########################################################
## Libraries; not sure we need all of them for this pseudo-code,
## but here they are. textir is certainly a must to fit MNIR,
## slam for handling the dtms, the rest are peripheral.
##
## To install a library, use install.packages("packagename"), e.g.
## install.packages("textir").
###########################################################
library(stringr)
library(tibble)
library(dplyr)
library(lfe)
library(Matrix)
library(stargazer)
library(textir)
library(readr)
library(tidytext)
library(tm)
library(slam)
library(parallel)

####################################################################
## Auxiliary functions
####################################################################

## Winsorize a numeric vector at the fudge_me and 1-fudge_me quantiles
winsorize_dg <- function(x, fudge_me=0.01){
  y <- x
  y_top <- quantile(y, 1-fudge_me, na.rm=TRUE)
  y_bot <- quantile(y, fudge_me, na.rm=TRUE)
  y[y>y_top] <- y_top
  y[y<y_bot] <- y_bot
  y
}

## Helper assumed from the comments below ("normalize sentiment scores to SD=1"):
## rescale a numeric vector to have standard deviation 1. Note this masks base::norm().
norm <- function(x){
  x / sd(x, na.rm=TRUE)
}

###########################################################
## Dictionary-based ML sentiment, main specification.
## Assumes meta_public (call-level metadata with LM_pos, LM_neg,
## WC_tot, return_kaggle, FF_49, year_quarter, date_call), the
## document-term matrices dtm_unigrams / dtm_bigrams, and the
## (term, coefficient) dictionaries unigrams_dictionary /
## bigrams_dictionary were built or loaded in the earlier parts
## of the script.
###########################################################
ML_uni_pos_main_spec <- unigrams_dictionary[unigrams_dictionary[,2]>0,1]
ML_uni_neg_main_spec <- unigrams_dictionary[unigrams_dictionary[,2]<0,1]
ML_bi_pos_main_spec  <- bigrams_dictionary[bigrams_dictionary[,2]>0,1]
ML_bi_neg_main_spec  <- bigrams_dictionary[bigrams_dictionary[,2]<0,1]

meta_public$ML_uni_pos <- slam::row_sums(dtm_unigrams[,colnames(dtm_unigrams) %in% ML_uni_pos_main_spec]) / meta_public$WC_tot
meta_public$ML_uni_neg <- slam::row_sums(dtm_unigrams[,colnames(dtm_unigrams) %in% ML_uni_neg_main_spec]) / meta_public$WC_tot
meta_public$ML_bi_pos  <- slam::row_sums(dtm_bigrams[,colnames(dtm_bigrams) %in% ML_bi_pos_main_spec]) / meta_public$WC_tot
meta_public$ML_bi_neg  <- slam::row_sums(dtm_bigrams[,colnames(dtm_bigrams) %in% ML_bi_neg_main_spec]) / meta_public$WC_tot

## In the paper we winsorize within the regression sample; here we winsorize
## over the whole sample, which does not matter much.
## Also normalize sentiment scores to SD=1.
meta_public$LM_pos <- winsorize_dg(meta_public$LM_pos)
meta_public$LM_neg <- winsorize_dg(meta_public$LM_neg)
meta_public$LM_pos <- norm(meta_public$LM_pos)
meta_public$LM_neg <- norm(meta_public$LM_neg)

meta_public$ML_uni_pos <- winsorize_dg(meta_public$ML_uni_pos)
meta_public$ML_uni_neg <- winsorize_dg(meta_public$ML_uni_neg)
meta_public$ML_uni_pos <- norm(meta_public$ML_uni_pos)
meta_public$ML_uni_neg <- norm(meta_public$ML_uni_neg)

meta_public$ML_bi_pos <- winsorize_dg(meta_public$ML_bi_pos)
meta_public$ML_bi_neg <- winsorize_dg(meta_public$ML_bi_neg)
meta_public$ML_bi_pos <- norm(meta_public$ML_bi_pos)
meta_public$ML_bi_neg <- norm(meta_public$ML_bi_neg)
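## Optional sanity check (a minimal sketch; assumes meta_public is a data.frame
## with the columns created above): look at the score distributions and the
## correlation between the LM and ML sentiment measures.
sent_cols <- c("LM_pos", "LM_neg", "ML_uni_pos", "ML_uni_neg", "ML_bi_pos", "ML_bi_neg")
summary(meta_public[, sent_cols])
cor(meta_public[, sent_cols], use = "pairwise.complete.obs")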
###############################################################
## PART 3.
## Run regressions on the 2015-2019 data.
## This should be close to Table 2 in the paper.
###############################################################

###################### Define time stamps
time_year  <- as.numeric(gsub("^(\\d{4}).+", "\\1", meta_public$date_call))
time_month <- as.numeric(gsub("^(\\d{4})-(\\d{2})-(\\d{2})", "\\2", meta_public$date_call))
time_day   <- as.numeric(gsub("^(\\d{4})-(\\d{2})-(\\d{2})", "\\3", meta_public$date_call))
time_dg <- time_year + (time_month-1)/12 + time_day/365

## TRUE for calls before 2015 (the estimation sample); regressions below use the complement
limit_sample <- time_dg < 2015

## Run regressions
reg <- list()  ## To save regression output
reg[[1]] <- felm(return_kaggle ~ LM_pos + LM_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])
reg[[2]] <- felm(return_kaggle ~ ML_uni_pos + ML_uni_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])
reg[[3]] <- felm(return_kaggle ~ ML_bi_pos + ML_bi_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])
reg[[4]] <- felm(return_kaggle ~ LM_pos + LM_neg + ML_uni_pos + ML_uni_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])
reg[[5]] <- felm(return_kaggle ~ LM_pos + LM_neg + ML_bi_pos + ML_bi_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])

stargazer(reg, keep.stat=c("n", "adj.rsq"), digits = 3, digits.extra = 0,
          align = TRUE, no.space = TRUE, report = "vc*t", type = "text")

######################################################
## PART 4.
## ML scoring using the plain-English word lists created on 2006-2014 data.
######################################################
meta_public$ML_uni_pos <- slam::row_sums(dtm_unigrams[,colnames(dtm_unigrams) %in% ML_pos_unigrams_2014]) / meta_public$WC_tot
meta_public$ML_uni_neg <- slam::row_sums(dtm_unigrams[,colnames(dtm_unigrams) %in% ML_neg_unigrams_2014]) / meta_public$WC_tot
meta_public$ML_bi_pos  <- slam::row_sums(dtm_bigrams[,colnames(dtm_bigrams) %in% ML_pos_bigrams_2014]) / meta_public$WC_tot
meta_public$ML_bi_neg  <- slam::row_sums(dtm_bigrams[,colnames(dtm_bigrams) %in% ML_neg_bigrams_2014]) / meta_public$WC_tot

## Winsorizing/normalizing
meta_public$LM_pos <- winsorize_dg(meta_public$LM_pos)
meta_public$LM_neg <- winsorize_dg(meta_public$LM_neg)
meta_public$LM_pos <- norm(meta_public$LM_pos)
meta_public$LM_neg <- norm(meta_public$LM_neg)

meta_public$ML_uni_pos <- winsorize_dg(meta_public$ML_uni_pos)
meta_public$ML_uni_neg <- winsorize_dg(meta_public$ML_uni_neg)
meta_public$ML_uni_pos <- norm(meta_public$ML_uni_pos)
meta_public$ML_uni_neg <- norm(meta_public$ML_uni_neg)

meta_public$ML_bi_pos <- winsorize_dg(meta_public$ML_bi_pos)
meta_public$ML_bi_neg <- winsorize_dg(meta_public$ML_bi_neg)
meta_public$ML_bi_pos <- norm(meta_public$ML_bi_pos)
meta_public$ML_bi_neg <- norm(meta_public$ML_bi_neg)
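## Optional coverage check (a minimal sketch; assumes the *_2014 objects are character
## vectors of terms): how many dtm columns match each plain-English word list.
sapply(list(pos_uni = ML_pos_unigrams_2014, neg_uni = ML_neg_unigrams_2014),
       function(v) sum(colnames(dtm_unigrams) %in% v))
sapply(list(pos_bi = ML_pos_bigrams_2014, neg_bi = ML_neg_bigrams_2014),
       function(v) sum(colnames(dtm_bigrams) %in% v))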
###############################################################
## PART 5.
## Run regressions on the 2015-2019 data.
## This should be close to Panel A, Table 10 in the paper.
###############################################################
reg <- list()  ## To save regression output

## Table 1, Model 2
reg[[1]] <- felm(return_kaggle ~ LM_pos + LM_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])
reg[[2]] <- felm(return_kaggle ~ ML_uni_pos + ML_uni_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])
reg[[3]] <- felm(return_kaggle ~ ML_bi_pos + ML_bi_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])
reg[[4]] <- felm(return_kaggle ~ LM_pos + LM_neg + ML_uni_pos + ML_uni_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])
reg[[5]] <- felm(return_kaggle ~ LM_pos + LM_neg + ML_bi_pos + ML_bi_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])

stargazer(reg, keep.stat=c("n", "adj.rsq"), digits = 3, digits.extra = 0,
          align = TRUE, no.space = TRUE, report = "vc*t", type = "text")

######################################################
## PART 6.
## Estimating MNIR for unigrams.
######################################################
limit_sample_kaggle <- limit_sample & !is.na(meta_public$return_kaggle)

## Drop terms that never appear in the estimation sample
counts.words <- slam::col_sums(dtm_unigrams[limit_sample_kaggle,])
dtm_new <- dtm_unigrams[,counts.words>0]

time1 <- Sys.time()
nr.clusters <- 6  # number of clusters used for the MNIR implementation
cl <- makeCluster(nr.clusters)

## MNIR fit
fits <- dmr(cl, meta_public$return_kaggle[limit_sample_kaggle],
            dtm_new[limit_sample_kaggle,],
            bins=NULL, gamma=0, nlambda=10, verb=2)

## Extract MNIR coefficients (term loadings on the return covariate)
mnir.coef <- sort(coef(fits)[2,])

## End cluster
stopCluster(cl)
time2 <- Sys.time()
time2-time1  ## 15 minutes

## Since this takes more than a minute, save the output
file.out <- paste("fits.dtm.unigrams.main", ".RData", sep="")
save(fits, mnir.coef, file=file.out)

######################################################
## PART 7.
## Estimating MNIR for bigrams.
######################################################
counts.words <- slam::col_sums(dtm_bigrams[limit_sample_kaggle,])
dtm_new <- dtm_bigrams[,counts.words>0]

time1 <- Sys.time()
nr.clusters <- 6  # number of clusters used for the MNIR implementation
cl <- makeCluster(nr.clusters)

## MNIR fit
fits <- dmr(cl, meta_public$return_kaggle[limit_sample_kaggle],
            dtm_new[limit_sample_kaggle,],
            bins=NULL, gamma=0, nlambda=10, verb=2)

## Extract MNIR coefficients
mnir.coef <- sort(coef(fits)[2,])

## End cluster
stopCluster(cl)
time2 <- Sys.time()
time2-time1  ## 14 minutes

## Since this takes more than a minute, save the output
file.out <- paste("fits.dtm.bigrams.main", ".RData", sep="")
save(fits, mnir.coef, file=file.out)
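## Optional: peek at the terms with the largest MNIR loadings from the fit just saved.
## mnir.coef is sorted in ascending order above, so head() shows the most negative
## terms and tail() the most positive ones.
head(mnir.coef, 20)
tail(mnir.coef, 20)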
#####################################################################
## PART 8.
## Create sentiment scores from the MNIR fits and do the OOS estimation as before.
#####################################################################

## Unigrams
load('fits.dtm.unigrams.main.RData')
meta_public$ML_uni_pos <- slam::row_sums(dtm_unigrams[,colnames(dtm_unigrams) %in% names(mnir.coef)[mnir.coef>0]]) / meta_public$WC_tot
meta_public$ML_uni_neg <- slam::row_sums(dtm_unigrams[,colnames(dtm_unigrams) %in% names(mnir.coef)[mnir.coef<0]]) / meta_public$WC_tot

## Bigrams
load('fits.dtm.bigrams.main.RData')
meta_public$ML_bi_pos <- slam::row_sums(dtm_bigrams[,colnames(dtm_bigrams) %in% names(mnir.coef)[mnir.coef>0]]) / meta_public$WC_tot
meta_public$ML_bi_neg <- slam::row_sums(dtm_bigrams[,colnames(dtm_bigrams) %in% names(mnir.coef)[mnir.coef<0]]) / meta_public$WC_tot

## Normalize/winsorize
for(cn in c("ML_bi_pos","ML_bi_neg","ML_uni_pos","ML_uni_neg")) {
  meta_public[[cn]] <- norm(meta_public[[cn]])
  meta_public[[cn]] <- winsorize_dg(meta_public[[cn]])
}

#########################################################
## Main specification.
## Subset of Table 1 and Table 2.
#########################################################
reg <- list()  ## To save regression output

## Table 1, Model 2
reg[[1]] <- felm(return_kaggle ~ LM_pos + LM_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])
reg[[2]] <- felm(return_kaggle ~ ML_uni_pos + ML_uni_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])
reg[[3]] <- felm(return_kaggle ~ ML_bi_pos + ML_bi_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])
reg[[4]] <- felm(return_kaggle ~ LM_pos + LM_neg + ML_uni_pos + ML_uni_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])
reg[[5]] <- felm(return_kaggle ~ LM_pos + LM_neg + ML_bi_pos + ML_bi_neg | FF_49 + year_quarter | 0 | FF_49 + year_quarter,
                 data = meta_public[!limit_sample,])

stargazer(reg, keep.stat=c("n", "adj.rsq"), digits = 3, digits.extra = 0,
          align = TRUE, no.space = TRUE, report = "vc*t", type = "text")
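## Optional: write the same table to a LaTeX file instead of printing text, and record
## the session for reproducibility. The output file name below is just a placeholder.
stargazer(reg, keep.stat=c("n", "adj.rsq"), digits = 3, digits.extra = 0,
          align = TRUE, no.space = TRUE, report = "vc*t", type = "latex",
          out = "table_mnir_main.tex")
sessionInfo()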