## Tested on R version 4.1.2 (2021-11-01)

###########################################################
## Libraries. Install with install.packages("") if needed.
###########################################################

library(stringr)
library(slam)

##################################################################################
## Load relevant objects for PART 1.
## 
## unigrams_dictionary has three columns:
## 1. unigram
## 2. MNIR loading (fitted on earning calls 2006-2014).
## 3. frequency counts (across full corpora)
##
## bigrams_dictionary has three columns:
## 1. bigram
## 2. MNIR loading (fitted on earning calls 2006-2014).
## 3. frequency counts (across full corpora)
##
## LM_pos_words and LM_neg_words are the Loughran and McDonald positive
## and negative word lists (2017).
##
## ML_unigrams_pos and ML_unigrams_neg are the plain money English dictionaries
## for unigrams estimated over 2006-2019.
## ML_bigrams_pos and ML_bigrams_neg are the plain money English dictionaries
## for bigrams estimated over 2006-2019.
## These are the dictionaries provided in the Appendix of the paper.
##
## ML_unigrams_pos_2014 and ML_unigrams_neg_2014 are the plain money English dictionaries
## for unigrams estimated over 2006-2014.
## ML_bigrams_pos_2014 and ML_bigrams_neg_2014 are the plain money English dictionaries
## for bigrams estimated over 2006-2014.
## These are the dictionaries used for in Table 10 (OOS).
##################################################################################

load("Color.Part1.20211201.RData")
objects()

##################################################################################
## Simple comparisons of ML unigrams and LM dictionaries.
## Grab positive/negative/neutrals unigrams (main-spec, 2006-2014 fit, for Tables 1+).
## These calculations roughly correspond to Table 3 in the paper.
##################################################################################

ML_uni_pos_main_spec <- unigrams_dictionary[unigrams_dictionary[,2]>0,1]
ML_uni_neg_main_spec <- unigrams_dictionary[unigrams_dictionary[,2]<0,1]
ML_uni_neu_main_spec <- unigrams_dictionary[unigrams_dictionary[,2]==0,1]

flag_pos_ml_neg <- ML_uni_neg_main_spec %in% LM_pos_words
flag_neg_ml_neg <- ML_uni_neg_main_spec %in% LM_neg_words
flag_pos_ml_pos <- ML_uni_pos_main_spec %in% LM_pos_words
flag_neg_ml_pos <- ML_uni_pos_main_spec %in% LM_neg_words
flag_pos_ml_neu <- ML_uni_neu_main_spec %in% LM_pos_words
flag_neg_ml_neu <- ML_uni_neu_main_spec %in% LM_neg_words

length(ML_pos_unigrams)
length(LM_pos_words)
length(ML_neg_unigrams)
length(LM_neg_words)

## Numbers for Panel A of Table 3 in the paper.
sum(flag_pos_ml_pos)
sum(flag_neg_ml_pos)
sum(flag_pos_ml_neg)
sum(flag_neg_ml_neg)
sum(flag_pos_ml_neu)
sum(flag_neg_ml_neu)

##################################################################################
## Simple comparisons of ML bigrams and LM dictionaries
##################################################################################

## Grab positive/negative/neutrals bigrams (2006-2014 fit, Tables 1 and 2)

ML_bi_pos_main_spec <- bigrams_dictionary[bigrams_dictionary[,2]>0,1]
ML_bi_neg_main_spec <- bigrams_dictionary[bigrams_dictionary[,2]<0,1]
ML_bi_neu_main_spec <- bigrams_dictionary[bigrams_dictionary[,2]==0,1]

regex_lm_pos <- paste("\\b(", paste(LM_pos_words, collapse="|"), ")\\b", sep="")
regex_lm_neg <- paste("\\b(", paste(LM_neg_words, collapse="|"), ")\\b", sep="")

flag_pos_ml_pos <- grepl(regex_lm_pos, ML_bi_pos_main_spec)
flag_neg_ml_pos <- grepl(regex_lm_neg, ML_bi_pos_main_spec)
flag_pos_ml_neg <- grepl(regex_lm_pos, ML_bi_neg_main_spec)
flag_neg_ml_neg <- grepl(regex_lm_neg, ML_bi_neg_main_spec)

length(ML_bi_pos_main_spec)
length(ML_bi_neg_main_spec)
sum(flag_pos_ml_pos)
sum(flag_neg_ml_pos)
sum(flag_pos_ml_neg)
sum(flag_neg_ml_neg)
sum(flag_neg_ml_neg & flag_pos_ml_neg)
sum(flag_neg_ml_pos & flag_pos_ml_pos)

##################################################################################
## Simple comparisons of ML plain money English and LM dictionaries
##################################################################################

## Unigrams
flag_pos_ml_neg <- ML_neg_unigrams %in% LM_pos_words
flag_neg_ml_neg <- ML_neg_unigrams %in% LM_neg_words
flag_pos_ml_pos <- ML_pos_unigrams %in% LM_pos_words
flag_neg_ml_pos <- ML_pos_unigrams %in% LM_neg_words

length(ML_pos_unigrams)
length(LM_pos_words)
length(ML_neg_unigrams)
length(LM_neg_words)

sum(flag_neg_ml_pos)
sum(flag_pos_ml_pos)
sum(flag_neg_ml_neg)
sum(flag_pos_ml_neg)

## Bigrams
regex_lm_pos <- paste("\\b(", paste(LM_pos_words, collapse="|"), ")\\b", sep="")
regex_lm_neg <- paste("\\b(", paste(LM_neg_words, collapse="|"), ")\\b", sep="")

flag_pos_ml_neg <- grepl(regex_lm_pos, ML_neg_bigrams)
flag_neg_ml_neg <- grepl(regex_lm_neg, ML_neg_bigrams)
flag_pos_ml_pos <- grepl(regex_lm_pos, ML_pos_bigrams)
flag_neg_ml_pos <- grepl(regex_lm_neg, ML_pos_bigrams)

length(ML_pos_bigrams)
length(ML_neg_bigrams)
sum(flag_neg_ml_pos)
sum(flag_pos_ml_pos)
sum(flag_neg_ml_neg)
sum(flag_pos_ml_neg)

## Frequency counts
flag_lm_pos <- unigrams_dictionary[,1] %in% LM_pos_words 
flag_lm_neg <- unigrams_dictionary[,1] %in% LM_neg_words 
sum(unigrams_dictionary[flag_lm_pos,3])/sum(unigrams_dictionary[,3])
sum(unigrams_dictionary[flag_lm_neg,3])/sum(unigrams_dictionary[,3])

flag_ml_pos <- unigrams_dictionary[,1] %in% ML_pos_unigrams
flag_ml_neg <- unigrams_dictionary[,1] %in% ML_neg_unigrams
sum(unigrams_dictionary[flag_ml_pos,3])/sum(unigrams_dictionary[,3])
sum(unigrams_dictionary[flag_ml_neg,3])/sum(unigrams_dictionary[,3])

flag_ml_pos <- bigrams_dictionary[,1] %in% ML_pos_bigrams
flag_ml_neg <- bigrams_dictionary[,1] %in% ML_neg_bigrams
sum(bigrams_dictionary[flag_ml_pos,3])/sum(unigrams_dictionary[,3])
sum(bigrams_dictionary[flag_ml_neg,3])/sum(unigrams_dictionary[,3]) 

##################################################################################
## Disambiguation function, replicating Table 6 in the paper.
## I takes a single word as an input, with an optional variable
## restricting how much to print out in the output (30 is the default).
##
## The output, if the word is associated with a bigram in our 65K dtm, is
## a list with five items:
## 1. Number of bigrams, positive/neutral/negative.
## 2. Total frequency counts of all bigrams, percent positive/neutral/negative.
## 3. Top positive bigrams associated with that term (frequency ranked).
## 4. Top neutral bigrams associated with that term (frequency ranked).
## 5. Top negative bigrams associated with that term (frequency ranked).
##################################################################################

bigram_colors <- function(word, top_no = 30){
    regex_word <- paste("\\b", word, "\\b", sep="")
    bigram_out <- "No bigrams associated with this name in top 65K."
    regex_hit_2 <- grepl(regex_word, as.character(unlist(bigrams_dictionary[,1])))
    uni_temp_count <- unigrams_dictionary[word==unigrams_dictionary[,1],3]
    if(sum(regex_hit_2)>0){
        bigram_temp <- bigrams_dictionary[regex_hit_2,]
        bigram_temp[,4] <- bigram_temp[,3]/uni_temp_count

        no_bigrams <- dim(bigram_temp)[1]
        no_bigrams_pos <- sum(bigram_temp[,2]>0)
        no_bigrams_neu <- sum(bigram_temp[,2]==0)
        no_bigrams_neg <- sum(bigram_temp[,2]<0)
        sum_bigrams <- sum(bigram_temp[,3])
        sum_bigrams_pos <- sum(bigram_temp[bigram_temp[,2]>0,3])/sum_bigrams
        sum_bigrams_neu <- sum(bigram_temp[bigram_temp[,2]==0,3])/sum_bigrams
        sum_bigrams_neg <- sum(bigram_temp[bigram_temp[,2]<0,3])/sum_bigrams
        
        bigrams_pos <- bigram_temp[bigram_temp[,2]>0,c(1,3,4)]
        bigrams_neu <- bigram_temp[bigram_temp[,2]==0,c(1,3,4)]
        bigrams_neg <- bigram_temp[bigram_temp[,2]<0,c(1,3,4)]
        bigrams_pos <- bigrams_pos[order(bigrams_pos[,2], decreasing=TRUE),]
        bigrams_neu <- bigrams_neu[order(bigrams_neu[,2], decreasing=TRUE),]
        bigrams_neg <- bigrams_neg[order(bigrams_neg[,2], decreasing=TRUE),]

        if(dim(bigrams_pos)[1]>top_no){ bigrams_pos <- bigrams_pos[1:top_no,]}
        if(dim(bigrams_neu)[1]>top_no){ bigrams_neu <- bigrams_neu[1:top_no,]}
        if(dim(bigrams_neg)[1]>top_no){ bigrams_neg <- bigrams_neg[1:top_no,]}
                
        no_bigrams_all <- c(no_bigrams, no_bigrams_pos, no_bigrams_neu, no_bigrams_neg)
        sum_bigrams_all <- c(sum_bigrams, sum_bigrams_pos, sum_bigrams_neu, sum_bigrams_neg)
        bigram_out <- list(no_bigrams_all, sum_bigrams_all, bigrams_pos, bigrams_neu, bigrams_neg)
    }
    return(bigram_out)
}

#######################

bigram_colors("demand")
bigram_colors("solid")
bigram_colors("softer")
bigram_colors("not")
bigram_colors("dead")
bigram_colors("leverage")
bigram_colors("sheet")
bigram_colors("cash")
bigram_colors("confident")