Homework 3 Key

PS 452: Text as Data

Fall 2014

Problem 1

Creating term-document matrices for unigrams and trigrams with Python.

### HW3_Key, Text as Data 2014
### FJZ
import os, re, nltk, urllib2, csv

## Build a nested dictionary mapping each press-release filename to its
## metadata (day, month, year, author) and raw text.
wd = "/Users/franceszlotnick/Dropbox/Text/tad14/HW3/GrimmerSenatePressReleases-master/raw/"


os.chdir(wd)

folders = [wd + "Sessions", wd + "Shelby"]

pressReleases = {}

for folder in folders:
    for fname in os.listdir(folder):
        # Filenames encode the date and author, e.g. 01Jan2007Sessions12.txt:
        # chars 0-1 = day, 2-4 = month, 5-8 = year, rest = author + doc number.
        with open(folder + "/" + fname, "r") as fh:
            body = fh.read()
        pressReleases[fname] = {
            "day": fname[:2],
            "month": fname[2:5],
            "year": fname[5:9],
            "author": re.sub("[0-9]+.txt", "", fname[9:]),
            "text": body,
        }

# Download the SMART stopword list, add corpus-specific words (the senators'
# names and press-release boilerplate), and Porter-stem everything so the
# stopwords match the stemmed document tokens. Stored as a set for O(1) lookup.
porter = nltk.stem.PorterStemmer()

rawStops = urllib2.urlopen("http://jmlr.org/papers/volume5/lewis04a/a11-smart-stop-list/english.stop").readlines()
extras = {"shelby", "sessions", "richard", "jeff", "email", "press", "room", "member", "senate"}
stopwords = {line.strip("\n") for line in rawStops} | extras
stopwords = {porter.stem(w) for w in stopwords}




#To count unigrams and trigrams, build per-document Counters and fold them
#into corpus-wide dictionaries mapping each n-gram to its total count.
#Using collections.Counter replaces the original pattern of calling
#tokens.count(i) once per distinct token, which was accidentally O(n^2)
#per document; Counter tallies everything in a single O(n) pass.
from collections import Counter

unigrams = {}
trigrams = {}

for pr in pressReleases:
    # Normalize: strip non-word characters, lowercase, tokenize, stem,
    # then drop (already-stemmed) stopwords.
    txt = re.sub("\W", " ", pressReleases[pr]["text"]).lower()
    tokens = [porter.stem(w) for w in nltk.word_tokenize(txt)]
    tokens = [t for t in tokens if t not in stopwords]

    pressReleases[pr]["numUnigrams"] = len(tokens)
    # Per-document unigram frequencies, stored as a plain dict as before.
    uniCounts = Counter(tokens)
    pressReleases[pr]["doc_unigrams"] = dict(uniCounts)
    # Fold this document's counts into the corpus-wide totals.
    for gram, count in uniCounts.items():
        unigrams[gram] = unigrams.get(gram, 0) + count

    # Same treatment for trigrams (tuples of three consecutive tokens).
    trigList = list(nltk.trigrams(tokens))
    pressReleases[pr]["numTrigrams"] = len(trigList)
    triCounts = Counter(trigList)
    pressReleases[pr]["doc_trigrams"] = dict(triCounts)
    for gram, count in triCounts.items():
        trigrams[gram] = trigrams.get(gram, 0) + count

#Rank n-grams by corpus-wide frequency (descending; ties keep dictionary
#order since Python's sort is stable) and keep the 1000 most common
#unigrams and the 500 most common trigrams as TDM columns.


rankedUni = sorted(unigrams.items(), key=lambda kv: kv[1], reverse=True)
topUnigrams = [gram for gram, _ in rankedUni[:1000]]
rankedTri = sorted(trigrams.items(), key=lambda kv: kv[1], reverse=True)
topTrigrams = [gram for gram, _ in rankedTri[:500]]


#Write the unigram term-document matrix: one row per press release with
#the filename, the author id, and the count of each top unigram in that
#document. To compute rates later, append an "allUnigrams" column to the
#header and write numUnigrams at the end of each row.
headerUni =  ["file"]+ ["id"] + topUnigrams

os.chdir("/Users/franceszlotnick/Dropbox/TextAsData/ProblemSets/PS3/")

with open("unigrams.csv", "w") as csvfile:
    writer = csv.writer(csvfile, delimiter=",")
    writer.writerow(headerUni)
    for fname in pressReleases:
        docCounts = pressReleases[fname]["doc_unigrams"]
        row = [fname, pressReleases[fname]["author"]]
        # Zero-fill columns for top unigrams absent from this document.
        row.extend(str(docCounts.get(gram, 0)) for gram in topUnigrams)
        writer.writerow(row)


#write trigrams file

#define function that takes in a tuple and converts it to a dot-delimited string
def tupToString(tup):
    """Join a tuple of strings into a single dot-delimited string."""
    return ".".join(part for part in tup)


# Column labels for trigrams: each tuple becomes a dot-delimited string.
stringifiedTris = [tupToString(t) for t in topTrigrams]

#Header: filename, author id, then one column per top trigram. To compute
#rates later, append an "allTrigrams" column and write numTrigrams per row.
headerTri = ["file"] + ["id"] + stringifiedTris

os.chdir("/Users/franceszlotnick/Dropbox/TextAsData/ProblemSets/PS3/")

with open("trigrams.csv", "w") as csvfile:
    writer = csv.writer(csvfile, delimiter=",")
    writer.writerow(headerTri)
    for fname in pressReleases:
        docCounts = pressReleases[fname]["doc_trigrams"]
        row = [fname, pressReleases[fname]["author"]]
        # Zero-fill columns for top trigrams absent from this document.
        row.extend(str(docCounts.get(tri, 0)) for tri in topTrigrams)
        writer.writerow(row)

Applying Word Separating Algorithms

Some of our methods require word rates rather than raw frequencies, so we need to do some additional processing to convert our term-document matrices into rates.

library(foreign)
setwd("/Users/franceszlotnick/Dropbox/TextAsData/ProblemSets/PS3/")

# Read the term-document matrices written by the Python script.
uni_raw <- read.csv("unigrams.csv")
tri_raw <- read.csv("trigrams.csv")

# Drop the filename and id columns so only the term counts remain.
uni <- subset(uni_raw[, -c(1, 2)])
tri <- subset(tri_raw[, -c(1, 2)])

# Convert each document's unigram counts to rates (count / document total).
for (doc in seq_len(nrow(uni))) {
    uni[doc, ] <- uni[doc, ] / sum(uni[doc, ])
}

# A much faster vectorized alternative is
#   t(apply(uni, 1, function(x) x / sum(x)))
# but it returns a matrix rather than a data frame, which needs a little
# more work to use downstream.

# Reattach the filename and author id columns.
uni <- cbind(uni_raw$file, uni_raw$id, uni)
colnames(uni)[1:2] <- c("file", "id")

#same for trigrams: convert each row of counts to rates.
#BUG FIX: the original loop iterated over j but indexed with i (left over
#from the unigram loop), so only the single row i was ever normalized.
for (j in 1:nrow(tri)) {
    s <- sum(tri[j, ])
    tri[j, ] <- tri[j, ] / s
}
tri <- cbind(tri_raw$file, tri_raw$id, tri)
colnames(tri)[1:2] <- c("file", "id")

#we will need these for our discriminant methods
#number of Sessions and Shelby docs (table() orders its cells by level,
#so Sessions comes first, then Shelby)
sessionsN <- table(uni$id)[1]
shelbyN <- table(uni$id)[2]

#unigrams: per-term mean and variance of rates within each author
muShelbyU <- colMeans(uni[uni$id == "Shelby", -c(1, 2)], na.rm = T)
muSessionsU <- colMeans(uni[uni$id == "Sessions", -c(1, 2)], na.rm = T)
varShelbyU <- apply(uni[uni$id == "Shelby", -c(1, 2)], 2, var)
varSessionsU <- apply(uni[uni$id == "Sessions", -c(1, 2)], 2, var)

#trigrams
muShelbyT <- colMeans(tri[tri$id == "Shelby", -c(1, 2)], na.rm = T)
muSessionsT <- colMeans(tri[tri$id == "Sessions", -c(1, 2)], na.rm = T)
varShelbyT <- apply(tri[tri$id == "Shelby", -c(1, 2)], 2, var)
#BUG FIX: the original subset selected rows of tri using uni$id; the
#trigram matrix must be subset by its own id column.
varSessionsT <- apply(tri[tri$id == "Sessions", -c(1, 2)], 2, var)
  1. Mosteller and Wallace LDA
# Mosteller-and-Wallace-style linear discriminant weights: the difference
# in mean rates scaled by the summed within-group variances. Positive
# weights point toward Shelby, negative toward Sessions.
ldWeightsU <- (muShelbyU - muSessionsU) / (varShelbyU + varSessionsU)
ldWeightsT <- (muShelbyT - muSessionsT) / (varShelbyT + varSessionsT)

# Small weights could be trimmed to zero (e.g. |w| < 0.025), but that is
# not necessary here.

# Sort ascending so head() gives the most Sessions-discriminating terms
# and tail() the most Shelby-discriminating ones.
ldWeightsU <- sort(ldWeightsU)
ldWeightsT <- sort(ldWeightsT)

# 10 most discriminating terms for each author (20 total per model).
LDmostSessionsU <- head(ldWeightsU, 10)
LDmostShelbyU <- tail(ldWeightsU, 10)
LDmostSessionsT <- head(ldWeightsT, 10)
LDmostShelbyT <- tail(ldWeightsT, 10)

# Combined 20-term vectors used for plotting below.
LDdiscrimU <- c(LDmostSessionsU, LDmostShelbyU)
LDdiscrimT <- c(LDmostSessionsT, LDmostShelbyT)
  2. Standardized mean difference
#unigrams: standardized difference of mean rates (two-sample t-like score)
numU <- muShelbyU - muSessionsU
denomU <- sqrt((varShelbyU / shelbyN) + (varSessionsU / sessionsN))
stdDiffU <- numU / denomU

#trigrams
numT <- muShelbyT - muSessionsT
#BUG FIX: the original divided the Sessions variance by shelbyN; each
#author's variance must be scaled by that author's own document count.
denomT <- sqrt((varShelbyT / shelbyN) + (varSessionsT / sessionsN))
stdDiffT <- numT / denomT

#drop degenerate -Inf scores before ranking
sdWeightsU <- sort(stdDiffU[stdDiffU != "-Inf"])
sdWeightsT <- sort(stdDiffT[stdDiffT != "-Inf"])

SDmostSessionsU <- head(sdWeightsU, 10)
SDmostShelbyU <- tail(sdWeightsU, 10)
SDmostSessionsT <- head(sdWeightsT, 10)
SDmostShelbyT <- tail(sdWeightsT, 10)

SDdiscrimU <- c(SDmostSessionsU, SDmostShelbyU)
SDdiscrimT <- c(SDmostSessionsT, SDmostShelbyT)
  3. Standardized Log Odds
#Standardized log odds with +1 (Laplace) smoothing.
#BUG FIX: the smoothing denominator should add the vocabulary size, which
#is ncol(x) - 2 because the first TWO columns (file, id) are not terms;
#the original used ncol(x) - 1, an off-by-one.
#unigrams
shelbyCountsU <- colSums(uni_raw[uni_raw$id == "Shelby", -c(1, 2)])
sessionsCountsU <- colSums(uni_raw[uni_raw$id == "Sessions", -c(1, 2)])
totShelbyU <- sum(shelbyCountsU)
totSessionsU <- sum(sessionsCountsU)
vocabU <- ncol(uni_raw) - 2

piShelbyU <- (shelbyCountsU + 1) / (totShelbyU + vocabU)
piSessionsU <- (sessionsCountsU + 1) / (totSessionsU + vocabU)

logOddsRatioU <- log(piShelbyU / (1 - piShelbyU)) - log(piSessionsU / (1 - piSessionsU))
varLogOddsU <- (1 / (shelbyCountsU + 1)) + (1 / (sessionsCountsU + 1))

stdLogOddsU <- logOddsRatioU / sqrt(varLogOddsU)


#trigrams
shelbyCountsT <- colSums(tri_raw[tri_raw$id == "Shelby", -c(1, 2)])
sessionsCountsT <- colSums(tri_raw[tri_raw$id == "Sessions", -c(1, 2)])
totShelbyT <- sum(shelbyCountsT)
totSessionsT <- sum(sessionsCountsT)
vocabT <- ncol(tri_raw) - 2

piShelbyT <- (shelbyCountsT + 1) / (totShelbyT + vocabT)
piSessionsT <- (sessionsCountsT + 1) / (totSessionsT + vocabT)

logOddsRatioT <- log(piShelbyT / (1 - piShelbyT)) - log(piSessionsT / (1 - piSessionsT))
varLogOddsT <- (1 / (shelbyCountsT + 1)) + (1 / (sessionsCountsT + 1))

stdLogOddsT <- logOddsRatioT / sqrt(varLogOddsT)

#sort ascending: head() = most Sessions-discriminating, tail() = most Shelby
sloWeightsU <- sort(stdLogOddsU)
sloWeightsT <- sort(stdLogOddsT)

sloSessionsMostU <- head(sloWeightsU, 10)
sloShelbyMostU <- tail(sloWeightsU, 10)
sloSessionsMostT <- head(sloWeightsT, 10)
sloShelbyMostT <- tail(sloWeightsT, 10)

sloDiscrimU <- c(sloSessionsMostU, sloShelbyMostU)
sloDiscrimT <- c(sloSessionsMostT, sloShelbyMostT)

Unigram Plots

Plotting words with the largest (absolute value) weights.

# y positions 1..20 for the 20 most-discriminating terms in each plot
index<- seq(1, 20, 1)

# LDA weights on the x axis; suppress points/axis and label each position with its term
plot(LDdiscrimU, index, pch="", xlab="weight", ylab="", yaxt="n", main="Most discriminating words\n Linear Discriminant Analysis")
text(LDdiscrimU, index , label=names(LDdiscrimU), cex=.7)

plot of chunk unnamed-chunk-6

#axis(2, at=c(2, 18), labels=c("Sessions", "Shelby"), tcl=0, las=1, cex.axis=.9)

Another way of visualizing this that better displays the relative weights, but makes it more difficult to read the words.

# Alternative view: weight on the y axis, term order on the (hidden) x axis
plot(LDdiscrimU, pch="", xaxt="n", xlab="", ylab="Weight", main="Most discriminating words\n Linear Discriminant Analysis")
text(LDdiscrimU, label=names(LDdiscrimU), cex=.7)

plot of chunk unnamed-chunk-7

Back to our regular plot style.

# Standardized-mean-difference weights, same layout as the LDA plot
plot(SDdiscrimU, index, pch="", xlab="weight", ylab="", yaxt="n", main="Most discriminating words\n Standard Mean Difference")
text(SDdiscrimU, index, label=names(SDdiscrimU), cex=.7)

plot of chunk unnamed-chunk-8

# Standardized log-odds weights, same layout as the plots above
plot(sloDiscrimU, index, pch="", xlab="weight", ylab="", yaxt="n", main="Most discriminating words\n Standardized Log Odds Ratio")
text(sloDiscrimU, index, label=names(sloDiscrimU), cex=.7)

plot of chunk unnamed-chunk-8

Trigram plots

# Trigram LDA weights; xlim widened so the long trigram labels fit
plot(LDdiscrimT, index, pch="", xlab="weight", xlim=c(-400,1400), ylab="", yaxt="n", main="Most discriminating words\n Linear Discriminant Analysis")
text(LDdiscrimT, index, label=names(LDdiscrimT), cex=.7)

plot of chunk unnamed-chunk-9

# Trigram standardized-mean-difference weights; xlim widened for the labels
plot(SDdiscrimT, index, pch="", xlab="weight", xlim=c(-1000,1500), ylab="", yaxt="n", main="Most discriminating words\n Standard Mean Difference")
text(SDdiscrimT, index, label=names(SDdiscrimT), cex=.7)