Homework 3 Key

PS 452: Text as Data

Fall 2014

Problem 1

Creating term-document matrices for unigrams and trigrams with Python.

### HW3_Key, Text as Data 2014
### FJZ
import os, re, nltk, urllib2, csv

##make nested dictionary of press releases and metadata
wd = "/Users/franceszlotnick/Dropbox/Text/tad14/HW3/GrimmerSenatePressReleases-master/raw/"


os.chdir(wd)

folders = [wd + "Sessions", wd + "Shelby"]

pressReleases = {}

for folder in folders:
    files = os.listdir(folder)
    for pr in files:
        pressReleases[pr] = {}
        pressReleases[pr]["day"] = pr[:2]
        pressReleases[pr]["month"] = pr[2:5]
        pressReleases[pr]["year"] = pr[5:9]
        pressReleases[pr]["author"] = re.sub(r"[0-9]+\.txt", "", pr[9:])
        with open(folder + "/" + pr, "r") as fileIn:
            pressReleases[pr]["text"] = fileIn.read()

#get stopwords, store as set for quick lookup. apply porter stemmer
porter = nltk.stem.PorterStemmer()

stopwords = urllib2.urlopen("http://jmlr.org/papers/volume5/lewis04a/a11-smart-stop-list/english.stop").readlines()
stopwords = {x.strip("\n") for x in stopwords}
toAdd = {"shelby", "sessions", "richard", "jeff", "email", "press", "room", "member", "senate"}
stopwords.update(toAdd) 
stopwords = {porter.stem(w) for w in stopwords}




#to count unigrams and trigrams, create dictionaries linking the n-grams to their counts.
#For each unigram and each trigram in each doc, check whether it already exists in the dictionary.
#If not, add it with its count in that document; if it does, add that document's count to the running total


unigrams = {}
trigrams = {}

for pr in pressReleases:
    txt = pressReleases[pr]["text"]
    txt = re.sub(r"\W", " ", txt)
    txt = txt.lower()
    tokens = nltk.word_tokenize(txt)
    tokens = [porter.stem(w) for w in tokens]
    tokens = [x for x in tokens if x not in stopwords]
    pressReleases[pr]["numUnigrams"] = len(tokens)
    #add another layer of dictionary to record the frequencies of unigrams in each document
    pressReleases[pr]["doc_unigrams"] = {}
    for i in set(tokens):
        count = tokens.count(i)
        #for each unigram, add a count to the new inner dictionary
        pressReleases[pr]["doc_unigrams"][i] = count
        #add to overall unigrams dictionary, with count
        if i in unigrams:
            unigrams[i] += count
        else:
            unigrams[i] = count
    #now to deal with trigrams
    trigList = list(nltk.trigrams(tokens))
    pressReleases[pr]["numTrigrams"] = len(trigList)
    pressReleases[pr]["doc_trigrams"] = {}
    for j in set(trigList):
        count = trigList.count(j)
        pressReleases[pr]["doc_trigrams"][j] = count
        if j in trigrams:
            trigrams[j] += count
        else:
            trigrams[j] = count 

#To sort by values:
#sorted(a, key=a.get, reverse=True)
#for i in topUnigrams:
#   print i, unigrams[i]


topUnigrams = sorted(unigrams, key=unigrams.get, reverse=True)[:1000]
topTrigrams = sorted(trigrams, key=trigrams.get, reverse=True)[:500]


#write unigrams file
#if you want to calculate your rates out of total words, add a column for this
#headerUni = ["file"] + ["id"] + topUnigrams + ["allUnigrams"] 
headerUni =  ["file"]+ ["id"] + topUnigrams

#os.chdir("/Users/Frannie/Dropbox/TextAsData/ProblemSets/PS3/")
os.chdir("/Users/franceszlotnick/Dropbox/TextAsData/ProblemSets/PS3/")

with open("unigrams.csv", "w") as csvfile:
    writer = csv.writer(csvfile, delimiter= ",")
    writer.writerow(headerUni)
    for i in pressReleases:
        toWrite = []
        toWrite.append(i)
        toWrite.append(pressReleases[i]["author"])
        for j in topUnigrams:
            if j in pressReleases[i]["doc_unigrams"]:
                toWrite.append(str(pressReleases[i]["doc_unigrams"][j]))
            else:
                toWrite.append(str(0))
        #if you want to calculate your rates out of total words, you can add this to your tdm
        #toWrite.append(str(pressReleases[i]["numUnigrams"]))
        writer.writerow(toWrite)


#write trigrams file

#define function that takes a tuple and converts it to a dot-delimited string
def tupToString(tup):
    return ".".join(tup)


stringifiedTris = [tupToString(x) for x in topTrigrams]

#if you want to calculate your rates out of total trigrams, add a column for this
#headerTri = ["file"] + ["id"] + stringifiedTris + ["allTrigrams"]
headerTri = ["file"] + ["id"] + stringifiedTris

os.chdir("/Users/franceszlotnick/Dropbox/TextAsData/ProblemSets/PS3/")

with open("trigrams.csv", "w") as csvfile:
    writer = csv.writer(csvfile, delimiter= ",")
    writer.writerow(headerTri)
    for i in pressReleases:
        toWrite = []
        toWrite.append(i)
        toWrite.append(pressReleases[i]["author"])
        for j in topTrigrams:
            if j in pressReleases[i]["doc_trigrams"]:
                toWrite.append(str(pressReleases[i]["doc_trigrams"][j]))
            else:
                toWrite.append(str(0))
        #if you want to calculate your rates out of total words, you can add this to your tdm
        #toWrite.append(str(pressReleases[i]["numTrigrams"]))
        writer.writerow(toWrite)
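
As an aside, the per-document counting loops above could also be sketched with collections.Counter; the snippet below is only an illustrative alternative (with a made-up token list), not the code used to produce the CSVs.

from collections import Counter
import nltk

#hypothetical stand-in for one document's stemmed, stopword-filtered tokens
tokens = ["tax", "cut", "bill", "tax", "cut"]

uniCounts = Counter(tokens)                 #unigram counts for this document
triCounts = Counter(nltk.trigrams(tokens))  #trigram counts, keyed by tuple

#Counter objects add element-wise, so per-document counts can be accumulated
#into corpus-wide totals with +, replacing the if/else bookkeeping above.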

Applying Word Separating Algorithms

Some of our methods require word rates rather than raw frequencies, so we need to do some additional processing to convert our TDM from counts into rates.
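
As implemented in the loops below, each row of the TDM is divided by its row sum over the retained n-grams:

\[ r_{dw} = \frac{c_{dw}}{\sum_{w'} c_{dw'}} \]

where \( c_{dw} \) is the count of n-gram \( w \) in document \( d \) and the sum runs over the top unigrams (or trigrams) kept in the matrix.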

library(foreign)
setwd("/Users/franceszlotnick/Dropbox/TextAsData/ProblemSets/PS3/")
uni_raw<- read.csv("unigrams.csv")
tri_raw<- read.csv("trigrams.csv")

#convert frequencies to rates
#take off id label and filenames
uni<- uni_raw[,-c(1,2)]
tri<- tri_raw[,-c(1,2)]

#calculate rates. 
for (i in 1:nrow(uni)){
    s<- sum(uni[i,])
    uni[i,]<- uni[i,]/s
}


#much faster, but returns a different type of data structure which requires a little more work to use
#norm<- function(x){
#    x<- x/sum(x)
#    return(x)
#}

#m<- t(apply(uni,1, function(x) norm(x)))

#add id back on
uni<-cbind(uni_raw$file, uni_raw$id, uni)
colnames(uni)[1:2]<- c("file", "id")

#same for trigrams
for (j in 1:nrow(tri)){
    s<- sum(tri[j,])
    tri[j,]<- tri[j,]/s
}
tri<-cbind(tri_raw$file, tri_raw$id, tri)
colnames(tri)[1:2]<- c("file","id")

#we will need these for our discriminant methods
#number of sessions and shelby docs
sessionsN<-table(uni$id)[1]
shelbyN<-table(uni$id)[2]

#unigrams
muShelbyU <- colMeans(uni[uni$id=="Shelby",-c(1,2)], na.rm=T)
muSessionsU<- colMeans(uni[uni$id=="Sessions",-c(1,2)], na.rm=T)
varShelbyU<- apply(uni[uni$id=="Shelby", -c(1,2)], 2, var)
varSessionsU<- apply(uni[uni$id=="Sessions", -c(1,2)], 2, var)




#trigrams
muShelbyT <- colMeans(tri[tri$id=="Shelby",-c(1,2)], na.rm=T)
muSessionsT<- colMeans(tri[tri$id=="Sessions",-c(1,2)], na.rm=T)
varShelbyT<- apply(tri[tri$id=="Shelby", -c(1,2)], 2, var)
varSessionsT<- apply(tri[tri$id=="Sessions", -c(1,2)], 2, var)

1. Mosteller and Wallace LDA
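
As computed in the code below, the Mosteller and Wallace linear discriminant weight for each n-gram \( j \) is the difference in mean rates divided by the sum of the within-author variances:

\[ w_j = \frac{\bar{x}_j^{\,Shelby} - \bar{x}_j^{\,Sessions}}{s_j^{2,\,Shelby} + s_j^{2,\,Sessions}} \]
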
#unigrams
ldWeightsU<- (muShelbyU - muSessionsU) / (varShelbyU + varSessionsU)

#trigrams
ldWeightsT<- (muShelbyT - muSessionsT) / (varShelbyT + varSessionsT)

#trim weights - not necessary here
#ldWeightsU[abs(ldWeightsU)< 0.025]<- 0
#ldWeightsT[abs(ldWeightsT)< 0.025]<- 0

#if any weights are -Inf, drop them first, e.g. ldWeightsT<- ldWeightsT[ldWeightsT!="-Inf"]
ldWeightsU<- sort(ldWeightsU)
ldWeightsT<- sort(ldWeightsT)

#get 20 most discriminating words
LDmostSessionsU<- head(ldWeightsU, 10)
LDmostShelbyU<- tail(ldWeightsU, 10)
LDmostSessionsT<- head(ldWeightsT, 10) 
LDmostShelbyT<- tail(ldWeightsT, 10)


#plot
LDdiscrimU<-c(LDmostSessionsU, LDmostShelbyU)
LDdiscrimT<- c(LDmostSessionsT, LDmostShelbyT)

2. Standardized mean difference
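
The standardized mean difference computed below scales the difference in mean rates by its standard error, using the document counts \( n_{Shelby} \) and \( n_{Sessions} \):

\[ t_j = \frac{\bar{x}_j^{\,Shelby} - \bar{x}_j^{\,Sessions}}{\sqrt{s_j^{2,\,Shelby}/n_{Shelby} + s_j^{2,\,Sessions}/n_{Sessions}}} \]
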
#unigrams
numU<- muShelbyU - muSessionsU
denomU<- sqrt((varShelbyU/shelbyN) + (varSessionsU/sessionsN))
stdDiffU<-numU / denomU

#trigrams
numT<- muShelbyT - muSessionsT
denomT<- sqrt((varShelbyT/shelbyN) + (varSessionsT/sessionsN))
stdDiffT<- numT/denomT

sdWeightsU<- sort(stdDiffU[stdDiffU!="-Inf"])
sdWeightsT<- sort(stdDiffT[stdDiffT!="-Inf"])

SDmostSessionsU<- head(sdWeightsU,10)
SDmostShelbyU<- tail(sdWeightsU, 10)
SDmostSessionsT<- head(sdWeightsT, 10)
SDmostShelbyT<- tail(sdWeightsT, 10)

SDdiscrimU<- c(SDmostSessionsU, SDmostShelbyU)
SDdiscrimT<- c(SDmostSessionsT, SDmostShelbyT)

3. Standardized Log Odds
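
With smoothed proportions \( \hat{\pi}_j^{k} = (y_j^{k} + 1) / (n^{k} + V) \), where \( y_j^{k} \) is author \( k \)'s count of n-gram \( j \), \( n^{k} \) is that author's total count, and \( V \) is the number of retained n-grams, the code below computes

\[ \delta_j = \log\frac{\hat{\pi}_j^{\,Shelby}}{1 - \hat{\pi}_j^{\,Shelby}} - \log\frac{\hat{\pi}_j^{\,Sessions}}{1 - \hat{\pi}_j^{\,Sessions}}, \qquad \widehat{\mathrm{Var}}(\delta_j) \approx \frac{1}{y_j^{\,Shelby} + 1} + \frac{1}{y_j^{\,Sessions} + 1} \]

and reports the standardized log odds \( \delta_j / \sqrt{\widehat{\mathrm{Var}}(\delta_j)} \).
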
#unigrams
totShelbyU<- sum(colSums(uni_raw[uni_raw$id=="Shelby", -c(1,2)]))
piShelbyU<- (colSums(uni_raw[uni_raw$id=="Shelby", -c(1,2)]) + 1)/(totShelbyU + (ncol(uni_raw)-2))
totSessionsU<- sum(colSums(uni_raw[uni_raw$id=="Sessions", -c(1,2)])) 
piSessionsU<- (colSums(uni_raw[uni_raw$id=="Sessions", -c(1,2)]) + 1)/(totSessionsU + (ncol(uni_raw)-2))

logOddsRatioU<- log(piShelbyU/ (1- piShelbyU)) - log(piSessionsU/ (1-piSessionsU))
varLogOddsU<- (1/(colSums(uni_raw[uni_raw$id=="Shelby", -c(1,2)]) + 1)) + (1/(colSums(uni_raw[uni_raw$id=="Sessions", -c(1,2)]) + 1)) 

stdLogOddsU<- logOddsRatioU/ sqrt(varLogOddsU)


#trigrams
totShelbyT<- sum(colSums(tri_raw[tri_raw$id=="Shelby", -c(1,2)]))
piShelbyT<- (colSums(tri_raw[tri_raw$id=="Shelby", -c(1,2)]) + 1)/(totShelbyT + (ncol(tri_raw)-2))
totSessionsT<- sum(colSums(tri_raw[tri_raw$id=="Sessions", -c(1,2)])) 
piSessionsT<- (colSums(tri_raw[tri_raw$id=="Sessions", -c(1,2)]) + 1)/(totSessionsT + (ncol(tri_raw)-2))

logOddsRatioT<- log(piShelbyT/ (1- piShelbyT)) - log(piSessionsT/ (1-piSessionsT))
varLogOddsT<- (1/(colSums(tri_raw[tri_raw$id=="Shelby", -c(1,2)]) + 1)) + (1/(colSums(tri_raw[tri_raw$id=="Sessions", -c(1,2)]) + 1)) 

stdLogOddsT<- logOddsRatioT/ sqrt(varLogOddsT)

sloWeightsU<- sort(stdLogOddsU)
sloWeightsT<- sort(stdLogOddsT)

sloSessionsMostU<- head(sloWeightsU,10)
sloShelbyMostU<- tail(sloWeightsU, 10)
sloSessionsMostT<- head(sloWeightsT, 10)
sloShelbyMostT<- tail(sloWeightsT, 10)

sloDiscrimU<- c(sloSessionsMostU, sloShelbyMostU)
sloDiscrimT<- c(sloSessionsMostT, sloShelbyMostT)

Unigram Plots

Plotting words with the largest (absolute value) weights.

index<- seq(1, 20, 1)

plot(LDdiscrimU, index, pch="", xlab="weight", ylab="", yaxt="n", main="Most discriminating words\n Linear Discriminant Analysis")
text(LDdiscrimU, index , label=names(LDdiscrimU), cex=.7)

[Figure: Most discriminating unigrams, Linear Discriminant Analysis]

#axis(2, at=c(2, 18), labels=c("Sessions", "Shelby"), tcl=0, las=1, cex.axis=.9)

Here is another way of visualizing this that better displays the relative weights, though it makes the words more difficult to read.

plot(LDdiscrimU, pch="", xaxt="n", xlab="", ylab="Weight", main="Most discriminating words\n Linear Discriminant Analysis")
text(LDdiscrimU, label=names(LDdiscrimU), cex=.7)

[Figure: Most discriminating unigrams, Linear Discriminant Analysis, weights on the vertical axis]

Back to our regular plot style.

plot(SDdiscrimU, index, pch="", xlab="weight", ylab="", yaxt="n", main="Most discriminating words\n Standard Mean Difference")
text(SDdiscrimU, index, label=names(SDdiscrimU), cex=.7)

[Figure: Most discriminating unigrams, Standardized Mean Difference]

plot(sloDiscrimU, index, pch="", xlab="weight", ylab="", yaxt="n", main="Most discriminating words\n Standardized Log Odds Ratio")
text(sloDiscrimU, index, label=names(sloDiscrimU), cex=.7)

[Figure: Most discriminating unigrams, Standardized Log Odds Ratio]

Trigram Plots

plot(LDdiscrimT, index, pch="", xlab="weight", xlim=c(-400,1400), ylab="", yaxt="n", main="Most discriminating words\n Linear Discriminant Analysis")
text(LDdiscrimT, index, label=names(LDdiscrimT), cex=.7)

[Figure: Most discriminating trigrams, Linear Discriminant Analysis]

plot(SDdiscrimT, index, pch="", xlab="weight", xlim=c(-1000,1500), ylab="", yaxt="n", main="Most discriminating words\n Standard Mean Difference")
text(SDdiscrimT, index, label=names(SDdiscrimT), cex=.7)

[Figure: Most discriminating trigrams, Standardized Mean Difference]

plot(sloDiscrimT, index, pch="",xlab="weight",xlim=c(-20,10), ylab="", yaxt="n", main="Most discriminating words\n Standardized Log Odds Ratio")
text(sloDiscrimT, index, label=names(sloDiscrimT), cex=.7)

[Figure: Most discriminating trigrams, Standardized Log Odds Ratio]

Comparing Document Similarity

1. Select 100 documents from each senator’s collection.
shelby100<- sample(which(tri_raw$id=="Shelby"), 100, replace=F)
sessions100<- sample(which(tri_raw$id=="Sessions"), 100, replace=F)
distSample<- c(shelby100, sessions100)
ds<- tri_raw[distSample,]
ids<- ds[,1:2]
ids[,1]<- as.character(ids[,1])
ids[,2]<- as.character(ids[,2])

#now take off ids
ds<- as.matrix(ds[,-c(1,2)])

#need to do a little preprocessing: if a trigram doesn't appear in any of our selected documents, it will cause problems (e.g., an infinite idf weight), so drop those all-zero columns
ds<- ds[, colSums(ds) > 0]

2a. Euclidean Distance
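
The distance computed below between the trigram count vectors \( x \) and \( y \) of two documents is the usual Euclidean distance:

\[ d(x, y) = \sqrt{\sum_j (x_j - y_j)^2} \]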

ed<- matrix(nrow=200, ncol=200)

euclidean<- function(x, y){
    return(sqrt(sum((x-y)^2)))        
}

for (m in 1:nrow(ds)){
    for (n in 1:nrow(ds)){
        if (is.na(ed[m,n])==T){
            a<- euclidean(ds[m,], ds[n,])
            ed[m,n]<- a
            ed[n,m]<- a
        }
    }
}


#could also just do the upper triangle and then copy to the lower triangle
#ed[lower.tri(ed)]<- t(ed)[lower.tri(ed)]

m<-NULL
n<-NULL

#you could also do this
#ed2<- as.matrix(dist(ds, method="euclidean"))

write.csv(ed, "euclidean.csv")



#Many ties for the minimum: the diagonal entries are all 0, in addition to any "true" zero distances. You can use the following hacky method to get only the off-diagonal minimums
# a<-which(ed == min(ed), arr.ind=T)
# a<- as.data.frame(a)  #because you can't use $ on atomic vectors in R.
# a[which(a$row != a$col)]
#alternatively, just do this:
diag(ed)<- NA


#max distance
which(ed == max(ed, na.rm=T), arr.ind=T)
##      row col
## [1,]  93  26
## [2,]  26  93
#For the remainder of this document, the min/max lookups won't be executed, since they are idiosyncratic to whichever documents were randomly selected and not very informative
#which(ed == min(ed), arr.ind=T)

#to view files, do something like the following, where x and y are the rows of the docs you want to compare
#stored = "/Users/franceszlotnick/Dropbox/Text/tad14/HW3/GrimmerSenatePressReleases-master/raw/"
#file.show(paste(stored, ids[x,2],"/", ids[x,1], sep=""))
#file.show(paste(stored, ids[y,2],"/", ids[y,1], sep=""))

2b. Euclidean with tf-idf weights
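
The weighting used below multiplies each trigram count by its inverse document frequency over the 200 sampled documents, where \( \mathrm{df}_j \) is the number of sampled documents containing trigram \( j \):

\[ \mathrm{idf}_j = \log\frac{200}{\mathrm{df}_j}, \qquad \tilde{x}_{dj} = x_{dj} \cdot \mathrm{idf}_j \]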

edWithTfidf<-matrix(nrow=200, ncol=200)

calcIDF<- function(x){
    return(log(200/length(which(x>0))))
}

idf<- apply(ds, 2, calcIDF)

ds_idf<- as.matrix(t(apply(ds, 1, function(x) x*idf)))

edWithTfidf<- as.matrix(dist(ds_idf, method="euclidean"))
diag(edWithTfidf)<- NA

write.csv(edWithTfidf, "edTFIDF.csv")



#most similar
#which(edWithTfidf == min(edWithTfidf, na.rm=T), arr.ind=T)

#most different
#which(edWithTfidf == max(edWithTfidf, na.rm=T), arr.ind=T)

2c. Cosine Similarity
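
Cosine similarity, as implemented in the function below, is the inner product of two documents' count vectors normalized by their lengths:

\[ \cos(x, y) = \frac{\sum_j x_j y_j}{\sqrt{\sum_j x_j^2}\,\sqrt{\sum_j y_j^2}} \]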

cs<- matrix(nrow=200, ncol=200)
cosineSim<- function(x, y){
    return( sum(x*y)/ sqrt( sum(x^2)* sum(y^2)))
}

#this function is fast enough that we can just fill in the whole matrix without checking for already-computed entries
for (m in 1:nrow(ds)){
    for (n in 1:nrow(ds)){
        a<- cosineSim(ds[m,], ds[n,])
        cs[m, n]<- a
        cs[n, m]<- a
    }
}
m<-NULL
n<-NULL


#you could also use the cosine function from the lsa package
#cs2<- matrix(nrow=200, ncol=200)
#for (m in 1:nrow(ds)){
#    for (n in 1:nrow(ds)){
#        a<- cosine(ds[m,], ds[n,])
#        cs2[m, n]<- a
#        cs2[n, m]<- a
#    }
#}

diag(cs)<-NA
write.csv(cs, "cosineSim.csv")

#most similar
#which(cs == max(cs, na.rm=T), arr.ind=T)

#most dissimilar
#which(cs == min(cs, na.rm=T), arr.ind=T)

2d. Cosine similarity with tf-idf weights

csWithTfidf<- matrix(nrow=200, ncol=200)

for (m in 1:nrow(ds_idf)){
    for (n in 1:nrow(ds_idf)){
        a <- cosineSim(ds_idf[m,], ds_idf[n,])
        csWithTfidf[m, n]<- a
        csWithTfidf[n, m]<- a
    }
}

diag(csWithTfidf)<- NA

write.csv(csWithTfidf, "csTFIDF.csv")
#most similar
#which(csWithTfidf == max(csWithTfidf, na.rm=T), arr.ind=T)

#most dissimilar
#which(csWithTfidf == min(csWithTfidf, na.rm=T), arr.ind=T)

2e. Normalize word counts, apply Gaussian kernel
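
As implemented below, each row of counts is first normalized to rates, and the kernel is applied to the Euclidean distance between normalized rows with bandwidth \( \sigma = 100 \):

\[ K(x, y) = \exp\!\left( -\frac{\lVert x - y \rVert}{\sigma} \right) \]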

normed<-ds
for (i in 1:nrow(normed)){
    normed[i,]<- normed[i,]/sum(normed[i,])
}

#choose sigma
sigma = 100
gauss<- exp(-(as.matrix(dist(normed)))/sigma)

diag(gauss)<- NA

write.csv(gauss, "gaussian.csv")

#most similar
#which(gauss == max(gauss, na.rm=T), arr.ind=T)

#most different
#which(gauss == min(gauss, na.rm=T), arr.ind=T)

2f. Gaussian with tf-idf weights.

idf<- apply(normed, 2, calcIDF)
normed_idf<- as.matrix(t(apply(normed, 1, function(x) x*idf)))
gaussNorm<- exp(-(as.matrix(dist(normed_idf)))/sigma)
write.csv(gaussNorm, "gaussNorm.csv")

diag(gaussNorm)<- NA
#most similar
#which(gaussNorm == max(gaussNorm, na.rm=T), arr.ind=T)

#most different
#which(gaussNorm == min(gaussNorm, na.rm=T), arr.ind=T)