Creating term-document matrices for unigrams and trigrams with Python.
### HW3_Key, Text as Data 2014
### FJZ
import os, re, nltk, urllib2, csv
## Build a nested dictionary of press releases keyed by filename, with
## date/author metadata parsed from the name (format: DDMonYYYYAuthorNN.txt)
## and the raw document text.
wd = "/Users/franceszlotnick/Dropbox/Text/tad14/HW3/GrimmerSenatePressReleases-master/raw/"
os.chdir(wd)
folders = [wd + "Sessions", wd + "Shelby"]
pressReleases = {}
for folder in folders:
    files = os.listdir(folder)
    for pr in files:
        pressReleases[pr] = {}
        pressReleases[pr]["day"] = pr[:2]
        pressReleases[pr]["month"] = pr[2:5]
        pressReleases[pr]["year"] = pr[5:9]
        # Escape the dot so the pattern matches a literal ".txt" suffix;
        # the unescaped "." also matched names like "...1xtxt".
        pressReleases[pr]["author"] = re.sub(r"[0-9]+\.txt", "", pr[9:])
        with open(folder + "/" + pr, "r") as fileIn:
            pressReleases[pr]["text"] = fileIn.read()
# Build the stemmed stopword set: the SMART stop list plus a few
# corpus-specific terms (senator names, press-release boilerplate),
# stored as a set for O(1) membership tests, then Porter-stemmed so it
# matches the stemmed document tokens.
porter = nltk.stem.PorterStemmer()
stopUrl = "http://jmlr.org/papers/volume5/lewis04a/a11-smart-stop-list/english.stop"
toAdd = {"shelby", "sessions", "richard", "jeff", "email", "press", "room", "member", "senate"}
stopwords = {line.strip("\n") for line in urllib2.urlopen(stopUrl).readlines()}
stopwords |= toAdd
stopwords = {porter.stem(w) for w in stopwords}
## Count unigram and trigram frequencies, per document and corpus-wide.
## Each press release gets inner dictionaries (doc_unigrams/doc_trigrams)
## mapping each gram to its in-document count; the same counts are
## accumulated into the global unigrams/trigrams dictionaries.
from collections import Counter

unigrams = {}
trigrams = {}
for pr in pressReleases:
    txt = pressReleases[pr]["text"]
    # Replace every non-word character with a space, then lowercase.
    txt = re.sub(r"\W", " ", txt)
    txt = txt.lower()
    tokens = nltk.word_tokenize(txt)
    tokens = [porter.stem(w) for w in tokens]
    tokens = [x for x in tokens if x not in stopwords]
    pressReleases[pr]["numUnigrams"] = len(tokens)
    # Single O(n) counting pass instead of calling tokens.count() once
    # per distinct token, which was O(n^2) per document.
    uniCounts = Counter(tokens)
    pressReleases[pr]["doc_unigrams"] = dict(uniCounts)
    for tok, count in uniCounts.items():
        unigrams[tok] = unigrams.get(tok, 0) + count
    # Trigrams get the same treatment; each trigram is a 3-tuple of tokens.
    trigList = list(nltk.trigrams(tokens))
    pressReleases[pr]["numTrigrams"] = len(trigList)
    triCounts = Counter(trigList)
    pressReleases[pr]["doc_trigrams"] = dict(triCounts)
    for tri, count in triCounts.items():
        trigrams[tri] = trigrams.get(tri, 0) + count
# Keep only the most frequent grams for the term-document matrices:
# the 1000 most common unigrams and the 500 most common trigrams,
# ordered by descending corpus-wide count.
topUnigrams = sorted(unigrams, key=lambda g: unigrams[g], reverse=True)[:1000]
topTrigrams = sorted(trigrams, key=lambda g: trigrams[g], reverse=True)[:500]
## Write the unigram term-document matrix: one row per press release with
## filename, author id, and the count of each top unigram (0 if absent).
## To compute rates out of total words later, append a column with
## pressReleases[name]["numUnigrams"] to each row.
headerUni = ["file"] + ["id"] + topUnigrams
os.chdir("/Users/franceszlotnick/Dropbox/TextAsData/ProblemSets/PS3/")
with open("unigrams.csv", "w") as csvfile:
    writer = csv.writer(csvfile, delimiter=",")
    writer.writerow(headerUni)
    for name in pressReleases:
        doc = pressReleases[name]
        counts = doc["doc_unigrams"]
        row = [name, doc["author"]]
        for gram in topUnigrams:
            row.append(str(counts.get(gram, 0)))
        writer.writerow(row)
#write trigrams file
# Join a trigram tuple into one dot-delimited string, e.g.
# ("tax", "cut", "bill") -> "tax.cut.bill"; used for CSV column headers.
def tupToString(tup):
    joined = ".".join(tup)
    return joined
# Dot-delimited trigram names make legal, readable CSV column headers.
stringifiedTris = [tupToString(x) for x in topTrigrams]
headerTri = ["file"] + ["id"] + stringifiedTris
os.chdir("/Users/franceszlotnick/Dropbox/TextAsData/ProblemSets/PS3/")
with open("trigrams.csv", "w") as csvfile:
    writer = csv.writer(csvfile, delimiter=",")
    writer.writerow(headerTri)
    for name in pressReleases:
        doc = pressReleases[name]
        counts = doc["doc_trigrams"]
        row = [name, doc["author"]]
        for gram in topTrigrams:
            row.append(str(counts.get(gram, 0)))
        # To compute rates out of total trigrams, append
        # str(doc["numTrigrams"]) here as an extra column.
        writer.writerow(row)
Some of our methods require word rates, not raw frequencies, so we need some additional processing to convert our TDMs into rates.
library(foreign)
setwd("/Users/franceszlotnick/Dropbox/TextAsData/ProblemSets/PS3/")
uni_raw <- read.csv("unigrams.csv")
tri_raw <- read.csv("trigrams.csv")
# Strip the filename and id columns so only count columns remain.
uni <- subset(uni_raw[, -c(1, 2)])
tri <- subset(tri_raw[, -c(1, 2)])
# Convert counts to within-document rates. Dividing the data frame by the
# vector of row sums recycles the vector down each column, i.e. every row
# is divided by its own total -- a vectorized replacement for the
# original row-by-row loop (and the commented apply() alternative).
uni <- uni / rowSums(uni)
# Reattach file name and author id.
uni <- cbind(uni_raw$file, uni_raw$id, uni)
colnames(uni)[1:2] <- c("file", "id")
# Same rate conversion for trigrams. BUG FIX: the original loop iterated
# over j but indexed rows with i, so every row was overwritten with row
# i's rate; the vectorized row-sum division computes each row correctly.
tri <- tri / rowSums(tri)
tri <- cbind(tri_raw$file, tri_raw$id, tri)
colnames(tri)[1:2] <- c("file", "id")
# Document counts per author, needed by the discriminant statistics below.
# NOTE(review): this assumes table() orders the levels Sessions then
# Shelby (alphabetical) -- confirm against the data.
sessionsN <- table(uni$id)[1]
shelbyN <- table(uni$id)[2]
# Per-author mean and variance of each unigram rate column.
muShelbyU <- colMeans(uni[uni$id == "Shelby", -c(1, 2)], na.rm = TRUE)
muSessionsU <- colMeans(uni[uni$id == "Sessions", -c(1, 2)], na.rm = TRUE)
varShelbyU <- apply(uni[uni$id == "Shelby", -c(1, 2)], 2, var)
varSessionsU <- apply(uni[uni$id == "Sessions", -c(1, 2)], 2, var)
# Same for trigram rates. BUG FIX: the Sessions trigram variance
# previously subset tri with uni$id, mixing row masks across data frames.
muShelbyT <- colMeans(tri[tri$id == "Shelby", -c(1, 2)], na.rm = TRUE)
muSessionsT <- colMeans(tri[tri$id == "Sessions", -c(1, 2)], na.rm = TRUE)
varShelbyT <- apply(tri[tri$id == "Shelby", -c(1, 2)], 2, var)
varSessionsT <- apply(tri[tri$id == "Sessions", -c(1, 2)], 2, var)
# Linear discriminant weights: difference in group means scaled by the
# summed group variances, for unigrams and trigrams.
ldWeightsU <- (muShelbyU - muSessionsU) / (varShelbyU + varSessionsU)
ldWeightsT <- (muShelbyT - muSessionsT) / (varShelbyT + varSessionsT)
# Weight trimming (e.g. zeroing |w| < 0.025) was considered but is not
# necessary here.
# Sorting puts the most Sessions-discriminating words first and the most
# Shelby-discriminating words last.
ldWeightsU <- sort(ldWeightsU)
ldWeightsT <- sort(ldWeightsT)
# Ten strongest words for each author -- 20 total per feature set.
LDmostSessionsU <- head(ldWeightsU, 10)
LDmostShelbyU <- tail(ldWeightsU, 10)
LDmostSessionsT <- head(ldWeightsT, 10)
LDmostShelbyT <- tail(ldWeightsT, 10)
# Combined vectors used by the plots below.
LDdiscrimU <- c(LDmostSessionsU, LDmostShelbyU)
LDdiscrimT <- c(LDmostSessionsT, LDmostShelbyT)
# Standardized mean difference (t-like statistic): difference in group
# means over the pooled standard error.
numU <- muShelbyU - muSessionsU
denomU <- sqrt((varShelbyU / shelbyN) + (varSessionsU / sessionsN))
stdDiffU <- numU / denomU
numT <- muShelbyT - muSessionsT
# BUG FIX: the Sessions variance was previously divided by shelbyN.
denomT <- sqrt((varShelbyT / shelbyN) + (varSessionsT / sessionsN))
stdDiffT <- numT / denomT
# Drop non-finite scores (zero-variance features yield +/-Inf or NaN).
# The original compared against the string "-Inf", which missed +Inf/NaN.
sdWeightsU <- sort(stdDiffU[is.finite(stdDiffU)])
sdWeightsT <- sort(stdDiffT[is.finite(stdDiffT)])
SDmostSessionsU <- head(sdWeightsU, 10)
SDmostShelbyU <- tail(sdWeightsU, 10)
SDmostSessionsT <- head(sdWeightsT, 10)
SDmostShelbyT <- tail(sdWeightsT, 10)
SDdiscrimU <- c(SDmostSessionsU, SDmostShelbyU)
SDdiscrimT <- c(SDmostSessionsT, SDmostShelbyT)
# Standardized log odds ratio with add-one (Laplace) smoothing.
# Hoist the per-author column sums so each colSums() runs once.
countsShelbyU <- colSums(uni_raw[uni_raw$id == "Shelby", -c(1, 2)])
countsSessionsU <- colSums(uni_raw[uni_raw$id == "Sessions", -c(1, 2)])
totShelbyU <- sum(countsShelbyU)
totSessionsU <- sum(countsSessionsU)
# BUG FIX: vocabulary size is ncol - 2 (both file and id are non-count
# columns), not ncol - 1, for the smoothing denominator.
vocabU <- ncol(uni_raw) - 2
piShelbyU <- (countsShelbyU + 1) / (totShelbyU + vocabU)
piSessionsU <- (countsSessionsU + 1) / (totSessionsU + vocabU)
logOddsRatioU <- log(piShelbyU / (1 - piShelbyU)) - log(piSessionsU / (1 - piSessionsU))
varLogOddsU <- (1 / (countsShelbyU + 1)) + (1 / (countsSessionsU + 1))
stdLogOddsU <- logOddsRatioU / sqrt(varLogOddsU)
# Trigrams, same computation.
countsShelbyT <- colSums(tri_raw[tri_raw$id == "Shelby", -c(1, 2)])
countsSessionsT <- colSums(tri_raw[tri_raw$id == "Sessions", -c(1, 2)])
totShelbyT <- sum(countsShelbyT)
totSessionsT <- sum(countsSessionsT)
vocabT <- ncol(tri_raw) - 2
piShelbyT <- (countsShelbyT + 1) / (totShelbyT + vocabT)
piSessionsT <- (countsSessionsT + 1) / (totSessionsT + vocabT)
logOddsRatioT <- log(piShelbyT / (1 - piShelbyT)) - log(piSessionsT / (1 - piSessionsT))
varLogOddsT <- (1 / (countsShelbyT + 1)) + (1 / (countsSessionsT + 1))
stdLogOddsT <- logOddsRatioT / sqrt(varLogOddsT)
sloWeightsU <- sort(stdLogOddsU)
sloWeightsT <- sort(stdLogOddsT)
sloSessionsMostU <- head(sloWeightsU, 10)
sloShelbyMostU <- tail(sloWeightsU, 10)
sloSessionsMostT <- head(sloWeightsT, 10)
sloShelbyMostT <- tail(sloWeightsT, 10)
sloDiscrimU <- c(sloSessionsMostU, sloShelbyMostU)
sloDiscrimT <- c(sloSessionsMostT, sloShelbyMostT)
Plotting words with the largest (absolute value) weights.
# Plot the 20 most discriminating unigrams by LDA weight: each word is
# drawn at its weight on the x-axis, stacked vertically by rank.
index <- seq(1, 20, 1)
plot(LDdiscrimU, index, pch = "", xlab = "weight", ylab = "", yaxt = "n", main = "Most discriminating words\n Linear Discriminant Analysis")
# Spell out labels= in full; label= relied on partial argument matching.
text(LDdiscrimU, index, labels = names(LDdiscrimU), cex = .7)
#axis(2, at=c(2, 18), labels=c("Sessions", "Shelby"), tcl=0, las=1, cex.axis=.9)
Another way of visualizing this that better displays the relative weights, but makes it more difficult to read the words.
# Alternative view: rank on the x-axis, weight on the y-axis.
plot(LDdiscrimU, pch = "", xaxt = "n", xlab = "", ylab = "Weight", main = "Most discriminating words\n Linear Discriminant Analysis")
# labels= spelled out in full (label= used partial argument matching).
text(LDdiscrimU, labels = names(LDdiscrimU), cex = .7)
Back to our regular plot style.
# Standard mean difference and standardized log odds plots, unigrams then
# trigrams (trigram x-limits widened so the long dotted labels fit).
# labels= is spelled out everywhere; label= relied on partial matching.
plot(SDdiscrimU, index, pch = "", xlab = "weight", ylab = "", yaxt = "n", main = "Most discriminating words\n Standard Mean Difference")
text(SDdiscrimU, index, labels = names(SDdiscrimU), cex = .7)
plot(sloDiscrimU, index, pch = "", xlab = "weight", ylab = "", yaxt = "n", main = "Most discriminating words\n Standardized Log Odds Ratio")
text(sloDiscrimU, index, labels = names(sloDiscrimU), cex = .7)
plot(LDdiscrimT, index, pch = "", xlab = "weight", xlim = c(-400, 1400), ylab = "", yaxt = "n", main = "Most discriminating words\n Linear Discriminant Analysis")
text(LDdiscrimT, index, labels = names(LDdiscrimT), cex = .7)
plot(SDdiscrimT, index, pch = "", xlab = "weight", xlim = c(-1000, 1500), ylab = "", yaxt = "n", main = "Most discriminating words\n Standard Mean Difference")
text(SDdiscrimT, index, labels = names(SDdiscrimT), cex = .7)
plot(sloDiscrimT, index, pch = "", xlab = "weight", xlim = c(-20, 10), ylab = "", yaxt = "n", main = "Most discriminating words\n Standardized Log Odds Ratio")
text(sloDiscrimT, index, labels = names(sloDiscrimT), cex = .7)
# Sample 100 documents from each author for the pairwise distance work.
shelby100 <- sample(which(tri_raw$id == "Shelby"), 100, replace = FALSE)
sessions100 <- sample(which(tri_raw$id == "Sessions"), 100, replace = FALSE)
distSample <- c(shelby100, sessions100)
ds <- tri_raw[distSample, ]
# Keep the file/author labels (as character) for looking documents up later.
ids <- subset(ds[, 1:2])
ids[, 1] <- as.character(ids[, 1])
ids[, 2] <- as.character(ids[, 2])
# Strip the id columns; distances operate on the count matrix only.
ds <- as.matrix(ds[, -c(1, 2)])
# Drop trigrams that never appear in the sampled documents (all-zero
# columns break the tf-idf weighting). BUG FIX: the original used
# ds[, -which(colSums(ds) == 0)], and -which(...) drops EVERY column
# when which() returns an empty vector; a logical mask is safe.
ds <- ds[, colSums(ds) > 0, drop = FALSE]
2a. Euclidean Distance
# Pairwise Euclidean distance matrix for the 200 sampled documents,
# filled in by the loop below.
ed <- matrix(nrow = 200, ncol = 200)
# Euclidean (L2) distance between two count vectors.
euclidean <- function(x, y) {
  sqrt(sum((x - y)^2))
}
# Fill the distance matrix. Distances are symmetric, so each pair is
# computed once (upper triangle, diagonal included) and mirrored across
# the diagonal -- same result as the original NA-check loop.
for (m in 1:nrow(ds)) {
  for (n in m:nrow(ds)) {
    d <- euclidean(ds[m, ], ds[n, ])
    ed[m, n] <- d
    ed[n, m] <- d
  }
}
m <- NULL
n <- NULL
# Equivalent built-in: ed2 <- as.matrix(dist(ds, method = "euclidean"))
write.csv(ed, "euclidean.csv")
#Many matches: the diagonal is all 0 (each document's distance to itself)
#in addition to any "true" zeros. One hacky way to get only the
#non-diagonal minimums:
# a<-which(ed == min(ed), arr.ind=T)
# a<- as.data.frame(a) #because you can't use $ on atomic vectors in R.
# a[which(a$row != a$col)]
#alternatively, just blank the diagonal so min/max skip self-distances:
diag(ed)<- NA
#max distance
which(ed == max(ed, na.rm=T), arr.ind=T)
## row col
## [1,] 93 26
## [2,] 26 93
#For the remainder of this document, min/max won't be executed, since the
#result is idiosyncratic to whichever documents were randomly selected and
#not very informative
#which(ed == min(ed), arr.ind=T)
#to view files, do something like the following, where x and y are the rows of the docs you want to compare
#stored = "/Users/franceszlotnick/Dropbox/Text/tad14/HW3/GrimmerSenatePressReleases-master/raw/"
#file.show(paste(stored, ids[x,2],"/", ids[x,1], sep=""))
#file.show(paste(stored, ids[y,2],"/", ids[y,1], sep=""))
2b. Euclidean with tf-idf weights
# Inverse document frequency of one term column: log(N / document
# frequency). n_docs defaults to the 200-document sample size used in
# this section, but is now a parameter instead of a hard-coded constant.
# (The dead preallocation of edWithTfidf was removed; it was overwritten
# by the dist() call below before ever being read.)
calcIDF <- function(x, n_docs = 200) {
  log(n_docs / sum(x > 0))
}
# Weight every document row by the idf vector, then compute Euclidean
# distances on the weighted matrix with the built-in dist().
idf <- apply(ds, 2, calcIDF)
ds_idf <- as.matrix(t(apply(ds, 1, function(row) row * idf)))
edWithTfidf <- as.matrix(dist(ds_idf, method = "euclidean"))
# Blank the diagonal so self-distances don't win the min/max searches.
diag(edWithTfidf) <- NA
write.csv(edWithTfidf, "edTFIDF.csv")
#most similar
#which(edWithTfidf == min(edWithTfidf, na.rm=T), arr.ind=T)
#most different
#which(edWithTfidf == max(edWithTfidf, na.rm=T), arr.ind=T)
2c. Cosine Similarity
# Pairwise cosine similarity matrix, filled in by the loop below.
cs <- matrix(nrow = 200, ncol = 200)
# Cosine similarity: dot product over the product of the L2 norms.
cosineSim <- function(x, y) {
  sum(x * y) / sqrt(sum(x^2) * sum(y^2))
}
# Fill the similarity matrix. Similarity is symmetric, so each pair is
# computed once and mirrored across the diagonal -- identical output to
# the original full double loop, with half the calls.
for (m in 1:nrow(ds)) {
  for (n in m:nrow(ds)) {
    sim <- cosineSim(ds[m, ], ds[n, ])
    cs[m, n] <- sim
    cs[n, m] <- sim
  }
}
m <- NULL
n <- NULL
#you could also use the cosine function from the lsa package:
#cs2<- matrix(nrow=200, ncol=200)
#for (m in 1:nrow(ds)){
#  for (n in 1:nrow(ds)){
#    a<- cosine(ds[m,], ds[n,])
#    cs2[m, n]<- a
#    cs2[n, m]<- a
#  }
#}
# Blank the self-similarities (always 1) before searching for extremes.
diag(cs) <- NA
write.csv(cs, "cosineSim.csv")
#most similar
#which(cs == max(cs, na.rm=T), arr.ind=T)
#most dissimilar
#which(cs == min(cs, na.rm=T), arr.ind=T)
2d. Cosine similarity with tf-idf weights
# Cosine similarity on the tf-idf weighted matrix; symmetric fill as
# above, then blank the diagonal and write out.
csWithTfidf <- matrix(nrow = 200, ncol = 200)
for (m in 1:nrow(ds_idf)) {
  for (n in m:nrow(ds_idf)) {
    sim <- cosineSim(ds_idf[m, ], ds_idf[n, ])
    csWithTfidf[m, n] <- sim
    csWithTfidf[n, m] <- sim
  }
}
diag(csWithTfidf) <- NA
write.csv(csWithTfidf, "csTFIDF.csv")
#most similar
#which(csWithTfidf == max(csWithTfidf, na.rm=T), arr.ind=T)
#most dissimilar
#which(csWithTfidf == min(csWithTfidf, na.rm=T), arr.ind=T)
2e. Normalize word counts, apply Gaussian kernel
# Normalize each document's counts to rates (rows sum to 1). Dividing a
# matrix by a length-nrow vector recycles it down the columns, i.e. each
# row is divided by its own sum -- a vectorized replacement for the loop.
normed <- ds / rowSums(ds)
# Kernel bandwidth (was assigned with `=`; `<-` is idiomatic).
sigma <- 100
# NOTE(review): exp(-d / sigma) is an exponential kernel, not a true
# Gaussian (which would be exp(-d^2 / (2 * sigma^2))). Kept as-is to
# preserve the original results -- confirm which was intended.
gauss <- exp(-(as.matrix(dist(normed))) / sigma)
diag(gauss) <- NA
write.csv(gauss, "gaussian.csv")
#most similar
#which(gauss == max(gauss, na.rm=T), arr.ind=T)
#most different
#which(gauss == min(gauss, na.rm=T), arr.ind=T)
2f. Gaussian with tf-idf weights.
# Recompute idf on the normalized rates and weight each row by it, then
# apply the same exponential kernel.
idf <- apply(normed, 2, calcIDF)
normed_idf <- as.matrix(t(apply(normed, 1, function(x) x * idf)))
gaussNorm <- exp(-(as.matrix(dist(normed_idf))) / sigma)
# CONSISTENCY FIX: blank the diagonal BEFORE writing, as every other
# section does; the original wrote the file first, so gaussNorm.csv
# kept its (self-similarity) diagonal unlike the other output files.
diag(gaussNorm) <- NA
write.csv(gaussNorm, "gaussNorm.csv")
#most similar
#which(gaussNorm == max(gaussNorm, na.rm=T), arr.ind=T)
#most different
#which(gaussNorm == min(gaussNorm, na.rm=T), arr.ind=T)