Let’s look at a more contemporary source than Vasari: “Cultural Bloggers Interviewed”.
We can download the PDF with a shell script:
#!/bin/bash
# Download the interview PDF that the R code reads from the working directory.
pdf_url="http://live.labforculture.org/2010/09/cbi/files/cultural_blogger.pdf"
wget "$pdf_url"
And then load in the data and process it in R using tm again (with a slight modification to the function that cleans up the text):
library(tm)

## Path of the downloaded PDF, relative to the working directory
blogfile <- "./cultural_blogger.pdf"

## Interviewees, listed in ascending order of their page ranges below
bloggers.names <- c("Claire Welsby", "Michelle Kasprzak", "Alek Tarkowski",
                    "Marco Mancuso", "Anne Helmond", "Robert Misik",
                    "Marta Peirano & José Luis de Vicente",
                    "Alessandro Ludovico", "Régine Debatty")

## First and last PDF page of each interview
bloggers <- data.frame(name = bloggers.names,
                       from = c(6, 11, 15, 19, 23, 27, 31, 35, 41),
                       to = c(10, 14, 18, 22, 26, 30, 34, 40, 44))

## Clean footnotes, etc. from article text.
##   text: the lines of one document.
##   Returns a list holding one cleaned, lowercased string per input line.
cleanArticle <- function(text) {
  ## Remove urls. "\\S+" stops at the first whitespace, so only the url
  ## (and one whitespace character after it) is dropped rather than
  ## everything up to the last whitespace on the line. A url that ends the
  ## line, and https urls, are matched as well.
  text <- lapply(text, function(line) {
    gsub("https?://\\S+\\s?", "", line, perl = TRUE)
  })
  ## Remove punctuation
  text <- lapply(text, function(line) {
    gsub("[[:punct:]]", "", line)
  })
  ## Lowercase words
  text <- lapply(text, tolower)
  text
}

## Load the blogger texts, one document per page range.
## Iterating over row indices (rather than apply() on the data frame) keeps
## the page numbers numeric and always yields a list of documents.
bloggers.texts <- lapply(seq_len(nrow(bloggers)), function(i) {
  reader <- readPDF(PdftotextOptions = paste("-layout",
                                             "-f", bloggers$from[i],
                                             "-l", bloggers$to[i]))
  reader(elem = list(uri = blogfile), language = "en",
         id = as.character(bloggers$name[i]))
})

## Clean up the blogger texts
bloggers.texts <- lapply(bloggers.texts, cleanArticle)

## Make a corpus of the bloggers texts
bloggers.corpus <- Corpus(VectorSource(bloggers.texts),
                          readerControl = list(language = "english",
                                               reader = readPlain))

## Remove whitespace within terms
bloggers.clean <- tm_map(bloggers.corpus, stripWhitespace)

## Remove stopwords
bloggers.clean <- tm_map(bloggers.clean, removeWords, stopwords("english"))

## Stem words
## No, this looks weird in the results
##bloggers.clean<-tm_map(bloggers.clean, stemDocument)

## Term/document matrix
dtm <- DocumentTermMatrix(bloggers.clean)

## Remove infrequent terms to save memory
dtm <- removeSparseTerms(dtm, 0.4)
Then we can find the most common terms:
## List the terms whose frequency in the matrix is at least 4
findFreqTerms(dtm, lowfreq = 4)
[1] "art" "artists" "arts" "audience" "based" [6] "bit" "blog" "blogging" "blogs" "community" [11] "contemporary" "content" "difficult" "example" "experience" [16] "feel" "include" "involved" "issues" "mainly" [21] "media" "people" "platform" "post" "probably" [26] "project" "public" "regarding" "scene" "technology" [31] "thats" "time" "via" "website" "world" [36] "course" "definitely" "describe" "dont" "facebook" [41] "focus" "interview" "job" "money" "personal" [46] "research" "started" "cultural" "culture" "digital" [51] "music" "write" "writing" "active" "consider" [56] "critical" "english" "following" "hand" "information" [61] "network" "popular" "tools" "actually" "especially" [66] "etc" "hard" "led" "live" "lot" [71] "question" "ive" "online" "read" "video" [76] "book" "changed" "european" "model" "moment" [81] "specific" "start" "times" "economic" "readers"
Look at associations:
## Terms correlated with "blogging", keeping correlations of 0.2 and above
findAssocs(dtm, "blogging", corlimit = 0.2)
blogging dont read usually chat video blog follow 1.00 0.74 0.61 0.57 0.56 0.55 0.49 0.45 research blogs hard via life etc live role 0.42 0.38 0.38 0.38 0.37 0.35 0.35 0.33 scene cultural job question able interview money ive 0.31 0.30 0.30 0.30 0.27 0.27 0.24 0.23 led course 0.21 0.20
Find similar bloggers:
## Cosine dissimilarity between the bloggers' documents
dis <- dissimilarity(dtm, method = "cosine")

## Despite the name, this is an upper bound on *dissimilarity*: two bloggers
## count as similar when their dissimilarity is below it
similarityMin <- 0.25

## The most similar bloggers for each blogger, in order of similarity.
## Always a list with one integer vector of blogger indices per blogger:
## lapply() never collapses the result to a matrix the way apply() does when
## every blogger happens to have the same number of matches.
dis.matrix <- as.matrix(dis)
mostSimilarBloggers <- lapply(seq_len(nrow(dis.matrix)), function(doc) {
  row <- dis.matrix[doc, ]
  ## Missing dissimilarities never count as a match
  similar <- !is.na(row) & row < similarityMin
  ## Exclude the blogger themself by position, not by testing for 0.0
  similar[doc] <- FALSE
  candidates <- which(similar)
  unname(candidates[order(row[candidates])])
})

## Print up to five of the closest bloggers for each blogger
for (doc in seq_along(mostSimilarBloggers)) {
  mostSimilar <- mostSimilarBloggers[[doc]]
  if (length(mostSimilar) > 0) {
    similar <- paste(bloggers.names[head(mostSimilar, 5)], collapse = ", ")
  } else {
    similar <- "None"
  }
  cat(bloggers.names[[doc]], ": ", similar, "\n\n")
}
Claire Welsby : None Michelle Kasprzak : Régine Debatty, Anne Helmond Alek Tarkowski : Anne Helmond, Régine Debatty Marco Mancuso : None Anne Helmond : Alek Tarkowski, Michelle Kasprzak, Régine Debatty Robert Misik : None Marta Peirano & José Luis de Vicente : None Alessandro Ludovico : None Régine Debatty : Michelle Kasprzak, Alek Tarkowski, Anne Helmond
Cluster bloggers:
## Partition the bloggers into k-means clusters of the document-term matrix
clusterCount <- 3
clusters <- kmeans(dtm, clusterCount)

## One vector of blogger names per cluster number
clusters.bloggers <- lapply(seq_len(clusterCount), function(k) {
  bloggers.names[clusters$cluster == k]
})

## Print the members of each cluster
for (k in seq_along(clusters.bloggers)) {
  members <- paste(clusters.bloggers[[k]], collapse = ", ")
  cat("Cluster", k, ":", members, "\n\n")
}
Cluster 1 : Michelle Kasprzak, Alek Tarkowski, Anne Helmond, Régine Debatty Cluster 2 : Claire Welsby, Marco Mancuso Cluster 3 : Robert Misik, Marta Peirano & José Luis de Vicente, Alessandro Ludovico
And plot associations between terms used in the text:
## Draw the association graph for the terms with a frequency of at least 6
frequent.terms <- findFreqTerms(dtm, 6)
node.attrs <- list(shape = "rectangle", fontsize = "120", fixedsize = "false")
plot(dtm, frequent.terms, attrs = list(graph = list(), node = node.attrs))