Exploring Art Data 11

Let’s look at a more contemporary source than Vasari, Cultural Bloggers Interviewed.

We can download the PDF with a shell script:

#!/bin/bash
wget "http://live.labforculture.org/2010/09/cbi/files/cultural_blogger.pdf"

And then load in the data and process it in R using tm again (with a slight modification to the function that cleans up the text):


library(tm)
blogfile<-"./cultural_blogger.pdf"
bloggers.names<-c("Claire Welsby", "Michelle Kasprzak",
"Alek Tarkowski", "Marco Mancuso", "Anne Helmond",
"Robert Misik", "Marta Peirano & José Luis de Vicente",
"Alessandro Ludovico", "Régine Debatty")
bloggers<-data.frame(name=bloggers.names,
from=c(6, 11, 15, 19, 23,  27, 31, 35, 41),
to=c(10, 14, 18, 22, 26, 30, 34, 40, 44))
## Clean footnotes, etc. from article text
cleanArticle<-function(text){
## Remove urls. Would miss final url in a document ;-)
text<-lapply(text, function(line){gsub("http://.+\\s", "", line, perl=TRUE)})
## Remove punctuation
text<-lapply(text, function(line){gsub("[[:punct:]]", "", line)})
## Lowercase words
text<-lapply(text, tolower)
text
}
## Load the blogger texts
bloggers.texts<-apply(bloggers, 1,
function(blogger){
reader<-readPDF(PdftotextOptions=paste("-layout",
"-f", blogger[2],
"-l", blogger[3]))
reader(elem=list(uri=blogfile),
language="en", id=blogger[1])})
## Clean up the blogger texts
bloggers.texts<-lapply(bloggers.texts, cleanArticle)
## Make a corpus of the bloggers texts
bloggers.corpus<-Corpus(VectorSource(bloggers.texts),
readerControl=list(language="english",
reader=readPlain))
## Remove whitespace within terms
bloggers.clean<-tm_map(bloggers.corpus, stripWhitespace)
## Remove stopwords
bloggers.clean<-tm_map(bloggers.clean, removeWords, stopwords("english"))
## Stem words
## No, this looks weird in the results
##bloggers.clean<-tm_map(bloggers.clean, stemDocument)
## Term/document matrix
dtm<-DocumentTermMatrix(bloggers.clean)
## Remove infrequent terms to save memory
dtm<-removeSparseTerms(dtm, 0.4)

Then we can find the most common terms:


## Frequent terms in the matrix
findFreqTerms(dtm, 4)

 [1] "art"          "artists"      "arts"         "audience"     "based"
[6] "bit"          "blog"         "blogging"     "blogs"        "community"
[11] "contemporary" "content"      "difficult"    "example"      "experience"
[16] "feel"         "include"      "involved"     "issues"       "mainly"
[21] "media"        "people"       "platform"     "post"         "probably"
[26] "project"      "public"       "regarding"    "scene"        "technology"
[31] "thats"        "time"         "via"          "website"      "world"
[36] "course"       "definitely"   "describe"     "dont"         "facebook"
[41] "focus"        "interview"    "job"          "money"        "personal"
[46] "research"     "started"      "cultural"     "culture"      "digital"
[51] "music"        "write"        "writing"      "active"       "consider"
[56] "critical"     "english"      "following"    "hand"         "information"
[61] "network"      "popular"      "tools"        "actually"     "especially"
[66] "etc"          "hard"         "led"          "live"         "lot"
[71] "question"     "ive"          "online"       "read"         "video"
[76] "book"         "changed"      "european"     "model"        "moment"
[81] "specific"     "start"        "times"        "economic"     "readers"

Look at associations:


## Frequently associated terms
findAssocs(dtm, "blogging", 0.2)

 blogging      dont      read   usually      chat     video      blog    follow
1.00      0.74      0.61      0.57      0.56      0.55      0.49      0.45
research     blogs      hard       via      life       etc      live      role
0.42      0.38      0.38      0.38      0.37      0.35      0.35      0.33
scene  cultural       job  question      able interview     money       ive
0.31      0.30      0.30      0.30      0.27      0.27      0.24      0.23
led    course
0.21      0.20

Find similar bloggers:


## Dissimilarity
dis<-dissimilarity(dtm, method="cosine")
## The most similar bloggers for each blogger, in order of similarity
similarityMin<-0.25
mostSimilarBloggers<-apply(dis, 1,
function(row){
sorted<-sort(row)
ordered<-order(row)
## 0.0 == same blogger
ordered[sorted > 0.0 & sorted < similarityMin]
})
for(doc in 1:length(mostSimilarBloggers)){
mostSimilar<-unlist(mostSimilarBloggers[doc])
if(length(mostSimilar) > 0){
count<-min(length(mostSimilar), 5)
similar<-paste(bloggers.names[mostSimilar[1:count]], collapse=", ")
}else{
similar<-"None"
}
cat(bloggers.names[[doc]], ": ", similar, "\n\n")
}

Claire Welsby :  None
Michelle Kasprzak :  Régine Debatty, Anne Helmond
Alek Tarkowski :  Anne Helmond, Régine Debatty
Marco Mancuso :  None
Anne Helmond :  Alek Tarkowski, Michelle Kasprzak, Régine Debatty
Robert Misik :  None
Marta Peirano & José Luis de Vicente :  None
Alessandro Ludovico :  None
Régine Debatty :  Michelle Kasprzak, Alek Tarkowski, Anne Helmond

Cluster bloggers:


## Clusters of similar bloggers
clusterCount<-3
clusters<-kmeans(dtm, clusterCount)
clusters.bloggers<-lapply(1:clusterCount,
function(cluster){
bloggers.names[clusters$cluster == cluster]})
for(cluster in 1:clusterCount){
cat("Cluster", cluster, ":",
paste(unlist(clusters.bloggers[cluster]), collapse=", "),
"\n\n")
}

Cluster 1 : Michelle Kasprzak, Alek Tarkowski, Anne Helmond, Régine Debatty
Cluster 2 : Claire Welsby, Marco Mancuso
Cluster 3 : Robert Misik, Marta Peirano & José Luis de Vicente, Alessandro Ludovico

And plot associations between terms used in the text:


## Plot associations between terms
plot(dtm, findFreqTerms(dtm, 6),
attrs=list(graph=list(),
node=list(shape="rectangle", fontsize="120", fixedsize="false")))

Related