import os


def getHotResults():
    """Fetch the hourly Google Hot Trends feed and return the hot phrases.

    Returns:
        list of str: the trending phrases scraped out of the feed HTML.
    """
    # BUG FIX: plain `wget URL` saves the page to a local file and writes
    # only progress chatter to stderr, so popen().read() returned nothing.
    # -q silences the chatter; -O - streams the page to stdout instead.
    hrf = os.popen('wget -qO- http://www.google.com/trends/hottrends/atom/hourly').read()
    eidx = 0
    phrases = []
    while True:
        # Each phrase sits between an anchor ending in 'sa=X">' and '</a>'.
        sidx = hrf.find('sa=X">', eidx)
        if sidx == -1:
            break
        eidx = hrf.find('</a>', sidx)
        phrases.append(hrf[sidx + 6:eidx])  # +6 == len('sa=X">')
    return phrases
getHotResults() fetches the latest Google Hot Trends feed and extracts the trending phrases from it.
def updateTerm(phr, timestamp):
    """Insert a new hot term, or extend the hot window of an existing one.

    Args:
        phr: the hot-trend phrase.
        timestamp: time string parseable by util.parse.makeDateTime
            (e.g. '2011-05-02:14:00:00').

    NOTE(review): relies on `util` being imported elsewhere in the full
    script — confirm against the surrounding file.
    """
    # first set up access to the term collection in MongoDB
    import pymongo
    connection = pymongo.Connection()
    term = connection.hs.trm2.h
    # check to see if this is a new hot term
    if phr not in term.find().distinct('term'):
        rdoc = {'term': phr,
                'sthotd': util.parse.makeDateTime(timestamp),  # start of hot period
                'ehotd': util.parse.makeDateTime(timestamp),   # end of hot period
                }
        term.insert(rdoc)
    # otherwise update the collection with the new end time.
    else:
        # BUG FIX: the original read {"ehotd":,util.parse...} — the stray
        # comma after the colon is a syntax error.
        term.update({"term": phr},
                    {'$set': {"ehotd": util.parse.makeDateTime(timestamp)}})
import time

# Timestamp for this run, truncated to the top of the current hour
# (e.g. '2011-05-02:14:00:00') so every phrase from one run shares it.
todaystring = time.strftime('%Y-%m-%d:%H:00:00')

# Record each currently-hot phrase in the term database under this hour.
for phrase in getHotResults():
    updateTerm(phrase, todaystring)
We then set up a crontab entry in Linux so that this code runs once an hour.
def checkBing(query, timeout, numresults=32):
    """Query the Bing (Live Search) JSON API and return the web results.

    Args:
        query: search phrase (URL-quoted before sending).
        timeout: base politeness delay in seconds, floored at 0.5; the
            actual sleep after the request is uniform in
            [timeout/2, timeout*1.5] to avoid a regular request rhythm.
        numresults: number of web results to request (default 32).

    Returns:
        list of (url, title, description) tuples; any field missing from
        a result is filled with 'NA'.
    """
    # BUG FIX: the original imported only urllib2 and simplejson but also
    # used urllib.quote, time, and random.
    import urllib
    import urllib2
    import time
    import random
    import simplejson
    if timeout < 0.5:
        timeout = 0.5
    bingapikey = 'xxxxxxxxxxxxxxxxxxxxxxxx'
    # BUG FIX: the URL was built from a triple-quoted string, which embedded
    # newlines and indentation into the request; build it in one piece.
    url = ('http://api.search.live.net/json.aspx?Appid=%s&query=%s'
           '&sources=web&web.count=%i'
           % (bingapikey, urllib.quote(query), numresults))
    # build the API request; present a browser User-Agent
    request = urllib2.Request(url)
    request.add_header('User-Agent',
                       'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; '
                       'Win64; x64; Trident/4.0)')
    opener = urllib2.build_opener()
    response = opener.open(request)
    results = simplejson.load(response)
    br = []
    # now parse the results and stick them into br
    for res in results['SearchResponse']['Web']['Results']:
        br.append((res.get('Url', 'NA'),
                   res.get('Title', 'NA'),
                   res.get('Description', 'NA')))
    # randomized pause so repeated calls don't hammer the API
    time.sleep(random.uniform(timeout / 2, timeout * 1.5))
    return br
1 hotphrases=term.find({'ehotd':
2 {'$gt':util.parse.makeDateTime(timestamp)-datetime.timedelta(days=-3}
3 }).distinct('term')
4 for phrase in hotphrases:
5 try:
6 bres=util.web.checkBing(phrase,0.8)
7 docrb=[]
8 i=1
9 for (url,title,desc) in bres:
10 dom=util.parse.gettld(url)
11 docrb.append({'term':phrase,
12 'dom':dom,
13 'srchtm':util.parse.makeDate(timestamp),
14 'pos':i,
15 'url':url,
16 'title':title,
17 'desc':desc})
18 i+=1
19 resb.insert(docrb)
20 except:
21 print 'bing error on %s'%phrase
!R
> Tdf<-read.table('qtw-trends-case.csv',header=T,sep=',')
> Tds<-Tdf[,c('term','pop','epc','cat','yahoot10','googlet10','googlemalu','yahoomalu')]
> Tds$term<-as.character(Tds$term)
> summary(Tds)
term pop epc cat
Length:19792 Min. : 0 Min. : 0.0000 Entertainment:5936
Class :character 1st Qu.: 88240 1st Qu.: 0.0500 Sports :2978
Mode :character Median : 184743 Median : 0.1800 Local :2137
Mean : 1417431 Mean : 0.4523 Society : 990
3rd Qu.: 460068 3rd Qu.: 0.6600 Lifestyles : 874
Max. :6448000000 Max. : 9.7200 (Other) :6459
NA's : 4252 NA's :17.0000 NA's : 418
yahoot10 googlet10 googlemalu yahoomalu
Min. : 0.00000 Min. : 0.00000 Min. : 0.00000 Min. : 0.00000
1st Qu.: 0.00000 1st Qu.: 0.00000 1st Qu.: 0.00000 1st Qu.: 0.00000
Median : 0.00000 Median : 0.00000 Median : 0.00000 Median : 0.00000
Mean : 0.02034 Mean : 0.03080 Mean : 0.04734 Mean : 0.02184
3rd Qu.: 0.01053 3rd Qu.: 0.03333 3rd Qu.: 0.00000 3rd Qu.: 0.00000
Max. : 0.60000 Max. : 0.77778 Max. : 4.00000 Max. : 10.00000
NA's :15.00000 NA's :15.00000 NA's :14617.00000 NA's :14617.00000
!R
> Tds$avgt10<-(Tds$googlet10+Tds$yahoot10)/2
> Tds$hasmalu<-Tds$googlemalu>=1|Tds$yahoomalu>=1
> describe(Tds$avgt10)
Tds$avgt10
n missing unique Mean .05 .10 .25 .50 .75 .90 .95
19777 15 2294 0.02557 0.00000 0.00000 0.00000 0.00000 0.03750 0.08834 0.12308
lowest : 0.0000000 0.0003845 0.0004465 0.0005620 0.0006665, highest: 0.4500000 0.4694445 0.5111110 0.5404685 0.5972225
> describe(Tds$hasmalu)
Tds$hasmalu
n missing unique
5175 14617 2
FALSE (4901, 95%), TRUE (274, 5%)
>
!R
> sort(tapply(Tds$avgt10,Tds$cat,mean,na.rm=T),decreasing=T)
Reference Food & Drink Science
0.04587835 0.03726216 0.03656121
Shopping Social Networks & Online Communities News & Current Events
0.03583536 0.03462080 0.02841300
Sports Lifestyles Arts & Humanities
0.02840974 0.02715549 0.02583131
Recreation Internet Local
0.02557728 0.02547137 0.02495497
Entertainment Industries Real Estate
0.02409986 0.02230698 0.02206655
Business Home & Garden Beauty & Personal Care
0.02082178 0.02079487 0.01956457
Travel Society Computers & Electronics
0.01760996 0.01745131 0.01721446
Games Finance & Insurance Health
0.01573290 0.01545625 0.01501522
Telecommunications Automotive Photo & Video
0.01485854 0.01458493 0.01416894
> sort(tapply(Tds$hasmalu,Tds$cat,mean,na.rm=T),decreasing=T)
Social Networks & Online Communities Finance & Insurance Recreation
0.11111111 0.10638298 0.09803922
News & Current Events Lifestyles Science
0.07482993 0.06696429 0.06666667
Sports Telecommunications Local
0.06635071 0.06521739 0.06306306
Society Health Reference
0.06083650 0.05521472 0.05263158
Arts & Humanities Entertainment Computers & Electronics
0.04733728 0.04612299 0.04605263
Automotive Food & Drink Industries
0.04000000 0.03658537 0.03260870
Games Travel Shopping
0.02816901 0.02752294 0.02040816
Beauty & Personal Care Business Home & Garden
0.01785714 0.00000000 0.00000000
Internet Photo & Video Real Estate
0.00000000 0.00000000 0.00000000
# Boxplot of the MFA fraction of top-10 results across deciles of EPC.
png('boxplot-mfa-epc.png')
boxplot(avgt10~cut(epc,10),data=Tds,main='EPC per term ($)', ylab='Fraction of top 10 results MFA',las=2)
dev.off()
# Same plot across deciles of log search popularity (pop > 1 only, so
# log() is well-behaved).
# BUG FIX: the main title here was 'EPC per term ($)', copy-pasted from the
# plot above; this panel bins by log(pop), not EPC.
png('boxplot-mfa-pop.png')
boxplot(avgt10~cut(log(pop),10),data=Tds[Tds$pop>1,],main='Popularity per term (log)', ylab='Fraction of top 10 results MFA',las=2)
dev.off()
!R
> contrasts(Tds$cat) = contr.sum(length(levels(Tds$cat)))
> mfat10<-lm(avgt10~epc+log(pop)+cat,data=Tds[Tds$pop>0,])
> summary(mfat10)
Call:
lm(formula = avgt10 ~ epc + log(pop) + cat, data = Tds[Tds$pop >
0, ])
Residuals:
Min 1Q Median 3Q Max
-0.07674 -0.02514 -0.01182 0.01002 0.55779
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.0932048 0.0017380 53.628 < 2e-16 ***
epc -0.0091048 0.0005687 -16.009 < 2e-16 ***
log(pop) -0.0058897 0.0001601 -36.786 < 2e-16 ***
cat1 -0.0009007 0.0018864 -0.477 0.633035
cat2 -0.0062208 0.0026123 -2.381 0.017259 *
...
cat24 -0.0085300 0.0014548 -5.863 4.61e-09 ***
cat25 -0.0043826 0.0009987 -4.388 1.15e-05 ***
cat26 -0.0008093 0.0033927 -0.239 0.811453
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.04224 on 18825 degrees of freedom
(401 observations deleted due to missingness)
Multiple R-squared: 0.1373, Adjusted R-squared: 0.136
F-statistic: 107 on 28 and 18825 DF, p-value: < 2.2e-16
Table of Contents | t |
---|---|
Exposé | ESC |
Full screen slides | e |
Presenter View | p |
Source Files | s |
Slide Numbers | n |
Toggle screen blanking | b |
Show/hide slide context | c |
Notes | 2 |
Help | h |