Morrie the Toupee Salesman

By Owen Byrne

Morrie the Toupee Salesman header image 2

The Code

May 7th, 2008 · No Comments

Here’s the code for those images. Having something of a scientific background, I’m inclined to think that I’m stepping ahead of those other graphers I mentioned by exposing my methodology (ha!) to public scrutiny. Open source is really just “peer review” in disguise. The program writes its data to stdout, which can be piped to a file, and creates the 3 plots as PNGs. As the comments say, it needs a little more work to handle the DST switch properly. BeautifulSoup is probably overkill, since I only parse one attribute out of one tag. The program takes a couple of hours to run, mostly because of the need to be polite: it calls the Digg servers 672 times with 15-second delays in between.

from BeautifulSoup import BeautifulStoneSoup
from rpy import *

import os
import urllib
import datetime
import time
import sys
# Application key sent as the `appkey` query parameter on every Digg API
# request. It is a URL identifying this application, percent-encoded with
# safe='' so that '/' is escaped too. Please change it to your own.
digg_appkey = urllib.quote('http://example.com/example', safe='')

class AppURLopener(urllib.FancyURLopener):
    """URL opener that replaces urllib's default user-agent string."""
    # userAgent string (please change)
    version = 'FillYourBots'

# San Francisco is the center of the universe
# Pin the process timezone to US/Pacific so the hour/weekday buckets computed
# below are in Pacific time. The TZ assignment must come before tzset(), which
# makes the time module pick up the new value.
os.environ['TZ'] = 'US/Pacific'
time.tzset()

# Install the custom opener so that urllib.urlopen() uses AppURLopener
# (and therefore its user-agent string) for every request.
urllib._urlopener = AppURLopener()
# Base query URLs; appkey and the submit-date range are appended per request.
# /stories is used for the "upcoming" count, /stories/popular for "promoted".
endpoint_upcoming = 'http://digg.com/tools/services?endPoint=/stories&type=xml'
endpoint_promoted = 'http://digg.com/tools/services?endPoint=/stories/popular&type=xml'
# it's currently May, so we're dealing with DST for the next few months.
# this will need updating before the times switch (the hourly stepping uses
# naive local datetimes).
#
# Sample window: exactly 4 weeks of whole hours (30 days ago to 2 days ago).
# Both bounds are derived from the same hour-aligned "now", so the window is
# exactly 28 * 24 = 672 one-hour intervals and every (weekday, hour) bucket
# receives the same number of samples. Previously end_date kept its minutes
# and seconds, which let the loop run one extra, partial-window iteration.
_now = datetime.datetime.now()
# Zero out minutes, seconds and microseconds
_this_hour = datetime.datetime(_now.year, _now.month, _now.day, _now.hour, 0, 0)
start_date = _this_hour - datetime.timedelta(days=30)
# 2 days in the past to allow for promotion
end_date = _this_hour - datetime.timedelta(days=2)

# Story counters, filled in by the fetch loop.
#   hourly_*  : indexed by hour of day (0-23)
#   dayhour_* : indexed by [weekday][hour], weekday 0=Mon .. 6=Sun
#   day_*     : indexed by weekday
# The *_prom_* lists count promoted stories, the others count all stories.
# Each name gets its OWN list: a chained assignment (a = b = [...]) would bind
# both names to the same object, so totals and promoted counts would be added
# into one list and every ratio would come out as 100%.
hourly_totals = [0] * 24
hourly_prom_totals = [0] * 24
dayhour_totals = [[0] * 24 for _ in range(7)]
dayhour_prom_totals = [[0] * 24 for _ in range(7)]
day_totals = [0] * 7
day_prom_totals = [0] * 7

def _fetch_story_total(endpoint, min_submit_date, max_submit_date):
    """Return the story count Digg reports for one submit-date window.

    Requests `endpoint` with the application key and the given
    min/max submit dates (epoch seconds) and returns the integer value of
    the `total` attribute of the response's <stories> element.
    """
    url = '%s&appkey=%s&min_submit_date=%d&max_submit_date=%d' % (
        endpoint, digg_appkey, min_submit_date, max_submit_date)
    response = urllib.urlopen(url)
    try:
        body = response.read()
    finally:
        # Release the connection promptly instead of waiting for GC.
        response.close()
    return int(BeautifulStoneSoup(body).stories['total'])


# Walk the sample window one hour at a time, querying both endpoints for each
# hour and accumulating the counts per hour, per weekday and per weekday/hour.
while start_date < end_date:
    # One-hour window as epoch seconds
    interval_start = time.mktime(start_date.timetuple())
    interval_end = interval_start + 3600

    # Exactly one request per endpoint per hour (be polite to the servers).
    upcoming = _fetch_story_total(endpoint_upcoming, interval_start, interval_end)
    promoted = _fetch_story_total(endpoint_promoted, interval_start, interval_end)

    # Percentage of the hour's submissions that got promoted; an hour with no
    # submissions at all counts as 0% rather than dividing by zero.
    if upcoming:
        pop_percent = float(promoted) / float(upcoming) * 100
    else:
        pop_percent = 0.0

    data_line = (start_date.strftime('%Y/%m/%d %H:%M'),
                 upcoming,
                 promoted,
                 pop_percent,
                 )
    # Progress goes to stderr, the data tuple to stdout (which may be piped).
    sys.stderr.write('Date: %s Total Upcoming: %d Total Promoted: %d %12.10f\n' % data_line)
    print(data_line)

    # need hour and weekday in PST/PDT
    hour = start_date.hour
    # isoweekday - 1 means 0=Mon, 6=Sun
    weekday = start_date.isoweekday() - 1
    print('%s %s %s' % (weekday, hour, interval_start))
    hourly_totals[hour] += upcoming
    hourly_prom_totals[hour] += promoted
    dayhour_totals[weekday][hour] += upcoming
    dayhour_prom_totals[weekday][hour] += promoted
    day_totals[weekday] += upcoming
    day_prom_totals[weekday] += promoted
    start_date = start_date + datetime.timedelta(hours=1)
    # be polite
    time.sleep(15)

# Report and plot the promoted percentage ("POP") for each hour of the day.
print('Hourly Totals:')
hourly_pop = []
for hr in range(24):
    promoted_count = hourly_prom_totals[hr]
    total_count = hourly_totals[hr]
    # An hour bucket with no stories is reported as 0% instead of raising
    # ZeroDivisionError.
    if total_count:
        pop_percent = float(promoted_count) / total_count * 100
    else:
        pop_percent = 0.0
    hourly_pop.append(pop_percent)
    # columns: hour, promoted, total, percentage
    print('%s %s %s %s' % (hr, promoted_count, total_count, pop_percent))
# do some plotting
hourly_outfile = 'hours.png'
r.bitmap(hourly_outfile, res=200)
# One bar per hour, indexed by hour (not by the counter values).
xlabels = ["%d" % (hr,) for hr in range(24)]
r.barplot(hourly_pop, xlab="Hour", ylab="POP (%)", names_arg=xlabels, ylim=(0, 1.5),
          main="Digg.com POP% By Hour (Pacific Time)")

# Report and plot the promoted percentage ("POP") for each weekday.
print('Daily Totals:')
daily_pop = []
for day_index in range(7):
    total_count = day_totals[day_index]
    promoted_count = day_prom_totals[day_index]
    # A weekday bucket with no stories is reported as 0% instead of raising
    # ZeroDivisionError.
    if total_count:
        pop_percent = float(promoted_count) / total_count * 100
    else:
        pop_percent = 0.0
    daily_pop.append(pop_percent)
    # columns: weekday (0=Mon), total, promoted, percentage
    print('%s %s %s %s' % (day_index, total_count, promoted_count, pop_percent))
print(day_totals)
print(day_prom_totals)
r.bitmap('days.png', res=200)
# One bar per weekday, indexed by weekday (not by the counter values).
xlabels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
r.barplot(daily_pop, xlab="Day", ylab="POP (%)", names_arg=xlabels, ylim=(0, 1.5),
          main="Digg.com POP% By Day (Pacific Time)")

# Report and plot the promoted percentage ("POP") for every weekday/hour
# combination (7 * 24 = 168 bars, Monday 00:00 first).
print('Day-hour Totals:')
y = []
for i in range(7):
    for j in range(24):
        promoted_count = dayhour_prom_totals[i][j]
        total_count = dayhour_totals[i][j]
        # A bucket with no stories is reported as 0% instead of raising
        # ZeroDivisionError.
        if total_count:
            pop_percent = float(promoted_count) / total_count * 100
        else:
            pop_percent = 0.0
        # columns: weekday (0=Mon), hour, promoted, total, percentage
        print('%s %s %s %s %s' % (i, j, promoted_count, total_count, pop_percent))
        y.append(pop_percent)

r.bitmap('dayhours.png', res=200)
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
# Labels are generated in the same day-major order as the values in y.
xlabels = ['%s %2d:00' % (days[i], j) for i in range(7) for j in range(24)]
r.barplot(y, xlab="Day, Hour", ylab="POP (%)", names_arg=xlabels, ylim=(0, 2.5),
          main="Digg.com POP% By Day/Hour (Pacific Time)")

Tags: Uncategorized

0 responses so far ↓

  • There are no comments yet... Kick things off by filling out the form below.

Leave a Comment