#!/usr/bin/perl
# Created by Damon P. Cortesi (http://twitter.com/dacort)
# in honor of my 2000th tweet
#
# Use as you will, but I always appreciate a little credit to
#   http://twitter.com/dacort
#   or
#   http://dcortesi.com/2007/12/27/twitter-stats/
#
# Google Chart implementation by Yoz Grahame (yoz@yoz.com)
#   http://twitter.com/yoz
#
# Enough of that, enjoy!
#
# Usage: <script> <username> <tweets_file>
#
# Downloads any tweets of <username> that are not yet stored in the CSV file
# <tweets_file> (columns: Tweet,Date), appends them, and prints Google Chart
# URLs for tweets per hour / day / month and for the most-addressed users.
#
# NOTE(review): in the copy of this script under review, the HTML tag text
# inside the sed expressions and inside the scraping regex in get_tweets()
# looks like it was stripped (e.g. an empty `//` sed address, and `\1` used
# with no capture group). Those strings are reproduced below exactly as
# found -- confirm them against the upstream script before running.

use strict;
use warnings;

use Date::Calc qw(:all);
use Time::Local;
use POSIX;
use File::Temp qw/ tempfile /;
use HTML::Entities;

my $user = "dacort";

# Raw date format: 2007-11-30T15:53:09+00:00
# Raw CSV row:     "Happy Stripper Friday!!! (And 1,999 tweets)",2007-11-30T15:53:09+00:00

# Chars for Google Chart simple encoding
my @gcachars = split(//, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789");

# Number of tweets assumed per archive page when computing how many pages
# have to be fetched.
my $TWEETS_PER_PAGE = 20;

#
# Variables for the statistics being tracked
#
my %tweetsPerHour;
my %tweetsPerDay;
my %tweetsPerMonth;
my %tweetReplies;
my %tweetAts;
my $statuses_count;
my $csv_tweet_count      = 0;
my $tweet_date_format    = '(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})\+00:00';
my $last_tweet_time      = 0;
my $last_tweet_time_orig = "";

#
# Print help
#
if (@ARGV != 2 || $ARGV[0] =~ /-h/) {
    die "\n\tUsage: $0 <username> <tweets_file>\n\n";
}

#
# Set the user and the tweets file from the input
#
$user = $ARGV[0];
my $tweets_file = $ARGV[1];

# The user name is interpolated into shell commands below, so refuse
# anything that is not a plain word.
$user =~ /\A[A-Za-z0-9_]+\z/
    or die "Invalid Twitter username: $user\n";

#
# Implement proper opts hash
#
my $OPT_FIND_MISSING_TWEETS = 0;

#
# Make sure the tweets file exists
#
if (!-e $tweets_file || -z $tweets_file) {
    print "Creating Tweets File.\n";
    open(my $new_csv, '>', $tweets_file)
        or die "Error opening Tweets ($tweets_file): $!\n";
    print {$new_csv} "Tweet,Date\n";
    close($new_csv)
        or die "Error closing Tweets ($tweets_file): $!\n";
}

#
# I use curl and sed instead of something cleaner like LWP
#
# deal with it
#
# NOTE(review): sed expression reproduced as found; see header note.
my $status_exec = `curl --silent http://twitter.com/$user | sed -n -e 's/.*Updates.*\\(.*\\)<\\/span>.*/\\1/p'`;
$status_exec = '' unless defined $status_exec;
$status_exec =~ tr/,//d;
{
    # Empty or non-numeric output must yield 0 (and the die below),
    # not a warning.
    no warnings 'numeric';
    $statuses_count = int($status_exec);
}
$statuses_count
    or die "Couldn't retrieve current statuses count:\n";

#
# Let's see where we left off
#
my $csv_row_re = qr/.*,($tweet_date_format)$/;

open(my $seen_csv, '<', $tweets_file)
    or die "Error opening Tweets ($tweets_file): $!\n";

# Ignore the header
my $seen_header = <$seen_csv>;

while (my $line = <$seen_csv>) {
    if ($line =~ $csv_row_re) {
        my ($orig_time, $year, $month, $day, $hour, $min, $sec)
            = ($1, $2, $3, $4, $5, $6, $7);
        $csv_tweet_count++;

        # Math to take timegm functionality into account
        my $time = timegm($sec, $min, $hour, $day, $month - 1, $year - 1900);

        # See if this is a later tweet
        if ($time > $last_tweet_time) {
            $last_tweet_time      = $time;
            $last_tweet_time_orig = $orig_time;
        }
    }
    else {
        print "Couldn't read in a line: $line";
        next;
    }
}
close($seen_csv);

print "\nCurrent statuses count: $statuses_count\n";
print "Current tweets downloaded: $csv_tweet_count\n";
print "Time of last tweet downloaded: " . localtime($last_tweet_time) . "\n";

#
# Pull in any tweets we haven't already
#
my $max_archive_page = ceil(($statuses_count - $csv_tweet_count) / $TWEETS_PER_PAGE);
my $stop_first_dupe  = 1;
if ($OPT_FIND_MISSING_TWEETS) {
    $max_archive_page = ceil($statuses_count / $TWEETS_PER_PAGE);
    $stop_first_dupe  = 0;
}

my @new_tweets = get_tweets($max_archive_page, $statuses_count, $stop_first_dupe);

#
# Now we should have all the new tweets
# Open the csv, and append them
#
if (@new_tweets > 0) {
    open(my $append_csv, '>>', $tweets_file)
        or die "Error opening Tweets ($tweets_file): $!\n";
    print {$append_csv} join("\n", @new_tweets) . "\n";
    close($append_csv)
        or die "Error closing Tweets ($tweets_file): $!\n";
}

#
# Initialize our statistic tracking variables
#
$tweetsPerHour{$_} = 0 for 0 .. 23;

# Day_of_Week returns 1 for Monday, 7 for Sunday
$tweetsPerDay{$_} = 0 for 1 .. 7;

$tweetsPerMonth{$_} = 0 for 1 .. 12;

#
# Open up tweets and generate some statistics
#
open(my $stats_csv, '<', $tweets_file)
    or die "Error opening Tweets ($tweets_file): $!\n";

# Ignore the header
my $stats_header = <$stats_csv>;

while (my $line = <$stats_csv>) {
    my ($tweet, $date) = get_line_data($line);

    # get_line_data() has already reported rows it could not split.
    next unless defined $tweet && defined $date;

    #
    # Determine top tweeters
    #
    if ($tweet =~ /^@(\w+)/) {
        $tweetReplies{ lc($1) }++;
    }

    #
    # Determine all conversations
    #
    my @conversations = $tweet =~ /\s@(\w+)/g;
    $tweetAts{ lc($_) }++ for @conversations;

    #
    # Perform date statistics
    #
    my @utc_parts = $date =~ /(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})\+.*/;
    if (!@utc_parts) {
        # Skip the row (as the first pass over the CSV does) instead of
        # ending the whole run on one bad date.
        print "Couldn't read in a line: $date\n";
        next;
    }

    my ($utc_year, $utc_month, $utc_day, $utc_hours, $utc_mins, $utc_secs) = @utc_parts;
    my $TimeInSeconds = timegm($utc_secs, $utc_mins, $utc_hours,
                               $utc_day, $utc_month - 1, $utc_year - 1900);

    # convert the UTC seconds to broken-down time in the local time zone
    my ($secs, $mins, $hours, $day, $month, $year) = localtime($TimeInSeconds);
    $month += 1;
    $year  += 1900;

    $tweetsPerHour{ int($hours) }++;
    $tweetsPerDay{ Day_of_Week($year, $month, $day) }++;
    $tweetsPerMonth{ int($month) }++;
}
close($stats_csv);

#
# Now loop through our stats and print them as chart URLs
#
print "---- Tweets per Hour ----\n";
output_data(\%tweetsPerHour, [qw(0 3 6 9 12 15 18 21 23)], "&cht=lc&cmh=s&chxt=x,y,r");

print "\n---- Tweets per Day ----\n";
output_data(\%tweetsPerDay, [qw(Mon Tue Wed Thu Fri Sat Sun)], "&cht=lc&cmh=s&chxt=x,y,r");

print "\n---- Tweets per Month ----\n";
output_data(\%tweetsPerMonth, [qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)], "&chxt=x,y,r&cht=lc&cmh=s");

#
# Stuff top replies into a hash that can be used by output_data
#
my ($topTweetReplies, $topTweetRepliers) = _top_counts(\%tweetReplies, 10);

print "\n---- Top \@replies ----\n";
output_data($topTweetReplies, [reverse @$topTweetRepliers], "&chxt=y,x,t&cht=bhs&chxs=0,0000dd,9");

#
# Stuff top @'s into hash for output_data
#
my ($topTweetAts, $topTweetRecips) = _top_counts(\%tweetAts, 10);

print "\n---- Top overall \@'s ----\n";
output_data($topTweetAts, [reverse @$topTweetRecips], "&chxt=y,x,t&cht=bhs&chxs=0,0000dd,9");

print "\n\n";

# _top_counts(\%counts, $limit)
#
# Ranks the keys of %counts by descending value and keeps at most $limit.
#
# Returns two references:
#   - a hash mapping rank (0 = highest count) to the count
#   - an array of the corresponding keys, in rank order
sub _top_counts {
    my ($counts, $limit) = @_;

    my %count_at_rank;
    my @names;
    my $rank = 0;

    foreach my $name (sort { $counts->{$b} <=> $counts->{$a} } keys %$counts) {
        $count_at_rank{ $rank++ } = $counts->{$name};
        push @names, $name;
        last if $rank >= $limit;
    }

    return (\%count_at_rank, \@names);
}

# output_data(\%data, \@labels, $chartextras)
#
# Prints (without a trailing newline) a Google Chart URL for one data series.
#
#   \%data        numeric key => count; values are plotted in ascending
#                 numeric key order
#   \@labels      labels for axis 0
#   $chartextras  extra query-string parameters, inserted right after the
#                 chart size
#
# Values are scaled against the largest count and written in the chart
# API's "simple encoding" using @gcachars.
sub output_data {
    my ($t, $l, $chartextras) = @_;
    my %tweets = %$t;
    my @labels = @$l;
    $chartextras = '' unless defined $chartextras;

    my @data;
    my $max = 0;
    foreach my $key (sort { $a <=> $b } keys %tweets) {
        push @data, $tweets{$key};
        $max = $tweets{$key} if $tweets{$key} > $max;
    }

    # encode data into GCA simple encoding; when every count is zero there
    # is nothing to scale against, so each point takes the lowest symbol
    # instead of dividing by zero
    my $top_index = $#gcachars;
    my $gcadata   = join("",
        map { $gcachars[ $max ? int($top_index * ($_ / $max)) : 0 ] } @data);

    # x & y axis labels
    my $gcalabels = "0:|" . join("|", @labels) . "|1:||$max|2:||$max";

    my $url = "http://chart.apis.google.com/chart?chs=300x300$chartextras";
    $url .= "&chco=ff0000,00ff00,0000ff,ffff00,00ffff,ff00ff";
    $url .= "&chxl=$gcalabels&chd=s:$gcadata";
    print $url;
}

# get_line_data($line)
#
# Extract the tweet and the date separately from one CSV row, unquoting the
# tweet if necessary. When called without an argument the row is taken from
# $_.
#
# Returns ($tweet, $date), or an empty list (after printing a diagnostic)
# when the row cannot be split.
sub get_line_data {
    my $line = @_ ? shift : $_;

    if (defined $line) {
        if ($line =~ /"(.*)",(\d{4}.*)/) {
            my ($tweet, $date) = ($1, $2);
            $tweet =~ s/""/"/g;
            return ($tweet, $date);
        }
        if ($line =~ /(.*),(\d{4}.*)/) {
            return ($1, $2);
        }
    }
    else {
        $line = '';
    }

    print "UH, why didn't I find somethin?\n$line\n";
    return;
}

# quote_value($value)
#
# If a value has quotation marks or a comma in it, double the quotation
# marks and wrap the value in quotes. Returns the (possibly quoted) value.
sub quote_value {
    my $value = shift;

    if ($value =~ /[",]/) {
        $value =~ s/"/""/g;
        $value = "\"$value\"";
    }

    return $value;
}

# get_tweets($max_page, $current_statuses, $fail_on_first_dupe)
#
# First parameter:  Maximum page
# Second parameter: Current statuses
# Third parameter:  Stop on first duplicate
#
# Returns: New tweets in an array, each element a ready-to-append CSV row
# ("tweet,date").
#
# Uses a temporary file to store the downloaded twitter pages in. Reads the
# file-level $user, $tweets_file and $last_tweet_time_orig.
sub get_tweets {
    my ($max_page, $current_statuses, $fail_on_first_dupe) = @_;

    # If we're not supposed to fail on the first dupe
    # that means we want to loop through all of our tweets
    # We'll need to read in the tweet_file quickly and
    # build a lookup of the times already recorded
    my %csv_tweets;
    if (not $fail_on_first_dupe) {
        open(my $known_csv, '<', $tweets_file)
            or die "Error opening Tweets ($tweets_file): $!\n";
        my $header = <$known_csv>;
        while (my $line = <$known_csv>) {
            my ($tweet, $date) = get_line_data($line);
            next unless defined $date;
            $csv_tweets{$date} = $tweet;
        }
        close($known_csv);
    }

    # Create a temporary file for downloaded tweets; $fh is only held so
    # the file stays around until the program exits (UNLINK => 1).
    my ($fh, $filename) = tempfile("tweets-tmp-XXXX", UNLINK => 1);
    my @downloaded_tweets;

    # Regex for extracting tweets
    # NOTE(review): reproduced as found; see header note.
    my $pattern = '^\s*?]+>\s+(.*?)\s*?$';

    # Set once no further pages need to be fetched
    my $finished = 0;

    for my $page (1 .. $max_page) {
        # Current position in time array
        my $time_index = 0;

        print "Extracting page $page (of $max_page)\n";
        system("curl --silent 'http://twitter.com/$user?page=$page' > $filename") == 0
            or warn "curl failed for page $page\n";

        # NOTE(review): both sed expressions reproduced as found; see
        # header note.
        my $tweets = `cat $filename | sed -n '// p'`;
        $tweets = '' unless defined $tweets;
        my @times = split("\n", `cat $filename | sed -n -e 's/.*.*<\\/abbr>.*/\\1/p'`);

        # Extract the last tweet from the top of the page
        # Assumption: this is not a duplicate tweet, nor have we saved it before
        if (1 == $page) {
            # NOTE(review): this sed expression spans a line break in the
            # copy under review; reproduced as found, see header note.
            my $first_tweet = textify(`cat $filename | sed -n 's/.*

\\(.*\\)<\\/p>.*/\\1/p'`);
            my $first_time = defined $times[0] ? $times[0] : '';
            push @downloaded_tweets, "$first_tweet,$first_time";
            $time_index++;
        }

        # properly extract the tweets
        while ($tweets =~ s/($pattern)//msx) {
            # Convert our tweet into plain text
            # Remove HTML, then decode HTML, then csv quote (if necessary)
            my $tweet      = textify($2);
            my $tweet_time = defined $times[$time_index] ? $times[$time_index] : '';

            # Make sure we haven't already recorded this tweet
            if ($fail_on_first_dupe && $last_tweet_time_orig eq $tweet_time) {
                print "Encountered a duplicate tweet, breaking loop\n";
                $finished = 1;
                last;
            }
            elsif (!$fail_on_first_dupe && exists $csv_tweets{$tweet_time}) {
                # Skip to the next tweet - we've already seen this one
                $time_index++;
                next;
            }

            push @downloaded_tweets, "$tweet,$tweet_time";
            $time_index++;

            # If we're not stopping at the first dupe
            # make sure we're not pulling pages unnecessarily
            if (!$fail_on_first_dupe
                && (scalar keys %csv_tweets) + scalar(@downloaded_tweets) == $current_statuses)
            {
                $finished = 1;
                last;
            }
        }

        print " " . scalar(@downloaded_tweets) . " new tweets\n";
        last if $finished;
    }

    return @downloaded_tweets;
}

# textify($tweet_html)
#
# First parameter: Tweet HTML
# Returns: Plain-text representation, CSV-quoted when necessary. An
# undefined argument is treated as the empty string.
sub textify {
    my $tweet = shift;
    $tweet = '' unless defined $tweet;

    $tweet =~ s/<(?:[^>'"]*|(['"]).*?\1)*>//gs;    # Strip HTML
    $tweet =~ s/\n/ /g;                            # Strip newlines
    $tweet = quote_value(decode_entities($tweet)); # Decode html entities, then csv quote

    return $tweet;
}