#!/usr/bin/perl

# Created by Damon P. Cortesi (http://twitter.com/dacort)
# in honor of my 2000th tweet
#
# Use as you will, but I always appreciate a little credit to
#  http://twitter.com/dacort
#  or
#  http://dcortesi.com/2007/12/27/twitter-stats/
#
# Google Chart implementation by Yoz Grahame (yoz@yoz.com)
# http://twitter.com/yoz
#
# Enough of that, enjoy!

use Date::Calc qw(:all);
use Time::Local;
use POSIX;
use File::Temp qw/ tempfile /;
use HTML::Entities;

# Account to analyse.  This default is overwritten with the first
# command-line argument before it is ever used.
my $user = "dacort";

# Shell one-liner that was used to try out the CSV line regex:
#head -n 2 tweets.csv | tail -n 1 | perl -e '$tw = qq("Happy Stripper Friday!!! (And 1,999 tweets)",2007-11-30T15:53:09+00:00); if ($tw =~ /"(.*)",(\d{4}.*)/) { print $1." - ".$2 }'

# Alphabet for Google Chart simple encoding: A-Z, a-z, 0-9 (62 characters).
# Deliberately a package variable; output_data reads it.
@gcachars = ('A' .. 'Z', 'a' .. 'z', '0' .. '9');

#
# Counters filled in while the tweets file is scanned
#
my (%tweetsPerHour, %tweetsPerDay, %tweetsPerMonth);   # keyed by hour, weekday number, month number
my (%tweetReplies, %tweetAts);                         # keyed by lower-cased user name

# Number of statuses reported by the profile page / number of tweets in the CSV
my ($statuses_count, $csv_tweet_count) = (undef, 0);

# Timestamp layout used in the CSV (always +00:00), with one capture per field
my $tweet_date_format = '(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})\+00:00';

# Newest tweet found in the CSV: as epoch seconds and as the original string
my ($last_tweet_time, $last_tweet_time_orig) = (0, "");
my $time;

#
# Print help
#
# Exactly two arguments are required.  A first argument that starts with
# -h / --h (e.g. -h, --help) also prints the usage text.  The usage line
# names the script via $0; $ARGV[0] would be empty or the user's own
# first argument.
((@ARGV != 2) || ($ARGV[0] =~ /\A--?h/)) and
	die "\n\tUsage: $0 <twitter_username> <tweets.csv>\n\n";

#
# Set the user name and the tweets file from the input
#
$user = $ARGV[0];
$tweets_file = $ARGV[1];

#
# TODO: implement a proper opts hash
#
# When true, every archive page is scanned and tweets already present in the
# CSV are skipped instead of ending the download at the first duplicate.
$OPT_FIND_MISSING_TWEETS = 0;

#
# Make sure the tweets file exists
#
# A missing or zero-length file is (re)created with just the CSV header, so
# the later passes can always skip exactly one header line.
if( not -e $tweets_file or -z $tweets_file ) {
  print "Creating Tweets File.\n";
  # Three-argument open: the name comes from the command line and must never
  # be interpreted as part of the open mode.
  open(my $new_csv, '>', $tweets_file) || die "Error opening Tweets ($tweets_file): $!\n";
  print {$new_csv} "Tweet,Date\n";
  # Buffered write errors only show up when the handle is closed
  close($new_csv) || die "Error writing Tweets ($tweets_file): $!\n";
}
	
#
# I use curl and sed instead of something cleaner like LWP
#
# deal with it
#
# $user is pasted into shell command lines (here and when the archive pages
# are fetched), so refuse anything that is not a plain word before the
# first command is built.
$user =~ /\A[A-Za-z0-9_]+\z/ or die "Invalid twitter username: $user\n";

# Fetch the profile page and let sed print the contents of the
# "stats_count numeric" span on the line that mentions "Updates".
my $status_exec = `curl --silent http://twitter.com/$user | sed -n -e 's/.*Updates.*<span class=\"stats_count numeric\">\\(.*\\)<\\/span>.*/\\1/p'`;
# Drop thousands separators ("1,999" -> "1999")
$status_exec =~ tr/,//d;
# int() gives 0 for empty or non-numeric output, which triggers the die
# (a genuine count of 0 is treated the same way)
$statuses_count = int($status_exec) || die "Couldn't retrieve current statuses count:\n";

#
# Let's see where we left off
#
# First pass over the CSV: count the stored tweets and remember the newest
# timestamp, both as epoch seconds and in its original spelling.
open(my $csv_in, '<', $tweets_file) || die "Error opening Tweets ($tweets_file): $!\n";

# Ignore the header
my $header = <$csv_in>;

while (my $line = <$csv_in>) {
  # The timestamp is whatever follows the last comma on the line
  my ($orig_time, $year, $month, $day, $hour, $min, $sec) =
    $line =~ /.*,($tweet_date_format)$/o;

  if (not defined $orig_time) {
    print "Couldn't read in a line: $line";
    next;
  }

  $csv_tweet_count++;

  # timegm wants a zero-based month and the year as an offset from 1900
  $time = timegm($sec,$min,$hour,$day,$month-1,$year-1900);

  # See if this is a later tweet
  if ($time > $last_tweet_time) {
    $last_tweet_time = $time;
    $last_tweet_time_orig = $orig_time;
  }
}
close($csv_in);

print "\nCurrent statuses count: $statuses_count\n";
print "Current tweets downloaded: $csv_tweet_count\n";
# With nothing parsed, $last_tweet_time is still 0; show that plainly
# instead of printing the epoch as if it were a tweet time.
print "Time of last tweet downloaded: "
    .($last_tweet_time > 0 ? localtime($last_tweet_time) : "never")."\n";

#
# Pull in any tweets we haven't already
#
# Number of archive pages to request.  The divisor 20 is presumably the
# number of tweets per archive page -- TODO confirm.
my $max_archive_page = ceil(($statuses_count - $csv_tweet_count)/20);
my $stop_first_dupe = 1;
if($OPT_FIND_MISSING_TWEETS) {
  # Look at every page and do not stop at the first tweet we already have
  $max_archive_page = ceil($statuses_count/20);
  $stop_first_dupe = 0;
}
my @new_tweets = get_tweets($max_archive_page, $statuses_count, $stop_first_dupe);

# 
# Now we should have all the new tweets
# Open the csv, and append them
#
if (@new_tweets) {
  # Three-argument open keeps the command-line file name out of the mode
  open(my $csv_out, '>>', $tweets_file) || die "Error opening Tweets ($tweets_file): $!\n";
  print {$csv_out} join("\n", @new_tweets)."\n";
  # Buffered write errors only show up when the handle is closed
  close($csv_out) || die "Error writing Tweets ($tweets_file): $!\n";
}
	
#
# Initialize our statistic tracking variables
#
# Seed every bucket with zero so that hours, weekdays and months without
# any tweet still show up in the output.
$tweetsPerHour{$_} = 0 for 0 .. 23;

# Weekday keys follow Day_of_Week: 1 is Monday, 7 is Sunday
$tweetsPerDay{$_} = 0 for 1 .. 7;

$tweetsPerMonth{$_} = 0 for 1 .. 12;
	
#
# Open up tweets and generate some statistics
#
# Second pass over the CSV: count replies, mentions and tweets per
# hour / weekday / month (in the local time zone).
open(my $stats_in, '<', $tweets_file) || die "Error opening Tweets ($tweets_file): $!\n";

# Ignore the header
my $blah = <$stats_in>;

# Raw date format: 2007-11-30T15:53:09+00:00
# Raw String: "Happy Stripper Friday!!! (And 1,999 tweets)",2007-11-30T15:53:09+00:00

# The current line stays in $_, which is what get_line_data() is called with.
while (<$stats_in>) {

  my ($tweet, $date) = get_line_data($_);
  $tweet = "" unless defined $tweet;
  $date  = "" unless defined $date;

  #
  # Validate the date first.  A line that cannot be parsed is reported and
  # skipped (as in the first pass over the file) before any counter is
  # touched, instead of ending the whole run with a success status.
  #
  my ($year, $month, $day, $hours, $mins, $secs);
  if ($date =~ /(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})\+.*/) {
    # timegm wants a zero-based month and the year as an offset from 1900
    my $utc_seconds = timegm($6, $5, $4, $3, $2 - 1, $1 - 1900);
    # convert the UTC seconds to broken-down time in the local time zone
    ($secs,$mins,$hours,$day,$month,$year) = localtime($utc_seconds);
    $month += 1;
    $year += 1900;
  } else {
    print "Couldn't read in a line: $_";
    next;
  }

  #
  # Determine top tweeters: a tweet that starts with @name is a reply
  #
  if ($tweet =~ /^@(\w+)/) {
    $tweetReplies{lc($1)}++;
  }

  #
  # Determine all conversations: every @name preceded by whitespace
  #
  foreach my $username ($tweet =~ /\s@(\w+)/g) {
    $tweetAts{lc($username)}++;
  }

  #
  # Perform date statistics
  #
  $tweetsPerHour{int($hours)}++;
  $tweetsPerDay{Day_of_Week($year,$month,$day)}++;
  $tweetsPerMonth{int($month)}++;

}
close($stats_in);

#
# Now loop through our stats and print a Google Chart URL for each
#
# One chart per time bucket.  Each row holds: heading, counters, axis
# labels, extra chart parameters.  Only the first heading has no leading
# blank line, because output_data ends its URL without a newline.
my @time_charts = (
  [ "---- Tweets per Hour ----\n", \%tweetsPerHour,
    [qw(0 3 6 9 12 15 18 21 23)],
    "&cht=lc&cmh=s&chxt=x,y,r" ],
  [ "\n---- Tweets per Day ----\n", \%tweetsPerDay,
    [qw(Mon Tue Wed Thu Fri Sat Sun)],
    "&cht=lc&cmh=s&chxt=x,y,r" ],
  [ "\n---- Tweets per Month ----\n", \%tweetsPerMonth,
    [qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)],
    "&chxt=x,y,r&cht=lc&cmh=s" ],
);

foreach my $chart (@time_charts) {
  my ($heading, $counts, $labels, $extras) = @$chart;
  print $heading;
  output_data($counts, $labels, $extras);
}

#
# Stuff top replies into a hash that can be used by output_data
#
# Users this account replied to most, busiest first, cut off after ten.
my @topTweetRepliers =
  sort { $tweetReplies{$b} <=> $tweetReplies{$a} } keys %tweetReplies;
$#topTweetRepliers = 9 if @topTweetRepliers > 10;

# output_data orders its data by numeric hash key, so the count for rank N
# is stored under key N (0 = most replies).
my %topTweetReplies;
@topTweetReplies{0 .. $#topTweetRepliers} = @tweetReplies{@topTweetRepliers};

print "\n---- Top \@replies ----\n";
# The labels are passed in reverse rank order; presumably the bar chart
# lists its axis labels from the bottom up -- TODO confirm.
output_data(\%topTweetReplies, [reverse @topTweetRepliers], "&chxt=y,x,t&cht=bhs&chxs=0,0000dd,9");

#
# Same again for every @mention, not only the replies
#
my @topTweetRecips =
  sort { $tweetAts{$b} <=> $tweetAts{$a} } keys %tweetAts;
$#topTweetRecips = 9 if @topTweetRecips > 10;

my %topTweetAts;
@topTweetAts{0 .. $#topTweetRecips} = @tweetAts{@topTweetRecips};

print "\n---- Top overall \@'s ----\n";
output_data(\%topTweetAts, [reverse @topTweetRecips], "&chxt=y,x,t&cht=bhs&chxs=0,0000dd,9");

print "\n\n";

# Print a Google Chart URL for one set of counters.  The URL goes to the
# currently selected output handle and is not followed by a newline.
#
# First parameter: hash ref of counts; the keys are sorted numerically to
#                  fix the order of the data points
# Second parameter: array ref of labels for the first axis
# Third parameter: extra query-string parameters, appended verbatim
#
# Reads the file-level @gcachars (alphabet of the simple encoding).
sub output_data {
  my ($t,$l,$chartextras) = @_;
  my @data;
  my $max;

  foreach my $key (sort {$a <=> $b} keys %$t) {
    my $count = $t->{$key};
    push @data, $count;
    $max = $count if !defined $max || $count > $max;
  }

  # encode data into GCA simple encoding: every value is scaled against the
  # maximum and mapped to a character.  When all counts are zero there is
  # nothing to scale by, so every point gets the lowest character instead
  # of dividing by zero.
  my $top_index = $#gcachars;
  my $gcadata = join("",
    map { $gcachars[ $max ? int($top_index * ($_/$max)) : 0 ] } @data);

  # x & y axis labels; with no data at all the maximum is simply left blank
  my $max_label = defined $max ? $max : "";
  my $gcalabels = "0:|".join("|",@$l)."|1:||$max_label|2:||$max_label";

  my $url = "http://chart.apis.google.com/chart?chs=300x300$chartextras";
  $url .= "&chco=ff0000,00ff00,0000ff,ffff00,00ffff,ff00ff";
  $url .= "&chxl=$gcalabels&chd=s:$gcadata";

  print $url;
}

# Extract the tweet and the date separately from one line of the CSV.
#
# First parameter: the raw line; when called without arguments $_ is used
# Returns: (tweet, date) with the tweet unquoted if necessary, or an empty
#          list (after printing a complaint) when the line does not look
#          like "<tweet>,<date starting with four digits>"
sub get_line_data {
  my $line = @_ ? shift : $_;

  # Results live in lexicals, so a line that fails to parse can never hand
  # back the values of the previous call.
  my ($tweet, $date);

  if ($line =~ /"(.*)",(\d{4}.*)/) {
    # Quoted field: strip the outer quotes and undo the doubled quotes
    ($tweet, $date) = ($1, $2);
    $tweet =~ s/""/"/g;
  } elsif ($line =~ /(.*),(\d{4}.*)/) {
    ($tweet, $date) = ($1, $2);
  } else {
    print "UH, why didn't I find somethin?\n$line\n";
    return;
  }

  return ($tweet, $date);
}

# CSV-quote a field when it needs it.
#
# First parameter: the field text
# Returns: the text unchanged unless it contains a quotation mark or a
#          comma; in that case embedded quotation marks are doubled and the
#          whole value is wrapped in quotation marks
sub quote_value {
  my ($value) = @_;

  # Nothing that could confuse a CSV reader: hand the text back as it is
  return $value unless $value =~ /[",]/;

  $value =~ s/"/""/g;
  return '"' . $value . '"';
}


# Download the user's archive pages with curl and scrape the tweets that are
# not stored in the CSV file yet.
#
# Besides its parameters the sub reads the file-level $user, $tweets_file
# and $last_tweet_time_orig.
sub get_tweets {
  # First parameter: Maximum page (pages 1 .. max are requested)
  # Second parameter: Current statuses count; only used to stop early when
  #                   not stopping on the first duplicate
  # Third parameter: Stop on first duplicate
  #
  # Returns: New tweets in an array, each element a "tweet,time" CSV line
  # Use a temporary file to store the downloaded twitter pages in
  my $max_page = shift;
  my $current_statuses = shift;
  my $fail_on_first_dupe = shift;
  
  # If we're not supposed to fail on the first dupe
  # that means we want to loop through all of our tweets
  # We'll need to read in the tweet_file quickly and
  # build a hash of the known tweets, keyed by their date string
  my %csv_tweets;
  if (not $fail_on_first_dupe) {
    open(TWEETS, "$tweets_file") || die "Error opening Tweets ($tweets_file): $!\n";
    # Skip the header line
    my $blah = <TWEETS>;
    while (<TWEETS>) {
      my ($tweet, $date) = get_line_data($_);
      $csv_tweets{$date} = $tweet;
    }
    close(TWEETS);
  }

  # Create a temporary file for downloaded tweets
  # Only the file name is used below (curl writes to it through a shell
  # redirect); UNLINK => 1 asks File::Temp to delete the file at exit.
  my ($fh, $filename) = tempfile("tweets-tmp-XXXX", UNLINK => 1);
  my @downloaded_tweets;

  for(my $i=1; $i<=$max_page; $i++) {
    
    # Regex for extracting tweets: one <span ...> ... </span> element; the
    # capture is the text between the tags
    my $pattern = '^\s*?<span[^>]+>\s+(.*?)\s*?</span>$';
    
    # Current position in time array
    my $time_index = 0;

    #print "Extracting tweets ".$statuses_count*$i." - ".$statuses_count*$i-20." \n";
    print "Extracting page $i (of $max_page)\n";
    # Fetch page $i into the temp file, then pull out two things with sed:
    # the entry-content spans (the tweets) and the title attribute of every
    # <abbr> element (the timestamps).
    # NOTE(review): $user and $filename are interpolated into shell commands.
    `curl --silent 'http://twitter.com/$user?page=$i' > $filename`;
    my $tweets = `cat $filename | sed -n '/<span .*entry-content/,/<\\/span>/ p'`;
    my @times = split("\n",`cat $filename | sed -n -e 's/.*<abbr.*title="*\\(.*\\)">.*<\\/abbr>.*/\\1/p'`);
    
    # Extract the last tweet from the top of the page
    # (it is taken from a <p ... entry-content"> element rather than a span)
    # Assumption: this is not a duplicate tweet, nor have we saved it before
    # Yes, I realize I do a useless comparison for most of the loop
    # and yes, I'm not happy about it.
    # NOTE(review): this tweet is pushed without any duplicate check, in
    # both modes -- confirm the assumption above holds for all callers.
    if (1 == $i ) {
      my $first_tweet = textify(`cat $filename | sed -n 's/.*<p .*entry-content">\\(.*\\)<\\/p>.*/\\1/p'`);
      my ($first_time) = split("\n",`cat $filename | sed -n -e 's/.*<abbr.*title="*\\(.*\\)">.*<\\/abbr>.*/\\1/p'`);
      push @downloaded_tweets, "$first_tweet,$first_time";
      # $times[0] belongs to the tweet just stored
      $time_index++;
    }

    # properly extract the tweets
    # Each pass cuts the next span out of $tweets; $1 is the whole match
    # (the extra parentheses), $2 the capture inside $pattern.
    while( $tweets =~ s/($pattern)//msx ) {
      my $tweet = textify($2);
      # Make sure we haven't already recorded this tweet
      # btw, I think this logic sucks.
      
      if ($fail_on_first_dupe && $last_tweet_time_orig eq $times[$time_index]){
        print "Encountered a duplicate tweet, breaking loop\n";
        # Manually make us break out of the for loop too
        $i = $max_page;
        last;
      } elsif (!$fail_on_first_dupe && exists $csv_tweets{$times[$time_index]}) {
        # Skip to the next tweet - we've already seen this one
        #print "Skipping to next tweet\n";
        #print "This one was $time[$time_index]: $csv_tweets{$time[$time_index]}\n";
        $time_index++;
        next;
      } else {
        #print "Found a new tweet: $tweet - $times[$time_index]\n"
      }

      # The tweet was already turned into a plain-text, CSV-quoted field by
      # textify above; pair it with its timestamp and move on to the next
      # timestamp.

      push @downloaded_tweets, "$tweet,$times[$time_index++]";
      
      # If we're not stopping at the first dupe
      # make sure we're not pulling pages unnecessarily:
      # once known + new tweets add up to the statuses count we are done
      if (!$fail_on_first_dupe) {
        if ((scalar keys %csv_tweets) + scalar(@downloaded_tweets) == $current_statuses) {
          # Setting $i to the last page ends the outer for loop as well
          $i = $max_page;
          last;
        }
      }
    }
    
    # Running total over all pages so far, not just this page
    print "  ".scalar(@downloaded_tweets)." new tweets\n";
  }
  
  return @downloaded_tweets;
}

# Turn a fragment of tweet HTML into a single CSV-ready field.
#
# First parameter: Tweet HTML
# Returns: Plain-text representation: tags removed, newlines replaced by
#          spaces, HTML entities decoded, then CSV-quoted by quote_value
sub textify {
  my ($html) = @_;

  # Remove the tags; the quoted-string alternative keeps a '>' inside an
  # attribute value from ending the tag early
  $html =~ s/<(?:[^>'"]*|(['"]).*?\1)*>//gs;

  # Every newline becomes one space
  $html =~ tr/\n/ /;

  # Decode the entities first, so that the quoting sees the real characters
  return quote_value(decode_entities($html));
}