#!/usr/bin/perl -w ## $Id: lumberjack.pl.txt,v 1.5 2004/04/15 18:12:18 reinhard Exp $ use strict; use CGI; use Cwd qw(getcwd); use Data::Dumper; ## Globals. our %SEEN_MONTH_IPS; our %STOP_WORDS; our @STOP_WORDS = qw(a all and how i in is of on the or to with); @STOP_WORDS{@STOP_WORDS}=(); our %MONTHS = ( Jan => '01', Feb => '02', Mar => '03', Apr => '04', May => '05', Jun => '06', Jul => '07', Aug => '08', Sep => '09', Oct => '10', Nov => '11', Dec => '12' ); ## annoying encoded aolsearch queries. I looked up the top few and ## stuck them here. our %AOLSEARCH_ENCQUERIES = ( '6832def477d606b4' => 'no carbs', '6832def477d606b4875e5e1c68dff693' => 'no carbs diet', '1378137260dcf2ed' => 'i am fat', 'ddac7786858f8de7' => 'am i fat', '81f6bd3e295b6c9472a16dd6aa5741da' => 'low blood sugar', 'b263df1b82db4acb4c389378252641fc' => 'counting carbs', 'b263df1b82db4acbdf13f0c91e6931a7c203b263c0570005' => 'counting calories', 'fe7678422eaae28c3c41e93c7a22a8cb' => 'no carb foods', '7a41cf665a93638e640c1b8e63fcfa08dd96f9d6f2d72382' => 'foods with no carbs', 'fe7678422eaae28c52353042ed22b76a' => 'no carb recipes', '6b69c7f613923602' => 'gluttony', '5ca0a2d66bf8b55c345996b13ec79f6e' => 'good carbs', '3ad3a69d9638e135996d8eaed2f9088f5e7d9cd42d724998' => 'how to count calories', '31594e046e6ee3bc017331be146ee710' => 'low sugar diet', '5ca0a2d66bf8b55cc49fd7a83b74b59e6c24a4dc698f7836' => 'good carbs bad carbs', ); ## Main. { my $site = getcwd(); if ( $site =~ m%([^/]+)/[^/]+$% ) { $site = $1; } my $total_raw=0; my $total_ok=0; my %day_hits; my %month_hits; my %ok_ips; my %ok_useragents; my %ok_referers; my %search_strings; my %search_bigrams; my %search_words; open OK_HITS, ">ok.tab"; open NON_AD_HITS, ">non_ad.tab"; while (<>){ my ($ip,$when,$resource,$code,$bytes,$referer,$useragent); my $raw =$_; { s/([0-9.]+) // ; $ip = $1; } { s/[^\]]+\[([^\]]+)\]// ; $when = format_date($1); } { s{ "[^/]+(/[^ ]*)[^"]+"}{} ; $resource = $1; } { s/ (\d{3})// ; $code = $1; } { s/ (\d+)// ; $bytes = $1; } { s/ "([^"]*)"// ; $referer = $1 || ""; } { s/ "([^"]*)"// ; $useragent = $1 || ""; } my $month = $when->{month}; my $day = $when->{day}; $month_hits{$month}->{raw}++; $day_hits{$day}->{raw}++; $total_raw++; if ( is_ok( $ip, $resource, $code, $referer, $useragent ) ) { print OK_HITS $raw; my $referer_class = get_referer_class($referer,$resource); unless ( $referer_class eq 'ad' ) { print NON_AD_HITS $raw; } $total_ok++; $ok_ips{$ip}++; $ok_useragents{$useragent}++; $month_hits{$month}->{ok}++; $day_hits{$day}->{ok}++; $month_hits{$month}->{$referer_class}++; $day_hits{$day}->{get_referer_class($referer,$resource)}++; if ($referer =~ m{(\w+://[^/]+)} ) { $ok_referers{$1}++; } else { $ok_referers{$referer}++; } if ( is_new_ip_this_month($month,$ip ) ) { $month_hits{$month}->{ips}++; } if ( is_search($referer) ) { my $search = get_search($referer); $search_strings{$search->{string}}++; while ( my ($word,$count) = each %{$search->{word_counts}} ) { $search_words{$word}+=$count; } while ( my ($bigram,$count) = each %{$search->{bigram_counts}} ) { $search_bigrams{$bigram}+=$count; } } } } #print STDERR Dumper( \$day_hits{"2003-14"} ); open DAY_HITS, ">day_hits.tab"; for my $day ( sort keys %day_hits ) { print DAY_HITS join "\t", ( $day, $day_hits{$day}->{raw} || 0, $day_hits{$day}->{ok} || 0, $day_hits{$day}->{ad} && $day_hits{$day}->{ok} ? $day_hits{$day}->{ok} - $day_hits{$day}->{ad} : ( $day_hits{$day} && $day_hits{$day}->{ok} ? $day_hits{$day}->{ok} : 0 ), ); if ( $day_hits{$day}->{ad} ) { print DAY_HITS "\t"; printf DAY_HITS "%2.2f", ($day_hits{$day}->{ok} - $day_hits{$day}->{ad}) / $day_hits{$day}->{ok} * 100 ; } print DAY_HITS "\n"; } close DAY_HITS; my $timestamp = scalar localtime; print< Web Usage Stats for: $site
Web Usage Stats for: $site

Total ok hits since inception: $total_ok
               Total raw hits: $total_raw

Note: An 'ok' hit is a hit to the home page (index.html) by a human
other than the author. Strict but meaningful. Most raw hits are from
viruses or crawlers.

lumberjack.pl:
This static html page was generated on $timestamp by a homebrewed, hard coded,
totally uncommented perl script which you may admire here.

QUOTE

    my @headers = qw ( raw ok ips type_in google ad yahoo aol msn edays pperl other );
    print "\nMonthly Traffic Summary\n\n";
    print "Month  ";
    for my $header ( @headers ) {
	printf("%8s",$header);
    }
    print "\n";
    for my $month ( sort keys %month_hits ) {
	print $month;
	for my $header ( @headers ) {
	    printf("%8d",$month_hits{$month}->{$header} || 0);
	}
	print "\n";
    }
    print_hash_count(\%ok_referers  ,   'referer',       40  );
    print_hash_count(\%search_strings,  'search string', 40  );
    print_hash_count(\%search_bigrams,  'search bigram', 40  );
    print_hash_count(\%search_words,    'search word'  , 40  );
    print_hash_count(\%ok_ips       ,   'ip',            20  );
    print_hash_count(\%ok_useragents,   'useragent',     20  );
    print "
"; } ## Subroutines. sub format_date { #02/Mar/2003:10:42:24 -0500 my ($day,$month_string,$year,$time) = $_[0] =~ m{(\d{2})/(\w{3})/(\d{4}):(\d{2}:\d{2}:\d{2})}; my $month = $MONTHS{$month_string}; return { day => "$year-$month-$day", month => "$year-$month", }; } sub get_referer_class { my ($referer,$resource) = @_; if ( $referer eq '-') { return "type_in"; } elsif ( $resource =~ /\/?ad\d*/ ) { return 'ad'; } elsif ( $referer =~ /google.com/ ) { return "google"; } elsif ( $referer =~ /practicalperl.com/ ) { return "pperl"; } elsif ( $referer =~ /yahoo.com/ ) { return "yahoo"; } elsif ( $referer =~ /aol.com/ ) { return "aol"; } elsif ( $referer =~/msn/ ) { return "msn"; } elsif ( $referer =~ /nosdiet.com|urbanranger.com|shovelglove.com|everydaysystems.com/ ){ return "edays"; } else { return "other"; } } sub get_search { my $referer = shift; my $string; $referer =~ /(q|query)=([^&]+)/; $string=CGI::unescape($2); $string = lc $string; $string =~ s/[\?\)\(]//g; my @bigrams; $string =~s /"//g; $string = $AOLSEARCH_ENCQUERIES{$string} || $string; ## stupid aol bullshit. my %word_counts; my %bigram_counts; for my $word ( split /\s+/, $string ) { $word =~ s/,//g; if ( exists $STOP_WORDS{$word} ) { $word_counts{"@STOP_WORDS"}++; } else { $word_counts{$word}++; } push @bigrams, $word; if ( @bigrams == 2 ) { $bigram_counts{"@bigrams"}++; shift @bigrams; } } return{ bigram_counts => \%bigram_counts, string => "$string", word_counts=> \%word_counts, }; } sub is_new_ip_this_month { my ($month,$ip)=@_; my $key = "$month:$ip"; unless ( $SEEN_MONTH_IPS{$key} ) { $SEEN_MONTH_IPS{$key}++; return 1; } return; } sub is_ok { my ($ip,$resource,$code,$referer,$useragent)=@_; if ( ($code && $code =~ /200|304/ ) && ($useragent && $useragent =~ /Mozilla/ && $useragent !~ /netcraft|grub|inktomi|jeeves|obot|slurp|girafabot|looksmart|T-H-U-N-D-E-R-S-T-O-N-E|Indy Library|RPT-HTTPClient|intelliseek|MSIECrawler/i) && ($resource && ($resource eq '/' || $resource =~ /\/?ad\d*/ || $resource eq '/nice.html') ) && ($referer !~ /whois/ ) && ($ip ne '18.157.14.183' && $ip ne '18.157.14.31' && $ip ne '66.31.112.10' && $ip ne '65.223.250.253' && $ip ne '216.239.45.4' && $ip ne '136.248.127.2') ){ #print "ok: resource: $resource\n"; return 1; } } sub is_search { if ( $_[0] =~ /(q|query)=[^&]/ ) { return 1; } return; } sub print_hash_count { my ($hash,$name,$top) = @_; my %collapsed; my @printkeys; print "\nOK ${name}s:\n"; my $keylength= 15; my $total_count=0; my $key_count = 0; my $hash_count_file = $name; $hash_count_file =~ s/ /_/g; $hash_count_file .= ".txt"; open (HASH_COUNT_FILE, ">$hash_count_file"); for my $key ( sort { $hash->{$b} <=> $hash->{$a} } keys %$hash ) { print HASH_COUNT_FILE $key . "\t" . $hash->{$key} . "\n"; $key_count+=1; $total_count += $hash->{$key}; if ( $key_count <= $top ) { if ( length $key > $keylength ) { $keylength = length $key; } push @printkeys, $key; } } printf "%${keylength}s%5d\n","unique ${name}s", $key_count; printf "%${keylength}s%5d\n","total ${name}s" , $total_count; printf "%${keylength}s\n", " Top $top ${name}s by count:\n"; for my $key ( @printkeys ) { printf "%${keylength}s%5d\n",$key, $hash->{$key}; } }