#!/usr/bin/perl
#
# Verify the numbers from http-analyze reports
#
# Usage: verify <logfile
#

#sitename - - [01/Feb/1997:00:00:00 +0100] "GET /mydoc.html HTTP/1.0" 200 4207

use strict;
use warnings;

# Main script: read logfile entries from STDIN, validate each one, and print
# summary totals (hits, files, not-modified, other, bytes, unique URLs and
# unique sites) to STDOUT. Entries that cannot be parsed, or that carry an
# unknown protocol or request method, are reported and counted as corrupted.

# Request methods accepted as valid; any other method marks the entry corrupt.
my %known_method = map { $_ => 1 }
    qw(GET HEAD POST PUT TRACE OPTIONS DELETE);

# Counters start at 0 so the summary prints "0" rather than an empty field
# when a category never occurs in the input.
my ($hits, $files, $nomod, $other, $kbsent, $corrupt) = (0) x 6;
my (%urls, %sites);     # hit counts per URL / per site
my $line = 0;           # current input line number, for diagnostics

ENTRY:
while (my $entry = <STDIN>) {
    # chomp, not chop: a final line without a trailing newline must not
    # lose its last character (the last digit of the size field).
    chomp $entry;
    $line++;

    # Captures: site, request string, response code, size.
    # NOTE(review): "(.+)" is greedy, presumably so requests containing
    # embedded quotes still parse; lines with additional quoted fields after
    # the size (e.g. referrer / user agent) may be mis-split -- confirm that
    # only plain common-log-format input is expected.
    my ($site, $req, $status, $bytes) = $entry =~
        /^(\S+) \S+ \S+ \[[^ ]+ [^\]]+\] "(.+)" ([^ ]+) ([^ ]+)/;
    if (!defined $site) {
        print "line $line: corrupted logfile entry:\n--> $entry\n\n";
        $corrupt++;
        next ENTRY;
    }

    # The request may have fewer than three words (e.g. no protocol given);
    # default the missing parts to the empty string.
    my ($method, $url, $prot) = split(/ /, $req);
    $method = '' unless defined $method;
    $url    = '' unless defined $url;
    $prot   = '' unless defined $prot;

    if ($prot ne '' && $prot !~ m{^HTTP/1\.[01]}) {
        print "line $line: unknown protocol: $prot\n--> $entry\n\n";
        $corrupt++;
        next ENTRY;
    }
    if (!$known_method{$method}) {
        print "line $line: unknown request method: $method\n--> $entry\n\n";
        $corrupt++;
        next ENTRY;
    }

    # Response code and size are computed fresh for every entry. A "-" (or
    # any value without leading digits) counts as 0 instead of silently
    # reusing the value left over from the previous entry.
    my $resp = ($status =~ /^(\d+)/) ? $1 + 0 : 0;
    my $size = ($bytes  =~ /^(\d+)/) ? $1 + 0 : 0;
    $size = 0 if $method eq 'HEAD';     # don't account for size if HEAD request

    $hits++;                            # total hits
    if ($resp == 200) {
        $files++;
        $kbsent += $size;
    }
    elsif ($resp == 304) {
        $nomod++;
    }
    else {
        $other++;
    }

    # Only successful (200) and not-modified (304) responses count as URLs.
    $urls{$url}++ if $resp == 200 || $resp == 304;
    $sites{$site}++;                    # count sites
}

print "\n" if ($corrupt);
print "Total hits:\t$hits\n";
print "Total files:\t$files\n";
print "Total nomod:\t$nomod\n";
print "Total other:\t$other\n";
# NOTE(review): the value is the plain sum of the logged size fields, which
# are not divided by 1024 anywhere here -- confirm whether the "KB" label or
# the computation is what is intended.
print "Total KB sent:\t$kbsent\n";
print "Total corrupted entries:\t$corrupt\n" if ($corrupt);

# The number of distinct keys is the number of unique URLs / sites.
my $cnt = scalar(keys %urls);
print "Total # of unique URLs:\t$cnt\n";

$cnt = scalar(keys %sites);
print "Total # of unique sites:\t$cnt\n";
exit(0);
