#!/usr/bin/perl -w
# wiki2bios
# Paul Ohm
# http://paulohm.com
# Created July 8, 2008
# This code is placed in the public domain by Paul Ohm, 2008.
# No rights are reserved.
# Grab the Supreme Court Justices' Bios from the wikipedia page
# at http://en.wikipedia.org/wiki/List_of_Justices_of_the_Supreme_Court_of_the_United_States
# and parse it and turn it into a CSV (arbitrarily delimited) for use
# in projects relating to the Supreme Court
use LWP;
use HTML::TreeBuilder;
use URI;
use strict;
use warnings;

# Main script: download the Wikipedia list of Supreme Court Justices, locate
# the one "wikitable sortable" table, and print one delimited line per Justice
# to STDOUT (preceded by a header line).  All diagnostics go to STDERR so they
# can never end up inside the delimited output.
my $wiki_url = "http://en.wikipedia.org/wiki/List_of_Justices_of_the_Supreme_Court_of_the_United_States";
my $D = "|"; # Delimiter

my $browser  = LWP::UserAgent->new;
my $uri      = URI->new($wiki_url);
my $response = $browser->get($uri);
if (! $response->is_success) {
    die "TROUBLE DOWNLOADING!!! $uri error: ", $response->status_line;
}
#print $response->content;
my $tree = HTML::TreeBuilder->new_from_content($response->content);

# There are three tables (with the last having two internal tables)
# but thanks to wikipedia's clean design, we just want one...
# whose class is "wikitable sortable"
my @tables = $tree->look_down('_tag', 'table',
                              'class', 'wikitable sortable');
if (@tables != 1) {
    # die (rather than print + exit) so the failure is visible on STDERR
    # and the exit status is non-zero.
    die "Trouble finding the one critical table. ",
        scalar(@tables), " found.\n";
}
my @rows = $tables[0]->look_down('_tag', 'tr');
#print "$#rows rows found.\n";

# Print the column headers
my @columns = qw(
    ordinal last_first first last full state
    born_mo born_date born_year died_mo died_date died_year
    srv_from_mo srv_from_date srv_from_year
    srv_to_mo srv_to_date srv_to_year
    chief_from_mo chief_from_date chief_from_year
    chief_to_mo chief_to_date chief_to_year
    retired_from_mo retired_from_date retired_from_year
    retired_to_mo retired_to_date retired_to_year
    appointed_by reason_for_term
);
print join($D, @columns), "\n";

for my $row (@rows) {
    # There are nine cells of useful data in each row (indexed 0 to 8)
    my @row_data = $row->look_down('_tag', 'td');

    # Skip the header line
    next if !@row_data;

    # A short row cannot be indexed up to field 8; report it and move on
    # instead of crashing on an undefined cell.
    if (@row_data < 9) {
        warn "Skipping row with only ", scalar(@row_data), " cells.\n";
        next;
    }

    # The whole row is collected first and printed in one go, so a failure
    # part-way through never leaves a truncated line in the output.
    my @fields;

    # Field 0 is the ordinal number of the Justice...
    push @fields, $row_data[0]->as_text;

    # Field 1 is the name, but needs some parsing...
    my @names = $row_data[1]->content_list;
    if (@names < 2 or @names > 3) {
        die "Quitting! Expected 2 name fields but instead parsed ",
            scalar(@names), "\n";
    }
    my $last_first = $names[0]->as_text;
    my $full_name  = $names[1]->as_text;

    # Split first into first and last,
    my ($lname, $fmname) = split /\s*,\s*/, $last_first;

    # Annoyingly, starting with Abe Fortas, the Last, First
    # name became just Last...so a little extra code to cope
    if (! defined($fmname)) {
        $fmname = $full_name;
        # \Q...\E: the surname is data, not a pattern.
        $fmname =~ s/\s*\Q$lname\E\s*//;
    }
    push @fields, $last_first, $fmname, $lname, $full_name;

    # Field 2 is the home state (abbrev.)
    push @fields, rid_fn($row_data[2]->as_text);

    # Field 3 is the born-death dates
    # Field 4 is the service dates
    # Field 5 is the CJ dates
    # Field 6 is the retirement (to death?) dates
    # Each contributes six values: month/date/year for both ends.
    for my $date_col (3 .. 6) {
        push @fields, @{ parse_date_range($row_data[$date_col]) };
    }

    # Field 7 is the appointing president
    push @fields, rid_fn($row_data[7]->as_text);

    # Field 8 is the reason for terminating
    push @fields, rid_fn(rid_dash($row_data[8]->as_text));

    print join($D, @fields), "\n";
}
sub parse_date_range {
    # Given a <td> element that contains a range of dates, return
    # the dates split up into month/day/year
    # doing some error-checking along the way.
    #
    # Argument:
    #   $td - an object providing content_list, as_trimmed_text and as_text
    #         (an HTML::Element table cell in this script).
    # Returns:
    #   A reference to a fresh six-element array:
    #   (month, date, year) of the first date followed by
    #   (month, date, year) of the second.  Missing parts are 0.
    # Problems are reported with warn (STDERR) so that they never get
    # mixed into the delimited data the caller writes to STDOUT.
    my ($td) = @_;

    my @date_parts   = $td->content_list;
    my $date_as_text = $td->as_trimmed_text;

    # content_list can hand back element objects as well as plain text
    # segments; this gives the trimmed text of either kind.
    my $text_of = sub {
        my ($node) = @_;
        return $node->as_trimmed_text if ref $node;
        (my $text = $node) =~ s/\A\s+|\s+\z//g;
        return $text;
    };

    # Each date gets its own placeholder array; nothing is shared.
    my $parsed_date1 = [0, 0, 0];
    my $parsed_date2 = [0, 0, 0];

    # Special cases...
    if (($#date_parts == 0)
        || ($date_as_text =~ /\(none\)/)
        || (($#date_parts == 1) && ($date_as_text =~ /\[\d+\]/))) {
        # Just a dash within the cell--nothing else.
        # Both dates stay as placeholders.
    }
    elsif ($date_as_text =~ /present/) {
        # Still serving and/or living: only the first date exists.
        if ($#date_parts >= 2) {
            $parsed_date1 = parse_date_helper($text_of->($date_parts[0]),
                                              $text_of->($date_parts[2]));
        }
        else {
            warn "Error parsing date ", $td->as_text, "(",
                scalar(@date_parts), ").\n";
        }
    }
    elsif ($#date_parts == 4) {
        # Probably means the first date in range has no month or date
        # Get rid of everything but year (including em-dashes)
        my $year = $text_of->($date_parts[0]);
        $year =~ s/\A\s*(\d\d\d\d).*\z/$1/s;
        $parsed_date1 = [0, 0, $year];
        $parsed_date2 = parse_date_helper($text_of->($date_parts[2]),
                                          $text_of->($date_parts[4]));
    }
    elsif (($#date_parts == 7)      # Standard case
           || ($#date_parts == 8)) { # Weird footnote case (Justice Stone)
        $parsed_date1 = parse_date_helper($text_of->($date_parts[0]),
                                          $text_of->($date_parts[2]));
        $parsed_date2 = parse_date_helper($text_of->($date_parts[5]),
                                          $text_of->($date_parts[7]));
    }
    else {
        # Unrecognised cell layout: report it and keep the placeholders.
        warn "Error parsing date ", $td->as_text, "(",
            scalar(@date_parts), ").\n";
    }

    #print "Parsed: $mon1|$date1|$year1|$mon2|$date2|$year2\n";
    return [ @$parsed_date1, @$parsed_date2 ];
}
sub parse_date_helper {
    # Split a "Month day" string, plus a separately supplied year, into
    # its parts.
    #
    # Arguments:
    #   $mondate - text expected to contain a full English month name
    #              followed by whitespace and a day number
    #              (matched case-insensitively, anywhere in the string)
    #   $year    - the year; passed through unchanged
    # Returns:
    #   A reference to a new array (month, date, year).  The month is
    #   returned as it was written in $mondate.  If no month/day can be
    #   found, a message is sent to STDERR and month and date are 0, the
    #   same placeholder used elsewhere in this file for missing parts.
    my ($mondate, $year) = @_;

    my ($mon, $date) = (0, 0);
    if (defined $mondate
        && $mondate =~ /(January|February|March|April|
                         May|June|July|August|September|
                         October|November|December)\s+
                        (\d+)/xi) {
        ($mon, $date) = ($1, $2);
    }
    else {
        # warn, not print: STDOUT carries the delimited data.
        warn "Error parsing date ",
            (defined $mondate ? $mondate : '(undef)'), ".\n";
    }

    return [$mon, $date, $year];
}
sub rid_fn {
    # Simple function to get rid of wikipedia-style footnotes.
    #
    # Argument: a string that may contain numeric footnote markers
    #           such as "[12]".
    # Returns:  a copy of the string with every such marker removed
    #           (/g, so "[1][2]" does not leave "[2]" behind).
    my ($curr_str) = @_;
    $curr_str =~ s/\[\d+\]//g;
    return $curr_str;
}
sub rid_dash {
    # Simple function to get rid of dashes.
    #
    # Argument: a string that may contain em-dashes.
    # Returns:  a copy of the string with every em-dash removed.
    #
    # The em-dash is matched in both forms it can arrive in: the raw
    # UTF-8 byte sequence (\342\200\224) and the single decoded
    # character U+2014.
    my ($curr_str) = @_;
    $curr_str =~ s/\342\200\224//g;
    $curr_str =~ s/\x{2014}//g;
    return $curr_str;
}
# End of wiki2bios