#!/usr/bin/perl -w
# wiki2bios
# Paul Ohm
# http://paulohm.com
# Created July 8, 2008
# This code is placed in the public domain by Paul Ohm, 2008.
# No rights are reserved.
#
# Grab the Supreme Court Justices' bios from the Wikipedia page at
# http://en.wikipedia.org/wiki/List_of_Justices_of_the_Supreme_Court_of_the_United_States
# parse the table, and turn it into a CSV (arbitrarily delimited) for use
# in projects relating to the Supreme Court.
#
# The delimited data goes to STDOUT; every diagnostic goes to STDERR so
# that it can never end up inside the generated data, and fatal problems
# terminate the script with a non-zero exit status.

use strict;
use warnings;

use LWP;
use HTML::TreeBuilder;
use URI;

my $wiki_url =
    "http://en.wikipedia.org/wiki/List_of_Justices_of_the_Supreme_Court_of_the_United_States";
my $D = "|";    # Delimiter

# Number of <td> cells the row loop reads from each data row (indexes 0..8).
my $EXPECTED_CELLS = 9;

# Output column names, in the order in which the row loop emits the fields.
my @columns = qw(
    ordinal last_first first last full state
    born_mo born_date born_year died_mo died_date died_year
    srv_from_mo srv_from_date srv_from_year
    srv_to_mo srv_to_date srv_to_year
    chief_from_mo chief_from_date chief_from_year
    chief_to_mo chief_to_date chief_to_year
    retired_from_mo retired_from_date retired_from_year
    retired_to_mo retired_to_date retired_to_year
    appointed_by reason_for_term
);

my $browser  = LWP::UserAgent->new;
my $uri      = URI->new($wiki_url);
my $response = $browser->get($uri);
if (!$response->is_success) {
    die "TROUBLE DOWNLOADING!!! $uri error: ", $response->status_line;
}

# The raw (undecoded) body is parsed on purpose: rid_dash() below matches
# the em dash by its UTF-8 byte sequence.
my $tree = HTML::TreeBuilder->new_from_content($response->content);

# There are three tables (with the last having two internal tables)
# but thanks to wikipedia's clean design, we just want one...
# whose class is "wikitable sortable"
my @tables =
    $tree->look_down('_tag', 'table', 'class', 'wikitable sortable');
if (@tables != 1) {
    die "Trouble finding the one critical table. \n",
        scalar(@tables), " found.\n";
}

my @rows = $tables[0]->look_down('_tag', 'tr');

# Print the column headers
print join($D, @columns), "\n";

foreach my $row (@rows) {

    # There are nine cells of useful data in each row (indexed 0 to 8)
    my @row_data = $row->look_down('_tag', 'td');

    # Skip the header line (it has no <td> cells at all)
    next if !@row_data;

    # Fail with a readable message instead of an "undefined value" error
    # further down when a row is shorter than the layout we expect.
    if (@row_data < $EXPECTED_CELLS) {
        die "Quitting! Expected at least $EXPECTED_CELLS cells in a row ",
            "but found ", scalar(@row_data), "\n";
    }

    # Field 1 is the name, but needs some parsing...
    my @names = $row_data[1]->content_list;
    if (@names < 2 or @names > 3) {
        die "Quitting! Expected 2 name fields but instead parsed ",
            scalar(@names), "\n";
    }
    my $last_first = $names[0]->as_text;
    my $full_name  = $names[1]->as_text;

    # Split "Last, First" into its two halves.
    my ($lname, $fmname) = split /\s*,\s*/, $last_first;
    $lname = '' unless defined $lname;

    # Annoyingly, starting with Abe Fortas, the Last, First
    # name became just Last...so a little extra code to cope:
    # derive the first name by removing the last name from the full name.
    if (!defined $fmname) {
        $fmname = $full_name;

        # \Q...\E: the surname is data, not a pattern.
        $fmname =~ s/\s*\Q$lname\E\s*//;
    }

    # Collect the whole row first so that a fatal problem never leaves a
    # half-written row on STDOUT.
    my @fields = (
        $row_data[0]->as_text,              # Field 0: ordinal number
        $last_first,
        $fmname,
        $lname,
        $full_name,
        rid_fn($row_data[2]->as_text),      # Field 2: home state (abbrev.)
    );

    # Fields 3-6 are date ranges: born-died, service, Chief Justice,
    # retirement.  Each contributes six values (mo/date/year twice).
    foreach my $range_cell (@row_data[3 .. 6]) {
        push @fields, @{ parse_date_range($range_cell) };
    }

    push @fields,
        rid_fn($row_data[7]->as_text),              # Field 7: appointing president
        rid_fn(rid_dash($row_data[8]->as_text));    # Field 8: reason for termination

    # No delimiter after the last field.
    print join($D, @fields), "\n";
}

# Return the text of one entry of an HTML::Element content list.  Such an
# entry is either an element object or a plain string (a text node).
sub _node_text {
    my ($node) = @_;
    return '' unless defined $node;
    return ref $node ? $node->as_trimmed_text : $node;
}

# Given a table cell (HTML::Element) that contains a range of dates,
# return a reference to a six-element array:
#   [ month1, day1, year1, month2, day2, year2 ]
# A date that is missing or cannot be recognised is reported as 0, 0, 0.
# Unrecognised cell layouts are reported on STDERR.
sub parse_date_range {
    my ($td) = @_;

    my @date_parts   = $td->content_list;
    my $date_as_text = $td->as_trimmed_text;
    my $last_index   = $#date_parts;

    my $first;
    my $second;

    # Special cases...
    if (   $last_index == 0
        || $date_as_text =~ /\(none\)/
        || ($last_index == 1 && $date_as_text =~ /\[\d+\]/))
    {
        # Just a dash within the cell--nothing else
        # (possibly "(none)" or a dash followed by a footnote marker).
        $first  = [0, 0, 0];
        $second = [0, 0, 0];
    }
    elsif ($date_as_text =~ /present/) {

        # Still serving and/or living: only the start date exists.
        $first = parse_date_helper($date_parts[0]->as_trimmed_text,
                                   $date_parts[2]->as_trimmed_text);
        $second = [0, 0, 0];
    }
    elsif ($last_index == 4) {

        # Probably means the first date in range has no month or date.
        # Keep only the leading four-digit year (dropping the em dash and
        # anything else); if no year is found, pass the text through as is.
        my $year_text = _node_text($date_parts[0]);
        my $year = $year_text =~ /\A\s*(\d{4})/ ? $1 : $year_text;

        $first  = [0, 0, $year];
        $second = parse_date_helper($date_parts[2]->as_trimmed_text,
                                    $date_parts[4]->as_trimmed_text);
    }
    elsif ($last_index != 7      # Standard case
        && $last_index != 8)     # Weird footnote case (Justice Stone)
    {
        print STDERR "Error parsing date ", $td->as_text,
            "(", $last_index + 1, ").\n";
        $first  = [0, 0, 0];
        $second = [0, 0, 0];
    }
    else {
        $first = parse_date_helper($date_parts[0]->as_trimmed_text,
                                   $date_parts[2]->as_trimmed_text);
        $second = parse_date_helper($date_parts[5]->as_trimmed_text,
                                    $date_parts[7]->as_trimmed_text);
    }

    # Build a fresh array so neither half is aliased to the other.
    return [@$first, @$second];
}

# Split a "Month day" string into its parts and combine them with $year.
# Returns a reference to [ month_name, day, year ].  When $mondate does
# not contain a recognisable "Month day", the problem is reported on
# STDERR and month and day are returned as 0.
sub parse_date_helper {
    my ($mondate, $year) = @_;

    my $mon  = 0;
    my $date = 0;

    if ($mondate =~ /(January|February|March|April|May|June|July
                      |August|September|October|November|December)
                     \s+ (\d+)/ix)
    {
        $mon  = $1;
        $date = $2;
    }
    else {
        print STDERR "Error parsing date $mondate.\n";
    }

    return [$mon, $date, $year];
}

# Get rid of wikipedia-style footnote markers such as "[12]" (all of them).
sub rid_fn {
    my ($curr_str) = @_;
    $curr_str =~ s/\[\d+\]//g;
    return $curr_str;
}

# Get rid of em dashes, matched as their UTF-8 byte sequence (E2 80 94).
sub rid_dash {
    my ($curr_str) = @_;
    $curr_str =~ s/\342\200\224//g;
    return $curr_str;
}