This is the script that builds the JavaScript data object used for this page.

#!/usr/bin/perl 

################################################################################
=head1 Stack Exchange Network Detail Grabber

This script provides a snapshot of the current sites in the Stack
Exchange Network. I wrote it because I'm curious how the network will change 
over time. The script does three things:

1. Grabs a copy of the Stack Overflow home page in order to parse out
its footer section for a list of active sites. 

2. Grabs a copy of the full list of Stack Exchange sites from:
http://stackexchange.com/sites?view=list and parses out the interesting data.

3. Outputs a JavaScript string with the relevant data that's used to build report
HTML tables. 

Since this script is parsing HTML, it will likely need to be updated if the
source HTML pages change their structure. 

=cut
################################################################################


################################################################################
### Calls
################################################################################

use Modern::Perl;
use FindBin qw($Bin);
use LWP::Simple;
use XML::Simple;
use HTML::Entities;
use YAML::XS;
use JSON;



################################################################################
### Config
################################################################################

### Debugging switch. Set it to a true value to read local copies of the two
### HTML pages (stored next to this script) instead of fetching the live ones.
my $debug = 0;

### Pretty printing switch for the JSON output.
### You probably want to leave it on if you are using TextMate.
my $print_pretty = 1;

### The live and debugging (local file) address for each page the script reads.
my %config_hash = (
	active_site_list_urls => {
		live      => "http://stackoverflow.com/",
		debugging => "file://$Bin/stackoverflow.html",
	},
	full_site_list_urls => {
		live      => "http://stackexchange.com/sites?view=list",
		debugging => "file://$Bin/site-list.html",
	},
);


################################################################################
### Main
################################################################################


### Choose the debugging or the live flavor of both URLs in one go
my $url_flavor = $debug ? 'debugging' : 'live';
my ($active_site_list_url, $full_site_list_url) =
	map { $config_hash{$_}{$url_flavor} } qw(active_site_list_urls full_site_list_urls);


### Pull in the list of active sites
my %active_site_list = get_active_site_list({url => $active_site_list_url});
# say Dump \%active_site_list;

### Get the data for the complete site list. The active list is handed over
### so each site can be labeled as Active or Beta.
my %full_site_list = get_full_site_list({
	url                  => $full_site_list_url,
	active_site_list_ref => \%active_site_list,
});
# say Dump \%full_site_list;


### Create the JSON Object. Keys are always sorted (canonical); the pretty
### layout is only switched on when requested.
my $jsonObj = JSON->new->canonical;
$jsonObj->pretty if $print_pretty;

### Create the output
my $json_output = $jsonObj->encode(\%full_site_list);

### Drop the trailing newline (if any) so the closing semicolon sits directly
### after the data.
chomp($json_output);

### Output the full string as a JavaScript var
my $javascript_statement = sprintf("var stackExchageData = %s;", $json_output);
say $javascript_statement;



################################################################################
### Subs
################################################################################


################################################################################
=head2 Sub: get_active_site_list

Pulls a copy of the stackoverflow.com home page and parses out the
C<< <div id="footer-sites"> >> section to determine fully active sites.

Input: a hash reference with one key:

  url - the address of the page to fetch

Returns a hash (as a flat list) keyed on every absolute http:// or https://
link found in the footer section. Every value is 1.

Dies if the page can't be fetched or if the footer section isn't in it. An
empty result would otherwise silently label every site as Beta.

=cut
################################################################################

sub get_active_site_list {

	my $get_active_site_list_input_ref = shift;

	my $url = $get_active_site_list_input_ref->{url};

	### Create a temporary hash to store the active sites
	my %tmp_active_sites;

	### Pull the stackoverflow.com home page. get() hands back undef on failure.
	my $stackoverflow_data = get($url);
	die sprintf("Unable to fetch the active site list from: %s", $url // '(no url given)')
		unless defined $stackoverflow_data;

	### Grab the footer content (everything after the opening footer div)
	my $footer_content = (split(/<div id="footer-sites">/, $stackoverflow_data))[1];
	die sprintf("No footer-sites section found in: %s", $url // '(no url given)')
		unless defined $footer_content;

	### remove the trailing content (everything from the first closing div on)
	$footer_content =~ s{</div>.*}{}ms;

	### Match all the sites and pull out the urls. Both http and https links
	### are accepted so the list doesn't come back empty for https footers.
	while($footer_content =~ m{href="(https?://[^"]+)"}g) {

		### Record the site in the tmp list
		$tmp_active_sites{$1} = 1;
	}

	return %tmp_active_sites;

}


################################################################################
=head2 Sub: get_full_site_list

Loads a hash with data for the entire set of sites

Input: a hash reference with two keys:

  url                  - the address of the site list page to fetch
  active_site_list_ref - hash reference keyed on the URLs of the active sites

Returns a hash (as a flat list) with two top level keys:

  site_type_sorting - 'Active' and 'Beta' arrays of site URLs, ordered by
                      site name. A type only shows up if a site has it.
  site_details      - per site URL: the entity encoded name and description
                      plus the integer values listed in @input_int_values

Dies if the page can't be fetched or the list container isn't in it. Any
error raised by XMLin while parsing is passed along as is.

=cut
################################################################################

sub get_full_site_list {

	my $get_full_site_list_input_ref = shift;

	my $url = $get_full_site_list_input_ref->{url};

	### A missing active list simply means every site is reported as Beta
	my $active_site_list_ref = $get_full_site_list_input_ref->{active_site_list_ref} // {};

	### Create a temporary hash to store the full site data set
	my %tmp_full_site_list;

	### Set the list of input values to grab
	my @input_int_values = qw (answers percent-answered questions questions-per-day users visits-per-day);

	### Pull the site feed list page. get() hands back undef on failure.
	my $site_list_data = get($url);
	die sprintf("Unable to fetch the full site list from: %s", $url // '(no url given)')
		unless defined $site_list_data;

	### Remove the data before the list
	$site_list_data = (split(/<div class="list-view-container">/, $site_list_data))[1];
	die sprintf("No list-view-container section found in: %s", $url // '(no url given)')
		unless defined $site_list_data;

	### Remove the data after the list
	$site_list_data = (split(/<p class="a51"/, $site_list_data))[0];

	### Add the initial div back in so it's well formed XML
	$site_list_data = sprintf("<div> %s", $site_list_data // '');

	### Parse the XML
	my $xml_ref = XMLin($site_list_data, ForceArray => [], KeyAttr => ['id', 'class', 'name'], ForceContent => 0);

	### Every site is a div folded into this hash by XMLin (see KeyAttr)
	my $site_divs_ref = $xml_ref->{div};

	### Loop through the sites via their class, ordered by site name
	for my $site_class (sort {$site_divs_ref->{$a}->{input}->{name}->{value} cmp $site_divs_ref->{$b}->{input}->{name}->{value}} keys %{$site_divs_ref}) {

		my $site_ref = $site_divs_ref->{$site_class};

		### Pull in the site URL - This will be the unique ID
		my $site_url = $site_ref->{a}->{href};

		### Determine if the site is active or beta
		my $active_beta_string = defined($active_site_list_ref->{$site_url}) ? 'Active' : 'Beta';

		### Push the key onto the appropriate array for the title based sort output
		push @{$tmp_full_site_list{site_type_sorting}{$active_beta_string}}, $site_url;

		### Add the site name to the site_details reference
		$tmp_full_site_list{site_details}{$site_url}{name} = encode_entities($site_ref->{input}->{name}->{value});

		### Add the various integer inputs
		for my $input_int_value (@input_int_values) {
			$tmp_full_site_list{site_details}{$site_url}{$input_int_value} = int($site_ref->{input}->{$input_int_value}->{value});
		}

		### Add the description. With ForceContent off, XMLin returns a plain
		### string for a <p> without attributes and a hash with a 'content'
		### key for a <p> with attributes, so both shapes are handled.
		my $description = $site_ref->{div}->{'lv-info'}->{p};
		$description = $description->{content} if ref($description) eq 'HASH';
		$tmp_full_site_list{site_details}{$site_url}{description} = encode_entities($description);

	}

	return %tmp_full_site_list;

}