Alister West

home is where your code is ...

Google Sitemap generator

#!/usr/bin/env perl

use HTML::Entities qw/encode_entities/;

my $host = 'http://www.example.com';
my $sitemap = My::Sitemap->new( root => '/var/www', host => $host );
for (1 .. 1000) {
    $sitemap->add_url( "$host/" . encode_entities( $_ ) . "/" )
}
exit;


package My::Sitemap;

#
# Build Sitemaps
#
# sitemap-index.xml
# sitemap-1.xml
# sitemap-2.xml
# ..
#
# ref: http://www.sitemaps.org/

# TODO: add_link - accept WC3 for lastmod
# TODO: _open_sitemap - take arg for lastmod
# TODO: pod

# Author: Alister West (c) 2011

use strict;
use warnings;
use autodie;
use feature qw/say/;


use File::Copy         qw/move/;
use IO::Compress::Gzip qw/gzip  $GzipError/;


sub new {
# -------------------------------------
# create a new sitemap object.
#
    my ($class, @params) = @_;
    my $self = {
        root        => '',
        host        => '',
        name        => 'sitemap',
        debug       => 1,
        compress    => 0,
        _count      => { pages => 0, urls => 0, total_urls => 0 },
        _fh_index   => undef,
        _fh_sitemap => undef,
        @params
    };

    die "root and host arguments to new are required"
        unless $self->{root} && $self->{host};

    return bless $self, $class;
}


sub _debug { say "[debug] $_[1]" if $_[0]->{debug} and $_[1]; }


sub _open_index {
# ----------------------------------
# open a sitemap-index if necessary
#
    my $self = shift;
    return if $self->{_fh_index};

    #
    # Open index xml doc
    #
    my $filename = "$self->{root}/$self->{name}-index.xml";
    open $self->{_fh_index}, '>:encoding(UTF-8)', "$filename.tmp";
    print {$self->{_fh_index}} qq{<?xml version="1.0" encoding="UTF-8"?>\n},
                               qq{<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n};

    $self->_debug( "Opening index $filename" );
}


sub _open_sitemap {
# ----------------------------------
# open a new sitemap page if necessary
#
    my $self = shift;
    return if $self->{_fh_page};


    #
    # Open new sitemap xml doc
    #
    $self->{_count}{pages}++;
    my $filename = "$self->{root}/$self->{name}-$self->{_count}{pages}.xml";
    open $self->{_fh_page}, '>:encoding(UTF-8)', "$filename.tmp";
    print {$self->{_fh_page}} qq{<?xml version="1.0" encoding="UTF-8"?>\n},
                              qq{<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n};


    #
    # Add page to the sitemap index
    #
    my $sitemap_url = "$self->{host}/$self->{name}"."-"."$self->{_count}{pages}.xml". ($self->{compress} ? '.gz' : '');
    print {$self->{_fh_index}} "    <sitemap><loc>$sitemap_url</loc></sitemap>\n";

    $self->_debug( "Starting sitemap $filename" );
}


sub add_url {
# ----------------------------------------
#
#
    my ($self, $url, %opts) = @_;

    #
    # Make sure we have open filehandles
    #
    $self->_open_index();
    $self->_open_sitemap();


    #
    # Optional tags
    #
    my ($lastmod,$changefreq,$priority) = ('','','');

    $lastmod = "      <lastmod>$1</lastmod>\n"
        if $opts{lastmod}
        and $opts{lastmod} =~ m/^(\d{4}-\d{2}-\d{2})/;

    $changefreq = "      <changefreq>$1<changefreq>\n"
        if $opts{changefreq}
        and $opts{changefreq} =~ m/(always|hourly|daily|weekly|monthly|yearly|never)/;

    $priority = "      <priority>$1</priority>\n"
        if $opts{priority}
        and $opts{priority} =~ m/^(0?\.\d+|1.0?)/;  # 0.0 -> 1.0


    #
    # Do it
    #
    print {$self->{_fh_page}} "    <url>\n      <loc>$url</loc>\n".$lastmod.$changefreq.$priority."    </url>\n";


    #
    # Log it
    #
    $self->{_count}{urls}++;
    $self->{_count}{total_urls}++;


    #
    # 50_000 is the magic number of urls for Googles sitemaps
    # - close current page and reset counter.
    #
    if ($self->{_count}{urls} > 49_999) {
        $self->{_count}{urls} = 0;
        print {$self->{_fh_page}} qq{</urlset>\n};
        undef $self->{_fh_page}; # close fh
    }

}


sub done { shift->DESTROY }
sub DESTROY {
# --------------------------------------
# - close the xml files with </end> tags
# - publish the *.xml.tmp files to *.xml
# - remove leftover *.tmp files
#
    my $self = shift;

    #
    # Make sure XML files when closed will validate
    #
    print {$self->{_fh_page}}  "</urlset>\n"       if $self->{_fh_page};
    print {$self->{_fh_index}} "</sitemapindex>\n" if $self->{_fh_index};


    #
    # Close filehandles - this would happen anyway when they go out of scope.
    #
    undef $self->{_fh_page};
    undef $self->{_fh_index};


    #
    # Publish files
    #
    if ( -e "$self->{root}/$self->{name}-index.xml.tmp") {
        move "$self->{root}/$self->{name}-index.xml.tmp", "$self->{root}/$self->{name}-index.xml";
        for my $i ( 1 .. $self->{_count}{pages} ) {
            move "$self->{root}/$self->{name}-$i.xml.tmp", "$self->{root}/$self->{name}-$i.xml";
            gzip "$self->{root}/$self->{name}-$i.xml",     "$self->{root}/$self->{name}-$i.xml.gz" if $self->{compress};
        }
    }
    else {
        warn "[error] No sitemap files built";
    }


    #
    # Cleanup build files if something went wrong and left them around
    #
    if (my @tmps = glob "$self->{root}/$self->{name}-*.xml.tmp") {
        $self->_debug("cleaning up $_") for @tmps;
        unlink @tmps;
    }


    #
    # Done!
    #
    say "created $self->{_count}{pages} pages containing $self->{_count}{total_urls} urls";
}


1;
By Alister West