#!/usr/bin/perl use strict; use LWP::UserAgent; use Getopt::Long; use URI; use Encode qw(encode_utf8 encode); use Digest::MD5 qw(md5_hex); use Net::FTP; # Copyright (c) 2010, Erik Aronesty (documentroot.com) # All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, # are permitted provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. my $ua = new LWP::UserAgent; my %opt; # this doesn't really do anything because of the sig code... maybe get rid of it $opt{time} = 60; GetOptions(\%opt, "home=s", "nav", "verbose", "src=s", "conf=s", "dest=s", "force", "all", "list", "rss=s", "other=s@", "base=s") || die usage(); if (!$opt{home}) { # get default home $opt{home} = $ENV{HOMEDRIVE} ? "$ENV{HOMEDRIVE}$ENV{HOMEPATH}" : $ENV{HOME}; $opt{home} .= '/.syncblog' if $opt{home}; $opt{home} = $ENV{SYNCBLOGHOME} if $ENV{SYNCBLOGHOME}; $opt{home} = "$ENV{TMP}/syncblog" if !$opt{home}; } if (!$opt{src} && !$opt{dest}) { $opt{src} = shift @ARGV; $opt{dest} = shift @ARGV; } if (!$opt{conf} && -s "$opt{home}/conf") { $opt{conf} = "$opt{home}/conf"; } if ($opt{conf}) { parseconf(\%opt, $opt{conf}); } if (!$opt{base}) { if ($opt{dest} =~ /public_html(.*)/) { $opt{base} = $1; } else { die "Base is required for this destination\n\n" . usage(); } } $opt{index} = 'index.html'; if ($opt{dest} =~ s|([^/]+\.html?)$||) { $opt{index} = $1; } # begins with a slash... or nothing $opt{base} =~ s/\/$//; $opt{base} =~ "/$opt{base}" if $opt{base} && $opt{base} !~ /^\// && $opt{base} !~ /^http:/; die usage() if !$opt{home} || @ARGV || !$opt{src} || !$opt{dest}; mkdir($opt{home}); $opt{src} = "http://$opt{src}" if $opt{src} !~ /^\w+:\//; $opt{src} = URI->new($opt{src})->canonical()->as_string(); my $host = URI->new($opt{src})->host; # get src (either rss or home) my $dat = cget($opt{src}, 0); sub mark; if (!$opt{all} && !$opt{force} && !cisnew($opt{src})) { mark("no change to root") if $opt{verbose}; syncup(); exit 0; } my $rss; # regexp to find rss/archive my $rss_re = qr/(rss|atom)\.(xml|rss)$/ if !$opt{rss}; my $nomatch_re = qr/''''/; # scan all links on home page my @l = $dat =~ m/\bhref\s*=\s*["']?([^'" ]+)/sig; push @l, '/rss.xml'; push @l, '/atom.xml'; push @l, '/atom.rss'; push @l, @{$opt{other}} if $opt{other}; my %in_img; my @s = $dat =~ m/\bsrc\s*=\s*["']?([^'" ]+)/sig; for (@s) {$in_img{$_} = 1;} push @l, @s; if ($opt{rss}) { # already have rss url, don't scan for it $rss_re = $nomatch_re; push @l, $opt{rss}; } die "Can't determine hotname from $opt{src}" unless $host; my %map; my %arch; my %alr; for (@l) { next if $alr{$_}; $alr{$_} = 1; next if $_ =~ /^mailto:/; $_ =~ s/#.*//; $_ =~ s|http://[^\s"'?&]+http://|http://|g; my $u = URI->new($_)->abs($opt{src}); my $get; my $uh; if (m|bp\.blogspot.com.*\/([^/]+)|) { my $name = $1; if ($in_img{$_}) { $name=~s/\./\.inl\./; $map{$_} = "$opt{base}/uploaded_images/$name"; cget($_, "/uploaded_images/$name"); } else { my $frame = cget($u->as_string(), "/uploaded_images/$name.link"); if ($frame =~ m/\bsrc\s*=\s*["']?([^'" ]+)/si) { $map{$_} = "$opt{base}/uploaded_images/$name"; cget($1, "/uploaded_images/$name"); } else { die "Google changed framing scheme... look at $frame / $_ / $u"; } } } eval {$uh=$u->host}; if (!$uh) { next; } next unless $uh eq $host; # doesn't work with weekly/daily... someone figure out how to get the list.....? if (/\/(\d{4})\/(\d{2})\//) { my ($y, $m) = ($1, $2); my $i = "${y}_${m}_01_index.html"; # try index push @l, $i unless $arch{$i}; my $i2 = "${y}_${m}_01_archive.html"; # try archive push @l, $i2 unless $arch{$i}; $arch{$i} = 1; $arch{$i2} = 1; $get = 1; } if (m^/search/label/(.*)^) { my $name = $1; $map{$_} = "$opt{base}/labels/$name.html"; cget($_, "/labels/$name.html"); next; } $get = 1 if /$rss_re/ || $arch{$_}; if ($get) { my $d = cget($u->as_string()); #print "$u->$d\n"; if ($opt{all}) { my @t = $d =~ m/\bhref\s*=\s*["']?([^'" ]+)/sig; for (@t) { next if $alr{$_}; print "$_\n"; push @l, $_; } } } } cleanup("$opt{home}/$host"); syncup(); sub syncup { my $f = "$opt{home}/".cpath($opt{src}); my $t = ((stat($f))[9]); my $p = ((stat("$f.sent"))[9]); if ($opt{dest} =~ m|^ftp:([^:]+):(.+)\@([^/]+)/(.*)|) { my ($u, $p, $h, $d) = ($1, $2, $3, $4); # ftp user:pass@host/path here my @l = ftplist("$opt{home}/$host", $h); $d =~ s/\/$//; if (@l) { my $ftp = Net::FTP->new($h, Debug => 0); $ftp->login($u, $p) || error("Can't login:" . $ftp->message); $ftp->binary(); if (!$ftp->cwd($d)) { $ftp->mkdir($d,1); $ftp->cwd($d) || error("Can't cd to $d:" . $ftp->message); } for my $curf (@l) { mark "sending $curf to $host\n" if $opt{verbose}; my $dest = $curf; $dest =~ s|^$opt{home}/$host/||; if (!$ftp->put($curf, $dest)) { if ($ftp->code eq '553') { my $path = $dest; if ($path =~ s|/[^/]+$||) { print "makding $dest dir\n" if $opt{verbose}; $ftp->mkdir($path, 1); } } if (!$ftp->put($curf, $dest)) { mark("Can't put $dest: (" . $ftp->code . ") " . $ftp->message); next; } } mark $ftp->message; touch("$curf.sent"); } } } elsif ($opt{dest} =~ m|^ftp:|) { mark("unknown ftp url format"); exit 1; } else { if ($t > $p || $opt{force}) { # rsync works...! my $ex = "--dry-run" if $opt{list}; my $cmd = "rsync $ex -t -r --size-only --exclude=*.sent --exclude=*.etag --exclude=*.clean --exclude=*.sig --exclude=*.link '$opt{home}/$host/' '$opt{dest}/'"; mark($cmd) if $opt{verbose}; system($cmd); if (!$opt{list}) { touch("$f.sent"); } } } } sub touch { my $f = shift; my $t = shift; open OUT, ">$f"; close OUT; if ($t) { utime($t, $t, $f); } } sub ftplist { my @l; my $d = shift; my $dest = shift; opendir(D, $d) || die "can't read $d: $!"; my @d = readdir(D); closedir D; for my $f (@d) { next if $f =~ /^\./; next if $f =~ /\.(clean|link|sig|etag|sent)$/; $f = "$d/$f"; if (-d $f) { push @l, ftplist($f, $dest); next; } my $t = (stat($f))[9]; if (! -e "$f.sent") { my $u = $f; $u =~ s/.*\/$host\///; my $ur = $ua->head("http://$dest/$u"); if ($ur->is_success()) { my $len = $ur->header("Content-Length"); if ($len == (-s "$f")) { mark("skipping $f\n") if $opt{verbose}; touch("$f.sent"); } } touch("$f.sent", $t-1); } if ($opt{force} || ($t > (stat("$f.sent"))[9])) { push @l, $f; } } return @l; } sub cleanup { my $d = shift; opendir(D, $d) || die "can't read $d: $!"; my @d = readdir(D); closedir D; for my $f (@d) { next if $f =~ /^\./; next if $f =~ /\.(clean|link|sig|etag|sent)$/; $f = "$d/$f"; if (-d $f) { cleanup($f); next; } next unless $f =~ /(html|htm|xml|rss)$/i; my $t = (stat($f))[9]; if ($opt{force} || ($t > (stat("$f.clean"))[9])) { print "clean $f\n" if $opt{verbose}; my $d = slurp($f); if (!$opt{nav}) { $d =~ s|]*>\@import *url\(http://www.blogger[^;]*?navbar/[^;]*?.css\);.*?||gsi; $d =~ s|||gsi; } $d =~ s|.*%expand%|%expand%|s; $d =~ s|||sg; $d =~ s|([^"']+search/label/[^"']+)|$map{$1}?$map{$1}:$1|ge; $d =~ s|([^"']+bp\.blogspot\.com/[^"']+)|$map{$1}?$map{$1}:$1|ge; $d =~ s|http:\/\/$host/|$opt{base}/|g; $d =~ s|http://[^\s"'?&]+http://|http://|g; if (!burp($f, $d)) { unlink $f; } utime($t, $t, $f); open(T,">$f.clean"); close T; } } } sub parseconf { my ($opt, $file) = @_; open(IN, '<'.$file) || die "syncblogger error: can't open conf $file: $!"; while () { next if /^#/; next if /^\s+$/; die "syncblogger error: configuration line $. of $file: needs NAME=VALUE" if !m|^([^=]+?)\s*=\s*(.*)$|; my ($n, $v) = (lc($1), $2); if ($n eq 'conf') { parseconf($opt, $v); } else { $opt->{$n} = $v; } } close IN; } sub usage { return <<'EOF'; USAGE: syncblogger [options] [-src] source-url [-dest] dest-url Options (can be shortened, default in parens): -src domain/path of source blog's rss feed (http:// can be omitted) -dest [[ftp|rsync|file]:][user:pass@][host:]/path url of destination -conf path to config file (/conf) -home syncblogger's home directory (<$ENV{HOME}>/.syncblog) -rss rss file location (default match [rss|atom].[xml|rss]) -other other blogger files (specify multiple paths if needed) -nav default is off ... specify -nav to turn it on -duser default dest username -dpass default dest password (so you can keep it in one place in the config) -ping ping weblog indexes on update -base re-base urls to this url (default: dest path minus public_html) -list list the files that would be copied, but don't do it -verbose for debugging Config file consists of name=value pairs (use conf=x for an include file). Config options don't start with a dash (-). Config lines starting with # are ignored. Command line options override config options. Later config options override earlier ones. If dest protocol is omitted file: is assumed on a url that starts with / (slash), rsync: is assumed otherwise. If dest is rsync:user@host:/path, then everything works better, use it! Example: syncblogger altroot.blogspot.com /var/www/hosts/documentroot.com/public_html Or: syncblogger altroot.blogspot.com erik@documentroot.com:public_html Or Just: syncblogger -c /etc/syncblogger.conf EOF } sub ping { my $cmd = "wget -q -O - 'http://rpc.weblogs.com/pingSiteForm?name=Bracing+Against+the+Wind&url=http://www.documentroot.com&xmlUrl=http://www.documentroot.com/rss.xml&category=Technology' | grep Thanks > /dev/null"; my $cmd2 = "wget -q -O - 'http://blogsearch.google.com/ping?name=Bracing+Against+the+Wind&url=http://www.documentroot.com&changesURL=http://www.documentroot.com/rss.xml' | grep Thanks > /dev/null"; my $cmd3 = "wget -q -O - 'http://api.moreover.com/ping?u=http://www.documentroot.com/rss.xml' | grep Thank > /dev/null"; } sub cisnew { my ($u, $n) = @_; my $f = "$opt{home}/" . cpath($u, $n); my $t = ((stat($f))[9]); if ($t > (stat("$f.clean"))[9]) { return 1; } } sub cpath { my ($u, $n) = @_; if (!$n) { $n = $u; $n =~ s|^https?://(\w+@\w+:)?||i; $n = $n . "/" if $n !~ /\//; $n = $n . $opt{index} if $n eq '/'; $n = $n . $opt{index} if $n =~ /\/$/; } else { $n=~ s/^\///; $n = "$host/$n"; } return $n; } sub cget { my ($u, $n, $t) = @_; $t = $opt{time} if !defined($t); $n = cpath($u, $n); # make path my $d; my @p = split('/', $n); pop @p; for (@p) { $d .= "/$_"; mkdir("$opt{home}$d"); } my $f = "$opt{home}/$n"; my $r; if (((-s $f) == 0) || ((stat($f))[9] < (time()-$t))) { mark "getting $u to $f" if $opt{verbose}; my %head; my $et = slurp("$f.etag"); if ($et) { $head{"If-None-Match"}=$et; } my $ur = $ua->get($u, %head); if ($ur->is_success()) { $r = $ur->decoded_content(); my $sig = md5_hex(encode('c',$r)); my $p_sig = slurp("$f.sig"); if (!($sig eq $p_sig)) { if (burp($f, $r)) { burp("$f.sig", $sig); } } burp("$f.etag", $ur->header('ETag')) if $ur->header('ETag'); } elsif ( $ur->code() eq '304' ) { mark "etag not modified $f" if $opt{verbose}; } else { mark "can't get $u to $f: $!" unless $u =~ /((\d_(archive|index).html)|(atom.rss))$/; } } else { $r = slurp($f); } return $r; } sub slurp { local $/=undef; my $in = new IO::File; open($in, $_[0]) || return undef; binmode $in, ":utf8"; my $dat = <$in>; return $dat; } sub burp { my ($fil, $content) = @_; my $out = new IO::File; open($out, ">$fil") || (mark("$!") && return undef); binmode($out, ":utf8"); if ($fil =~ /\.(htm|html|xml)$/) { print $out $content; } else { print $out $content; } close $out; return 1; } sub error { mark(@_); exit 1; } sub mark { my @m = @_; for (@m) {s/\n/ /g} print scalar(localtime), "\t", @m, "\n"; }