#!/usr/bin/perl
use strict;
use LWP::UserAgent;
use Getopt::Long;
use URI;
use Encode qw(encode_utf8 encode);
use Digest::MD5 qw(md5_hex);
use Net::FTP;
# Copyright (c) 2010, Erik Aronesty (documentroot.com)
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
my $ua = new LWP::UserAgent;
my %opt;
# this doesn't really do anything because of the sig code... maybe get rid of it
$opt{time} = 60;
GetOptions(\%opt, "home=s", "nav", "verbose", "src=s", "conf=s", "dest=s", "force", "all", "list", "rss=s", "other=s@", "base=s")
|| die usage();
if (!$opt{home}) {
# get default home
$opt{home} = $ENV{HOMEDRIVE} ? "$ENV{HOMEDRIVE}$ENV{HOMEPATH}" : $ENV{HOME};
$opt{home} .= '/.syncblog' if $opt{home};
$opt{home} = $ENV{SYNCBLOGHOME} if $ENV{SYNCBLOGHOME};
$opt{home} = "$ENV{TMP}/syncblog" if !$opt{home};
}
if (!$opt{src} && !$opt{dest}) {
$opt{src} = shift @ARGV;
$opt{dest} = shift @ARGV;
}
if (!$opt{conf} && -s "$opt{home}/conf") {
$opt{conf} = "$opt{home}/conf";
}
if ($opt{conf}) {
parseconf(\%opt, $opt{conf});
}
if (!$opt{base}) {
if ($opt{dest} =~ /public_html(.*)/) {
$opt{base} = $1;
} else {
die "Base is required for this destination\n\n" . usage();
}
}
$opt{index} = 'index.html';
if ($opt{dest} =~ s|([^/]+\.html?)$||) {
$opt{index} = $1;
}
# begins with a slash... or nothing
$opt{base} =~ s/\/$//;
$opt{base} =~ "/$opt{base}" if $opt{base} && $opt{base} !~ /^\// && $opt{base} !~ /^http:/;
die usage() if !$opt{home} || @ARGV || !$opt{src} || !$opt{dest};
mkdir($opt{home});
$opt{src} = "http://$opt{src}" if $opt{src} !~ /^\w+:\//;
$opt{src} = URI->new($opt{src})->canonical()->as_string();
my $host = URI->new($opt{src})->host;
# get src (either rss or home)
my $dat = cget($opt{src}, 0);
sub mark;
if (!$opt{all} && !$opt{force} && !cisnew($opt{src})) {
mark("no change to root") if $opt{verbose};
syncup();
exit 0;
}
my $rss;
# regexp to find rss/archive
my $rss_re = qr/(rss|atom)\.(xml|rss)$/ if !$opt{rss};
my $nomatch_re = qr/''''/;
# scan all links on home page
my @l = $dat =~ m/\bhref\s*=\s*["']?([^'" ]+)/sig;
push @l, '/rss.xml';
push @l, '/atom.xml';
push @l, '/atom.rss';
push @l, @{$opt{other}} if $opt{other};
my %in_img;
my @s = $dat =~ m/\bsrc\s*=\s*["']?([^'" ]+)/sig;
for (@s) {$in_img{$_} = 1;}
push @l, @s;
if ($opt{rss}) {
# already have rss url, don't scan for it
$rss_re = $nomatch_re;
push @l, $opt{rss};
}
die "Can't determine hotname from $opt{src}" unless $host;
my %map;
my %arch;
my %alr;
for (@l) {
next if $alr{$_};
$alr{$_} = 1;
next if $_ =~ /^mailto:/;
$_ =~ s/#.*//;
$_ =~ s|http://[^\s"'?&]+http://|http://|g;
my $u = URI->new($_)->abs($opt{src});
my $get;
my $uh;
if (m|bp\.blogspot.com.*\/([^/]+)|) {
my $name = $1;
if ($in_img{$_}) {
$name=~s/\./\.inl\./;
$map{$_} = "$opt{base}/uploaded_images/$name";
cget($_, "/uploaded_images/$name");
} else {
my $frame = cget($u->as_string(), "/uploaded_images/$name.link");
if ($frame =~ m/\bsrc\s*=\s*["']?([^'" ]+)/si) {
$map{$_} = "$opt{base}/uploaded_images/$name";
cget($1, "/uploaded_images/$name");
} else {
die "Google changed framing scheme... look at $frame / $_ / $u";
}
}
}
eval {$uh=$u->host};
if (!$uh) {
next;
}
next unless $uh eq $host;
# doesn't work with weekly/daily... someone figure out how to get the list.....?
if (/\/(\d{4})\/(\d{2})\//) {
my ($y, $m) = ($1, $2);
my $i = "${y}_${m}_01_index.html"; # try index
push @l, $i unless $arch{$i};
my $i2 = "${y}_${m}_01_archive.html"; # try archive
push @l, $i2 unless $arch{$i};
$arch{$i} = 1;
$arch{$i2} = 1;
$get = 1;
}
if (m^/search/label/(.*)^) {
my $name = $1;
$map{$_} = "$opt{base}/labels/$name.html";
cget($_, "/labels/$name.html");
next;
}
$get = 1 if /$rss_re/ || $arch{$_};
if ($get) {
my $d = cget($u->as_string());
#print "$u->$d\n";
if ($opt{all}) {
my @t = $d =~ m/\bhref\s*=\s*["']?([^'" ]+)/sig;
for (@t) {
next if $alr{$_};
print "$_\n";
push @l, $_;
}
}
}
}
cleanup("$opt{home}/$host");
syncup();
sub syncup {
my $f = "$opt{home}/".cpath($opt{src});
my $t = ((stat($f))[9]);
my $p = ((stat("$f.sent"))[9]);
if ($opt{dest} =~ m|^ftp:([^:]+):(.+)\@([^/]+)/(.*)|) {
my ($u, $p, $h, $d) = ($1, $2, $3, $4);
# ftp user:pass@host/path here
my @l = ftplist("$opt{home}/$host", $h);
$d =~ s/\/$//;
if (@l) {
my $ftp = Net::FTP->new($h, Debug => 0);
$ftp->login($u, $p) || error("Can't login:" . $ftp->message);
$ftp->binary();
if (!$ftp->cwd($d)) {
$ftp->mkdir($d,1);
$ftp->cwd($d) || error("Can't cd to $d:" . $ftp->message);
}
for my $curf (@l) {
mark "sending $curf to $host\n" if $opt{verbose};
my $dest = $curf;
$dest =~ s|^$opt{home}/$host/||;
if (!$ftp->put($curf, $dest)) {
if ($ftp->code eq '553') {
my $path = $dest;
if ($path =~ s|/[^/]+$||) {
print "makding $dest dir\n" if $opt{verbose};
$ftp->mkdir($path, 1);
}
}
if (!$ftp->put($curf, $dest)) {
mark("Can't put $dest: (" . $ftp->code . ") " . $ftp->message);
next;
}
}
mark $ftp->message;
touch("$curf.sent");
}
}
} elsif ($opt{dest} =~ m|^ftp:|) {
mark("unknown ftp url format");
exit 1;
} else {
if ($t > $p || $opt{force}) {
# rsync works...!
my $ex = "--dry-run" if $opt{list};
my $cmd = "rsync $ex -t -r --size-only --exclude=*.sent --exclude=*.etag --exclude=*.clean --exclude=*.sig --exclude=*.link '$opt{home}/$host/' '$opt{dest}/'";
mark($cmd) if $opt{verbose};
system($cmd);
if (!$opt{list}) {
touch("$f.sent");
}
}
}
}
sub touch {
my $f = shift;
my $t = shift;
open OUT, ">$f"; close OUT;
if ($t) {
utime($t, $t, $f);
}
}
sub ftplist {
my @l;
my $d = shift;
my $dest = shift;
opendir(D, $d) || die "can't read $d: $!";
my @d = readdir(D);
closedir D;
for my $f (@d) {
next if $f =~ /^\./;
next if $f =~ /\.(clean|link|sig|etag|sent)$/;
$f = "$d/$f";
if (-d $f) {
push @l, ftplist($f, $dest);
next;
}
my $t = (stat($f))[9];
if (! -e "$f.sent") {
my $u = $f;
$u =~ s/.*\/$host\///;
my $ur = $ua->head("http://$dest/$u");
if ($ur->is_success()) {
my $len = $ur->header("Content-Length");
if ($len == (-s "$f")) {
mark("skipping $f\n") if $opt{verbose};
touch("$f.sent");
}
}
touch("$f.sent", $t-1);
}
if ($opt{force} || ($t > (stat("$f.sent"))[9])) {
push @l, $f;
}
}
return @l;
}
sub cleanup {
my $d = shift;
opendir(D, $d) || die "can't read $d: $!";
my @d = readdir(D);
closedir D;
for my $f (@d) {
next if $f =~ /^\./;
next if $f =~ /\.(clean|link|sig|etag|sent)$/;
$f = "$d/$f";
if (-d $f) {
cleanup($f);
next;
}
next unless $f =~ /(html|htm|xml|rss)$/i;
my $t = (stat($f))[9];
if ($opt{force} || ($t > (stat("$f.clean"))[9])) {
print "clean $f\n" if $opt{verbose};
my $d = slurp($f);
if (!$opt{nav}) {
$d =~ s|||gsi;
$d =~ s|||gsi;
}
$d =~ s|.*%expand%|%expand%|s;
$d =~ s|||sg;
$d =~ s|([^"']+search/label/[^"']+)|$map{$1}?$map{$1}:$1|ge;
$d =~ s|([^"']+bp\.blogspot\.com/[^"']+)|$map{$1}?$map{$1}:$1|ge;
$d =~ s|http:\/\/$host/|$opt{base}/|g;
$d =~ s|http://[^\s"'?&]+http://|http://|g;
if (!burp($f, $d)) {
unlink $f;
}
utime($t, $t, $f);
open(T,">$f.clean"); close T;
}
}
}
sub parseconf {
my ($opt, $file) = @_;
open(IN, '<'.$file) || die "syncblogger error: can't open conf $file: $!";
while () {
next if /^#/;
next if /^\s+$/;
die "syncblogger error: configuration line $. of $file: needs NAME=VALUE"
if !m|^([^=]+?)\s*=\s*(.*)$|;
my ($n, $v) = (lc($1), $2);
if ($n eq 'conf') {
parseconf($opt, $v);
} else {
$opt->{$n} = $v;
}
}
close IN;
}
sub usage {
return <<'EOF';
USAGE: syncblogger [options] [-src] source-url [-dest] dest-url
Options (can be shortened, default in parens):
-src domain/path of source blog's rss feed (http:// can be omitted)
-dest [[ftp|rsync|file]:][user:pass@][host:]/path url of destination
-conf path to config file (/conf)
-home syncblogger's home directory (<$ENV{HOME}>/.syncblog)
-rss rss file location (default match [rss|atom].[xml|rss])
-other other blogger files (specify multiple paths if needed)
-nav default is off ... specify -nav to turn it on
-duser default dest username
-dpass default dest password (so you can keep it in one place in the config)
-ping ping weblog indexes on update
-base re-base urls to this url (default: dest path minus public_html)
-list list the files that would be copied, but don't do it
-verbose for debugging
Config file consists of name=value pairs (use conf=x for an include file).
Config options don't start with a dash (-).
Config lines starting with # are ignored.
Command line options override config options.
Later config options override earlier ones.
If dest protocol is omitted file: is assumed on a url that starts with / (slash),
rsync: is assumed otherwise.
If dest is rsync:user@host:/path, then everything works better, use it!
Example: syncblogger altroot.blogspot.com /var/www/hosts/documentroot.com/public_html
Or: syncblogger altroot.blogspot.com erik@documentroot.com:public_html
Or Just: syncblogger -c /etc/syncblogger.conf
EOF
}
sub ping {
my $cmd = "wget -q -O - 'http://rpc.weblogs.com/pingSiteForm?name=Bracing+Against+the+Wind&url=http://www.documentroot.com&xmlUrl=http://www.documentroot.com/rss.xml&category=Technology' | grep Thanks > /dev/null";
my $cmd2 = "wget -q -O - 'http://blogsearch.google.com/ping?name=Bracing+Against+the+Wind&url=http://www.documentroot.com&changesURL=http://www.documentroot.com/rss.xml' | grep Thanks > /dev/null";
my $cmd3 = "wget -q -O - 'http://api.moreover.com/ping?u=http://www.documentroot.com/rss.xml' | grep Thank > /dev/null";
}
sub cisnew {
my ($u, $n) = @_;
my $f = "$opt{home}/" . cpath($u, $n);
my $t = ((stat($f))[9]);
if ($t > (stat("$f.clean"))[9]) {
return 1;
}
}
sub cpath {
my ($u, $n) = @_;
if (!$n) {
$n = $u;
$n =~ s|^https?://(\w+@\w+:)?||i;
$n = $n . "/" if $n !~ /\//;
$n = $n . $opt{index} if $n eq '/';
$n = $n . $opt{index} if $n =~ /\/$/;
} else {
$n=~ s/^\///;
$n = "$host/$n";
}
return $n;
}
sub cget {
my ($u, $n, $t) = @_;
$t = $opt{time} if !defined($t);
$n = cpath($u, $n);
# make path
my $d;
my @p = split('/', $n); pop @p;
for (@p) {
$d .= "/$_";
mkdir("$opt{home}$d");
}
my $f = "$opt{home}/$n";
my $r;
if (((-s $f) == 0) || ((stat($f))[9] < (time()-$t))) {
mark "getting $u to $f" if $opt{verbose};
my %head;
my $et = slurp("$f.etag");
if ($et) {
$head{"If-None-Match"}=$et;
}
my $ur = $ua->get($u, %head);
if ($ur->is_success()) {
$r = $ur->decoded_content();
my $sig = md5_hex(encode('c',$r));
my $p_sig = slurp("$f.sig");
if (!($sig eq $p_sig)) {
if (burp($f, $r)) {
burp("$f.sig", $sig);
}
}
burp("$f.etag", $ur->header('ETag')) if $ur->header('ETag');
} elsif ( $ur->code() eq '304' ) {
mark "etag not modified $f" if $opt{verbose};
} else {
mark "can't get $u to $f: $!" unless $u =~ /((\d_(archive|index).html)|(atom.rss))$/;
}
} else {
$r = slurp($f);
}
return $r;
}
sub slurp
{
local $/=undef;
my $in = new IO::File;
open($in, $_[0]) || return undef;
binmode $in, ":utf8";
my $dat = <$in>;
return $dat;
}
sub burp {
my ($fil, $content) = @_;
my $out = new IO::File;
open($out, ">$fil") || (mark("$!") && return undef);
binmode($out, ":utf8");
if ($fil =~ /\.(htm|html|xml)$/) {
print $out $content;
} else {
print $out $content;
}
close $out;
return 1;
}
sub error {
mark(@_);
exit 1;
}
sub mark {
my @m = @_;
for (@m) {s/\n/ /g}
print scalar(localtime), "\t", @m, "\n";
}