#!/usr/bin/perl -w # # VW - spider v0.1 # by f roque # #spiders through site, updates db with list of each page->link found. # #todo: #will this work on alternate sites? #different hits log different user-agents, shouldnt be that way. # #expects mysql tables that look like this: # #mysql> desc map_run; #+------------+-----------+------+-----+---------+----------------+ #| Field | Type | Null | Key | Default | Extra | #+------------+-----------+------+-----+---------+----------------+ #| map_run_id | int(11) | | PRI | NULL | auto_increment | #| date | datetime | YES | | NULL | | #| url | char(100) | YES | | NULL | | #+------------+-----------+------+-----+---------+----------------+ #3 rows in set (0.00 sec) # #mysql> desc map_pages; #+--------------+-----------+------+-----+---------+----------------+ #| Field | Type | Null | Key | Default | Extra | #+--------------+-----------+------+-----+---------+----------------+ #| map_pages_id | int(11) | | PRI | NULL | auto_increment | #| page | char(100) | YES | | NULL | | #| link | char(100) | YES | | NULL | | #| type | int(1) | YES | | NULL | | #| depth | int(2) | YES | | NULL | | #| map_run | int(11) | YES | | NULL | | #| x | int(11) | YES | | NULL | | #| y | int(11) | YES | | NULL | | #+--------------+-----------+------+-----+---------+----------------+ #8 rows in set (0.00 sec) use strict; use DBI; use WWW::Robot; use Getopt::Std; use LWP::Simple; use Data::Dumper; require("/home/frisco/bin/dbgeneric"); $|=1; my %opt = ( 'h' => 0, 'v' => 0, 'u' => 'http://www.example.com/', ); unless (getopts('u:vh', \%opt) && !$opt{'h'}) { print STDERR <<"EOF"; usage: $0 [ options ] -u url url to spider through -v verbose messages -h this message EOF exit; } print STDERR time(), $/ if $opt{'v'}; my $root = $opt{'u'}; my %data = (); my @only = ( 'code/', 'art/', 'personal/', 'other/', 'about/' ); my @links = (); my @bad = (); my @good = (); my $robot = new WWW::Robot( 'NAME' => 'vw spider', 'VERSION' => '0.1', 'EMAIL' => '', 'DELAY' => 0, 'CHECK_MIME_TYPES' => 0, ); $robot->addHook('invoke-on-link', \&link); $robot->addHook('follow-url-test', \&follow_test); $robot->addHook('add-url-test', \&add_test); $robot->addHook('invoke-on-followed-url', \&followed); $robot->run($root); print STDERR "Inserting Data..." if $opt{'v'}; print STDERR $main::webdb.' '.$main::user.' '.$main::password."\n" if 0; my $dbh = DBI->connect("dbi:mysql:".$main::webdb, $main::user, $main::password); my $map_run = $dbh->selectrow_array("SELECT MAX(map_run_id) FROM map_run") +1; $dbh->do("INSERT INTO map_run VALUES ($map_run, from_unixtime(".time()."), '$root')") or die $dbh->errstr; my $sql = "INSERT INTO map_pages (page, link, type, map_run) VALUES "; $sql .= " (?, ?, ?, $map_run)"; my $sth = $dbh->prepare($sql); foreach my $key (sort keys %data) { foreach my $key2 ( sort keys %{$data{$key}} ) { my $val = ${$data{$key}}{$key2}; my $page = $key; my $link = $key2; $page =~ s/^http:\/\/www.blackant.net//; $link =~ s/^http:\/\/www.blackant.net//; $page =~ s/\/+/\//g; $link =~ s/\/+/\//g; $sth->execute($page,$link, $val); } } print STDERR "...done\n\n" if $opt{'v'}; $dbh->disconnect(); print STDERR time(), $/ if $opt{'v'}; exit; sub follow_test { my ($robot, $hook, $url) = @_; return 0 unless $url->scheme eq 'http'; return 0 if $url =~ /\.(gif|jpg|png|xbm|au|wav|mpg|class|wrl)$/i; return $url =~ m/^$root/i; } sub add_test { my ($robot, $hook, $url) = @_; return 0 if $url =~ m/index\.(?:php|html|shtml)$/i; return 0 if $url =~ m/[^:]\/\//; return $url =~ m/^$root/i; } sub link { my ($robot, $hook, $from, $to) = @_; return 0 if in_array($to, \@bad); return 0 if $to =~ /\.(gif|jpg|png|xbm|au|wav|mpg|class|wrl)$/i; if (in_array($to, \@links)) { } elsif ($to =~ m/\/\w+$/ && (in_array($to, \@links) || is_ok($to.'/'))) { $to .= '/'; push @links, $to; } elsif (is_ok($to)) { push @links, $to; } else { return 1; } my $value = 1; my $tail = $to; $tail =~ s/^$root//i; $value = 0 if $from eq $root && !in_array($tail, \@only); $data{$from}{$to} = $value; return 1; } sub followed { my ($robot, $hook, $url) = @_; print STDERR $url, $/ if $opt{'v'}; return 1; } sub is_ok { my $url = shift; return 1 if in_array($url, \@good); return 0 if in_array($url, \@bad); if ($url =~ /^$root/i) { if (head($url)) { push @good, $url; return 1; } else { push @bad, $url; return 0; } } return 0; } sub in_array { my $link = shift; my $array = shift; foreach my $val (@$array) { return 1 if ($val eq $link); } return 0; }