#!/usr/bin/perl -w
#
#usage: ./google-query [term]
#
#does a google search for [term], fetches every non-google page returned, and
#follows through to all of google's result pages. note that the script doesn't
#URL-escape [term], so you have to do that part yourself.
#example: ./google-query sid+%26+nancy
#is a search for 'sid & nancy'
#
#written by francisco, http://www.blackant.net/
#
#From my understanding of google's Terms of Service, I don't think personal use
#of this script violates those terms, but I'm no lawyer, so use at your own peril
use strict;
use HTML::Parser;
use LWP::UserAgent;
use LWP::Simple;
# Main script: run a google search for the query given on the command line,
# fetch every non-google link found on each result page, and keep following
# google's own /search links until no unvisited one is left.

# Turn on autoflush for the selected output handle so progress lines show up
# as soon as they are printed.
$|=1;
#when i wrote this, my website was the first return for this search.
my $query = shift;
# Fall back to the default only when no usable term was given; a plain
# truthiness test would also throw away a literal "0" query.
$query = 'porn+junkie' unless defined $query && length $query;
#increase to be more polite, decrease to be ruder
my $sleep = 1;
my $base = 'http://www.google.com';
my $subbase = '/search?num=100&q=';
my $url = $base.$subbase.$query;
my $res = '';
my @pstuff = ();
my @followed = ();
my $req = HTTP::Request->new('GET', $url);
my $ua = LWP::UserAgent->new;
$ua->agent('Mozilla/5.0');
#run through each page of google search returns, hopefully
do {
    # Start every page with an empty event list. Without this, the links
    # gathered from earlier pages stay in @pstuff and are processed (and
    # fetched) again on each later page.
    @pstuff = ();
    my $p = HTML::Parser->new(
        api_version     => 3,
        # An array reference as handler makes the parser append one
        # [tagname, attr] pair to @pstuff for every start tag it sees.
        start_h         => [\@pstuff, 'tagname, attr'],
        marked_sections => 1,
    );
    $res = $ua->request($req);
    if ($res->is_success) {
        $p->parse($res->content);
        # Signal end of document so the parser flushes anything it buffered.
        $p->eof;
    }
    else {
        # A failed request leaves @pstuff empty and $url unchanged, so the
        # loop ends after this pass; say why instead of stopping silently.
        warn "request failed: ", $url, " (", $res->status_line, ")\n";
    }
    push @followed, $url;
    print "parsing: ", $url, "\n";
    LINKS: foreach my $v (@pstuff) {
        my ($tag, $attr) = @{$v};
        #we only deal with <a> tags (an exact match, so <area>, <abbr>,
        #<address> and friends are skipped)
        next LINKS unless lc($tag) eq 'a';
        #or more specifically, with <a href="..."> tags
        my $found = $attr->{'href'};
        next LINKS unless defined $found;
        #a google page (href without an http:// or https:// scheme)
        if ($found !~ m{^https?://}i) {
            if ($found =~ m{^/search}) {
                # @followed holds absolute URLs, so the "already followed"
                # test has to be made on the absolute form of the link.
                my $next = $base.$found;
                if (!in_array($next, \@followed)) {
                    $url = $next;
                    $req = HTTP::Request->new('GET', $url);
                }
            }
        }
        #a non google page
        elsif ($found !~ /\.google\.com/) {
            print "getting: ", $found, "\n";
            #if you want to do something with the data you
            #get, do it here - e.g. my $data = get($found)
            #and then parse $data for whatever.
            get($found);
            sleep $sleep;
        }
    }
    # $url only changes when an unfollowed /search link was found above, so
    # the loop stops once the current page offers nothing new.
} while (!in_array($url, \@followed));
# in_array($link, \@array)
# Membership test using string equality.
#   $link  - the value to look for
#   $array - reference to the array to search
# Returns 1 as soon as an element that is 'eq' to $link is found, and 0 when
# no element matches (which includes an empty array).
sub in_array {
    my ($link, $array) = @_;
    for my $candidate (@{$array}) {
        # Skip anything that differs; the first element that survives is a hit.
        next if $candidate ne $link;
        return 1;
    }
    return 0;
}