#!/usr/bin/perl
#
# extract-links.pl
#
# Copyright (c) 2005-2006
# Lee Feigenbaum (lee AT thefigtrees DOT net)

use HTML::Parser;
use HTTP::Request;
use LWP::UserAgent;

# One user agent, shared by every fetch.
my $ua = LWP::UserAgent->new;
$ua->agent("link-text-extractor/0.4");


# Command line: the first argument is the link-text pattern, every remaining
# argument is a URL to scan.
my $pattern = shift @ARGV;

my @urls = @ARGV;

# Require a defined, non-empty pattern rather than a merely "true" one, so
# that the pattern "0" is accepted instead of triggering the usage message.
die("Usage:\n\t$0 link-text-pattern url1 [url2] [url3] ...\n\n")
  unless defined $pattern && length $pattern && @urls;

# Compiled once; a_end matches it case-insensitively against each link's text.
my $re = qr/$pattern/i;

for my $url (@urls) {
    my $req = HTTP::Request->new("GET", $url);
    my $resp = $ua->request($req);

    # Do not scan the body of a failed response for links: report the
    # failure and carry on with the remaining URLs.
    unless ($resp->is_success) {
        warn "$0: cannot fetch $url: " . $resp->status_line . "\n";
        next;
    }

    # A fresh parser per document; report_tags restricts start/end events
    # to <a> elements.
    my $p = HTML::Parser->new(api_version => 3,
        start_h => [\&a_start, "self,tagname,attr"],
        report_tags => ['a']
    );
    $p->parse($resp->content);

    # Tell the parser the document is complete so it flushes whatever it is
    # still buffering.
    $p->eof;
}

# Href of the <a> element whose text is currently being collected; written by
# a_start and consumed by a_end.  Both subs bind to this variable when the
# file is compiled, so they can use it while the fetch loop above is running
# even though this statement's own assignment only executes afterwards.
my $current_href = '';

# Start-tag handler.  Called with the parser, the tag name and a hash ref of
# the tag's attributes.  For an <a> that carries an href it remembers the
# href, starts accumulating the text that follows, and arranges for a_end to
# run at the closing tag.  Every other tag is ignored.
sub a_start {
    my ($parser, $tagname, $attributes) = @_;

    # Nothing to do unless this is an anchor that actually links somewhere.
    if (lc($tagname) ne 'a' || !exists $attributes->{href}) {
        return;
    }

    $current_href = $attributes->{href};

    # Handing the parser an array instead of a callback makes it push each
    # piece of decoded text onto that array; a_end reads it back later.
    my $link_text = [];
    $parser->handler(text => $link_text, '@{dtext}');
    $parser->handler(end => \&a_end, "self,tagname");
}

# End-tag handler, installed by a_start for the duration of one <a href=...>
# element.  Receives the parser and the tag name.  Joins the text collected
# since the opening tag, normalises its whitespace, and prints the element's
# href on a line of its own when that text matches $re.  It then uninstalls
# the per-anchor handlers and clears $current_href.
sub a_end {
    my ($self, $tag) = @_;

    # Called with only an event name, handler() hands back what is currently
    # installed for that event -- here the array a_start set up to
    # accumulate the anchor's text.
    my $text = join("", @{$self->handler("text")});

    # Trim both ends and collapse internal runs of whitespace so the pattern
    # is matched against the link text as a reader would see it.
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    $text =~ s/\s+/ /g;
    print "$current_href\n"
        if $text =~ $re;

    # Stop collecting text now that the anchor is closed; otherwise all text
    # up to the next <a href> keeps piling up in an array nobody reads.
    $self->handler(text => undef);
    $self->handler(end => undef);
    $self->handler(start => \&a_start);
    $current_href = '';
}