#!/usr/bin/perl
#
# extract-links.pl
#
# Copyright (c) 2005-2006
# Lee Feigenbaum (lee AT thefigtrees DOT net)
#
# Fetches each URL given on the command line and prints the href of every
# <a> element whose link text matches the given pattern, one href per line.
# The pattern is a Perl regular expression matched case-insensitively
# against the link text after leading/trailing whitespace has been removed
# and internal whitespace runs have been collapsed to single spaces.

use strict;
use warnings;

use HTML::Parser;
use HTTP::Request;
use LWP::UserAgent;

my $ua = LWP::UserAgent->new;
$ua->agent("link-text-extractor/0.4");

my $pattern = shift @ARGV;
my @urls    = @ARGV;

# Test for defined/non-empty rather than truth so that the pattern "0" is
# accepted.
die("Usage:\n\t$0 link-text-pattern url1 [url2] [url3] ...\n\n")
    unless defined $pattern && length $pattern && @urls;

# Compile once up front; report a malformed pattern with some context
# instead of letting the raw regex compilation error escape.
my $re = eval { qr/$pattern/i };
die "Invalid link-text pattern '$pattern': $@" unless defined $re;

# href of the <a> element currently being collected. Shared between
# a_start and a_end; initialised before any parsing takes place.
my $current_href = '';

for my $url (@urls) {
    my $req  = HTTP::Request->new("GET", $url);
    my $resp = $ua->request($req);

    # Do not parse the body of a failed request (it would be an error page,
    # not the requested document); report it and move on to the next URL.
    unless ($resp->is_success) {
        warn "Could not fetch $url: " . $resp->status_line . "\n";
        next;
    }

    my $p = HTML::Parser->new(
        api_version => 3,
        start_h     => [\&a_start, "self,tagname,attr"],
        report_tags => ['a'],
    );

    $p->parse($resp->content);
    $p->eof;    # signal end of document so buffered trailing text is flushed
}

# Start-tag handler. For an <a> element that carries an href attribute,
# remembers the href, begins accumulating the decoded text that follows,
# and installs a_end as the end-tag handler.
#
#   $self - the HTML::Parser object
#   $tag  - tag name of the element being opened
#   $attr - hash reference of the element's attributes
sub a_start {
    my ($self, $tag, $attr) = @_;

    return unless lc($tag) eq 'a';
    return unless exists $attr->{href};

    $current_href = $attr->{href};

    # A fresh array accumulates the text chunks seen until the end tag.
    $self->handler(text => [], '@{dtext}');
    $self->handler(end  => \&a_end, "self,tagname");

    return;
}

# End-tag handler, only installed while a link is being collected. Joins
# the accumulated text, normalises its whitespace, prints the remembered
# href when the text matches the pattern, and then stops collecting.
#
#   $self - the HTML::Parser object
#   $tag  - tag name of the element being closed (unused)
sub a_end {
    my ($self, $tag) = @_;

    my $chunks = $self->handler("text");
    my $text   = ref $chunks eq 'ARRAY' ? join("", @{$chunks}) : '';

    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    $text =~ s/\s+/ /g;

    print "$current_href\n" if $text =~ $re;

    # Stop collecting until the next <a href=...>. Disabling the text
    # handler as well avoids accumulating all the text between links. The
    # start handler installed at construction time is never removed, so it
    # does not need to be registered again.
    $self->handler(end  => undef);
    $self->handler(text => undef);
    $current_href = '';

    return;
}