#!/usr/bin/perl

use LWP::Simple;
$site = "http://www.wellho.net";

# Sample (simplified code!) to reduce all URLs to a full, standard format
sub canonicalise {
        my $source = $_[0];
        $canon = "";
        return ($canon) if ($source =~ /^(mailto:|telnet:|callto:)/) ;
        if ($source =~ /^\//) {
                $source = $site . $source;
                }
        $source =~ s/[\#?].*//;
        return $source;
        }

# Grab a web page and check the links from it!

foreach $page(@ARGV) {
        print "Checking $page\n";
        $html = get($page); # Read web page - from LWP::Simple
        @urls = $html =~ /\shref="?([^\s>"]+)/gi ;
        }

@externals = ();
@internals = ();

foreach $url(@urls) {
        $full = canonicalise($url);
        next unless($full);

        $counter{$full}++;

        if ($full !~ /^$site/) {
                push @externals,$full;
        } else {
                push @internals,$full;
        }

}

print "Internal links: \n";
foreach $in(@internals) {
        print "$in\n";
        }
print "External links: \n";
foreach $ex(@externals) {
        print "$ex\n";
        }

print "Duplicate Testing: \n";
foreach $url(keys %counter) {
        if ($counter{$url} > 1) {
                print "$counter{$url} links to $url\n";
        }
}

__END__

Run the Script
e.g
perl lwp_fetch_links.pl http://www.flipkart.com/mobiles
Previous Post Next Post