#!/usr/bin/perl
use LWP::Simple;
$site = "http://www.wellho.net";
# Sample (simplified code!) to reduce all URLs to a full, standard format
sub canonicalise {
my $source = $_[0];
$canon = "";
return ($canon) if ($source =~ /^(mailto:|telnet:|callto:)/) ;
if ($source =~ /^\//) {
$source = $site . $source;
}
$source =~ s/[\#?].*//;
return $source;
}
# Grab a web page and check the links from it!
foreach $page(@ARGV) {
print "Checking $page\n";
$html = get($page); # Read web page - from LWP::Simple
@urls = $html =~ /\shref="?([^\s>"]+)/gi ;
}
@externals = ();
@internals = ();
foreach $url(@urls) {
$full = canonicalise($url);
next unless($full);
$counter{$full}++;
if ($full !~ /^$site/) {
push @externals,$full;
} else {
push @internals,$full;
}
}
print "Internal links: \n";
foreach $in(@internals) {
print "$in\n";
}
print "External links: \n";
foreach $ex(@externals) {
print "$ex\n";
}
print "Duplicate Testing: \n";
foreach $url(keys %counter) {
if ($counter{$url} > 1) {
print "$counter{$url} links to $url\n";
}
}
__END__
Run the Script
e.g
perl lwp_fetch_links.pl http://www.flipkart.com/mobiles
use LWP::Simple;
$site = "http://www.wellho.net";
# Sample (simplified code!) to reduce all URLs to a full, standard format
sub canonicalise {
my $source = $_[0];
$canon = "";
return ($canon) if ($source =~ /^(mailto:|telnet:|callto:)/) ;
if ($source =~ /^\//) {
$source = $site . $source;
}
$source =~ s/[\#?].*//;
return $source;
}
# Grab a web page and check the links from it!
foreach $page(@ARGV) {
print "Checking $page\n";
$html = get($page); # Read web page - from LWP::Simple
@urls = $html =~ /\shref="?([^\s>"]+)/gi ;
}
@externals = ();
@internals = ();
foreach $url(@urls) {
$full = canonicalise($url);
next unless($full);
$counter{$full}++;
if ($full !~ /^$site/) {
push @externals,$full;
} else {
push @internals,$full;
}
}
print "Internal links: \n";
foreach $in(@internals) {
print "$in\n";
}
print "External links: \n";
foreach $ex(@externals) {
print "$ex\n";
}
print "Duplicate Testing: \n";
foreach $url(keys %counter) {
if ($counter{$url} > 1) {
print "$counter{$url} links to $url\n";
}
}
__END__
Run the Script
e.g
perl lwp_fetch_links.pl http://www.flipkart.com/mobiles