#!/usr/bin/perl my $URL = shift(@ARGV); my $Wildcard = shift(@ARGV); my $UrlPrefixFile = shift(@ARGV); my $FileNameFile = shift(@ARGV); my $DirNameFile = shift(@ARGV); unless ($URL) { print "$0 URL WILDCARD [URLPREFIX_FILE [FILENAME_FILE] [DIRNAME_FILE]]\n"; print " Will look for download on website $URL that matches given wildcard and has highest version number.\n"; print "\n"; print " URL website that lists file downloads on an index page\n"; print " WILDCARD regular expression identifying interesting files\n"; print " URLPREFIX_FILE file where to write url-prefix of found candidate to (optional)\n"; print " FILENAME_FILE file where to write filename of found candidate to (optional)\n"; print " DIRNAME_FILE file where to write name of directory of lowest level (optional)\n"; print "\n"; print " Examples\n"; print " find newest binutils*bz2 file:\n"; print " $0 http://ftp.gnu.org/gnu/binutils/ \"binutils-2\..*bz2\$\"\n"; print "\n"; print "\n"; exit 10; } my $TempFile="tmp.FindNewestDownload"; if (1) { system("rm -f $TempFile; wget -O \"$TempFile\" \"$URL\" >log.FindNewestDownload 2>&1"); } #? if (substr($URL, -1) eq '/') { substr($URL, -1) = undef; } my $URL_Base = $URL; my $URL_Base_EndedWithSlash = 0; if (substr($URL_Base, -1) eq '/') { substr($URL_Base, -1) = undef; $URL_Base_EndedWithSlash = 1; } else { my $LastSlashPos = rindex($URL_Base, '/'); substr($URL_Base, $LastSlashPos) = undef; $URL_Base_EndedWithSlash = 1; } my $FirstSlashPos = index($URL_Base, '/', index($URL_Base, '//') + 2); my $Domain; if ($FirstSlashPos > -1) { $Domain = substr($URL_Base, 0, $FirstSlashPos); } # remove path from URL_Base else { $Domain = $URL_Base; } # bad: http://sourceforge.net/projects/warzone2100/files/latest/download?source=files # good:http://sourceforge.net/projects/warzone2100/files/releases/ my $Page; open(IN, "<$TempFile") or die("$0: ERROR - Cannot read file '$TempFile' downloaded from '$URL' ($!)"); while () { $Page .= $_; } close(IN); $Page =~ s/ -1) { # find all candidates matching given wildcard my %Attributes = extractTagAttributes($Page, $LinkPos); my $EndLinkPos = index($PAGE, ' -1) { my $LinkText = strip( substr($Page, $Attributes{'.EndPos'}, $EndLinkPos - $Attributes{'.EndPos'}) ); #$LinkText =~s/(<[^>]>)//g; # remove all <..> from $LinkText my $IsCandidate = 1; if (0) { print "\$LinkText='$LinkText'\n"; } if ($Wildcard) { unless ($LinkText =~ m/$Wildcard/) { $IsCandidate = 0; } } if ($IsCandidate) { my %Link = ( href => $Attributes{href}, text => $LinkText ); # print "\%Link = ( href => $Attributes{href}, text => $LinkText );\n"; #D push @Candidates, \%Link; } } else { $EndLinkPos = $LinkPos + 1; } # avoid endless loop if (0) { #??? my $String = substr($Page, $LinkPos + 2); my $GtPos = index($String, '>'); my $FirstPart = substr($String, 0, $GtPos); # 'href="...." ...' my $HrefPos = index(lc($FirstPart), 'href'); if (1 && ($HrefPos > -1) ) { my $Quote1Pos = index($FirstPart, '"', $HrefPos); my $Quote2Pos = index($FirstPart, '"', $Quote1Pos + 1); my %Link; if ($Quote2Pos > $Quote1Pos) { $Link{href} = substr($FirstPart, $Quote1Pos + 1, $Quote2Pos - $Quote1Pos - 1); } if ($Link{href}) { my $EndOfTextPos = index($String, ']>)//g; # remove all <..> from $Text $Link{text} = $Text; my $IsCandidate = 1; if ($Wildcard) { unless ($Text =~ m/$Wildcard/) { $IsCandidate = 0; } } if ($IsCandidate) { push @Candidates, \%Link; } } } } $LinkPos = index($PAGE, '{text}); my $B = strip($b->{text}); my $Compare = 0; for (my $Index = 0; $Index < length($A); $Index++) { if (isNumber($A, $Index) && isNumber($B, $Index)) { (my $NumberA, my $LenNumberA) = getNumber($A, $Index); (my $NumberB, my $LenNumberB) = getNumber($B, $Index); $Compare = ($NumberB <=> $NumberA); $Index += $LenNumberA; } else { $Compare = (substr($B, $Index,1) cmp substr($A, $Index,1) ); } if ($Compare != 0) { last; } } if ($Compare == 0) { $Compare = length($A) <=> length($B); } #X if ($Compare < 0) { print "$A < $B\n"; } #X elsif ($Compare > 0) { print "$A > $B\n"; } #X else { print "$A = $B\n"; } $Compare; } @Candidates; if (1) { # print all candidates print "Found candidates for wildcard '$Wildcard':\n"; map { my $LinkRef = $_; print "Entry: ".join(" ", map { $_.'="'.$LinkRef->{$_}.'"'; } sort keys %{$LinkRef})."\n"; } @Candidates; } unless (@Candidates) { print "No candidates found for wildcard '$Wildcard'\n"; } if (@Candidates) { my $Candidate = $Candidates[0]; my $Link = $Candidate->{href}; my $CandidateURL; if ( index(lc($Link), 'http://') > -1 ) { $CandidateURL = $Link; } if ( index(lc($Link), 'https://') > -1 ) { $CandidateURL = $Link; } if ( index(lc($Link), 'ftp://') > -1 ) { $CandidateURL = $Link; } if (substr($Link, 0, 1) eq '/') { $CandidateURL = $Domain.$Link; } # link is relative to domain (e.g. '/bla/foo/...') unless ($CandidateURL) { # link is relative to current URL if ($URL_Base_EndedWithSlash) { $CandidateURL = $URL_Base.'/'.$Link; } else { $CandidateURL = $URL_Base.$Link; } } print "CandidateURL=".$CandidateURL."\n"; #{ create output variables my $LastSlashPos = rindex($CandidateURL, '/'); my $Out_Url = $CandidateURL; my $Out_UrlPrefix = substr($CandidateURL, 0, $LastSlashPos + 1); my $Out_FileName = substr($CandidateURL, $LastSlashPos + 1); while (substr($CandidateURL, -1) eq '/') { substr($CandidateURL, -1) = undef; } $LastSlashPos = rindex($CandidateURL, '/'); my $Out_LastDir = substr($CandidateURL, $LastSlashPos + 1); if (1) { # print Out_ variables print "Out_Url = '$Out_Url'\n"; print "Out_UrlPrefix = '$Out_UrlPrefix'\n"; print "Out_FileName = '$Out_FileName'\n"; print "Out_LastDir = '$Out_LastDir'\n"; } #} create output variables if ($UrlPrefixFile) { open(OUT, ">$UrlPrefixFile") or die("$0 - ERROR: Cannot write to file '$UrlPrefixFile' ($!)"); if ($LastSlashPos > -1) { print OUT $Out_UrlPrefix."\n"; } close(OUT); } if ($FileNameFile) { open(OUT, ">$FileNameFile") or die("$0 - ERROR: Cannot write to file '$FileNameFile' ($!)"); print OUT $Out_FileName."\n"; close(OUT); } if ($DirNameFile) { open(OUT, ">$DirNameFile") or die("$0 - ERROR: Cannot write to file '$DirNameFile' ($!)"); print OUT $Out_LastDir."\n"; close(OUT); } } sub extractTagAttributes { # extracts all attrributes of HTML-tag starting at given position in given string my $String = shift; my $StartPos = shift; # position inside $String of '<' character my %Attributes; if (substr($String, $StartPos, 1) ne '<') { die("WRONG ARGUMENT to extractTagAttributes()!"); } my $Pos = $StartPos + 1; my $C; my $TagName, $Name, $Value; # print "\nextractTagAttributes(".substr($String, $Pos, 150).")\n"; #D my $State = 0; my $EndBracket; do { $C = substr($String, $Pos++, 1); if ($State == 0) { # looking for next space (end of tag-name) if ( isWhiteChar($C) ) { $Attributes{'.name'} = lc($TagName); $State = 1; } else { $TagName .= $C; } } elsif ($State == 1) { # looking for next non-space (start of next attribute) unless (isWhiteChar($C)) { $Name = ''; $State = 2; } } if ($State == 2) { # reading $Name until next '=' character if ($C ne '=') { $Name .= $C; } else { $Name = strip($Name); $State = 3; } } elsif ($State == 3) { # looking for next " or ' character (begin of value) if ($C eq '"') { $State = 4; $Value = ''; $EndBracket = $C; # will search for this character to end string } if ($C eq "'") { $State = 4; $Value = ''; $EndBracket = $C; # will search for this character to end string } } elsif ($State == 4) { # reading $Value until next $EndBracket character if ($C ne $EndBracket) { $Value .= $C; } else { $Attributes{$Name} = $Value; $State = 1; } } } while ( ($C ne undef) && ($C ne '>')); $Attributes{'.EndPos'} = $Pos; return %Attributes; } sub findNextDifference { my $A = shift; my $B = shift; my $StartPos = shift() + 0; for (; $StartPos < length($A); $StartPos++) { if ( substr($A, $StartPos, 1) ne substr($B, $StartPos, 1) ) { return $StartPos; } $StartPos++; } return $StartPos; } sub strip { my $String = shift; while ( isWhiteChar(substr($String, 0, 1)) ) { substr($String, 0, 1) = undef; } while ( isWhiteChar(substr($String, -1)) ) { substr($String, -1) = undef; } return $String; } sub isWhiteChar { my $C = shift; if ( ($C eq ' ') || ($C eq "\n") || ($C eq "\r") || ($C eq "\t") ) { return 1; } return 0; } sub isNumber { my $String = shift; my $Pos = shift; my %IsNumber = ('0' => 1, '1' => 1, '2' => 1, '3' => 1, '4' => 1, '5' => 1, '6' => 1, '7' => 1, '8' => 1, '9' => 1); my $C = substr($String, $Pos, 1); return $IsNumber{$C}; } sub getNumber { # extracts number starting at given position in given string my $String = shift; my $Pos = shift; my $Number; while ( ($Pos < length($String)) && isNumber($String, $Pos) ) { $Number .= substr($String, $Pos, 1); $Pos++; } return ($Number + 0, length($Number)); }