#!/usr/bin/perl

my $URL           = shift(@ARGV);
my $Wildcard      = shift(@ARGV);
my $UrlPrefixFile = shift(@ARGV);
my $FileNameFile  = shift(@ARGV);
my $DirNameFile   = shift(@ARGV);

unless ($URL) {
  print "$0 URL WILDCARD [URLPREFIX_FILE [FILENAME_FILE] [DIRNAME_FILE]]\n";
  print "    Will look for download on website $URL that matches given wildcard and has highest version number.\n";
  print "\n";
  print "    URL             website that lists file downloads on an index page\n";
  print "    WILDCARD        regular expression identifying interesting files\n";
  print "    URLPREFIX_FILE  file where to write url-prefix of found candidate to (optional)\n";
  print "    FILENAME_FILE   file where to write filename of found candidate to (optional)\n";
  print "    DIRNAME_FILE    file where to write name of directory of lowest level (optional)\n";
  print "\n";
  print "    Examples\n";
  print "    find newest binutils*bz2 file:\n";
  print "    $0 http://ftp.gnu.org/gnu/binutils/ \"binutils-2\..*bz2\$\"\n";
  print "\n";
  print "\n";
  exit 10;
}

my $TempFile="tmp.FindNewestDownload";
if (1) { system("rm -f $TempFile; wget -O \"$TempFile\" \"$URL\" >log.FindNewestDownload 2>&1"); }

#? if (substr($URL, -1) eq '/') { substr($URL, -1) = undef; }
my $URL_Base = $URL;
my $URL_Base_EndedWithSlash = 0;
if (substr($URL_Base, -1) eq '/') { 
  substr($URL_Base, -1) = undef;
  $URL_Base_EndedWithSlash = 1;
}
else {
  my $LastSlashPos = rindex($URL_Base, '/');
  substr($URL_Base, $LastSlashPos) = undef;
  $URL_Base_EndedWithSlash = 1;
}
my $FirstSlashPos = index($URL_Base, '/', index($URL_Base, '//') + 2);
my $Domain;
if ($FirstSlashPos > -1) { $Domain = substr($URL_Base, 0, $FirstSlashPos); } # remove path from URL_Base
else                     { $Domain = $URL_Base; }

# bad: http://sourceforge.net/projects/warzone2100/files/latest/download?source=files
# good:http://sourceforge.net/projects/warzone2100/files/releases/
my $Page;
open(IN, "<$TempFile") or die("$0: ERROR - Cannot read file '$TempFile' downloaded from '$URL' ($!)");
while (<IN>) { $Page .= $_; }
close(IN);
$Page =~ s/<A /<a /g;
$Page =~ s/<\/A/<\/a/g;
$Page =~ s/< \/A/<\/a/g;
$Page =~ s/< \/a/<\/a/g;
my $PAGE = uc($Page);

my %Links;
my @Candidates;
my $LinkPos = index($PAGE, '<A ');
while ($LinkPos > -1) { # find all candidates matching given wildcard
  my %Attributes = extractTagAttributes($Page, $LinkPos);
  my $EndLinkPos = index($PAGE, '</A', $LinkPos + 1);
  if ($EndLinkPos > -1) {
    my $LinkText = strip( substr($Page, $Attributes{'.EndPos'}, $EndLinkPos - $Attributes{'.EndPos'}) );
    #$LinkText =~s/(<[^>]>)//g; # remove all <..> from $LinkText
    my $IsCandidate = 1;
    if (0) { print "\$LinkText='$LinkText'\n"; }
    if ($Wildcard) {
      unless ($LinkText =~ m/$Wildcard/) { $IsCandidate = 0; }
    }
    if ($IsCandidate) {
      my %Link = ( href => $Attributes{href}, text => $LinkText );
      # print "\%Link = ( href => $Attributes{href}, text => $LinkText );\n"; #D
      push @Candidates, \%Link;
    }
  }
  else { $EndLinkPos = $LinkPos + 1; } # avoid endless loop
  
  if (0) { #???
    my $String = substr($Page, $LinkPos + 2);
    
    my $GtPos = index($String, '>');
    my $FirstPart = substr($String, 0, $GtPos); # 'href="...." ...'
    my $HrefPos = index(lc($FirstPart), 'href');
    if (1 && ($HrefPos > -1) ) {
      my $Quote1Pos = index($FirstPart, '"', $HrefPos);
      my $Quote2Pos = index($FirstPart, '"', $Quote1Pos + 1);
      my %Link;
      if ($Quote2Pos > $Quote1Pos) { $Link{href} = substr($FirstPart, $Quote1Pos + 1, $Quote2Pos - $Quote1Pos - 1); }
      if ($Link{href}) {
        my $EndOfTextPos = index($String, '</a', $GtPos + 1);
        my $Text = substr($String, $GtPos + 1, $EndOfTextPos - $GtPos - 1);
        $Text =~s/(<[^>]>)//g; # remove all <..> from $Text
        $Link{text} = $Text;
  
        my $IsCandidate = 1;
        if ($Wildcard) {
          unless ($Text =~ m/$Wildcard/) { $IsCandidate = 0; }
        }
        if ($IsCandidate) {
          push @Candidates, \%Link;
        }
      }
    }
  }
  
  $LinkPos = index($PAGE, '<A ', $EndLinkPos + 1);
};

@Candidates = sort { # sort candidates by version number
  my $A = strip($a->{text});
  my $B = strip($b->{text});
  
  my $Compare = 0;
  for (my $Index = 0; $Index < length($A); $Index++) {
    if (isNumber($A, $Index) && isNumber($B, $Index)) {
      (my $NumberA, my $LenNumberA) = getNumber($A, $Index);
      (my $NumberB, my $LenNumberB) = getNumber($B, $Index);
      $Compare = ($NumberB <=> $NumberA);
      $Index += $LenNumberA;
    }
    else { 
      $Compare = (substr($B, $Index,1) cmp substr($A, $Index,1) );
    }
    if ($Compare != 0) { last; }
  }
  if ($Compare == 0) {
    $Compare = length($A) <=> length($B);
  }
  #X if ($Compare < 0)    { print "$A < $B\n"; }
  #X elsif ($Compare > 0) { print "$A > $B\n"; }
  #X else                 { print "$A = $B\n"; }
  
  $Compare;
} @Candidates;
if (1) { # print all candidates
  print "Found candidates for wildcard '$Wildcard':\n";
  map {
    my $LinkRef = $_;
    print "Entry: ".join(" ", map { $_.'="'.$LinkRef->{$_}.'"'; } sort keys %{$LinkRef})."\n";
  } @Candidates;
}
unless (@Candidates) {
  print "No candidates found for wildcard '$Wildcard'\n";
}

if (@Candidates) {
  my $Candidate = $Candidates[0];
  my $Link = $Candidate->{href};
  my $CandidateURL;
  if ( index(lc($Link), 'http://')  > -1 ) { $CandidateURL = $Link; }
  if ( index(lc($Link), 'https://') > -1 ) { $CandidateURL = $Link; }
  if ( index(lc($Link), 'ftp://')   > -1 ) { $CandidateURL = $Link; }
  if (substr($Link, 0, 1) eq '/')          
  { $CandidateURL = $Domain.$Link; } # link is relative to domain (e.g. '/bla/foo/...')
  unless ($CandidateURL) {           # link is relative to current URL
    if ($URL_Base_EndedWithSlash) { $CandidateURL = $URL_Base.'/'.$Link; }
    else                          { $CandidateURL = $URL_Base.$Link; }
  }
  print "CandidateURL=".$CandidateURL."\n";
  
  #{ create output variables
  my $LastSlashPos = rindex($CandidateURL, '/');
  my $Out_Url = $CandidateURL;
  my $Out_UrlPrefix = substr($CandidateURL, 0, $LastSlashPos + 1);
  my $Out_FileName = substr($CandidateURL, $LastSlashPos + 1);
  while (substr($CandidateURL, -1) eq '/') { substr($CandidateURL, -1) = undef; }
  $LastSlashPos = rindex($CandidateURL, '/');
  my $Out_LastDir = substr($CandidateURL, $LastSlashPos + 1);
  if (1) { # print Out_ variables
    print "Out_Url       = '$Out_Url'\n";  
    print "Out_UrlPrefix = '$Out_UrlPrefix'\n";  
    print "Out_FileName  = '$Out_FileName'\n";  
    print "Out_LastDir   = '$Out_LastDir'\n";  
  }
  #} create output variables
  
  if ($UrlPrefixFile) {
    open(OUT, ">$UrlPrefixFile") or die("$0 - ERROR: Cannot write to file '$UrlPrefixFile' ($!)");
    if ($LastSlashPos > -1) { print OUT $Out_UrlPrefix."\n"; }
    close(OUT);
  }
  if ($FileNameFile) {
    open(OUT, ">$FileNameFile") or die("$0 - ERROR: Cannot write to file '$FileNameFile' ($!)");
    print OUT $Out_FileName."\n";
    close(OUT);
  }
  if ($DirNameFile) {
    open(OUT, ">$DirNameFile") or die("$0 - ERROR: Cannot write to file '$DirNameFile' ($!)");
    print OUT $Out_LastDir."\n";
    close(OUT);
  }
}

sub extractTagAttributes {  # extracts all attrributes of HTML-tag starting at given position in given string
  my $String = shift;
  my $StartPos = shift; # position inside $String of '<' character 
  
  my %Attributes;
  if (substr($String, $StartPos, 1) ne '<') { die("WRONG ARGUMENT to extractTagAttributes()!"); }
  my $Pos = $StartPos + 1;
  my $C;
  
  my $TagName, $Name, $Value;
  
  # print "\nextractTagAttributes(".substr($String, $Pos, 150).")\n"; #D
  my $State = 0;
  my $EndBracket;
  do {
    $C = substr($String, $Pos++, 1);
    if ($State == 0) { # looking for next space (end of tag-name)
      if ( isWhiteChar($C) ) { 
        $Attributes{'.name'} = lc($TagName);
        $State = 1;
      }
      else {
        $TagName .= $C;
      }
    }
 elsif ($State == 1) { # looking for next non-space (start of next attribute)
      unless (isWhiteChar($C)) {
        $Name = '';
        $State = 2;
      }
    }
    if ($State == 2) { # reading $Name until next '=' character
      if ($C ne '=') {
        $Name .= $C;
      }
      else {
        $Name = strip($Name);
        $State = 3;
      }
    }
 elsif ($State == 3) { # looking for next " or ' character (begin of value)
      if ($C eq '"') {
        $State = 4;
        $Value = '';
        $EndBracket = $C; # will search for this character to end string 
      }
      if ($C eq "'") {
        $State = 4;
        $Value = '';
        $EndBracket = $C; # will search for this character to end string 
      }
    }
 elsif ($State == 4) { # reading $Value until next $EndBracket character
     if ($C ne $EndBracket) {
        $Value .= $C;
      }
      else {
        $Attributes{$Name} = $Value;
        $State = 1;
      }
    }
    
  } while ( ($C ne undef) && ($C ne '>'));
  
  $Attributes{'.EndPos'} = $Pos;
  
  return %Attributes;
}
sub findNextDifference {
  my $A = shift;
  my $B = shift;
  my $StartPos = shift() + 0;
  
  for (; $StartPos < length($A); $StartPos++) {
    if ( substr($A, $StartPos, 1) ne substr($B, $StartPos, 1) ) {
      return $StartPos;
    }
    $StartPos++;
  }
  return $StartPos;
}
sub strip {
  my $String = shift;
  
  while ( isWhiteChar(substr($String, 0, 1)) ) 
  { substr($String, 0, 1) = undef; }
  while ( isWhiteChar(substr($String, -1)) ) 
  { substr($String, -1) = undef; }
  
  return $String;
}
sub isWhiteChar {
  my $C = shift;
  
  if ( ($C eq ' ')  ||
       ($C eq "\n") ||
       ($C eq "\r") ||
       ($C eq "\t")
     )
  { return 1; }
  
  return 0;
}
sub isNumber {
  my $String = shift;
  my $Pos    = shift;

  my %IsNumber = ('0' => 1, '1' => 1, '2' => 1, '3' => 1, '4' => 1, '5' => 1, '6' => 1, '7' => 1, '8' => 1, '9' => 1);
  
  my $C = substr($String, $Pos, 1);
  
  return $IsNumber{$C};
}
sub getNumber { # extracts number starting at given position in given string
  my $String = shift;
  my $Pos    = shift;
  
  my $Number;
  
  while ( ($Pos < length($String)) && isNumber($String, $Pos) ) {
    $Number .= substr($String, $Pos, 1);
    $Pos++;
  }
  
  return ($Number + 0, length($Number));
}

