#!/usr/bin/env perl
#
# docpack - (un)compress the files in an HTML orbit and fix up their links.
#
# Starting from a start file inside a directory, the script follows the
# href links to .htm(l)/.dvi/.ps/.pdf files (optionally compressed), records
# every file reached (the "HTML orbit"), then gzips or gunzips those files
# and rewrites the links that point at them.  Run with -H for the manual.
#
# The script deliberately uses package globals: getopts() stores each
# option in $opt_<letter>, and the subs below share %orbithash, %badlinks,
# %tails, $newcompression, $oldgz and the pattern strings.

use Getopt::Std;

# Options #############################################################
# -h (or no args) .. pipe man page to less (and exit).
# -H ................ print man page without using less.
# -v ................ print version number and exit.
# -s startfile ...... assume starting file startfile instead of
#                     default: index.html.
# -b ................ keep backup files until all editing is
#                     complete.
# -o ................ find HTML orbit only and exit.
# -z ................ compress startfile. By default startfile is
#                     not compressed. This option is meaningless
#                     with the -u option.
# -u ................ uncompress all files in HTML orbit.
#                     By default all files except startfile in
#                     HTML orbit are compressed.

# Pattern match strings ###############################################
# These are regex fragments kept as plain strings and interpolated into
# the patterns further down.  Several of them contain capture groups, so
# the group numbering is noted wherever they are used.

$nonnull_gz = "(?:(?:[.-]g?|_)z|[.]Z)";
# Any compression suffix except .gz (or no suffix at all).  The class is
# written [._-] so that '-' is a literal; "[.-_]" would be a character
# range that excludes '-' and admits digits and capital letters.
$nongz_gz   = "((?:-g|[._-])z|[.]Z|)";      # not .gz extension
$gz         = "($nonnull_gz|)";
$doc_type   = "[.](?:dvi|p(?:df|s))";
$html_type  = "[.]htm(?:l|)";
$doc        = "($doc_type)$gz";
$html       = "($html_type)$gz";
$type       = "($doc_type|$html_type)";
$head       = "(?!(?:ht|f)tp:\/\/|\/)([^\"#]*)";
$validlink  = "\"($head$type$gz)[\"#]";     # $1 = link, $2 = head,
                                            # $3 = type, $4 = gz

# Pager ###############################################################
# Use the first of `less', `more' that `which' can locate.  $pagercmd is
# either "" (no pager) or "| /path/to/pager".  Empty output from `which'
# counts as "not found"; otherwise a bare "| " would be handed to the
# shell.

$pagercmd = "";
foreach my $pager (qw( less more )) {
    my $found = `which $pager`;
    chomp $found;
    next if $found eq "" || $found =~ /which:|not found/;
    $pagercmd = "| " . $found;
    last;
}

# Process Options #####################################################

# No args ... pipe man page to pager.
unless (@ARGV) {
    system("docpack -H $pagercmd");
    exit;
}

getopts('bhHos:uvz');

if ($opt_v) {
    # Kept in step with the VERSION line of the man page below.
    print "Docpack: version 1.03 [1998/11/02]\n";
    exit;
}

# Help! Pipe man page to pager.
if ($opt_h) {
    system("docpack -H $pagercmd");
    exit;
}

# Help! Print man page without using less.
if ($opt_H) {
    write;              # emits the STDOUT format defined at end of file
    exit;
}

@ARGV or die "Expected directory argument!\n",
             "Usage: docpack [-hH] [[-s startfile] [-ouz] dir]\n";

($zdir) = @ARGV;        # After getopts there should be
                        # exactly one argument left
                        # ... ignore all but the first one.

# The log file is named after the last path component of the directory
# and is opened before the chdir, i.e. relative to the invoking directory.
($logfile = $zdir) =~ s/(?:.*\/|)(.*)/$1.dlg/;
if (open(LOG, ">$logfile")) {
    print "Logging to $logfile\n";
}
else {
    warn "Cannot log to $logfile: permission denied?\n";
}

chdir($zdir) or die "Directory: $zdir not found.\n";
plog("Changing to directory: $zdir.\n");

$startfile = $opt_s ? $opt_s : "index.html";
(-f $startfile) or die "Startfile: $startfile not found.\n";
plog("Startfile: $startfile found.\n");

if ($opt_o) {
    plog("Calculating HTML orbit only.\n");
    ($opt_b || $opt_u || $opt_z) && plog("Ignoring other option(s).\n");
}

# $newcompression is the suffix every orbit file should end up with;
# $oldgz matches the suffixes that therefore have to be rewritten.
$newcompression = $opt_u ? "" : ".gz";
$oldgz          = $opt_u ? "($nonnull_gz)" : $nongz_gz;

if (!$opt_o) {
    $opt_b && plog("Keeping backup files until after editing is complete.\n");
    $opt_u && plog("Uncompressing files.\n");
    if ($opt_z) {
        if ($opt_u) {
            plog("Ignoring z option, u option in effect.\n");
            $opt_z = 0;
        }
        else {
            plog("Compressing startfile.\n");
        }
    }
}

#######################################################################
# The concept of HTML orbit comes from permutation group theory.
# The web browser `acts' on an underlying set, namely HTML files
# and .dvi|.pdf|.ps files (and others ... but we ignore any others).
# The web browser action is not a `group action' ... but the standard
# group algorithm for finding an orbit of an action is still applicable.
#######################################################################

plog("Finding HTML orbit.\n");
getHTMLorbit( $startfile );
@HTMLorbit = sort keys %orbithash;
plog("HTML orbit: @HTMLorbit\n");

$opt_o && exit;         # Exit if only HTML orbit desired.

plog("Modifying files in HTML orbit of $zdir/$startfile.\n");

# By default startfile is left uncompressed and hence should not
# be included in orbithash (otherwise files with links to startfile
# will have those links edited to have .gz extensions). Thus, we
# must edit the links in startfile separately and then undo the
# gzip effected by editlinks. When the final compression state of
# startfile is the same as the rest of the HTML orbit, startfile
# is included in orbithash and so modify does the appropriate job.

unless ($opt_u || $opt_z) {
    editlinks($startfile, $newcompression);
    system("gunzip", $startfile);
}

# Now we edit and g(un)zip the rest of the HTML orbit.

modify();

# If we chose to keep the backup files until the end we ask the
# user what to do with them and act accordingly.

$opt_b && rm_or_restore_backups();

plog("Done.\n");
close(LOG);

# Subroutines ############################################################

# plog(LIST)
# Print LIST both to the log file and to standard output.  STDOUT is
# named explicitly because editlinks runs while in-place editing has a
# different default output handle selected.  Returns the result of the
# print to STDOUT.
sub plog {
    print LOG @_;
    print STDOUT @_;
}

# getHTMLorbit(startfile)
# Fill %orbithash with every file reachable from startfile through valid
# links, %badlinks with link targets that do not exist, and %tails with
# the file names (directory part removed) of the .dvi|.ps|.pdf members.
sub getHTMLorbit {
    my ($startfile) = @_;

    # Exclude startfile from orbithash except in cases where the final
    # compression status of startfile is the same as the rest of the
    # HTML orbit.
    undef %orbithash;
    if ($opt_u || $opt_z) {
        %orbithash = ( $startfile => 1 );
    }
    undef %badlinks;

    # The start file name is matched literally, not as a pattern.
    my $startfile_re = quotemeta $startfile;

    @checklist = ( $startfile );
    while (@checklist) {
        # find the links in the next file of checklist,
        # ensure the path of each link is in `canonical' form,
        my @found = map { shortenpath($_) } getlinks( pop(@checklist) );

        # filter out startfile, links to super-directories and bad links,
        # append those links remaining that are new to orbithash,
        my @fresh = grep {
            /^(?!\.\.\/|$startfile_re$)/ && goodlink($_) && !$orbithash{$_}++
        } @found;

        # and append any new orbithash HTML links found to checklist.
        push(@checklist, grep { /$html$/ } @fresh);
    }

    # tails of .dvi|.ps|.pdf links in HTML orbit: keys %tails
    undef %tails;
    foreach my $member (keys %orbithash) {
        $tails{$1}++ if $member =~ /^(?:.+\/|)(.*$doc$)/;
    }
    return;
}

# goodlink(link)
# Check that a link is valid
# ... if it is return 1
# ... if not announce the fact, append to badlinks and return 0
sub goodlink {
    my ($link) = @_;

    ( -f $link ) && return 1;
    plog("Bad link: $link does not exist.\n");
    $badlinks{$link}++;
    return 0;
}

# getlinks(file)
# Return the href targets in file that match $validlink, each prefixed
# with the directory part of file.  A file whose name carries one of the
# recognised compression suffixes is read through zcat.
sub getlinks {
    my ($file) = @_;
    my ($dir, $compression) = $file =~ /^(.+\/|).*?$gz$/;
    my @links = ();

    plog("Reading $file\n");

    # paragraph mode - pattern matches are allowed to
    # extend beyond end-of-line; restored on return.
    local $/ = "";

    # NOTE(review): $file reaches the shell / two-argument open here.
    unless (open(CHECK, $compression ? "zcat $file |" : "<$file")) {
        plog("Cannot read $file: $!\n");
        return @links;
    }
    while (my $paragraph = <CHECK>) {
        while ($paragraph =~ /(?:href\s*=\s*)$validlink/ig) {
            push(@links, $dir.$1);
        }
    }
    close(CHECK);
    return @links;
}

# shortenpath(path)
# Return path with repeated slashes collapsed, a leading ./ dropped and
# `component/..' backtracks removed.
sub shortenpath {
    my ($path) = @_;

    $path =~ s!/{2,}!/!g;           # Remove any superfluous /s
    $path =~ s!^\./!!;              # Remove superfluous ./
                                    # if present at beginning of path
    while ( $path =~ s:(?!^/?(\.\./){2})(^|/)[^/]*/\.\.(?=/):: ) {};
                                    # Remove backtracks e.g. things like aaa/..
                                    # being careful not to remove ../.. from
                                    # the beginning of a path.
    $path =~ s!^/!!;                # Remove an initial / that might
                                    # have resulted from previous step.
    return $path;
}

# modify()
# Call editlinks to edit each HTML file in the HTML orbit
# and ensure each .dvi(.gz)|.. file in HTML orbit has the
# appropriate compression (either "" (none) or ".gz")
# Note: startfile is only modified here if it is included
#       in the HTML orbit (which is the case when it is to be
#       compressed). By default, startfile is not left compressed.
sub modify {
    foreach my $file (keys %orbithash) {
        if ($file =~ /$html$/) {
            editlinks($file, $newcompression);
        }
        elsif ($file =~ /(.*)$doc$/) {
            # Save the captures before anything else can disturb them.
            my ($stem, $ext, $current) = ($1, $2, $3);

            next if $current eq $newcompression;    # already correct

            if ($opt_u) {
                system("gunzip", $file);
            }
            elsif ($current eq "") {
                system("gzip", $file);
            }
            else {
                # file is already compressed
                # ... but not with gzip
                # ... so we remedy this.
                system("gunzip", $file);
                system("gzip", "$stem$ext");
            }
        }
    }
    return;
}

# editlinks(file, newcompression)
# edit links and appropriate textual references in file
# according to newcompression (either "" (none) or ".gz").
# The file is gunzipped first if its name carries a compression suffix,
# edited in place with a .bak backup (removed unless -b), and gzipped
# afterwards unless -u is in effect.
sub editlinks {
    my ($file, $newcompression) = @_;

    # From here on $file is the name without its compression suffix.
    ($file, my $dir, my $compression) = $file =~ /^((.+\/|).*?)$gz$/;
    $compression && system("gunzip", $file);

    ($^I, $/) = (".bak", "\n");     # in-place edit, line at a time
    @ARGV = ($file);
    plog("Editing $file.\n");
    while (<>) {
        # edit the href links.  One pass with s///ge, so that each link
        # is judged and rewritten on its own; a reference to a known bad
        # link is reported once and left untouched.
        # Captures: 1 = link, 2 = head, 3 = type.
        s{"($head$type$oldgz)(?=["#])}{
            my ($link, $stem, $ext) = ($1, $2, $3);
            my $shortpath = shortenpath($dir.$link);
            my $edited    = $link;
            if ($orbithash{$shortpath}) {
                $edited = "$stem$ext$newcompression";   # Edit link if valid
            }
            elsif ($badlinks{$shortpath}) {             # Inform about bad links
                plog("Reference to bad link: $link ",
                     "at line no: $. of $file.\n");
            }
            '"' . $edited;
        }ge;

        # edit any textual references to .dvi|.. files inside anchor tags
        # so long as the file referred to is a tail of an HTML orbit file
        # ... OR is a bare .dvi(.gz) etc. Doesn't matter that much if we
        # miss any ... it's just to make the text as accurate as possible
        # ... but be very sure not to change any extras!
        s{
            (                       # capture 1: whole reference
              ((?:\w|[.])*)         # capture 2: head
              ($doc_type)           # capture 3: type
              $oldgz                # capture 4: old compression suffix
            )
            (?=((?i)(\s*<\/[a-z]+>)*?\s*<\/a>))     # look-ahead to find
                                                    # any no. of close tags
                                                    # and a close anchor
        }{
            ($tails{$1} || $2 eq "") ? "$2$3$newcompression" : $1;
        }xgme;

        print;
    }
    $^I = undef;
    $opt_b || system("rm", "$file.bak");
    $opt_u || system("gzip", $file);
}

# rm_or_restore_backups()
# Ask whether the .bak files kept by -b are to be deleted or restored and
# act on every file of the HTML orbit (startfile included).  Restoring
# also puts the .dvi|.ps|.pdf files back into the opposite compression
# state from the one this run gave them.
sub rm_or_restore_backups {
    my $response = "";

    print "Do you wish to delete or restore backup files?\n";
    while ( $response !~ /^[dr]/i ) {
        $response && print "Invalid response. ";
        print "Type d for delete or r for restore. (d/r):";
        $response = <STDIN>;
        unless (defined $response) {
            # End of input: do not loop forever, keep the backups.
            plog("No response; leaving backup files in place.\n");
            return;
        }
    }
    my $delete = ( $response =~ /^d/i );

    $orbithash{$startfile}++;       # Ensure startfile is in orbithash
    foreach my $file ( keys %orbithash ) {
        my ($thehead) = $file =~ /^(.*?)$gz$/;

        if ($thehead =~ /$html$/) {
            if ($delete) {
                system("rm", "$thehead.bak");
            }
            else {
                # The edited copy lives at $thehead.gz unless -u was used.
                $opt_u || system("rm", "-f", "$thehead.gz");
                system("mv", "$thehead.bak", $thehead);
                $opt_u && system("gzip", $thehead);
            }
        }
        elsif (!$delete) {
            system($opt_u ? "gzip" : "gunzip", $thehead);
        }
    }

    # Assume startfile was initially uncompressed: after a restore under
    # -u it has just been gzipped along with the rest.
    ($opt_u && !$delete) && system("gunzip", $startfile);
    return;
}

# Manpage ##############################################################
# NOTE(review): the argument line under AUTHOR supplies an empty string
# for the address field; no address is present in this copy of the file.

format STDOUT =
docpack - (un)compress files in an HTML orbit

SYNOPSIS
       docpack [-hH] [[-s startfile] [-buoz] dir]

DESCRIPTION
       If called with the -h option or with no arguments docpack pipes
       this manpage to an auto-detected pager (`less' or if that isn't
       found it will use `more'). The -H option prints this manpage
       without piping through a pager.

       Otherwise, docpack determines the HTML orbit reached by a web
       browser starting at startfile in directory dir. The argument dir
       is essential. By default, startfile is `index.html'; this is
       overridden with the -s option.

       An HTML orbit is defined to consist of any HTML file (i.e. file
       with .html or .htm extension) or .dvi, .ps or .pdf file, (or com-
       pressed file which may be uncompressed by gunzip to have the
       aforementioned extensions) lying within directory dir, that may
       be reached by a web browser starting at startfile in dir.

       By default, all files except startfile are com-
       pressed with gzip (and so end up with a `.gz' extension).
       Prior to compression, all links and textual references in-
       side anchor tags in the HTML files of the HTML orbit are
       modified to have `.gz' extensions. With the -z option, the
       startfile is also compressed. With the -u option, all files
       in the HTML orbit are uncompressed; under this opt-
       ion, the -z option is meaningless and is ignored.

       Prior to editing, each HTML file in the HTML orbit is backed up
       (uncompressed) with a `.bak' extension, to protect against a
       system crash. After editing, each backup is deleted un-
       less the -b option is selected. If a user chooses the -b
       option (presumably so that the action of docpack may be
       checked with a web browser) the user is interrogated as to
       whether to delete or restore the backup files at the com-
       pletion of all file modifications.

       Output is logged in the file formed by taking the tail of dir
       and appending the extension `.dlg'.

       One may apply docpack to an HTML orbit that is already
       partially compressed. This means that after having applied
       docpack, some additional files can be linked into the HTML
       orbit and docpack can be reapplied.

       Docpack also notifies about bad links i.e. links to HTML orbit
       files that don't exist.

OPTIONS
       -h, (or no args)
              Print this man page, by piping to less.

       -H     Print this man page, but do not pipe to less.

       -v     Print version number and exit.

       -s startfile
              Assume starting file startfile. By default, startfile
              is: `index.html'.

       -b     Keep backup files until after modification of HTML orbit
              files is complete.

       -o     Find HTML orbit only and exit.

       -z     Compress startfile as well. By default, startfile is the
              only file in the HTML orbit that is left uncompressed.
              It is meaningless to use this option with the -u option,
              below.

       -u     Uncompress all files in HTML orbit. By default, all
              files, except startfile, in HTML orbit are compressed
              with gzip.

CHANGES
       Version 1.03: 1998/11/2: Fixed bug that allowed the file-
       name of a link to contain a `#'.

       Version 1.02: 1998/10/26: Added -v option (to give version
       number). Fixed bug that erroneously detected paths start-
       ing with `ftp://' as bad links.

       Version 1.01: 1998/10/21: We now log each file as it is read
       for links. In particular, this helps to identify which file
       is responsible for bad links when they occur. Added
       auto-detection of the pager. No pager is used if neither
       `less' nor `more' is found.

       Version 1.0: 1998/10/20: First release.

BUGS
       Docpack is only known to work properly with Perl 5.004_04.
       It may also work with Perl 5.003, but is known to be bro-
       ken for earlier versions of Perl. The only pagers auto-
       detected are `less' and `more'. The startfile is assumed
       to be initially uncompressed when restoring files. To be
       certain of exact restoration backup the directory before
       using docpack. Currently, links ending in `/' are not res-
       olved. Almost certainly there are more bugs ... let's hope
       none is catastrophic.

AUTHOR
       Greg Gamble @<<<<<<<<<<<<<<<<<<<<<<<<<
                   ""

VERSION 1.03                2 November 1998                          1
.