#!/usr/bin/perl

# Copyright 2000, onShore Development, Inc.

# This script will push (S)HTML content through a CTT shtml template,
# and rewrite the (S)HTML file.
#
# You should run this script from the directory it lives in (the
# template files are opened by relative name), for example:
#
#    prompt$ pwd
#    /home/kfogel/onshore/ctt-tools
#    prompt$ ./ctt-stylize.pl -d ../ctt ../ctt/path/to/something.shtml
#    prompt$
#
# You can process files explicitly by name (the above example had only
# one target, "something.shtml", but you could specify N targets).  Or
# you can process directory trees recursively with the -R flag.
#
# Be careful if you use -R, as this script cannot automatically
# distinguish between pre-login and post-login files.  It must be told
# with the -U flag (for "unregistered").  Thus, if you processed a
# directory that is pre-login, passing -U and -R, but it has child
# directories that are post-login, you might accidentally end up with
# post-login files processed using a pre-login style template.  Ooops!
#
# However, this usually isn't an issue, as files are marked with a
# custom comment tag ($customComment, below) once processed, and this
# script knows to skip files that have that tag unless -o is given.
#
# Run
#
#    prompt$ ./ctt-stylize.pl -h
#
# to see a usage message.

# This script requires the Getopt::Std Module and the Strict Module.
use strict;

# Module to convert args to absolute paths
use Cwd 'abs_path';

# Module used to split arguments into file name and path
use File::Basename;

# Variables for the available flags (used by getopts)
use vars qw($opt_h $opt_d $opt_o $opt_U $opt_R $opt_v $opt_V $opt_p
            $opt_S $opt_s $opt_n $opt_I);
use Getopt::Std;

# This checks for command line flags (in addition to the argument).
# If an invalid flag was called (invalid flags are any flags NOT
# declared in the getopts() call), $successfulParse will be false.
# If flags are set, the corresponding flag variable ($opt_FLAGLETTER)
# will be set to 1.  If a flag has an argument (flags that take
# arguments are followed by colons in the getopts() call), the flag
# variable ($opt_FLAGLETTER) will be set to the argument value.
#
# See subroutine displayUsage() for a list of what each option does.
my $successfulParse = getopts('hd:InpoRS:s:UvV');

if ($opt_h) {
    &displayUsage();
    exit();
}

# Webserver document root, converted to an absolute path (no sym-links)
my $docRoot;
my $defaultDocRoot = '/var/www/';

# TODO: it would be even nicer if this script auto-detected whether a
# file was pre- or post-login, based on its path and perhaps the
# contents of a sibling .htaccess file.
my $template;
if ($opt_U) {
    $template = 'ctt-prelogin-stylize.template';
}
else {
    $template = 'ctt-postlogin-stylize.template';
}

# The files must end with one of these extensions
my @validExtensions = ('.shtml', '.html');

# If the script is running recursively, these directories will be skipped.
my @invalidDirectories = ('includes', 'images', 'bin', 'CVS');
my @invalidFiles = ('template.shtml');

# Defines where to place the title and body content when inserting
# parsed info into the template; also determines the location of the
# script's custom comment tag.
#
# NOTE(review): every marker below is an empty string (plus, in three
# cases, a newline).  That looks like the original marker text was lost
# from this copy of the script -- with these values $customComment
# matches any file containing a newline.  The real values must agree
# with the markers inside the *.template files, which are not visible
# here, so they are deliberately left untouched.  TODO: restore them
# from the templates before running this script against real content.
my $commentStartPattern = '';
my $commentEndPattern = '';
my $titleStartPattern = '';
my $titleEndPattern = '';
my $bodyStartPattern = '' . "\n";
my $bodyEndPattern = "\n" . '';
# End title and body patterns

my $customComment = "\n";

# SSI variable patterns, the script will set these Include variables
# when it fleshes out the template.
#
# NOTE(review): createFileFromTemplate() also uses $sectionEndPattern
# and $subsectionEndPattern, but no declaration of either is visible in
# this file; under "use strict" that is a compile-time error.  They were
# presumably declared here alongside the start patterns -- TODO confirm
# and restore.
my $sectionStartPattern = '';
my $subsectionStartPattern = '';
# End SSI variable patterns

# SSI path->variable hash REFERENCE.  This maps directories (and single
# files) to SSI variable values.
# Each key is given relative to the web document root, and corresponds
# to an array that holds the SSI's "section" and "subsection" variable
# values.  If neither the current file nor any of its parent
# directories is in this hash, "section" and "subsection" are set to "".
# PLEASE NOTE THAT BOTH THE HASH AND THE NESTED ARRAYS ARE REFERENCES
# THAT NEED TO BE EXPANDED IN THE getNearestMatchByPath() SUBROUTINE.
my $sectionMapRef = {
    '/'                               => ['', ''],
    '/forms/'                         => ['forms', ''],
    '/forms/policies/'                => ['forms', 'policies'],
    '/forms/endorsements/'            => ['forms', 'endorsements'],
    '/forms/stateforms/'              => ['forms', 'state_forms'],
    '/forms/document_execution/'      => ['forms', 'document_execution'],
    '/forms/acknowledgments/'         => ['forms', 'acknowledgments'],
    '/recording/'                     => ['recording', ''],
    '/recording/recording_data/'      => ['recording', 'recording_data'],
    '/recording/state_summaries/'     => ['recording', 'state_summaries'],
    '/recording/additional_data/'     => ['recording', 'additional_data'],
    '/recording/newsletters/'         => ['recording', 'newsletters'],
    '/ucc/'                           => ['ucc', ''],
    '/ucc/filing_data/'               => ['ucc', 'filing_data'],
    '/ucc/collateral_charts/'         => ['ucc', 'collateral_charts'],
    '/ucc/filing_facts/'              => ['ucc', 'filing_facts'],
    '/ucc/statutes/'                  => ['ucc', 'statutes'],
    '/ucc/additional_data/'           => ['ucc', 'additional_data'],
    '/ucc/newsletters/'               => ['ucc', 'newsletters'],
    '/taxes/'                         => ['taxes', ''],
    '/taxes/mortgage_taxes/'          => ['taxes', 'mortgage_taxes'],
    '/taxes/transfer_taxes/'          => ['taxes', 'transfer_taxes'],
    '/real_estate/'                   => ['real_estate', ''],
    '/real_estate/laws_customs/'      => ['real_estate', 'laws_customs'],
    '/real_estate/fund_disbursements/' => ['real_estate', 'fund_disbursements'],
    '/title_digests/'                 => ['title_digests', ''],
    '/home/whatsnew.shtml'            => ['whats_new', ''],
    '/search/'                        => ['search', ''],
    '/portfolio/'                     => ['portfolio', ''],
    '/home/articles/'                 => ['articles', '']
};
# End SSI path->variable hash

############
#   MAIN   #
############

# If invalid flags were called or no file was referenced, print usage
# instructions and exit
unless (($successfulParse) && @ARGV) {
    &displayUsage();
    exit;
}

# If the 'print to standard out' flag is set, turn off error reporting
# if ($opt_p) {
#     $opt_V = 1;
# }

# If "doc root path" flag is defined, reset $docRoot
if ($opt_d) {
    unless ($docRoot = abs_path($opt_d)) {
        &report("Could not set the base path");
        # Keep $docRoot a defined string so later path concatenation
        # behaves the same as before (undef concatenates as "").
        $docRoot = '';
    }
}
else {
    # Set default doc root
    unless ($docRoot = abs_path($defaultDocRoot)) {
        &report("Default document root does not exist. "
                . "Setting the document root to \"\"");
        $docRoot = '';    # make docroot an empty string
    }
}

# Subs are called with the &name(...) form throughout this file; that
# form ignores prototypes, which keeps calls working regardless of how
# the callee happens to be declared.
foreach my $argument (@ARGV) {
    # Convert to an absolute path (this also cleans up trailing slashes)
    my $target = &getAbsolutePath($argument);

    next unless &validateArgs($target);

    if (-T $target) {
        &parseHTML($target);
    }
    elsif (-d $target) {
        &parseDirectory($target);
    }
}

############
# END MAIN #
############

############
#   SUBS   #
############

# Makes sure a command line argument (or a file found while walking a
# directory) is something this script may process.
#
#   $target - path of the file or directory to check
#
# If the target is invalid, the reason is printed via report().
# Returns 1 if the target is a directory, or a text file that is not on
# @invalidFiles and ends with one of @validExtensions; returns 0
# otherwise.
sub validateArgs {
    my ($target) = @_;

    # The file doesn't exist or no file was named
    unless (defined $target && -e $target) {
        # Only complain when a file was actually named
        if (defined $target && $target ne "") {
            &report("File \"$target\" not found!");
        }
        return 0;
    }

    # Directories are always acceptable targets
    return 1 if -d $target;

    unless (-T $target) {
        &report("$target is a binary file.");
        return 0;
    }

    # Reject the target if its path ends with a name on the invalid
    # file list.  \Q...\E makes the name match literally; without it the
    # "." in "template.shtml" would match any character.
    foreach my $fileName (@invalidFiles) {
        if ($target =~ m/\Q$fileName\E$/) {
            &report("$target is not allowed to be processed by this script.");
            return 0;
        }
    }

    # Accept the target if it ends with a valid extension (case
    # insensitive, matched literally for the same reason as above).
    foreach my $extension (@validExtensions) {
        return 1 if $target =~ m/\Q$extension\E$/i;
    }

    # No extension matched.  Give the user a list of valid extensions.
    &report("$target has an invalid extension. "
            . "The valid extensions are " . join(', ', @validExtensions));
    return 0;
}

# Prints a message (followed by a newline) to STDOUT.
#
#   $message      - text to print
#   $overrideFlag - when true, the message is printed even if the
#                   non-verbose flag (-V) was given
#
# With -V and no override flag, nothing is printed.
sub report {
    my ($message, $overrideFlag) = @_;

    return if $opt_V && !$overrideFlag;

    print "$message\n";
}

# Displays a usage message to standard out
sub displayUsage {
    print "Usage: ctt-stylize [-d docRoot] [-InopRUVv] "
        . "[-S section] [-s subsect] target(s)\n";
    print "\n";
    print " -h display this help message and exit.\n";
    print " -d docRoot set the document root used in determining SSI "
        . "variable values\n";
    print " -I leave any Server-Side Includes already in "
        . "the target\'s body.\n";
    print " -n do not execute anything that will change the disk.\n";
    print " -o overwrite previously processed files\n";
    print " -p print results to standard out\n";
    print " -R process directories recursively\n";
    print " -S section manually set SSI variable \"section\" for target\n";
    print " -s subsect manually set SSI variable \"subsection\" "
        . "for target\n";
    print " -U templatize for unregistered (pre-login) users"
        . "\n\t "
        . "(otherwise assumes post-login templatization)\n";
    print " -V non-verbose: suppress all error messages\n";
    print " -v verbose: print every file being processed\n";
    print " target HTML file(s) or directories\n";
}

# Reads a file, extracts its page title and body content, and passes
# them on to createFileFromTemplate().
#   $file - path of the (S)HTML file to process
#
# A file that already contains the custom comment tag is skipped (with
# a report) unless -o was given; with -o its title and body are taken
# from between the template marker patterns.  An unprocessed file has
# its title and body taken from its HTML TITLE and BODY elements.
# A file that cannot be opened is reported and left alone.
sub parseHTML {
    my ($file) = @_;

    my $pageTitle = "";
    my $pageBody  = "";

    # Read the entire file into $completeText.  A file that cannot be
    # opened is skipped: carrying on would rewrite it with an empty
    # title and body.
    my $fh;
    unless (open($fh, '<', $file)) {
        &report("Can\'t read file $file, $!");
        return;
    }
    my $completeText = do {
        local $/;    # no record separator: read everything in one go
        <$fh>;
    };
    close($fh);
    $completeText = '' unless defined $completeText;

    if ($completeText =~ m/$customComment/s) {
        # This file has been processed before; only continue if
        # overwrite permission is turned on.
        unless ($opt_o) {
            &report("$file has already been processed.");
            return;
        }

        # Since this file has already been through the template, strip
        # the title and body out from between the template patterns
        # defined at the beginning of the script.
        if ($completeText =~ m/$titleStartPattern(.*?)$titleEndPattern/is) {
            $pageTitle = $1;
        }
        if ($completeText =~ m/$bodyStartPattern(.*?)$bodyEndPattern/is) {
            $pageBody = $1;
        }
    }
    else {
        # Grab title and body from standard HTML title and body tags.
        # NOTE(review): the copy of this script under review had lost
        # the opening-tag half of both patterns (they began directly
        # with the capture group, which would capture everything from
        # the top of the file).  The opening tags are restored here by
        # symmetry with the closing-tag patterns -- confirm against an
        # intact copy if one exists.
        if ($completeText =~ m/<TITLE.*?>(.*?)<\/TITLE.*?>/is) {
            $pageTitle = $1;
        }
        if ($completeText =~ m/<BODY.*?>(.*?)<\/BODY.*?>/is) {
            $pageBody = $1;
        }
    }

    &createFileFromTemplate($file, $pageTitle, $pageBody);
}

# Opens a directory and runs validateArgs and parseHTML on each file.
# If -R has been specified, this recurses over subdirectories, too.
#
#   $directoryName - directory to open
#   $path          - optional prefix used to build each entry's path;
#                    defaults to $directoryName
#
# A directory is refused (with a report) when any component of its path
# is named on @invalidDirectories.
sub parseDirectory {
    my ($directoryName, $path) = @_;

    # Make sure the directory isn't on the invalid list.  Whole path
    # components are compared: an unanchored pattern match would also
    # reject unrelated paths that merely contain one of the names as a
    # substring (e.g. "bin" inside "/home/robin").
    my %isInvalidName = map { $_ => 1 } @invalidDirectories;
    foreach my $component (split(m{/}, $directoryName)) {
        if ($isInvalidName{$component}) {
            &report("The $directoryName directory is not automatically "
                    . "parsed by this program.\nYou can, however, "
                    . "call this script from within the directory");
            return;
        }
    }

    # If no path prefix was passed in, use the directory itself
    $path = $directoryName unless $path;

    # A lexical handle keeps recursive calls from sharing (and closing)
    # one another's directory handle.
    my $dirHandle;
    unless (opendir($dirHandle, $directoryName)) {
        &report("Can\'t open directory $directoryName");
        return;
    }
    my @directoryContents = readdir($dirHandle);
    closedir($dirHandle);

    # Process each item in the directory
    foreach my $entry (@directoryContents) {
        # Skip the `.' and `..' entries
        next if $entry eq '.' || $entry eq '..';

        # Add path info to the entry
        my $target = $path . '/' . $entry;

        if (-d $target) {
            # Only descend into subdirectories when -R is set
            if ($opt_R) {
                &parseDirectory($target);
            }
            else {
                &report("$target is a directory");
            }
        }
        elsif (&validateArgs($target)) {
            # This is a file that passed validation, so parse it
            &parseHTML($target);
        }
    }
}

# Takes in a file name, a page title, and body content.
# Inserts the title and body content into an HTML template
# and saves the info into $file
#
#   $file      - path of the file being rewritten
#   $pageTitle - title text to place between the title markers
#   $pageBody  - body text to place between the body markers
#
# With -p the result is printed to standard out instead of being saved;
# with -n nothing is written to disk.  If the template cannot be read
# the problem is reported and $file is left untouched.
sub createFileFromTemplate {
    my ($file, $pageTitle, $pageBody) = @_;

    $pageTitle = '' unless defined $pageTitle;
    $pageBody  = '' unless defined $pageBody;

    # Grab the best fitting SSI variables for the file: either the
    # manually supplied -S / -s values, or the nearest match for the
    # file's path in the section map.
    my ($section, $subsection);
    if (($opt_S) || ($opt_s)) {
        $section    = $opt_S;
        $subsection = $opt_s;
    }
    else {
        ($section, $subsection)
            = &getNearestMatchByPath($sectionMapRef, $file, $docRoot);
    }
    # No match found (or only one of -S / -s given): use empty strings
    $section    = '' unless defined $section;
    $subsection = '' unless defined $subsection;
    # End SSI variable grabbing

    # Read the entire template into $output.  If the template cannot be
    # read, stop here: carrying on would save an empty result over $file.
    my $templateHandle;
    unless (open($templateHandle, '<', $template)) {
        &report("Can\'t read template $template, $!");
        return;
    }
    my $output = do {
        local $/;    # no record separator: read everything in one go
        <$templateHandle>;
    };
    close($templateHandle);
    $output = '' unless defined $output;

    # If the verbose flag is set, report the file being processed
    if ($opt_v) {
        &report("Processing: $file", 1);
    }

    # Unless the "don't remove SSIs" flag is active, remove all
    # server-side includes and template substitution patterns from the
    # content
    unless ($opt_I) {
        # Strip comments beginning with `#' (SSI directives).
        # NOTE(review): the pattern was empty in the copy of this script
        # under review; it is restored here as the standard SSI comment
        # form -- confirm against an intact copy if one exists.
        if ($pageBody =~ s/<!--#.*?-->//gs) {
            &report("Stripping old Server-Side Includes from $file");
        }

        # Strip special comments that may confuse this script and report
        my $stripperFlag = 0;
        if ($pageBody =~ s/$bodyStartPattern//g) {
            $stripperFlag++;
        }
        if ($pageBody =~ s/$bodyEndPattern//g) {
            $stripperFlag++;
        }
        if ($stripperFlag) {
            &report("Stripping comments that may confuse this script from $file");
        }
    }

    # Substitute SSI variable values.
    # NOTE(review): no declaration of $sectionEndPattern or
    # $subsectionEndPattern is visible in this file -- TODO confirm
    # they are declared next to the matching start patterns.
    $output =~ s/($sectionStartPattern).*?($sectionEndPattern)/$1$section$2/is;
    $output =~ s/($subsectionStartPattern).*?($subsectionEndPattern)/$1$subsection$2/is;

    # Insert a custom comment that marks this file as being processed.
    $output =~ s/($commentStartPattern.*?)($commentEndPattern)/$1$customComment$2/is;

    # Insert the page's own title and body
    $output =~ s/($titleStartPattern).*?($titleEndPattern)/$1$pageTitle$2/is;
    $output =~ s/($bodyStartPattern).*?($bodyEndPattern)/$1$pageBody$2/is;

    # If the 'print to standard out' flag is set, print the result to
    # the screen instead of saving it
    if ($opt_p) {
        print $output;
        return;
    }

    # Honour the "don't do anything that will change the disk" flag
    return if $opt_n;

    # Save over the old file
    my $outHandle;
    unless (open($outHandle, '>', $file)) {
        &report("Can\'t write to file $file, $!");
        return;
    }

    # Take an exclusive lock (2 is LOCK_EX) for the duration of the
    # write; the lock is released when the handle is closed.
    flock($outHandle, 2);

    # Buffered write errors may only surface at close, so check both.
    my $written = print {$outHandle} $output;
    my $closed  = close($outHandle);
    unless ($written && $closed) {
        &report("Can\'t write to file $file, $!");
    }
}

# Returns the absolute path of the file or directory passed in.
# Directories are returned WITHOUT a trailing slash.  A target that is
# neither an existing regular file nor a directory, or whose path
# cannot be resolved, is returned unchanged.
# Uses the Cwd and File::Basename modules.
sub getAbsolutePath {
    my ($target) = @_;

    if (-f $target) {
        # Regular file: resolve its directory, then re-attach the name
        my ($fileName, $directory) = fileparse($target);
        my $absoluteDirectory = abs_path($directory);
        if (defined $absoluteDirectory) {
            # Avoid a doubled slash for files directly under "/"
            $absoluteDirectory = '' if $absoluteDirectory eq '/';
            $target = $absoluteDirectory . '/' . $fileName;
        }
    }
    elsif (-d $target) {
        my $absoluteDirectory = abs_path($target);
        $target = $absoluteDirectory if defined $absoluteDirectory;
    }

    return $target;
}

# Takes in a hash reference (which maps paths/files to an array of values)
# and a complete path and returns the best fitting array of values for
# that path.
# Optionally, you can pass in a base path that is prepended to every path
# in the hash.  This function works backwards from the complete path,
# looking for matches.  (i.e. - If the complete path matches, the array is
# returned.
# Otherwise, the filename (or directory) is lopped off and the parent
# path is checked for a match).  If no match is made, this returns an
# empty list (undef when called in scalar context).
#
#   $hashReference - reference to a hash mapping a path to an array
#                    reference of values
#   $targetPath    - complete path to find the nearest match for
#   $basePath      - optional prefix for every key in the hash
#
# Trailing slashes on keys and on the target path are ignored when
# comparing.
sub getNearestMatchByPath {
    my ($hashReference, $targetPath, $basePath) = @_;

    return unless defined $targetPath;
    $basePath = '' unless defined $basePath;

    # Convert the hash reference into a full-fledged hash
    my %pathToValues = %$hashReference;

    # Build the cleaned, absolute form of every key once, up front,
    # instead of recomputing it for each candidate path.  Keys are
    # visited in sorted order so the outcome is repeatable even if two
    # keys clean to the same path.
    my %valuesForCleanPath;
    foreach my $pathKey (sort keys %pathToValues) {
        my $cleanPathKey = $basePath . $pathKey;
        $cleanPathKey =~ s/\/$//;    # remove trailing slash
        $valuesForCleanPath{$cleanPathKey} = $pathToValues{$pathKey};
    }

    while ($targetPath ne "") {
        $targetPath =~ s/\/$//;    # remove trailing slash

        # If the current path is in the map, return its values
        if (exists $valuesForCleanPath{$targetPath}) {
            return @{ $valuesForCleanPath{$targetPath} };
        }

        # No match at this level: remove everything after the last '/'
        # and try the parent.  When there is no '/' left there is no
        # parent to try, so stop rather than loop forever.
        last unless $targetPath =~ s/(.*)\/.*/$1/;
    }

    # No match found.  A bare return gives an empty list in list
    # context, so callers can test the result array for emptiness.
    return;
}

############
# END SUBS #
############