#!/usr/bin/perl
# Copyright 2000, onShore Development, Inc.
# This script will push (S)HTML content through a CTT shtml template,
# and rewrite the (S)HTML file.
#
# You should run this script from the directory it lives in, for
# example:
#
# prompt$ pwd
# /home/kfogel/onshore/ctt-tools
# prompt$ ./ctt-stylize.pl -d ../ctt ../ctt/path/to/something.shtml
# prompt$
#
# You can process files explicitly by name (the above example had only
# one target, "something.shtml", but you could specify N targets). Or
# you can process directory trees recursively with the -R flag.
#
# Be careful if you use -R, as this script cannot automatically
# distinguish between pre-login and post-login files. It must be told
# with the -U flag (for "unregistered"). Thus, if you processed a
# directory that is pre-login, passing -U and -R, but it has child
# directories that are post-login, you might accidentally end up with
# post-login files processed using a pre-login style template. Ooops!
#
# However, this usually isn't an issue, as files are marked with a
# custom comment tag (see $customComment below) once processed, and
# this script knows to skip files that have that tag (unless -o is given).
#
# Run
#
# prompt$ ./ctt-stylize.pl -h
#
# to see a usage message.
# This script requires the strict pragma and the Getopt::Std, Cwd, and
# File::Basename modules.
use strict;
# Module to convert args to absolute paths
use Cwd 'abs_path';
# Module used to split arguments into file name and path
use File::Basename;
# Variables for the available flags (used by getopts)
use vars qw($opt_h $opt_d $opt_o $opt_U $opt_R $opt_v $opt_V
$opt_p $opt_S $opt_s $opt_n $opt_I);
use Getopt::Std;
# Parse the command-line flags; whatever is left in @ARGV afterwards is
# the list of targets.
#
# getopts() returns false when it meets a flag that is not named in its
# spec string. A plain flag sets $opt_<letter> to 1; a flag that takes an
# argument (followed by a colon in the spec string) sets $opt_<letter> to
# the value of that argument.
#
# See subroutine displayUsage() for a list of what each option does.
my $successfulParse = getopts('hd:InpoRS:s:UvV');

# -h: show the usage message and stop before any work is done.
if ($opt_h) {
    &displayUsage();
    exit;
}
# Webserver document root, converted to an absolute path (no sym-links).
# It is assigned in MAIN, either from the -d flag or from $defaultDocRoot.
my $docRoot;
my $defaultDocRoot = '/var/www/';
# TODO: it would be even nicer if this script auto-detected whether a
# file was pre- or post-login, based on its path and perhaps the
# contents of a sibling .htaccess file.
#
# Name of the template file that content is pushed through: the pre-login
# template when -U is given, the post-login template otherwise. The name
# has no directory part, so it is opened relative to the current working
# directory.
my $template;
if ($opt_U) {
$template = 'ctt-prelogin-stylize.template';
}
else {
$template = 'ctt-postlogin-stylize.template';
}
# The files must end with one of these extensions
my @validExtensions = ('.shtml', '.html');
# If the script is running recursively, these directories will be skipped.
my @invalidDirectories = ('includes', 'images', 'bin', 'CVS');
# Files with these names are never processed (checked in validateArgs()).
my @invalidFiles = ('template.shtml');
# Defines where to place the title and body content when inserting
# parsed info into the template also determines the location of the
# script's custom comment tag
#
# NOTE(review): every marker literal below is empty (or only a newline) in
# this copy of the file. They look like they once held HTML comment
# markers that were lost in transit -- TODO confirm and restore them from
# the template files or version control before running the script.
my $commentStartPattern = '
';
my $commentEndPattern = '';
my $titleStartPattern = '';
my $titleEndPattern = '';
my $bodyStartPattern = '' . "\n";
my $bodyEndPattern = "\n" . '';
# End title and body patterns
# Marker written into processed files; parseHTML() looks for it to decide
# whether a file has already been processed.
my $customComment = "\n";
# SSI variable patterns, the script will set these Include variables
# when it fleshes out the template
#
# NOTE(review): createFileFromTemplate() also uses $sectionEndPattern and
# $subsectionEndPattern, which are not declared anywhere in the visible
# source -- presumably they were declared here; TODO confirm.
my $sectionStartPattern = '';
my $subsectionStartPattern = '';
# End SSI variable patterns
# SSI path->variable hash REFERENCE. This will map directories to SSI
# variable values
# Each directory is listed from the web document root, and corresponds
# to an array that holds the SSI's "section" and "subsection" variable
# values if the path to the current file is not in this array,
# "section" and "subsection" will be set to "".
# PLEASE NOTE THAT BOTH THE HASH AND THE NESTED ARRAY RETURN
# REFERENCES THAT NEED TO BE EXPANDED IN THE getNearestMatchByPath()
# SUBROUTINE.
my $sectionMapRef =
{
'/' => ['', ''],
'/forms/' => ['forms', ''],
'/forms/policies/' => ['forms', 'policies'],
'/forms/endorsements/' => ['forms', 'endorsements'],
'/forms/stateforms/' => ['forms', 'state_forms'],
'/forms/document_execution/' => ['forms', 'document_execution'],
'/forms/acknowledgments/' => ['forms', 'acknowledgments'],
'/recording/' => ['recording', ''],
'/recording/recording_data/' => ['recording', 'recording_data'],
'/recording/state_summaries/' => ['recording', 'state_summaries'],
'/recording/additional_data/' => ['recording', 'additional_data'],
'/recording/newsletters/' => ['recording', 'newsletters'],
'/ucc/' => ['ucc', ''],
'/ucc/filing_data/' => ['ucc', 'filing_data'],
'/ucc/collateral_charts/' => ['ucc', 'collateral_charts'],
'/ucc/filing_facts/' => ['ucc', 'filing_facts'],
'/ucc/statutes/' => ['ucc', 'statutes'],
'/ucc/additional_data/' => ['ucc', 'additional_data'],
'/ucc/newsletters/' => ['ucc', 'newsletters'],
'/taxes/' => ['taxes', ''],
'/taxes/mortgage_taxes/' => ['taxes', 'mortgage_taxes'],
'/taxes/transfer_taxes/' => ['taxes', 'transfer_taxes'],
'/real_estate/' => ['real_estate', ''],
'/real_estate/laws_customs/' => ['real_estate', 'laws_customs'],
'/real_estate/fund_disbursements/' => ['real_estate', 'fund_disbursements'],
'/title_digests/' => ['title_digests', ''],
'/home/whatsnew.shtml' => ['whats_new', ''],
'/search/' => ['search', ''],
'/portfolio/' => ['portfolio', ''],
'/home/articles/' => ['articles', '']
};
# End SSI path->variable hash
############
# MAIN #
############
# Without a clean flag parse and at least one target there is nothing to
# do: print the usage instructions and exit.
if (!$successfulParse || !@ARGV) {
    &displayUsage();
    exit;
}

# If the 'print to standard out' flag is set, turn off error reporting
# if ($opt_p) {
#     $opt_V = 1;
# }

# Work out the document root: the -d argument when one was given,
# otherwise the built-in default. Both are resolved to absolute paths.
if ($opt_d) {
    $docRoot = abs_path($opt_d);
    &report("Could not set the base path") unless $docRoot;
}
else {
    $docRoot = abs_path($defaultDocRoot);
    unless ($docRoot) {
        &report("Default document root does not exist. "
              . "Setting the document root to \"\"");
        $docRoot = '';    # fall back to an empty document root
    }
}

# Consume the targets one by one. Each is made absolute (which also drops
# any trailing slash), validated, and then handed to the file or the
# directory handler as appropriate.
while (defined(my $rawTarget = shift(@ARGV))) {
    my $target = getAbsolutePath($rawTarget);
    next unless &validateArgs($target);

    if (-T $target) {
        &parseHTML($target);
    }
    elsif (-d $target) {
        &parseDirectory($target);
    }
}
############
# END MAIN #
############
############
# SUBS #
############
# Decides whether a single target (named on the command line or found
# while walking a directory) may be processed.
#
# Parameters:
#   $target - path of the file or directory to check.
#
# Returns 1 when $target is a directory, or a text file that is not on
# the @invalidFiles list and whose name ends with one of @validExtensions
# (compared case-insensitively). Returns 0 otherwise; the reason for the
# rejection is passed to report(), except when no name was given at all.
sub validateArgs
{
    my ($target) = @_;

    unless (defined($target) && -e $target) {
        # A file was named, but it doesn't exist. Stay quiet when no
        # file was named in the first place.
        if (defined($target) && $target ne "") {
            &report("File \"$target\" not found!");
        }
        return 0;
    }

    if (-T $target) {
        # Refuse files on the invalid file list. The name is matched
        # literally (\Q...\E) and must be a whole path component, so
        # that "." is not a wildcard and "my-template.shtml" is not
        # mistaken for "template.shtml".
        foreach my $fileName (@invalidFiles) {
            if ($target =~ m{(?:\A|/)\Q$fileName\E\z}) {
                &report("$target is not allowed to be processed by this script.");
                return 0;
            }
        }

        # Accept the file as soon as one valid extension matches; the
        # extension is quoted so its leading "." only matches a dot.
        foreach my $extension (@validExtensions) {
            return 1 if $target =~ m/\Q$extension\E\z/i;
        }

        # No extension matched: give the user the list of valid ones.
        &report("$target has an invalid extension. "
              . "The valid extensions are " . join(', ', @validExtensions));
        return 0;
    }

    # Directories are always valid targets.
    return 1 if -d $target;

    &report("$target is a binary file.");
    return 0;
}
# Prints a message, followed by a newline, to the currently selected
# output handle (STDOUT unless something has selected another one).
#
# Parameters:
#   $message      - the text to print.
#   $overrideFlag - optional; when true, the message is printed even if
#                   the -V (non-verbose) flag is set.
#
# The message is suppressed only when -V is set and no override flag was
# passed. The return value is not meaningful.
#
# The sub is declared without a prototype on purpose: the parenthesised
# variable names that used to follow the name were parsed by Perl as a
# (malformed) prototype, not as a parameter list.
sub report
{
    my ($message, $overrideFlag) = @_;

    return if $opt_V && !$overrideFlag;

    print "$message\n";
    return;
}
# Prints the command-line help text to standard out: a one-line synopsis,
# a blank line, and then one entry per option. Takes no arguments.
sub displayUsage ()
{
    # One element per entry; the newline that ends each entry is added
    # when the text is assembled below.
    my @usageLines = (
        'Usage: ctt-stylize [-d docRoot] [-InopRUVv] '
            . '[-S section] [-s subsect] target(s)',
        '',
        ' -h display this help message and exit.',
        ' -d docRoot set the document root used in determining SSI '
            . 'variable values',
        ' -I leave any Server-Side Includes already in '
            . "the target's body.",
        ' -n do not execute anything that will change the disk.',
        ' -o overwrite previously processed files',
        ' -p print results to standard out',
        ' -R process directories recursively',
        ' -S section manually set SSI variable "section" for target',
        ' -s subsect manually set SSI variable "subsection" '
            . 'for target',
        # This entry continues on a second, tab-indented line.
        ' -U templatize for unregistered (pre-login) users'
            . "\n\t "
            . '(otherwise assumes post-login templatization)',
        ' -V non-verbose: suppress all error messages',
        ' -v verbose: print every file being processed',
        ' target HTML file(s) or directories',
    );

    print join('', map { "$_\n" } @usageLines);
}
# Reads one (S)HTML file, extracts its page title and body content, and
# passes them to createFileFromTemplate() so the file is rewritten
# through the template.
#
# Parameters:
#   $file - path of the file to process.
#
# A file whose text matches $customComment has been processed before. It
# is skipped with a report unless -o (overwrite) is set, in which case
# the title and body are taken from between the template's own markers.
# Any other file has its title and body taken from the standard HTML
# TITLE and BODY elements. A part that cannot be found is passed on as
# an empty string. If the file cannot be read, the problem is reported
# and nothing is written.
#
# NOTE(review): the read from the file handle and the opening-tag half of
# the two HTML regexes were missing from the copy of the file this was
# written against (it looked as if everything between angle brackets had
# been stripped); both were restored here -- confirm against version
# control.
sub parseHTML
{
    my ($file) = @_;

    # Slurp the whole file. $/ is localised so that the caller's record
    # separator is restored on every exit path.
    my $completeText;
    {
        my $inputHandle;
        unless (open($inputHandle, '<', $file)) {
            &report("Can\'t read file $file, $!");
            return;
        }
        local $/;
        $completeText = <$inputHandle>;
        close($inputHandle);
    }
    $completeText = '' unless defined $completeText;

    my $alreadyProcessed = ($completeText =~ m/$customComment/s) ? 1 : 0;

    # Don't overwrite a processed file unless overwrite permission is on.
    if ($alreadyProcessed && !$opt_o) {
        &report("$file has already been processed.");
        return;
    }

    my $pageTitle = "";
    my $pageBody = "";

    if ($alreadyProcessed) {
        # This file has been through the template before, so pull the
        # title and body out from between the template patterns defined
        # at the beginning of the script.
        if ($completeText =~ m/$titleStartPattern(.*?)$titleEndPattern/is) {
            $pageTitle = $1;
        }
        if ($completeText =~ m/$bodyStartPattern(.*?)$bodyEndPattern/is) {
            $pageBody = $1;
        }
    }
    else {
        # Grab title and body from the standard HTML title and body tags
        # (either tag may carry attributes).
        if ($completeText =~ m/<TITLE.*?>(.*?)<\/TITLE.*?>/is) {
            $pageTitle = $1;
        }
        if ($completeText =~ m/<BODY.*?>(.*?)<\/BODY.*?>/is) {
            $pageBody = $1;
        }
    }

    &createFileFromTemplate($file, $pageTitle, $pageBody);
}
# Processes every entry of a directory: files are checked with
# validateArgs() and, when valid, handed to parseHTML(). Subdirectories
# are processed recursively when -R is set, and merely reported otherwise.
#
# Parameters:
#   $directoryName - the directory to open.
#   $path          - optional; prefix used to build each entry's full
#                    path. Defaults to $directoryName.
#
# A directory whose own name (its last path component) is listed in
# @invalidDirectories is reported and left alone. Only the last component
# is compared, so a listed name appearing somewhere else in the path --
# for example "bin" inside "/home/robin/site" -- does not cause a skip.
sub parseDirectory
{
    my ($directoryName, $path) = @_;

    # Make sure the directory isn't on the invalid list.
    my $leafName = basename($directoryName);
    foreach my $invalidDirectory (@invalidDirectories) {
        if ($leafName eq $invalidDirectory) {
            &report("The $directoryName directory is not automatically "
                  . "parsed by this program.\nYou can, however, "
                  . "call this script from within the directory");
            return;
        }
    }

    # If no explicit path prefix was passed in, use the directory itself.
    $path = $directoryName unless $path;

    # A lexical handle is used, and the listing is read in full and
    # closed before anything is processed, so recursive calls cannot
    # disturb this call's handle.
    my $dirHandle;
    unless (opendir($dirHandle, $directoryName)) {
        &report("Can\'t open directory $directoryName");
        return;
    }
    my @directoryContents = readdir($dirHandle);
    closedir($dirHandle);

    foreach my $entry (@directoryContents) {
        # Never process `.' or `..'.
        next if $entry eq '.' || $entry eq '..';

        my $target = $path . '/' . $entry;

        if (-d $target) {
            if ($opt_R) {
                &parseDirectory($target);
            }
            else {
                &report("$target is a directory");
            }
        }
        elsif (&validateArgs($target)) {
            # This is a valid file, so parse it.
            &parseHTML($target);
        }
    }
    return;
}
# Inserts a page title and body into the HTML template named by $template
# and either prints the result (-p) or saves it over $file.
#
# Parameters:
#   $file      - path of the file being processed; also the output file.
#   $pageTitle - text placed between the template's title markers.
#   $pageBody  - text placed between the template's body markers.
#
# The SSI "section" and "subsection" values come from -S / -s when either
# is given, otherwise from getNearestMatchByPath(); a missing value
# becomes the empty string. Unless -I is set, Server-Side Includes and
# the template's own body markers are stripped from the body first.
# Nothing is written when -n is set. If the template cannot be read or is
# empty, the problem is reported and $file is left untouched, because
# writing the result would wipe out the page.
#
# NOTE(review): the read from the template handle and the pattern that
# strips Server-Side Includes were missing from the copy of the file this
# was written against (it looked as if everything between angle brackets
# had been stripped); both were restored here -- confirm against version
# control.
sub createFileFromTemplate
{
    my ($file, $pageTitle, $pageBody) = @_;

    # Work out the SSI variable values for this file.
    my ($section, $subsection);
    if ($opt_S || $opt_s) {
        # Set manually on the command line.
        ($section, $subsection) = ($opt_S, $opt_s);
    }
    else {
        # Best fit for the file's path; nothing comes back on no match.
        ($section, $subsection) =
            &getNearestMatchByPath($sectionMapRef, $file, $docRoot);
    }
    $section = '' unless defined $section;
    $subsection = '' unless defined $subsection;

    # Slurp the template. $/ is localised so that the caller's record
    # separator is restored on every exit path.
    my $output;
    {
        my $templateHandle;
        unless (open($templateHandle, '<', $template)) {
            &report("Can\'t read template $template, $!");
            return;
        }
        local $/;
        $output = <$templateHandle>;
        close($templateHandle);
    }
    unless (defined($output) && $output ne '') {
        &report("Template $template is empty, leaving $file untouched");
        return;
    }

    # If the verbose flag is set, report the file being processed.
    if ($opt_v) {
        &report("Processing: $file", 1);
    }

    # Unless the "don't remove SSIs" flag is active, remove all
    # Server-Side Includes and template substitution patterns from the
    # content.
    unless ($opt_I) {
        # Strip comments beginning with `#' (SSI directives).
        if ($pageBody =~ s/<!--#.*?-->//gs) {
            &report("Stripping old Server-Side Includes from $file");
        }

        # Strip the special comments that may confuse this script.
        my $stripperFlag = 0;
        $stripperFlag++ if $pageBody =~ s/$bodyStartPattern//g;
        $stripperFlag++ if $pageBody =~ s/$bodyEndPattern//g;
        if ($stripperFlag) {
            &report("Stripping comments that may confuse this script from $file");
        }
    }

    # Substitute the SSI variable values.
    $output =~ s/($sectionStartPattern).*?($sectionEndPattern)/$1$section$2/is;
    $output =~ s/($subsectionStartPattern).*?($subsectionEndPattern)/$1$subsection$2/is;

    # Insert the custom comment that marks this file as processed, then
    # the title and the body.
    $output =~ s/($commentStartPattern.*?)($commentEndPattern)/$1$customComment$2/is;
    $output =~ s/($titleStartPattern).*?($titleEndPattern)/$1$pageTitle$2/is;
    $output =~ s/($bodyStartPattern).*?($bodyEndPattern)/$1$pageBody$2/is;

    # With -p the result goes to standard out and the disk is not touched.
    if ($opt_p) {
        print $output;
        return;
    }

    # With -n nothing that changes the disk is done.
    return if $opt_n;

    # Save over the old file.
    my $outputHandle;
    unless (open($outputHandle, '>', $file)) {
        &report("Can\'t write to file $file, $!");
        return;
    }
    flock($outputHandle, 2);    # 2 is an exclusive lock; close() releases it
    my $written = print {$outputHandle} $output;
    my $closed = close($outputHandle);
    unless ($written && $closed) {
        &report("Can\'t finish writing file $file, $!");
    }
    return;
}
# Returns the absolute path of the file or directory passed in.
# Uses the Cwd and File::Basename modules.
#
# Parameters:
#   $target - a relative or absolute path.
#
# Returns:
#   - for a regular file: the absolute path of its directory, a slash,
#     and the file name;
#   - for a directory: its absolute path WITHOUT a trailing slash (the
#     root directory itself stays "/");
#   - for anything else, or when the absolute path cannot be determined:
#     $target unchanged.
sub getAbsolutePath
{
    my ($target) = @_;

    if (-f $target) {
        # Split the file name from its directory and make only the
        # directory part absolute.
        my ($file, $path) = fileparse($target);
        my $absoluteDir = abs_path($path);
        if (defined $absoluteDir) {
            # abs_path() gives "/" for the root directory; drop it so
            # the join below does not produce "//file".
            $absoluteDir = '' if $absoluteDir eq '/';
            $target = $absoluteDir . '/' . $file;
        }
    }
    elsif (-d $target) {
        my $absoluteDir = abs_path($target);
        $target = $absoluteDir if defined $absoluteDir;
    }

    return $target;
}
# Finds the values registered for the nearest enclosing path of a target.
#
# Parameters:
#   $hashReference - reference to a hash that maps paths (files or
#                    directories) to array references of values.
#   $targetPath    - the complete path to look up.
#   $basePath      - optional; prepended to every path in the hash
#                    before comparing. Defaults to the empty string.
#
# The search works backwards from the complete path: if the complete path
# matches an entry, that entry's values are returned; otherwise the last
# component is lopped off and the parent path is tried, and so on.
# Trailing slashes are ignored on both sides of the comparison.
#
# Returns the matching entry's values as a list. When nothing matches, it
# returns an empty list (undef in scalar context), so a caller can test
# the result with "if (@values)".
sub getNearestMatchByPath
{
    my ($hashReference, $targetPath, $basePath) = @_;

    return unless $hashReference && defined $targetPath;
    $basePath = '' unless defined $basePath;

    # Index the entries once by their absolute, slash-trimmed path, so
    # each level of the walk below is a single hash lookup.
    my %valuesForPath;
    foreach my $pathKey (keys %$hashReference) {
        my $cleanPathKey = $basePath . $pathKey;
        $cleanPathKey =~ s/\/$//;    # remove trailing slash
        $valuesForPath{$cleanPathKey} = $hashReference->{$pathKey};
    }

    while ($targetPath ne "") {
        $targetPath =~ s/\/$//;      # remove trailing slash

        if (exists $valuesForPath{$targetPath}) {
            return @{ $valuesForPath{$targetPath} };
        }

        # No entry for this path: remove everything from the last '/'
        # on and try the parent. Stop when there is no '/' left, since
        # the path can no longer get any shorter.
        last unless $targetPath =~ s/\/[^\/]*$//;
    }

    return;    # no match found
}
############
# END SUBS #
############