#!/usr/local/bin/perl
# guatex2html -- Translate the -gua!spi papers into HTML.
# Usage:  guatex2html file.tex > file.html
# BEWARE:  This program will handle *most* constructions, but further hand work
# is needed on the hard parts.  So don't blindly rebuild an existing HTML file,
# as you'll ruin the hand labor.  Items to look for:
#   * Macros of the "halign" type are not handled at all.  You need to
#     turn them into proper HTML tables.  Also, \obeylines can't be obeyed.
#   * \xitem is defined differently in different files and tables within each
#     file.  Have fun.
#   * In some cases, 2-column tables looked a lot better with 3 columns,
#     like this:
#
Abstract: ", endabstract => 0, "", chapter => 1, "
| %", cr => 0, " |
| ", # Spurious row at end of table, too bad caption => 1, " |
", # Lose [title of table] endtable => 0, "
', vhalign => 1, '
', endphalign => 0, '
', noindent => 0, "", # Can't turn indentation on or off penalty => 1, "", # Ignore penalties. def => 1, "", # Ignore TeX macro definitions. newcommand => 3, "", # Ignore LaTeX macro definitions renewcommand => 3, "", # Ignore LaTeX macro definitions documentstyle => 2, "", # Ignore various LaTeX admin stuff oddsidemargin => 2, "", evensidemargin => 2, "", document => 0, "", enddocument => 0, "", maketitle => 0, "", protect => 0, "", vskip => 1, "", # Definitions from guaspi.sty qh => 0, "`-'", # Quoted hyphen '!' => 0, "\\", # Backslash '|' => 0, "|", # Vertical bar caret => 0, "^", # Caret without kerns for unslanted type dotSE => 0, " . . .", # Ellipsis dots... dots => 0, " . . .", # Ellipsis dots... # Environment for running -gua\spi text. It's necessary to suppress line # breaks after a hyphen (tone symbol). guaemg => 1, "
%
$guafont%$guafonte
", ex => 2, "
', endexii => 0, "
is not wanted before them. %blocklevel = qw(
); # Read entire document at once.
$bfr =~ s/(?= length($bfr);
&output(undef, "\n\n
==== Unbalanced right squiggle here ====
\n\n");
redo;
}
# Report every control sequence that was met but has no entry in %subst.
$z = join("\n", sort keys %missing);
print "\n\nThese macros have no definition:\n$z\n" if $z ne '';

# The active characters
BEGIN {
    %active = qw(\ 1 { 1 } 1 [ 1 ] 1 ~ 1 & 1 $ 1);
}

# Convert the buffer to tokens.  Args:
#   \$bfr     Ref. to linear input buffer
#   \$output  Ref. to linear output buffer, or undef for direct printing.
#   $end      The control sequence (with backslash) or active character at
#             which the unit ends.  It is included with the unit.  Specify
#             '' for exactly one token (or a subunit in { }).  Specify
#             \bye or \enddocument for the entire document.
#   $j        Index in buffer to start at
# Returns:    Index in buffer just after $end
# Globals read:    %active, %nargs, %endmarks, %sepr, %subst
# Globals written: $indent (one '*' per nesting level), $needpara, %missing
# It's assumed that no token can be over 100 bytes long.
sub tokenize {
    my($bfr, $output, $end, $j) = @_;
    my($h, $j0);
    my $len = length($$bfr);

    # One more '*' per nesting level; output() compares the length of
    # $indent with $needpara to decide when a paragraph break is due.
    $indent .= '*';
#   print STDERR "$indent Starting group `$end'\n";             #DEBUG

    # When hunting for macro arguments, whitespace before or
    # between arguments is skipped.
    $j += length($1) if substr($$bfr, $j, 100) =~ /(^\s+)/s;

    # Split off tokens one by one.  The bare block runs once; the
    # continue block redoes it until the token just read is $end.
    TOKENS: {
        last TOKENS if $j >= $len;      # If end of input was reached
        $j0 = $j;                       # Location of token start

        # Tokens consist of:
        #   % to end of line (comment, ignored)
        #   \alphabetic, a macro name, eating one space after
        #   digits followed by letters, a dimen
        #   word characters, a word
        #   a contiguous stretch of spaces including \n
        #   any single character.
        $h = substr($$bfr, $j, 1);      # The next byte
#       print STDERR "`$active{$h}' ", nonl(substr($$bfr, $j, 10)), "\n"; #DEBUG

        if ($active{$h}) {
            $j++;
            if ($h eq "\\") {           # A macro name
                # Take the captures from the match's return list so that a
                # failed match (backslash as the very last byte) cannot
                # leave stale $1/$2 values behind.
                my($name, $space) =
                    substr($$bfr, $j, 20) =~ /^([A-Za-z]+|.)(\s?)/s;
                $name  = '' unless defined $name;
                $space = '' unless defined $space;
                $j += length($name);
                $h = $name;
                # Eat optional space after macro name, if there are
                # arguments, except leave a newline that doesn't
                # prevent recognition of args.
                $j += length($space)
                    unless $nargs{$h} == 0 || exists($endmarks{$h});
            }

            # Transform \begin{name} to \name, \end{name} to \endname
            if (($h eq "begin" || $h eq "end")
                && substr($$bfr, $j, 20) =~ /^\{(\w+)\}/) {
                $j += 2 + length($1);   # 2 accounts for the braces
                $h = (($h eq "end") ? $h : "") . $1;
            }

            # Extraction of arguments.
            my(@args);
            my $na = $nargs{$h};
            if ($sepr{$h} ne '') {
                # A special separator may delimit the argument(s).
                @args = split($sepr{$h}, substr($$bfr, $j, 100), $na+1);
                pop(@args);             # Lose text following special arg
                $j += length(join($sepr{$h}, @args, ''));
            } else {
                # Normally, a given number of ordinary tokens are used,
                # but a specific control sequence may be specified to
                # delimit the argument (generally only one).
                while ($na-- > 0) {
                    push(@args, '');
                    $j = tokenize($bfr, \$args[-1], $endmarks{$h}, $j);
                }
            }

            # Do the macro substitution.
#           print STDERR "Macro sub `$h' args @args\n";         #DEBUG
            if (exists($subst{$h})) {
                output($output, $subst{$h}->(@args));
            } else {
                # Unknown macro: pass it through and remember it for
                # the report printed at the end of the run.
                output($output, "\\$h" . join('', @args));
                $missing{"\\$h"}++;
            }

        # Numbers are special in TeX.  A dimension may follow.
        } elsif (substr($$bfr, $j, 200) =~ /^(-?[0-9]+[a-z]*)/) {
            output($output, $1);
            $j += length($1);

        # Ordinary text includes letters, whitespace,
        # and certain punctuation not significant to TeX.
        } elsif (substr($$bfr, $j, 200) =~ /^([A-Za-z.,;()`' \t]+\n?)/) {
            output($output, $1);        # Ordinary text (to end of line)
            $j += length($1);

        } else {
            output($output, $h);        # A single character
            $j++;
        }

        # An empty line followed by real text (not a comment or another
        # newline) was just passed: ask for a paragraph break at this
        # nesting level.  The $j >= 2 guard keeps the lookback from
        # wrapping around to the end of the buffer.
        $needpara = length($indent)
            if $j >= 2 && substr($$bfr, $j-2, 3) =~ /^\n\n[^%\n]/;
#       printf STDERR "%-8s %s\n", $indent, nonl(substr($$bfr, $j0, $j-$j0)); #DEBUG
    } continue {
        # Keep going until the token that was just consumed is $end
        # (an empty $end matches at once, giving exactly one token).
        redo TOKENS unless substr($$bfr, $j0, length($end)) eq $end;
    }
#   print STDERR "$indent Exiting group `$end'\n";              #DEBUG

    chop($indent);                      # Leave this nesting level
    return $j;
}

# Appends a fragment to the output stream.
#   \$output  Ref. to linear buffer for output, or undef for direct printing
#   $data     String to append
# Globals read: $indent, %blocklevel.  Global written: $needpara.
sub output {
    my($output, $data) = @_;

    # After an empty line in the input stream, insert a paragraph break,
    # except don't if the next HTML tag is at block level.
    if (length($indent) <= $needpara) {
        $needpara = 0;
        # Capture first HTML tag excluding arguments.  Use the match's
        # return list: reading $1 after a failed match would pick up a
        # stale capture from the caller's last successful match.
        my($tag) = $data =~ /^(<\w+)/;
        # NOTE(review): as received, the inserted string is only a newline;
        # the paragraph markup the comments describe appears to have been
        # lost from this copy -- confirm against a pristine script.
        $data = "\n" . $data
            unless defined($tag) && $blocklevel{lc($tag)};
    }

    if (defined($output)) {
        $$output .= $data;
    } else {
        print $data;
    }
}

# Where a string has newlines, changes to "\\n".
#   $data     String to convert (the caller's copy is not modified)
# Returns:    The converted string
sub nonl {
    my($data) = @_;
    $data =~ s/\n/\\n/g;
    return $data;
}