#!/usr/local/bin/perl
# guatex2html -- Translate the -gua!spi papers into HTML.
# Usage: guatex2html file.tex > file.html
#
# BEWARE: This program will handle *most* constructions, but further hand
# work is needed on the hard parts.  So don't blindly rebuild an existing
# HTML file as you'll ruin the hand labor.  Items to look for:
#   * Macros of the "halign" type are not handled at all.  You need to
#     turn them into proper HTML tables.  Also, \obeylines can't be obeyed.
#   * \xitem is defined differently in different files and tables within
#     each file.  Have fun.
#   * In some cases, 2-column tables looked a lot better with 3 columns,
#     like this:
#         Word                What it means
#         -gua\spi example    Translation
#     It would have saved a lot of work to put this definition into a local
#     file where it's relevant:
#         xitem 3 % % % %
#   * \def is programmed to eat everything up to and including the {}, but
#     about three \defs have highly deceptive nesting behavior, so that
#     sections of wanted text got eaten -- half the file, in the worst case.
#   * In a few cases the TeX files use \emg to italicize Roman text.  This
#     is bogus, and this program will underline it.
#
# If the command line file is file.tex, after the standard macro expansions
# are loaded, the program will read file.shy if it exists, to supplement the
# macros.  See \xitem above for the format; the args go directly to &addhash.

# Exactly one readable input file must be named on the command line.
@ARGV == 1  or die "file.tex (exactly one) is required\n";
-r $ARGV[0] or die "No file $ARGV[0]\n";

# Text placed before and after anything set in the \gua environment.
# NOTE(review): both are empty here; presumably they once held opening and
# closing font markup -- confirm against an intact copy of the script.
($guafont, $guafonte) = ("", "");

# Adds key-value pairs to %subst without replacing what's there.  Args:
#   $key    Name of macro without leading backslash
#   $narg   Number of arguments of this macro followed by the
#           separator (if any).  All separators must be the same.
#   $value  Either a string or a code ref.  For a string, each
#           successive % is replaced by one argument.  If a code
#           ref, the arguments are the macro args and the return
#           value is what to replace them with.
#   Arbitrarily many triplets are given.
# Each triplet also records the argument count in %nargs and the separator
# in %sepr.  Dies if a triplet is incomplete or its count field is not of the
# form digits-then-separator; a template that fails to compile is reported
# with warn and that macro is left exactly as it was before the call.
sub addhash {
    while (my ($k, $n, $v) = splice(@_, 0, 3)) {
        # Number of args + separator, e.g. "2,".  The captures are taken in
        # list context so a failed match cannot leak the previous triplet's
        # (or the caller's) $1/$2 into the tables.
        my ($count, $sep);
        ($count, $sep) = ($1, $2)
            if defined($n) && $n =~ /^(\d*)(\D*)$/;

        # A missing value or a malformed count means the list has slipped
        # out of step with the key/count/value pattern.
        unless (defined($v) && defined($count)) {
            die "Error, args to addhash are out of sync. Keys:\n`",
                join("', `", keys %subst), "'\n";
        }

        # Now convert $v to a subroutine as described above.
        # (It could already be a subroutine.)
        if (ref($v) eq '') {
            my @parts = split('%', $v, -1);     # -1: don't lose trailing null fields
            my $w = shift @parts;               # Eventual body of the string
            $w = '' unless defined $w;          # Empty template => empty result
            my $i = 0;
            foreach my $part (@parts) {
                if (substr($w, -1) eq "\\") {   # Don't replace \%
                    substr($w, -1) = "%$part";
                } else {
                    $w .= '$_[' . $i++ . ']' . $part;   # % -> $_[$i]
                }
            }
            $w =~ s/(?=[\\"])/\\/g;             # Put \ before \ and "
            # NOTE(review): $ and @ in the template are not escaped, so they
            # interpolate when the generated sub runs.
            my $cmd = "sub { \"$w\" }";         # Make the subroutine.
            my $code = eval $cmd;
            unless (ref($code) eq 'CODE') {
                warn "Key $k $cmd --- $@\n";
                next;                           # Never install an undef handler
            }
            $v = $code;
        }

        # Install all three pieces together so a rejected definition cannot
        # leave a macro half-updated.
        $nargs{$k} = $count;
        $sepr{$k}  = $sep;                      # Usually this is ''
        $subst{$k} = $v;
    }
}
our %subst;

# Macros, 0 or 1 argument.  % represents the arg.
# The key is the macro name without leading backslash.
&addhash(
    # Escaped characters
    ' ' => 0, " ",              # Escaped space '\ '
    "\n" => 0, "\n",            # Accidentally escaped newline "\\\n"
    '/' => 0, "",               # Italic correction (ignore it) '\/'
    '-' => 0, "",               # Hyphenation hint (ignore)
    "=" => 0, "",               # Alternative to \- when hyphen is active
    '%' => 0, "\\%",            # Escaped percent
    '#' => 0, "#",              # Escaped pound sign
    '*' => 0, "*",              # Escaped asterisk (guaspi special)
    # Active characters
    '~' => 0, " ",              # Nonbreak space
    '&' => 0, "",               # Separator in halign
    '$' => 0, "",               # Math mode (ignore it)
    '{' => 1, "%",              # Grouping characters
    '}' => 0, "",               # Grouping characters
    '[' => 1, "[%",             # Visible grouping characters
    ']' => 0, "]",              # Visible grouping characters
    # General TeX and LaTeX definitions
    title => 1, "

%

", author => 1, "

%

", date => 1, "
%
", abstract => 0, "
Abstract: ", endabstract => 0, "
", chapter => 1, "

%

", section => 1, "

%

", subsection => 1, "

%

", appendix => 1, "

Appendix: %

",
    label => 1, " ",
    it => 1, "%",
    # Emit entities: a raw < or > would be read as markup by the browser.
    langle => 0, "&lt;",
    rangle => 0, "&gt;",
    P => 0, "¶",
    quad => 0, "    ",                  #Disgusting
    qquad => 0, "        ",
    # \begin{itemize}: from here on \item takes no argument and expands to
    # a bullet.  The handler itself contributes no text.
    itemize => 0, sub {
        $subst{item} = sub { "
  • " };
        $nargs{item} = 0;
        "";
    },
    # NOTE(review): the closing brace of the handler above and this entry
    # were missing from the text as found; the entry is added so that
    # \end{itemize} is not reported as an undefined macro.  Confirm the
    # intended expansion against an intact copy.
    enditemize => 0, "",
    # \begin{list}: \item takes one argument from here on.
    list => 0, sub {
        $subst{item} = sub { "
    $_[0]
    " }; $nargs{item} = 1; "
    "; }, endlist => 0, "
    ", cite => 1, "[%]", sp => 1, "%", # Superscript sc => 1, '%', ref => 1, "[%]", # Not really functional # It's assumed that all tables have the form # \begin{table} \halign{stuff} \caption stuff \end{table}. table => 1, "
    %", halign => 1, "
    %", cr => 0, "
    ", # Spurious row at end of table, too bad caption => 1, "
    \n
    ", # Lose [title of table] endtable => 0, "

    ", figure => 0, "
    ", # Can't do much with a figure endfigure => 0, "
    ", phalign => 0, '
    ', endphalign => 0, '
    ', vhalign => 1, '
    %
    ', noindent => 0, "", # Can't turn indentation on or off penalty => 1, "", # Ignore penalties. def => 1, "", # Ignore TeX macro definitions. newcommand => 3, "", # Ignore LaTeX macro definitions renewcommand => 3, "", # Ignore LaTeX macro definitions documentstyle => 2, "", # Ignore various LaTeX admin stuff oddsidemargin => 2, "", evensidemargin => 2, "", document => 0, "", enddocument => 0, "", maketitle => 0, "", protect => 0, "", vskip => 1, "", # Definitions from guaspi.sty qh => 0, "`-'", # Quoted hyphen '!' => 0, "\\", # Backslash '|' => 0, "|", # Vertical bar caret => 0, "^", # Caret without kerns for unslanted type dotSE => 0, " . . .", # Ellipsis dots... dots => 0, " . . .", # Ellipsis dots... # Environment for running -gua\spi text. It's necessary to suppress line # breaks after a hyphen (tone symbol). guaemg => 1, "
    $guafont%$guafonte
    ", #The rule is in the original but it seems bogus. guahyph => 0, "$guafont-$guafonte", englhyph => 0, "-", emdash => 0, "---", #Need an em-dash by cowboy programming # An inline word or short phrase in -gua!spi gua => 1, "$guafont%$guafonte", qgua => 1, "$guafont``%''$guafonte", # Quoted gua\spi word guaspi => 0, "${guafont}gua\\spi$guafonte", #The name of gua\spi Guaspi => 0, "${guafont}Gua\\spi$guafonte", #Same, capitalized # A word with its translation. Format: \trw-gua,english, trw => "2,", sub { "``$guafont" . substr($_[0],1) . "$guafonte-" . $_[1] . "''" }, emg => 1, "%", # An emphasized gua\spi word, underlined betw => 1, "<%>", # A phrase in angle brackets < > hfilbreak => 0, "", # Ignore various line break adjustments afilbreak => 0, "", vabreak => 0, "", # These names pertain to \halign: phalign endphalign shalign vhalign # Paragraph in table cell. It's used with 2 arguments: a width and the # content. Toss the width, leaving the content. Return value: empty string. littlepar => 1, "", # 2-column examples. exii is seen as \begin{exii} and endexii is \end{exii}. exii => 0, '
    ', endexii => 0, "
    ", ex => 2, "$guafont%$guafonte%", # A single 2-column example pli => 2, sub { &{$subst{exii}}() . &{$subst{ex}}(@_) . &{$subst{endexii}}() }, # A lot of 2-column examples exbox => 1, sub { &{$subst{exii}}() . $_[0] . &{$subst{endexii}}() }, # Word lists have two 2-column lines. Args are: # \xitem{word}{description}{-gua!spi example}{translation} # xitem => 4, sub { &{$subst{ex}}("** $_[0]", $_[1]) . &{$subst{ex}}(@_[2..3]) }, ); # End of loading %subst

# Load an auxiliary definition file if present.  Its lines have the format:
#       macroname nargs content
# with fields separated by whitespace.  The content is the rest of the line.
# Blank lines and lines beginning with % are ignored.  Content that begins
# with "sub " is compiled as Perl source; anything else goes to &addhash as
# a template string.
# NOTE(review): the .shy file is eval'ed as code, so it must be trusted as
# much as this script itself.
$AUX = $ARGV[0];
# Only a trailing .tex is swapped for .shy; if the name has no such suffix
# there is no auxiliary file to look for.
if ($AUX =~ s/\.tex\z/.shy/ && open(my $aux_fh, '<', $AUX)) {
    while (my $line = <$aux_fh>) {
        next if $line =~ /^\s*(%|$)/;
        chomp $line;
        my @row = split(' ', $line, 3);
        if (@row < 3) {
            # Report the offending line here rather than letting &addhash
            # die with an unrelated "out of sync" message.
            warn "In $AUX line $.: expected `macroname nargs content'\n";
            next;
        }
        if ($row[2] =~ /^sub /) {
            my $cmd = eval $row[2];
            if (defined($cmd)) {
                $row[2] = $cmd;
            } else {
                warn "In $AUX $row[2] --- $@\n";
                $row[2] = " OOPS ";     # Visible marker in the output
            }
        }
        &addhash(@row);
    }
    close $aux_fh;
}

# If an active character or macro name (without backslash) is a key in this
# table, its one argument extends to (and including) the macro (with backslash)
# or active character which is the value.
%endmarks = (
    '{',   '}',
    '[',   ']',
    "def", '{',         # \def#1{whatever}  this eats the {} too \}
);

# These HTML objects are at block level and a

    is not wanted before them. %blocklevel = qw(

    ); # Read entire document at once. $bfr =~ s/(?= length($bfr); &output(undef, "\n\n
    ==== Unbalanced right squiggle here ====
    \n\n"); redo; } $z = join("\n", sort keys %missing); print "\n

    These macros have no definition:\n$z\n" if $z ne '';

# The active characters
BEGIN { %active = qw(\ 1 { 1 } 1 [ 1 ] 1 ~ 1 & 1 $ 1); }

# Convert the buffer to tokens.  Args:
#   \$bfr       Ref. to linear input buffer
#   \$output    Ref. to linear output buffer, or undef for direct printing.
#   $end        The control sequence (with backslash) or active character at
#               which the unit ends.  It is included with the unit.  Specify
#               '' (or undef) for exactly one token (or a subunit in { }).
#               Specify \bye or \enddocument for the entire document.
#   $j          Index in buffer to start at
# Returns: Index in buffer just after $end
# Reads %active, %nargs, %sepr, %endmarks and %subst; records unknown macros
# in %missing; maintains the globals $indent and $needpara.
# It's assumed that no token can be over 100 bytes long.
sub tokenize {
    my($bfr, $output, $end, $j) = @_;
    my($h, $j0);
    my $len = length($$bfr);
    $end = '' unless defined $end;      # undef and '' both mean "one token"
    $indent .= '*';     # One star per nesting level; output() compares the
                        # length of $indent with $needpara.
#    print STDERR "$indent Starting group `$end'\n"; #DEBUG

    # When hunting for macro arguments, whitespace before or
    # between arguments is skipped.
    $j += length($1) if substr($$bfr, $j, 100) =~ /(^\s+)/s;

    # Split off tokens one by one.
  TOKENS: {
        last if $j >= $len;             # If end of input was reached
        $j0 = $j;                       # Location of token start

        # Tokens consist of:
        #   % to end of line (comment, ignored)
        #   \alphabetic, a macro name, eating one space after
        #   digits followed by letters, a dimen
        #   word characters, a word
        #   a contiguous stretch of spaces including \n
        #   any single character.
        # NOTE(review): nothing in this sub treats % specially; presumably
        # comments are removed before tokenizing -- confirm.
        $h = substr($$bfr, $j, 1);      # The next byte
#       print STDERR "`$active{$h}' ", &nonl(substr($$bfr, $j, 10)), "\n"; #DEBUG
        if ($active{$h}) {
            $j++;
            if ($h eq "\\") {           # A macro name
                # List-context captures: if a backslash is the very last
                # byte nothing matches, and the name is simply empty instead
                # of whatever an earlier match left in $1/$2.
                my ($name, $gap) =
                    substr($$bfr, $j, 20) =~ /^([A-Za-z]+|.)(\s?)/s;
                $name = '' unless defined $name;
                $gap  = '' unless defined $gap;
                $j += length($name);
                $h = $name;
                # Eat optional space after macro name, if there are
                # arguments, except leave a newline that doesn't
                # prevent recognition of args.
                $j += length($gap)
                    unless $nargs{$h} == 0 || exists($endmarks{$h});
            }
            # Transform \begin{name} to \name, \end{name} to \endname
            if (($h eq "begin" || $h eq "end")
                && substr($$bfr, $j, 20) =~ /^\{(\w+)\}/) {
                my $env = $1;
                $j += 2 + length($env);
                $h = (($h eq "end") ? $h : "") . $env;
            }

            # Extraction of arguments.
            my(@args);
            my $na = $nargs{$h};
            if ($sepr{$h} ne '') {
                # A special separator may delimit the argument(s).  It is
                # quoted so it is matched literally, which is also how the
                # consumed length is computed just below.
                @args = split(/\Q$sepr{$h}\E/, substr($$bfr, $j, 100), $na+1);
                pop(@args);             # Lose text following special arg
                $j += length(join($sepr{$h}, @args, ''));
            } else {
                # Normally, a given number of ordinary tokens are used,
                # but a specific control sequence may be specified to
                # delimit the argument (generally only one).
                while ($na-- > 0) {
                    push(@args, '');
                    $j = &tokenize($bfr, \$args[-1], $endmarks{$h}, $j);
                }
            }

            # Do the macro substitution.
#           print STDERR "Macro sub `$h' args @args\n"; #DEBUG
            if (exists($subst{$h})) {
                &output($output, &{$subst{$h}}(@args));
            } else {
                # Unknown macro: pass it through and remember its name.
                &output($output, "\\$h" . join('', @args));
                $missing{"\\$h"}++;
            }
        # Numbers are special in TeX.  A dimension may follow.
        } elsif (substr($$bfr, $j, 200) =~ /^(-?[0-9]+[a-z]*)/) {
            my $number = $1;
            &output($output, $number);
            $j += length($number);
        # Ordinary text includes letters, whitespace,
        # and certain punctuation not significant to TeX.
        } elsif (substr($$bfr, $j, 200) =~ /^([A-Za-z.,;()`' \t]+\n?)/) {
            my $text = $1;
            &output($output, $text);    # Ordinary text (to end of line)
            $j += length($text);
        } else {
            &output($output, $h);       # A single character
            $j++;
        }
        # An empty line followed by real text was just passed: ask output()
        # for a paragraph break at this nesting level or shallower.
        $needpara = length($indent)
            if (substr($$bfr, $j-2, 3) =~ /^\n\n[^%\n]/);
#       printf STDERR "%-8s %s\n", $indent, &nonl(substr($$bfr, $j0, $j-$j0)); #DEBUG
    } continue {
#       printf STDERR "end `%s' (%d) tail `%s'\n", $end, defined($end), &nonl(substr($$bfr, $j0, length($end))); #DEBUG
        # Keep going until the token just consumed started with $end.  With
        # an empty $end the comparison is true at once: exactly one token.
        redo unless substr($$bfr, $j0, length($end)) eq $end;
    }
#    print STDERR "$indent Exiting group `$end'\n"; #DEBUG
    substr($indent,-1) = '';
    $j;
}

# Appends a fragment to the output stream.
#   \$output    Ref. to linear buffer for output, or undef for direct printing
#   $data       String to append
sub output {
    my($output, $data) = @_;

    # After an empty line in the input stream, insert a paragraph separator,
    # except don't if the next HTML tag is at block level.
    if (length($indent) <= $needpara) {
        $needpara = 0;
        # Capture first HTML tag excluding arguments.  Taken in list context
        # so that data not starting with a tag yields undef rather than a
        # capture left over from an earlier match.
        my ($tag) = $data =~ /^(\<\w+)/;
        # NOTE(review): the separator below is whitespace only; presumably
        # it was paragraph markup originally -- confirm.
        substr($data, 0, 0) = "\n\n    "
            unless defined($tag) && $blocklevel{lc($tag)};
    }
    if (defined($output)) {
        $$output .= $data;
    } else {
        print $data;
    }
}

# Where a string has newlines, changes to "\\n".
sub nonl {
    my($data) = @_;
    $data =~ s/\n/\\n/sg;
    $data;
}