#!/usr/local/bin/perl
# guatex2html -- Translate the -gua!spi papers into HTML.
# Usage: guatex2html file.tex > file.html
# BEWARE: This program will handle *most* constructions, but further hand work 
# is needed on the hard parts.  So don't blindly rebuild an existing HTML file
# as you'll ruin the hand labor.  Items to look for:
#   * Macros of the "halign" type are not handled at all.  You need to
#	turn them into proper HTML tables.  Also, \obeylines can't be obeyed.
#   * \xitem is defined differently in different files and tables within each
#	file.  Have fun.
#   * In some cases, 2-column tables looked a lot better with 3 columns,
#	like this:
#	    <tr><td>Word <td colspan=2>What it means
#	    <tr><td><td>-gua\spi example <td>Translation
#	It would have saved a lot of work to put this definition into a local
#	file where it's relevant:
#	    xitem 3 <tr><td><i>%</i> <td colspan=2>% <tr><td><td><i>%</i> <td>%
#   * \def is programmed to eat everything up to and including the {}, but
#	about three \defs have highly deceptive nesting behavior, so that
#	sections of wanted text got eaten -- half the file, in the worst case.
#   * In a few cases the TeX files use \emg to italicize Roman text.  This is
#	bogus, and this program will underline it.

# If the command line file is file.tex, after the standard macro expansions 
# are loaded, the program will read file.shy if it exists, to supplement the
# macros.  See \xitem above for the format; the args go directly to &addhash.

# Command line: exactly one argument, and it must name a readable file.
@ARGV == 1  or die "file.tex (exactly one) is required\n";
-r $ARGV[0] or die "No file $ARGV[0]\n";

# HTML tags that open and close the font used in the \gua environment.
($guafont, $guafonte) = ("<I>", "</I>");

# Registers macro definitions in %subst (and their argument specs in %nargs
# and %sepr).  Entries for other keys are left alone; a key that is given
# again overrides its earlier definition.  Args, as arbitrarily many triplets:
#   $key		Name of macro without leading backslash
#   $narg		Number of arguments of this macro followed by the
#			separator (if any), e.g. "2,".  All separators must be
#			the same.
#   $value		Either a string or a code ref.  For a string, each
#			successive % is replaced by one argument, and \% gives
#			a literal percent sign.  All other template text is
#			taken literally (no Perl interpolation).  If a code
#			ref, the arguments are the macro args and the return
#			value is what to replace them with.
# Dies if the argument list is not a whole number of triplets or if an
# argument spec is not "digits followed by non-digits".
sub addhash {
    while (my($k, $n, $v) = splice(@_, 0, 3)) {
        unless (defined($v)) {
            die "Error, args to addhash are out of sync.  Keys:\n`",
                join("', `", keys %subst), "'\n";
        }
        # Number of args + separator, e.g. "2,".  Check the match so that
        # stale capture variables are never stored.
        $n =~ /^(\d*)(\D*)$/
            or die "Error, bad argument spec `$n' for macro `$k' in addhash\n";
        ($nargs{$k}, $sepr{$k}) = ($1, $2);	# Separator is usually ''

        # A plain string is a template; turn it into a subroutine.
        # (Anything else is assumed to be a subroutine already.)
        $v = _template_to_sub($v) if ref($v) eq '';
        $subst{$k} = $v;
    }
}

# Converts a template string into a code ref that fills it in.
#   $template	Text in which each unescaped % stands for the next argument
#   Returns:	Code ref; called with the macro arguments, it returns the
#		template with the placeholders replaced (missing args -> '').
# The template is cut into literal segments once, here; the closure only
# concatenates, so characters such as $ @ [ { " and \ need no escaping.
sub _template_to_sub {
    my($template) = @_;
    my @chunks = split('%', $template, -1);	#Don't lose trailing null fields
    my @literal = ('');			#Text between the placeholders
    $literal[0] = shift(@chunks) if @chunks;
    foreach my $chunk (@chunks) {
        if ($literal[-1] =~ /\\\z/) {	#Don't replace \%
            chop($literal[-1]);		#Drop the backslash, keep the %
            $literal[-1] .= "%$chunk";
        } else {
            push(@literal, $chunk);	#A real placeholder precedes $chunk
        }
    }
    return sub {
        my $out = $literal[0];
        foreach my $i (1 .. $#literal) {
            my $arg = $_[$i - 1];
            $out .= (defined($arg) ? $arg : '') . $literal[$i];
        }
        $out;
    };
}

our %subst;		# Macro table, filled in by &addhash.
			# The key is the macro name without leading backslash.

# Built-in macro definitions.  The arguments to &addhash are triplets:
#	name => argument count (optionally followed by a separator), value
# In a string value each % stands for one macro argument; a code ref value
# is called with the macro arguments and returns the replacement text.
&addhash(
# Escaped characters
	' '	=> 0, " ",		# Escaped space '\ '
	"\n"	=> 0, "\n",		# Accidentally escaped newline "\\\n"
	'/'	=> 0, "",		# Italic correction (ignore it) '\/'
	'-'	=> 0, "",		# Hyphenation hint (ignore)
	"="	=> 0, "",		# Alternative to \- when hyphen is active
	'%'	=> 0, "\\%",		# Escaped percent
	'#'	=> 0, "#",		# Escaped pound sign
	'*'	=> 0, "*",		# Escaped asterisk (guaspi special)
# Active characters
	'~'	=> 0, "&nbsp;",		# Nonbreak space
	'&'	=> 0, "<TD>",		# Separator in halign
	'$'	=> 0, "",		# Math mode (ignore it)
	'{'	=> 1, "%",		# Grouping characters
	'}'	=> 0, "",		# Grouping characters
	'['	=> 1, "[%",		# Visible grouping characters
	']'	=> 0, "]",		# Visible grouping characters
	

# General TeX and LaTeX definitions
	title	=> 1, "<H1 align=center>%</H1>",
	author	=> 1, "<H3 align=center>%</H3>",
	date	=> 1, "<div align=center>%</div>",
	abstract	=> 0, "<blockquote>Abstract: ",
	endabstract	=> 0, "</blockquote>",
	chapter		=> 1, "<H1 align=center>%</H1>",
	section		=> 1, "<H2 align=center>%</H2>",
	subsection	=> 1, "<H3 align=center>%</H3>",
	appendix	=> 1, "<H2 align=center>Appendix: %</H2>",
	label	=> 1, "<A name=\"%\"> </A>",
	it	=> 1, "<EM>%</EM>",
	langle	=> 0, "&lt;",
	rangle	=> 0, "&gt;",
	P	=> 0, "&para;",
	quad	=> 0, "&nbsp;&nbsp;&nbsp;&nbsp;",	#Disgusting
	qquad	=> 0, "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;",
# \begin{itemize} and \begin{list} each redefine \item (its replacement and
# its argument count) to suit the kind of list being opened.
	itemize	=> 0, sub { $subst{item} = sub { "<LI>" }; $nargs{item} = 0; "<UL>"; },
	enditemize	=> 0, "</UL>",
	list	=> 0, sub { $subst{item} = sub { "<DT>$_[0]<DD>" }; $nargs{item} = 1; "<DL>"; },
	endlist	=> 0, "</DL>",
	cite	=> 1, "[%]",
	sp	=> 1, "<SUP>%</SUP>",	# Superscript
	sc	=> 1, '<SPAN style="font-variant:small-caps">%</SPAN>',
	ref	=> 1, "[%]",		# Not really functional

# It's assumed that all tables have the form 
# \begin{table} \halign{stuff} \caption stuff \end{table}.
	table	=> 1, "<HR><TABLE>%",
	halign	=> 1, "<TR><TD>%",
	cr	=> 0, "<TR><TD>",	# Spurious row at end of table, too bad
	caption	=> 1, "</TABLE>\n<BLOCKQUOTE>",	# Lose [title of table]
	endtable => 0, "</BLOCKQUOTE><HR>",
	figure	=> 0, "<HR>",		# Can't do much with a figure
	endfigure => 0, "</BLOCKQUOTE><HR>",
	phalign		=> 0, '<BLOCKQUOTE><TABLE WIDTH="100\\%"><TR><TD>',
	endphalign	=> 0, '</TABLE></BLOCKQUOTE>',
	vhalign		=> 1, '<BLOCKQUOTE><TABLE WIDTH="100\\%"><TR><TD>%</TABLE></BLOCKQUOTE>',

	noindent => 0, "",		# Can't turn indentation on or off
	penalty	=> 1, "",		# Ignore penalties.
	def	=> 1, "",		# Ignore TeX macro definitions.
	newcommand	=> 3, "",	# Ignore LaTeX macro definitions
	renewcommand	=> 3, "",	# Ignore LaTeX macro definitions
	documentstyle	=> 2, "",	# Ignore various LaTeX admin stuff
	oddsidemargin	=> 2, "",
	evensidemargin	=> 2, "",
	document	=> 0, "",
	enddocument	=> 0, "",
	maketitle	=> 0, "",
	protect		=> 0, "",
	vskip		=> 1, "",

# Definitions from guaspi.sty
	qh	=> 0, "`-'",		# Quoted hyphen
	'!'	=> 0, "\\",		# Backslash
	'|'	=> 0, "|",		# Vertical bar
	caret	=> 0, "^",		# Caret without kerns for unslanted type
	dotSE	=> 0, "&nbsp;.&nbsp;.&nbsp;.",	# Ellipsis dots...
	dots	=> 0, "&nbsp;.&nbsp;.&nbsp;.",	# Ellipsis dots...

# Environment for running -gua\spi text.  It's necessary to suppress line
# breaks after a hyphen (tone symbol). 
	guaemg	=> 1, "<blockquote>$guafont%$guafonte</blockquote><rule>", #The rule is in the original but it seems bogus.

	guahyph	=> 0, "$guafont-$guafonte",
	englhyph => 0, "-",
	emdash	=> 0, "---",	#Need an em-dash by cowboy programming
# An inline word or short phrase in -gua!spi
	gua	=> 1, "$guafont%$guafonte",
	qgua	=> 1, "$guafont``%''$guafonte",	# Quoted gua\spi word
	guaspi	=> 0, "${guafont}gua\\spi$guafonte", #The name of gua\spi
	Guaspi	=> 0, "${guafont}Gua\\spi$guafonte", #Same, capitalized

# A word with its translation.  Format: \trw-gua,english,  
# The two arguments are each ended by a comma; the first character of the
# -gua!spi word is dropped from the output.
	trw	=> "2,", sub { "``$guafont" . substr($_[0],1) . "$guafonte-" . $_[1] . "''" },

	emg	=> 1, "<U>%</U>",	# An emphasized gua\spi word, underlined
	betw	=> 1, "&lt;%&gt;",	# A phrase in angle brackets < >
	hfilbreak	=> 0, "",	# Ignore various line break adjustments
	afilbreak	=> 0, "",
	vabreak		=> 0, "",

# These names pertain to \halign: phalign endphalign shalign vhalign

# Paragraph in table cell.  It's used with 2 arguments: a width and the
# content.  Toss the width, leaving the content.  Return value: empty string.
	littlepar	=> 1, "",

# 2-column examples.  exii is seen as \begin{exii} and endexii is \end{exii}.
	exii	=> 0, '<blockquote><table width="100\\%"><col width="50\\%"><col width="50\\%">',
	endexii	=> 0, "</table></blockquote>",
	ex	=> 2, "<tr><td>$guafont%$guafonte<td>%",
# A single 2-column example
	pli	=> 2, sub { &{$subst{exii}}() . &{$subst{ex}}(@_) . &{$subst{endexii}}() },
# A lot of 2-column examples
	exbox	=> 1, sub { &{$subst{exii}}() . $_[0] . &{$subst{endexii}}() },
# Word lists have two 2-column lines.  Args are: 
# \xitem{word}{description}<NL>{-gua!spi example}{translation}
# (Left disabled here: \xitem is meant to be defined per file.)
#	xitem	=> 4, sub { &{$subst{ex}}("** $_[0]", $_[1]) . &{$subst{ex}}(@_[2..3]) },


);	# End of loading %subst

# Load an auxiliary definition file if present: for file.tex it is file.shy
# (only a trailing ".tex" is replaced).  Its lines have the format:
#	macroname nargs content
# with fields separated by whitespace.  The content is the rest of the line;
# if it is missing the macro expands to nothing.  Content beginning with
# "sub " is compiled as Perl code.  Blank lines and lines beginning with %
# are ignored.  A missing or unreadable .shy file is silently skipped.
if (($AUX = $ARGV[0]) =~ s/\.tex\z/.shy/ && open(my $aux_fh, '<', $AUX)) {
    while (my $line = <$aux_fh>) {
        next if $line =~ /^\s*(%|$)/;
        chomp($line);
        my(@row) = split(' ', $line, 3);
        if (@row < 2) {
            # Without an argument count, &addhash could only die with a
            # misleading "out of sync" message; skip the line instead.
            warn "In $AUX, ignoring malformed line: $line\n";
            next;
        }
        $row[2] = '' unless defined($row[2]);	# "name nargs" alone
        if ($row[2] =~ /^sub /) {
            # NOTE(review): this executes Perl code taken from the .shy
            # file; only use .shy files from a trusted source.
            my $cmd = eval $row[2];
            if (defined($cmd)) {
                $row[2] = $cmd;
            } else {
                warn "In $AUX $row[2] --- $@\n";
                $row[2] = " OOPS ";	# Visible marker in the output
            }
        }
        &addhash(@row);
    }
    close($aux_fh);
}


# Maps an active character or macro name (without backslash) to the active
# character or macro (with backslash) that closes its single argument.  The
# closing item is consumed as part of that argument.
%endmarks = (
    '{'   => '}',
    '['   => ']',
    'def' => '{',		# \def#1{whatever} this eats the {} too  \}
);

# Lower-cased openings of HTML tags that are at block level; no <p> is
# wanted in front of any of them.
%blocklevel = map { ("<$_" => 1) } qw(
    p ul ol dl li dt dd
    table tr td th blockquote
    h1 h2 h3 h4 h5 h6 pre div
);

# \hyphenation{whe-ther re-fri-ger-a-tor me-ta-phor me-ta-phors
#	au-stra-lo-pi-the-cus au-stra-lo-pi-the-cine
#	ne-gate ne-gated ne-gat-ive sur-prise mod-al }



# A TeX document is a list of tokens, which can be strings or sub-lists.
# The whole document is held in one buffer.
$bfr = join('', <>);
# An unescaped percent starts a comment; it and everything through the
# newline vanish.
$bfr =~ s/(?<!\\)%[^\n]*\n//sg;

# Convert up to \enddocument.  Whenever tokenize comes back before the
# buffer is used up, mark the spot in the output and resume from there.
$j = 0;
while (1) {
    $j = &tokenize(\$bfr, undef, "\\enddocument", $j);
    last if $j >= length($bfr);
    &output(undef, "\n\n<br>==== Unbalanced right squiggle here ====<br>\n\n");
}

# List the macros that were used but have no entry in %subst.
$z = join("\n", sort keys %missing);
if ($z ne '') {
    print  "\n<p>These macros have no definition:\n$z\n";
}

# The active characters: each of these starts a macro or group in tokenize.
# Set up in BEGIN so the table exists before the main program runs.
BEGIN {
    %active = map { ($_ => 1) } ('\\', '{', '}', '[', ']', '~', '&', '$');
}

# Convert the buffer to tokens, expanding macros through %subst and sending
# the result to &output.  Calls itself to collect macro arguments.  Args:
#   \$bfr	Ref. to linear input buffer
#   \$output	Ref. to linear output buffer, or undef for direct printing.
#   $end        The control sequence (with backslash) or active character at
#		which the unit ends.  It is included with the unit.  Specify
#		'' (or undef) for exactly one token (or a subunit in { }).
#		Specify \bye or \enddocument for the entire document.
#   $j		Index in buffer to start at
#   Returns:	Index in buffer just after $end (or the buffer length if the
#		input ran out first)
# Globals: reads %active, %nargs, %sepr, %endmarks and %subst; counts
# undefined macros in %missing; keeps the nesting depth in $indent and sets
# $needpara after an empty input line.
# It's assumed that no token can be over 100 bytes long.
sub tokenize {
    my($bfr, $output, $end, $j) = @_;
    my($h, $j0);
    my $len = length($$bfr);
    $indent .= '*';			# Needed to know when to insert <p>
#   print STDERR "$indent Starting group `$end'\n"; #DEBUG
		# When hunting for macro arguments, whitespace before or
		# between arguments is skipped.
    $j += length($1) if substr($$bfr, $j, 100) =~ /(^\s+)/s;
		# Split off tokens one by one.
    TOKENS: {
        last if $j >= $len;			# If end of input was reached
        $j0 = $j;				# Location of token start
		# Tokens consist of:
		#   \alphabetic, a macro name, eating one space after
		#   digits followed by letters, a dimen
		#   word characters, a word
		#   any single character.
		# (Comments were already removed by the caller.)
        $h = substr($$bfr, $j, 1);		#The next byte
#	print STDERR "`$active{$h}' ", &nonl(substr($$bfr, $j, 10)), "\n"; #DEBUG
        if ($active{$h}) {
            $j++;
            if ($h eq "\\") {			#A macro name
                if (substr($$bfr, $j, 20) =~ /^([A-Za-z]+|.)(\s?)/s) {
                    my $space = $2;
                    $h = $1;			# $1 = macro name
                    $j += length($h);
			# Eat optional space after macro name, if there are
			# arguments, except leave a newline that doesn't
			# prevent recognition of args.
                    $j += length($space) unless $nargs{$h} == 0
                        || exists($endmarks{$h});
                } else {
			# A lone backslash ends the input: there is no name
			# to read, so don't reuse stale capture variables.
                    $h = '';
                }
            }
		# Transform \begin{name} to \name, \end{name} to \endname
            if (($h eq "begin" || $h eq "end") &&
                    substr($$bfr, $j, 20) =~ /^\{(\w+)\}/) {
                $j += 2 + length($1);
                $h = (($h eq "end") ? $h : "") . $1;
            }
			# Extraction of arguments.  There are 2 styles:
            my(@args);
            my $na = $nargs{$h};
            my $sep = $sepr{$h};
			# A special separator may delimit the argument(s).
            if (defined($sep) && $sep ne '') {
			# The separator is literal text, not a pattern, so
			# quote it before handing it to split.
                my $sep_re = quotemeta($sep);
                @args = split(/$sep_re/, substr($$bfr, $j, 100), $na+1);
                pop(@args);			#Lose text following special arg
                $j += length(join($sep, @args, ''));
            } else {
			# Normally, a given number of ordinary tokens are used,
			# but a specific control sequence may be specified to
			# delimit the argument (generally only one).
                while ($na-- > 0) {
                    push(@args, '');
                    $j = &tokenize($bfr, \$args[-1], $endmarks{$h}, $j);
                }
            }
			# Do the macro substitution.
#	    print STDERR "Macro sub `$h' args @args\n"; #DEBUG
            if (exists($subst{$h})) {
                &output($output, &{$subst{$h}}(@args));
            } else {
			# Unknown macro: pass it through and remember it.
                &output($output, "\\$h" . join('', @args));
                $missing{"\\$h"}++;
            }

			# Numbers are special in TeX.  A dimension may follow.
        } elsif (substr($$bfr, $j, 200) =~ /^(-?[0-9]+[a-z]*)/) {
            &output($output, $1);		#A number, maybe with a unit
            $j += length($1);

			# Ordinary text includes letters, whitespace,
			# and certain punctuation not significant to TeX.
        } elsif (substr($$bfr, $j, 200) =~ /^([A-Za-z.,;()`' \t]+\n?)/) {
            &output($output, $1);		#Ordinary text (to end of line)
            $j += length($1);
        } else {
            &output($output, $h);		#A single character
            $j++;
        }
		# An empty line was just passed: a <p> is wanted before the
		# next output made at this nesting depth or shallower.
        $needpara = length($indent)
            if (substr($$bfr, $j-2, 3) =~ /^\n\n[^%\n]/);
#	printf STDERR "%-8s %s\n", $indent, &nonl(substr($$bfr, $j0, $j-$j0)); #DEBUG
    } continue {
#	printf STDERR "end `%s' (%d) tail `%s'\n", $end, defined($end), &nonl(substr($$bfr, $j0, length($end))); #DEBUG
		# Keep going until the token just handled is the end marker.
		# With an empty $end this is true after the first token.
        redo unless substr($$bfr, $j0, length($end)) eq $end;
    }
#   print STDERR "$indent Exiting group `$end'\n"; #DEBUG
    substr($indent,-1) = '';
    $j;
}

# Appends a fragment to the output stream.
#   \$output	Ref. to linear buffer for output, or undef for direct printing
#   $data	String to append
# Globals: when the nesting depth (length of $indent) is no deeper than
# $needpara, a paragraph break is pending; $needpara is then reset to 0 and
# "<p>" is put in front of $data unless $data starts with a tag listed in
# %blocklevel.
sub output {
    my($output, $data) = @_;
		# After an empty line in the input stream, insert a <p>,
		# except don't if the next HTML tag is at block level.
    if (length($indent) <= $needpara) {
        $needpara = 0;
		# Capture first HTML tag excluding arguments.  Take the result
		# of the match itself: when $data has no leading tag, $1 would
		# still hold whatever the caller matched last.
        my($tag) = $data =~ /^(\<\w+)/;
        substr($data, 0, 0) = "<p>"
            unless defined($tag) && $blocklevel{lc($tag)};
    }
    if (defined($output)) {
        $$output .= $data;
    } else {
        print $data;
    }
}

# Returns a copy of the string in which every newline has been replaced by
# the two characters backslash and n (used by the debugging prints).
sub nonl {
    (my $text = shift) =~ s/\n/\\n/sg;
    return $text;
}
