From:	RUSVX2::Jnet%"A4422DAB@AWIUNI11"      "Erich Neuwirth" 19-NOV-1990 01:18:52.20
To:	Eberhard Mattes <DAAR1034@DS0RUS54>, 'Stefan Momma' <NBAB1424@DS0RUS54>, Harald Krieger <VCU11671@DS0RUS54>, 'Barbara Burr' <ZRFN0370@DS0RUS54>
CC:	
Subj:	

Received: From DEARN(MAILER) by DS0RUS54 with Jnet id 1075
          for ZRFN0370@DS0RUS54; Mon, 19 Nov 90 01:17 A
Received: by DEARN (Mailer R2.07) id 0854; Mon, 19 Nov 90 01:16:15 CET
Date:         Mon, 19 Nov 90 01:05:22 MEZ
Reply-To:     German TeX Users Communication List <TEX-D-L@DEARN>
Sender:       German TeX Users Communication List <TEX-D-L@DEARN>
From:         Erich Neuwirth <A4422DAB@AWIUNI11>
To:           Eberhard Mattes <DAAR1034@DS0RUS54>,
              'Stefan Momma' <NBAB1424@DS0RUS54>,
              Harald Krieger <VCU11671@DS0RUS54>,
              'Barbara Burr' <ZRFN0370@DS0RUS54>
 
 
======================================================================== 491
Date:         Sun, 18 Nov 90 23:27:16 WUT
From:         Gustaf Neumann <NEUMANN@AWIWUW11>
Subject:      Programm zur Konvertierung von Umlauten
To:           TEX-D-L@DEARN
 
 
Nachstehend folgt ein Lex-Programm zur Konvertierung von Texten mit
'Ascii-German'-Umlauten (Umlaute werden als Ae, Oe, ... geschrieben) in
Texte mit (TeX)-Umlauten. Das Programm ist bei weitem noch nicht
perfekt (und wird es auch nie sein), ist aber vielleicht fuer einige
doch nuetzlich. Ich konnte mit dem Programm den Text des Buches
 
\bibitem[{Neu88}]{neumann88}
        G.~Neumann: \T{Metaprogrammierung und Prolog},
        Addison--Wesley, Bonn 1988.
 
zur Gaenze fehlerfrei umsetzen (Ende der Werbeeinschaltung).
 
Bekannte Problemkinder sind "Masse" ("im hohen Masse" versus
"Gesteinsmasse" ) und "Busse" ("Autobusse" vs. "tuet Busse"). In beiden
Faellen wird die jeweils erste Variante als richtig angenommen,
die anderen Alternativen erreicht man durch "Gesteinsmas{}se" bzw. durch
"tuet Bu{}sse". Ich nehme gerne noch weitere Ausnahmeregeln entgegen.
 
-Gustaf Neumann
-------------------------------------------------------------------
Gustaf Neumann       neumann@dec4.wu-wien.ac.at, neumann@awiwuw11.bitnet
Vienna University of Economics and Business Administration
Augasse 2-6,  A-1090 Vienna, Austria
Tel: +43 (222) 31-336 x4533     Fax 347-555
 
------------------------------------- cut here -----diac.shar-----------
# This is a shell archive.  Remove anything before this line,
# then unpack it by saving it in a file and typing "sh file".
#
# Wrapped by neumann on Sun Nov 18 23:20:06 1990
#
# This archive contains:
#	diac.l		Makefile	diacaux.c	diacaux.h
#
 
# Clear LANG for the rest of the unpacking run (presumably so that locale
# settings cannot influence the tools used below -- confirm).
LANG=""; export LANG
 
# Extract the first member; the quoted delimiter keeps the here-document
# free of any shell expansion.
echo x - diac.l
cat >diac.l <<'@EOF'
%{
 
/* diac.l
 * lex file for converting Ascii German into diacritical German
 * Version 1.0 written by
 *  Dorai Sitaram, Rice University, 1990   dorai@titan.rice.edu
 *
 * Version 1.1:
 * General rewrite, using some material from
 *    H.Kaeslin, Behandlung der Umlaute bei der Verarbeitung deutscher
 *    Texte unter Unix, in: it, Vol 1, 1988
 * and Duden - die Rechtschreibung.
 *
 * Gustaf Neumann, Wirtschaftsuniversitaet Wien, October 1990
 * neumann@dec4.wu-wien.ac.at         neumann@awiwuw11.bitnet
 *
 * The resulting LaTeX file uses german.sty!
 * Representation of umlaut characters:    \"a \"A \"o \"O \"u \"U {\ss}
 * The style file german.sty would allow    "a  "A  "o  "O  "u  "U "s
 * as well, but the latter representation makes it impossible
 * to distinguish between umlaut characters and quoted text. This distinction
 * is necessary in cases where quotes should be changed into opening and
 * closing German quotes (\glqq and \grqq) in an automated way (another
 * lex program).
 *
 * If you do NOT want to use GERMAN.STY, replace the ruleset
 * for \documentstyle below with the following rule:
 
\\documentstyle[^\}]*\}	{ printf("%s\n", yytext);
			printf("\\newskip\\zeeskip\n");
			printf("\\zeeskip=0pt plus0pt minus0pt\n");
			printf("\\def\\1{\\nobreak\\hskip\\zeeskip}\n");
			printf("\\let\\umlaut\\\"\n");
			printf("\\def\\\"#1{\\1\\umlaut#1\\1}\n");
			printf("\\let\\oldss\\ss\n");
			printf("\\def\\ss{\\1\\oldss\\1}\n"); }
 *
 *
 * To prevent the conversion from Ascii German into diacritical German,
 * it is necessary to insert empty groups into the words (e.g. Ka{}eslin).
 */
 
#include "diacaux.h"
int i;	/* scratch index used by the \documentstyle action */
%}
 
%{ /* Table-size declarations for AT&T lex: positions (%p), states (%n),
    * parse-tree nodes (%e), transitions (%a), packed character classes (%k)
    * and output array size (%o). */
%}
%p 6500
%n 1000
%e 2500
%a 4000
%k 2500
%o 3500
 
 
%{ /* Named character classes used by the rules:
    *   V  vowels
    *   C  consonants (all letters that are not vowels)
    *   W  characters that delimit a word: blank, quotes, tab, newline,
    *      punctuation
    *   b  white space */
%}
V	[AEIOUaeiou]
C	[B-DF-HJ-NP-TV-Zb-df-hj-np-tv-z]
W	[ "'\t\n,;!?().]
b       [ \t\n]
 
%%
 
%{ /* \documentstyle: make sure the option list contains "german".
    *   - no option list       -> add [german]
    *   - list mentions german -> leave untouched
    *   - any other list       -> put "german," in front of the options */
%}
\\documentstyle{b}*\{   printf("\\documentstyle[german]{");
\\documentstyle{b}*\[.*german.*\]{b}*\{   ECHO;
\\documentstyle{b}*\[.*\]{b}*\{ {
                          /* "\documentstyle" is 14 characters long; white
                           * space may follow it, so search for the '['
                           * (the pattern guarantees there is one) instead
                           * of assuming a fixed offset. */
                          for(i=14;yytext[i]!='[';i++);
                          printf("\\documentstyle[german,%s",&yytext[i+1]);}
 
 
 
%{ /* \input: the named file is converted by a child diac process into a
    * copy under /tmp, and the \input is rewritten to read that copy.
    * &yytext[6] skips the six characters of "\input". */
%}
\\input{b}*\{[^\}]+\}	{ texfile = getfilenamebrack(&yytext[6]);	/* name between the braces */
			tempfile = maketempfilename(texfile);
			printf("\\input{%s}", tempfile);
			dosubdiac(texfile, tempfile); }
 
\\input{b}*[^ \t\n]+	{ texfile = getfilename(&yytext[6]);	/* name up to the next white space */
			tempfile = maketempfilename(texfile);
			printf("\\input %s", tempfile);
			dosubdiac(texfile, tempfile); }
 
%{ /* TeX control sequences and environment names are copied unchanged.
    * The environment name stops at the first closing brace; a greedy ".+"
    * would also swallow (and so protect from conversion) all text up to
    * the last '}' on the same line. */
%}
\\begin\{[^\}]+\}	ECHO;
\\end\{[^\}]+\}	ECHO;
\\[A-Za-z]+	ECHO;
 
 
%{ /* ue: letter sequences in which "ue" is not (or not everywhere) the
    * umlaut u. They are longer than the generic two-letter rule at the end
    * of the file and therefore win the match; most are echoed unchanged,
    * two (reuessier..., zurueckzue...) convert only the first "ue". */
%}
 
[Rr]euessier 	printf("%ce\\\"ussier", yytext[0]);
[^igGbB][Ee]ue 	ECHO;
[QqAa]ue 	ECHO;
[Uu]e[iu] 	ECHO;
[Gg]etue{W}	ECHO;
[a-rt-z]tuend	ECHO;
{W}tuet{W}	ECHO;
[Nn]ichtstuend	ECHO;
[Nn]ichtstuer	ECHO;
Tuerei{W}	ECHO;
[a-z]tuerei	ECHO;
[a-z]tuerisch	ECHO;
[Aa]bzue[b-z][a-z]*[elr]n	ECHO;
[Aa]nzue[b-z][a-z]*[elr]n	ECHO;
[Aa]u[fs]zue[b-z][a-z]*[elr]n	ECHO;
[Ee]inzue[b-z][a-z]*[elr]n	ECHO;
[Hh]inzue[b-z][a-z]*[elr]n	ECHO;
[Mm]itzue[b-z][a-z]*[elr]n	ECHO;
[Nn]achzue[b-z][a-z]*[elr]n	ECHO;
[Vv]orzue[b-z][a-z]*[elr]n	ECHO;
[Ww]iederzue[b-z][a-z]*[elr]n	ECHO;
[Zz]ue[b-z][a-z]*[elr]n		ECHO;
[Zz]urueckzue[b-z][a-z]*[elr]n	printf("%cur\\\"uckzu%s",yytext[0],&yytext[9]);
tuendere	ECHO;
[Aa]biguen	ECHO;
[Aa]ffluen	ECHO;
[Bb]u[ea]nos	ECHO;
[Dd]uett	ECHO;
[Dd]uell	ECHO;
entuell 	ECHO;
[Gg]raduell	ECHO;
[Gg]uerill	ECHO;
[Ii]ndividuen	ECHO;
[Ii]nfluen	ECHO;
Lueger	        ECHO;
[krx]tuell	ECHO;
[Kk]ongruen	ECHO;
[Kk]onstituen	ECHO;
[Mm]enuett	ECHO;
[Mm]anuell	ECHO;
[Mm]igue[tl]	ECHO;
[Pp]irouett	ECHO;
[Pp]uerto	ECHO;
[Rr]esiduen	ECHO;
[Ss]tatue	ECHO;
[Ss]exuell	ECHO;
[Ss]uez		ECHO;
[Vv]enezuel	ECHO;
[Vv]isuell	ECHO;
[Zz]uerkannt	ECHO;
[Zz]uerteil	ECHO;
[Zz]uerst	ECHO;
 
 
%{ /* ae: letter sequences in which "ae" stays two letters; they are echoed
    * unchanged. "aeuel" converts the "ae" but keeps the following "ue".
    * The last three rules (Samuel, value, true) protect a "ue" rather than
    * an "ae". */
%}
 
[Aa]ero		ECHO;
[Dd]odekae	ECHO;
[Hh]exae	ECHO;
[Ii]kosae	ECHO;
[Ii]srael	ECHO;
[Kk]afkaesk	ECHO;
aeuel           printf("\\\"auel");
[Mm]ichael	ECHO;
[Mm]etae	ECHO;
[Oo]ctae	ECHO;
[Pp]entae	ECHO;
[Pp]harmae	ECHO;
[Rr]affael	ECHO;
[Rr]afael	ECHO;
[Rr]aphael	ECHO;
[Tt]etrae	ECHO;
[Tt]hemae	ECHO;
[Ss]chemae	ECHO;
[Ss]amuel	ECHO;
[Vv]alue{W}	ECHO;
[Tt]rue{W}	ECHO;
 
 
%{ /* oe: letter sequences in which "oe" stays two letters; they are echoed
    * unchanged. Several rules match only a prefix ending in "o" (Okto,
    * Photo, Radio, ...): once the prefix has been consumed, its final "o"
    * can no longer pair with a following "e" in the generic rule. */
%}
 
[Aa]utoe	ECHO;
[Bb]enzoe	ECHO;
[Cc]hemoe	ECHO;
[Dd]iarrhoea	ECHO;
[Ee]lektroe	ECHO;
[Gg]oethe	ECHO;
[Hh]eroen 	ECHO;
[Hh]o[ml]oe 	ECHO;
[Hh]ydroe 	ECHO;
[Ii]ndoeuro	ECHO;
Joel	        ECHO;
[Kk]inoe 	ECHO;
[Kk]oedukat 	ECHO;
[Kk]oeffizi 	ECHO;
[Kk]oerzi 	ECHO;
[Kk]oexist 	ECHO;
[Cc]oexist 	ECHO;
[Kk]oenzym 	ECHO;
[Kk]ontoe 	ECHO;
[Ss]oeben 	ECHO;
Soest   	ECHO;
[Mm]etazoe 	ECHO;
[Mm][ai][ck]roe ECHO;
[Mm]onoe 	ECHO;
[Nn]euroe 	ECHO;
[Oo]boe 	ECHO;
[Oo]erlikon 	ECHO;
[Oo]ldesloe	ECHO;
[Oo]kto 	ECHO;
[Oo]pto 	ECHO;
[Pp]oesie 	ECHO;
[Pp]oebene 	ECHO;
[Pp]iezo 	ECHO;
[Pp]hoto 	ECHO;
[Pp]hysioe 	ECHO;
[Pp]oe[mt]i 	ECHO;
[Pp]oe[mt][^a-z]	ECHO;
[Pp]orto 	ECHO;
[Pp]roenzy	ECHO;
[Pp]roto	ECHO;
[Pp]rotozoe 	ECHO;
[Pp]seudo 	ECHO;
[Pp]sycho 	ECHO;
[Pp]yro 	ECHO;
[Rr]adio 	ECHO;
[Tt]otoer	ECHO;
[Tt]urbo	ECHO;
[Vv]ideo	ECHO;
 
 
%{ /* ss: letter sequences that keep "ss" (echoed unchanged) and a few
    * patterns in which the sharp s or an umlaut is placed explicitly by a
    * printf. Whatever these rules do not catch falls through to the two
    * generic rules at the end of this section.
    * NOTE(review): the class [EeAu] in the second rule looks as if it was
    * meant to be [EeAa] (ei, eu, ai, au) -- confirm before changing. */
%}
 
{V}sss		printf("%c{\\ss}s",yytext[0]);
[EeAu][iu]ss	printf("%c%c{\\ss}", yytext[0],yytext[1]);
{C}{V}sser{W}	ECHO;
{C}{V}sser{V}	ECHO;
{C}{V}ssen	ECHO;
[^r]uesse[ln] 	printf("%c\\\"usse%c",yytext[0],yytext[6]);
luesse 		printf("l\\\"usse");
iess		printf("ie{\\ss}");
ssung 		ECHO;
ssel 		ECHO;
ssoren 		ECHO;
ssiez 		ECHO;
ccess 		ECHO;
ssidy 		ECHO;
chss 		ECHO;
ssch 		ECHO;
sspr 		ECHO;
ssier 		ECHO;
nisse		ECHO;
lss 		ECHO;
ss' 		ECHO;
tionss		ECHO;
tss		ECHO;
ussisch		ECHO;
ungss		ECHO;
usserl{W}	ECHO;
[Aa]ssoz	ECHO;
[Aa]ssist	ECHO;
[Aa]ssemb	ECHO;
[Aa]uss[^e]	ECHO;
[Aa]usse[^rn]	ECHO;
[Aa]ussende	ECHO;
[Ee]sse		ECHO;
[Bb]isschen	printf("%ci{\\ss}chen", yytext[0]);
[Bb]usiness	ECHO;
[Bb]usse	ECHO;
[Bb]ussard	ECHO;
triebss		ECHO;
beitss		ECHO;
[Dd]iskussion	ECHO;
[Dd]issert	ECHO;
[Dd]asselb	ECHO;
[Ee]ssi		ECHO;
[Ff]lusse	ECHO;
[Ff]luess[ie]	printf("%cl\\\"uss%c", yytext[0],yytext[6]);
Grass		ECHO;
[Gg]enosse	ECHO;
[Gg]rosse       printf("%cro{\\ss}e",yytext[0]);
[Ii]nteress	ECHO;
[Kk]lass[ie]	ECHO;
[Kk]assette	ECHO;
[Ll]asse	ECHO;
[Ll]aessig	printf("%c\\\"assig", yytext[0]);
[Mm]assa[^nr]	ECHO;
[Mm]asseu	ECHO;
[Mm]isser{C}	printf("%ci{\\ss}er%c", yytext[0],yytext[6]);
[Mm]iss[ei]	ECHO;
[Ee]rmassen	printf("%crma{\\ss}en", yytext[0]);
[Mm]assi	ECHO;
[Pp]rivatissi	ECHO;
[Pp]assiv	ECHO;
[Pp]rozessor	ECHO;
[Ss]tossen	printf("%cto{\\ss}en", yytext[0]);
[Rr]essource	ECHO;
[Ww][ia]sse	ECHO;
 
{C}ss{C}	ECHO;
 
%{ /* Generic fallbacks: a remaining a, o or u (either case) followed by
    * "e" is printed as \" plus that vowel; a remaining "ss" is printed as
    * {\ss}. */
%}
[AaOoUu]e	printf("\\\"%c", yytext[0]);
ss		printf("{\\ss}");
 
@EOF
 
# Make the extracted file owner-writable and world-readable, then extract
# the next archive member.
chmod 644 diac.l
 
echo x - Makefile
cat >Makefile <<'@EOF'
#
# Makefile for diac (Ascii German -> diacritical German for LaTeX).
#
# if you do not have flex available, deactivate the definitions of
# LEX and LEXLIB; The program compiled with flex works also with the
# standard lex library (-ll).
#
LEX=flex
LEXLIB=-lfl
PROGS= diac
 
all: ${PROGS}
 
# Generate the scanner, then compile and link it together with the helper
# functions in one step. The intermediate files are removed with -f because
# not every compiler leaves the object files behind.
diac: diac.l diacaux.h diacaux.c
	${LEX} ${LFLAGS} diac.l
	${CC} -O ${DEFINES} -o $@ diacaux.c lex.yy.c ${LEXLIB}
	strip $@
	rm -f lex.yy.c lex.yy.o diacaux.o
 
# The '#' is quoted: unquoted, the shell would treat "#* core" as a comment
# and neither the editor autosave files nor core would be removed.
clean:
	rm -f ${PROGS} *.o *~ '#'* core
 
 
# Re-create the shell archive from the four source files.
shar:
	shar diac.l Makefile diacaux.c diacaux.h > diac.shar
@EOF
 
# Make the extracted file owner-writable and world-readable, then extract
# the next archive member.
chmod 644 Makefile
 
echo x - diacaux.c
cat >diacaux.c <<'@EOF'
/* diacaux.c
 * to be linked with lex.yy.c from diac.l
 * written by Dorai Sitaram, Rice University, 1990
 */
 
#include "diacaux.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
 
/*
 * Return the number of characters in s, not counting the terminating NUL.
 *
 * Counting starts at index 0 so that the empty string yields 0 and no byte
 * behind the terminator is ever read.
 */
int slen(const char *s)
{
  int n = 0;

  while (s[n] != '\0')
    n++;

  return n;
}
 
/*
 * Return a newly allocated string holding s followed by t.
 *
 * The caller owns the result and releases it with free().  The buffer is
 * sized for both strings plus the terminating NUL.  Running out of memory
 * ends the program with an error message, so the result is never NULL.
 */
char *strap(const char *s, const char *t)
{
  size_t ls = strlen(s);
  size_t lt = strlen(t);
  char *r = malloc(ls + lt + 1);

  if (r == NULL) {
    fprintf(stderr, "diac: out of memory\n");
    exit(EXIT_FAILURE);
  }

  memcpy(r, s, ls);
  memcpy(r + ls, t, lt + 1);	/* the +1 copies t's terminator as well */

  return r;
}
 
/*
 * Return a newly allocated copy of s with leading blanks, tabs and
 * newlines removed.
 *
 * The caller owns the result and releases it with free().  The buffer
 * includes room for the terminating NUL even when nothing is skipped.
 * Running out of memory ends the program with an error message.
 */
char *getfilename(const char *s)
{
  size_t len;
  char *r;

  s += strspn(s, " \t\n");	/* skip the white space after "\input" */
  len = strlen(s);

  r = malloc(len + 1);
  if (r == NULL) {
    fprintf(stderr, "diac: out of memory\n");
    exit(EXIT_FAILURE);
  }

  memcpy(r, s, len + 1);

  return r;
}
 
/*
 * Extract the file name from a braced argument such as " {name}".
 *
 * Leading blanks, tabs, newlines and opening braces are skipped; the name
 * then runs up to, but not including, the first '}'.  If there is no
 * closing brace, the name runs to the end of s instead of reading past it.
 *
 * Returns a newly allocated string owned by the caller (release with
 * free()).  Running out of memory ends the program with an error message.
 */
char *getfilenamebrack(const char *s)
{
  size_t len;
  char *r;

  s += strspn(s, " \t\n{");
  len = strcspn(s, "}");	/* stops at '}' or at the terminating NUL */

  r = malloc(len + 1);
  if (r == NULL) {
    fprintf(stderr, "diac: out of memory\n");
    exit(EXIT_FAILURE);
  }

  memcpy(r, s, len);
  r[len] = '\0';

  return r;
}
 
/*
 * Build the name of the temporary copy for the file named s:
 * "/tmp/" followed by s with every '/' replaced by '_'.
 *
 * Returns a newly allocated string owned by the caller (release with
 * free()).  The name is built in a single buffer that has room for the
 * prefix, the name and the terminating NUL.  Running out of memory ends
 * the program with an error message.
 *
 * NOTE(review): the resulting name in /tmp is predictable; consider a
 * private directory if the input documents are not trusted.
 */
char *maketempfilename(const char *s)
{
  static const char prefix[] = "/tmp/";
  size_t plen = sizeof prefix - 1;
  size_t len = strlen(s);
  size_t i;
  char *r = malloc(plen + len + 1);

  if (r == NULL) {
    fprintf(stderr, "diac: out of memory\n");
    exit(EXIT_FAILURE);
  }

  memcpy(r, prefix, plen);

  /* i runs up to and including len so the terminating NUL is copied too */
  for (i = 0; i <= len; i++)
    r[plen + i] = (s[i] == '/') ? '_' : s[i];

  return r;
}
 
/*
 * Convert the file named s into the file named t by running a child diac:
 *
 *     diac <s > t
 *
 * The command is built from the two parameters (not from global state),
 * handed to the shell via system(), and its buffer is released afterwards.
 * A non-zero result from system() is reported on stderr; the caller is not
 * interrupted.  Running out of memory ends the program with an error
 * message.
 *
 * NOTE(review): s and t are inserted into the shell command unquoted, so
 * names containing shell metacharacters are interpreted by the shell.
 * Quote or validate them if the input documents are not trusted.
 */
void dosubdiac(const char *s, const char *t)
{
  static const char fmt[] = "diac <%s > %s";
  size_t size = sizeof fmt + strlen(s) + strlen(t);	/* more than enough: fmt still counts its "%s" */
  char *cmd = malloc(size);

  if (cmd == NULL) {
    fprintf(stderr, "diac: out of memory\n");
    exit(EXIT_FAILURE);
  }

  snprintf(cmd, size, fmt, s, t);

  if (system(cmd) != 0)
    fprintf(stderr, "diac: command failed: %s\n", cmd);

  free(cmd);
}
@EOF
 
# Make the extracted file owner-writable and world-readable, then extract
# the last archive member.
chmod 644 diacaux.c
 
echo x - diacaux.h
cat >diacaux.h <<'@EOF'
/* diac.h
 * to be included in diac.l and diac.c
 * written by Dorai Sitaram, Rice University, 1990
 */
 
char *texfile;
char *tempfile;
int slen();
char *strap();
char *getfilename();
char *getfilenamebrack();
char *maketempfilename();
void dosubdiac();
@EOF
 
# Make the extracted file owner-writable and world-readable, then stop so
# that the mail signature below the archive is never read by the shell.
chmod 644 diacaux.h
 
exit 0
 
ERICH NEUWIRTH
BITNET (EARN): A4422DAB@AWIUNI11
INTERNET:      a4422dab@Helios.EDVZ.UniVie.AC.AT
Institute for Statistics and Computer Science
UNIVERSITY OF VIENNA, UNIVERSITAETSSTR. 5/9, A-1010 VIENNA, AUSTRIA