#include #include #include #include"heap.h" #include"library.h" #define PARSERIMPLEMENTATION #include"parse.h" #define RINGSIZE 15 #define MAXSTRING 256 #define MAXWORT 1022 #define MODESTACK 100 #define MINDESTLAENGE 1 #ifdef DEBUG #define storemode( mode ) { ringindex++; ringindex %= RINGSIZE; modering[ringindex] = mode; printf("%d ",modering[ringindex]);} #else #define storemode( mode ) { ringindex++; ringindex %= RINGSIZE; modering[ringindex] = mode;} #endif #define min(a,b) ((a)<(b) ? (a) : (b)) #define is_a_letter(c) ((('A'<=(c))&&((c)<='Z'))||(('a'<=(c))&&((c)<='z'))) #define islower( c ) ( ('a'<= ( c ))&&(( c ) <= 'z') ) #define isdigit( c ) ( ('0'<= ( c ))&&(( c ) <= '9') ) #define isupper( c ) ( ('A'<= ( c ))&&(( c ) <= 'Z') ) #define preprevious_mode ( modering[ ringindex > 0 ? ringindex-1 : RINGSIZE-1 ] ) #define previous_mode ( modering[ ringindex ] ) #define old_mode( age ) ( modering[ ringindex - age => 0 ? ringindex-age : RINGSIZE + ringindex - age ] ) enum catcode { catcode_escape, catcode_begingroup, catcode_endgroup, catcode_mathshift, catcode_alignmenttab, catcode_endofline, catcode_space, catcode_letter, catcode_other, catcode_active, catcode_comment, catcode_ignored, catcode_eof }; enum parsemode { no_mode, /* kein mode im mode-ring */ common_mode, /* normale buchstaben */ escape_mode, /* befehle */ parameter_mode, /* untersuchung von environment-parametern */ space_mode, /* skippen von leerzeichen */ digit_mode, /* behandeln von zahlen (mit dezimalpunkt) */ other_mode, /* skippen von others ( z.b. '(' ) */ skip_mode, /* $$ werden geskipt und aehnliches */ comment_mode, /* behandeln von kommentaren */ active_mode, /* untersuchen von active-characters */ end_deletion_mode /* skippen der parameter von \end */ }; struct token { enum catcode catcode; unsigned char token; }; struct active_expansion{ unsigned char of_what; unsigned char *what; }; struct active_definition{ struct active_definition *next; unsigned char character; int anzahl; struct active_expansion expansion[1]; }; /* Modulglobale variablen */ static enum parsemode modering[RINGSIZE]; static int ringindex; static unsigned char punctuation_chars[50] = ".,:;!?"; /* characters, die hinter sich, aber nicht vor sich ein space wollen */ static unsigned char textbegingroup_chars[20] = "("; /* characters, die vor sich, aber nicht hinter sich ein space wollen */ static unsigned char textendgroup_chars[20] = ")"; /* characters, die vor sich, aber nicht hinter sich ein space wollen */ static unsigned char sentenceend_chars[20]=".!?"; static int zeilenoffset; static struct active_expansion default_expansion; static struct active_definition *active_defs; static struct active_definition *escape_def; static struct active_definition escape_default; static heap *active_heap; static enum parsemode mode_stack[MODESTACK]; static enum parsemode *mode; /* pointer auf den mode_stack */ static struct token actualtoken; static FILE *source; static FILE *outputfile; static const unsigned char *texname; static struct token catcodes[256]; static library *skipped_env, *skipped_com, *delimiter_com; static library *weird_capital; /* prototypen */ static struct token next_token(void); static void exit_parser( void ); /* code */ void init_parser(FILE *infofile, FILE *sourcefile, FILE *outfile, const unsigned char *sourcename ) { unsigned char zeile[MAXSTRING]; unsigned char keyword[MAXSTRING]; outputfile = outfile; source = sourcefile; texname = sourcename; zeilennummer = 1; zeilenoffset = 0; /* initialisierung des mode-rings auf den jeweils die alten modes kommen */ for ( ringindex = 0; ringindex < RINGSIZE; ringindex++ ) modering[ringindex] = no_mode; ringindex = 0; escape_default.anzahl = 1; escape_default.next = NULL; default_expansion.of_what = '-'; default_expansion.what = "\\-@"; escape_default.expansion[0] = default_expansion; skipped_env = library_create( 500 ); skipped_com = library_create( 500 ); delimiter_com = library_create( 500 ); weird_capital = library_create( 500 ); /* 1.11.91 */ active_heap = heap_create( 500 ); atexit( exit_parser ); escape_def = &escape_default; active_defs = NULL; { /* init catcodes */ unsigned char c; for ( c = 255; c >= ' '; c-- ) { catcodes[c].catcode = catcode_other; catcodes[c].token = c; } for ( c = 1; c < ' '; c++ ) catcodes[c].catcode = catcode_ignored; catcodes['\\'].catcode = catcode_escape; catcodes['{'].catcode = catcode_begingroup; catcodes['}'].catcode = catcode_endgroup; catcodes['$'].catcode = catcode_mathshift; catcodes['&'].catcode = catcode_alignmenttab; catcodes['\n'].catcode = catcode_endofline; catcodes['\n'].token = '\n'; catcodes[' '].catcode = catcode_space; catcodes['\t'].catcode = catcode_space; catcodes['%'].catcode = catcode_comment; for ( c = 'A'; c <= 'Z'; c++ ) catcodes[c].catcode = catcode_letter; for ( c = 'a'; c <= 'z'; c++ ) catcodes[c].catcode = catcode_letter; catcodes[0].catcode = catcode_eof; } /* init catcodes */ /* infofile-syntax = = catcodes / skipped_environment / skipped_commands */ while( fscanf( infofile, "%s\t%s\n", keyword, zeile )) { unsigned char *zeilepointer; zeilepointer = zeile; if (!strcmp(keyword,"escape")) while ( *zeilepointer ) catcodes[*zeilepointer++].catcode = catcode_escape; else if (!strcmp(keyword,"begingroup")) while ( *zeilepointer ) catcodes[*zeilepointer++].catcode = catcode_begingroup; else if (!strcmp(keyword,"endgroup")) while ( *zeilepointer ) catcodes[*zeilepointer++].catcode = catcode_endgroup; else if (!strcmp(keyword,"mathshift")) while ( *zeilepointer ) catcodes[*zeilepointer++].catcode = catcode_mathshift; else if (!strcmp(keyword,"alignmenttab")) while ( *zeilepointer ) catcodes[*zeilepointer++].catcode = catcode_alignmenttab; else if (!strcmp(keyword,"endofline")) while ( *zeilepointer ) catcodes[*zeilepointer++].catcode = catcode_endofline; else if (!strcmp(keyword,"space")) while ( *zeilepointer ) catcodes[*zeilepointer++].catcode = catcode_space; else if (!strcmp(keyword,"letter")) while ( *zeilepointer ) catcodes[*zeilepointer++].catcode = catcode_letter; else if (!strcmp(keyword,"active")) { int i,anzahl; struct active_definition *ad; unsigned char expansion[80]; catcodes[*zeilepointer++].catcode = catcode_active; anzahl = atoi( zeilepointer ); ad = heap_allocate( active_heap, sizeof( struct active_definition ) + sizeof( struct active_expansion ) * ( anzahl - 1 )); ad->character = *zeile; ad->anzahl = anzahl; ad->next = active_defs; active_defs = ad; for ( i = 0; i < anzahl; i++ ) { fscanf( infofile, "%c\t%s\n", &( ((ad->expansion)[i]).of_what), expansion ); ((ad->expansion)[i]).what = heap_allocate( active_heap, strlen( expansion ) + 1 ); strcpy( ((ad->expansion)[i]).what, expansion ); } /* for i < anzahl */ } else if (!strcmp(keyword,"escape_def")) { int i,anzahl; unsigned char expansion[80]; anzahl = atoi( zeilepointer ); escape_def = heap_allocate( active_heap, sizeof( struct active_definition ) + sizeof( struct active_expansion ) * ( anzahl - 1 )); escape_def->character = *zeile; escape_def->anzahl = anzahl; escape_def->next = NULL; for ( i = 0; i < anzahl; i++ ) { fscanf( infofile, "%c\t%s\n", &( ((escape_def->expansion)[i]).of_what), expansion ); ((escape_def->expansion)[i]).what = heap_allocate( active_heap, strlen( expansion ) + 1 ); strcpy( ((escape_def->expansion)[i]).what, expansion ); } /* for i < anzahl */ } else if (!strcmp(keyword,"other")) while ( *zeilepointer ) catcodes[*zeilepointer++].catcode = catcode_other; else if (!strcmp(keyword,"ignored")) while ( *zeilepointer ) catcodes[*zeilepointer++].catcode = catcode_ignored; else if (!strcmp(keyword,"comment")) while ( *zeilepointer ) catcodes[*zeilepointer++].catcode = catcode_comment; else if (!strcmp(keyword,"escape")) while ( *zeilepointer ) catcodes[*zeilepointer++].catcode = catcode_escape; else if (!strcmp(keyword,"skipped_command")) library_enter_entry( skipped_com, &zeile[1], 1 ); else if (!strcmp(keyword,"punctuation")) strcpy( punctuation_chars, zeile ); else if (!strcmp(keyword,"sentenceend")) strcpy( sentenceend_chars, zeile ); else if (!strcmp(keyword,"text_begingroup")) strcpy( textbegingroup_chars, zeile ); else if (!strcmp(keyword,"text_endgroup")) strcpy( textendgroup_chars, zeile ); else if (!strcmp(keyword,"capital")) /* 1.11.91 */ library_enter_entry( weird_capital, zeile, 1 ); else if (!strcmp(keyword,"skipped_environment")) library_enter_entry( skipped_env, zeile, 1 ); else if (!strcmp(keyword,"delimiter_command")) library_enter_entry( delimiter_com, &zeile[1], 1 ); else if (!strcmp(keyword,"end_of_file"))break; else printf( "Invalid command %s ignored; check infofile\n\b", keyword); } /* while fgets */ actualtoken = next_token(); mode = mode_stack+1; *mode = common_mode; } /* init_parser */ static struct token next_token(void) { unsigned char zeichen; for (;;) { zeichen = getc( source ); if (feof(source)) break; if ( zeichen == '\n' ) zeilenoffset++; if (catcodes[zeichen].catcode != catcode_ignored ) return ( catcodes[zeichen] ); } /* while zeichen != EOF */ return( catcodes[0] ); } /* next_token */ unsigned char *next_word( int *end_of_sentence ) { static unsigned char wortbuffer[MAXWORT + 2]; unsigned char *wort; unsigned char *wortzeiger; unsigned char escapebuffer[MAXSTRING]; unsigned char *escapezeiger; unsigned char parameterbuffer[MAXSTRING]; unsigned char *parameterzeiger; unsigned char endbuffer[MAXSTRING]; unsigned char *endzeiger; int wortlaenge = 0; escapezeiger = escapebuffer; end_of_sentence = &new_sentence; wort = &wortbuffer[2]; wortzeiger = wort; wortlaenge = 0; zeilennummer += zeilenoffset; zeilenoffset = 0; while( 1 ) { if ( wortzeiger < wort) { wortzeiger = wort; printf("%s:%d:wortunderflow\n",texname, zeilennummer ); } if ( wortzeiger >= &wort[MAXWORT-1]) { printf("%s:%d:wortoverflow\n",texname, zeilennummer ); return(wort); } if ( mode <= mode_stack ) /* stack underflow verhindern */ { zeilennummer += zeilenoffset; zeilenoffset = 0; fprintf( outputfile, "%s:%d: stack underflow\n", texname, zeilennummer ); mode = mode_stack + 1; *mode = common_mode; }; if ( mode >= &mode_stack[ MODESTACK - 1] ) /* stack overflow */ { zeilennummer += zeilenoffset; zeilenoffset = 0; fprintf( outputfile, "%s:%d: stack overflow\n", texname, zeilennummer ); mode = mode_stack + 1; *mode = common_mode; }; switch( *mode ) { case common_mode: /* normale buchstaben */ while( actualtoken.catcode == catcode_letter ) { if (wortzeiger - wort >= MAXWORT) return(wort); *wortzeiger++ = actualtoken.token; wortlaenge++; actualtoken = next_token(); } switch( actualtoken.catcode ) { case catcode_escape: storemode( *mode ); *++mode = escape_mode; /* mode auf den stack */ if (wortzeiger - wort >= MAXWORT) return(wort); *wortzeiger++ = actualtoken.token; actualtoken = next_token(); escapezeiger = escapebuffer; /* initialisierung des escapebuffers */ break; case catcode_begingroup: storemode( *mode ); *++mode = common_mode; actualtoken = next_token(); break; case catcode_endgroup: storemode( *mode ); --mode; actualtoken = next_token(); break; case catcode_mathshift: /* *wortzeiger++ = actualtoken.token; */ /* mshift nicht an wort dran */ actualtoken = next_token(); break; case catcode_endofline: case catcode_space: storemode( *mode ); *++mode = space_mode; *wortzeiger = 0; if ( wortlaenge > MINDESTLAENGE ) return( wort ); if ( actualtoken.token == '\n' ) { zeilennummer += zeilenoffset; zeilenoffset = 0; } wortzeiger = wort; wortlaenge = 0; break; case catcode_alignmenttab: case catcode_other: storemode( *mode ); *++mode = other_mode; *wortzeiger = 0; if ( wortlaenge > MINDESTLAENGE ) return( wort ); wortzeiger = wort; wortlaenge = 0; break; case catcode_active: storemode( *mode ); *++mode = active_mode; break; case catcode_comment: storemode( *mode ); *++mode = comment_mode; break; case catcode_ignored: ; break; case catcode_eof: return( NULL ); } /* local switch */ break; case escape_mode: /* befehle */ switch( actualtoken.catcode ) { case catcode_space: *--wortzeiger = 0; /* \\ = \n wortzeiger eins runter, um alten backslash zu tilgen */ storemode( *mode ); *mode = space_mode; if ( wortlaenge > MINDESTLAENGE ) return( wort ); wortzeiger = wort; wortlaenge = 0; /* neues wort anfangen */ break; case catcode_escape: *--wortzeiger = 0; /* \\ = \n wortzeiger eins runter, um alten backslash zu tilgen */ actualtoken = next_token(); /* naechstes token holen */ storemode( *mode ); mode--; /* alten mode vom stack holen */ if ( wortlaenge > MINDESTLAENGE ) return( wort ); wortzeiger = wort; wortlaenge = 0; /* neues wort anfangen */ break; case catcode_other: { int i = 0; while ( (((escape_def->expansion)[i]).of_what != actualtoken.token ) && ( i < escape_def->anzahl )) i++; /* jetzt haben wir die makroexpansion und schauen ob sie geskippt werden muss */ if ( i == escape_def->anzahl ) ; else { unsigned char string[80]; storemode( *mode ); mode--; actualtoken = next_token(); strcpy( string,&(((escape_def->expansion)[i]).what[1])); /* kopieren ohne backslash */ string[strlen( string )-1] = 0; if( library_find_entry( delimiter_com, string )) { *--wortzeiger = 0; /* eingefuegt 14.11.91 */ if ( wortlaenge > MINDESTLAENGE ) return( wort ); wortzeiger = wort; wortlaenge = 0; break; } if ( library_find_entry( skipped_com, string )) { *--wortzeiger = 0; /* backslash wegwerfen */ } else { wortzeiger--; strcpy (wortzeiger, (( escape_def->expansion)[i] ).what); wortzeiger += strlen((( escape_def->expansion)[i] ).what); wortlaenge++; } break; } } case catcode_begingroup: /* zeichen fuer geschweifte klammer auf */ case catcode_endgroup: /* zeichen fuer geschweifte klammer zu */ case catcode_mathshift: /* dollarzeichen */ case catcode_alignmenttab: /* ampersand */ case catcode_active: case catcode_comment: *wortzeiger++ = actualtoken.token; actualtoken = next_token(); storemode( *mode ); mode--; break; case catcode_endofline: if ( wortlaenge > MINDESTLAENGE ) return( wort ); *wortzeiger = 0; actualtoken = next_token(); break; case catcode_letter: escapezeiger = escapebuffer; do{ *escapezeiger++ = actualtoken.token; actualtoken = next_token(); }while( actualtoken.catcode == catcode_letter ); *escapezeiger = 0; if(!strcmp(escapebuffer,"begin")) { storemode( *mode ); *mode = parameter_mode; if( actualtoken.catcode == catcode_comment ) { storemode(*mode); *++mode = comment_mode; } break; } if(!strcmp(escapebuffer,"end")) { storemode( *mode ); *mode = end_deletion_mode; if( actualtoken.catcode == catcode_comment ) { storemode( *mode ); *++mode = comment_mode; } break; } if(library_find_entry( delimiter_com, escapebuffer )) /* escapesequenz wortdelimiter (wie \quad)*/ { storemode( *mode ); mode--; *--wortzeiger = 0; if ( wortlaenge > MINDESTLAENGE ) return( wort ); wortzeiger = wort; wortlaenge = 0; } else if ( library_find_entry( skipped_com, escapebuffer )) { storemode( *mode ); *mode = space_mode; /* damit der delimiting space ueberlesen wird, eingeb. 20.11.91 */ *--wortzeiger = 0; } else { strcpy( wortzeiger,escapebuffer ); wortzeiger += ( escapezeiger - escapebuffer ); *wortzeiger++ = '@'; /* statt eines leerzeichens in der library fuer z.B. stra\ss e */ storemode( *mode ); mode--; while ( actualtoken.catcode == catcode_space ) actualtoken = next_token(); } break; case catcode_ignored: ; break; case catcode_eof: return( NULL ); } /* local switch */ break; case parameter_mode: /* untersuchung von environment-parametern */ parameterzeiger = parameterbuffer; while ( actualtoken.token == '[' ) while ( actualtoken.token != ']' ) { actualtoken = next_token(); if ( actualtoken.catcode == catcode_eof ) return( NULL ); } while ( actualtoken.catcode == catcode_begingroup ) actualtoken = next_token(); while ( actualtoken.catcode == catcode_letter ) { *parameterzeiger++ = actualtoken.token; actualtoken = next_token(); } *parameterzeiger = 0; if( library_find_entry( skipped_env, parameterbuffer )) { storemode( *mode ); *mode = skip_mode; break; } else if ( actualtoken.token == '*' ) actualtoken = next_token(); while ( actualtoken.catcode == catcode_endgroup ) actualtoken = next_token(); storemode( *mode ); mode--; break; case other_mode: /* behandlung von others, insbes. interpunktion */ if ( punctuation_check ) { if ( strchr( punctuation_chars, actualtoken.token )) { unsigned char mishandled_token = actualtoken.token; actualtoken = next_token(); if ( actualtoken.catcode == catcode_eof ) return( NULL ); if ( strchr(sentenceend_chars, mishandled_token) && !isdigit( actualtoken.token )) *end_of_sentence = 1; if ( ( actualtoken.catcode != catcode_space )&& ( actualtoken.catcode != catcode_endofline )&& !isdigit( actualtoken.token )&& ( actualtoken.catcode != catcode_alignmenttab)&& ( actualtoken.catcode != catcode_endgroup )) { fprintf(outputfile , "%s:%d: missing space after %c\n", texname, zeilennummer, mishandled_token ); } } else /* if actualtoken.token is punctuation_char */ { actualtoken = next_token(); } } else actualtoken = next_token(); /* if punctuation_check */ storemode( *mode ); mode--; break; case space_mode: /* skippen von leerzeichen */ while (( actualtoken.catcode == catcode_space )||( actualtoken.catcode == catcode_endofline)) actualtoken = next_token(); if ( actualtoken.catcode == catcode_eof ) return( NULL ); if ( punctuation_check ) if ( strchr( punctuation_chars, actualtoken.token )) fprintf( outputfile , "%s:%d: extra space before %c\n", texname, zeilennummer, actualtoken.token ); storemode( *mode ); mode--; break; case skip_mode: /* environments{parameterbuffer} werden geskipt */ do { do { do { do { do { if( actualtoken.catcode == catcode_eof ) return( NULL ); while ( actualtoken.catcode != catcode_escape )actualtoken = next_token(); if( actualtoken.catcode == catcode_eof ) return( NULL ); actualtoken = next_token(); }while( actualtoken.token != 'e' ); actualtoken = next_token(); }while( actualtoken.token != 'n' ); actualtoken = next_token(); }while( actualtoken.token != 'd' ); actualtoken = next_token(); }while( actualtoken.catcode != catcode_begingroup ); actualtoken = next_token(); endzeiger = endbuffer; while( actualtoken.catcode == catcode_letter ) { *endzeiger++ = actualtoken.token; actualtoken = next_token(); } *endzeiger = 0; }while( 0 != strcmp( endbuffer,parameterbuffer )); actualtoken = next_token(); if( actualtoken.catcode == catcode_eof ) return( NULL ); storemode( *mode ); mode--; break; case comment_mode: /* behandeln von kommentaren */ while ( actualtoken.catcode != catcode_endofline ) { actualtoken = next_token(); if ( actualtoken.catcode == catcode_eof ) return( NULL ); } if ( actualtoken.token == '\n' ) { zeilennummer += zeilenoffset; zeilenoffset = 0; } storemode( *mode ); /* 1.11.91 */ mode--; /* 1.11.91 */ actualtoken = next_token(); break; case active_mode: /* untersuchen von active-characters */ { unsigned char wrong_token; struct active_definition *def; int i=0; storemode( *mode ); mode--; def = active_defs; while ( def->character != actualtoken.token ) def = def->next; actualtoken = next_token(); while ( (((def->expansion)[i]).of_what != actualtoken.token ) && ( i < def->anzahl )) i++; /* jetzt haben wir die makroexpansion und schauen ob sie geskippt werden muss */ wrong_token = actualtoken.token; actualtoken = next_token(); if ( i == def->anzahl ) { fprintf( outputfile, "%s:%d: active %c with unknown parameter %c\n", texname, zeilennummer, def->character,wrong_token ); } else { unsigned char string[80]; strcpy( string,&(((def->expansion)[i]).what[1])); /* kopieren ohne backslash */ string[strlen( string )-1] = 0; if( library_find_entry( delimiter_com, string )) { if ( wortlaenge > MINDESTLAENGE ) return( wort ); wortzeiger = wort; wortlaenge = 0; } if ( library_find_entry( skipped_com, string )) { ; } else { strcpy (wortzeiger, ((def->expansion)[i]).what); wortzeiger += strlen(((def->expansion)[i]).what); wortlaenge++; } } } break; case end_deletion_mode: storemode( *mode ); mode--; if ( actualtoken.catcode == catcode_begingroup ) while ( actualtoken.catcode != catcode_endgroup ) { actualtoken = next_token(); if ( actualtoken.catcode == catcode_eof ) return ( NULL ); } actualtoken = next_token(); break; default: ; break; } /* switch */ } /* while 1 */ } /* next_word */ static void exit_parser( void ) /* wirft die nebemlibraries weg */ { library_delete( skipped_com ); library_delete( skipped_env ); library_delete( delimiter_com ); library_delete( weird_capital ); heap_delete( active_heap ); } int check_first_char( unsigned char *word ) /* prueft den ersten buchstaben */ { /* 1 = gross, 0 = klein */ unsigned char *hilfspointer; while(word) { unsigned char string[MAXSTRING]; while( !is_a_letter( *word )) { if( *word == '\\' ) break; word++; } if ( islower( *word ) ) return( 0 ); if ( isupper( *word ) ) return( 1 ); /* wir brauchen noch weird_capital fuer \L@ \AE@ etc. */ hilfspointer = word; while( ( word )&&( *word != '@' )) /* bis zum @ kopieren */ word++; /* von hilfspointer bis word */ /* ist der bereich als capital definiert? 1 : continue; */ word++; /* affenschwanz mitnehmen */ strncpy( string, hilfspointer, min(word - hilfspointer,MAXSTRING) ); string[ min(word-hilfspointer,MAXSTRING-1) ] = 0; if ( library_find_entry( weird_capital, string )) return( 1 ); } return(0); }