/* SGML_stream.c
 * $Id$
 */

/* implements... */
#include "SGMLstream.h"

/* uses ... */
#include <stdio.h>   /* EOF */
#include <ctype.h>   /* isalpha(), isalnum(), isdigit(), isspace(), toupper() */
#include <string.h>  /* strchr() */


/*
 * is_extra_name_char -- true when c is one of the additional (non
 * alphanumeric) name characters listed in SGML_UCNMCHAR / SGML_LCNMCHAR.
 *
 * NUL and EOF are rejected explicitly: strchr() treats the terminating
 * NUL as part of the string, so a bare strchr(..., '\0') is non-NULL and
 * would make a NUL input byte look like a name character.
 */
static int
is_extra_name_char(c)
     int c;
{
  return c != EOF && c != '\0'
    && strchr(SGML_UCNMCHAR SGML_LCNMCHAR, c) != NULL;
}


/*
 * SGML_read -- copy character data from a stream into buf, stopping at
 * markup.
 *
 *   stream, getc       input object and the method used to fetch the next
 *                      character from it; called as (getc)(stream)
 *   buf, nbytes        output buffer and the limit on characters written;
 *                      this function does not append a terminating NUL
 *   entities,          passed to (expand_entity)(entities, name, buf) when
 *   expand_entity      an entity reference is complete; the return value is
 *                      added to the output count and buffer position
 *                      (NOTE(review): presumably the expansion length, at
 *                      most max_entity_length -- confirm against callers)
 *   max_entity_length  room that must remain in buf before an '&' is
 *                      parsed; values below 2 are raised to 2
 *   declared_content   compared against SGML_CDATA (entity references are
 *                      not recognised) and SGML_PCDATA (start tags,
 *                      processing instructions and "<!" declarations are
 *                      only recognised for this value)
 *   inout_lookahead    in: a pending input character, or EOF to fetch a
 *                      fresh one; out: the first character not consumed
 *
 * Returns the number of characters written, or EOF when the input is
 * already exhausted in the start state, or SGML_start_tag / SGML_end_tag
 * when a tag opens (the first letter of its name is left in
 * *inout_lookahead).  Processing instructions, "<!" declarations and
 * comments are consumed silently.
 */
int
SGML_read(stream, getc, buf, nbytes,
	  entities, expand_entity, max_entity_length,
	  declared_content,
	  inout_lookahead)
     SGML_Object stream;
     SGML_Method getc;
     char* buf;
     int nbytes;
     int max_entity_length;
     SGML_Object entities;
     SGML_Method_charptrs expand_entity;
     int declared_content;
     int* inout_lookahead;
{
  int c; /* state machine input character */

  enum { /* state machine states */
    start, data,
    and, and_hash, entity,
    lt, lt_slash, tag,
    pi,
    lt_bang, lt_bang_dash,
    comment, comment_dash, ps
  } state = start;

  /* auxiliary state: */
  int cref = 0;       /* saw '#' after '&' */
  int end_tag = 0;    /* saw '/' after '<' */
  int ret = 0;        /* number of characters read */
  char name[SGML_NAMELEN + 1]; /* entity name */
  int name_chars = 0;

  /*
   * REDUCE consumes the current character (break out of the switch, then
   * fetch the next one); SHIFT re-examines the current character in the
   * new state (continue the loop without fetching).  Both rely on being
   * expanded directly inside the switch within the while loop, so they
   * must stay plain brace blocks -- a do{}while(0) wrapper would capture
   * the break/continue.
   */
#define LOOKAHEAD(n) (ret + (n) < nbytes)
#define REDUCE(s) { state = (s); break; }
#define SHIFT(s) { state = (s); continue; }
#define DONE(c) { *inout_lookahead = (c); return ret; }
#define WRITE(c) { *buf++ = (c); ret++; }

  /* check arguments */
  if(max_entity_length < 2)
    max_entity_length = 2;

  /* prime the pump */
  if((c = *inout_lookahead) == EOF)
    c = (getc)(stream);

  /* state machine: every case leaves via REDUCE, SHIFT, DONE, break or
   * return, so there is no fall-through between cases. */
  while(ret < nbytes){
    switch(state){

    case start:
      if(c == EOF)
	return EOF;
      else if(c == '<'){
	if(LOOKAHEAD(3)) { REDUCE(lt); }
	else { DONE(c); } /* no room for lookahead */
      }else
	{ SHIFT(data); }

    case data:
      if(c == EOF || c == '<')
	{ DONE(c); }
      else if(c == '&' && declared_content != SGML_CDATA){
	if(LOOKAHEAD(max_entity_length)) { REDUCE(and); }
	else { DONE(c); } /* no room to parse entity reference */
      }else{
	WRITE(c);
	break;
      }

    case and:
      if(c == '#')
	{ REDUCE(and_hash); }
      else if(isalpha(c)) {
	cref = 0;
	name_chars = 0;
	SHIFT(entity);
      }
      else{
	/* not a reference after all: emit the '&' literally */
	WRITE('&');
	SHIFT(data);
      }

    case and_hash:
      if(isalnum(c)){
	name_chars = 0;
	cref = 1;
	SHIFT(entity);
      }
      else{
	WRITE('&'); WRITE('#');
	SHIFT(data);
      }

    case entity:
      if(isdigit(c)
	 || (cref == 0 && (isalpha(c) || is_extra_name_char(c)))){
	if(name_chars < SGML_NAMELEN)
	  name[name_chars++] = c;
	/* else markup error: name too long */
	break;
      }
      else{
	int entlen;

	name[name_chars] = '\0';
	entlen = (expand_entity)(entities, name, buf);
	ret += entlen;
	buf += entlen;

	if(c == ';')
	  { REDUCE(data); }
	else /* terminate entity reference w/space or something */
	  { SHIFT(data); }
      }

    case lt:
      if(c == '/')
	{ REDUCE(lt_slash); }

      if(declared_content == SGML_PCDATA){
	if(c == '?')
	  { REDUCE(pi); }
	else if(c == '!')
	  { REDUCE(lt_bang); }
	else if(isalpha(c)) {
	  end_tag = 0;
	  SHIFT(tag);
	}
      }

      /* '<' did not introduce markup: it is ordinary data */
      WRITE('<');
      SHIFT(data);

    case lt_slash:
      if(isalpha(c)) {
	end_tag = 1;
	SHIFT(tag);
      }
      else {
	WRITE('<'); WRITE('/');
	SHIFT(data);
      }

    case tag:
      /* report the tag kind; c (first name letter) becomes the lookahead */
      ret = end_tag ? SGML_end_tag : SGML_start_tag;
      DONE(c);

    case pi: /* processing instruction (or markup declaration) */
      if(c == '>')
	{ REDUCE(start); }
      else if(c == EOF)
	{ SHIFT(start); } /* error: EOF in pic */
      else
	break;

    case lt_bang:
      if(c == '-')
	{ REDUCE(lt_bang_dash); }
      /*
       * *** NON CONFORMING IMPLEMENTATION ***
       * a letter here starts a markup declaration, which isn't supported
       * a [ starts a marked section, which isn't supported.
       * treat them like processing instructions.
       */
      else if(c == '[' || isalpha(c))
	{ REDUCE(pi); }
      else{
	WRITE('<'); WRITE('!');
	SHIFT(data);
      }

    case lt_bang_dash:
      if(c == '-')
	{ REDUCE(comment); }
      else{
	WRITE('<'); WRITE('!'); WRITE('-');
	SHIFT(data);
      }

    case comment:
      if(c == '-')
	{ REDUCE(comment_dash); }
      else if(c == EOF)
	{ DONE(c); } /* error: eof in comment */
      else
	break;

    case comment_dash:
      if(c == '-')
	{ REDUCE(ps); }
      else if(c == EOF)
	{ DONE(c); } /* error: eof in comment */
      else
	/* a lone '-' does not close the comment: only two adjacent
	 * dashes do, so go back to scanning comment text */
	{ REDUCE(comment); }

    case ps: /* parameter separator between -- and > */
      if(isspace(c))
	break;
      else
	{ REDUCE(start); } /* error if c != '>' */
    }

    c = (getc)(stream);
  }

  DONE(c); /* set up lookahead for next call */

#undef LOOKAHEAD
#undef REDUCE
#undef SHIFT
#undef DONE
#undef WRITE
}


/*
 * SGML_read_name -- read a name starting at the lookahead character.
 *
 *   stream, getc     input object and the method used to fetch characters
 *   buf              receives the name folded to upper case; at most
 *                    SGML_NAMELEN characters are stored and no terminating
 *                    NUL is written by this function
 *   inout_lookahead  in: the first character of the candidate name;
 *                    out: the first non-whitespace character after it
 *
 * Returns the number of characters stored, or 0 (consuming nothing) when
 * the lookahead is not a letter.  Characters of an over-long name are
 * consumed from the stream but dropped.
 */
int
SGML_read_name(stream, getc, buf, inout_lookahead)
     SGML_Object stream;
     SGML_Method getc;
     char* buf;
     int* inout_lookahead;
{
  int name_chars = 0;
  int c = *inout_lookahead;

  if(!isalpha(c))
    return 0;

  do{
    /* same SGML_NAMELEN cap as the entity-name buffers in this file */
    if(name_chars < SGML_NAMELEN)
      buf[name_chars++] = toupper(c);
    /* else error: name too long */

    c = (getc)(stream);
  }while(isalnum(c) || is_extra_name_char(c));

  /* skip whitespace following the name */
  while(isspace(c))
    c = (getc)(stream);

  *inout_lookahead = c;
  return name_chars;
}


/*
 * SGML_read_value -- read one attribute value into buf.
 *
 *   stream, getc       input object and the method used to fetch characters
 *   buf                output buffer; writing stops once SGML_LITLEN
 *                      characters have been produced, and no terminating
 *                      NUL is written by this function
 *   entities,          passed to (expand_entity)(entities, name, buf) when
 *   expand_entity      an entity reference inside a literal is complete;
 *                      the return value is added to the output count and
 *                      buffer position
 *   max_entity_length  room that must remain (relative to SGML_LITLEN)
 *                      before an '&' is parsed; values below 2 are raised
 *                      to 2
 *   inout_lookahead    in: a pending input character, or EOF to fetch a
 *                      fresh one; out: the first character not consumed,
 *                      or EOF when the value hit the SGML_LITLEN limit
 *
 * Leading whitespace is skipped.  A value is either a literal delimited by
 * matching single or double quotes or, when SGML_SHORTTAG is defined, an
 * unquoted token.  Whitespace after the value is skipped.
 *
 * Returns the number of characters written, or EOF when the input is
 * already exhausted in the start state.
 */
int
SGML_read_value (stream, getc, buf,
		 entities, expand_entity, max_entity_length,
		 inout_lookahead)
     SGML_Object stream;
     SGML_Method getc;
     char* buf;
     SGML_Object entities;
     SGML_Method_charptrs expand_entity;
     int max_entity_length;
     int* inout_lookahead;
{
  int c; /* state machine input character */

  enum { /* state machine states */
    start, literal,
    and, and_hash, entity,
#ifdef SGML_SHORTTAG
    value,
#endif
    ps
  } state = start;

  /* auxiliary state: */
  int cref = 0;       /* saw '#' after '&' */
  char quote = 0;     /* which kind of quote */
  int ret = 0;        /* number of characters read */
  char name[SGML_NAMELEN + 1]; /* entity name */
  int name_chars = 0;

  /* See SGML_read: REDUCE consumes the current character, SHIFT
   * re-examines it in the new state; both must expand directly inside
   * the switch/while below. */
#define LOOKAHEAD(n) (ret + (n) < SGML_LITLEN)
#define REDUCE(s) { state = (s); break; }
#define SHIFT(s) { state = (s); continue; }
#define DONE(c) { *inout_lookahead = (c); return ret; }
#define WRITE(c) { *buf++ = (c); ret++; }

  /* check arguments */
  if(max_entity_length < 2)
    max_entity_length = 2; /* enough for lookahead for &# processing */

  /* prime the pump */
  if((c = *inout_lookahead) == EOF)
    c = (getc)(stream);

  /* state machine...*/
  while(ret < SGML_LITLEN){
    switch(state){

    case start:
      if(c == EOF)
	return EOF;
      else if(c == '"' || c == '\'') {
	quote = c;
	REDUCE(literal);
      }
      else if(isspace(c))
	break;
#ifdef SGML_SHORTTAG
      else if(isalnum(c))
	{ SHIFT(value); }
#endif
      else
	{ DONE(c); } /* error: illegal char in markup */

#ifdef SGML_SHORTTAG
    case value:
      if(c == EOF)
	{ DONE(c); }
#ifdef GROK_UNQUOTED_LITERALS
      else if(!(isspace(c) || c == '>')){
#else
      else if(isalnum(c) || is_extra_name_char(c)){
#endif
	WRITE(c);
	break;
      }else
	{ SHIFT(ps); }
#endif

    case literal:
      if(c == EOF)
	{ DONE(c); }
      else if(c == quote)
	{ REDUCE(ps); }
      else if(c == '&'){
	if(LOOKAHEAD(max_entity_length))
	  { REDUCE(and); }
	/*
	 * *** NON CONFORMING IMPLEMENTATION ***
	 * attribute value _might_ be too long (which would be an error).
	 * we can't tell here, so we punt.
	 */
	else
	  { DONE(c); }
      }else{
	WRITE(c);
	break;
      }

    case and:
      if(c == '#')
	{ REDUCE(and_hash); }
      else if(isalpha(c)) {
	cref = 0;
	name_chars = 0;
	SHIFT(entity);
      }
      else{
	WRITE('&');
	SHIFT(literal);
      }

    case and_hash:
      if(isalnum(c)){
	cref = 1;
	name_chars = 0;
	SHIFT(entity);
      }
      else{
	WRITE('&'); WRITE('#');
	SHIFT(literal);
      }

    case entity:
      if(isdigit(c) || (cref == 0 && isalpha(c))){
	if(name_chars < SGML_NAMELEN)
	  name[name_chars++] = c;
	/* else markup error: name too long */

	/* consume this name character and stay in the entity state;
	 * without the break control would run on into case ps and
	 * return after the first character of the name */
	break;
      }
      else{
	int entlen;

	name[name_chars] = '\0';
	entlen = (expand_entity)(entities, name, buf);
	ret += entlen;
	buf += entlen;

	if(c == ';')
	  { REDUCE(literal); }
	else /* terminate entity reference w/space or something */
	  { SHIFT(literal); }
      }

    case ps: /* parameter separator between attributes */
      if(isspace(c))
	break;
      else
	{ DONE(c); }
    }

    c = (getc)(stream);
  }

  /* error: attribute value too long */
  DONE(EOF); /* set lookahead to EOF for next call */

#undef LOOKAHEAD
#undef REDUCE
#undef SHIFT
#undef DONE
#undef WRITE
}