/* SGML_stream.c
 * $Id$
 */

/* implements... */
#include "SGMLstream.h"

/* uses ... */
#include <ctype.h>
#include <assert.h>
#include <string.h> /* strchr */

/*
 * SGML_read -- read the next run of character data from an SGML stream.
 *
 * Input characters are obtained one at a time with (getc)(stream) and
 * copied to buf.  Reading stops when nbytes characters have been stored,
 * when a '<' or end of input is met while copying data, or when there
 * is too little room left in buf to parse the next construct.
 *
 * Unless declared_content is SGML_CDATA, an entity reference ("&name"
 * or "&#digits", optionally closed by ';') is replaced by whatever
 * (expand_entity)(entities, name, buf) stores at buf; the value it
 * returns is taken as the number of characters it stored.  An entity
 * reference is only parsed while ret + max_entity_length < nbytes
 * (max_entity_length values below 2 are treated as 2).
 *
 * When declared_content is SGML_PCDATA, "<?...>" and "<!x...>" /
 * "<![...>" constructs are skipped up to the next '>', and
 * "<!-- ... -->" comments are skipped as well.
 *
 * *inout_lookahead: on entry, the character left pending by the
 * previous call, or EOF to have the first character read from stream;
 * on return (other than the EOF return), the first character that was
 * not consumed.
 *
 * Returns:
 *   EOF             end of input before any data or tag;
 *   SGML_start_tag  "<" + letter was seen (SGML_PCDATA only);
 *   SGML_end_tag    "</" + letter was seen;
 *                   in both tag cases *inout_lookahead holds the letter;
 *   otherwise       the number of characters stored in buf (possibly 0,
 *                   e.g. when nbytes leaves no room to look past a '<').
 * buf is not NUL-terminated by this function.
 */
int
  SGML_read(stream, getc,
	    buf, nbytes,
	    entities, expand_entity,  max_entity_length,
	    declared_content,
	    inout_lookahead)
SGML_Object stream;
SGML_Method getc;
char* buf;
int nbytes;
int max_entity_length;
SGML_Object entities;
SGML_Method_charptrs expand_entity;
int declared_content;
int* inout_lookahead;
{
  int c; /* state machine input character */
  enum { /* state machine states */
    start, data,
    and, and_hash, entity,
    lt, lt_slash, tag,
    pi,
    lt_bang, lt_bang_dash,
    comment, comment_dash, ps
  } state = start;
  /* auxiliary state: */
  int cref; /* saw '#' after '&' */
  int end_tag; /* saw '/' after '<' */

  int ret = 0; /* number of characters read */
  char name[SGML_NAMELEN + 1]; /* entity name */
  int name_chars;

  /*
   * State machine helpers.  REDUCE consumes the current character and
   * changes state; SHIFT changes state and re-examines the same
   * character.  They rely on a bare `break' (out of the switch) and
   * `continue' (of the while loop), so they must stay plain brace
   * blocks: a do{}while(0) wrapper would capture both.
   */
#define LOOKAHEAD(n) (ret + (n) < nbytes)
#define REDUCE(s) { state = (s); break; }
#define SHIFT(s) { state = (s); continue; }
#define DONE(c) { *inout_lookahead = (c); return ret; }
#define WRITE(c) { *buf++ = (c); ret++; }

  /* check arguments */
  if(max_entity_length < 2)
    max_entity_length = 2;

  /* prime the pump */
  if((c  = *inout_lookahead) == EOF)
    c = (getc)(stream);

  /* state machine...*/
  while(ret < nbytes){

    switch(state){

    case start:
      if(c == EOF) return EOF;
      else if(c == '<'){
        if(LOOKAHEAD(3)) { REDUCE(lt); }
        else { DONE(c); } /* no room for lookahead */
      }else { SHIFT(data); }

    case data:
      if(c == EOF || c == '<') { DONE(c); }
      else if(c == '&' && declared_content != SGML_CDATA){
        if(LOOKAHEAD(max_entity_length)) { REDUCE(and); }
        else { DONE(c); } /* no room to parse entity reference */
      }else{
        WRITE(c);
        break;
      }

    case and:
      if(c == '#') { REDUCE(and_hash); }
      else if(isalpha(c)) { cref = 0; name_chars = 0; SHIFT(entity); }
      else{ WRITE('&'); SHIFT(data); } /* a bare '&' is just data */

    case and_hash:
      if(isalnum(c)){ name_chars = 0; cref = 1; SHIFT(entity); }
      else{ WRITE('&'); WRITE('#'); SHIFT(data); }

    case entity:
      /*
       * strchr() also matches the terminating NUL of its first
       * argument, so c == '\0' must be excluded explicitly or a NUL
       * byte in the input would count as a name character.
       */
      if(isdigit(c) ||
         (cref == 0 &&
          (isalpha(c) ||
           (c != '\0' && strchr(SGML_UCNMCHAR SGML_LCNMCHAR, c))))){
        if(name_chars < SGML_NAMELEN)
          name[name_chars++] = c;
        /* else markup error: name too long */
        break;
      }
      else{
        int entlen;

        name[name_chars] = '\0';
        entlen = (expand_entity)(entities, name, buf);
        ret += entlen; buf += entlen;
        if(c == ';') { REDUCE(data); } /* swallow the terminating ';' */
        else
          /* terminate entity reference w/space or something */
          { SHIFT(data); }
      }

    case lt:
      if(c == '/') { REDUCE(lt_slash); }
      if(declared_content == SGML_PCDATA){
        if(c == '?') { REDUCE(pi); }
        else if(c == '!') { REDUCE(lt_bang); }
        else if(isalpha(c)) { end_tag = 0; SHIFT(tag); }
      }
      /* not markup after all: the '<' is data */
      WRITE('<'); SHIFT(data);

    case lt_slash:
      if(isalpha(c)) { end_tag = 1; SHIFT(tag); }
      else { WRITE('<'); WRITE('/'); SHIFT(data); }

    case tag:
      /* report the tag kind; the caller continues from the name's first letter */
      ret = end_tag ?  SGML_end_tag : SGML_start_tag;
      DONE(c);

    case pi: /* processing instruction (or markup declaration) */
      if(c == '>') { REDUCE(start); }
      else if(c == EOF) { SHIFT(start); } /* error: EOF in pic */
      else break;

    case lt_bang:
      if(c == '-') { REDUCE(lt_bang_dash); }
      /*
       * *** NON CONFORMING IMPLEMENTATION ***
       * a letter here starts a markup declaration, which isn't supported
       * a [ starts a marked section, which isn't supported.
       * treat them like processing instructions.
       */
      else if(c == '[' || isalpha(c)) { REDUCE(pi); }
      else{ WRITE('<'); WRITE('!'); SHIFT(data); }

    case lt_bang_dash:
      if(c == '-') { REDUCE(comment); }
      else{ WRITE('<'); WRITE('!'); WRITE('-'); SHIFT(data); }

    case comment:
      if(c == '-') { REDUCE(comment_dash); }
      else if(c == EOF) { DONE(c); } /* error: eof in comment */
      else break;

    case comment_dash:
      if(c == '-') { REDUCE(ps); }
      else if(c == EOF) { DONE(c); }/* error: eof in comment */
      /*
       * A lone '-' is ordinary comment text: go back to scanning the
       * comment so that only two adjacent dashes can close it.
       */
      else { REDUCE(comment); }

    case ps: /* parameter separator between -- and > */
      if(isspace(c)) break;
      else { REDUCE(start); }/* error if c !='>' */

    }
    c = (getc)(stream);
  }

  DONE(c); /* set up lookahead for next call */
#undef LOOKAHEAD
#undef REDUCE
#undef SHIFT
#undef DONE
#undef WRITE
}


/*
 * SGML_read_name -- read an SGML name starting at the lookahead character.
 *
 * If *inout_lookahead is not a letter, nothing is read and 0 is
 * returned.  Otherwise the lookahead character and every following
 * character that is alphanumeric or listed in SGML_UCNMCHAR /
 * SGML_LCNMCHAR is consumed via (getc)(stream); each is passed through
 * toupper() and stored in buf.  At most SGML_NAMELEN characters are
 * stored; the rest of an over-long name is consumed and dropped.
 * White space after the name is skipped.
 *
 * On return *inout_lookahead holds the first character after the name
 * and any trailing white space.  The return value is the number of
 * characters stored in buf; buf is not NUL-terminated here.
 */
int
  SGML_read_name(stream, getc, buf, inout_lookahead)
SGML_Object stream;
SGML_Method getc;
char* buf;
int* inout_lookahead;
{
  int name_chars = 0;
  int c = *inout_lookahead;

  if(!isalpha(c)) return 0;

  do{
    /*
     * Same bound as the entity-name buffers elsewhere in this file:
     * never store more than SGML_NAMELEN characters.
     */
    if(name_chars < SGML_NAMELEN)
      buf[name_chars++] = toupper(c);
    /* else error: name too long */
    c = (getc)(stream);
    /*
     * strchr() matches the terminating NUL of its first argument, so
     * c == '\0' is excluded explicitly; otherwise a NUL byte in the
     * input would be taken as a name character.
     */
  }while(isalnum(c) ||
         (c != '\0' && strchr(SGML_UCNMCHAR SGML_LCNMCHAR, c)));

  /* skip white space following the name */
  while(isspace(c))
    c = (getc)(stream);

  *inout_lookahead = c;
  return name_chars;
}


/*
 * SGML_read_value -- read one attribute value from an SGML stream.
 *
 * Leading white space is skipped.  A value that starts with '"' or '\''
 * is read up to the matching quote; the quotes themselves are not
 * stored.  Inside the literal, an entity reference ("&name" or
 * "&#digits", optionally closed by ';') is replaced by whatever
 * (expand_entity)(entities, name, buf) stores at buf; the value it
 * returns is taken as the number of characters it stored.  An entity
 * reference is only parsed while ret + max_entity_length < SGML_LITLEN
 * (max_entity_length values below 2 are treated as 2).  When
 * SGML_SHORTTAG is defined, an unquoted value starting with an
 * alphanumeric character is accepted too.  White space after the value
 * is skipped.
 *
 * Characters are obtained with (getc)(stream).  *inout_lookahead: on
 * entry, the pending character, or EOF to have the first character read
 * from stream; on return, the first unconsumed character, or EOF when
 * the value reached SGML_LITLEN characters.
 *
 * Returns EOF if end of input is met before a value starts, otherwise
 * the number of characters stored in buf (0 if the first non-space
 * character cannot start a value).  buf is not NUL-terminated by this
 * function.
 */
int
  SGML_read_value (stream,
		   getc,
		   buf,
		   entities,
		   expand_entity,
		   max_entity_length,
		   inout_lookahead)
SGML_Object stream;
SGML_Method getc;
char* buf;
SGML_Object entities;
SGML_Method_charptrs expand_entity;
int max_entity_length;
int* inout_lookahead;
{

  int c; /* state machine input character */
  enum { /* state machine states */
    start,
    literal,
    and, and_hash, entity,
#ifdef SGML_SHORTTAG
    value,
#endif
    ps
  } state = start;
  /* auxiliary state: */
  int cref; /* saw '#' after '&' */
  char quote; /* which kind of quote */

  int ret = 0; /* number of characters read */
  char name[SGML_NAMELEN + 1]; /* entity name */
  int name_chars;

  /*
   * State machine helpers.  REDUCE consumes the current character and
   * changes state; SHIFT changes state and re-examines the same
   * character.  They rely on a bare `break' (out of the switch) and
   * `continue' (of the while loop), so they must stay plain brace
   * blocks: a do{}while(0) wrapper would capture both.
   */
#define LOOKAHEAD(n) (ret + (n) < SGML_LITLEN)
#define REDUCE(s) { state = (s); break; }
#define SHIFT(s) { state = (s); continue; }
#define DONE(c) { *inout_lookahead = (c); return ret; }
#define WRITE(c) { *buf++ = (c); ret++; }

  /* check arguments */
  if(max_entity_length < 2)
    max_entity_length = 2; /* enough for lookahead for &# processing */

  /* prime the pump */
  if((c  = *inout_lookahead) == EOF)
    c = (getc)(stream);

  /* state machine...*/
  while(ret < SGML_LITLEN){

    switch(state){

    case start:
      if(c == EOF) return EOF;
      else if(c == '"') { quote = c; REDUCE(literal); }
      else if(c == '\'') { quote = c; REDUCE(literal); }
      else if(isspace(c)) break;
#ifdef SGML_SHORTTAG
      else if(isalnum(c)) { SHIFT(value); }
#endif
      else { DONE(c); } /* error: illegal char in markup */

#ifdef SGML_SHORTTAG
    case value:
      if(c == EOF) { DONE(c); }
#ifdef GROK_UNQUOTED_LITERALS
      else if(!(isspace(c) || c == '>')){
#else
      /* c != '\0': strchr() would otherwise match the string terminator */
      else if(isalnum(c) ||
              (c != '\0' && strchr(SGML_UCNMCHAR SGML_LCNMCHAR, c))){
#endif
        WRITE(c);
        break;
      }else{ SHIFT(ps); }
#endif

    case literal:
      if(c == EOF) { DONE(c); }
      else if(c == quote) { REDUCE(ps); }
      else if(c == '&'){
        if(LOOKAHEAD(max_entity_length)) { REDUCE(and); }
        /*
         * *** NON CONFORMING IMPLEMENTATION ***
         * attribute value _might_ be too long (which would be an error).
         * we can't tell here, so we punt.
         */
        else { DONE(c); }
      }else{
        WRITE(c);
        break;
      }

    case and:
      if(c == '#') { REDUCE(and_hash); }
      else if(isalpha(c)) { cref = 0; name_chars = 0; SHIFT(entity); }
      else{ WRITE('&'); SHIFT(literal); } /* a bare '&' is just data */

    case and_hash:
      if(isalnum(c)){ cref = 1; name_chars = 0; SHIFT(entity); }
      else{ WRITE('&'); WRITE('#'); SHIFT(literal); }

    case entity:
      if(isdigit(c) || (cref == 0 && isalpha(c))){
        if(name_chars < SGML_NAMELEN)
          name[name_chars++] = c;
        /* else markup error: name too long */
        /*
         * Stay in this state and fetch the next character.  Without
         * this break, control would fall into `case ps' and end the
         * value at the first letter of every entity reference.
         */
        break;
      }
      else{
        int entlen;

        name[name_chars] = '\0';
        entlen = (expand_entity)(entities, name, buf);
        ret += entlen; buf += entlen;
        if(c == ';') { REDUCE(literal); } /* swallow the terminating ';' */
        else
          /* terminate entity reference w/space or something */
          { SHIFT(literal); }
      }

    case ps: /* parameter separator between attributes */
      if(isspace(c)) break;
      else { DONE(c); }

    }
    c = (getc)(stream);
  }

  /* error: attribute value too long */

  DONE(EOF); /* set lookahead to EOF for next call */
#undef LOOKAHEAD
#undef REDUCE
#undef SHIFT
#undef DONE
#undef WRITE
}