/*
   tidy.c - reformats HTML files to match DTD

   On an HP-UX system this compiles with:

       cc -Aa -g -D_HPUX_SOURCE -o tidy tidy.c

   The program reformats html files to remove faults and make them
   consistent with current practice. Please email your comments and
   suggestions for improvements to dsr@hplb.hpl.hp.com

   Usage: tidy [-help] [-w] file file file ...

   (c) April 1994 Dave Raggett
*/

#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <sys/types.h>
#include <unistd.h>

#define VERSION "1.0a"

#define IsWhite(c) ((c) == ' ' || (c) == '\n' || (c) == '\t' || (c) == '\f' || (c) == '\r')

#define max(a, b) ((a) > (b) ? (a) : (b))
#define min(a, b) ((a) < (b) ? (a) : (b))

/* alignment codes */
#define ALIGN_TOP       0
#define ALIGN_MIDDLE    1
#define ALIGN_BOTTOM    2
#define ALIGN_LEFT      3
#define ALIGN_CENTER    4
#define ALIGN_RIGHT     5
#define ALIGN_JUSTIFY   6

/* the html token codes */
#define COMMENT         -5      /* SGML comments */
#define ENTITY          -4
#define WHITESPACE      -3      /* the specific char */
#define PCDATA          -2      /* the specific char */
#define ENDDATA         -1

#define ENDTAG          (1<<7)  /* ORed with TAG code */
#define IsTag(tag)      ((tag) >= 0)

#define UNKNOWN         0
#define TAG_ANCHOR      1       /* EN_TEXT */
#define TAG_BOLD        2       /* EN_TEXT */
#define TAG_DL          3       /* EN_LIST */
#define TAG_DT          4       /* EL_DEFLIST */
#define TAG_DD          5       /* EL_DEFLIST */
#define TAG_H1          6       /* EN_HEADER */
#define TAG_H2          7       /* EN_HEADER */
#define TAG_H3          8       /* EN_HEADER */
#define TAG_H4          9       /* EN_HEADER */
#define TAG_H5          10      /* EN_HEADER */
#define TAG_H6          11      /* EN_HEADER */
#define TAG_ITALIC      12      /* EN_TEXT */
#define TAG_IMG         13      /* EN_TEXT */
#define TAG_LI          14      /* EN_LIST */
#define TAG_OL          15      /* EN_LIST */
#define TAG_P           16      /* EN_BLOCK */
#define TAG_TITLE       17      /* EN_SETUP */
#define TAG_UNDERLINE   18      /* EN_TEXT */
#define TAG_UL          19      /* EN_LIST */
#define TAG_HEAD        20      /* EN_SETUP */
#define TAG_BODY        21      /* EN_MAIN */
#define TAG_HR          22      /* EN_BLOCK */
#define TAG_ADDRESS     23      /* EN_BLOCK */
#define TAG_BR          24      /* EN_TEXT */
#define TAG_STRIKE      25      /* EN_TEXT */
#define TAG_PRE         26      /* EN_BLOCK */
#define TAG_CITE        27      /* EN_TEXT */
#define TAG_CODE        28      /* EN_TEXT */
#define TAG_TT          29      /* EN_TEXT */
#define TAG_EM          30      /* EN_TEXT */
#define TAG_STRONG      31      /* EN_TEXT */
#define TAG_KBD         32      /* EN_TEXT */
#define TAG_SAMP        33      /* EN_TEXT */
#define TAG_DFN         34      /* EN_TEXT */
#define TAG_Q           35      /* EN_TEXT */
#define TAG_QUOTE       36      /* EN_BLOCK */
#define TAG_ISINDEX     37      /* EN_SETUP */
#define TAG_FIG         38      /* EN_TEXT */
#define TAG_INPUT       39      /* EN_TEXT */
#define TAG_SELECT      40      /* EN_TEXT */
#define TAG_OPTION      41      /* EN_TEXT */
#define TAG_TEXTAREA    42      /* EN_TEXT */
#define TAG_TABLE       43      /* EN_BLOCK */
#define TAG_TR          44      /* EN_TABLE */
#define TAG_TH          45      /* EN_TABLE */
#define TAG_TD          46      /* EN_TABLE */
#define TAG_CAPTION     47      /* EN_BLOCK */
#define TAG_FORM        48      /* EN_BLOCK */

/* entity classes */
#define EN_UNKNOWN      0
#define EN_TEXT         (1 << 0)
#define EN_BLOCK        (1 << 1)
#define EN_LIST         (1 << 2)
#define EL_DEFLIST      (1 << 3)
#define EN_HEADER       (1 << 4)
#define EN_SETUP        (1 << 5)
#define EN_MAIN         (1 << 6)
#define EN_TABLE        (1 << 7)
#define EN_COMMENT      (1 << 8)

/* input field types */
#define TEXTFIELD       0x00
#define CHECKBOX        0x10
#define RADIOBUTTON     0x20
#define OPTIONLIST      0x40

#define CHECKED         0x80    /* flags field as 'active' */
#define IN_ERROR        0x40    /* field flagged as in error */
#define DISABLED        0x20    /* field is greyed out */
#define SUBMIT          0x10    /* submit button */

/* window margin and indents */
#define MAXMARGIN       78
#define MININDENT       4
#define NESTINDENT      5
#define GLINDENT        30
#define OLINDENT        3
#define LINDENT         8
#define GLGAP           4

#define LBUFSIZE        1024

int comment = 0;        /* set by GetToken when it has just consumed "<!--" */
int debug = 0;

/* The current top line is displayed at the top of the window,the pixel
   offset is the number of pixels from the start of the document. */

FILE *fp;               /* global file pointer for writing cleaned version */
char *bufptr;           /* parse position in the HTML buffer */
char *lastbufptr;       /* keep track of last position to store delta's */
int overwrite;
int preformatted = 0;

static int EndTag, TagLen;
static int TokenClass, TokenValue, Token;
static char *EntityValue;

int baseline;           /* from top of line */
long TermTop, TermBottom;
long LastOffset;
long PixOffset;         /* current offset from start of document */
long PrevOffset;        /* keep track for saving delta's */
long LastLIoffset;      /* kludge for line spacing */
long ViewOffset;        /* for toggling between HTML/TEXT views */
int Here;               /* current output column */
int HTMLInit = 0;
long figEnd;

char *LastBufPtr, *StartOfLine, *StartOfWord;   /* in HTML document */
static int LineLen, LineWidth, WordStart, WordWidth;
static char LineBuf[LBUFSIZE];                  /* line buffer */

int namelen, hreflen;
char *name, *href;

#define NOBREAK 0
#define BREAK   1

/* Output spelling of every tag, indexed by TAG_xxx code.  Designated
   initialisers keep each name tied to its code so the table cannot
   drift out of step with the #defines above. */
char *TokenName[] = {
    [UNKNOWN] = "",         [TAG_ANCHOR] = "A",     [TAG_BOLD] = "B",
    [TAG_DL] = "DL",        [TAG_DT] = "DT",        [TAG_DD] = "DD",
    [TAG_H1] = "H1",        [TAG_H2] = "H2",        [TAG_H3] = "H3",
    [TAG_H4] = "H4",        [TAG_H5] = "H5",        [TAG_H6] = "H6",
    [TAG_ITALIC] = "I",     [TAG_IMG] = "IMG",      [TAG_LI] = "LI",
    [TAG_OL] = "OL",        [TAG_P] = "P",          [TAG_TITLE] = "TITLE",
    [TAG_UNDERLINE] = "U",  [TAG_UL] = "UL",        [TAG_HEAD] = "HEAD",
    [TAG_BODY] = "BODY",    [TAG_HR] = "HR",        [TAG_ADDRESS] = "ADDRESS",
    [TAG_BR] = "BR",        [TAG_STRIKE] = "S",     [TAG_PRE] = "PRE",
    [TAG_CITE] = "CITE",    [TAG_CODE] = "CODE",    [TAG_TT] = "TT",
    [TAG_EM] = "EM",        [TAG_STRONG] = "STRONG", [TAG_KBD] = "KBD",
    [TAG_SAMP] = "SAMP",    [TAG_DFN] = "DFN",      [TAG_Q] = "Q",
    [TAG_QUOTE] = "BLOCKQUOTE", [TAG_ISINDEX] = "ISINDEX", [TAG_FIG] = "FIG",
    [TAG_INPUT] = "INPUT",  [TAG_SELECT] = "SELECT", [TAG_OPTION] = "OPTION",
    [TAG_TEXTAREA] = "TEXTAREA", [TAG_TABLE] = "TABLE", [TAG_TR] = "TR",
    [TAG_TH] = "TH",        [TAG_TD] = "TD",        [TAG_CAPTION] = "CAPTION",
    [TAG_FORM] = "FORM"
};

#if 0
void Warn(char *args, ...)
{
    va_list ap;
    char buf[256];

    if (debug)
    {
        va_start(ap, args);
        vsnprintf(buf, sizeof buf, args, ap);
        va_end(ap);
        fprintf(stderr, "%s\n", buf);
    }
}
#endif

/* advance declarations for mutually recursive parsers */
void ParseEmph(int left, int right);
void ParseUL(int implied, int depth, int left, int right);
void ParseOL(int implied, int depth, int left, int right);
void ParseDL(int implied, int left, int right);

/* append one character to the line buffer, always leaving room
   for the terminating NUL; excess characters are dropped */
static void AppendChar(int c)
{
    if (LineLen < LBUFSIZE - 1)
        LineBuf[LineLen++] = (char)c;
}

/* Copy the entity reference at bufptr ("&name" with an optional ';')
   into the line buffer, always terminated with ';'.  Only the name
   itself is consumed, so a stray '&' cannot swallow following text,
   and bufptr never moves past the end of the buffer. */
void PrintEntity(void)
{
    /* keep two bytes free: one for the ';' and one for the final NUL */
    if (*bufptr == '&')
    {
        if (LineLen < LBUFSIZE - 2)
            LineBuf[LineLen++] = '&';
        ++bufptr;
    }

    while (isalnum((unsigned char)*bufptr))
    {
        if (LineLen < LBUFSIZE - 2)
            LineBuf[LineLen++] = *bufptr;
        ++bufptr;
    }

    if (*bufptr == ';')
        ++bufptr;

    AppendChar(';');
}

/* write the first len characters of the line buffer */
void PrintString(int len)
{
    if (len > 0)
        fwrite(LineBuf, 1, (size_t)len, fp);
}

void EndOfLine(void)
{
    putc('\n', fp);
}

void PrintStartTag(int token)
{
    fprintf(fp, "<%s>", TokenName[token]);
}

void PrintEndTag(int token)
{
    fprintf(fp, "</%s>", TokenName[token]);
}

/* check if current word forces word wrap and flush line as needed */
void WrapIfNeeded(int WrapLeftMargin, int WrapRightMargin)
{
    int WordLen, space, rightMargin;

    rightMargin = WrapRightMargin;
    LineBuf[LineLen] = '\0';    /* debug */
    WordLen = LineLen - WordStart;
    WordWidth = WordLen;
    space = 1;

    if (WordStart == 0 && Here + WordWidth > rightMargin)
    {
        if (WordWidth > rightMargin - WrapLeftMargin)
        {
            /* word wider than window */
            PrintString(WordLen);
            LineWidth = LineLen = WordStart = 0;
            StartOfLine = bufptr;
        }
        else
        {
            /* wrap to next line */
            LineWidth = WordWidth;
            LineLen = WordLen;
            WordStart = LineLen;
            StartOfLine = StartOfWord;
        }

        Here = WrapLeftMargin;
        EndOfLine();
    }
    else if (WordStart > 0 && Here + LineWidth + space + WordWidth > rightMargin)
    {
        /* emit the line without the space that precedes the word */
        PrintString(WordStart - 1);
        Here = WrapLeftMargin;
        EndOfLine();

        /* source and destination overlap, so memcpy is not safe here */
        memmove(LineBuf, LineBuf + WordStart, (size_t)WordLen);
        LineWidth = WordWidth;
        LineLen = WordLen;
        WordStart = LineLen;
        StartOfLine = StartOfWord;
    }
    else    /* word will fit on end of current line */
    {
        if (WordStart > 0)
            LineWidth += space;

        if (WordWidth > 0)
            LineWidth += WordWidth;

        WordStart = LineLen;
    }
}

/* flush text in line buffer, wrapping line as needed */
void FlushLine(int linebreak, int WrapLeftMargin, int WrapRightMargin)
{
    int WordLen;

    if (preformatted)
    {
        WordLen = LineLen - WordStart;
        LineWidth = WordLen;
    }
    else if (LineLen > 0)
        WrapIfNeeded(WrapLeftMargin, WrapRightMargin);

    if (LineLen > 0)
    {
        /* watch out for single space as leading spaces are stripped */
        if (LineLen > 1 || LineBuf[0] != ' ')
            PrintString(LineLen);

        if (linebreak)
        {
            Here = WrapLeftMargin;
            LineWidth = LineLen = WordStart = 0;
            EndOfLine();
        }
        else
        {
            Here += LineWidth;
            LineWidth = LineLen = WordStart = 0;
        }
    }
    else if (linebreak && Here > WrapLeftMargin)
    {
        Here = WrapLeftMargin;
        EndOfLine();
    }

    StartOfLine = StartOfWord = bufptr;
}

/* Skip the rest of the current tag, up to and including its '>'.
   When GetToken has just opened a comment, everything up to the
   closing "--" is skipped first, so a '>' inside the comment does
   not end it.  Does not cope with '>' inside quoted attribute text. */
void SwallowAttributes(void)
{
    int c;

    if (comment)
    {
        char *end = strstr(bufptr, "--");

        /* an unterminated comment runs to the end of the data */
        bufptr = (end != NULL) ? end + 2 : bufptr + strlen(bufptr);
        comment = 0;
    }

    while ((c = *bufptr) != '\0' && c != '>')
        ++bufptr;

    if (c == '>')
        ++bufptr;
}

/*
   bufptr points at the '<' of a tag whose name is terminated by
   anything that is not a letter or digit.  Sets EndTag (1 for an
   end tag), TagLen (distance from '<' to the first character after
   the name) and TokenClass; returns the tag code or UNKNOWN.
*/
int RecogniseTag(void)
{
    static const struct { const char *name; int tag; int cls; } known[] = {
        { "a",        TAG_ANCHOR,    EN_TEXT    },
        { "address",  TAG_ADDRESS,   EN_BLOCK   },
        { "b",        TAG_BOLD,      EN_TEXT    },
        { "br",       TAG_BR,        EN_TEXT    },
        { "body",     TAG_BODY,      EN_MAIN    },
        { "code",     TAG_CODE,      EN_TEXT    },
        { "cite",     TAG_CITE,      EN_TEXT    },
        { "dl",       TAG_DL,        EN_LIST    },
        { "dt",       TAG_DT,        EL_DEFLIST },
        { "dd",       TAG_DD,        EL_DEFLIST },
        { "dfn",      TAG_DFN,       EN_TEXT    },
        { "em",       TAG_EM,        EN_TEXT    },
        { "fig",      TAG_FIG,       EN_TEXT    },
        { "form",     TAG_FORM,      EN_BLOCK   },
        { "head",     TAG_HEAD,      EN_SETUP   },
        { "h1",       TAG_H1,        EN_HEADER  },
        { "h2",       TAG_H2,        EN_HEADER  },
        { "h3",       TAG_H3,        EN_HEADER  },
        { "h4",       TAG_H4,        EN_HEADER  },
        { "h5",       TAG_H5,        EN_HEADER  },
        { "h6",       TAG_H6,        EN_HEADER  },
        { "hr",       TAG_HR,        EN_BLOCK   },
        { "i",        TAG_ITALIC,    EN_TEXT    },
        { "img",      TAG_IMG,       EN_TEXT    },
        { "input",    TAG_INPUT,     EN_TEXT    },
        { "isindex",  TAG_ISINDEX,   EN_SETUP   },
        { "kbd",      TAG_KBD,       EN_TEXT    },
        { "li",       TAG_LI,        EN_LIST    },
        { "ol",       TAG_OL,        EN_LIST    },
        { "option",   TAG_OPTION,    EN_TEXT    },  /* kludge for error recovery */
        { "p",        TAG_P,         EN_BLOCK   },
        { "pre",      TAG_PRE,       EN_BLOCK   },
        { "q",        TAG_Q,         EN_TEXT    },
        { "s",        TAG_STRIKE,    EN_TEXT    },
        { "samp",     TAG_SAMP,      EN_TEXT    },
        { "strong",   TAG_STRONG,    EN_TEXT    },
        { "select",   TAG_SELECT,    EN_TEXT    },
        { "title",    TAG_TITLE,     EN_SETUP   },
        { "tt",       TAG_TT,        EN_TEXT    },
        { "textarea", TAG_TEXTAREA,  EN_TEXT    },
        { "u",        TAG_UNDERLINE, EN_TEXT    },
        { "ul",       TAG_UL,        EN_LIST    }
    };
    char *s, *tagname;
    size_t i, len;

    s = bufptr + 1;
    EndTag = (*s == '/');

    if (EndTag)
        ++s;

    tagname = s;

    /* ctype functions need an unsigned char value */
    while (isalnum((unsigned char)*s))
        ++s;

    TagLen = (int)(s - bufptr);     /* how far to next char after tag name */
    len = (size_t)(s - tagname);    /* number of chars in tag name itself */

    for (i = 0; i < sizeof known / sizeof known[0]; ++i)
    {
        if (strlen(known[i].name) == len &&
            strncasecmp(tagname, known[i].name, len) == 0)
        {
            TokenClass = known[i].cls;
            return known[i].tag;
        }
    }

    TokenClass = EN_UNKNOWN;
    return UNKNOWN;     /* unknown tag */
}

/* push the last token back; also cancels a comment it may have opened */
void UnGetToken(void)
{
    bufptr = LastBufPtr;
    comment = 0;
}

/*
   The token type is returned in the global Token. Characters are
   returned in TokenValue while TokenClass is used to return a class
   value e.g. EN_SETUP or EN_BLOCK.  For tags, bufptr is left on the
   first character after the tag name; for an entity it is left on
   the '&' (see PrintEntity).  bufptr is not moved at the end of the
   buffer - as a safety precaution.
*/
int GetToken(void)
{
    int c;

    LastBufPtr = bufptr;

    /* unsigned so that 8 bit characters are not mistaken for control codes */
    c = (unsigned char)*bufptr;
    TokenValue = c;

    if (c == '<')
    {
        if (strncmp(bufptr, "<!--", 4) == 0)
        {
            /* callers treat this as an unknown tag and swallow it */
            Token = UNKNOWN;
            TokenClass = EN_COMMENT;
            EndTag = 0;
            comment = 1;
            bufptr += 4;
            return Token;
        }

        if (isalpha((unsigned char)bufptr[1]) ||
            ((bufptr[1] == '/' || bufptr[1] == '!') &&
             isalpha((unsigned char)bufptr[2])))
        {
            Token = RecogniseTag();
            bufptr += TagLen;   /* to first char after tag name */
            return Token;
        }
    }

    TokenClass = EN_TEXT;
    EndTag = 0;

    /* app needs to advance bufptr past entity: &fred; */
    if (c == '&' && isalpha((unsigned char)bufptr[1]))
    {
        Token = ENTITY;
        return Token;
    }

    if (c <= ' ')
    {
        if (c == '\0')
        {
            Token = ENDDATA;
            TokenClass = EN_UNKNOWN;
            return Token;
        }

        ++bufptr;
        Token = WHITESPACE;
        return Token;
    }

    ++bufptr;
    Token = PCDATA;
    return Token;
}

/* skip a run of whitespace tokens, leaving the next token unread */
static void SkipWhitespace(void)
{
    while (GetToken() == WHITESPACE)
        ;

    UnGetToken();
}

/* start a fresh, empty line in the line buffer */
static void ResetLine(void)
{
    StartOfLine = StartOfWord = bufptr;
    LineLen = LineWidth = WordStart = 0;
}

/* a whitespace token has been read: collapse the whole run to one
   space, wrapping first if the word just completed needs it */
static void EndWord(int left, int right)
{
    SkipWhitespace();

    /* check that we have a word */
    if (LineLen - WordStart > 0)
        WrapIfNeeded(left, right);

    AppendChar(' ');
    WordStart = LineLen;
    StartOfWord = bufptr;
}

/* Handle the current token in the way common to running text in
   paragraphs, headers and <DT> terms.  Returns 0 without consuming
   anything when the token cannot be part of running text. */
static int TextToken(int left, int right)
{
    if (Token == UNKNOWN)
    {
        SwallowAttributes();
        return 1;
    }

    if (Token == ENTITY)
    {
        PrintEntity();
        return 1;
    }

    if (TokenClass != EN_TEXT)
        return 0;

    if (Token == WHITESPACE)
    {
        EndWord(left, right);
        return 1;
    }

    if (IsTag(Token))
    {
        if (EndTag)     /* stray end tag */
            SwallowAttributes();
        else
        {
            FlushLine(NOBREAK, left, right);
            ParseEmph(left, right);
        }

        return 1;
    }

    /* must be PCDATA */
    AppendChar(TokenValue);
    return 1;
}

/* Assumes bufptr points to start of attribute.  Writes " name" and
   returns a pointer to the name with its length in *len; bufptr is
   left on the character that ended the name. */
char *ParseAttribute(int *len)
{
    int c;
    char *attr;

    *len = 0;
    attr = bufptr;

    for (;;)
    {
        c = *bufptr;

        if (c == '>' || c == '\0' || c == '=' || IsWhite(c))
            break;

        ++(*len);
        ++bufptr;
    }

    /* a name that runs right up to '>' is still an attribute */
    if (*len > 0 || c == '=')
    {
        putc(' ', fp);
        fwrite(attr, 1, (size_t)*len, fp);
    }

    return attr;
}

/* Values start with "=" or " = " etc.  Writes the value as a quoted
   string and returns a pointer to it with its length in *len, or
   returns 0 when there is no value or the data ends inside it. */
char *ParseValue(int *len)
{
    int c, delim, quoted;
    char *value;

    *len = 0;

    while (c = *bufptr, IsWhite(c))
        ++bufptr;

    if (c != '=')
        return 0;

    ++bufptr;   /* past the = sign */

    while (c = *bufptr, IsWhite(c))
        ++bufptr;

    if (c == '"' || c == '\'')
    {
        delim = c;
        ++bufptr;
    }
    else
        delim = 0;

    quoted = (delim != 0);
    value = bufptr;

    for (;;)
    {
        c = *bufptr;

        if (c == '\0')
            return 0;

        if (delim)
        {
            if (c == delim)
            {
                ++bufptr;
                break;
            }
        }
        else if (c == '>' || IsWhite(c))
            break;

        ++(*len);
        ++bufptr;
    }

    /* quote bare values, avoiding a quote mark the value itself contains */
    if (!delim)
        delim = memchr(value, '"', (size_t)*len) ? '\'' : '"';

    /* an explicitly quoted empty value, e.g. ALT="", is significant */
    if (*len > 0 || quoted)
    {
        fprintf(fp, "=%c", delim);
        fwrite(value, 1, (size_t)*len, fp);
        fprintf(fp, "%c", delim);
    }

    return value;
}

/* write the start tag for 'tag' together with the attributes found
   at bufptr, leaving bufptr just past the closing '>' */
void ParseAttributes(int tag)
{
    int c, n, m;

    fprintf(fp, "<%s", TokenName[tag]);

    for (;;)
    {
        c = *bufptr;

        if (c == '\0')      /* never step past the end of the data */
            break;

        ++bufptr;

        if (c == '>')
            break;

        if (IsWhite(c))
            continue;

        --bufptr;
        ParseAttribute(&n);
        ParseValue(&m);
    }

    fprintf(fp, ">");
}

/* copy the document title onto a single line */
void ParseTitle(int implied)
{
    if (EndTag)
    {
        SwallowAttributes();
        return;
    }

    if (!implied)
        SwallowAttributes();

    fprintf(fp, "<TITLE>");

    /* skip leading white space - subsequently contiguous
       white space is compressed to a single space */
    SkipWhitespace();
    LineLen = 0;

    for (;;)
    {
        GetToken();

        if (Token == TAG_TITLE && EndTag)
        {
            SwallowAttributes();
            break;
        }

        if (Token == UNKNOWN)
        {
            SwallowAttributes();
            continue;
        }

        if (Token == ENTITY)
        {
            PrintEntity();
            continue;
        }

        if (Token == WHITESPACE)
        {
            SkipWhitespace();
            AppendChar(' ');
            continue;
        }

        if (Token != PCDATA)
        {
            UnGetToken();
            break;
        }

        AppendChar(TokenValue);
    }

    LineBuf[LineLen] = '\0';
    fprintf(fp, "%s</TITLE>\n", LineBuf);
}

/* parse the document head, supplying the HEAD tags if missing */
void ParseSetUp(int implied)
{
    if (EndTag)
    {
        SwallowAttributes();
        return;
    }

    fprintf(fp, "<HEAD>\n");

    if (!implied)
        SwallowAttributes();

    for (;;)
    {
        while (GetToken() == WHITESPACE)
            ;

        if (Token == TAG_HEAD)
        {
            SwallowAttributes();

            if (EndTag)
                break;

            continue;   /* ignore a repeated <HEAD> rather than loop on it */
        }

        if (Token == TAG_TITLE)
        {
            ParseTitle(0);
            continue;
        }

        if (Token == TAG_ISINDEX || Token == UNKNOWN)
        {
            SwallowAttributes();
            continue;
        }

        /* text, end of data or a body element ends the head */
        UnGetToken();
        break;
    }

    fprintf(fp, "</HEAD>\n\n");
}

/* write one OPTION element with its text condensed onto one line */
void ParseOption(int implied)
{
    if (EndTag)
    {
        SwallowAttributes();
        return;
    }

    if (implied)
        PrintStartTag(TAG_OPTION);
    else
        ParseAttributes(TAG_OPTION);    /* keeps VALUE, SELECTED etc. */

    LineLen = 0;

    for (;;)
    {
        GetToken();

        if (Token == TAG_OPTION && EndTag)
        {
            SwallowAttributes();
            break;
        }

        if (Token == UNKNOWN)
        {
            SwallowAttributes();
            continue;
        }

        if (Token == ENTITY)
        {
            PrintEntity();
            continue;
        }

        /* condense whitespace */
        if (Token == WHITESPACE)
        {
            SkipWhitespace();
            AppendChar(' ');
            continue;
        }

        if (Token == PCDATA)
        {
            AppendChar(TokenValue);
            continue;
        }

        /* unexpected tag so terminate element */
        UnGetToken();
        break;
    }

    LineBuf[LineLen] = '\0';
    fprintf(fp, "%s\n", LineBuf);
}

/* write a SELECT element and its options; 'implied' means an OPTION
   was found without an enclosing SELECT */
void ParseSelect(int implied)
{
    if (EndTag)
    {
        SwallowAttributes();
        return;
    }

    if (implied)
        PrintStartTag(TAG_SELECT);
    else
        ParseAttributes(TAG_SELECT);

    EndOfLine();

    for (;;)
    {
        while (GetToken() == WHITESPACE)
            ;

        if (Token == TAG_SELECT && EndTag)
        {
            SwallowAttributes();
            break;
        }

        if (Token == UNKNOWN)
        {
            SwallowAttributes();
            continue;
        }

        if (Token == TAG_OPTION)
        {
            ParseOption(0);
            continue;
        }

        if (Token == PCDATA || Token == ENTITY)
        {
            UnGetToken();   /* the character belongs to the option text */
            ParseOption(1);
            continue;
        }

        /* unexpected tag so terminate element */
        UnGetToken();
        break;
    }

    PrintEndTag(TAG_SELECT);
}

/* Check for anchors which enclose headers as the parser will treat
   these as an empty anchor followed by a header.  On entry bufptr is
   just after the anchor's tag name.  If the anchor is directly
   followed by a start tag that is not allowed inside an anchor, the
   two tags are swapped over in the buffer, bufptr is set to the
   moved tag and 1 is returned.  Otherwise bufptr is restored and 0
   is returned. */
int AnchorEnclosesHeader(void)
{
    char *lastptr, *anchor, *tag, *tagend;
    char buf[1024];
    size_t n;

    lastptr = bufptr;
    SwallowAttributes();
    SkipWhitespace();
    tag = bufptr;

    if (IsTag(Token) && Token != UNKNOWN && Token != TAG_ANCHOR &&
        TokenClass != EN_TEXT && !EndTag &&
        (tagend = strchr(tag, '>')) != NULL &&
        (n = (size_t)(tagend - tag) + 1) <= sizeof buf)
    {
        /* find the '<' that starts the anchor tag */
        for (anchor = lastptr; *--anchor != '<';)
            ;

        /* save the following tag, slide the anchor (and any white
           space) up over it, then drop the tag in front */
        memcpy(buf, tag, n);
        memmove(anchor + n, anchor, (size_t)(tag - anchor));
        memcpy(anchor, buf, n);

        bufptr = anchor;
        return 1;
    }

    bufptr = lastptr;
    return 0;
}

/* parse a text level element (emphasis, anchor, form field, ...)
   whose start or end tag has just been read */
void ParseEmph(int left, int right)
{
    int ThisToken;

    if (EndTag)
    {
        SwallowAttributes();
        PrintEndTag(Token);
        return;
    }

    ThisToken = Token;

    switch (ThisToken)
    {
        case TAG_ANCHOR:
            if (AnchorEnclosesHeader())
                return;

            ParseAttributes(ThisToken);
            break;

        case TAG_TEXTAREA:  /* a short term bodge */
        case TAG_INPUT:
        case TAG_IMG:
            ParseAttributes(ThisToken);
            ResetLine();
            return;

        case TAG_SELECT:
            ParseSelect(0);
            ResetLine();
            return;

        case TAG_OPTION:
            /* let ParseSelect see the <OPTION> tag and its attributes */
            UnGetToken();
            ParseSelect(1);
            ResetLine();
            return;

        case TAG_BR:
            PrintStartTag(ThisToken);
            SwallowAttributes();
            Here = left;
            EndOfLine();
            ResetLine();
            return;

        default:
            PrintStartTag(ThisToken);
            SwallowAttributes();
            break;
    }

    ResetLine();

    for (;;)
    {
        GetToken();

        if (Token == ThisToken && EndTag)
        {
            SwallowAttributes();
            break;
        }

        if (Token == UNKNOWN)
        {
            SwallowAttributes();
            continue;
        }

        if (Token == ENTITY)
        {
            PrintEntity();
            continue;
        }

        if (TokenClass != EN_TEXT)
        {
            UnGetToken();
            break;
        }

        if (Token == WHITESPACE)
        {
            if (preformatted)
            {
                if (TokenValue == '\n')
                    FlushLine(BREAK, left, right);
                else
                    AppendChar(' ');
            }
            else
                EndWord(left, right);

            continue;
        }

        if (IsTag(Token))
        {
            FlushLine(NOBREAK, left, right);
            ParseEmph(left, right);
            continue;
        }

        /* must be PCDATA */
        AppendChar(TokenValue);
    }

    LineBuf[LineLen] = '\0';
    FlushLine(NOBREAK, left, right);
    PrintEndTag(ThisToken);
}

/* parse a header H1..H6 whose tag name has just been read */
void ParseHeader(int left, int right)
{
    int HeaderTag;

    SwallowAttributes();

    if (EndTag)
        return;

    fprintf(fp, "<H%d>\n", Token - TAG_H1 + 1);
    HeaderTag = Token;
    Here = left;
    ResetLine();

    for (;;)
    {
        GetToken();

        if (Token == HeaderTag && EndTag)
        {
            SwallowAttributes();
            break;
        }

        if (!TextToken(left, right))
        {
            UnGetToken();
            break;
        }
    }

    LineBuf[LineLen] = '\0';
    FlushLine(NOBREAK, left, right);
    fprintf(fp, "</H%d>\n\n", HeaderTag - TAG_H1 + 1);
    PixOffset += 1;
}

/* parse running text up to the next block level element; <P> tags
   met inside the text are written out as paragraph separators */
void ParsePara(int implied, int left, int right)
{
    int LastToken;

    if (EndTag)
    {
        SwallowAttributes();
        return;
    }

    if (!implied)
        SwallowAttributes();

    if (Here < left)
        Here = left;

    /* skip leading white space - subsequently contiguous
       white space is compressed to a single space */
    SkipWhitespace();
    ResetLine();

    for (;;)
    {
        LastToken = Token;
        GetToken();

        if (Token == TAG_P && EndTag)
        {
            SwallowAttributes();
            break;
        }

        if (Token == TAG_P)
        {
            SwallowAttributes();
            FlushLine(NOBREAK, left, right);
            Here = left;
            LineLen = LineWidth = WordStart = 0;
            fprintf(fp, "<P>\n");
            continue;
        }

        if (!TextToken(left, right))
        {
            UnGetToken();
            break;
        }
    }

    LineBuf[LineLen] = '\0';
    FlushLine((LastToken != TAG_P ? BREAK : NOBREAK), left, right);
    PixOffset += 1;
}

/* parse one list item; seq is the item number within an ordered
   list (currently unused) */
void ParseLI(int implied, int depth, int seq, int left, int right)
{
    int indent;
    long y;

    (void)seq;

    if (EndTag)
    {
        SwallowAttributes();
        return;
    }

    fprintf(fp, "<LI>");
    indent = 3;
    y = PixOffset;

    if (!implied)
        SwallowAttributes();

    Here = left + indent/3;

    for (;;)
    {
        while (GetToken() == WHITESPACE)
            ;

        if (Token == TAG_LI && EndTag)
        {
            SwallowAttributes();
            break;
        }

        if (Token == TAG_UL)
        {
            UnGetToken();

            if (EndTag)
                break;

            ParseUL(0, depth, left + indent, right);
            continue;
        }

        if (Token == TAG_OL)
        {
            UnGetToken();

            if (EndTag)
                break;

            ParseOL(0, depth, left + indent, right);
            continue;
        }

        if (Token == TAG_DL)
        {
            UnGetToken();

            if (EndTag)
                break;

            ParseDL(0, left + indent, right);
            continue;
        }

        if (Token == TAG_DT || Token == TAG_DD)
        {
            UnGetToken();
            ParseDL(1, left + indent, right);
            continue;
        }

        if (Token == UNKNOWN)
        {
            SwallowAttributes();
            continue;
        }

        if (Token == TAG_P)
        {
            ParsePara(0, left + indent, right);
            continue;
        }

        if (Token == TAG_HR)
        {
            SwallowAttributes();

            if (EndTag)
                continue;

            PrintStartTag(TAG_HR);
            EndOfLine();
            continue;
        }

        if (TokenClass == EN_TEXT)
        {
            /* the paragraph is implied: there is no <P> tag to skip */
            UnGetToken();
            ParsePara(1, left + indent, right);
            continue;
        }

        /* unexpected tag so terminate element */
        UnGetToken();
        break;
    }

    /* kludge to cope with an element with no content */
    if (y == PixOffset)
        PixOffset += 1;

    EndOfLine();
}

/* common body of ParseUL and ParseOL; ListTag is TAG_UL or TAG_OL */
static void ParseList(int ListTag, int implied, int depth, int left, int right)
{
    int seq;

    if (EndTag)
    {
        SwallowAttributes();
        return;
    }

    fprintf(fp, "\n<%s>\n\n", TokenName[ListTag]);

    if (!implied)
        SwallowAttributes();

    seq = 0;

    for (;;)
    {
        while (GetToken() == WHITESPACE)
            ;

        if (Token == ListTag && EndTag)
        {
            SwallowAttributes();
            break;
        }

        if (Token == UNKNOWN)
        {
            SwallowAttributes();
            continue;
        }

        if (Token == TAG_LI)
        {
            ParseLI(0, depth, (ListTag == TAG_OL ? ++seq : 0), left, right);
            continue;
        }

        /* anything that may appear inside an item implies a missing <LI> */
        if (Token == TAG_P || Token == TAG_HR || TokenClass == EN_TEXT ||
            Token == TAG_UL || Token == TAG_OL || Token == TAG_DL ||
            Token == TAG_DT || Token == TAG_DD)
        {
            if (EndTag)
            {
                SwallowAttributes();
                continue;
            }

            UnGetToken();
            ParseLI(1, depth, (ListTag == TAG_OL ? ++seq : 0), left, right);
            continue;
        }

        /* unexpected tag so terminate element */
        UnGetToken();
        break;
    }

    fprintf(fp, "</%s>\n\n", TokenName[ListTag]);
    Here = left;
}

void ParseUL(int implied, int depth, int left, int right)
{
    ParseList(TAG_UL, implied, depth, left, right);
}

void ParseOL(int implied, int depth, int left, int right)
{
    ParseList(TAG_OL, implied, depth, left, right);
}

/* parse the term part of a definition list entry */
void ParseDT(int implied, int left, int right)
{
    if (EndTag)
    {
        SwallowAttributes();
        return;
    }

    fprintf(fp, "<DT>");

    if (!implied)
        SwallowAttributes();

    Here = left;

    /* skip leading white space - subsequently contiguous
       white space is compressed to a single space */
    SkipWhitespace();
    ResetLine();

    for (;;)
    {
        GetToken();

        if (Token == TAG_DT && EndTag)
        {
            SwallowAttributes();
            break;
        }

        if (!TextToken(left, right))
        {
            UnGetToken();
            break;
        }
    }

    LineBuf[LineLen] = '\0';
    FlushLine(NOBREAK, left, right);
    EndOfLine();
    Here += 5;
}

/* parse the definition part of a definition list entry */
void ParseDD(int implied, int left, int right)
{
    long y;

    if (EndTag)
    {
        SwallowAttributes();
        return;
    }

    fprintf(fp, "<DD>");

    if (!implied)
        SwallowAttributes();

    y = PixOffset;

    for (;;)
    {
        while (GetToken() == WHITESPACE)
            ;

        if (Token == TAG_DD && EndTag)
        {
            SwallowAttributes();
            break;
        }

        if (Token == TAG_DL && EndTag)
        {
            UnGetToken();
            break;
        }

        if (Token == UNKNOWN)
        {
            SwallowAttributes();
            continue;
        }

        if (Token == TAG_UL)
        {
            ParseUL(0, 0, left, right);
            continue;
        }

        if (Token == TAG_OL)
        {
            ParseOL(0, 0, left, right);
            continue;
        }

        if (Token == TAG_LI)
        {
            if (EndTag)
            {
                SwallowAttributes();
                continue;
            }

            UnGetToken();
            ParseUL(1, 0, left, right);
            continue;
        }

        if (Token == TAG_DL)
        {
            if (PixOffset == y)     /* force a line break */
            {
                Here = left;
                EndOfLine();
            }

            ParseDL(0, left, right);
            Here = left;
            continue;
        }

        if (Token == TAG_HR)
        {
            SwallowAttributes();

            if (EndTag)
                continue;

            PrintStartTag(TAG_HR);
            EndOfLine();
            continue;
        }

        if (Token == TAG_P)
        {
            ParsePara(0, left, right);
            continue;
        }

        if (TokenClass == EN_TEXT || Token == ENTITY)
        {
            UnGetToken();
            ParsePara(1, left, right);
            continue;
        }

        /* unexpected tag so terminate element */
        UnGetToken();
        break;
    }

    /* kludge to cope with an element with no content */
    if (y == PixOffset)
        PixOffset += 1;

    EndOfLine();
}

/* parse a definition list, supplying missing DT/DD tags */
void ParseDL(int implied, int left, int right)
{
    int indent, LastToken;

    if (EndTag)
    {
        SwallowAttributes();
        return;
    }

    fprintf(fp, "\n<DL>\n\n");

    if (!implied)
        SwallowAttributes();

    LastToken = TAG_DL;
    indent = 2;

    for (;;)
    {
        while (GetToken() == WHITESPACE)
            ;

        if (Token == TAG_DL && EndTag)
        {
            SwallowAttributes();

            if (LastToken == TAG_DT)
                EndOfLine();

            break;
        }

        if (Token == UNKNOWN)
        {
            SwallowAttributes();
            continue;
        }

        if (Token == TAG_DT)
        {
            if (EndTag)
            {
                SwallowAttributes();
                continue;
            }

            if (LastToken == TAG_DT)
                EndOfLine();

            ParseDT(0, left, right);
            LastToken = TAG_DT;
            continue;
        }

        if (Token == TAG_DD)
        {
            if (EndTag)
            {
                SwallowAttributes();
                continue;
            }

            ParseDD(0, left + indent, right);
            LastToken = TAG_DD;
            continue;
        }

        if (Token == TAG_P || Token == TAG_HR || TokenClass == EN_TEXT ||
            Token == TAG_UL || Token == TAG_LI ||
            Token == TAG_OL || Token == TAG_DL)
        {
            if (EndTag)
            {
                SwallowAttributes();
                continue;
            }

            UnGetToken();
            ParseDD(1, left + indent, right);
            LastToken = TAG_DD;
            continue;
        }

        /* unexpected tag so terminate element */
        UnGetToken();
        break;
    }

    fprintf(fp, "</DL>\n\n");
    Here = left;
}

void ParseAddress(int implied, int left, int right)
{
    if (EndTag)
    {
        SwallowAttributes();
        return;
    }

    fprintf(fp, "<ADDRESS>\n");

    if (!implied)
        SwallowAttributes();

    Here = left;

    for (;;)
    {
        while (GetToken() == WHITESPACE)
            ;

        if (Token == TAG_ADDRESS && EndTag)
        {
            SwallowAttributes();
            break;
        }

        if (Token == UNKNOWN)
        {
            SwallowAttributes();
            continue;
        }

        if (Token == TAG_P)
        {
            ParsePara(0, left, right);
            continue;
        }

        if (TokenClass == EN_TEXT)
        {
            if (EndTag)
            {
                SwallowAttributes();
                continue;
            }

            UnGetToken();
            ParsePara(1, left, right);
            continue;
        }

        /* unexpected tag so terminate element */
        UnGetToken();
        break;
    }

    fprintf(fp, "</ADDRESS>\n");
}

/* parse preformatted text: line breaks in the source are kept */
void ParsePRE(int implied, int left, int right)
{
    if (EndTag)
    {
        SwallowAttributes();
        return;
    }

    fprintf(fp, "<PRE>\n");

    if (!implied)
        SwallowAttributes();

    Here = left;
    preformatted = 1;
    ResetLine();

    for (;;)
    {
        GetToken();

        if (Token == TAG_PRE && EndTag)
        {
            SwallowAttributes();
            break;
        }

        if (Token == UNKNOWN)
        {
            SwallowAttributes();
            continue;
        }

        if (Token == ENTITY)
        {
            PrintEntity();
            continue;
        }

        if (TokenClass != EN_TEXT)
        {
            UnGetToken();
            break;
        }

        if (Token == WHITESPACE && TokenValue == '\n')
        {
            FlushLine(BREAK, left, right);
            continue;
        }

        if (IsTag(Token))
        {
            if (EndTag)
            {
                SwallowAttributes();
                continue;
            }

            FlushLine(NOBREAK, left, right);
            ParseEmph(left, right);
            continue;
        }

        /* PCDATA, or white space other than a newline */
        AppendChar(TokenValue);
    }

    LineBuf[LineLen] = '\0';
    FlushLine(BREAK, left, right);
    fprintf(fp, "</PRE>\n");
    preformatted = 0;
}

/* parse the document body, supplying the BODY tags if missing */
void ParseBody(int implied, int left, int right)
{
    if (EndTag)
    {
        SwallowAttributes();
        return;
    }

    fprintf(fp, "<BODY>\n");

    if (!implied)
        SwallowAttributes();

    for (;;)
    {
        while (GetToken() == WHITESPACE)
            ;

        if (Token == TAG_BODY && EndTag)
        {
            SwallowAttributes();
            break;
        }

        if (IsTag(Token) && TokenClass == EN_HEADER)
        {
            if (EndTag)
            {
                SwallowAttributes();
                continue;
            }

            ParseHeader(left, right);
            continue;
        }

        if (Token == UNKNOWN)
        {
            SwallowAttributes();
            continue;
        }

        if (Token == TAG_FORM)
        {
            /* keep the form tags: the fields inside are kept too */
            if (EndTag)
            {
                SwallowAttributes();
                PrintEndTag(TAG_FORM);
            }
            else
                ParseAttributes(TAG_FORM);

            EndOfLine();
            continue;
        }

        if (Token == TAG_UL)
        {
            ParseUL(0, 0, left, right);
            continue;
        }

        if (Token == TAG_OL)
        {
            ParseOL(0, 0, left, right);
            continue;
        }

        if (Token == TAG_LI)
        {
            if (EndTag)
            {
                SwallowAttributes();
                continue;
            }

            UnGetToken();
            ParseUL(1, 0, left, right);
            continue;
        }

        if (Token == TAG_DL)
        {
            ParseDL(0, left, right);
            continue;
        }

        if (Token == TAG_DT || Token == TAG_DD)
        {
            if (EndTag)
            {
                SwallowAttributes();
                continue;
            }

            UnGetToken();
            ParseDL(1, left, right);
            continue;
        }

        if (Token == TAG_HR)
        {
            SwallowAttributes();

            if (EndTag)
                continue;

            PrintStartTag(TAG_HR);
            EndOfLine();
            continue;
        }

        if (Token == TAG_PRE)
        {
            ParsePRE(0, left, right);
            continue;
        }

        if (Token == TAG_P)
        {
            ParsePara(0, left, right);
            continue;
        }

        if (TokenClass == EN_TEXT || Token == ENTITY)
        {
            if (EndTag)
            {
                SwallowAttributes();
                continue;
            }

            UnGetToken();
            ParsePara(1, left, right);
            continue;
        }

        if (Token == TAG_ADDRESS)
        {
            ParseAddress(0, left, right);
            continue;
        }

        if (Token == ENDDATA)
        {
            UnGetToken();
            break;
        }

        /* a tag that has no business here (e.g. a second <BODY>):
           drop it whole so its attributes don't turn into text */
        if (IsTag(Token))
            SwallowAttributes();
    }

    fprintf(fp, "</BODY>\n");
}

/* Tidy the NUL terminated document in 'buffer', writing the result
   to fp.  The buffer may be modified.  Returns PixOffset. */
long ParseHTML(char *buffer)
{
    PixOffset = LastOffset = 0;
    LastBufPtr = bufptr = buffer;
    comment = preformatted = 0;
    Here = 0;
    LineLen = LineWidth = WordStart = 0;

    fprintf(fp, "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n");

    for (;;)
    {
        while (GetToken() == WHITESPACE)
            ;

        if (Token == UNKNOWN)
        {
            SwallowAttributes();
            continue;
        }

        break;
    }

    if (Token == TAG_HEAD)
        ParseSetUp(0);
    else
    {
        UnGetToken();
        ParseSetUp(1);
    }

    while (GetToken() == WHITESPACE)
        ;

    if (Token == TAG_BODY)
        ParseBody(0, MININDENT, MAXMARGIN);
    else
    {
        UnGetToken();
        ParseBody(1, MININDENT, MAXMARGIN);
    }

    return PixOffset;
}

/* Read the whole of 'file' into a freshly allocated, NUL terminated
   buffer.  Returns NULL after reporting the problem on stderr if the
   file cannot be read; otherwise the caller frees the buffer. */
char *ReadFile(char *file)
{
    int fd, err;
    off_t size;
    size_t total;
    ssize_t got;
    char *buf = NULL;

    if ((fd = open(file, O_RDONLY)) == -1)
    {
        fprintf(stderr, "Can't open: %s\n", file);
        return NULL;
    }

    size = lseek(fd, 0L, SEEK_END);

    if (size < 0 || lseek(fd, 0L, SEEK_SET) < 0 ||
        (buf = malloc((size_t)size + 1)) == NULL)
        goto failed;

    /* read may legitimately return less than was asked for */
    for (total = 0; total < (size_t)size; total += (size_t)got)
    {
        got = read(fd, buf + total, (size_t)size - total);

        if (got == 0)       /* file shrank while being read */
            break;

        if (got < 0)
        {
            if (errno != EINTR)
                goto failed;

            got = 0;        /* interrupted: just try again */
        }
    }

    close(fd);
    buf[total] = '\0';
    return buf;

failed:
    err = errno;    /* save before close and fprintf can change it */
    close(fd);
    fprintf(stderr, "Can't read data for %s - errno %d\n", file, err);
    free(buf);
    return NULL;
}

/* tidy one file, to stdout or (with -w) back over the original */
void Clean(char *file)
{
    char *buf;

    if ((buf = ReadFile(file)) == NULL)
    {
        if (overwrite)
            fprintf(stderr, "\n\n*** Can't open file: %s\n\n", file);

        return;
    }

    if (!overwrite)
    {
        fp = stdout;
        ParseHTML(buf);
    }
    else if ((fp = fopen(file, "w")) != NULL)
    {
        ParseHTML(buf);

        /* fclose flushes, so this is where a full disk shows up */
        if (fclose(fp) != 0)
            fprintf(stderr, "\n\n*** Can't write file: %s\n\n", file);
    }
    else
        fprintf(stderr, "\n\n*** Can't open file: %s\n\n", file);

    free(buf);
}

void Help(void)
{
    printf("\nTidy version %s, (c) Dave Raggett April 1994\n\n", VERSION);
    printf("This is a program for tidying up unruly HTML documents. Tidy\n");
    printf("puts in all the bits people tend to leave out such as the DOCTYPE\n");
    printf("declaration and the HEAD and BODY elements. Tidy works hard to\n");
    printf("spot missing start and end tags, for example missing list tags\n");
    printf("like <UL> or <DL>. Unknown tags are stripped out. Tidy lays out\n");
    printf("the document to fit within an 80 column display for convenience\n");
    printf("in further editing. If you have any suggestions for improving\n");
    printf("\"tidy\" please email your comments to me at dsr@hplb.hpl.hp.com\n\n");
}

int main(int argc, char **argv)
{
    overwrite = 0;

    if (argc == 1)
    {
        fprintf(stderr, "\nTidies HTML files to match the DTD and current practice.\n");
        fprintf(stderr, "You are recommended to back up your files before tidying them.\n");
        fprintf(stderr, "Version %s, (c) Dave Raggett April 1994\n", VERSION);
        fprintf(stderr, "\nUseage:\n\t%s [-help] [-w] file file file ...\n\n", argv[0]);
        fprintf(stderr, "where -help gives futher information on what tidy does, and -w\n");
        fprintf(stderr, "overwrites existing files rather than sending corrections to stdout\n\n");
    }

    debug = 0;

    while (argc > 1)
    {
        if (strcasecmp(argv[1], "-help") == 0)
        {
            Help();
            break;
        }
        else if (strcasecmp(argv[1], "-w") == 0)
            overwrite = 1;
        else if (argv[1][0] == '-')
            fprintf(stderr, "Unknown flag: %s\n", argv[1]);
        else
            Clean(argv[1]);

        --argc;
        ++argv;
    }

    return 0;
}
"); continue;; } if (Token == ENTITY) { PrintEntity(); continue; } if (TokenClass != EN_TEXT) { UnGetToken(); break; } if (Token == WHITESPACE) { while (GetToken() == WHITESPACE); UnGetToken(); /* check that we have a word */ if ((WordLen = LineLen - WordStart) > 0) WrapIfNeeded(left, right); if (LineLen < LBUFSIZE - 1) LineBuf[LineLen++] = ' '; WordStart = LineLen; StartOfWord = bufptr; continue; } if (IsTag(Token)) { if (EndTag) { SwallowAttributes(); continue; } FlushLine(NOBREAK, left, right); ParseEmph(left, right); continue; } /* must be PCDATA */ if (LineLen < LBUFSIZE - 1) LineBuf[LineLen++] = TokenValue; } LineBuf[LineLen] = '\0'; FlushLine((LastToken != TAG_P ? BREAK : NOBREAK), left, right); PixOffset += 1; } /* advance declaration due for nested lists */ void ParseUL(int implied, int depth, int left, int right); void ParseOL(int implied, int depth, int left, int right); void ParseDL(int implied, int left, int right); void ParseLI(int implied, int depth, int seq, int left, int right) { int indent, w; long y; char buf[16]; if (EndTag) { SwallowAttributes(); return; } else fprintf(fp, "
\n"); if (!implied) SwallowAttributes(); Here = left; preformatted = 1; StartOfLine = StartOfWord = bufptr; LineLen = LineWidth = WordStart = 0; for (;;) { GetToken(); if (Token == TAG_PRE && EndTag) { SwallowAttributes(); break; } if (Token == UNKNOWN) { SwallowAttributes(); continue; } if (Token == ENTITY) { PrintEntity(); continue; } if (TokenClass != EN_TEXT) { UnGetToken(); break; } if (Token == WHITESPACE && TokenValue == '\n') { FlushLine(BREAK, left, right); continue; } if (IsTag(Token)) { if (EndTag) { SwallowAttributes(); continue; } FlushLine(NOBREAK, left, right); ParseEmph(left, right); continue; } /* must be PCDATA */ if (LineLen < LBUFSIZE - 1) LineBuf[LineLen++] = TokenValue; } LineBuf[LineLen] = '\0'; FlushLine(BREAK, left, right); fprintf(fp, "