/* tidy.c - reformats HTML files to match DTD

   On an HP-UX system this compiles with:

       cc -Aa -g -D_HPUX_SOURCE -o tidy tidy.c

   The program reformats html files to remove faults and make them
   consistent with current practice. Please email your comments and
   suggestions for improvements to dsr@hplb.hpl.hp.com

   Usage: tidy file file file ...

   (c) April 1994 Dave Raggett
*/

/* NOTE(review): the header names of the eight #include directives below
   are missing from this copy of the file and must be restored before it
   can be compiled. */
#include
#include
#include
#include
#include
#include
#include
#include

#define VERSION "1.0a"

/* True for space, newline, tab, form feed and carriage return.
   The argument is expanded several times and is not parenthesised,
   so pass a simple variable only. */
#define IsWhite(c) (c == ' ' || c == '\n' || c == '\t' || c == '\f' || c == '\r')

/* Both macros evaluate one of their arguments twice. */
#define max(a, b) ((a) > (b) ? (a) : (b))
#define min(a, b) ((a) < (b) ? (a) : (b))

/* alignment codes */
#define ALIGN_TOP 0
#define ALIGN_MIDDLE 1
#define ALIGN_BOTTOM 2
#define ALIGN_LEFT 3
#define ALIGN_CENTER 4
#define ALIGN_RIGHT 5
#define ALIGN_JUSTIFY 6

/* the html token codes: negative values are non-tag tokens,
   values >= 0 are tag codes (see IsTag) */
#define COMMENT -5 /* <-- SGML Comments --> */
#define ENTITY -4
#define WHITESPACE -3 /* the specific char */
#define PCDATA -2 /* the specific char */
#define ENDDATA -1

#define ENDTAG (1<<7) /* ORed with TAG code */
#define IsTag(tag) (tag >= 0)

/* Tag codes. The trailing comment on each line names the entity class
   of the tag. Note that the class comments on TAG_DT and TAG_DD say
   EN_DEFLIST, while the macro actually defined further down is spelt
   EL_DEFLIST. */
#define UNKNOWN 0
#define TAG_ANCHOR 1 /* EN_TEXT */
#define TAG_BOLD 2 /* EN_TEXT */
#define TAG_DL 3 /* EN_LIST */
#define TAG_DT 4 /* EN_DEFLIST */
#define TAG_DD 5 /* EN_DEFLIST */
#define TAG_H1 6 /* EN_HEADER */
#define TAG_H2 7 /* EN_HEADER */
#define TAG_H3 8 /* EN_HEADER */
#define TAG_H4 9 /* EN_HEADER */
#define TAG_H5 10 /* EN_HEADER */
#define TAG_H6 11 /* EN_HEADER */
#define TAG_ITALIC 12 /* EN_TEXT */
#define TAG_IMG 13 /* EN_TEXT */
#define TAG_LI 14 /* EN_LIST */
#define TAG_OL 15 /* EN_LIST */
#define TAG_P 16 /* EN_BLOCK */
#define TAG_TITLE 17 /* EN_SETUP */
#define TAG_UNDERLINE 18 /* EN_TEXT */
#define TAG_UL 19 /* EN_LIST */
#define TAG_HEAD 20 /* EN_SETUP */
#define TAG_BODY 21 /* EN_MAIN */
#define TAG_HR 22 /* EN_BLOCK */
#define TAG_ADDRESS 23 /* EN_BLOCK */
#define TAG_BR 24 /* EN_TEXT */
#define TAG_STRIKE 25 /* EN_TEXT */
#define TAG_PRE 26 /* EN_BLOCK */
#define TAG_CITE 27 /* EN_TEXT */
#define TAG_CODE 28 /* EN_TEXT */
#define TAG_TT 29 /* EN_TEXT */
#define TAG_EM 30 /* EN_TEXT */
#define TAG_STRONG 31 /* EN_TEXT */
#define TAG_KBD 32 /* EN_TEXT */
#define TAG_SAMP 33 /* EN_TEXT */
#define TAG_DFN 34 /* EN_TEXT */
#define TAG_Q 35 /* EN_TEXT */
#define TAG_QUOTE 36 /* EN_BLOCK */
#define TAG_ISINDEX 37 /* EN_SETUP */
#define TAG_FIG 38 /* EN_TEXT */
#define TAG_INPUT 39 /* EN_TEXT */
#define TAG_SELECT 40 /* EN_TEXT */
#define TAG_OPTION 41 /* EN_TEXT */
#define TAG_TEXTAREA 42 /* EN_TEXT */
#define TAG_TABLE 43 /* EN_BLOCK */
#define TAG_TR 44 /* EN_TABLE */
#define TAG_TH 45 /* EN_TABLE */
#define TAG_TD 46 /* EN_TABLE */
#define TAG_CAPTION 47 /* EN_BLOCK */
#define TAG_FORM 48 /* EN_BLOCK */

/* entity classes: one bit per class (EN_UNKNOWN is no bits set) */
#define EN_UNKNOWN 0
#define EN_TEXT (1 << 0)
#define EN_BLOCK (1 << 1)
#define EN_LIST (1 << 2)
#define EL_DEFLIST (1 << 3)
#define EN_HEADER (1 << 4)
#define EN_SETUP (1 << 5)
#define EN_MAIN (1 << 6)
#define EN_TABLE (1 << 7)
#define EN_COMMENT (1 << 8)

/* input field types */
#define TEXTFIELD 0x00
#define CHECKBOX 0x10
#define RADIOBUTTON 0x20
#define OPTIONLIST 0x40

/* NOTE(review): several of the values below coincide with the field
   type values above (0x40, 0x20, 0x10); presumably they belong to a
   separate flags field - confirm against the code that uses them. */
#define CHECKED 0x80 /* flags field as 'active' */
#define IN_ERROR 0x40 /* field flagged as in error */
#define DISABLED 0x20 /* field is greyed out */
#define SUBMIT 0x10 /* submit button */

/* window margin and indents */
#define MAXMARGIN 78
#define MININDENT 4
#define NESTINDENT 5
#define GLINDENT 30
#define OLINDENT 3
#define LINDENT 8
#define GLGAP 4

#define LBUFSIZE 1024 /* size in bytes of the output line buffer */

int comment = 0; /* non-zero while the parser is inside a comment */
int debug = 0;

/* The current top line is displayed at the top of the window, the pixel offset is the number of pixels from the start of the document.
*/

FILE *fp; /* global file pointer for writing cleaned version */
char *bufptr; /* parse position in the HTML buffer */
char *lastbufptr; /* keep track of last position to store deltas */
int overwrite;
int preformatted = 0; /* when non-zero FlushLine skips word wrapping */

/* Results of the most recent RecogniseTag/GetToken call. */
static int EndTag, TagLen; /* EndTag: 1 for an end tag; TagLen: chars from '<' to the end of the tag name */
static int TokenClass, TokenValue, Token; /* entity class, character value and token code */
static char *EntityValue;

int baseline; /* from top of line */
long TermTop, TermBottom;
long LastOffset;
long PixOffset; /* current offset from start of document */
long PrevOffset; /* keep track for saving deltas */
long LastLIoffset; /* kludge for
  • line spacing */ long ViewOffset; /* for toggling between HTML/TEXT views */ int Here; int HTMLInit = 0; long figEnd; char *LastBufPtr, *StartOfLine, *StartOfWord; /* in HTML document */ static int LineLen, LineWidth, WordStart, WordWidth; static char LineBuf[LBUFSIZE]; /* line buffer */ int namelen, hreflen; char *name, *href; #define NOBREAK 0 #define BREAK 1 char *TokenName[] = { "", "A", "B", "DL", "DT", "DD", "H1", "H2", "H3", "H4", "H5", "H6", "I", "IMG", "LI", "OL", "P", "TITLE", "U", "UL", "HEAD", "BODY", "HR", "ADDRESS", "BR", "S", "PRE", "CITE", "CODE", "TT", "EM", "STRONG", "KBD", "SAMP", "DFN", "Q", "BLOCKQUOTE", "ISINDEX", "INPUT", "SELECT", "OPTION", "TEXTAREA", "FORM" }; #if 0 void Warn(char *args, ...) { va_list ap; char buf[256]; if (debug) { va_start(ap, args); vsprintf(buf, args, ap); va_end(ap); fprintf(stderr, "%s\n", buf); } } #endif void PrintEntity(void) { int c; for (;;) { c = *bufptr++; if (c == '\0' || c == ';') break; LineBuf[LineLen++] = c; } LineBuf[LineLen++] = ';'; } void PrintString(int len) { fwrite((char *)LineBuf, 1, len, fp); } void EndOfLine(void) { putc('\n', fp); } void PrintStartTag(int token) { fprintf(fp, "<%s>", TokenName[token]); } void PrintEndTag(int token) { fprintf(fp, "", TokenName[token]); } /* check if current word forces word wrap and flush line as needed */ void WrapIfNeeded(int WrapLeftMargin, int WrapRightMargin) { int WordLen, space, rightMargin; long line; rightMargin = WrapRightMargin; LineBuf[LineLen] = '\0'; /* debug*/ WordLen = LineLen - WordStart; WordWidth = WordLen; space = 1; line = 1; if (WordStart == 0 && Here + WordWidth > rightMargin) { /* word wider than window */ if (WordWidth > rightMargin - WrapLeftMargin) { PrintString(WordLen); LineWidth = LineLen = WordStart = 0; StartOfLine = bufptr; } else /* wrap to next line */ { LineWidth = WordWidth; LineLen = WordLen; WordStart = LineLen; StartOfLine = StartOfWord; } Here = WrapLeftMargin; EndOfLine(); } else if (WordStart > 0 && Here + LineWidth 
+ space + WordWidth > rightMargin) { PrintString(WordStart-1); Here = WrapLeftMargin; EndOfLine(); memcpy(LineBuf, LineBuf+WordStart, WordLen); LineWidth = WordWidth; LineLen = WordLen; WordStart = LineLen; StartOfLine = StartOfWord; } else /* word will fit on end of current line */ { if (WordStart > 0) LineWidth += space; if (WordWidth > 0) LineWidth += WordWidth; WordStart = LineLen; } } /* flush text in line buffer, wrapping line as needed */ void FlushLine(int linebreak, int WrapLeftMargin, int WrapRightMargin) { int WordLen; if (preformatted) { WordLen = LineLen - WordStart; LineWidth = WordLen; } else if (LineLen > 0) WrapIfNeeded(WrapLeftMargin, WrapRightMargin); if (LineLen > 0) { /* watch out for single space as leading spaces are stripped by CopyLine */ if (LineLen > 1 || LineBuf[0] != ' ') PrintString(LineLen); if (linebreak) { Here = WrapLeftMargin; LineWidth = LineLen = WordStart = 0; EndOfLine(); } else { Here += LineWidth; LineWidth = LineLen = WordStart = 0; } } else if (linebreak && Here > WrapLeftMargin) { Here = WrapLeftMargin; EndOfLine(); } StartOfLine = StartOfWord = bufptr; } /* needs to cope with > in quoted text for ' and " */ void SwallowAttributes(void) { int c, dash; dash = 0; while ((c = *bufptr) && c != '>') { if (comment) { if (c == '-') { if (dash) comment = dash = 0; else dash = 1; } else dash = 0; } else if (c == '-') { if (dash) { comment = 1; dash = 0; } else dash = 1; } else if (c == '>') break; ++bufptr; } if (c == '>') ++bufptr; } /* char *tag points to start of tag string which is terminated by whitespace (including EOF) or a '>' character. return tag code or 0 if unknown. 
*/ int RecogniseTag(void) { int c, len; char *s; s = bufptr; if (*++s == '/') { EndTag = 1; ++s; } else EndTag = 0; /* find end of tag to allow use of strncasecmp */ while (isalpha(*s) || isdigit(*s)) ++s; TagLen = s - bufptr; /* how far to next char after tag name */ len = TagLen - EndTag - 1; /* number of chars in tag name itself */ s -= len; c = tolower(*s); if (isalpha(c)) { if (c == 'a') { if (len == 1 && strncasecmp(s, "a", len) == 0) { TokenClass = EN_TEXT; return TAG_ANCHOR; } if (len == 7 && strncasecmp(s, "address", len) == 0) { TokenClass = EN_BLOCK; return TAG_ADDRESS; } } else if (c == 'b') { if (len == 1) { TokenClass = EN_TEXT; return TAG_BOLD; } if (len == 2 && strncasecmp(s, "br", len) == 0) { TokenClass = EN_TEXT; return TAG_BR; } if (len == 4 && strncasecmp(s, "body", len) == 0) { TokenClass = EN_MAIN; return TAG_BODY; } } else if (c == 'c') { if (len == 4) { if (strncasecmp(s, "code", len) == 0) { TokenClass = EN_TEXT; return TAG_CODE; } if (strncasecmp(s, "cite", len) == 0) { TokenClass = EN_TEXT; return TAG_CITE; } } } else if (c == 'd') { if (len != 2) return 0; if (strncasecmp(s, "dl", len) == 0) { TokenClass = EN_LIST; return TAG_DL; } if (strncasecmp(s, "dt", len) == 0) { TokenClass = EL_DEFLIST; return TAG_DT; } if (strncasecmp(s, "dd", len) == 0) { TokenClass = EL_DEFLIST; return TAG_DD; } } else if (c == 'e') { if (len == 2 && strncasecmp(s, "code", len) == 0) { TokenClass = EN_TEXT; return TAG_EM; } } else if (c == 'f') { if (len == 3 && strncasecmp(s, "fig", len) == 0) { TokenClass = EN_TEXT; return TAG_FIG; } if (len == 4 && strncasecmp(s, "form", len) == 0) { TokenClass = EN_BLOCK; return TAG_FORM; } } else if (c == 'h') { if (len == 4 && strncasecmp(s, "head", len) == 0) { TokenClass = EN_SETUP; return TAG_HEAD; } if (len != 2) return 0; TokenClass = EN_HEADER; c = tolower(s[1]); switch (c) { case '1': return TAG_H1; case '2': return TAG_H2; case '3': return TAG_H3; case '4': return TAG_H4; case '5': return TAG_H5; case '6': return 
TAG_H6; case 'r': TokenClass = EN_BLOCK; return TAG_HR; } } else if (c == 'i') { if (len == 1) { TokenClass = EN_TEXT; return TAG_ITALIC; } if (len == 3 && strncasecmp(s, "img", len) == 0) { TokenClass = EN_TEXT; return TAG_IMG; } if (len == 5 && strncasecmp(s, "input", len) == 0) { TokenClass = EN_TEXT; return TAG_INPUT; } if (len == 7 && strncasecmp(s, "isindex", len) == 0) { TokenClass = EN_SETUP; return TAG_ISINDEX; } } else if (c == 'k') { if (len == 3 && strncasecmp(s, "kbd", len) == 0) { TokenClass = EN_LIST; return TAG_KBD; } } else if (c == 'l') { if (len == 2 && strncasecmp(s, "li", len) == 0) { TokenClass = EN_LIST; return TAG_LI; } } else if (c == 'o') { if (len == 2 && strncasecmp(s, "ol", len) == 0) { TokenClass = EN_LIST; return TAG_OL; } if (len == 6 && strncasecmp(s, "option", len) == 0) { TokenClass = EN_TEXT; /* kludge for error recovery */ return TAG_OPTION; } } else if (c == 'p') { if (len == 1) { TokenClass = EN_BLOCK; return TAG_P; } if (len == 3 && strncasecmp(s, "pre", len) == 0) { TokenClass = EN_BLOCK; return TAG_PRE; } } else if (c == 'q') { if (len == 1) { TokenClass = EN_TEXT; return TAG_Q; } } else if (c == 's') { if (len == 1) { TokenClass = EN_TEXT; return TAG_STRIKE; } if (len == 4 && strncasecmp(s, "samp", len) == 0) { TokenClass = EN_TEXT; return TAG_SAMP; } if (len == 5 && strncasecmp(s, "strong", len) == 0) { TokenClass = EN_TEXT; return TAG_STRONG; } if (len == 6 && strncasecmp(s, "select", len) == 0) { TokenClass = EN_TEXT; return TAG_SELECT; } } else if (c == 't') { if (len == 5 && strncasecmp(s, "title", len) == 0) { TokenClass = EN_SETUP; return TAG_TITLE; } if (len == 2 && strncasecmp(s, "tt", len) == 0) { TokenClass = EN_TEXT; return TAG_TT; } if (len == 8 && strncasecmp(s, "textarea", len) == 0) { TokenClass = EN_TEXT; return TAG_TEXTAREA; } } else if (c == 'u') { if (len == 1) { TokenClass = EN_TEXT; return TAG_UNDERLINE; } if (len == 2 && strncasecmp(s, "ul", len) == 0) { TokenClass = EN_LIST; return TAG_UL; } } } 
TokenClass = EN_UNKNOWN; return UNKNOWN; /* unknown tag */ } void UnGetToken(void) { bufptr = LastBufPtr; } /* The token type is returned in the global token. Characters are returned in TokenValue while TokenClass is used to return a class value e.g. EN_SETUP or EN_BLOCK. Entity definitions are pointed to by EntityValue. The bufptr is moved past the token, except at the end of the buffer - as a safety precaution. */ int GetToken(void) { int c, k, n; static char *NextBufPtr; LastBufPtr = bufptr; next_char: /* hack for entity names */ c = *bufptr; TokenValue = c; if (c == '<') { if (bufptr[1] == '-' && bufptr[2] == '-') { Token = UNKNOWN; TokenClass = EN_COMMENT; comment = 1; bufptr += 3; return Token; } else if (isalpha(bufptr[1])) { Token = RecogniseTag(); bufptr += TagLen; /* to first char after tag name */ return Token; } else if ((bufptr[1] == '/' || bufptr[1] == '!') && isalpha(bufptr[2])) { Token = RecogniseTag(); bufptr += TagLen; /* to first char after tag name */ return Token; } } TokenClass = EN_TEXT; EndTag = 0; /* app needs to advance bufptr past entity: &fred; */ if (c == '&' && isalpha(bufptr[1])) { Token = ENTITY; return Token; } if (c <= ' ') { if (c == '\0') { Token = ENDDATA; TokenClass = EN_UNKNOWN; return Token; } ++bufptr; Token = WHITESPACE; return Token; } ++bufptr; Token = PCDATA; return Token; } /* assumes bufptr points to start of attribute */ char *ParseAttribute(int *len) { int c; char *attr; *len = 0; attr = bufptr; for (;;) { c = *bufptr; if (c == '>' || c == '\0') return attr; if (c == '=' || IsWhite(c)) break; ++(*len); ++bufptr; } fprintf(fp, " "); fwrite(attr, 1, *len, fp); return attr; } /* values start with "=" or " = " etc. 
*/ char *ParseValue(int *len) { int c, delim; char *value; *len = 0; while (c = *bufptr, IsWhite(c)) ++bufptr; if (c != '=') return 0; ++bufptr; /* past the = sign */ while (c = *bufptr, IsWhite(c)) ++bufptr; if (c == '"' || c == '\'') { delim = c; ++bufptr; } else delim = 0; value = bufptr; for (;;) { c = *bufptr; if (c == '\0') return 0; if (delim) { if (c == delim) { ++bufptr; break; } } else if (c == '>' || IsWhite(c)) break; ++(*len); ++bufptr; } if (!delim) delim = '"'; if (*len > 0) { fprintf(fp, "=%c", delim); fwrite(value, 1, *len, fp); fprintf(fp, "%c", delim); } return value; } void ParseAttributes(int tag) { int c, n, m; char *attr, *value; fprintf(fp, "<%s", TokenName[tag]); for (;;) { c = *bufptr++; if (c == '\0') break; if (c == '>') break; if (IsWhite(c)) continue; --bufptr; attr = ParseAttribute(&n); value = ParseValue(&m); } fprintf(fp, ">"); } void ParseTitle(int implied) { if (EndTag) { SwallowAttributes(); return; } if (!implied) SwallowAttributes(); fprintf(fp, ""); /* skip leading white space - subsequently contigous white space is compressed to a single space */ while (GetToken() == WHITESPACE); UnGetToken(); LineLen = 0; for (;;) { GetToken(); if (Token == TAG_TITLE && EndTag) { SwallowAttributes(); break; } if (Token == UNKNOWN) { SwallowAttributes(); continue; } if (Token == ENTITY) { PrintEntity(); continue; } if (Token == WHITESPACE) { while (GetToken() == WHITESPACE); UnGetToken(); if (LineLen < LBUFSIZE - 1) LineBuf[LineLen++] = ' '; continue; } if (Token != PCDATA) { UnGetToken(); break; } if (LineLen < LBUFSIZE - 1) LineBuf[LineLen++] = TokenValue; } LineBuf[LineLen] = '\0'; fprintf(fp, "%s\n", LineBuf); } void ParseSetUp(int implied) { if (EndTag) { SwallowAttributes(); return; } fprintf(fp, "\n"); if (!implied) SwallowAttributes(); for (;;) { while (GetToken() == WHITESPACE); UnGetToken(); if (Token == TAG_HEAD && EndTag) { SwallowAttributes(); break; } if (Token == TAG_TITLE) { ParseTitle(0); continue; } if (Token == TAG_ISINDEX) 
{ SwallowAttributes(); continue; } if (Token == UNKNOWN) { SwallowAttributes(); continue; } if (Token == PCDATA || Token == ENTITY) { UnGetToken(); break; } if (Token == ENDDATA || TokenClass != EN_SETUP) { UnGetToken(); break; } } fprintf(fp, "\n\n"); } void ParseOption(int implied) { int width; if (EndTag) { SwallowAttributes(); return; } if (!implied) SwallowAttributes(); fprintf(fp, "