xy z
+ for all tags "p" allowing PCDATA */ + for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { + if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) { + return(0); + } + } + } + return(1); +} + +/** + * Creates a new HTML document without a DTD node if `URI` and `publicId` + * are NULL + * + * @param URI system ID (URI) of the DTD (optional) + * @param publicId public ID of the DTD (optional) + * @returns a new document, do not initialize the DTD if not provided + */ +xmlDoc * +htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *publicId) { + xmlDocPtr cur; + + /* + * Allocate a new document and fill the fields. + */ + cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc)); + if (cur == NULL) + return(NULL); + memset(cur, 0, sizeof(xmlDoc)); + + cur->type = XML_HTML_DOCUMENT_NODE; + cur->version = NULL; + cur->intSubset = NULL; + cur->doc = cur; + cur->name = NULL; + cur->children = NULL; + cur->extSubset = NULL; + cur->oldNs = NULL; + cur->encoding = NULL; + cur->standalone = 1; + cur->compression = 0; + cur->ids = NULL; + cur->refs = NULL; + cur->_private = NULL; + cur->charset = XML_CHAR_ENCODING_UTF8; + cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT; + if ((publicId != NULL) || + (URI != NULL)) { + xmlDtdPtr intSubset; + + intSubset = xmlCreateIntSubset(cur, BAD_CAST "html", publicId, URI); + if (intSubset == NULL) { + xmlFree(cur); + return(NULL); + } + } + if ((xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue)) + xmlRegisterNodeDefaultValue((xmlNodePtr)cur); + return(cur); +} + +/** + * Creates a new HTML document + * + * @param URI system ID (URI) of the DTD (optional) + * @param publicId public ID of the DTD (optional) + * @returns a new document + */ +xmlDoc * +htmlNewDoc(const xmlChar *URI, const xmlChar *publicId) { + if ((URI == NULL) && (publicId == NULL)) + return(htmlNewDocNoDtD( + BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd", + BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN")); + + return(htmlNewDocNoDtD(URI, publicId)); +} + + +/************************************************************************ + * * + * The parser itself * + * Relates to http://www.w3.org/TR/html40 * + * * + ************************************************************************/ + +/************************************************************************ + * * + * The parser itself * + * * + ************************************************************************/ + +/** + * parse an HTML tag or attribute name, note that we convert it to lowercase + * since HTML names are not case-sensitive. + * + * @param ctxt an HTML parser context + * @param attr whether this is an attribute name + * @returns the Tag Name parsed or NULL + */ + +static xmlHashedString +htmlParseHTMLName(htmlParserCtxtPtr ctxt, int attr) { + xmlHashedString ret; + xmlChar buf[HTML_PARSER_BUFFER_SIZE]; + const xmlChar *in; + size_t avail; + int eof = PARSER_PROGRESSIVE(ctxt); + int nbchar = 0; + int stop = attr ? '=' : ' '; + + in = ctxt->input->cur; + avail = ctxt->input->end - in; + + while (1) { + int c, size; + + if ((!eof) && (avail < 32)) { + size_t oldAvail = avail; + + ctxt->input->cur = in; + + SHRINK; + xmlParserGrow(ctxt); + + in = ctxt->input->cur; + avail = ctxt->input->end - in; + + if (oldAvail == avail) + eof = 1; + } + + if (avail == 0) + break; + + c = *in; + size = 1; + + if ((nbchar != 0) && + ((c == '/') || (c == '>') || (c == stop) || + (IS_WS_HTML(c)))) + break; + + if (c == 0) { + if (nbchar + 3 <= HTML_PARSER_BUFFER_SIZE) { + buf[nbchar++] = 0xEF; + buf[nbchar++] = 0xBF; + buf[nbchar++] = 0xBD; + } + } else if (c < 0x80) { + if (nbchar < HTML_PARSER_BUFFER_SIZE) { + if (IS_UPPER(c)) + c += 0x20; + buf[nbchar++] = c; + } + } else { + size = htmlValidateUtf8(ctxt, in, avail, /* partial */ 0); + + if (size > 0) { + if (nbchar + size <= HTML_PARSER_BUFFER_SIZE) { + memcpy(buf + nbchar, in, size); + nbchar += size; + } + } else { + size = 1; + + if (nbchar + 3 <= HTML_PARSER_BUFFER_SIZE) { + buf[nbchar++] = 0xEF; + buf[nbchar++] = 0xBF; + buf[nbchar++] = 0xBD; + } + } + } + + in += size; + avail -= size; + } + + ctxt->input->cur = in; + + SHRINK; + + ret = xmlDictLookupHashed(ctxt->dict, buf, nbchar); + if (ret.name == NULL) + htmlErrMemory(ctxt); + + return(ret); +} + +static const short htmlC1Remap[32] = { + 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, + 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, + 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, + 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178 +}; + +static const xmlChar * +htmlCodePointToUtf8(int c, xmlChar *out, int *osize) { + int i = 0; + int bits, hi; + + if ((c >= 0x80) && (c < 0xA0)) { + c = htmlC1Remap[c - 0x80]; + } else if ((c <= 0) || + ((c >= 0xD800) && (c < 0xE000)) || + (c > 0x10FFFF)) { + c = 0xFFFD; + } + + if (c < 0x80) { bits = 0; hi = 0x00; } + else if (c < 0x800) { bits = 6; hi = 0xC0; } + else if (c < 0x10000) { bits = 12; hi = 0xE0; } + else { bits = 18; hi = 0xF0; } + + out[i++] = (c >> bits) | hi; + + while (bits > 0) { + bits -= 6; + out[i++] = ((c >> bits) & 0x3F) | 0x80; + } + + *osize = i; + return(out); +} + +#include "codegen/html5ent.inc" + +#define ENT_F_SEMICOLON 0x80u +#define ENT_F_SUBTABLE 0x40u +#define ENT_F_ALL 0xC0u + +static const xmlChar * +htmlFindEntityPrefix(const xmlChar *string, size_t slen, int isAttr, + int *nlen, int *rlen) { + const xmlChar *match = NULL; + unsigned left, right; + int first = string[0]; + size_t matchLen = 0; + size_t soff = 1; + + if (slen < 2) + return(NULL); + if (!IS_ASCII_LETTER(first)) + return(NULL); + + /* + * Look up range by first character + */ + first &= 63; + left = htmlEntAlpha[first*3] | htmlEntAlpha[first*3+1] << 8; + right = left + htmlEntAlpha[first*3+2]; + + /* + * Binary search + */ + while (left < right) { + const xmlChar *bytes; + unsigned mid; + size_t len; + int cmp; + + mid = left + (right - left) / 2; + bytes = htmlEntStrings + htmlEntValues[mid]; + len = bytes[0] & ~ENT_F_ALL; + + cmp = string[soff] - bytes[1]; + + if (cmp == 0) { + if (slen < len) { + cmp = strncmp((const char *) string + soff + 1, + (const char *) bytes + 2, + slen - 1); + /* Prefix can never match */ + if (cmp == 0) + break; + } else { + cmp = strncmp((const char *) string + soff + 1, + (const char *) bytes + 2, + len - 1); + } + } + + if (cmp < 0) { + right = mid; + } else if (cmp > 0) { + left = mid + 1; + } else { + int term = soff + len < slen ? string[soff + len] : 0; + int isAlnum, isTerm; + + isAlnum = IS_ALNUM(term); + isTerm = ((term == ';') || + ((bytes[0] & ENT_F_SEMICOLON) && + ((!isAttr) || + ((!isAlnum) && (term != '='))))); + + if (isTerm) { + match = bytes + len + 1; + matchLen = soff + len; + if (term == ';') + matchLen += 1; + } + + if (bytes[0] & ENT_F_SUBTABLE) { + if (isTerm) + match += 2; + + if ((isAlnum) && (soff + len < slen)) { + left = mid + bytes[len + 1]; + right = left + bytes[len + 2]; + soff += len; + continue; + } + } + + break; + } + } + + if (match == NULL) + return(NULL); + + *nlen = matchLen; + *rlen = match[0]; + return(match + 1); +} + +/** + * Parse data until terminator is reached. + * + * @param ctxt an HTML parser context + * @param mask mask of terminating characters + * @param comment true if parsing a comment + * @param refs true if references are allowed + * @param maxLength maximum output length + * @returns the parsed string or NULL in case of errors. + */ + +static xmlChar * +htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask, + int comment, int refs, int maxLength) { + xmlParserInputPtr input = ctxt->input; + xmlChar *ret = NULL; + xmlChar *buffer; + xmlChar utf8Char[4]; + size_t buffer_size; + size_t used; + int eof = PARSER_PROGRESSIVE(ctxt); + int line, col; + int termSkip = -1; + + used = 0; + buffer_size = ctxt->spaceMax; + buffer = (xmlChar *) ctxt->spaceTab; + if (buffer == NULL) { + buffer_size = 500; + buffer = xmlMalloc(buffer_size + 1); + if (buffer == NULL) { + htmlErrMemory(ctxt); + return(NULL); + } + } + + line = input->line; + col = input->col; + + while (!PARSER_STOPPED(ctxt)) { + const xmlChar *chunk, *in, *repl; + size_t avail, chunkSize, extraSize; + int replSize; + int skip = 0; + int ncr = 0; + int ncrSize = 0; + int cp = 0; + + chunk = input->cur; + avail = input->end - chunk; + in = chunk; + + repl = BAD_CAST ""; + replSize = 0; + + while (!PARSER_STOPPED(ctxt)) { + size_t j; + int cur, size; + + if ((!eof) && (avail <= 64)) { + size_t oldAvail = avail; + size_t off = in - chunk; + + input->cur = in; + + xmlParserGrow(ctxt); + + in = input->cur; + chunk = in - off; + input->cur = chunk; + avail = input->end - in; + + if (oldAvail == avail) + eof = 1; + } + + if (avail == 0) { + termSkip = 0; + break; + } + + cur = *in; + size = 1; + col += 1; + + if (htmlMaskMatch(mask, cur)) { + if (comment) { + if (avail < 2) { + termSkip = 1; + } else if (in[1] == '-') { + if (avail < 3) { + termSkip = 2; + } else if (in[2] == '>') { + termSkip = 3; + } else if (in[2] == '!') { + if (avail < 4) + termSkip = 3; + else if (in[3] == '>') + termSkip = 4; + } + } + + if (termSkip >= 0) + break; + } else { + termSkip = 0; + break; + } + } + + if (ncr) { + int lc = cur | 0x20; + int digit; + + if ((cur >= '0') && (cur <= '9')) { + digit = cur - '0'; + } else if ((ncr == 16) && (lc >= 'a') && (lc <= 'f')) { + digit = (lc - 'a') + 10; + } else { + if (cur == ';') { + in += 1; + size += 1; + ncrSize += 1; + } + goto next_chunk; + } + + cp = cp * ncr + digit; + if (cp >= 0x110000) + cp = 0x110000; + + ncrSize += 1; + + goto next_char; + } + + switch (cur) { + case '&': + if (!refs) + break; + + j = 1; + + if ((j < avail) && (in[j] == '#')) { + j += 1; + if (j < avail) { + if ((in[j] | 0x20) == 'x') { + j += 1; + if ((j < avail) && (IS_HEX_DIGIT(in[j]))) { + ncr = 16; + size = 3; + ncrSize = 3; + cp = 0; + } + } else if (IS_ASCII_DIGIT(in[j])) { + ncr = 10; + size = 2; + ncrSize = 2; + cp = 0; + } + } + } else { + repl = htmlFindEntityPrefix(in + j, + avail - j, + /* isAttr */ 1, + &skip, &replSize); + if (repl != NULL) { + skip += 1; + goto next_chunk; + } + + skip = 0; + } + + break; + + case '\0': + skip = 1; + repl = BAD_CAST "\xEF\xBF\xBD"; + replSize = 3; + goto next_chunk; + + case '\n': + line += 1; + col = 1; + break; + + case '\r': + skip = 1; + if (in[1] != 0x0A) { + repl = BAD_CAST "\x0A"; + replSize = 1; + } + goto next_chunk; + + default: + if (cur < 0x80) + break; + + if ((input->flags & XML_INPUT_HAS_ENCODING) == 0) { + xmlChar * guess; + + if (in > chunk) + goto next_chunk; + +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + guess = NULL; +#else + guess = htmlFindEncoding(ctxt); +#endif + if (guess == NULL) { + xmlSwitchEncoding(ctxt, + XML_CHAR_ENCODING_WINDOWS_1252); + } else { + xmlSwitchEncodingName(ctxt, (const char *) guess); + xmlFree(guess); + } + input->flags |= XML_INPUT_HAS_ENCODING; + + eof = PARSER_PROGRESSIVE(ctxt); + goto restart; + } + + size = htmlValidateUtf8(ctxt, in, avail, /* partial */ 0); + + if (size <= 0) { + skip = 1; + repl = BAD_CAST "\xEF\xBF\xBD"; + replSize = 3; + goto next_chunk; + } + + break; + } + +next_char: + in += size; + avail -= size; + } + +next_chunk: + if (ncrSize > 0) { + skip = ncrSize; + in -= ncrSize; + + repl = htmlCodePointToUtf8(cp, utf8Char, &replSize); + } + + chunkSize = in - chunk; + extraSize = chunkSize + replSize; + + if (extraSize > maxLength - used) { + htmlParseErr(ctxt, XML_ERR_RESOURCE_LIMIT, + "value too long\n", NULL, NULL); + goto error; + } + + if (extraSize > buffer_size - used) { + size_t newSize = (used + extraSize) * 2; + xmlChar *tmp = xmlRealloc(buffer, newSize + 1); + + if (tmp == NULL) { + htmlErrMemory(ctxt); + goto error; + } + buffer = tmp; + buffer_size = newSize; + } + + if (chunkSize > 0) { + input->cur += chunkSize; + memcpy(buffer + used, chunk, chunkSize); + used += chunkSize; + } + + input->cur += skip; + if (replSize > 0) { + memcpy(buffer + used, repl, replSize); + used += replSize; + } + + SHRINK; + + if (termSkip >= 0) + break; + +restart: + ; + } + + if (termSkip > 0) { + input->cur += termSkip; + col += termSkip; + } + + input->line = line; + input->col = col; + + ret = xmlMalloc(used + 1); + if (ret == NULL) { + htmlErrMemory(ctxt); + } else { + memcpy(ret, buffer, used); + ret[used] = 0; + } + +error: + ctxt->spaceTab = (void *) buffer; + ctxt->spaceMax = buffer_size; + + return(ret); +} + +/** + * @deprecated Internal function, don't use. + * + * @param ctxt an HTML parser context + * @param str location to store the entity name + * @returns NULL. + */ +const htmlEntityDesc * +htmlParseEntityRef(htmlParserCtxt *ctxt ATTRIBUTE_UNUSED, + const xmlChar **str ATTRIBUTE_UNUSED) { + return(NULL); +} + +/** + * parse a value for an attribute + * Note: the parser won't do substitution of entities here, this + * will be handled later in #xmlStringGetNodeList, unless it was + * asked for ctxt->replaceEntities != 0 + * + * @param ctxt an HTML parser context + * @returns the AttValue parsed or NULL. + */ + +static xmlChar * +htmlParseAttValue(htmlParserCtxtPtr ctxt) { + xmlChar *ret = NULL; + int maxLength = (ctxt->options & HTML_PARSE_HUGE) ? + XML_MAX_HUGE_LENGTH : + XML_MAX_TEXT_LENGTH; + + if (CUR == '"') { + SKIP(1); + ret = htmlParseData(ctxt, MASK_DQ, 0, 1, maxLength); + if (CUR == '"') + SKIP(1); + } else if (CUR == '\'') { + SKIP(1); + ret = htmlParseData(ctxt, MASK_SQ, 0, 1, maxLength); + if (CUR == '\'') + SKIP(1); + } else { + ret = htmlParseData(ctxt, MASK_WS_GT, 0, 1, maxLength); + } + return(ret); +} + +static void +htmlCharDataSAXCallback(htmlParserCtxtPtr ctxt, const xmlChar *buf, + int size, int mode) { + if ((ctxt->sax == NULL) || (ctxt->disableSAX)) + return; + + if ((mode == 0) || (mode == DATA_RCDATA) || + (ctxt->sax->cdataBlock == NULL)) { + if ((ctxt->name == NULL) || + (xmlStrEqual(ctxt->name, BAD_CAST "html")) || + (xmlStrEqual(ctxt->name, BAD_CAST "head"))) { + int i; + + /* + * Add leading whitespace to html or head elements before + * calling htmlStartCharData. + */ + for (i = 0; i < size; i++) + if (!IS_WS_HTML(buf[i])) + break; + + if (i > 0) { + if (!ctxt->keepBlanks) { + if (ctxt->sax->ignorableWhitespace != NULL) + ctxt->sax->ignorableWhitespace(ctxt->userData, buf, i); + } else { + if (ctxt->sax->characters != NULL) + ctxt->sax->characters(ctxt->userData, buf, i); + } + + buf += i; + size -= i; + } + + if (size <= 0) + return; + + htmlStartCharData(ctxt); + + if (PARSER_STOPPED(ctxt)) + return; + } + + if ((mode == 0) && + (!ctxt->keepBlanks) && + (areBlanks(ctxt, buf, size) > 0)) { + if (ctxt->sax->ignorableWhitespace != NULL) + ctxt->sax->ignorableWhitespace(ctxt->userData, buf, size); + } else { + if (ctxt->sax->characters != NULL) + ctxt->sax->characters(ctxt->userData, buf, size); + } + } else { + /* + * Insert as CDATA, which is the same as HTML_PRESERVE_NODE + */ + ctxt->sax->cdataBlock(ctxt->userData, buf, size); + } +} + +/** + * Parse character data and references. + * + * @param ctxt an HTML parser context + * @param partial true if the input buffer is incomplete + * @returns 1 if all data was parsed, 0 otherwise. + */ + +static int +htmlParseCharData(htmlParserCtxtPtr ctxt, int partial) { + xmlParserInputPtr input = ctxt->input; + xmlChar utf8Char[4]; + int complete = 0; + int done = 0; + int mode; + int eof = PARSER_PROGRESSIVE(ctxt); + int line, col; + + mode = ctxt->endCheckState; + + line = input->line; + col = input->col; + + while (!PARSER_STOPPED(ctxt)) { + const xmlChar *chunk, *in, *repl; + size_t avail; + int replSize; + int skip = 0; + int ncr = 0; + int ncrSize = 0; + int cp = 0; + + chunk = input->cur; + avail = input->end - chunk; + in = chunk; + + repl = BAD_CAST ""; + replSize = 0; + + while (!PARSER_STOPPED(ctxt)) { + size_t j; + int cur, size; + + if (avail <= 64) { + if (!eof) { + size_t oldAvail = avail; + size_t off = in - chunk; + + input->cur = in; + + xmlParserGrow(ctxt); + + in = input->cur; + chunk = in - off; + input->cur = chunk; + avail = input->end - in; + + if (oldAvail == avail) + eof = 1; + } + + if (avail == 0) { + if ((partial) && (ncr)) { + in -= ncrSize; + ncrSize = 0; + } + + done = 1; + break; + } + } + + /* Accelerator */ + if (!ncr) { + while (avail > 0) { + static const unsigned mask[8] = { + 0x00002401, 0x10002040, + 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF + }; + cur = *in; + if ((1u << (cur & 0x1F)) & mask[cur >> 5]) + break; + col += 1; + in += 1; + avail -= 1; + } + + if ((!eof) && (avail <= 64)) + continue; + if (avail == 0) + continue; + } + + cur = *in; + size = 1; + col += 1; + + if (ncr) { + int lc = cur | 0x20; + int digit; + + if ((cur >= '0') && (cur <= '9')) { + digit = cur - '0'; + } else if ((ncr == 16) && (lc >= 'a') && (lc <= 'f')) { + digit = (lc - 'a') + 10; + } else { + if (cur == ';') { + in += 1; + size += 1; + ncrSize += 1; + } + goto next_chunk; + } + + cp = cp * ncr + digit; + if (cp >= 0x110000) + cp = 0x110000; + + ncrSize += 1; + + goto next_char; + } + + switch (cur) { + case '<': + if (mode == 0) { + done = 1; + complete = 1; + goto next_chunk; + } + if (mode == DATA_PLAINTEXT) + break; + + j = 1; + if (j < avail) { + if ((mode == DATA_SCRIPT) && (in[j] == '!')) { + /* Check for comment start */ + + j += 1; + if ((j < avail) && (in[j] == '-')) { + j += 1; + if ((j < avail) && (in[j] == '-')) + mode = DATA_SCRIPT_ESC1; + } + } else { + int i = 0; + int solidus = 0; + + /* Check for tag */ + + if (in[j] == '/') { + j += 1; + solidus = 1; + } + + if ((solidus) || (mode == DATA_SCRIPT_ESC1)) { + while ((j < avail) && + (ctxt->name[i] != 0) && + (ctxt->name[i] == (in[j] | 0x20))) { + i += 1; + j += 1; + } + + if ((ctxt->name[i] == 0) && (j < avail)) { + int c = in[j]; + + if ((c == '>') || (c == '/') || + (IS_WS_HTML(c))) { + if ((mode == DATA_SCRIPT_ESC1) && + (!solidus)) { + mode = DATA_SCRIPT_ESC2; + } else if (mode == DATA_SCRIPT_ESC2) { + mode = DATA_SCRIPT_ESC1; + } else { + complete = 1; + done = 1; + goto next_chunk; + } + } + } + } + } + } + + if ((partial) && (j >= avail)) { + done = 1; + goto next_chunk; + } + + break; + + case '-': + if ((mode != DATA_SCRIPT_ESC1) && (mode != DATA_SCRIPT_ESC2)) + break; + + /* Check for comment end */ + + j = 1; + if ((j < avail) && (in[j] == '-')) { + j += 1; + if ((j < avail) && (in[j] == '>')) + mode = DATA_SCRIPT; + } + + if ((partial) && (j >= avail)) { + done = 1; + goto next_chunk; + } + + break; + + case '&': + if ((mode != 0) && (mode != DATA_RCDATA)) + break; + + j = 1; + + if ((j < avail) && (in[j] == '#')) { + j += 1; + if (j < avail) { + if ((in[j] | 0x20) == 'x') { + j += 1; + if ((j < avail) && (IS_HEX_DIGIT(in[j]))) { + ncr = 16; + size = 3; + ncrSize = 3; + cp = 0; + } + } else if (IS_ASCII_DIGIT(in[j])) { + ncr = 10; + size = 2; + ncrSize = 2; + cp = 0; + } + } + } else { + if (partial) { + int terminated = 0; + size_t i; + + /* + * ∳ has 33 bytes. + */ + for (i = 1; i < avail; i++) { + if ((i >= 32) || + (!IS_ASCII_LETTER(in[i]) && + ((i < 2) || !IS_ASCII_DIGIT(in[i])))) { + terminated = 1; + break; + } + } + + if (!terminated) { + done = 1; + goto next_chunk; + } + } + + repl = htmlFindEntityPrefix(in + j, + avail - j, + /* isAttr */ 0, + &skip, &replSize); + if (repl != NULL) { + skip += 1; + goto next_chunk; + } + + skip = 0; + } + + if ((partial) && (j >= avail)) { + done = 1; + goto next_chunk; + } + + break; + + case '\0': + skip = 1; + + if (mode == 0) { + /* + * The HTML5 spec says that the tokenizer should + * pass on U+0000 unmodified in normal data mode. + * These characters should then be ignored in body + * and other text, but should be replaced with + * U+FFFD in foreign content. + * + * At least for now, we always strip U+0000 when + * tokenizing. + */ + repl = BAD_CAST ""; + replSize = 0; + } else { + repl = BAD_CAST "\xEF\xBF\xBD"; + replSize = 3; + } + + goto next_chunk; + + case '\n': + line += 1; + col = 1; + break; + + case '\r': + if (partial && avail < 2) { + done = 1; + goto next_chunk; + } + + skip = 1; + if (in[1] != 0x0A) { + repl = BAD_CAST "\x0A"; + replSize = 1; + } + goto next_chunk; + + default: + if (cur < 0x80) + break; + + if ((input->flags & XML_INPUT_HAS_ENCODING) == 0) { + xmlChar * guess; + + if (in > chunk) + goto next_chunk; + +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + guess = NULL; +#else + guess = htmlFindEncoding(ctxt); +#endif + if (guess == NULL) { + xmlSwitchEncoding(ctxt, + XML_CHAR_ENCODING_WINDOWS_1252); + } else { + xmlSwitchEncodingName(ctxt, (const char *) guess); + xmlFree(guess); + } + input->flags |= XML_INPUT_HAS_ENCODING; + + eof = PARSER_PROGRESSIVE(ctxt); + goto restart; + } + + size = htmlValidateUtf8(ctxt, in, avail, partial); + + if ((partial) && (size == 0)) { + done = 1; + goto next_chunk; + } + + if (size <= 0) { + skip = 1; + repl = BAD_CAST "\xEF\xBF\xBD"; + replSize = 3; + goto next_chunk; + } + + break; + } + +next_char: + in += size; + avail -= size; + } + +next_chunk: + if (ncrSize > 0) { + skip = ncrSize; + in -= ncrSize; + + repl = htmlCodePointToUtf8(cp, utf8Char, &replSize); + } + + if (in > chunk) { + input->cur += in - chunk; + htmlCharDataSAXCallback(ctxt, chunk, in - chunk, mode); + } + + input->cur += skip; + if (replSize > 0) + htmlCharDataSAXCallback(ctxt, repl, replSize, mode); + + SHRINK; + + if (done) + break; + +restart: + ; + } + + input->line = line; + input->col = col; + + if (complete) + ctxt->endCheckState = 0; + else + ctxt->endCheckState = mode; + + return(complete); +} + +/** + * Parse an HTML comment + * + * @param ctxt an HTML parser context + * @param bogus true if this is a bogus comment + */ +static void +htmlParseComment(htmlParserCtxtPtr ctxt, int bogus) { + const xmlChar *comment = BAD_CAST ""; + xmlChar *buf = NULL; + int maxLength = (ctxt->options & HTML_PARSE_HUGE) ? + XML_MAX_HUGE_LENGTH : + XML_MAX_TEXT_LENGTH; + + if (bogus) { + buf = htmlParseData(ctxt, MASK_GT, 0, 0, maxLength); + if (CUR == '>') + SKIP(1); + comment = buf; + } else { + if ((!PARSER_PROGRESSIVE(ctxt)) && + (ctxt->input->end - ctxt->input->cur < 2)) + xmlParserGrow(ctxt); + + if (CUR == '>') { + SKIP(1); + } else if ((CUR == '-') && (NXT(1) == '>')) { + SKIP(2); + } else { + buf = htmlParseData(ctxt, MASK_DASH, 1, 0, maxLength); + comment = buf; + } + } + + if (comment == NULL) + return; + + if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) && + (!ctxt->disableSAX)) + ctxt->sax->comment(ctxt->userData, comment); + + xmlFree(buf); +} + +/** + * @deprecated Internal function, don't use. + * + * @param ctxt an HTML parser context + * @returns 0 + */ +int +htmlParseCharRef(htmlParserCtxt *ctxt ATTRIBUTE_UNUSED) { + return(0); +} + + +/** + * Parse a DOCTYPE SYTSTEM or PUBLIC literal. + * + * @param ctxt an HTML parser context + * @returns the literal or NULL in case of error. + */ + +static xmlChar * +htmlParseDoctypeLiteral(htmlParserCtxtPtr ctxt) { + xmlChar *ret; + int maxLength = (ctxt->options & HTML_PARSE_HUGE) ? + XML_MAX_TEXT_LENGTH : + XML_MAX_NAME_LENGTH; + + if (CUR == '"') { + SKIP(1); + ret = htmlParseData(ctxt, MASK_DQ_GT, 0, 0, maxLength); + if (CUR == '"') + SKIP(1); + } else if (CUR == '\'') { + SKIP(1); + ret = htmlParseData(ctxt, MASK_SQ_GT, 0, 0, maxLength); + if (CUR == '\'') + SKIP(1); + } else { + return(NULL); + } + + return(ret); +} + +static void +htmlSkipBogusDoctype(htmlParserCtxtPtr ctxt) { + const xmlChar *in; + size_t avail; + int eof = PARSER_PROGRESSIVE(ctxt); + int line, col; + + line = ctxt->input->line; + col = ctxt->input->col; + + in = ctxt->input->cur; + avail = ctxt->input->end - in; + + while (!PARSER_STOPPED(ctxt)) { + int cur; + + if ((!eof) && (avail <= 64)) { + size_t oldAvail = avail; + + ctxt->input->cur = in; + + xmlParserGrow(ctxt); + + in = ctxt->input->cur; + avail = ctxt->input->end - in; + + if (oldAvail == avail) + eof = 1; + } + + if (avail == 0) + break; + + col += 1; + + cur = *in; + if (cur == '>') { + in += 1; + break; + } else if (cur == 0x0A) { + line += 1; + col = 1; + } + + in += 1; + avail -= 1; + + SHRINK; + } + + ctxt->input->cur = in; + ctxt->input->line = line; + ctxt->input->col = col; +} + +/** + * Parse a DOCTYPE declaration. + * + * @param ctxt an HTML parser context + */ + +static void +htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) { + xmlChar *name = NULL; + xmlChar *publicId = NULL; + xmlChar *URI = NULL; + int maxLength = (ctxt->options & HTML_PARSE_HUGE) ? + XML_MAX_TEXT_LENGTH : + XML_MAX_NAME_LENGTH; + + /* + * We know that 'input->cur < ctxt->input->end) && (CUR != '>')) { + name = htmlParseData(ctxt, MASK_WS_GT, 0, 0, maxLength); + + if ((ctxt->options & HTML_PARSE_HTML5) && (name != NULL)) { + xmlChar *cur; + + for (cur = name; *cur; cur++) { + if (IS_UPPER(*cur)) + *cur += 0x20; + } + } + + SKIP_BLANKS; + } + + /* + * Check for SystemID and publicId + */ + if ((UPPER == 'P') && (UPP(1) == 'U') && + (UPP(2) == 'B') && (UPP(3) == 'L') && + (UPP(4) == 'I') && (UPP(5) == 'C')) { + SKIP(6); + SKIP_BLANKS; + publicId = htmlParseDoctypeLiteral(ctxt); + if (publicId == NULL) + goto bogus; + SKIP_BLANKS; + URI = htmlParseDoctypeLiteral(ctxt); + } else if ((UPPER == 'S') && (UPP(1) == 'Y') && + (UPP(2) == 'S') && (UPP(3) == 'T') && + (UPP(4) == 'E') && (UPP(5) == 'M')) { + SKIP(6); + SKIP_BLANKS; + URI = htmlParseDoctypeLiteral(ctxt); + } + +bogus: + htmlSkipBogusDoctype(ctxt); + + /* + * Create or update the document accordingly to the DOCTYPE + */ + if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) && + (!ctxt->disableSAX)) + ctxt->sax->internalSubset(ctxt->userData, name, publicId, URI); + + xmlFree(name); + xmlFree(URI); + xmlFree(publicId); +} + +/** + * parse an attribute + * + * [41] Attribute ::= Name Eq AttValue + * + * [25] Eq ::= S? '=' S? + * + * With namespace: + * + * [NS 11] Attribute ::= QName Eq AttValue + * + * Also the case QName == xmlns:??? is handled independently as a namespace + * definition. + * + * @param ctxt an HTML parser context + * @param value a xmlChar ** used to store the value of the attribute + * @returns the attribute name, and the value in *value. + */ + +static xmlHashedString +htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) { + xmlHashedString hname; + xmlChar *val = NULL; + + *value = NULL; + hname = htmlParseHTMLName(ctxt, 1); + if (hname.name == NULL) + return(hname); + + /* + * read the value + */ + SKIP_BLANKS; + if (CUR == '=') { + SKIP(1); + SKIP_BLANKS; + val = htmlParseAttValue(ctxt); + } + + *value = val; + return(hname); +} + +static int +htmlCharEncCheckAsciiCompatible(htmlParserCtxt *ctxt, + const xmlChar *encoding) { + xmlCharEncodingHandler *handler; + xmlChar in[9] = ""; + xmlChar out[9]; + int inlen, outlen; + int res; + + res = xmlCreateCharEncodingHandler( + (const char *) encoding, + XML_ENC_INPUT | XML_ENC_HTML, + ctxt->convImpl, ctxt->convCtxt, + &handler); + if (res != XML_ERR_OK) { + xmlFatalErr(ctxt, res, (const char *) encoding); + return(-1); + } + + /* UTF-8 */ + if (handler == NULL) + return(0); + + inlen = 8; + outlen = 8; + res = xmlEncInputChunk(handler, out, &outlen, in, &inlen, /* flush */ 1); + + xmlCharEncCloseFunc(handler); + + if ((res != XML_ENC_ERR_SUCCESS) || + (inlen != 8) || (outlen != 8) || + (memcmp(in, out, 8) != 0)) { + htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, + "Encoding %s isn't ASCII-compatible", encoding, NULL); + return(-1); + } + + return(0); +} + +/** + * Handle charset encoding in meta tag. + * + * @param ctxt an HTML parser context + * @param atts the attributes values + */ +static void +htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) { + int i; + const xmlChar *att, *value; + int isContentType = 0; + const xmlChar *content = NULL; + xmlChar *encoding = NULL; + + if ((ctxt == NULL) || (atts == NULL)) + return; + + i = 0; + att = atts[i++]; + while (att != NULL) { + value = atts[i++]; + if (value != NULL) { + if ((!xmlStrcasecmp(att, BAD_CAST "http-equiv")) && + (!xmlStrcasecmp(value, BAD_CAST "Content-Type"))) { + isContentType = 1; + } else if (!xmlStrcasecmp(att, BAD_CAST "charset")) { + encoding = xmlStrdup(value); + if (encoding == NULL) + htmlErrMemory(ctxt); + break; + } else if (!xmlStrcasecmp(att, BAD_CAST "content")) { + content = value; + } + } + att = atts[i++]; + } + + if ((encoding == NULL) && (isContentType) && (content != NULL)) { + htmlMetaEncodingOffsets off; + + if (htmlParseContentType(content, &off)) { + encoding = xmlStrndup(content + off.start, off.end - off.start); + if (encoding == NULL) + htmlErrMemory(ctxt); + } + } + + if (encoding != NULL) { + if (htmlCharEncCheckAsciiCompatible(ctxt, encoding) < 0) { + xmlFree(encoding); + return; + } + + xmlSetDeclaredEncoding(ctxt, encoding); + } +} + +/** + * Inserts a new attribute into the hash table. + * + * @param ctxt parser context + * @param size size of the hash table + * @param name attribute name + * @param hashValue hash value of name + * @param aindex attribute index (this is a multiple of 5) + * @returns INT_MAX if no existing attribute was found, the attribute + * index if an attribute was found, -1 if a memory allocation failed. + */ +static int +htmlAttrHashInsert(xmlParserCtxtPtr ctxt, unsigned size, const xmlChar *name, + unsigned hashValue, int aindex) { + xmlAttrHashBucket *table = ctxt->attrHash; + xmlAttrHashBucket *bucket; + unsigned hindex; + + hindex = hashValue & (size - 1); + bucket = &table[hindex]; + + while (bucket->index >= 0) { + const xmlChar **atts = &ctxt->atts[bucket->index]; + + if (name == atts[0]) + return(bucket->index); + + hindex++; + bucket++; + if (hindex >= size) { + hindex = 0; + bucket = table; + } + } + + bucket->index = aindex; + + return(INT_MAX); +} + +/** + * parse a start of tag either for rule element or + * EmptyElement. In both case we don't parse the tag closing chars. + * + * [40] STag ::= '<' Name (S Attribute)* S? '>' + * + * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' + * + * With namespace: + * + * [NS 8] STag ::= '<' QName (S Attribute)* S? '>' + * + * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>' + * + * @param ctxt an HTML parser context + * @returns 0 in case of success, -1 in case of error and 1 if discarded + */ + +static void +htmlParseStartTag(htmlParserCtxtPtr ctxt) { + const xmlChar *name; + const xmlChar *attname; + xmlChar *attvalue; + const xmlChar **atts; + int nbatts = 0; + int maxatts; + int i; + int discardtag = 0; + + ctxt->endCheckState = 0; + + SKIP(1); + + atts = ctxt->atts; + maxatts = ctxt->maxatts; + + GROW; + name = htmlParseHTMLName(ctxt, 0).name; + if (name == NULL) + return; + + if ((ctxt->options & HTML_PARSE_HTML5) == 0) { + /* + * Check for auto-closure of HTML elements. + */ + htmlAutoClose(ctxt, name); + + /* + * Check for implied HTML elements. + */ + htmlCheckImplied(ctxt, name); + + /* + * Avoid html at any level > 0, head at any level != 1 + * or any attempt to recurse body + */ + if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) { + htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, + "htmlParseStartTag: misplaced tag\n", + name, NULL); + discardtag = 1; + ctxt->depth++; + } + if ((ctxt->nameNr != 1) && + (xmlStrEqual(name, BAD_CAST"head"))) { + htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, + "htmlParseStartTag: misplaced tag\n", + name, NULL); + discardtag = 1; + ctxt->depth++; + } + if (xmlStrEqual(name, BAD_CAST"body")) { + int indx; + for (indx = 0;indx < ctxt->nameNr;indx++) { + if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) { + htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, + "htmlParseStartTag: misplaced tag\n", + name, NULL); + discardtag = 1; + ctxt->depth++; + } + } + } + } + + /* + * Now parse the attributes, it ends up with the ending + * + * (S Attribute)* S? + */ + SKIP_BLANKS; + while ((ctxt->input->cur < ctxt->input->end) && + (CUR != '>') && + ((CUR != '/') || (NXT(1) != '>')) && + (PARSER_STOPPED(ctxt) == 0)) { + xmlHashedString hattname; + + /* unexpected-solidus-in-tag */ + if (CUR == '/') { + SKIP(1); + SKIP_BLANKS; + continue; + } + GROW; + hattname = htmlParseAttribute(ctxt, &attvalue); + attname = hattname.name; + + if (attname != NULL) { + /* + * Add the pair to atts + */ + if (nbatts + 4 > maxatts) { + const xmlChar **tmp; + unsigned *utmp; + int newSize; + + newSize = xmlGrowCapacity(maxatts, + sizeof(tmp[0]) * 2 + sizeof(utmp[0]), + 11, HTML_MAX_ATTRS); + if (newSize < 0) { + htmlErrMemory(ctxt); + goto failed; + } +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (newSize < 2) + newSize = 2; +#endif + tmp = xmlRealloc(atts, newSize * sizeof(tmp[0]) * 2); + if (tmp == NULL) { + htmlErrMemory(ctxt); + goto failed; + } + atts = tmp; + ctxt->atts = tmp; + + utmp = xmlRealloc(ctxt->attallocs, newSize * sizeof(utmp[0])); + if (utmp == NULL) { + htmlErrMemory(ctxt); + goto failed; + } + ctxt->attallocs = utmp; + + maxatts = newSize * 2; + ctxt->maxatts = maxatts; + } + + ctxt->attallocs[nbatts/2] = hattname.hashValue; + atts[nbatts++] = attname; + atts[nbatts++] = attvalue; + + attvalue = NULL; + } + +failed: + if (attvalue != NULL) + xmlFree(attvalue); + + SKIP_BLANKS; + } + + if (ctxt->input->cur >= ctxt->input->end) { + discardtag = 1; + goto done; + } + + /* + * Verify that attribute names are unique. + */ + if (nbatts > 2) { + unsigned attrHashSize; + int j, k; + + attrHashSize = 4; + while (attrHashSize / 2 < (unsigned) nbatts / 2) + attrHashSize *= 2; + + if (attrHashSize > ctxt->attrHashMax) { + xmlAttrHashBucket *tmp; + + tmp = xmlRealloc(ctxt->attrHash, attrHashSize * sizeof(tmp[0])); + if (tmp == NULL) { + htmlErrMemory(ctxt); + goto done; + } + + ctxt->attrHash = tmp; + ctxt->attrHashMax = attrHashSize; + } + + memset(ctxt->attrHash, -1, attrHashSize * sizeof(ctxt->attrHash[0])); + + for (i = 0, j = 0, k = 0; i < nbatts; i += 2, k++) { + unsigned hashValue; + int res; + + attname = atts[i]; + hashValue = ctxt->attallocs[k] | 0x80000000; + + res = htmlAttrHashInsert(ctxt, attrHashSize, attname, + hashValue, j); + if (res < 0) + continue; + + if (res == INT_MAX) { + atts[j] = atts[i]; + atts[j+1] = atts[i+1]; + j += 2; + } else { + xmlFree((xmlChar *) atts[i+1]); + } + } + + nbatts = j; + } + + if (nbatts > 0) { + atts[nbatts] = NULL; + atts[nbatts + 1] = NULL; + + /* + * Apple's new libiconv is so broken that you routinely run into + * issues when fuzz testing (by accident with an uninstrumented + * libiconv). Here's a harmless (?) example: + * + * printf '>' | iconv -f shift_jis -t utf-8 | hexdump -C + * printf '\xfc\x00\x00' | iconv -f shift_jis -t utf-8 | hexdump -C + * printf '>\xfc\x00\x00' | iconv -f shift_jis -t utf-8 | hexdump -C + * + * The last command fails to detect the illegal sequence. + */ +#if !defined(__APPLE__) || \ + !defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) + /* + * Handle specific association to the META tag + */ + if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) && + (strcmp((char *) name, "meta") == 0)) { + htmlCheckMeta(ctxt, atts); + } +#endif + } + + /* + * SAX: Start of Element ! + */ + if (!discardtag) { + if (ctxt->options & HTML_PARSE_HTML5) { + if (ctxt->nameNr > 0) + htmlnamePop(ctxt); + } + + htmlnamePush(ctxt, name); + if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) { + if (nbatts != 0) + ctxt->sax->startElement(ctxt->userData, name, atts); + else + ctxt->sax->startElement(ctxt->userData, name, NULL); + } + } + +done: + if (atts != NULL) { + for (i = 1;i < nbatts;i += 2) { + if (atts[i] != NULL) + xmlFree((xmlChar *) atts[i]); + } + } +} + +/** + * parse an end of tag + * + * [42] ETag ::= '' Name S? '>' + * + * With namespace + * + * [NS 9] ETag ::= '' QName S? '>' + * + * @param ctxt an HTML parser context + * @returns 1 if the current level should be closed. + */ + +static void +htmlParseEndTag(htmlParserCtxtPtr ctxt) +{ + const xmlChar *name; + const xmlChar *oldname; + int i; + + ctxt->endCheckState = 0; + + SKIP(2); + + if (ctxt->input->cur >= ctxt->input->end) { + htmlStartCharData(ctxt); + if ((ctxt->sax != NULL) && (!ctxt->disableSAX) && + (ctxt->sax->characters != NULL)) + ctxt->sax->characters(ctxt->userData, + BAD_CAST "", 2); + return; + } + + if (CUR == '>') { + SKIP(1); + return; + } + + if (!IS_ASCII_LETTER(CUR)) { + htmlParseComment(ctxt, /* bogus */ 1); + return; + } + + name = htmlParseHTMLName(ctxt, 0).name; + if (name == NULL) + return; + + /* + * Parse and ignore attributes. + */ + SKIP_BLANKS; + while ((ctxt->input->cur < ctxt->input->end) && + (CUR != '>') && + ((CUR != '/') || (NXT(1) != '>')) && + (ctxt->instate != XML_PARSER_EOF)) { + xmlChar *attvalue = NULL; + + /* unexpected-solidus-in-tag */ + if (CUR == '/') { + SKIP(1); + SKIP_BLANKS; + continue; + } + GROW; + htmlParseAttribute(ctxt, &attvalue); + if (attvalue != NULL) + xmlFree(attvalue); + + SKIP_BLANKS; + } + + if (CUR == '>') { + SKIP(1); + } else if ((CUR == '/') && (NXT(1) == '>')) { + SKIP(2); + } else { + return; + } + + if (ctxt->options & HTML_PARSE_HTML5) { + if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) + ctxt->sax->endElement(ctxt->userData, name); + return; + } + + /* + * if we ignored misplaced tags in htmlParseStartTag don't pop them + * out now. + */ + if ((ctxt->depth > 0) && + (xmlStrEqual(name, BAD_CAST "html") || + xmlStrEqual(name, BAD_CAST "body") || + xmlStrEqual(name, BAD_CAST "head"))) { + ctxt->depth--; + return; + } + + /* + * If the name read is not one of the element in the parsing stack + * then return, it's just an error. + */ + for (i = (ctxt->nameNr - 1); i >= 0; i--) { + if (xmlStrEqual(name, ctxt->nameTab[i])) + break; + } + if (i < 0) { + htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, + "Unexpected end tag : %s\n", name, NULL); + return; + } + + + /* + * Check for auto-closure of HTML elements. + */ + + htmlAutoCloseOnClose(ctxt, name); + + /* + * Well formedness constraints, opening and closing must match. + * With the exception that the autoclose may have popped stuff out + * of the stack. + */ + if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) { + htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, + "Opening and ending tag mismatch: %s and %s\n", + name, ctxt->name); + } + + /* + * SAX: End of Tag + */ + oldname = ctxt->name; + if ((oldname != NULL) && (xmlStrEqual(oldname, name))) { + htmlParserFinishElementParsing(ctxt); + if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) + ctxt->sax->endElement(ctxt->userData, name); + htmlnamePop(ctxt); + } +} + +/** + * Parse a content: comment, sub-element, reference or text. + * New version for non recursive htmlParseElementInternal + * + * @param ctxt an HTML parser context + */ + +static void +htmlParseContent(htmlParserCtxtPtr ctxt) { + GROW; + + while ((PARSER_STOPPED(ctxt) == 0) && + (ctxt->input->cur < ctxt->input->end)) { + int mode; + + mode = ctxt->endCheckState; + + if ((mode == 0) && (CUR == '<')) { + if (NXT(1) == '/') { + htmlParseEndTag(ctxt); + } else if (NXT(1) == '!') { + /* + * Sometimes DOCTYPE arrives in the middle of the document + */ + if ((UPP(2) == 'D') && (UPP(3) == 'O') && + (UPP(4) == 'C') && (UPP(5) == 'T') && + (UPP(6) == 'Y') && (UPP(7) == 'P') && + (UPP(8) == 'E')) { + htmlParseDocTypeDecl(ctxt); + } else if ((NXT(2) == '-') && (NXT(3) == '-')) { + SKIP(4); + htmlParseComment(ctxt, /* bogus */ 0); + } else { + SKIP(2); + htmlParseComment(ctxt, /* bogus */ 1); + } + } else if (NXT(1) == '?') { + SKIP(1); + htmlParseComment(ctxt, /* bogus */ 1); + } else if (IS_ASCII_LETTER(NXT(1))) { + htmlParseElementInternal(ctxt); + } else { + htmlStartCharData(ctxt); + if ((ctxt->sax != NULL) && (!ctxt->disableSAX) && + (ctxt->sax->characters != NULL)) + ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1); + SKIP(1); + } + } else { + htmlParseCharData(ctxt, /* partial */ 0); + } + + SHRINK; + GROW; + } + + if (ctxt->input->cur >= ctxt->input->end) + htmlAutoCloseOnEnd(ctxt); +} + +/** + * Parse an HTML element, new version, non recursive + * + * @param ctxt an HTML parser context + */ +static int +htmlParseElementInternal(htmlParserCtxtPtr ctxt) { + const xmlChar *name; + const htmlElemDesc * info; + htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 }; + + if ((ctxt == NULL) || (ctxt->input == NULL)) + return(0); + + /* Capture start position */ + if (ctxt->record_info) { + node_info.begin_pos = ctxt->input->consumed + + (CUR_PTR - ctxt->input->base); + node_info.begin_line = ctxt->input->line; + } + + htmlParseStartTag(ctxt); + name = ctxt->name; + if (name == NULL) + return(0); + + if (ctxt->record_info) + htmlNodeInfoPush(ctxt, &node_info); + + /* + * Check for an Empty Element labeled the XML/SGML way + */ + if ((CUR == '/') && (NXT(1) == '>')) { + SKIP(2); + htmlParserFinishElementParsing(ctxt); + if ((ctxt->options & HTML_PARSE_HTML5) == 0) { + if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) + ctxt->sax->endElement(ctxt->userData, name); + } + htmlnamePop(ctxt); + return(0); + } + + if (CUR != '>') + return(0); + SKIP(1); + + /* + * Lookup the info for that element. + */ + info = htmlTagLookup(name); + + /* + * Check for an Empty Element from DTD definition + */ + if ((info != NULL) && (info->empty)) { + htmlParserFinishElementParsing(ctxt); + if ((ctxt->options & HTML_PARSE_HTML5) == 0) { + if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) + ctxt->sax->endElement(ctxt->userData, name); + } + htmlnamePop(ctxt); + return(0); + } + + if (info != NULL) + ctxt->endCheckState = info->dataMode; + + return(1); +} + +/** + * This is kept for compatibility with previous code versions + * + * @deprecated Internal function, don't use. + * + * @param ctxt an HTML parser context + */ +void +htmlParseElement(htmlParserCtxt *ctxt) { + const xmlChar *oldptr; + int depth; + + if ((ctxt == NULL) || (ctxt->input == NULL)) + return; + + if (htmlParseElementInternal(ctxt) == 0) + return; + + /* + * Parse the content of the element: + */ + depth = ctxt->nameNr; + while (CUR != 0) { + oldptr = ctxt->input->cur; + htmlParseContent(ctxt); + if (oldptr==ctxt->input->cur) break; + if (ctxt->nameNr < depth) break; + } + + if (CUR == 0) { + htmlAutoCloseOnEnd(ctxt); + } +} + +/** + * @param ctxt parser context + * @param input parser input + * @returns a node list. + */ +xmlNode * +htmlCtxtParseContentInternal(htmlParserCtxt *ctxt, xmlParserInput *input) { + xmlNodePtr root; + xmlNodePtr list = NULL; + xmlChar *rootName = BAD_CAST "#root"; + + root = xmlNewDocNode(ctxt->myDoc, NULL, rootName, NULL); + if (root == NULL) { + htmlErrMemory(ctxt); + return(NULL); + } + + if (xmlCtxtPushInput(ctxt, input) < 0) { + xmlFreeNode(root); + return(NULL); + } + + htmlnamePush(ctxt, rootName); + nodePush(ctxt, root); + + htmlParseContent(ctxt); + + /* + * Only check for truncated multi-byte sequences + */ + xmlParserCheckEOF(ctxt, XML_ERR_INTERNAL_ERROR); + + /* TODO: Use xmlCtxtIsCatastrophicError */ + if (ctxt->errNo != XML_ERR_NO_MEMORY) { + xmlNodePtr cur; + + /* + * Unlink newly created node list. + */ + list = root->children; + root->children = NULL; + root->last = NULL; + for (cur = list; cur != NULL; cur = cur->next) + cur->parent = NULL; + } + + nodePop(ctxt); + htmlnamePop(ctxt); + + xmlCtxtPopInput(ctxt); + + xmlFreeNode(root); + return(list); +} + +/** + * Parse an HTML document and invoke the SAX handlers. This is useful + * if you're only interested in custom SAX callbacks. If you want a + * document tree, use #htmlCtxtParseDocument. + * + * @param ctxt an HTML parser context + * @returns 0, -1 in case of error. + */ +int +htmlParseDocument(htmlParserCtxt *ctxt) { + if ((ctxt == NULL) || (ctxt->input == NULL)) + return(-1); + + if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) { + ctxt->sax->setDocumentLocator(ctxt->userData, + (xmlSAXLocator *) &xmlDefaultSAXLocator); + } + + xmlDetectEncoding(ctxt); + + /* + * TODO: Implement HTML5 prescan algorithm + */ + + /* + * This is wrong but matches long-standing behavior. In most + * cases, a document starting with an XML declaration will + * specify UTF-8. The HTML5 prescan algorithm handles + * XML declarations in a better way. + */ + if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) && + (xmlStrncmp(ctxt->input->cur, BAD_CAST "sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX)) + ctxt->sax->startDocument(ctxt->userData); + + /* + * Parse possible comments, PIs or doctype declarations + * before any content. + */ + ctxt->instate = XML_PARSER_MISC; + while (CUR == '<') { + if (NXT(1) == '!') { + if ((NXT(2) == '-') && (NXT(3) == '-')) { + SKIP(4); + htmlParseComment(ctxt, /* bogus */ 0); + } else if ((UPP(2) == 'D') && (UPP(3) == 'O') && + (UPP(4) == 'C') && (UPP(5) == 'T') && + (UPP(6) == 'Y') && (UPP(7) == 'P') && + (UPP(8) == 'E')) { + htmlParseDocTypeDecl(ctxt); + ctxt->instate = XML_PARSER_PROLOG; + } else { + SKIP(2); + htmlParseComment(ctxt, /* bogus */ 1); + } + } else if (NXT(1) == '?') { + SKIP(1); + htmlParseComment(ctxt, /* bogus */ 1); + } else { + break; + } + SKIP_BLANKS; + GROW; + } + + /* + * Time to start parsing the tree itself + */ + ctxt->instate = XML_PARSER_CONTENT; + htmlParseContent(ctxt); + + /* + * Only check for truncated multi-byte sequences + */ + xmlParserCheckEOF(ctxt, XML_ERR_INTERNAL_ERROR); + + /* + * SAX: end of the document processing. + */ + if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) + ctxt->sax->endDocument(ctxt->userData); + + if (! ctxt->wellFormed) return(-1); + return(0); +} + + +/************************************************************************ + * * + * Parser contexts handling * + * * + ************************************************************************/ + +/** + * Initialize a parser context + * + * @param ctxt an HTML parser context + * @param sax SAX handler + * @param userData user data + * @returns 0 in case of success and -1 in case of error + */ +static int +htmlInitParserCtxt(htmlParserCtxtPtr ctxt, const htmlSAXHandler *sax, + void *userData) +{ +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + size_t initialNodeTabSize = 1; +#else + size_t initialNodeTabSize = 10; +#endif + + if (ctxt == NULL) return(-1); + memset(ctxt, 0, sizeof(htmlParserCtxt)); + + ctxt->dict = xmlDictCreate(); + if (ctxt->dict == NULL) + return(-1); + + if (ctxt->sax == NULL) + ctxt->sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler)); + if (ctxt->sax == NULL) + return(-1); + if (sax == NULL) { + memset(ctxt->sax, 0, sizeof(htmlSAXHandler)); + xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax); + ctxt->userData = ctxt; + } else { + memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler)); + ctxt->userData = userData ? userData : ctxt; + } + + /* Allocate the Input stack */ + ctxt->inputTab = (htmlParserInputPtr *) + xmlMalloc(sizeof(htmlParserInputPtr)); + if (ctxt->inputTab == NULL) + return(-1); + ctxt->inputNr = 0; + ctxt->inputMax = 1; + ctxt->input = NULL; + ctxt->version = NULL; + ctxt->encoding = NULL; + ctxt->standalone = -1; + ctxt->instate = XML_PARSER_START; + + /* Allocate the Node stack */ + ctxt->nodeTab = xmlMalloc(initialNodeTabSize * sizeof(htmlNodePtr)); + if (ctxt->nodeTab == NULL) + return(-1); + ctxt->nodeNr = 0; + ctxt->nodeMax = initialNodeTabSize; + ctxt->node = NULL; + + /* Allocate the Name stack */ + ctxt->nameTab = xmlMalloc(initialNodeTabSize * sizeof(xmlChar *)); + if (ctxt->nameTab == NULL) + return(-1); + ctxt->nameNr = 0; + ctxt->nameMax = initialNodeTabSize; + ctxt->name = NULL; + + ctxt->nodeInfoTab = NULL; + ctxt->nodeInfoNr = 0; + ctxt->nodeInfoMax = 0; + + ctxt->myDoc = NULL; + ctxt->wellFormed = 1; + ctxt->replaceEntities = 0; + ctxt->keepBlanks = xmlKeepBlanksDefaultValue; + ctxt->html = INSERT_INITIAL; + ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT; + ctxt->vctxt.userData = ctxt; + ctxt->vctxt.error = xmlParserValidityError; + ctxt->vctxt.warning = xmlParserValidityWarning; + ctxt->record_info = 0; + ctxt->validate = 0; + ctxt->checkIndex = 0; + ctxt->catalogs = NULL; + xmlInitNodeInfoSeq(&ctxt->node_seq); + return(0); +} + +/** + * Free all the memory used by a parser context. However the parsed + * document in `ctxt->myDoc` is not freed. + * + * @param ctxt an HTML parser context + */ +void +htmlFreeParserCtxt(htmlParserCtxt *ctxt) +{ + xmlFreeParserCtxt(ctxt); +} + +/** + * Allocate and initialize a new HTML parser context. + * + * This can be used to parse HTML documents into DOM trees with + * functions like #xmlCtxtReadFile or #xmlCtxtReadMemory. + * + * See #htmlCtxtUseOptions for parser options. + * + * See #xmlCtxtSetErrorHandler for advanced error handling. + * + * See #htmlNewSAXParserCtxt for custom SAX parsers. + * + * @returns the htmlParserCtxt or NULL in case of allocation error + */ +htmlParserCtxt * +htmlNewParserCtxt(void) +{ + return(htmlNewSAXParserCtxt(NULL, NULL)); +} + +/** + * Allocate and initialize a new HTML SAX parser context. If `userData` + * is NULL, the parser context will be passed as user data. + * + * @since 2.11.0 + * + * If you want support older versions, it's best to invoke + * #htmlNewParserCtxt and set `ctxt->sax` with struct assignment. + * + * Also see #htmlNewParserCtxt. + * + * @param sax SAX handler + * @param userData user data + * @returns the htmlParserCtxt or NULL in case of allocation error + */ +htmlParserCtxt * +htmlNewSAXParserCtxt(const htmlSAXHandler *sax, void *userData) +{ + xmlParserCtxtPtr ctxt; + + xmlInitParser(); + + ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt)); + if (ctxt == NULL) + return(NULL); + memset(ctxt, 0, sizeof(xmlParserCtxt)); + if (htmlInitParserCtxt(ctxt, sax, userData) < 0) { + htmlFreeParserCtxt(ctxt); + return(NULL); + } + return(ctxt); +} + +static htmlParserCtxtPtr +htmlCreateMemoryParserCtxtInternal(const char *url, + const char *buffer, size_t size, + const char *encoding) { + xmlParserCtxtPtr ctxt; + xmlParserInputPtr input; + + if (buffer == NULL) + return(NULL); + + ctxt = htmlNewParserCtxt(); + if (ctxt == NULL) + return(NULL); + + input = xmlCtxtNewInputFromMemory(ctxt, url, buffer, size, encoding, 0); + if (input == NULL) { + xmlFreeParserCtxt(ctxt); + return(NULL); + } + + if (xmlCtxtPushInput(ctxt, input) < 0) { + xmlFreeInputStream(input); + xmlFreeParserCtxt(ctxt); + return(NULL); + } + + return(ctxt); +} + +/** + * Create a parser context for an HTML in-memory document. The input + * buffer must not contain any terminating null bytes. + * + * @deprecated Use #htmlNewParserCtxt and #htmlCtxtReadMemory. + * + * @param buffer a pointer to a char array + * @param size the size of the array + * @returns the new parser context or NULL + */ +htmlParserCtxt * +htmlCreateMemoryParserCtxt(const char *buffer, int size) { + if (size <= 0) + return(NULL); + + return(htmlCreateMemoryParserCtxtInternal(NULL, buffer, size, NULL)); +} + +/** + * Create a parser context for a null-terminated string. + * + * @param str a pointer to an array of xmlChar + * @param url URL of the document (optional) + * @param encoding encoding (optional) + * @returns the new parser context or NULL if a memory allocation failed. + */ +static htmlParserCtxtPtr +htmlCreateDocParserCtxt(const xmlChar *str, const char *url, + const char *encoding) { + xmlParserCtxtPtr ctxt; + xmlParserInputPtr input; + + if (str == NULL) + return(NULL); + + ctxt = htmlNewParserCtxt(); + if (ctxt == NULL) + return(NULL); + + input = xmlCtxtNewInputFromString(ctxt, url, (const char *) str, + encoding, 0); + if (input == NULL) { + xmlFreeParserCtxt(ctxt); + return(NULL); + } + + if (xmlCtxtPushInput(ctxt, input) < 0) { + xmlFreeInputStream(input); + xmlFreeParserCtxt(ctxt); + return(NULL); + } + + return(ctxt); +} + +#ifdef LIBXML_PUSH_ENABLED +/************************************************************************ + * * + * Progressive parsing interfaces * + * * + ************************************************************************/ + +typedef enum { + LSTATE_TAG_NAME = 0, + LSTATE_BEFORE_ATTR_NAME, + LSTATE_ATTR_NAME, + LSTATE_AFTER_ATTR_NAME, + LSTATE_BEFORE_ATTR_VALUE, + LSTATE_ATTR_VALUE_DQUOTED, + LSTATE_ATTR_VALUE_SQUOTED, + LSTATE_ATTR_VALUE_UNQUOTED +} xmlLookupStates; + +/** + * Check whether there's enough data in the input buffer to finish parsing + * a tag. This has to take quotes into account. + * + * @param ctxt an HTML parser context + */ +static int +htmlParseLookupGt(xmlParserCtxtPtr ctxt) { + const xmlChar *cur; + const xmlChar *end = ctxt->input->end; + int state = ctxt->endCheckState; + size_t index; + + if (ctxt->checkIndex == 0) + cur = ctxt->input->cur + 2; /* Skip 'input->cur + ctxt->checkIndex; + + while (cur < end) { + int c = *cur++; + + if (state != LSTATE_ATTR_VALUE_SQUOTED && + state != LSTATE_ATTR_VALUE_DQUOTED) { + if (c == '/' && + state != LSTATE_BEFORE_ATTR_VALUE && + state != LSTATE_ATTR_VALUE_UNQUOTED) { + state = LSTATE_BEFORE_ATTR_NAME; + continue; + } else if (c == '>') { + ctxt->checkIndex = 0; + ctxt->endCheckState = 0; + return(0); + } + } + + switch (state) { + case LSTATE_TAG_NAME: + if (IS_WS_HTML(c)) + state = LSTATE_BEFORE_ATTR_NAME; + break; + + case LSTATE_BEFORE_ATTR_NAME: + if (!IS_WS_HTML(c)) + state = LSTATE_ATTR_NAME; + break; + + case LSTATE_ATTR_NAME: + if (c == '=') + state = LSTATE_BEFORE_ATTR_VALUE; + else if (IS_WS_HTML(c)) + state = LSTATE_AFTER_ATTR_NAME; + break; + + case LSTATE_AFTER_ATTR_NAME: + if (c == '=') + state = LSTATE_BEFORE_ATTR_VALUE; + else if (!IS_WS_HTML(c)) + state = LSTATE_ATTR_NAME; + break; + + case LSTATE_BEFORE_ATTR_VALUE: + if (c == '"') + state = LSTATE_ATTR_VALUE_DQUOTED; + else if (c == '\'') + state = LSTATE_ATTR_VALUE_SQUOTED; + else if (!IS_WS_HTML(c)) + state = LSTATE_ATTR_VALUE_UNQUOTED; + break; + + case LSTATE_ATTR_VALUE_DQUOTED: + if (c == '"') + state = LSTATE_BEFORE_ATTR_NAME; + break; + + case LSTATE_ATTR_VALUE_SQUOTED: + if (c == '\'') + state = LSTATE_BEFORE_ATTR_NAME; + break; + + case LSTATE_ATTR_VALUE_UNQUOTED: + if (IS_WS_HTML(c)) + state = LSTATE_BEFORE_ATTR_NAME; + break; + } + } + + index = cur - ctxt->input->cur; + if (index > LONG_MAX) { + ctxt->checkIndex = 0; + ctxt->endCheckState = 0; + return(0); + } + ctxt->checkIndex = index; + ctxt->endCheckState = state; + return(-1); +} + +/** + * Check whether the input buffer contains a string. + * + * @param ctxt an XML parser context + * @param startDelta delta to apply at the start + * @param str string + * @param strLen length of string + * @param extraLen extra length + */ +static int +htmlParseLookupString(xmlParserCtxtPtr ctxt, size_t startDelta, + const char *str, size_t strLen, size_t extraLen) { + const xmlChar *end = ctxt->input->end; + const xmlChar *cur, *term; + size_t index, rescan; + int ret; + + if (ctxt->checkIndex == 0) { + cur = ctxt->input->cur + startDelta; + } else { + cur = ctxt->input->cur + ctxt->checkIndex; + } + + term = BAD_CAST strstr((const char *) cur, str); + if ((term != NULL) && + ((size_t) (ctxt->input->end - term) >= extraLen + 1)) { + ctxt->checkIndex = 0; + + if (term - ctxt->input->cur > INT_MAX / 2) + ret = INT_MAX / 2; + else + ret = term - ctxt->input->cur; + + return(ret); + } + + /* Rescan (strLen + extraLen - 1) characters. */ + rescan = strLen + extraLen - 1; + if ((size_t) (end - cur) <= rescan) + end = cur; + else + end -= rescan; + index = end - ctxt->input->cur; + if (index > INT_MAX / 2) { + ctxt->checkIndex = 0; + ret = INT_MAX / 2; + } else { + ctxt->checkIndex = index; + ret = -1; + } + + return(ret); +} + +/** + * Try to find a comment end tag in the input stream + * The search includes "-->" as well as WHATWG-recommended + * incorrectly-closed tags. + * + * @param ctxt an HTML parser context + * @returns the index to the current parsing point if the full + * sequence is available, -1 otherwise. + */ +static int +htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt) +{ + int mark = 0; + int offset; + + while (1) { + mark = htmlParseLookupString(ctxt, 2, "--", 2, 0); + if (mark < 0) + break; + /* + * is a complete comment, but + * is not + * is + */ + if ((NXT(mark+2) == '>') || + ((mark >= 4) && (NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) { + ctxt->checkIndex = 0; + break; + } + offset = (NXT(mark+2) == '!') ? 3 : 2; + if (mark + offset >= ctxt->input->end - ctxt->input->cur) { + ctxt->checkIndex = mark; + return(-1); + } + ctxt->checkIndex = mark + 1; + } + return mark; +} + + +/** + * Try to progress on parsing + * + * @param ctxt an HTML parser context + * @param terminate last chunk indicator + * @returns zero if no parsing was possible + */ +static void +htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { + while (PARSER_STOPPED(ctxt) == 0) { + htmlParserInputPtr in; + size_t avail; + + in = ctxt->input; + if (in == NULL) break; + avail = in->end - in->cur; + + switch (ctxt->instate) { + case XML_PARSER_EOF: + /* + * Document parsing is done ! + */ + return; + + case XML_PARSER_START: + /* + * Very first chars read from the document flow. + */ + if ((!terminate) && (avail < 4)) + return; + + xmlDetectEncoding(ctxt); + + /* + * TODO: Implement HTML5 prescan algorithm + */ + + /* + * This is wrong but matches long-standing behavior. In most + * cases, a document starting with an XML declaration will + * specify UTF-8. The HTML5 prescan algorithm handles + * XML declarations in a better way. + */ + if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) && + (xmlStrncmp(ctxt->input->cur, BAD_CAST "sax) && (ctxt->sax->setDocumentLocator)) { + ctxt->sax->setDocumentLocator(ctxt->userData, + (xmlSAXLocator *) &xmlDefaultSAXLocator); + } + if ((ctxt->sax) && (ctxt->sax->startDocument) && + (!ctxt->disableSAX)) + ctxt->sax->startDocument(ctxt->userData); + + /* Allow callback to modify state for tests */ + if ((ctxt->instate == XML_PARSER_START) || + (ctxt->instate == XML_PARSER_XML_DECL)) + ctxt->instate = XML_PARSER_MISC; + break; + + case XML_PARSER_START_TAG: + if ((!terminate) && + (htmlParseLookupGt(ctxt) < 0)) + return; + + htmlParseElementInternal(ctxt); + + ctxt->instate = XML_PARSER_CONTENT; + break; + + case XML_PARSER_MISC: /* initial */ + case XML_PARSER_PROLOG: /* before html */ + case XML_PARSER_CONTENT: { + int mode; + + if ((ctxt->instate == XML_PARSER_MISC) || + (ctxt->instate == XML_PARSER_PROLOG)) { + SKIP_BLANKS; + avail = in->end - in->cur; + } + + if (avail < 1) + return; + /* + * Note that endCheckState is also used by + * xmlParseLookupGt. + */ + mode = ctxt->endCheckState; + + if (mode != 0) { + if (htmlParseCharData(ctxt, !terminate) == 0) + return; + } else if (in->cur[0] == '<') { + int next; + + if (avail < 2) { + if (!terminate) + return; + next = ' '; + } else { + next = in->cur[1]; + } + + if (next == '!') { + if ((!terminate) && (avail < 4)) + return; + if ((in->cur[2] == '-') && (in->cur[3] == '-')) { + if ((!terminate) && + (htmlParseLookupCommentEnd(ctxt) < 0)) + return; + SKIP(4); + htmlParseComment(ctxt, /* bogus */ 0); + /* don't change state */ + break; + } + + if ((!terminate) && (avail < 9)) + return; + if ((UPP(2) == 'D') && (UPP(3) == 'O') && + (UPP(4) == 'C') && (UPP(5) == 'T') && + (UPP(6) == 'Y') && (UPP(7) == 'P') && + (UPP(8) == 'E')) { + if ((!terminate) && + (htmlParseLookupString(ctxt, 9, ">", 1, + 0) < 0)) + return; + htmlParseDocTypeDecl(ctxt); + if (ctxt->instate == XML_PARSER_MISC) + ctxt->instate = XML_PARSER_PROLOG; + } else { + if ((!terminate) && + (htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0)) + return; + SKIP(2); + htmlParseComment(ctxt, /* bogus */ 1); + } + } else if (next == '?') { + if ((!terminate) && + (htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0)) + return; + SKIP(1); + htmlParseComment(ctxt, /* bogus */ 1); + /* don't change state */ + } else if (next == '/') { + ctxt->instate = XML_PARSER_END_TAG; + ctxt->checkIndex = 0; + } else if (IS_ASCII_LETTER(next)) { + ctxt->instate = XML_PARSER_START_TAG; + ctxt->checkIndex = 0; + } else { + ctxt->instate = XML_PARSER_CONTENT; + htmlStartCharData(ctxt); + if ((ctxt->sax != NULL) && (!ctxt->disableSAX) && + (ctxt->sax->characters != NULL)) + ctxt->sax->characters(ctxt->userData, + BAD_CAST "<", 1); + SKIP(1); + } + } else { + ctxt->instate = XML_PARSER_CONTENT; + /* + * We follow the logic of the XML push parser + */ + if (avail < HTML_PARSER_BIG_BUFFER_SIZE) { + if ((!terminate) && + (htmlParseLookupString(ctxt, 0, "<", 1, 0) < 0)) + return; + } + ctxt->checkIndex = 0; + if (htmlParseCharData(ctxt, !terminate) == 0) + return; + } + + break; + } + + case XML_PARSER_END_TAG: + if ((!terminate) && + (htmlParseLookupGt(ctxt) < 0)) + return; + htmlParseEndTag(ctxt); + ctxt->instate = XML_PARSER_CONTENT; + ctxt->checkIndex = 0; + break; + + default: + htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, + "HPP: internal error\n", NULL, NULL); + ctxt->instate = XML_PARSER_EOF; + break; + } + } +} + +/** + * Parse a chunk of memory in push parser mode. + * + * Assumes that the parser context was initialized with + * #htmlCreatePushParserCtxt. + * + * The last chunk, which will often be empty, must be marked with + * the `terminate` flag. With the default SAX callbacks, the resulting + * document will be available in `ctxt->myDoc`. This pointer will not + * be freed by the library. + * + * If the document isn't well-formed, `ctxt->myDoc` is set to NULL. + * + * Since 2.14.0, #xmlCtxtGetDocument can be used to retrieve the + * result document. + * + * @param ctxt an HTML parser context + * @param chunk chunk of memory + * @param size size of chunk in bytes + * @param terminate last chunk indicator + * @returns an xmlParserErrors code (0 on success). + */ +int +htmlParseChunk(htmlParserCtxt *ctxt, const char *chunk, int size, + int terminate) { + if ((ctxt == NULL) || + (ctxt->input == NULL) || (ctxt->input->buf == NULL) || + (size < 0) || + ((size > 0) && (chunk == NULL))) + return(XML_ERR_ARGUMENT); + if (PARSER_STOPPED(ctxt) != 0) + return(ctxt->errNo); + + if (size > 0) { + size_t pos = ctxt->input->cur - ctxt->input->base; + int res; + + res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk); + xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos); + if (res < 0) { + htmlParseErr(ctxt, ctxt->input->buf->error, + "xmlParserInputBufferPush failed", NULL, NULL); + return (ctxt->errNo); + } + } + + htmlParseTryOrFinish(ctxt, terminate); + + if ((terminate) && (ctxt->instate != XML_PARSER_EOF)) { + htmlAutoCloseOnEnd(ctxt); + + /* + * Only check for truncated multi-byte sequences + */ + xmlParserCheckEOF(ctxt, XML_ERR_INTERNAL_ERROR); + + if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) + ctxt->sax->endDocument(ctxt->userData); + + ctxt->instate = XML_PARSER_EOF; + } + + return((xmlParserErrors) ctxt->errNo); +} + +/************************************************************************ + * * + * User entry points * + * * + ************************************************************************/ + +/** + * Create a parser context for using the HTML parser in push mode. + * + * @param sax a SAX handler (optional) + * @param user_data The user data returned on SAX callbacks (optional) + * @param chunk a pointer to an array of chars (optional) + * @param size number of chars in the array + * @param filename only used for error reporting (optional) + * @param enc encoding (deprecated, pass XML_CHAR_ENCODING_NONE) + * @returns the new parser context or NULL if a memory allocation + * failed. + */ +htmlParserCtxt * +htmlCreatePushParserCtxt(htmlSAXHandler *sax, void *user_data, + const char *chunk, int size, const char *filename, + xmlCharEncoding enc) { + htmlParserCtxtPtr ctxt; + htmlParserInputPtr input; + const char *encoding; + + ctxt = htmlNewSAXParserCtxt(sax, user_data); + if (ctxt == NULL) + return(NULL); + + encoding = xmlGetCharEncodingName(enc); + input = xmlNewPushInput(filename, chunk, size); + if (input == NULL) { + htmlFreeParserCtxt(ctxt); + return(NULL); + } + + if (xmlCtxtPushInput(ctxt, input) < 0) { + xmlFreeInputStream(input); + xmlFreeParserCtxt(ctxt); + return(NULL); + } + + if (encoding != NULL) + xmlSwitchEncodingName(ctxt, encoding); + + return(ctxt); +} +#endif /* LIBXML_PUSH_ENABLED */ + +/** + * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks + * to handle parse events. If sax is NULL, fallback to the default DOM + * behavior and return a tree. + * + * @deprecated Use #htmlNewSAXParserCtxt and #htmlCtxtReadDoc. + * + * @param cur a pointer to an array of xmlChar + * @param encoding a free form C string describing the HTML document encoding, or NULL + * @param sax the SAX handler block + * @param userData if using SAX, this pointer will be provided on callbacks. + * @returns the resulting document tree unless SAX is NULL or the document is + * not well formed. + */ + +xmlDoc * +htmlSAXParseDoc(const xmlChar *cur, const char *encoding, + htmlSAXHandler *sax, void *userData) { + htmlDocPtr ret; + htmlParserCtxtPtr ctxt; + + if (cur == NULL) + return(NULL); + + ctxt = htmlCreateDocParserCtxt(cur, NULL, encoding); + if (ctxt == NULL) + return(NULL); + + if (sax != NULL) { + *ctxt->sax = *sax; + ctxt->userData = userData; + } + + htmlParseDocument(ctxt); + ret = ctxt->myDoc; + htmlFreeParserCtxt(ctxt); + + return(ret); +} + +/** + * Parse an HTML in-memory document and build a tree. + * + * @deprecated Use #htmlReadDoc. + * + * This function uses deprecated global parser options. + * + * @param cur a pointer to an array of xmlChar + * @param encoding the encoding (optional) + * @returns the resulting document tree + */ + +xmlDoc * +htmlParseDoc(const xmlChar *cur, const char *encoding) { + return(htmlSAXParseDoc(cur, encoding, NULL, NULL)); +} + + +/** + * Create a parser context to read from a file. + * + * @deprecated Use #htmlNewParserCtxt and #htmlCtxtReadFile. + * + * A non-NULL encoding overrides encoding declarations in the document. + * + * Automatic support for ZLIB/Compress compressed document is provided + * by default if found at compile-time. + * + * @param filename the filename + * @param encoding optional encoding + * @returns the new parser context or NULL if a memory allocation failed. + */ +htmlParserCtxt * +htmlCreateFileParserCtxt(const char *filename, const char *encoding) +{ + htmlParserCtxtPtr ctxt; + htmlParserInputPtr input; + + if (filename == NULL) + return(NULL); + + ctxt = htmlNewParserCtxt(); + if (ctxt == NULL) { + return(NULL); + } + + input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0); + if (input == NULL) { + xmlFreeParserCtxt(ctxt); + return(NULL); + } + if (xmlCtxtPushInput(ctxt, input) < 0) { + xmlFreeInputStream(input); + xmlFreeParserCtxt(ctxt); + return(NULL); + } + + return(ctxt); +} + +/** + * parse an HTML file and build a tree. Automatic support for ZLIB/Compress + * compressed document is provided by default if found at compile-time. + * It use the given SAX function block to handle the parsing callback. + * If sax is NULL, fallback to the default DOM tree building routines. + * + * @deprecated Use #htmlNewSAXParserCtxt and #htmlCtxtReadFile. + * + * @param filename the filename + * @param encoding encoding (optional) + * @param sax the SAX handler block + * @param userData if using SAX, this pointer will be provided on callbacks. + * @returns the resulting document tree unless SAX is NULL or the document is + * not well formed. + */ + +xmlDoc * +htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandler *sax, + void *userData) { + htmlDocPtr ret; + htmlParserCtxtPtr ctxt; + htmlSAXHandlerPtr oldsax = NULL; + + ctxt = htmlCreateFileParserCtxt(filename, encoding); + if (ctxt == NULL) return(NULL); + if (sax != NULL) { + oldsax = ctxt->sax; + ctxt->sax = sax; + ctxt->userData = userData; + } + + htmlParseDocument(ctxt); + + ret = ctxt->myDoc; + if (sax != NULL) { + ctxt->sax = oldsax; + ctxt->userData = NULL; + } + htmlFreeParserCtxt(ctxt); + + return(ret); +} + +/** + * Parse an HTML file and build a tree. + * + * @param filename the filename + * @param encoding encoding (optional) + * @returns the resulting document tree + */ + +xmlDoc * +htmlParseFile(const char *filename, const char *encoding) { + return(htmlSAXParseFile(filename, encoding, NULL, NULL)); +} + +/** + * Set and return the previous value for handling HTML omitted tags. + * + * @deprecated Use HTML_PARSE_NOIMPLIED + * + * @param val int 0 or 1 + * @returns the last value for 0 for no handling, 1 for auto insertion. + */ + +int +htmlHandleOmittedElem(int val) { + int old = htmlOmittedDefaultValue; + + htmlOmittedDefaultValue = val; + return(old); +} + +/** + * @deprecated Don't use. + * + * @param parent HTML parent element + * @param elt HTML element + * @returns 1 + */ +int +htmlElementAllowedHere(const htmlElemDesc* parent ATTRIBUTE_UNUSED, + const xmlChar* elt ATTRIBUTE_UNUSED) { + return(1); +} + +/** + * @deprecated Don't use. + * + * @param parent HTML parent element + * @param elt HTML element + * @returns HTML_VALID + */ +htmlStatus +htmlElementStatusHere(const htmlElemDesc* parent ATTRIBUTE_UNUSED, + const htmlElemDesc* elt ATTRIBUTE_UNUSED) { + return(HTML_VALID); +} + +/** + * @deprecated Don't use. + * + * @param elt HTML element + * @param attr HTML attribute + * @param legacy whether to allow deprecated attributes + * @returns HTML_VALID + */ +htmlStatus +htmlAttrAllowed(const htmlElemDesc* elt ATTRIBUTE_UNUSED, + const xmlChar* attr ATTRIBUTE_UNUSED, + int legacy ATTRIBUTE_UNUSED) { + return(HTML_VALID); +} + +/** + * @deprecated Don't use. + * + * @param node an xmlNode in a tree + * @param legacy whether to allow deprecated elements (YES is faster here + * for Element nodes) + * @returns HTML_VALID + */ +htmlStatus +htmlNodeStatus(xmlNode *node ATTRIBUTE_UNUSED, + int legacy ATTRIBUTE_UNUSED) { + return(HTML_VALID); +} + +/************************************************************************ + * * + * New set (2.6.0) of simpler and more flexible APIs * + * * + ************************************************************************/ + +/** + * Reset a parser context + * + * Same as #xmlCtxtReset. + * + * @param ctxt an HTML parser context + */ +void +htmlCtxtReset(htmlParserCtxt *ctxt) +{ + xmlCtxtReset(ctxt); +} + +static int +htmlCtxtSetOptionsInternal(xmlParserCtxtPtr ctxt, int options, int keepMask) +{ + int allMask; + + if (ctxt == NULL) + return(-1); + + allMask = HTML_PARSE_RECOVER | + HTML_PARSE_HTML5 | + HTML_PARSE_NODEFDTD | + HTML_PARSE_NOERROR | + HTML_PARSE_NOWARNING | + HTML_PARSE_PEDANTIC | + HTML_PARSE_NOBLANKS | + HTML_PARSE_NONET | + HTML_PARSE_NOIMPLIED | + HTML_PARSE_COMPACT | + HTML_PARSE_HUGE | + HTML_PARSE_IGNORE_ENC | + HTML_PARSE_BIG_LINES; + + ctxt->options = (ctxt->options & keepMask) | (options & allMask); + + /* + * For some options, struct members are historically the source + * of truth. See xmlCtxtSetOptionsInternal. + */ + ctxt->keepBlanks = (options & HTML_PARSE_NOBLANKS) ? 0 : 1; + + /* + * Recover from character encoding errors + */ + ctxt->recovery = 1; + + /* + * Changing SAX callbacks is a bad idea. This should be fixed. + */ + if (options & HTML_PARSE_NOBLANKS) { + ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace; + } + if (options & HTML_PARSE_HUGE) { + if (ctxt->dict != NULL) + xmlDictSetLimit(ctxt->dict, 0); + } + + /* + * It would be useful to allow this feature. + */ + ctxt->dictNames = 0; + + /* + * Allow XML_PARSE_NOENT which many users set on the HTML parser. + */ + return(options & ~allMask & ~XML_PARSE_NOENT); +} + +/** + * Applies the options to the parser context. Unset options are + * cleared. + * + * @since 2.14.0 + * + * With older versions, you can use #htmlCtxtUseOptions. + * + * @param ctxt an HTML parser context + * @param options a bitmask of htmlParserOption values + * @returns 0 in case of success, the set of unknown or unimplemented options + * in case of error. + */ +int +htmlCtxtSetOptions(htmlParserCtxt *ctxt, int options) +{ + return(htmlCtxtSetOptionsInternal(ctxt, options, 0)); +} + +/** + * Applies the options to the parser context. The following options + * are never cleared and can only be enabled: + * + * @deprecated Use #htmlCtxtSetOptions. + * + * - HTML_PARSE_NODEFDTD + * - HTML_PARSE_NOERROR + * - HTML_PARSE_NOWARNING + * - HTML_PARSE_NOIMPLIED + * - HTML_PARSE_COMPACT + * - HTML_PARSE_HUGE + * - HTML_PARSE_IGNORE_ENC + * - HTML_PARSE_BIG_LINES + * + * @param ctxt an HTML parser context + * @param options a combination of htmlParserOption values + * @returns 0 in case of success, the set of unknown or unimplemented options + * in case of error. + */ +int +htmlCtxtUseOptions(htmlParserCtxt *ctxt, int options) +{ + int keepMask; + + /* + * For historic reasons, some options can only be enabled. + */ + keepMask = HTML_PARSE_NODEFDTD | + HTML_PARSE_NOERROR | + HTML_PARSE_NOWARNING | + HTML_PARSE_NOIMPLIED | + HTML_PARSE_COMPACT | + HTML_PARSE_HUGE | + HTML_PARSE_IGNORE_ENC | + HTML_PARSE_BIG_LINES; + + return(htmlCtxtSetOptionsInternal(ctxt, options, keepMask)); +} + +/** + * Parse an HTML document and return the resulting document tree. + * + * @since 2.13.0 + * + * @param ctxt an HTML parser context + * @param input parser input + * @returns the resulting document tree or NULL + */ +xmlDoc * +htmlCtxtParseDocument(htmlParserCtxt *ctxt, xmlParserInput *input) +{ + htmlDocPtr ret; + + if ((ctxt == NULL) || (input == NULL)) { + xmlFatalErr(ctxt, XML_ERR_ARGUMENT, NULL); + xmlFreeInputStream(input); + return(NULL); + } + + /* assert(ctxt->inputNr == 0); */ + while (ctxt->inputNr > 0) + xmlFreeInputStream(xmlCtxtPopInput(ctxt)); + + if (xmlCtxtPushInput(ctxt, input) < 0) { + xmlFreeInputStream(input); + return(NULL); + } + + ctxt->html = INSERT_INITIAL; + htmlParseDocument(ctxt); + + ret = xmlCtxtGetDocument(ctxt); + + /* assert(ctxt->inputNr == 1); */ + while (ctxt->inputNr > 0) + xmlFreeInputStream(xmlCtxtPopInput(ctxt)); + + return(ret); +} + +/** + * Convenience function to parse an HTML document from a zero-terminated + * string. + * + * See #htmlCtxtReadDoc for details. + * + * @param str a pointer to a zero terminated string + * @param url only used for error reporting (optoinal) + * @param encoding the document encoding (optional) + * @param options a combination of htmlParserOption values + * @returns the resulting document tree. + */ +xmlDoc * +htmlReadDoc(const xmlChar *str, const char *url, const char *encoding, + int options) +{ + htmlParserCtxtPtr ctxt; + xmlParserInputPtr input; + htmlDocPtr doc = NULL; + + ctxt = htmlNewParserCtxt(); + if (ctxt == NULL) + return(NULL); + + htmlCtxtUseOptions(ctxt, options); + + input = xmlCtxtNewInputFromString(ctxt, url, (const char *) str, encoding, + XML_INPUT_BUF_STATIC); + + if (input != NULL) + doc = htmlCtxtParseDocument(ctxt, input); + + htmlFreeParserCtxt(ctxt); + return(doc); +} + +/** + * Convenience function to parse an HTML file from the filesystem, + * the network or a global user-defined resource loader. + * + * See #htmlCtxtReadFile for details. + * + * @param filename a file or URL + * @param encoding the document encoding (optional) + * @param options a combination of htmlParserOption values + * @returns the resulting document tree. + */ +xmlDoc * +htmlReadFile(const char *filename, const char *encoding, int options) +{ + htmlParserCtxtPtr ctxt; + xmlParserInputPtr input; + htmlDocPtr doc = NULL; + + ctxt = htmlNewParserCtxt(); + if (ctxt == NULL) + return(NULL); + + htmlCtxtUseOptions(ctxt, options); + + input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0); + + if (input != NULL) + doc = htmlCtxtParseDocument(ctxt, input); + + htmlFreeParserCtxt(ctxt); + return(doc); +} + +/** + * Convenience function to parse an HTML document from memory. + * The input buffer must not contain any terminating null bytes. + * + * See #htmlCtxtReadMemory for details. + * + * @param buffer a pointer to a char array + * @param size the size of the array + * @param url only used for error reporting (optional) + * @param encoding the document encoding, or NULL + * @param options a combination of htmlParserOption values + * @returns the resulting document tree + */ +xmlDoc * +htmlReadMemory(const char *buffer, int size, const char *url, + const char *encoding, int options) +{ + htmlParserCtxtPtr ctxt; + xmlParserInputPtr input; + htmlDocPtr doc = NULL; + + if (size < 0) + return(NULL); + + ctxt = htmlNewParserCtxt(); + if (ctxt == NULL) + return(NULL); + + htmlCtxtUseOptions(ctxt, options); + + input = xmlCtxtNewInputFromMemory(ctxt, url, buffer, size, encoding, + XML_INPUT_BUF_STATIC); + + if (input != NULL) + doc = htmlCtxtParseDocument(ctxt, input); + + htmlFreeParserCtxt(ctxt); + return(doc); +} + +/** + * Convenience function to parse an HTML document from a + * file descriptor. + * + * NOTE that the file descriptor will not be closed when the + * context is freed or reset. + * + * See #htmlCtxtReadFd for details. + * + * @param fd an open file descriptor + * @param url only used for error reporting (optional) + * @param encoding the document encoding, or NULL + * @param options a combination of htmlParserOption values + * @returns the resulting document tree + */ +xmlDoc * +htmlReadFd(int fd, const char *url, const char *encoding, int options) +{ + htmlParserCtxtPtr ctxt; + xmlParserInputPtr input; + htmlDocPtr doc = NULL; + + ctxt = htmlNewParserCtxt(); + if (ctxt == NULL) + return(NULL); + + htmlCtxtUseOptions(ctxt, options); + + input = xmlCtxtNewInputFromFd(ctxt, url, fd, encoding, 0); + + if (input != NULL) + doc = htmlCtxtParseDocument(ctxt, input); + + htmlFreeParserCtxt(ctxt); + return(doc); +} + +/** + * Convenience function to parse an HTML document from I/O functions + * and context. + * + * See #htmlCtxtReadIO for details. + * + * @param ioread an I/O read function + * @param ioclose an I/O close function (optional) + * @param ioctx an I/O handler + * @param url only used for error reporting (optional) + * @param encoding the document encoding (optional) + * @param options a combination of htmlParserOption values + * @returns the resulting document tree + */ +xmlDoc * +htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, + void *ioctx, const char *url, const char *encoding, int options) +{ + htmlParserCtxtPtr ctxt; + xmlParserInputPtr input; + htmlDocPtr doc = NULL; + + ctxt = htmlNewParserCtxt(); + if (ctxt == NULL) + return (NULL); + + htmlCtxtUseOptions(ctxt, options); + + input = xmlCtxtNewInputFromIO(ctxt, url, ioread, ioclose, ioctx, + encoding, 0); + + if (input != NULL) + doc = htmlCtxtParseDocument(ctxt, input); + + htmlFreeParserCtxt(ctxt); + return(doc); +} + +/** + * Parse an HTML in-memory document and build a tree. + * + * See #htmlCtxtUseOptions for details. + * + * @param ctxt an HTML parser context + * @param str a pointer to a zero terminated string + * @param URL only used for error reporting (optional) + * @param encoding the document encoding (optional) + * @param options a combination of htmlParserOption values + * @returns the resulting document tree + */ +xmlDoc * +htmlCtxtReadDoc(xmlParserCtxt *ctxt, const xmlChar *str, + const char *URL, const char *encoding, int options) +{ + xmlParserInputPtr input; + + if (ctxt == NULL) + return (NULL); + + htmlCtxtReset(ctxt); + htmlCtxtUseOptions(ctxt, options); + + input = xmlCtxtNewInputFromString(ctxt, URL, (const char *) str, + encoding, 0); + if (input == NULL) + return(NULL); + + return(htmlCtxtParseDocument(ctxt, input)); +} + +/** + * Parse an HTML file from the filesystem, the network or a + * user-defined resource loader. + * + * See #htmlCtxtUseOptions for details. + * + * @param ctxt an HTML parser context + * @param filename a file or URL + * @param encoding the document encoding (optional) + * @param options a combination of htmlParserOption values + * @returns the resulting document tree + */ +xmlDoc * +htmlCtxtReadFile(xmlParserCtxt *ctxt, const char *filename, + const char *encoding, int options) +{ + xmlParserInputPtr input; + + if (ctxt == NULL) + return (NULL); + + htmlCtxtReset(ctxt); + htmlCtxtUseOptions(ctxt, options); + + input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0); + if (input == NULL) + return(NULL); + + return(htmlCtxtParseDocument(ctxt, input)); +} + +/** + * Parse an HTML in-memory document and build a tree. The input buffer must + * not contain any terminating null bytes. + * + * See #htmlCtxtUseOptions for details. + * + * @param ctxt an HTML parser context + * @param buffer a pointer to a char array + * @param size the size of the array + * @param URL only used for error reporting (optional) + * @param encoding the document encoding (optinal) + * @param options a combination of htmlParserOption values + * @returns the resulting document tree + */ +xmlDoc * +htmlCtxtReadMemory(xmlParserCtxt *ctxt, const char *buffer, int size, + const char *URL, const char *encoding, int options) +{ + xmlParserInputPtr input; + + if ((ctxt == NULL) || (size < 0)) + return (NULL); + + htmlCtxtReset(ctxt); + htmlCtxtUseOptions(ctxt, options); + + input = xmlCtxtNewInputFromMemory(ctxt, URL, buffer, size, encoding, + XML_INPUT_BUF_STATIC); + if (input == NULL) + return(NULL); + + return(htmlCtxtParseDocument(ctxt, input)); +} + +/** + * Parse an HTML from a file descriptor and build a tree. + * + * See #htmlCtxtUseOptions for details. + * + * NOTE that the file descriptor will not be closed when the + * context is freed or reset. + * + * @param ctxt an HTML parser context + * @param fd an open file descriptor + * @param URL only used for error reporting (optional) + * @param encoding the document encoding (optinal) + * @param options a combination of htmlParserOption values + * @returns the resulting document tree + */ +xmlDoc * +htmlCtxtReadFd(xmlParserCtxt *ctxt, int fd, + const char *URL, const char *encoding, int options) +{ + xmlParserInputPtr input; + + if (ctxt == NULL) + return(NULL); + + htmlCtxtReset(ctxt); + htmlCtxtUseOptions(ctxt, options); + + input = xmlCtxtNewInputFromFd(ctxt, URL, fd, encoding, 0); + if (input == NULL) + return(NULL); + + return(htmlCtxtParseDocument(ctxt, input)); +} + +/** + * Parse an HTML document from I/O functions and source and build a tree. + * + * See #htmlCtxtUseOptions for details. + * + * @param ctxt an HTML parser context + * @param ioread an I/O read function + * @param ioclose an I/O close function + * @param ioctx an I/O handler + * @param URL the base URL to use for the document + * @param encoding the document encoding, or NULL + * @param options a combination of htmlParserOption values + * @returns the resulting document tree + */ +xmlDoc * +htmlCtxtReadIO(xmlParserCtxt *ctxt, xmlInputReadCallback ioread, + xmlInputCloseCallback ioclose, void *ioctx, + const char *URL, + const char *encoding, int options) +{ + xmlParserInputPtr input; + + if (ctxt == NULL) + return (NULL); + + htmlCtxtReset(ctxt); + htmlCtxtUseOptions(ctxt, options); + + input = xmlCtxtNewInputFromIO(ctxt, URL, ioread, ioclose, ioctx, + encoding, 0); + if (input == NULL) + return(NULL); + + return(htmlCtxtParseDocument(ctxt, input)); +} + +#endif /* LIBXML_HTML_ENABLED */