/**
* Rehype plugin to restore limited HTML elements inside Markdown table cells.
*
* ## Problem
* The remark/rehype pipeline neutralizes inline HTML as literal text
* (remarkLiteralHtml) so that XML/HTML snippets in LLM responses display
* as-is instead of being rendered. This causes <br> and <ul> markup in
* table cells to show as plain text.
*
* ## Solution
* This plugin traverses the HAST post-conversion, parses whitelisted HTML
* patterns from text nodes, and replaces them with actual HAST element nodes
* that will be rendered as real HTML.
*
* ## Supported HTML
* - `<br>` / `<br/>` / `<br />` - Line breaks (inline)
* - `<ul><li>...</li></ul>` - Unordered lists (block)
*
* ## Key Implementation Details
*
* ### 1. Sibling Combination (Critical)
* The Markdown pipeline may fragment content across multiple text nodes and `<br>`
* elements. For example, `<ul><li>a</li></ul>` might arrive as:
* - Text: `"<ul>"`
* - Element: `<br>`
* - Text: `"<li>a</li></ul>"`
*
* We must combine consecutive text nodes and `<br>` elements into a single string
* before attempting to parse list markup. Without this, list detection fails.
*
* ### 2. visitParents for Deep Traversal
* Table cell content may be wrapped in intermediate elements (e.g., `<p>` tags).
* Using `visitParents` instead of direct child iteration ensures we find text
* nodes at any depth within the cell.
*
* ### 3. Reference Comparison for No-Op Detection
* When checking if `<br>` expansion changed anything, we compare:
* `expanded.length !== 1 || expanded[0] !== textNode`
*
* This catches both cases:
* - Multiple nodes created (text was split)
* - Single NEW node created (original had only `<br>`, now it's an element)
*
* A simple `length > 1` check would miss the single `<br>` case.
*
* ### 4. Strict List Validation
* `parseList()` rejects malformed markup by checking for garbage text between
* `<li>` elements. This prevents creating broken DOM from partial matches like
* `<ul>garbage<li>a</li></ul>`.
*
* ### 5. Newline Substitution for `<br>` in Combined String
* When combining siblings, existing `<br>` elements become `\n` in the combined
* string. This allows list content to span visual lines while still being parsed
* as a single unit.
*
* @example
* // Input Markdown:
* // | Feature | Notes |
* // |---------|-------|
* // | Multi-line | First<br>Second |
* // | List | <ul><li>A</li><li>B</li></ul> |
* //
* // Without this plugin: <br> and <ul> render as literal text
* // With this plugin: <br> becomes line break, <ul> becomes actual list
*/
import type { Plugin } from 'unified';
import type { Element, ElementContent, Root, Text } from 'hast';
import { visit } from 'unist-util-visit';
import { visitParents } from 'unist-util-visit-parents';
import { BR_PATTERN, LIST_PATTERN, LI_PATTERN } from '$lib/constants/table-html-restorer';
/**
 * Splits a string at every `BR_PATTERN` match and returns the pieces as HAST nodes.
 *
 * Each match becomes an empty `<br>` element; the non-empty text between
 * matches (and before/after them) becomes text nodes, in source order.
 * When nothing matches, the whole input is returned as a single text node,
 * even if the input is the empty string.
 *
 * @param value - Raw text that may contain literal `<br>` markup.
 * @returns Newly created nodes; the result never reuses an existing node.
 */
function expandBrTags(value: string): ElementContent[] {
	const nodes: ElementContent[] = [];
	let consumed = 0;
	let sawBreak = false;

	for (const hit of value.matchAll(BR_PATTERN)) {
		sawBreak = true;
		const start = hit.index!;

		// Emit the text that sits between the previous match and this one.
		if (start > consumed) {
			const before: Text = { type: 'text', value: value.slice(consumed, start) };
			nodes.push(before);
		}

		const lineBreak: Element = { type: 'element', tagName: 'br', properties: {}, children: [] };
		nodes.push(lineBreak);
		consumed = start + hit[0].length;
	}

	// Nothing matched: hand the input back untouched as one text node.
	if (!sawBreak) {
		const whole: Text = { type: 'text', value };
		return [whole];
	}

	// Keep whatever text follows the final match.
	if (consumed < value.length) {
		const tail: Text = { type: 'text', value: value.slice(consumed) };
		nodes.push(tail);
	}

	return nodes;
}
/**
 * Converts a literal `<ul><li>...</li></ul>` string into a HAST `<ul>` element.
 *
 * The trimmed input must match `LIST_PATTERN`; its first capture group is
 * then scanned for `LI_PATTERN` matches. Each item's first capture group is
 * run through `expandBrTags` to build the `<li>` children.
 *
 * @param value - Candidate list markup (surrounding whitespace is ignored).
 * @returns The `<ul>` element, or `null` when the input does not match, when
 *   non-whitespace text appears before, between or after the items, or when
 *   no item is found.
 */
function parseList(value: string): Element | null {
	const listMatch = value.trim().match(LIST_PATTERN);
	if (listMatch === null) return null;

	const inner = listMatch[1];
	const listItems: ElementContent[] = [];
	let offset = 0;

	for (const item of inner.matchAll(LI_PATTERN)) {
		const itemStart = item.index!;

		// Anything other than whitespace in the gap before this item is malformed markup.
		const gap = inner.slice(offset, itemStart);
		if (gap.trim() !== '') return null;

		const li = {
			type: 'element',
			tagName: 'li',
			properties: {},
			children: expandBrTags(item[1] ?? '')
		} as Element;
		listItems.push(li);

		offset = itemStart + item[0].length;
	}

	// An empty list, or leftover text after the last item, is rejected as well.
	const leftover = inner.slice(offset);
	if (listItems.length === 0 || leftover.trim() !== '') return null;

	return { type: 'element', tagName: 'ul', properties: {}, children: listItems } as Element;
}
/**
 * Restores whitelisted HTML inside a single table cell, mutating it in place.
 *
 * Every text node at any depth below `cell` is visited. For each one:
 * 1. The run of consecutive text nodes and `<br>` elements that starts at the
 *    visited node is joined into one string (each `<br>` element contributes
 *    a `\n`).
 * 2. If `parseList` accepts that string, the whole run is replaced by the
 *    resulting `<ul>` element.
 * 3. Otherwise only the visited text node is passed to `expandBrTags`, and it
 *    is replaced only when that actually produced something other than a
 *    plain copy of itself.
 *
 * @param cell - The `td` / `th` element whose descendants are rewritten.
 */
function processCell(cell: Element): void {
	visitParents(cell, 'text', (textNode: Text, ancestors) => {
		const parent = ancestors[ancestors.length - 1];
		if (!parent || parent.type !== 'element') return;

		const siblings = (parent as Element).children as ElementContent[];
		const startIndex = siblings.indexOf(textNode as ElementContent);
		if (startIndex === -1) return;

		// Combine consecutive text nodes and <br> elements into one string so
		// list markup that was fragmented across siblings can be parsed as a unit.
		let combined = '';
		let endIndex = startIndex;
		for (let i = startIndex; i < siblings.length; i++) {
			const sib = siblings[i];
			if (sib.type === 'text') {
				combined += (sib as Text).value;
			} else if (sib.type === 'element' && (sib as Element).tagName === 'br') {
				combined += '\n';
			} else {
				break;
			}
			endIndex = i;
		}

		// Try parsing as a list first; on success it replaces the entire combined range.
		const list = parseList(combined);
		if (list) {
			siblings.splice(startIndex, endIndex - startIndex + 1, list);
			return;
		}

		// Otherwise, just expand <br> tags in this text node.
		const expanded = expandBrTags(textNode.value);

		// expandBrTags always allocates fresh nodes, so comparing the result to
		// `textNode` by identity can never report "nothing changed". A result that
		// is exactly one text node means no <br> matched; keep the original node in
		// place so its `position` / `data` are not thrown away by a needless splice.
		// A single <br> element (text was only "<br>") still counts as a change.
		const unchanged = expanded.length === 1 && expanded[0]?.type === 'text';
		if (!unchanged) {
			siblings.splice(startIndex, 1, ...expanded);
		}
	});
}
/**
 * Rehype plugin entry point.
 *
 * Returns a transformer that walks every element in the tree and hands each
 * `td` / `th` element to `processCell`, which rewrites the cell in place.
 */
export const rehypeRestoreTableHtml: Plugin<[], Root> = () => {
	return (tree) => {
		visit(tree, 'element', (node: Element) => {
			const isTableCell = node.tagName === 'td' || node.tagName === 'th';
			if (!isTableCell) return;
			processCell(node);
		});
	};
};
|