/**
 * Server-side Markdown renderer for the publisher pipeline.
 *
 * Walks the same TipTap-JSON document used by `html-renderer.ts` and
 * produces a clean Markdown file conforming to the `llms.txt` convention
 * (https://llmstxt.org/). The output is meant to be consumed by LLM
 * agents and crawlers that struggle with the heavy published HTML
 * (theme bootstrap, KaTeX/Mermaid runtimes, inlined CSS, iframe-wrapped
 * D3 charts, ...).
 *
 * Conventions mirror the upstream `research-article-template` Astro
 * plugin (`app/plugins/astro/generate-llms-txt.mjs`). Listed here by the
 * TipTap node type handled in this file:
 *
 *   - `htmlEmbed`            -> `*[Interactive visualization: <title - desc>]*`
 *   - `note`                 -> blockquote
 *   - `quoteBlock`           -> blockquote + attribution
 *   - `wide` / `fullWidth`   -> unwrap content
 *   - `stack` / `stackColumn`-> unwrap content
 *   - `sidenote`             -> blockquote (aside)
 *   - `reference`            -> content + caption
 *   - `accordion`            -> bold title + content
 *   - `mermaid`              -> fenced `mermaid` code block
 *   - `hfUser`               -> `[@u](https://huggingface.co/u)`
 *   - `citation`             -> `[key]` (or `[N]` for IEEE / Vancouver)
 *   - `footnote`             -> Pandoc `[^N]` reference + footnotes section
 *   - inline / block math    -> `$...$` / `$$...$$`
 *
 * Output shape:
 *
 *   # <title>
 *
 *   > <subtitle / description>
 *
 *   - **Authors**: ...
 *   - **Published**: ...
 *   - **DOI**: ...
 *
 *   ---
 *
 *   <body markdown>
 *
 *   ## References
 *   ...
 *
 *   ## Footnotes
 *   ...
 */

import type { PublishMeta, CitationData } from "./html-renderer.js";

/** Free-form attribute bag carried by TipTap nodes and marks. */
type Attrs = Record<string, unknown>;

/** Minimal structural view of a TipTap / ProseMirror JSON node. */
type JSONNode = {
  type?: string;
  attrs?: Attrs;
  marks?: Array<{ type: string; attrs?: Attrs }>;
  text?: string;
  content?: JSONNode[];
};

/** Mutable state threaded through a single render pass. */
interface RenderCtx {
  citationData?: CitationData;
  /** Pre-formatted bibliography (HTML from citation-js) - we strip tags. */
  biblioHtml?: string;
  /** Footnote texts collected during the walk, emitted at the end. */
  footnotes: string[];
}

// ---------------------------------------------------------------------------
// Small shared helpers
// ---------------------------------------------------------------------------

/**
 * Read an attribute as a string. Missing or falsy values (undefined, null,
 * "", 0, false) yield "", i.e. the same result as `String(value || "")`.
 */
function readAttr(attrs: Attrs | undefined, key: string): string {
  const value = attrs?.[key];
  return value ? String(value) : "";
}

/**
 * Prefix every line of `markdown` with `> `; blank lines become a bare `>`
 * so the quote is not interrupted. Empty input yields "" rather than a lone
 * `>` marker, which lets `renderBlocks` drop the block entirely.
 */
function toBlockquote(markdown: string): string {
  if (!markdown) return "";
  return markdown
    .split("\n")
    .map((line) => (line.length ? `> ${line}` : ">"))
    .join("\n");
}

/**
 * Wrap `code` in a fenced code block. The fence is at least three backticks
 * and always longer than the longest backtick run inside `code`, so code
 * that itself contains a fence cannot terminate the block early.
 */
function fencedCode(code: string, info: string): string {
  const runs = code.match(/`+/g) ?? [];
  const longestRun = runs.reduce((max, run) => Math.max(max, run.length), 0);
  const fence = "`".repeat(Math.max(3, longestRun + 1));
  return `${fence}${info}\n${code}\n${fence}`;
}

/** Join the non-empty parts with a blank line between them. */
function joinBlocks(...parts: string[]): string {
  return parts.filter(Boolean).join("\n\n");
}

// ---------------------------------------------------------------------------
// Inline rendering (text + marks + inline atoms)
// ---------------------------------------------------------------------------

/**
 * Wrap `text` in the Markdown syntax for each mark, applied in array order
 * (the first mark ends up innermost). Unknown marks are ignored; a `link`
 * mark without an `href` leaves the text unchanged.
 */
function applyMarks(text: string, marks: JSONNode["marks"]): string {
  if (!marks?.length) return text;
  let out = text;
  for (const mark of marks) {
    switch (mark.type) {
      case "bold":
        out = `**${out}**`;
        break;
      case "italic":
        out = `*${out}*`;
        break;
      case "strike":
        out = `~~${out}~~`;
        break;
      case "code":
        out = `\`${out}\``;
        break;
      case "link": {
        const href = readAttr(mark.attrs, "href");
        if (href) out = `[${out}](${href})`;
        break;
      }
      default:
        break;
    }
  }
  return out;
}

/**
 * Resolve the in-text label for a citation key. Numeric styles (`ieee`,
 * `vancouver`) use the 1-based position of the key in `orderedKeys`;
 * everything else uses `fallbackLabel`, or `[key]` when there is none.
 */
function getCitationLabel(key: string, ctx: RenderCtx, fallbackLabel?: string): string {
  if (!ctx.citationData) return fallbackLabel || `[${key}]`;
  const { style, orderedKeys } = ctx.citationData;
  if (style === "ieee" || style === "vancouver") {
    const idx = orderedKeys.indexOf(key);
    if (idx >= 0) return `[${idx + 1}]`;
  }
  return fallbackLabel || `[${key}]`;
}

/** Render a run of inline nodes and concatenate the results. */
function renderInline(nodes: JSONNode[] | undefined, ctx: RenderCtx): string {
  if (!nodes) return "";
  let out = "";
  for (const node of nodes) {
    out += renderInlineNode(node, ctx);
  }
  return out;
}

/**
 * Render one inline node. `footnote` nodes have a side effect: their content
 * is appended to `ctx.footnotes` and a `[^N]` reference is returned.
 */
function renderInlineNode(node: JSONNode, ctx: RenderCtx): string {
  switch (node.type) {
    case "text":
      return applyMarks(node.text || "", node.marks);
    case "hardBreak":
      // Two trailing spaces + newline is the Markdown hard line break.
      return "  \n";
    case "inlineMath": {
      const latex = readAttr(node.attrs, "latex");
      return latex ? `$${latex}$` : "";
    }
    case "citation": {
      const key = readAttr(node.attrs, "key");
      if (!key) return "";
      return getCitationLabel(key, ctx, readAttr(node.attrs, "label"));
    }
    case "glossary":
      return readAttr(node.attrs, "term");
    case "footnote": {
      ctx.footnotes.push(readAttr(node.attrs, "content"));
      return `[^${ctx.footnotes.length}]`;
    }
    case "image": {
      const src = readAttr(node.attrs, "src");
      const alt = readAttr(node.attrs, "alt");
      const titleAttr = readAttr(node.attrs, "title");
      const title = titleAttr ? ` "${titleAttr}"` : "";
      return src ? `![${alt}](${src}${title})` : alt;
    }
    default:
      // Unknown inline: fall back to its text content if any.
      return renderInline(node.content, ctx);
  }
}

// ---------------------------------------------------------------------------
// Block rendering
// ---------------------------------------------------------------------------

/** Render sibling blocks, dropping empty ones, separated by a blank line. */
function renderBlocks(nodes: JSONNode[] | undefined, ctx: RenderCtx): string {
  if (!nodes?.length) return "";
  const parts: string[] = [];
  for (const node of nodes) {
    const rendered = renderBlock(node, ctx);
    if (rendered) parts.push(rendered);
  }
  return parts.join("\n\n");
}

/**
 * Render one block-level node to Markdown. Returns "" for nodes that produce
 * no output so that `renderBlocks` can skip them.
 */
function renderBlock(node: JSONNode, ctx: RenderCtx): string {
  switch (node.type) {
    case "doc":
      return renderBlocks(node.content, ctx);

    case "paragraph":
      return renderInline(node.content, ctx).trim();

    case "heading": {
      const level = Math.min(Math.max(Number(node.attrs?.level) || 1, 1), 6);
      const inner = renderInline(node.content, ctx).trim();
      // An empty heading would otherwise emit a dangling "# " marker.
      return inner ? `${"#".repeat(level)} ${inner}` : "";
    }

    case "blockquote":
    case "note":
    case "sidenote":
      // No `slot="aside"` in TipTap-JSON: sidenotes render as a blockquote.
      return toBlockquote(renderBlocks(node.content, ctx));

    case "horizontalRule":
      return "---";

    case "codeBlock": {
      const lang = readAttr(node.attrs, "language") || readAttr(node.attrs, "lang");
      const code = (node.content || []).map((child) => child.text || "").join("");
      return fencedCode(code, lang);
    }

    case "bulletList":
      return renderList(node, ctx, "-");

    case "orderedList":
      return renderList(node, ctx, "1.");

    case "listItem":
      // Should normally be reached via renderList, but if encountered
      // standalone we just render its blocks.
      return renderBlocks(node.content, ctx);

    case "blockMath": {
      const latex = readAttr(node.attrs, "latex").trim();
      return latex ? `$$\n${latex}\n$$` : "";
    }

    case "table":
      return renderTable(node, ctx);

    // --- Custom block components ---

    case "accordion": {
      const title = readAttr(node.attrs, "title") || "Details";
      return joinBlocks(`**${title}**`, renderBlocks(node.content, ctx));
    }

    case "quoteBlock": {
      const quoted = toBlockquote(renderBlocks(node.content, ctx));
      const author = readAttr(node.attrs, "author").trim();
      const source = readAttr(node.attrs, "source").trim();
      const attribution = [author, source].filter(Boolean).join(", ");
      if (!attribution) return quoted;
      return quoted ? `${quoted}\n>\n> -- ${attribution}` : `> -- ${attribution}`;
    }

    case "wide":
    case "fullWidth":
    case "stack":
    case "stackColumn":
      // Pure layout wrappers: only their content matters in Markdown.
      return renderBlocks(node.content, ctx);

    case "reference": {
      const inner = renderBlocks(node.content, ctx);
      const caption = readAttr(node.attrs, "caption").trim();
      return joinBlocks(inner, caption ? `*Figure: ${caption}*` : "");
    }

    case "htmlEmbed": {
      const src = readAttr(node.attrs, "src").trim();
      const title = readAttr(node.attrs, "title").trim();
      const desc = readAttr(node.attrs, "desc").trim();
      const labelParts = [title, desc].filter(Boolean);
      const label = labelParts.length ? labelParts.join(" - ") : src || "embed";
      return `*[Interactive visualization: ${label}]*`;
    }

    case "iframe": {
      const src = readAttr(node.attrs, "src").trim();
      if (!src) return "";
      const title = readAttr(node.attrs, "title").trim();
      const desc = readAttr(node.attrs, "desc").trim();
      const label = title || desc || src;
      // Surface the URL so LLM agents and crawlers can follow it.
      return `*[Embedded page: [${label}](${src})]*`;
    }

    case "hfUser": {
      const username = readAttr(node.attrs, "username").trim();
      if (!username) return "";
      const url =
        readAttr(node.attrs, "url").trim() ||
        `https://huggingface.co/${encodeURIComponent(username)}`;
      const name = readAttr(node.attrs, "name").trim() || `@${username}`;
      return `[${name}](${url})`;
    }

    case "rawHtml":
      return stripHtmlToText(readAttr(node.attrs, "html")).trim();

    case "mermaid": {
      const code = readAttr(node.attrs, "code").trim();
      return code ? fencedCode(code, "mermaid") : "";
    }

    case "bibliography":
      // Emitted by `appendBibliographySection` from the post-walk step.
      return "";

    default:
      // Unknown block: fall back to its content, or empty.
      return renderBlocks(node.content, ctx);
  }
}

/**
 * Render a bullet or ordered list. `marker` is `"-"` for bullet lists and
 * `"1."` for ordered lists; ordered lists are numbered from the node's
 * `start` attribute when it is a non-negative integer, otherwise from 1.
 * Continuation lines of an item are indented to the item's content column.
 */
function renderList(
  node: JSONNode,
  ctx: RenderCtx,
  marker: string,
): string {
  const items = node.content || [];
  const ordered = marker === "1.";
  const startAttr = node.attrs?.start;
  const first =
    typeof startAttr === "number" && Number.isInteger(startAttr) && startAttr >= 0
      ? startAttr
      : 1;

  const lines: string[] = [];
  items.forEach((item, idx) => {
    const prefix = ordered ? `${first + idx}.` : marker;
    const indent = " ".repeat(prefix.length + 1);
    const [head = "", ...rest] = renderBlocks(item.content, ctx).split("\n");
    lines.push(`${prefix} ${head}`);
    for (const line of rest) {
      // Keep blank lines truly empty instead of padding them with spaces.
      lines.push(line ? `${indent}${line}` : "");
    }
  });
  return lines.join("\n");
}

/**
 * Render a table as a pipe table. Cell content is flattened to one line and
 * `|` is escaped. The separator row is placed after the first row that
 * contains a `tableHeader` cell; if there is none, an empty header row is
 * synthesized so the Markdown table is still valid.
 */
function renderTable(node: JSONNode, ctx: RenderCtx): string {
  const rows = node.content || [];
  if (!rows.length) return "";

  const grid: string[][] = [];
  let headerRowIndex = -1;
  for (const [r, row] of rows.entries()) {
    const rowText: string[] = [];
    let rowIsHeader = false;
    for (const cell of row.content || []) {
      if (cell.type === "tableHeader") rowIsHeader = true;
      rowText.push(
        renderBlocks(cell.content, ctx)
          .replace(/\n+/g, " ")
          .replace(/\|/g, "\\|")
          .trim(),
      );
    }
    grid.push(rowText);
    if (rowIsHeader && headerRowIndex === -1) headerRowIndex = r;
  }

  const colCount = Math.max(...grid.map((cells) => cells.length));
  // Rows without any cell cannot form a table.
  if (colCount === 0) return "";
  for (const cells of grid) {
    while (cells.length < colCount) cells.push("");
  }

  const formatRow = (cells: string[]): string => `| ${cells.join(" | ")} |`;
  const separator = formatRow(new Array<string>(colCount).fill("---"));

  const lines: string[] = [];
  if (headerRowIndex === -1) {
    // No explicit header row: synthesize one with empty cells so the
    // markdown table is still valid.
    lines.push(formatRow(new Array<string>(colCount).fill(" ")));
    lines.push(separator);
    for (const cells of grid) lines.push(formatRow(cells));
  } else {
    grid.forEach((cells, r) => {
      lines.push(formatRow(cells));
      if (r === headerRowIndex) lines.push(separator);
    });
  }
  return lines.join("\n");
}

// ---------------------------------------------------------------------------
// Header (frontmatter) + bibliography + footnotes
// ---------------------------------------------------------------------------

/**
 * Build the document header: `# title`, an optional one-line description
 * blockquote, an optional metadata bullet list, and a closing `---` rule.
 */
function buildHeader(meta: PublishMeta): string {
  // Titles may carry literal "\n" sequences as well as real newlines.
  const titleClean =
    (meta.title || "")
      .replace(/\\n/g, " ")
      .replace(/\s+/g, " ")
      .trim() || "Untitled";
  const parts = [`# ${titleClean}\n`];

  // Collapse to a single line so the whole description stays inside the `>`.
  const desc = (meta.description || meta.subtitle || "").replace(/\s+/g, " ").trim();
  if (desc) parts.push(`> ${desc}\n`);

  const metaLines: string[] = [];
  const authors = (meta.authors ?? []).map((a) => a.name).filter(Boolean);
  if (authors.length) metaLines.push(`- **Authors**: ${authors.join(", ")}`);
  if (meta.date) metaLines.push(`- **Published**: ${meta.date}`);
  if (meta.doi) {
    const doiUrl = meta.doi.startsWith("http")
      ? meta.doi
      : `https://doi.org/${meta.doi}`;
    metaLines.push(`- **DOI**: ${doiUrl}`);
  }
  if (metaLines.length) parts.push(metaLines.join("\n") + "\n");

  parts.push("---\n");
  return parts.join("\n");
}

/** Build the `## References` section from `ctx.biblioHtml`, or "" if empty. */
function appendBibliographySection(
  ctx: RenderCtx,
): string {
  if (!ctx.biblioHtml) return "";
  const text = stripHtmlToText(ctx.biblioHtml).trim();
  if (!text) return "";
  return `## References\n\n${text}`;
}

/**
 * Build the `## Footnotes` section from the footnotes collected during the
 * walk, one `[^N]: text` definition per footnote, or "" if there are none.
 */
function appendFootnotesSection(ctx: RenderCtx): string {
  if (!ctx.footnotes.length) return "";
  const lines = ctx.footnotes.map((content, i) => {
    const clean = stripHtmlToText(content).trim().replace(/\n+/g, " ");
    return `[^${i + 1}]: ${clean}`;
  });
  return `## Footnotes\n\n${lines.join("\n\n")}`;
}

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/** Named character references decoded by `decodeEntities`. */
const NAMED_ENTITIES = new Map<string, string>([
  ["nbsp", " "],
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
]);

/**
 * Decode the common named entities plus decimal / hexadecimal character
 * references. Decoding happens in a single pass, so `&amp;lt;` becomes the
 * literal text `&lt;` instead of being decoded twice. Unknown or
 * out-of-range references are left untouched.
 */
function decodeEntities(text: string): string {
  return text.replace(
    /&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]*);/gi,
    (match: string, body: string): string => {
      if (!body.startsWith("#")) {
        return NAMED_ENTITIES.get(body.toLowerCase()) ?? match;
      }
      const isHex = body.charAt(1).toLowerCase() === "x";
      const code = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      // Treat a numeric no-break space like `&nbsp;`: a plain space.
      if (code === 0xa0) return " ";
      return code > 0 && code <= 0x10ffff ? String.fromCodePoint(code) : match;
    },
  );
}

/**
 * Strip HTML tags while keeping anchor hrefs as Markdown links and emitting
 * a blank line between block-level elements. Tuned for citation-js HTML
 * output and for `rawHtml` user content - not a general-purpose sanitiser.
 *
 * Only double-quoted `href` attributes are recognised. Entities are decoded
 * after tags are removed, so escaped markup such as `&lt;b&gt;` survives as
 * literal text. Lines are trimmed and consecutive blank lines collapsed; the
 * result may still start or end with a newline, so callers trim it.
 */
export function stripHtmlToText(html: string): string {
  if (!html) return "";
  let out = html;
  out = out.replace(/<a\s+[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, "[$2]($1)");
  out = out.replace(/<br\s*\/?>/gi, "\n");
  // Two newlines = one blank line, so adjacent blocks stay separate
  // paragraphs even when the source HTML has no whitespace between them.
  out = out.replace(/<\/(p|div|li|tr|h[1-6])>/gi, "$&\n\n");
  out = out.replace(/<[^>]+>/g, "");
  out = decodeEntities(out);
  return out
    .split("\n")
    .map((line) => line.trim())
    .filter((line, i, all) => !(line === "" && all[i - 1] === ""))
    .join("\n");
}

// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------

/**
 * Render a TipTap-JSON document into an `llms.txt`-compatible Markdown string.
 *
 * `serverBiblioHtml` is the same HTML produced by `formatBibliographyServer()`
 * for the HTML pipeline; we reuse it (stripped to text) so the agent gets the
 * same reference list a human would see.
 *
 * @param json - Root TipTap document node; only its `content` is walked.
 * @param meta - Publication metadata used for the header.
 * @param citationData - Citation style and key order for in-text labels.
 * @param serverBiblioHtml - Pre-rendered bibliography HTML, if any.
 * @returns The full Markdown document, ending with a single newline.
 */
export function renderArticleMarkdown(
  json: Record<string, unknown>,
  meta: PublishMeta,
  citationData?: CitationData,
  serverBiblioHtml?: string,
): string {
  const ctx: RenderCtx = {
    citationData,
    biblioHtml: serverBiblioHtml,
    footnotes: [],
  };

  // The body must be rendered before the footnotes section: the walk is
  // what fills `ctx.footnotes`.
  const body = renderBlocks((json as JSONNode).content, ctx);
  const header = buildHeader(meta);
  const refs = appendBibliographySection(ctx);
  const footnotes = appendFootnotesSection(ctx);

  const sections = [header.trimEnd(), body, refs, footnotes]
    .filter(Boolean)
    .join("\n\n");

  // NOTE: this collapse is global, so runs of blank lines inside fenced
  // code blocks are reduced to a single blank line as well.
  return sections.replace(/\n{3,}/g, "\n\n").trim() + "\n";
}