carbon-tokenization / backend /src /publisher /markdown-renderer.ts
tfrere's picture
tfrere HF Staff
feat(editor): Iframe embed component for remote URLs
0c69852
Raw
History Blame Contribute Delete
15.6 kB
/**
* Server-side Markdown renderer for the publisher pipeline.
*
* Walks the same TipTap-JSON document used by `html-renderer.ts` and
* produces a clean Markdown file conforming to the `llms.txt` convention
* (https://llmstxt.org/). The output is meant to be consumed by LLM
* agents and crawlers that struggle with the heavy published HTML
* (theme bootstrap, KaTeX/Mermaid runtimes, inlined CSS, iframe-wrapped
* D3 charts, ...).
*
* Conventions mirror the upstream `research-article-template` Astro
* plugin (`app/plugins/astro/generate-llms-txt.mjs`):
*
* - `<HtmlEmbed src="..." />` -> `*[Interactive visualization: <title - desc, or src>]*`
* - `<Note>...</Note>` -> blockquote
* - `<Quote author=...>` -> blockquote + attribution
* - `<Wide>`/`<FullWidth>` -> unwrap content
* - `<Sidenote>` -> main content + blockquote aside
* - `<Reference caption=...>` -> content + caption
* - `<Accordion title=...>` -> bold title + content
* - `<Mermaid code=...>` -> fenced ```mermaid``` code block
* - `<HfUser username=...>` -> `[@u](https://huggingface.co/u)`
* - `<Citation key=...>` -> the node's `label` or `[key]` (or `[N]` for IEEE / Vancouver)
* - `<Footnote content=...>` -> Pandoc `[^N]` reference + footnotes section
* - inline / block math -> `$...$` / `$$...$$`
*
* Output shape:
* # <title>
*
* > <subtitle / description>
*
* - **Authors**: ...
* - **Published**: ...
* - **DOI**: ...
*
* ---
*
* <body markdown>
*
* ## References
* ...
*
* ## Footnotes
* ...
*/
import type { PublishMeta, CitationData } from "./html-renderer.js";
/**
 * Structural view of a TipTap-JSON node as read by this renderer.
 * Every field is optional; the render functions fall back to empty
 * values when one is missing.
 */
type JSONNode = {
/** Node type name; the block and inline renderers `switch` on it. */
type?: string;
/** Node attributes (e.g. `level`, `src`, `latex`), read per node type. */
attrs?: Record<string, any>;
/** Marks attached to the node; `applyMarks` wraps text nodes with them. */
marks?: Array<{ type: string; attrs?: Record<string, any> }>;
/** Literal text (read for `text` nodes and for the children of a `codeBlock`). */
text?: string;
/** Child nodes. */
content?: JSONNode[];
};
/** Mutable state threaded through a single render pass. */
interface RenderCtx {
/** Citation style and key ordering; `getCitationLabel` uses it to number IEEE / Vancouver citations. */
citationData?: CitationData;
/** Pre-formatted bibliography (HTML from citation-js) - we strip tags. */
biblioHtml?: string;
/** Footnote texts collected during the walk, emitted at the end. */
footnotes: string[];
}
// ---------------------------------------------------------------------------
// Inline rendering (text + marks + inline atoms)
// ---------------------------------------------------------------------------
/**
 * Wrap `text` in the Markdown syntax of each of its marks.
 *
 * Supported marks: `bold`, `italic`, `strike`, `code` and `link` (only when
 * it carries an `href`). Any other mark type is ignored.
 *
 * Three details keep the output valid CommonMark:
 * - Leading/trailing whitespace is moved outside the delimiters, because
 *   `**bold **` is not recognised as emphasis.
 * - `code` is always applied first (innermost); otherwise the delimiters of
 *   the other marks would end up inside the code span as literal text.
 * - A code span whose text contains backticks gets a longer delimiter run.
 *
 * @param text  Raw text of the node.
 * @param marks Marks attached to the node, if any.
 * @returns The decorated text, or `text` unchanged when there is nothing to
 *          wrap (no marks, or whitespace-only text).
 */
function applyMarks(text: string, marks: JSONNode["marks"]): string {
  if (!marks?.length) return text;

  const core = text.trim();
  if (!core) return text;
  const leading = text.slice(0, text.length - text.trimStart().length);
  const trailing = text.slice(text.trimEnd().length);

  let out = core;
  // Two passes over the marks: `code` first, then everything else in order.
  for (const codePass of [true, false]) {
    for (const mark of marks) {
      if ((mark.type === "code") !== codePass) continue;
      switch (mark.type) {
        case "bold":
          out = `**${out}**`;
          break;
        case "italic":
          out = `*${out}*`;
          break;
        case "strike":
          out = `~~${out}~~`;
          break;
        case "code": {
          // The delimiter must be longer than any backtick run in the text.
          const runs: string[] = out.match(/`+/g) ?? [];
          const longest = runs.reduce((max, run) => Math.max(max, run.length), 0);
          const fence = "`".repeat(longest + 1);
          const pad = longest > 0 ? " " : "";
          out = `${fence}${pad}${out}${pad}${fence}`;
          break;
        }
        case "link": {
          const href = mark.attrs?.href || "";
          out = href ? `[${out}](${href})` : out;
          break;
        }
        default:
          break;
      }
    }
  }
  return `${leading}${out}${trailing}`;
}
/**
 * Resolve the inline label for a citation.
 *
 * For the numeric styles (`ieee`, `vancouver`) the label is the 1-based
 * position of `key` in `ctx.citationData.orderedKeys`. In every other case
 * (no citation data, another style, or an unknown key) it is
 * `fallbackLabel` when provided, else `[key]`.
 *
 * @param key           Citation key.
 * @param ctx           Render context holding the optional citation data.
 * @param fallbackLabel Label to use when no number applies.
 */
function getCitationLabel(key: string, ctx: RenderCtx, fallbackLabel?: string): string {
  const textualLabel = fallbackLabel || `[${key}]`;

  const data = ctx.citationData;
  if (!data) return textualLabel;

  const isNumericStyle = data.style === "ieee" || data.style === "vancouver";
  if (!isNumericStyle) return textualLabel;

  const position = data.orderedKeys.indexOf(key);
  return position >= 0 ? `[${position + 1}]` : textualLabel;
}
/**
 * Render a run of inline nodes and concatenate the results in order.
 *
 * @param nodes Inline nodes; a missing list renders as an empty string.
 * @param ctx   Render context passed through to each node.
 */
function renderInline(nodes: JSONNode[] | undefined, ctx: RenderCtx): string {
  if (!nodes) return "";
  return nodes
    .map((child: JSONNode) => renderInlineNode(child, ctx))
    .join("");
}
/**
 * Render one inline node to Markdown.
 *
 * Handled types: `text` (with marks), `hardBreak`, `inlineMath`, `citation`,
 * `glossary`, `footnote` and `image`. Any other type falls back to the
 * rendering of its children.
 *
 * Side effect: a `footnote` node appends its text to `ctx.footnotes`; the
 * returned `[^N]` reference uses the resulting 1-based position.
 *
 * @param node Inline node to render.
 * @param ctx  Render context (citation data, footnote accumulator).
 */
function renderInlineNode(node: JSONNode, ctx: RenderCtx): string {
  switch (node.type) {
    case "text":
      return applyMarks(node.text || "", node.marks);
    case "hardBreak":
      // A Markdown hard line break needs at least TWO trailing spaces;
      // a single space before the newline is only a soft break.
      return "  \n";
    case "inlineMath": {
      const latex = node.attrs?.latex || "";
      return latex ? `$${latex}$` : "";
    }
    case "citation": {
      const key = String(node.attrs?.key || "");
      if (!key) return "";
      return getCitationLabel(key, ctx, node.attrs?.label);
    }
    case "glossary": {
      // Only the term itself is kept.
      const term = String(node.attrs?.term || "");
      return term;
    }
    case "footnote": {
      const content = String(node.attrs?.content || "");
      ctx.footnotes.push(content);
      return `[^${ctx.footnotes.length}]`;
    }
    case "image": {
      const src = String(node.attrs?.src || "");
      const alt = String(node.attrs?.alt || "");
      // Escape double quotes so the title cannot terminate its own delimiter.
      const title = node.attrs?.title
        ? ` "${String(node.attrs.title).replace(/"/g, '\\"')}"`
        : "";
      return src ? `![${alt}](${src}${title})` : alt;
    }
    default:
      // Unknown inline: fall back to its text content if any.
      return renderInline(node.content, ctx);
  }
}
// ---------------------------------------------------------------------------
// Block rendering
// ---------------------------------------------------------------------------
/**
 * Render a sequence of block nodes, separating them with one blank line.
 * Blocks that render to an empty string are dropped so they do not leave
 * stray separators behind.
 *
 * @param nodes Block nodes; a missing or empty list renders as "".
 * @param ctx   Render context passed through to each block.
 */
function renderBlocks(nodes: JSONNode[] | undefined, ctx: RenderCtx): string {
  if (!nodes?.length) return "";
  return nodes
    .map((child: JSONNode) => renderBlock(child, ctx))
    .filter((chunk: string) => Boolean(chunk))
    .join("\n\n");
}
/** Prefix every line of `markdown` with `> ` (bare `>` for empty lines). */
function quoteLines(markdown: string): string {
  return markdown
    .split("\n")
    .map((line) => (line.length ? `> ${line}` : ">"))
    .join("\n");
}

/**
 * Wrap `code` in a backtick fence carrying the given info string. The fence
 * is at least three backticks and always longer than the longest backtick
 * run inside `code`, so the content can never close it early.
 */
function fenceCode(code: string, info: string): string {
  const runs: string[] = code.match(/`+/g) ?? [];
  const longest = runs.reduce((max, run) => Math.max(max, run.length), 0);
  const fence = "`".repeat(Math.max(3, longest + 1));
  return `${fence}${info}\n${code}\n${fence}`;
}

/**
 * Render one block-level node to Markdown.
 *
 * Standard nodes (paragraph, heading, blockquote, lists, code, math, table)
 * map to their Markdown equivalents. Custom components are flattened:
 * layout wrappers are unwrapped, notes / sidenotes / quotes become
 * blockquotes, embeds become italic placeholders. An unknown node type falls
 * back to the rendering of its children.
 *
 * @param node Block node to render.
 * @param ctx  Render context passed through to nested renderers.
 * @returns Markdown for the node; "" when it has nothing to emit.
 */
function renderBlock(node: JSONNode, ctx: RenderCtx): string {
  switch (node.type) {
    case "doc":
      return renderBlocks(node.content, ctx);
    case "paragraph":
      return renderInline(node.content, ctx).trim();
    case "heading": {
      // Clamp to the six heading levels Markdown supports; default to 1.
      const level = Math.min(Math.max(Number(node.attrs?.level) || 1, 1), 6);
      const inner = renderInline(node.content, ctx).trim();
      return `${"#".repeat(level)} ${inner}`;
    }
    case "blockquote":
      return quoteLines(renderBlocks(node.content, ctx));
    case "horizontalRule":
      return "---";
    case "codeBlock": {
      const lang = String(node.attrs?.language || node.attrs?.lang || "");
      const code = (node.content || [])
        .map((child: JSONNode) => child.text || "")
        .join("");
      return fenceCode(code, lang);
    }
    case "bulletList":
      return renderList(node, ctx, "-");
    case "orderedList":
      return renderList(node, ctx, "1.");
    case "listItem":
      // Should normally be reached via renderList, but if encountered
      // standalone we just render its blocks.
      return renderBlocks(node.content, ctx);
    case "blockMath": {
      const latex = String(node.attrs?.latex || "").trim();
      return latex ? `$$\n${latex}\n$$` : "";
    }
    case "table":
      return renderTable(node, ctx);

    // --- Custom block components ---
    case "accordion": {
      const title = String(node.attrs?.title || "Details");
      const inner = renderBlocks(node.content, ctx);
      return `**${title}**\n\n${inner}`;
    }
    case "note":
      return quoteLines(renderBlocks(node.content, ctx));
    case "quoteBlock": {
      const quoted = quoteLines(renderBlocks(node.content, ctx));
      const author = String(node.attrs?.author || "").trim();
      const source = String(node.attrs?.source || "").trim();
      const attribution = [author, source].filter(Boolean).join(", ");
      return attribution ? `${quoted}\n>\n> -- ${attribution}` : quoted;
    }
    case "wide":
    case "fullWidth":
    case "stack":
    case "stackColumn":
      // Pure layout wrappers: keep only the content.
      return renderBlocks(node.content, ctx);
    case "sidenote":
      // No `slot="aside"` in TipTap-JSON: render as a blockquote.
      return quoteLines(renderBlocks(node.content, ctx));
    case "reference": {
      const inner = renderBlocks(node.content, ctx);
      const caption = String(node.attrs?.caption || "").trim();
      return caption ? `${inner}\n\n*Figure: ${caption}*` : inner;
    }
    case "htmlEmbed": {
      const src = String(node.attrs?.src || "").trim();
      const title = String(node.attrs?.title || "").trim();
      const desc = String(node.attrs?.desc || "").trim();
      // Prefer the human-readable title/description; fall back to the
      // source path, then to a generic word.
      const labelParts = [title, desc].filter(Boolean);
      const label = labelParts.length ? labelParts.join(" - ") : src || "embed";
      return `*[Interactive visualization: ${label}]*`;
    }
    case "iframe": {
      const src = String(node.attrs?.src || "").trim();
      const title = String(node.attrs?.title || "").trim();
      const desc = String(node.attrs?.desc || "").trim();
      if (!src) return "";
      const label = title || desc || src;
      // Surface the URL so LLM agents and crawlers can follow it.
      return `*[Embedded page: [${label}](${src})]*`;
    }
    case "hfUser": {
      const username = String(node.attrs?.username || "").trim();
      if (!username) return "";
      const url =
        String(node.attrs?.url || "").trim() ||
        `https://huggingface.co/${encodeURIComponent(username)}`;
      const name = String(node.attrs?.name || "").trim() || `@${username}`;
      return `[${name}](${url})`;
    }
    case "rawHtml": {
      const html = String(node.attrs?.html || "");
      return stripHtmlToText(html).trim();
    }
    case "mermaid": {
      const code = String(node.attrs?.code || "").trim();
      return code ? fenceCode(code, "mermaid") : "";
    }
    case "bibliography":
      // Emitted by `appendBibliographySection` from the post-walk step.
      return "";
    default:
      // Unknown block: fall back to its content, or empty.
      return renderBlocks(node.content, ctx);
  }
}
/**
 * Render a bullet or ordered list.
 *
 * Each item's blocks are rendered, the first line is prefixed with the
 * marker, and every following line is indented to the width of the marker
 * plus one space so it stays inside the item.
 *
 * Ordered lists (`marker === "1."`) honour the node's `start` attribute
 * when it is a non-negative integer, so a list that continues at e.g. 5 is
 * not renumbered from 1. Any other value falls back to 1.
 *
 * @param node   List node whose `content` holds the items.
 * @param ctx    Render context passed through to nested renderers.
 * @param marker `"-"` for bullet lists, `"1."` for ordered lists.
 */
function renderList(
  node: JSONNode,
  ctx: RenderCtx,
  marker: string,
): string {
  const items = node.content || [];
  const ordered = marker === "1.";

  // `parseInt` of a missing/invalid attribute is NaN, which fails `>= 0`.
  const parsedStart = Number.parseInt(String(node.attrs?.start), 10);
  const firstNumber = ordered && parsedStart >= 0 ? parsedStart : 1;

  const lines: string[] = [];
  items.forEach((item: JSONNode, idx: number) => {
    const prefix = ordered ? `${firstNumber + idx}.` : marker;
    const indent = " ".repeat(prefix.length + 1);
    const innerLines = renderBlocks(item.content, ctx).split("\n");
    lines.push(`${prefix} ${innerLines[0] ?? ""}`);
    for (const continuation of innerLines.slice(1)) {
      lines.push(`${indent}${continuation}`);
    }
  });
  return lines.join("\n");
}
/**
 * Render a table node as a GFM pipe table.
 *
 * Cell content is flattened to one line (newlines become spaces) and `|` is
 * escaped. Short rows are padded with empty cells to the widest row.
 *
 * GFM requires the header to be the FIRST row, directly followed by the
 * delimiter row. The first row is therefore used as the header only when it
 * contains at least one `tableHeader` cell; otherwise an empty header row is
 * synthesized and every source row becomes a body row. (Placing the
 * delimiter after a later row would not produce a valid table.)
 *
 * @param node Table node whose `content` holds the rows.
 * @param ctx  Render context passed through to nested renderers.
 * @returns The Markdown table, or "" when there are no rows or no cells.
 */
function renderTable(node: JSONNode, ctx: RenderCtx): string {
  const rows = node.content || [];
  if (!rows.length) return "";

  const grid: string[][] = rows.map((row: JSONNode) =>
    (row.content || []).map((cell: JSONNode) =>
      renderBlocks(cell.content, ctx)
        .replace(/\n+/g, " ")
        .replace(/\|/g, "\\|")
        .trim(),
    ),
  );

  const colCount = grid.reduce((max, row) => Math.max(max, row.length), 0);
  // Nothing but empty rows: there is no table to draw.
  if (colCount === 0) return "";

  const firstRowIsHeader = (rows[0]?.content || []).some(
    (cell: JSONNode) => cell.type === "tableHeader",
  );

  const toLine = (cells: string[]): string => `| ${cells.join(" | ")} |`;
  const padded = (row: string[]): string[] => [
    ...row,
    ...new Array<string>(colCount - row.length).fill(""),
  ];

  const delimiter = toLine(new Array<string>(colCount).fill("---"));
  const [firstLine, ...otherLines] = grid.map((row) => toLine(padded(row)));

  if (firstRowIsHeader) {
    return [firstLine, delimiter, ...otherLines].join("\n");
  }
  // No header in first position: synthesize one with empty cells so the
  // markdown table is still valid.
  const blankHeader = toLine(new Array<string>(colCount).fill(" "));
  return [blankHeader, delimiter, firstLine, ...otherLines].join("\n");
}
// ---------------------------------------------------------------------------
// Header (frontmatter) + bibliography + footnotes
// ---------------------------------------------------------------------------
/**
 * Build the document header: `# title`, an optional `> summary` blockquote,
 * an optional metadata bullet list (authors, publication date, DOI) and a
 * closing `---` rule.
 *
 * Title and summary are each normalised to a single line: escaped `\n`
 * sequences and real newlines become spaces and whitespace runs collapse.
 * The summary needs this as much as the title, because a blank line inside
 * it would end the blockquote and spill the rest into the body.
 *
 * @param meta Publication metadata. The summary is `description`, falling
 *             back to `subtitle`. A DOI not starting with "http" is turned
 *             into a `https://doi.org/` URL.
 * @returns The header, ending with "---\n".
 */
function buildHeader(meta: PublishMeta): string {
  const toSingleLine = (value: string): string =>
    value
      .replace(/\\n/g, " ")
      .replace(/\n/g, " ")
      .replace(/\s{2,}/g, " ")
      .trim();

  // Fall back AFTER cleaning so a whitespace-only title is not emitted as "# ".
  const title = toSingleLine(meta.title || "") || "Untitled";
  const parts = [`# ${title}\n`];

  const summary = toSingleLine(meta.description || meta.subtitle || "");
  if (summary) parts.push(`> ${summary}\n`);

  const metaLines: string[] = [];
  const authors = meta.authors.map((a) => a.name).filter(Boolean);
  if (authors.length) metaLines.push(`- **Authors**: ${authors.join(", ")}`);
  if (meta.date) metaLines.push(`- **Published**: ${meta.date}`);
  if (meta.doi) {
    const doiUrl = meta.doi.startsWith("http")
      ? meta.doi
      : `https://doi.org/${meta.doi}`;
    metaLines.push(`- **DOI**: ${doiUrl}`);
  }
  if (metaLines.length) parts.push(metaLines.join("\n") + "\n");

  parts.push("---\n");
  return parts.join("\n");
}
/**
 * Build the `## References` section from the pre-formatted bibliography
 * HTML held in `ctx.biblioHtml`, reduced to text by `stripHtmlToText`.
 *
 * @returns The section, or "" when there is no bibliography or it is empty
 *          once stripped and trimmed.
 */
function appendBibliographySection(
  ctx: RenderCtx,
): string {
  const html = ctx.biblioHtml;
  const text = html ? stripHtmlToText(html).trim() : "";
  return text ? `## References\n\n${text}` : "";
}
/**
 * Build the `## Footnotes` section from the footnote texts collected in
 * `ctx.footnotes`. Each entry becomes a `[^N]: text` definition (N is
 * 1-based), with the text passed through `stripHtmlToText`, trimmed and
 * flattened to a single line. Definitions are separated by a blank line.
 *
 * @returns The section, or "" when no footnote was collected.
 */
function appendFootnotesSection(ctx: RenderCtx): string {
  if (ctx.footnotes.length === 0) return "";

  const definitions: string[] = [];
  for (const [position, raw] of ctx.footnotes.entries()) {
    const singleLine = stripHtmlToText(raw).trim().replace(/\n+/g, " ");
    definitions.push(`[^${position + 1}]: ${singleLine}`);
  }
  return `## Footnotes\n\n${definitions.join("\n\n")}`;
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/** Named character references decoded by `stripHtmlToText` (case-sensitive). */
const STRIP_HTML_NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ["nbsp", " "],
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
]);

/**
 * Reduce an HTML fragment to plain text, keeping anchors as Markdown links.
 * Tuned for citation-js HTML output and for `rawHtml` user content - not a
 * general-purpose sanitiser.
 *
 * Steps, in order:
 * 1. `<a href="...">text</a>` (double- or single-quoted href) -> `[text](href)`.
 * 2. `<br>` -> newline; a newline is appended after each closing
 *    `p` / `div` / `li` / `tr` / `h1`-`h6` tag.
 * 3. All remaining tags are removed.
 * 4. Character references are decoded in ONE pass, so already-escaped text
 *    such as `&amp;lt;` yields `&lt;` and is not decoded a second time.
 *    Handled: the names in `STRIP_HTML_NAMED_ENTITIES` plus decimal / hex
 *    numeric references. Unknown or invalid references are left untouched.
 * 5. Every line is trimmed and runs of blank lines collapse to a single one.
 *
 * @param html HTML fragment; an empty value returns "".
 * @returns The text. Leading/trailing newlines may remain; callers trim.
 */
export function stripHtmlToText(html: string): string {
  if (!html) return "";

  const decodeEntity = (entity: string, body: string): string => {
    if (body[0] !== "#") return STRIP_HTML_NAMED_ENTITIES.get(body) ?? entity;
    const isHex = body[1] === "x" || body[1] === "X";
    const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    // Reject NUL, out-of-range values and lone surrogates.
    const valid =
      codePoint > 0 &&
      codePoint <= 0x10ffff &&
      !(codePoint >= 0xd800 && codePoint <= 0xdfff);
    return valid ? String.fromCodePoint(codePoint) : entity;
  };

  const text = html
    .replace(
      /<a\s+[^>]*href=(?:"([^"]*)"|'([^']*)')[^>]*>([\s\S]*?)<\/a>/gi,
      (
        _match: string,
        doubleQuoted: string | undefined,
        singleQuoted: string | undefined,
        label: string,
      ) => `[${label}](${doubleQuoted ?? singleQuoted ?? ""})`,
    )
    .replace(/<br\s*\/?>/gi, "\n")
    .replace(/<\/(p|div|li|tr|h[1-6])>/gi, "$&\n")
    .replace(/<[^>]+>/g, "")
    .replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, decodeEntity);

  return text
    .split("\n")
    .map((line) => line.trim())
    .filter((line, i, all) => !(line === "" && all[i - 1] === ""))
    .join("\n");
}
// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------
/**
 * Collapse runs of empty lines to a single empty line, but only OUTSIDE
 * fenced code blocks: blank lines inside a fence are part of the code and
 * must be preserved verbatim. Fence lines may be indented or blockquoted.
 */
function collapseBlankLinesOutsideFences(markdown: string): string {
  const kept: string[] = [];
  let openFence: string | null = null;
  let previousWasBlank = false;

  for (const line of markdown.split("\n")) {
    const insideFence = openFence !== null;

    const marker = /^(?:\s*>)*\s*(`{3,}|~{3,})(.*)$/.exec(line);
    if (marker) {
      const run = marker[1] ?? "";
      const rest = marker[2] ?? "";
      if (openFence === null) {
        openFence = run;
      } else if (
        run[0] === openFence[0] &&
        run.length >= openFence.length &&
        rest.trim() === ""
      ) {
        // A closing fence uses the same character, is at least as long as
        // the opening one, and carries no info string.
        openFence = null;
      }
    }

    const blank = line === "" && !insideFence;
    if (blank && previousWasBlank) continue;
    kept.push(line);
    previousWasBlank = blank;
  }
  return kept.join("\n");
}

/**
 * Render a TipTap-JSON document into an `llms.txt`-compatible Markdown string.
 *
 * The output is the header, the body, the references section and the
 * footnotes section (empty ones omitted), separated by blank lines and
 * ending with exactly one newline. Redundant blank lines are collapsed
 * everywhere except inside fenced code blocks.
 *
 * `serverBiblioHtml` is the same HTML produced by `formatBibliographyServer()`
 * for the HTML pipeline; we reuse it (stripped to text) so the agent gets the
 * same reference list a human would see.
 *
 * @param json             Root document node; its `content` is rendered.
 * @param meta             Publication metadata used for the header.
 * @param citationData     Optional citation style / ordering for inline labels.
 * @param serverBiblioHtml Optional pre-formatted bibliography HTML.
 */
export function renderArticleMarkdown(
  json: Record<string, unknown>,
  meta: PublishMeta,
  citationData?: CitationData,
  serverBiblioHtml?: string,
): string {
  const ctx: RenderCtx = {
    citationData,
    biblioHtml: serverBiblioHtml,
    footnotes: [],
  };

  // The body must be rendered first: the walk is what fills `ctx.footnotes`.
  const body = renderBlocks((json as JSONNode).content, ctx);
  const header = buildHeader(meta);
  const refs = appendBibliographySection(ctx);
  const footnotes = appendFootnotesSection(ctx);

  const sections = [header.trimEnd(), body, refs, footnotes]
    .filter(Boolean)
    .join("\n\n");
  return collapseBlankLinesOutsideFences(sections).trim() + "\n";
}