tfrere HF Staff Cursor committed on
Commit
3de227d
·
1 Parent(s): 8d2864f

feat(publisher): generate llms.txt Markdown twin for LLM agents

Browse files

External LLMs/agents (Claude, Perplexity, ...) struggle with the
heavy published HTML: theme bootstrap, KaTeX/Mermaid runtimes,
~7 inlined CSS files, TOC chrome, and D3 charts wrapped in
<iframe srcdoc>. The static research-article-template already
ships an `llms.txt` (Astro build plugin); the editor's publisher
now does the same, walking the TipTap-JSON we already extract
for the HTML/PDF pipeline.

Pipeline addition (publisher/index.ts):
- After writing index.html, render the same JSON to Markdown via
the new markdown-renderer.ts and write `published/<doc>/llms.txt`
- Re-render with the public HF meta and include it in the dataset
upload commit (hf-storage.ts adds llms.txt to the file list)

Routing (create-app.ts):
- GET /llms.txt serves the published Markdown twin with
Content-Type: text/markdown; charset=utf-8 (404 before publish)
- GET /robots.txt advertises it via the `LLMs-Txt:` header,
mirroring the upstream template

Conventions in the renderer mirror the upstream Astro plugin:
- HtmlEmbed -> *[Interactive visualization: <title|src>]*
- Note/Sidenote -> blockquote
- QuoteBlock -> blockquote + author/source attribution
- Wide/FullWidth/Stack -> unwrap content
- Accordion -> bold title + content
- Mermaid -> ```mermaid fenced block
- HfUser -> [name](https://huggingface.co/<u>)
- Citations -> stored label or [key] (author-date styles) or [N] (IEEE/Vancouver)
- Footnotes -> Pandoc [^N] + footnotes section
- Bibliography -> stripped to plain text in a References section
- Inline/block math kept as $...$ / $$...$$

22 new unit tests cover header, blocks, custom components,
citations/footnotes, and the HTML stripper helper. All 87
backend tests pass; tsc clean on backend and frontend.

Co-authored-by: Cursor <cursoragent@cursor.com>

README.md CHANGED
@@ -26,7 +26,7 @@ A collaborative, real-time editor for web-native scientific articles. It lets mu
26
  - **Comments & discussion** anchored on any selection
27
  - **Slash menu** (`/`) and drag/drop block handles, in the spirit of Notion
28
  - **Click-to-edit frontmatter**: title, subtitle, authors, affiliations, links, banner color
29
- - **Publishing pipeline**: one-click export to a standalone static HTML bundle, plus PDF generation (Puppeteer)
30
  - **Persistence**:
31
  - Local mode: documents stored on disk under `DATA_DIR`
32
  - HF mode: documents pushed/pulled from a Hugging Face dataset via OAuth
 
26
  - **Comments & discussion** anchored on any selection
27
  - **Slash menu** (`/`) and drag/drop block handles, in the spirit of Notion
28
  - **Click-to-edit frontmatter**: title, subtitle, authors, affiliations, links, banner color
29
+ - **Publishing pipeline**: one-click export to a standalone static HTML bundle, plus PDF generation (Puppeteer) and an `llms.txt` Markdown twin for LLM agents/crawlers (served at `/llms.txt`, advertised in `/robots.txt`)
30
  - **Persistence**:
31
  - Local mode: documents stored on disk under `DATA_DIR`
32
  - HF mode: documents pushed/pulled from a Hugging Face dataset via OAuth
backend/src/create-app.ts CHANGED
@@ -307,9 +307,44 @@ export function createApp() {
307
  return join(DATA_DIR, "published", docName, "index.html");
308
  }
309
 
 
 
 
 
310
  if (existsSync(staticDir)) {
311
  app.use(express.static(staticDir, { index: false }));
312
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313
  app.get("/editor", async (req, res) => {
314
  if (oauthEnabled) {
315
  const { resolveUser } = await import("./auth.js");
 
307
  return join(DATA_DIR, "published", docName, "index.html");
308
  }
309
 
310
+ function getPublishedAssetPath(docName: string, filename: string): string {
311
+ return join(DATA_DIR, "published", docName, filename);
312
+ }
313
+
314
  if (existsSync(staticDir)) {
315
  app.use(express.static(staticDir, { index: false }));
316
 
317
+ // ---- LLM-friendly endpoints --------------------------------------
318
+ // The publisher generates a Markdown twin of the article (`llms.txt`)
319
+ // following the https://llmstxt.org/ convention. We expose it at the
320
+ // Space root so external agents/crawlers (Claude, Perplexity, ...) can
321
+ // consume the article without having to parse the heavy HTML page.
322
+ // `robots.txt` advertises this endpoint.
323
+
324
+ app.get("/llms.txt", async (_req, res) => {
325
+ const llmsPath = getPublishedAssetPath(DEFAULT_DOC_NAME, "llms.txt");
326
+ if (!existsSync(llmsPath)) {
327
+ res.status(404).type("text/plain").send("Not yet published");
328
+ return;
329
+ }
330
+ res.type("text/markdown; charset=utf-8");
331
+ res.sendFile(llmsPath);
332
+ });
333
+
334
+ app.get("/robots.txt", (_req, res) => {
335
+ res
336
+ .type("text/plain; charset=utf-8")
337
+ .send(
338
+ [
339
+ "User-agent: *",
340
+ "Allow: /",
341
+ "",
342
+ "LLMs-Txt: /llms.txt",
343
+ "",
344
+ ].join("\n"),
345
+ );
346
+ });
347
+
348
  app.get("/editor", async (req, res) => {
349
  if (oauthEnabled) {
350
  const { resolveUser } = await import("./auth.js");
backend/src/hf-storage.ts CHANGED
@@ -157,7 +157,7 @@ export async function pullPublishedAssets(
157
 
158
  const safeName = sanitizeName(docName);
159
  const base = `published/${safeName}`;
160
- const files = ["index.html", "article.pdf", "thumb.jpg", "meta.json"];
161
 
162
  const { mkdirSync, writeFileSync } = await import("fs");
163
  const { join } = await import("path");
@@ -226,6 +226,7 @@ interface PublishedPayload {
226
  pdf: Buffer | null;
227
  thumbnail: Buffer | null;
228
  meta: { title: string; description: string; authors: string[]; date: string; [key: string]: unknown };
 
229
  }
230
 
231
  export async function uploadPublishedAssets(
@@ -274,6 +275,14 @@ export async function uploadPublishedAssets(
274
  thumbUrl = `${baseUrl}/thumb.jpg`;
275
  }
276
 
 
 
 
 
 
 
 
 
277
  await commit({
278
  repo,
279
  operations,
 
157
 
158
  const safeName = sanitizeName(docName);
159
  const base = `published/${safeName}`;
160
+ const files = ["index.html", "article.pdf", "thumb.jpg", "meta.json", "llms.txt"];
161
 
162
  const { mkdirSync, writeFileSync } = await import("fs");
163
  const { join } = await import("path");
 
226
  pdf: Buffer | null;
227
  thumbnail: Buffer | null;
228
  meta: { title: string; description: string; authors: string[]; date: string; [key: string]: unknown };
229
+ llmsTxt?: string;
230
  }
231
 
232
  export async function uploadPublishedAssets(
 
275
  thumbUrl = `${baseUrl}/thumb.jpg`;
276
  }
277
 
278
+ if (payload.llmsTxt) {
279
+ operations.push({
280
+ operation: "addOrUpdate",
281
+ path: `${base}/llms.txt`,
282
+ content: new Blob([payload.llmsTxt], { type: "text/markdown; charset=utf-8" }),
283
+ });
284
+ }
285
+
286
  await commit({
287
  repo,
288
  operations,
backend/src/publisher/index.ts CHANGED
@@ -11,6 +11,7 @@ import * as Y from "yjs";
11
  import { getDataDir, docPath, sanitizeName } from "../utils.js";
12
  import { TiptapTransformer } from "@hocuspocus/transformer";
13
  import { renderArticleHTML, type PublishMeta, type CitationData } from "./html-renderer.js";
 
14
  import { formatBibliographyServer } from "./format-bibliography.js";
15
  import { getServerExtensions } from "./extensions.js";
16
  import { isPdfEnabled, generatePdfAndThumbnail } from "./pdf-generator.js";
@@ -435,6 +436,18 @@ export async function publishDocument(
435
  fsMkdir(publishDir, { recursive: true });
436
  fsWrite(join(publishDir, "index.html"), localHtml);
437
  fsWrite(join(publishDir, "meta.json"), JSON.stringify(localMeta, null, 2));
 
 
 
 
 
 
 
 
 
 
 
 
438
  console.log("[publish] index.html written to", publishDir);
439
 
440
  // --- Step 2: generate PDF + thumbnail from the served URL ---------------
@@ -487,11 +500,21 @@ export async function publishDocument(
487
  if (pdf) hfMeta.pdfUrl = getPublishedAssetUrl(docName, "article.pdf");
488
  const hfHtml = await renderArticleHTML(json, hfMeta, css, citationData, biblioHtml, embeds);
489
  try {
 
 
 
 
 
 
 
 
 
490
  const urls = await uploadPublishedAssets(docName, {
491
  html: hfHtml,
492
  pdf,
493
  thumbnail,
494
  meta: hfMeta as any,
 
495
  }, token);
496
  return { ...urls, success: true };
497
  } catch (err) {
 
11
  import { getDataDir, docPath, sanitizeName } from "../utils.js";
12
  import { TiptapTransformer } from "@hocuspocus/transformer";
13
  import { renderArticleHTML, type PublishMeta, type CitationData } from "./html-renderer.js";
14
+ import { renderArticleMarkdown } from "./markdown-renderer.js";
15
  import { formatBibliographyServer } from "./format-bibliography.js";
16
  import { getServerExtensions } from "./extensions.js";
17
  import { isPdfEnabled, generatePdfAndThumbnail } from "./pdf-generator.js";
 
436
  fsMkdir(publishDir, { recursive: true });
437
  fsWrite(join(publishDir, "index.html"), localHtml);
438
  fsWrite(join(publishDir, "meta.json"), JSON.stringify(localMeta, null, 2));
439
+
440
+ // Co-located `llms.txt` (https://llmstxt.org/) for LLM agents and
441
+ // crawlers that struggle with the heavy published HTML. Cheap to
442
+ // generate (one walk over the same TipTap-JSON, no Playwright) so we
443
+ // always write it; serving is gated separately by `create-app.ts`.
444
+ let llmsTxt = "";
445
+ try {
446
+ llmsTxt = renderArticleMarkdown(json, localMeta, citationData, biblioHtml);
447
+ fsWrite(join(publishDir, "llms.txt"), llmsTxt);
448
+ } catch (err) {
449
+ console.warn("[publish] llms.txt generation failed:", (err as Error).message);
450
+ }
451
  console.log("[publish] index.html written to", publishDir);
452
 
453
  // --- Step 2: generate PDF + thumbnail from the served URL ---------------
 
500
  if (pdf) hfMeta.pdfUrl = getPublishedAssetUrl(docName, "article.pdf");
501
  const hfHtml = await renderArticleHTML(json, hfMeta, css, citationData, biblioHtml, embeds);
502
  try {
503
+ // Re-render llms.txt with the HF meta so internal links (DOI, etc.)
504
+ // match what's in the uploaded HTML. Cheap, deterministic, and we
505
+ // already have everything in memory.
506
+ let hfLlmsTxt: string | undefined = undefined;
507
+ try {
508
+ hfLlmsTxt = renderArticleMarkdown(json, hfMeta, citationData, biblioHtml);
509
+ } catch (err) {
510
+ console.warn("[publish] HF llms.txt render failed:", (err as Error).message);
511
+ }
512
  const urls = await uploadPublishedAssets(docName, {
513
  html: hfHtml,
514
  pdf,
515
  thumbnail,
516
  meta: hfMeta as any,
517
+ llmsTxt: hfLlmsTxt,
518
  }, token);
519
  return { ...urls, success: true };
520
  } catch (err) {
backend/src/publisher/markdown-renderer.ts ADDED
@@ -0,0 +1,496 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Server-side Markdown renderer for the publisher pipeline.
3
+ *
4
+ * Walks the same TipTap-JSON document used by `html-renderer.ts` and
5
+ * produces a clean Markdown file conforming to the `llms.txt` convention
6
+ * (https://llmstxt.org/). The output is meant to be consumed by LLM
7
+ * agents and crawlers that struggle with the heavy published HTML
8
+ * (theme bootstrap, KaTeX/Mermaid runtimes, inlined CSS, iframe-wrapped
9
+ * D3 charts, ...).
10
+ *
11
+ * Conventions mirror the upstream `research-article-template` Astro
12
+ * plugin (`app/plugins/astro/generate-llms-txt.mjs`):
13
+ *
14
+ * - `<HtmlEmbed src="..." />` -> `*[Interactive visualization: <title - desc | src>]*`
15
+ * - `<Note>...</Note>` -> blockquote
16
+ * - `<Quote author=...>` -> blockquote + attribution
17
+ * - `<Wide>`/`<FullWidth>` -> unwrap content
18
+ * - `<Sidenote>` -> blockquote (TipTap-JSON has no `slot="aside"`)
19
+ * - `<Reference caption=...>` -> content + caption
20
+ * - `<Accordion title=...>` -> bold title + content
21
+ * - `<Mermaid code=...>` -> fenced ```mermaid``` code block
22
+ * - `<HfUser username=...>` -> `[@u](https://huggingface.co/u)`
23
+ * - `<Citation key=...>` -> stored label or `[key]` (or `[N]` for IEEE/Vancouver)
24
+ * - `<Footnote content=...>` -> Pandoc `[^N]` reference + footnotes section
25
+ * - inline / block math -> `$...$` / `$$...$$`
26
+ *
27
+ * Output shape:
28
+ * # <title>
29
+ *
30
+ * > <subtitle / description>
31
+ *
32
+ * - **Authors**: ...
33
+ * - **Published**: ...
34
+ * - **DOI**: ...
35
+ *
36
+ * ---
37
+ *
38
+ * <body markdown>
39
+ *
40
+ * ## References
41
+ * ...
42
+ *
43
+ * ## Footnotes
44
+ * ...
45
+ */
46
+
47
+ import type { PublishMeta, CitationData } from "./html-renderer.js";
48
+
49
+ type JSONNode = {
50
+ type?: string;
51
+ attrs?: Record<string, any>;
52
+ marks?: Array<{ type: string; attrs?: Record<string, any> }>;
53
+ text?: string;
54
+ content?: JSONNode[];
55
+ };
56
+
57
+ interface RenderCtx {
58
+ citationData?: CitationData;
59
+ /** Pre-formatted bibliography (HTML from citation-js) - we strip tags. */
60
+ biblioHtml?: string;
61
+ /** Footnote texts collected during the walk, emitted at the end. */
62
+ footnotes: string[];
63
+ }
64
+
65
+ // ---------------------------------------------------------------------------
66
+ // Inline rendering (text + marks + inline atoms)
67
+ // ---------------------------------------------------------------------------
68
+
69
/**
 * Wrap a text run in the Markdown syntax for each of its TipTap marks.
 *
 * Marks are applied in array order, so the first mark ends up innermost.
 * Supported marks: `bold`, `italic`, `strike`, `code` and `link`; any other
 * mark type is ignored and leaves the text unchanged.
 *
 * @param text  Raw text of the run.
 * @param marks Marks attached to the run (may be undefined or empty).
 * @returns The decorated Markdown string.
 */
function applyMarks(text: string, marks: JSONNode["marks"]): string {
  if (!marks?.length) return text;

  // CommonMark only recognises emphasis when the delimiters hug non-whitespace
  // characters ("**bold **" is NOT bold), so leading/trailing whitespace is
  // moved outside of the delimiters.
  const core = text.trim();
  if (!core) return text; // whitespace-only run: nothing to decorate
  const lead = text.slice(0, text.length - text.trimStart().length);
  const trail = text.slice(text.trimEnd().length);

  let out = core;
  for (const mark of marks) {
    switch (mark.type) {
      case "bold":
        out = `**${out}**`;
        break;
      case "italic":
        out = `*${out}*`;
        break;
      case "strike":
        out = `~~${out}~~`;
        break;
      case "code": {
        // A code span is closed by a backtick run of the same length as the
        // opening one, so use a delimiter longer than any run in the content.
        const runs = out.match(/`+/g) ?? [];
        const longest = runs.reduce((max, run) => Math.max(max, run.length), 0);
        const fence = "`".repeat(longest + 1);
        // Content starting/ending with a backtick needs a separating space.
        const pad = out.startsWith("`") || out.endsWith("`") ? " " : "";
        out = `${fence}${pad}${out}${pad}${fence}`;
        break;
      }
      case "link": {
        const href = mark.attrs?.href || "";
        out = href ? `[${out}](${href})` : out;
        break;
      }
      default:
        break;
    }
  }
  return `${lead}${out}${trail}`;
}
97
+
98
/**
 * Resolve the inline label for a citation.
 *
 * For the numeric styles (`ieee`, `vancouver`) the label is the 1-based
 * position of `key` in `ctx.citationData.orderedKeys`, formatted as `[N]`.
 * In every other situation (no citation data, another style, or a key that
 * is not in the ordered list) the stored `fallbackLabel` is returned when it
 * is truthy, otherwise `[key]`.
 */
function getCitationLabel(key: string, ctx: RenderCtx, fallbackLabel?: string): string {
  const textual = fallbackLabel || `[${key}]`;

  const data = ctx.citationData;
  if (!data) return textual;

  const isNumericStyle = data.style === "ieee" || data.style === "vancouver";
  if (!isNumericStyle) return textual;

  const position = data.orderedKeys.indexOf(key);
  return position >= 0 ? `[${position + 1}]` : textual;
}
107
+
108
/**
 * Render a sequence of inline nodes and concatenate the results in order.
 * A missing node list yields the empty string.
 */
function renderInline(nodes: JSONNode[] | undefined, ctx: RenderCtx): string {
  return (nodes ?? []).map((child) => renderInlineNode(child, ctx)).join("");
}
116
+
117
/**
 * Render a single inline node (text run or inline atom) to Markdown.
 *
 * - `text`       -> text with its marks applied
 * - `hardBreak`  -> Markdown hard line break
 * - `inlineMath` -> `$latex$` (empty when there is no latex)
 * - `citation`   -> label resolved by `getCitationLabel` (empty without key)
 * - `glossary`   -> the bare term
 * - `footnote`   -> `[^N]`; the footnote content is appended to
 *                   `ctx.footnotes` so it can be emitted after the walk
 * - `image`      -> `![alt](src "title")`, or just the alt text without src
 * - anything else falls back to rendering the node's children inline.
 */
function renderInlineNode(node: JSONNode, ctx: RenderCtx): string {
  switch (node.type) {
    case "text":
      return applyMarks(node.text || "", node.marks);
    case "hardBreak":
      // CommonMark requires at least TWO trailing spaces before the newline
      // for a hard line break; a single space is just a soft break.
      return "  \n";
    case "inlineMath": {
      const latex = node.attrs?.latex || "";
      return latex ? `$${latex}$` : "";
    }
    case "citation": {
      const key = String(node.attrs?.key || "");
      if (!key) return "";
      return getCitationLabel(key, ctx, node.attrs?.label);
    }
    case "glossary": {
      const term = String(node.attrs?.term || "");
      return term;
    }
    case "footnote": {
      const content = String(node.attrs?.content || "");
      ctx.footnotes.push(content);
      // Footnote numbers are 1-based and follow document order.
      return `[^${ctx.footnotes.length}]`;
    }
    case "image": {
      const src = String(node.attrs?.src || "");
      const alt = String(node.attrs?.alt || "");
      // The title is emitted inside double quotes, so embedded double quotes
      // must be backslash-escaped or they would end the title early.
      const title = node.attrs?.title
        ? ` "${String(node.attrs.title).replace(/"/g, '\\"')}"`
        : "";
      return src ? `![${alt}](${src}${title})` : alt;
    }
    default:
      // Unknown inline: fall back to its text content if any.
      return renderInline(node.content, ctx);
  }
}
152
+
153
+ // ---------------------------------------------------------------------------
154
+ // Block rendering
155
+ // ---------------------------------------------------------------------------
156
+
157
/**
 * Render a list of block nodes in order and join the non-empty results with
 * a blank line. A missing or empty list yields the empty string.
 */
function renderBlocks(nodes: JSONNode[] | undefined, ctx: RenderCtx): string {
  return (nodes ?? [])
    .map((child) => renderBlock(child, ctx))
    .filter((chunk) => chunk !== "")
    .join("\n\n");
}
166
+
167
/** Prefix every line of `markdown` with `> ` (a bare `>` for empty lines). */
function prefixBlockquote(markdown: string): string {
  return markdown
    .split("\n")
    .map((line) => (line.length ? `> ${line}` : ">"))
    .join("\n");
}

/**
 * Wrap `code` in a backtick fence with the given info string. The fence is
 * three backticks, or one more than the longest run of 3+ backticks found in
 * `code`, so that code containing its own fences cannot close the block early.
 */
function fenceCode(code: string, info: string): string {
  const runs = code.match(/`{3,}/g) ?? [];
  const longest = runs.reduce((max, run) => Math.max(max, run.length), 2);
  const fence = "`".repeat(longest + 1);
  return `${fence}${info}\n${code}\n${fence}`;
}

/**
 * Render a single block-level node to Markdown.
 *
 * Standard nodes (paragraph, heading, blockquote, rule, code block, lists,
 * block math, table) map to their Markdown equivalents. Custom components:
 * accordion -> bold title + content; note / sidenote -> blockquote;
 * quoteBlock -> blockquote + `-- author, source`; wide / fullWidth / stack /
 * stackColumn -> unwrapped content; reference -> content + `*Figure: ...*`;
 * htmlEmbed -> `*[Interactive visualization: ...]*`; hfUser -> profile link;
 * rawHtml -> tag-stripped text; mermaid -> fenced `mermaid` block.
 * `bibliography` nodes render as empty (the reference list is emitted after
 * the walk). Unknown node types fall back to rendering their children.
 *
 * @returns The Markdown for the node; may be the empty string.
 */
function renderBlock(node: JSONNode, ctx: RenderCtx): string {
  switch (node.type) {
    case "doc":
      return renderBlocks(node.content, ctx);

    case "paragraph":
      return renderInline(node.content, ctx).trim();

    case "heading": {
      // Clamp to the 1..6 range Markdown supports; missing/invalid -> 1.
      const level = Math.min(Math.max(Number(node.attrs?.level) || 1, 1), 6);
      const inner = renderInline(node.content, ctx).trim();
      return `${"#".repeat(level)} ${inner}`;
    }

    case "blockquote":
      return prefixBlockquote(renderBlocks(node.content, ctx));

    case "horizontalRule":
      return "---";

    case "codeBlock": {
      const lang = String(node.attrs?.language || node.attrs?.lang || "");
      const code = (node.content || [])
        .map((c) => c.text || "")
        .join("");
      return fenceCode(code, lang);
    }

    case "bulletList":
      return renderList(node, ctx, "-");

    case "orderedList":
      return renderList(node, ctx, "1.");

    case "listItem":
      // Should normally be reached via renderList, but if encountered
      // standalone we just render its blocks.
      return renderBlocks(node.content, ctx);

    case "blockMath": {
      const latex = String(node.attrs?.latex || "").trim();
      return latex ? `$$\n${latex}\n$$` : "";
    }

    case "table":
      return renderTable(node, ctx);

    // --- Custom block components ---

    case "accordion": {
      const title = String(node.attrs?.title || "Details");
      const inner = renderBlocks(node.content, ctx);
      return `**${title}**\n\n${inner}`;
    }

    case "note":
      return prefixBlockquote(renderBlocks(node.content, ctx));

    case "quoteBlock": {
      const inner = renderBlocks(node.content, ctx);
      const author = String(node.attrs?.author || "").trim();
      const source = String(node.attrs?.source || "").trim();
      const attribution = [author, source].filter(Boolean).join(", ");
      const quoted = prefixBlockquote(inner);
      return attribution ? `${quoted}\n>\n> -- ${attribution}` : quoted;
    }

    case "wide":
    case "fullWidth":
    case "stack":
    case "stackColumn":
      return renderBlocks(node.content, ctx);

    case "sidenote":
      // No `slot="aside"` in TipTap-JSON: render as a blockquote.
      return prefixBlockquote(renderBlocks(node.content, ctx));

    case "reference": {
      const inner = renderBlocks(node.content, ctx);
      const caption = String(node.attrs?.caption || "").trim();
      return caption ? `${inner}\n\n*Figure: ${caption}*` : inner;
    }

    case "htmlEmbed": {
      const src = String(node.attrs?.src || "").trim();
      const title = String(node.attrs?.title || "").trim();
      const desc = String(node.attrs?.desc || "").trim();
      // Prefer the human-written title/description; fall back to the source
      // path, then to a generic word.
      const labelParts = [title, desc].filter(Boolean);
      const label = labelParts.length
        ? labelParts.join(" - ")
        : src || "embed";
      return `*[Interactive visualization: ${label}]*`;
    }

    case "hfUser": {
      const username = String(node.attrs?.username || "").trim();
      if (!username) return "";
      const url =
        String(node.attrs?.url || "").trim() ||
        `https://huggingface.co/${encodeURIComponent(username)}`;
      const name = String(node.attrs?.name || "").trim() || `@${username}`;
      return `[${name}](${url})`;
    }

    case "rawHtml": {
      const html = String(node.attrs?.html || "");
      return stripHtmlToText(html).trim();
    }

    case "mermaid": {
      const code = String(node.attrs?.code || "").trim();
      return code ? fenceCode(code, "mermaid") : "";
    }

    case "bibliography":
      // Emitted by `appendBibliographySection` from the post-walk step.
      return "";

    default:
      // Unknown block: fall back to its content, or empty.
      return renderBlocks(node.content, ctx);
  }
}
311
+
312
/**
 * Render a bullet or ordered list node.
 *
 * @param node   The list node; its children are the list items.
 * @param ctx    Shared render context.
 * @param marker `"1."` for an ordered list (items are numbered), any other
 *               string is used verbatim as the bullet marker.
 * @returns One Markdown line per rendered line of item content. Continuation
 *          lines are indented to align under the item text.
 */
function renderList(
  node: JSONNode,
  ctx: RenderCtx,
  marker: string,
): string {
  const items = node.content || [];
  const ordered = marker === "1.";

  // Honour the ordered list `start` attribute when it is a non-negative
  // integer, so that a list continuing at e.g. 4 is not renumbered from 1.
  const startAttr: unknown = node.attrs?.start;
  const first =
    ordered &&
    typeof startAttr === "number" &&
    Number.isInteger(startAttr) &&
    startAttr >= 0
      ? startAttr
      : 1;

  const lines: string[] = [];
  items.forEach((item, idx) => {
    const prefix = ordered ? `${first + idx}.` : marker;
    const indent = " ".repeat(prefix.length + 1);
    const [head = "", ...rest] = renderBlocks(item.content, ctx).split("\n");
    lines.push(`${prefix} ${head}`);
    for (const line of rest) {
      // Keep blank separator lines truly empty (no trailing whitespace).
      lines.push(line ? `${indent}${line}` : "");
    }
  });
  return lines.join("\n");
}
331
+
332
/**
 * Render a table node as a pipe table.
 *
 * Each cell is rendered as blocks, flattened onto one line (newlines become
 * spaces), has `|` escaped and is trimmed. Rows are padded with empty cells
 * to the widest row. The first row containing a `tableHeader` cell is treated
 * as the header and is followed by the `---` separator; when no row has one,
 * a blank header row plus separator is synthesised ahead of the data rows.
 * A table without rows renders as the empty string.
 */
function renderTable(node: JSONNode, ctx: RenderCtx): string {
  const rows = node.content || [];
  if (rows.length === 0) return "";

  const cellText = (cell: JSONNode): string =>
    renderBlocks(cell.content, ctx)
      .replace(/\n+/g, " ")
      .replace(/\|/g, "\\|")
      .trim();

  // Cells are rendered row by row, left to right.
  const grid: string[][] = rows.map((row) => (row.content || []).map(cellText));

  const headerRowIndex = rows.findIndex((row) =>
    (row.content || []).some((cell) => cell.type === "tableHeader"),
  );

  const colCount = Math.max(...grid.map((cells) => cells.length));
  const padded = grid.map((cells) => {
    const filled = [...cells];
    while (filled.length < colCount) filled.push("");
    return filled;
  });

  const toLine = (cells: string[]): string => `| ${cells.join(" | ")} |`;
  const separator = toLine(new Array(colCount).fill("---"));

  if (headerRowIndex === -1) {
    // No explicit header row: synthesize one with empty cells so the
    // markdown table is still valid.
    const blankHeader = toLine(new Array(colCount).fill(" "));
    return [blankHeader, separator, ...padded.map(toLine)].join("\n");
  }

  return padded
    .flatMap((cells, r) =>
      r === headerRowIndex ? [toLine(cells), separator] : [toLine(cells)],
    )
    .join("\n");
}
379
+
380
+ // ---------------------------------------------------------------------------
381
+ // Header (frontmatter) + bibliography + footnotes
382
+ // ---------------------------------------------------------------------------
383
+
384
/**
 * Build the `llms.txt` header: `# title`, an optional `> description`
 * blockquote, an optional bullet list (authors, publication date, DOI) and a
 * closing `---` rule.
 *
 * - The title defaults to "Untitled".
 * - The description falls back to the subtitle when it is empty.
 * - Title and description are collapsed onto a single line: a raw newline
 *   would end the heading / break out of the one-line blockquote.
 * - A DOI that does not start with `http` is turned into a
 *   `https://doi.org/...` URL.
 *
 * @returns The header Markdown, ending with `---\n`.
 */
function buildHeader(meta: PublishMeta): string {
  // Replace literal "\n" sequences and real newlines with spaces, then
  // squeeze runs of whitespace.
  const toSingleLine = (value: string): string =>
    value
      .replace(/\\n/g, " ")
      .replace(/\n/g, " ")
      .replace(/\s{2,}/g, " ")
      .trim();

  const titleClean = toSingleLine(meta.title || "Untitled");
  const parts = [`# ${titleClean}\n`];

  const desc = toSingleLine(meta.description || meta.subtitle || "");
  if (desc) parts.push(`> ${desc}\n`);

  const metaLines: string[] = [];
  // Tolerate a missing authors array instead of throwing.
  const authors = (meta.authors ?? []).map((a) => a.name).filter(Boolean);
  if (authors.length) metaLines.push(`- **Authors**: ${authors.join(", ")}`);
  if (meta.date) metaLines.push(`- **Published**: ${meta.date}`);
  if (meta.doi) {
    const doiUrl = meta.doi.startsWith("http")
      ? meta.doi
      : `https://doi.org/${meta.doi}`;
    metaLines.push(`- **DOI**: ${doiUrl}`);
  }
  if (metaLines.length) parts.push(metaLines.join("\n") + "\n");

  parts.push("---\n");
  return parts.join("\n");
}
411
+
412
/**
 * Build the `## References` section from the pre-formatted bibliography HTML
 * held in the context, stripped down to text. Returns the empty string when
 * there is no bibliography or when nothing is left after stripping.
 */
function appendBibliographySection(
  ctx: RenderCtx,
): string {
  const plain = ctx.biblioHtml ? stripHtmlToText(ctx.biblioHtml).trim() : "";
  return plain ? `## References\n\n${plain}` : "";
}
420
+
421
/**
 * Build the `## Footnotes` section from the footnote texts collected during
 * the walk. Each entry becomes a `[^N]: text` definition (N is 1-based, in
 * collection order) with HTML stripped and newlines flattened to spaces.
 * Returns the empty string when no footnote was collected.
 */
function appendFootnotesSection(ctx: RenderCtx): string {
  if (ctx.footnotes.length === 0) return "";

  const definitions: string[] = [];
  ctx.footnotes.forEach((raw, index) => {
    const flattened = stripHtmlToText(raw).trim().replace(/\n+/g, " ");
    definitions.push(`[^${index + 1}]: ${flattened}`);
  });

  return "## Footnotes\n\n" + definitions.join("\n\n");
}
429
+
430
+ // ---------------------------------------------------------------------------
431
+ // Helpers
432
+ // ---------------------------------------------------------------------------
433
+
434
/**
 * Strip HTML tags while keeping anchor hrefs as Markdown links and emitting
 * a line break after block-level closing tags. Tuned for citation-js HTML
 * output and for `rawHtml` user content - not a general-purpose sanitiser.
 *
 * Steps, in order:
 *  1. `<a href="...">text</a>` (double- or single-quoted href) -> `[text](href)`
 *  2. `<br>` -> newline
 *  3. a newline is added after `</p>`, `</div>`, `</li>`, `</tr>`, `</h1-6>`
 *  4. all remaining tags are removed
 *  5. a small set of entities is decoded (`&nbsp; &lt; &gt; &quot; &#39; &amp;`)
 *  6. every line is trimmed and runs of blank lines are collapsed to one
 *
 * @param html HTML fragment; an empty value yields the empty string.
 * @returns Plain text / light Markdown.
 */
export function stripHtmlToText(html: string): string {
  if (!html) return "";
  let out = html;
  out = out.replace(
    /<a\s+[^>]*href=(?:"([^"]*)"|'([^']*)')[^>]*>([\s\S]*?)<\/a>/gi,
    (_match, doubleQuoted: string | undefined, singleQuoted: string | undefined, label: string) =>
      `[${label}](${doubleQuoted ?? singleQuoted ?? ""})`,
  );
  out = out.replace(/<br\s*\/?>/gi, "\n");
  out = out.replace(
    /<\/(p|div|li|tr|h[1-6])>/gi,
    "$&\n",
  );
  out = out.replace(/<[^>]+>/g, "");
  // `&amp;` must be decoded LAST: decoding it first would turn an escaped
  // entity such as `&amp;lt;` into `&lt;` and then wrongly into `<`.
  out = out
    .replace(/&nbsp;/g, " ")
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'")
    .replace(/&amp;/g, "&");
  return out
    .split("\n")
    .map((l) => l.trim())
    .filter((l, i, arr) => !(l === "" && arr[i - 1] === ""))
    .join("\n");
}
462
+
463
+ // ---------------------------------------------------------------------------
464
+ // Public API
465
+ // ---------------------------------------------------------------------------
466
+
467
/**
 * Render a TipTap-JSON document into an `llms.txt`-compatible Markdown string.
 *
 * The output is, in order: the metadata header, the article body, a
 * `## References` section and a `## Footnotes` section. Empty sections are
 * dropped, runs of three or more newlines are collapsed to one blank line,
 * and the result ends with exactly one newline.
 *
 * `serverBiblioHtml` is the same HTML produced by `formatBibliographyServer()`
 * for the HTML pipeline; it is reused (stripped to text) so the agent gets the
 * same reference list a human would see.
 *
 * NOTE(review): the newline collapse runs over the whole document, so it also
 * applies inside fenced code blocks.
 */
export function renderArticleMarkdown(
  json: Record<string, unknown>,
  meta: PublishMeta,
  citationData?: CitationData,
  serverBiblioHtml?: string,
): string {
  const ctx: RenderCtx = {
    footnotes: [],
    biblioHtml: serverBiblioHtml,
    citationData,
  };

  // The body is rendered first: walking it is what fills `ctx.footnotes`,
  // which the footnotes section below depends on.
  const body = renderBlocks((json as JSONNode).content, ctx);

  const pieces: string[] = [];
  for (const piece of [
    buildHeader(meta).trimEnd(),
    body,
    appendBibliographySection(ctx),
    appendFootnotesSection(ctx),
  ]) {
    if (piece) pieces.push(piece);
  }

  const joined = pieces.join("\n\n");
  return `${joined.replace(/\n{3,}/g, "\n\n").trim()}\n`;
}
backend/tests/markdown-renderer.test.ts ADDED
@@ -0,0 +1,371 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { describe, it, expect } from "vitest";
2
+ import {
3
+ renderArticleMarkdown,
4
+ stripHtmlToText,
5
+ } from "../src/publisher/markdown-renderer.js";
6
+ import type { PublishMeta, CitationData } from "../src/publisher/html-renderer.js";
7
+
8
+ const META: PublishMeta = {
9
+ title: "Test Article",
10
+ subtitle: "A subtitle",
11
+ description: "A short description for SEO",
12
+ authors: [
13
+ { name: "Alice", affiliationIndices: [1], affiliationNames: ["MIT"] },
14
+ { name: "Bob", affiliationIndices: [2], affiliationNames: ["HF"] },
15
+ ],
16
+ affiliations: [{ name: "MIT" }, { name: "HF" }],
17
+ date: "2026-04-30",
18
+ doi: "10.1234/abcd.efgh",
19
+ };
20
+
21
+ const doc = (content: any[]) => ({ type: "doc", content });
22
+
23
+ describe("renderArticleMarkdown - header", () => {
24
+ it("emits an llms.txt-style header with title, description, authors, date and DOI", () => {
25
+ const md = renderArticleMarkdown(doc([{ type: "paragraph" }]), META);
26
+ expect(md).toContain("# Test Article");
27
+ expect(md).toContain("> A short description for SEO");
28
+ expect(md).toContain("- **Authors**: Alice, Bob");
29
+ expect(md).toContain("- **Published**: 2026-04-30");
30
+ expect(md).toContain("- **DOI**: https://doi.org/10.1234/abcd.efgh");
31
+ expect(md).toContain("---");
32
+ });
33
+
34
+ it("falls back to subtitle when description is empty", () => {
35
+ const md = renderArticleMarkdown(
36
+ doc([{ type: "paragraph" }]),
37
+ { ...META, description: "" },
38
+ );
39
+ expect(md).toContain("> A subtitle");
40
+ });
41
+
42
+ it("collapses multi-line titles", () => {
43
+ const md = renderArticleMarkdown(
44
+ doc([{ type: "paragraph" }]),
45
+ { ...META, title: "Line one\\nLine two" },
46
+ );
47
+ expect(md).toContain("# Line one Line two");
48
+ expect(md).not.toContain("\\n");
49
+ });
50
+ });
51
+
52
/**
 * Block-level node coverage for `renderArticleMarkdown`: headings, inline
 * marks, bullet/ordered lists, fenced code, math and tables.
 *
 * Every case builds a small TipTap-style JSON tree with `doc(...)`, renders it
 * with the shared `META` fixture and checks the returned Markdown string for
 * the expected fragments.
 */
describe("renderArticleMarkdown - block nodes", () => {
  it("renders headings with the correct markdown level", () => {
    const article = doc([
      { type: "heading", attrs: { level: 2 }, content: [{ type: "text", text: "Hello" }] },
      { type: "heading", attrs: { level: 3 }, content: [{ type: "text", text: "Sub" }] },
    ]);

    const rendered = renderArticleMarkdown(article, META);

    expect(rendered).toContain("## Hello");
    expect(rendered).toContain("### Sub");
  });

  it("applies bold/italic/code/link marks", () => {
    // One paragraph with four marked text runs separated by plain spaces.
    const article = doc([
      {
        type: "paragraph",
        content: [
          { type: "text", text: "bold", marks: [{ type: "bold" }] },
          { type: "text", text: " " },
          { type: "text", text: "italic", marks: [{ type: "italic" }] },
          { type: "text", text: " " },
          { type: "text", text: "code", marks: [{ type: "code" }] },
          { type: "text", text: " " },
          {
            type: "text",
            text: "link",
            marks: [{ type: "link", attrs: { href: "https://example.com" } }],
          },
        ],
      },
    ]);

    const rendered = renderArticleMarkdown(article, META);

    expect(rendered).toContain("**bold**");
    expect(rendered).toContain("*italic*");
    expect(rendered).toContain("`code`");
    expect(rendered).toContain("[link](https://example.com)");
  });

  it("renders bullet and ordered lists", () => {
    const article = doc([
      {
        type: "bulletList",
        content: [
          { type: "listItem", content: [{ type: "paragraph", content: [{ type: "text", text: "one" }] }] },
          { type: "listItem", content: [{ type: "paragraph", content: [{ type: "text", text: "two" }] }] },
        ],
      },
      {
        type: "orderedList",
        content: [
          { type: "listItem", content: [{ type: "paragraph", content: [{ type: "text", text: "first" }] }] },
          { type: "listItem", content: [{ type: "paragraph", content: [{ type: "text", text: "second" }] }] },
        ],
      },
    ]);

    const rendered = renderArticleMarkdown(article, META);

    expect(rendered).toContain("- one");
    expect(rendered).toContain("- two");
    expect(rendered).toContain("1. first");
    expect(rendered).toContain("2. second");
  });

  it("renders code blocks with language fence", () => {
    const article = doc([
      {
        type: "codeBlock",
        attrs: { language: "ts" },
        content: [{ type: "text", text: "const x = 1;" }],
      },
    ]);

    const rendered = renderArticleMarkdown(article, META);

    expect(rendered).toContain("```ts");
    expect(rendered).toContain("const x = 1;");
    // A bare toContain("```") is already satisfied by the opening "```ts", so
    // it cannot detect a missing closing fence. Require the opening fence, then
    // the code, then another fence starting a later line.
    expect(rendered).toMatch(/```ts\r?\n[\s\S]*?const x = 1;[\s\S]*?\r?\n```/);
  });

  it("renders inline and block math", () => {
    const article = doc([
      {
        type: "paragraph",
        content: [
          { type: "text", text: "Energy: " },
          { type: "inlineMath", attrs: { latex: "E = mc^2" } },
        ],
      },
      { type: "blockMath", attrs: { latex: "\\int_0^1 x dx" } },
    ]);

    const rendered = renderArticleMarkdown(article, META);

    // Inline math keeps single-dollar delimiters; block math is expected with
    // the `$$` delimiters on their own lines around the LaTeX source.
    expect(rendered).toContain("$E = mc^2$");
    expect(rendered).toContain("$$\n\\int_0^1 x dx\n$$");
  });

  it("renders tables with a header row separator", () => {
    // Two columns: one header row followed by one body row.
    const article = doc([
      {
        type: "table",
        content: [
          {
            type: "tableRow",
            content: [
              { type: "tableHeader", content: [{ type: "paragraph", content: [{ type: "text", text: "Col A" }] }] },
              { type: "tableHeader", content: [{ type: "paragraph", content: [{ type: "text", text: "Col B" }] }] },
            ],
          },
          {
            type: "tableRow",
            content: [
              { type: "tableCell", content: [{ type: "paragraph", content: [{ type: "text", text: "1" }] }] },
              { type: "tableCell", content: [{ type: "paragraph", content: [{ type: "text", text: "2" }] }] },
            ],
          },
        ],
      },
    ]);

    const rendered = renderArticleMarkdown(article, META);

    expect(rendered).toContain("| Col A | Col B |");
    expect(rendered).toContain("| --- | --- |");
    expect(rendered).toContain("| 1 | 2 |");
  });
});
183
+
184
/**
 * Custom editor components rendered through `renderArticleMarkdown`:
 * HtmlEmbed, Note, Accordion, QuoteBlock, HfUser, Mermaid and the Wide
 * layout wrapper. Each case renders a single node with the shared `META`
 * fixture and inspects the resulting Markdown string.
 */
describe("renderArticleMarkdown - custom components", () => {
  // NOTE(review): the case name mentions "title and src", but only the
  // title-based placeholder is asserted here; the src path is not exercised.
  it("collapses HtmlEmbed to a single inline placeholder with title and src", () => {
    const embedOnly = doc([
      {
        type: "htmlEmbed",
        attrs: { src: "d3-chart.html", title: "Citations over time", desc: "" },
      },
    ]);

    const rendered = renderArticleMarkdown(embedOnly, META);

    expect(rendered).toContain("*[Interactive visualization: Citations over time]*");
    expect(rendered).not.toContain("<iframe");
  });

  it("renders Note as a blockquote", () => {
    const noteOnly = doc([
      {
        type: "note",
        content: [
          { type: "paragraph", content: [{ type: "text", text: "Heads up." }] },
        ],
      },
    ]);

    const rendered = renderArticleMarkdown(noteOnly, META);

    expect(rendered).toContain("> Heads up.");
  });

  it("renders Accordion with bold title and inner content", () => {
    const accordionOnly = doc([
      {
        type: "accordion",
        attrs: { title: "More details" },
        content: [
          { type: "paragraph", content: [{ type: "text", text: "Inside." }] },
        ],
      },
    ]);

    const rendered = renderArticleMarkdown(accordionOnly, META);

    expect(rendered).toContain("**More details**");
    expect(rendered).toContain("Inside.");
  });

  it("renders QuoteBlock with attribution", () => {
    const quoteOnly = doc([
      {
        type: "quoteBlock",
        attrs: { author: "Ada Lovelace", source: "Notes" },
        content: [
          { type: "paragraph", content: [{ type: "text", text: "The future is open." }] },
        ],
      },
    ]);

    const rendered = renderArticleMarkdown(quoteOnly, META);

    // The quote body and the "author, source" attribution are both expected
    // as blockquote lines.
    expect(rendered).toContain("> The future is open.");
    expect(rendered).toContain("> -- Ada Lovelace, Notes");
  });

  it("renders HfUser as a markdown link to huggingface.co/<u>", () => {
    const mentionOnly = doc([
      {
        type: "hfUser",
        attrs: { username: "tfrere", name: "Thibaud Frere" },
      },
    ]);

    const rendered = renderArticleMarkdown(mentionOnly, META);

    expect(rendered).toContain("[Thibaud Frere](https://huggingface.co/tfrere)");
  });

  it("renders Mermaid as a fenced ```mermaid block", () => {
    const diagramOnly = doc([
      {
        type: "mermaid",
        attrs: { code: "graph TD\n A --> B" },
      },
    ]);

    const rendered = renderArticleMarkdown(diagramOnly, META);

    expect(rendered).toContain("```mermaid");
    expect(rendered).toContain("graph TD");
    expect(rendered).toContain("A --> B");
  });

  // NOTE(review): only the `wide` wrapper is exercised below, although the
  // case name also lists FullWidth and Stack.
  it("unwraps Wide / FullWidth / Stack containers", () => {
    const wrapped = doc([
      {
        type: "wide",
        content: [
          { type: "paragraph", content: [{ type: "text", text: "Wide content." }] },
        ],
      },
    ]);

    const rendered = renderArticleMarkdown(wrapped, META);

    expect(rendered).toContain("Wide content.");
    expect(rendered).not.toContain("[wide]");
  });
});
292
+
293
/**
 * Citation, footnote and bibliography handling in `renderArticleMarkdown`.
 * The optional third argument carries citation data and the optional fourth
 * argument a pre-formatted HTML bibliography.
 */
describe("renderArticleMarkdown - citations and footnotes", () => {
  it("renders citations as keys for APA and as numeric tags for IEEE", () => {
    // The same single-citation document is rendered under both styles.
    const article = doc([
      {
        type: "paragraph",
        content: [
          { type: "text", text: "See " },
          { type: "citation", attrs: { key: "smith2024", label: "Smith (2024)" } },
          { type: "text", text: "." },
        ],
      },
    ]);

    // Builds a fresh one-entry citation set per call so that the two renders
    // never share array instances.
    const citationsIn = (style: CitationData["style"]): CitationData => ({
      entries: [{ id: "smith2024" }],
      orderedKeys: ["smith2024"],
      style,
    });
    const apa = citationsIn("apa");
    const ieee = citationsIn("ieee");

    expect(renderArticleMarkdown(article, META, apa)).toContain("Smith (2024)");
    expect(renderArticleMarkdown(article, META, ieee)).toContain("[1]");
  });

  it("collects footnotes and emits a footnotes section", () => {
    // Two inline footnotes in one paragraph: the test expects numbered
    // markers in the output plus a matching definitions section.
    const article = doc([
      {
        type: "paragraph",
        content: [
          { type: "text", text: "Body" },
          { type: "footnote", attrs: { content: "First note" } },
          { type: "text", text: " more " },
          { type: "footnote", attrs: { content: "Second note" } },
        ],
      },
    ]);

    const rendered = renderArticleMarkdown(article, META);

    expect(rendered).toContain("[^1]");
    expect(rendered).toContain("[^2]");
    expect(rendered).toContain("## Footnotes");
    expect(rendered).toContain("[^1]: First note");
    expect(rendered).toContain("[^2]: Second note");
  });

  it("appends a References section from the formatted bibliography", () => {
    const formattedBibliography = '<div class="csl-entry">Smith, J. (2024). <i>Test Paper</i>. Journal.</div>';
    const article = doc([{ type: "paragraph", content: [{ type: "text", text: "Body" }] }]);

    // No citation data is supplied; only the bibliography HTML is passed.
    const rendered = renderArticleMarkdown(article, META, undefined, formattedBibliography);

    expect(rendered).toContain("## References");
    expect(rendered).toContain("Smith, J. (2024).");
    expect(rendered).toContain("Test Paper");
    expect(rendered).not.toContain("<div");
  });
});
355
+
356
/**
 * Direct checks of the `stripHtmlToText` helper: anchor conversion, entity
 * decoding and block-tag handling.
 */
describe("stripHtmlToText", () => {
  it("converts <a href> to a markdown link", () => {
    const anchor = '<a href="https://example.com">click</a>';

    expect(stripHtmlToText(anchor)).toBe("[click](https://example.com)");
  });

  it("decodes common HTML entities", () => {
    const encoded = "Tom &amp; Jerry &lt;3";

    expect(stripHtmlToText(encoded)).toBe("Tom & Jerry <3");
  });

  it("collapses block tags into newlines and removes the rest", () => {
    const twoParagraphs = "<p>One.</p><p>Two.</p>";

    // The result is trimmed before comparing, so leading/trailing whitespace
    // produced by the helper is not part of this assertion.
    const plain = stripHtmlToText(twoParagraphs).trim();

    expect(plain).toBe("One.\nTwo.");
  });
});