feat(publisher): generate llms.txt Markdown twin for LLM agents
External LLMs/agents (Claude, Perplexity, ...) struggle with the
heavy published HTML: theme bootstrap, KaTeX/Mermaid runtimes,
~7 inlined CSS files, TOC chrome, and D3 charts wrapped in
<iframe srcdoc>. The static research-article-template already
ships an `llms.txt` (Astro build plugin); the editor's publisher
now does the same, walking the TipTap-JSON we already extract
for the HTML/PDF pipeline.
Pipeline addition (publisher/index.ts):
- After writing index.html, render the same JSON to Markdown via
the new markdown-renderer.ts and write `published/<doc>/llms.txt`
- Re-render with the public HF meta and include it in the dataset
upload commit (hf-storage.ts adds llms.txt to the file list)
Routing (create-app.ts):
- GET /llms.txt serves the published Markdown twin with
Content-Type: text/markdown; charset=utf-8 (404 before publish)
- GET /robots.txt advertises it via the `LLMs-Txt:` header,
mirroring the upstream template
Conventions in the renderer mirror the upstream Astro plugin:
- HtmlEmbed -> *[Interactive visualization: <title|src>]*
- Note/Sidenote -> blockquote
- QuoteBlock -> blockquote + author/source attribution
- Wide/FullWidth/Stack -> unwrap content
- Accordion -> bold title + content
- Mermaid -> ```mermaid fenced block
- HfUser -> [name](https://huggingface.co/<u>)
- Citations -> stored label or [key] (APA), or [N] (IEEE/Vancouver)
- Footnotes -> Pandoc [^N] + footnotes section
- Bibliography -> stripped to plain text in a References section
- Inline/block math kept as $...$ / $$...$$
22 new unit tests cover header, blocks, custom components,
citations/footnotes, and the HTML stripper helper. All 87
backend tests pass; tsc clean on backend and frontend.
Co-authored-by: Cursor <cursoragent@cursor.com>
- README.md +1 -1
- backend/src/create-app.ts +35 -0
- backend/src/hf-storage.ts +10 -1
- backend/src/publisher/index.ts +23 -0
- backend/src/publisher/markdown-renderer.ts +496 -0
- backend/tests/markdown-renderer.test.ts +371 -0
|
@@ -26,7 +26,7 @@ A collaborative, real-time editor for web-native scientific articles. It lets mu
|
|
| 26 |
- **Comments & discussion** anchored on any selection
|
| 27 |
- **Slash menu** (`/`) and drag/drop block handles, in the spirit of Notion
|
| 28 |
- **Click-to-edit frontmatter**: title, subtitle, authors, affiliations, links, banner color
|
| 29 |
-
- **Publishing pipeline**: one-click export to a standalone static HTML bundle, plus PDF generation (Puppeteer)
|
| 30 |
- **Persistence**:
|
| 31 |
- Local mode: documents stored on disk under `DATA_DIR`
|
| 32 |
- HF mode: documents pushed/pulled from a Hugging Face dataset via OAuth
|
|
|
|
| 26 |
- **Comments & discussion** anchored on any selection
|
| 27 |
- **Slash menu** (`/`) and drag/drop block handles, in the spirit of Notion
|
| 28 |
- **Click-to-edit frontmatter**: title, subtitle, authors, affiliations, links, banner color
|
| 29 |
+
- **Publishing pipeline**: one-click export to a standalone static HTML bundle, plus PDF generation (Puppeteer) and an `llms.txt` Markdown twin for LLM agents/crawlers (served at `/llms.txt`, advertised in `/robots.txt`)
|
| 30 |
- **Persistence**:
|
| 31 |
- Local mode: documents stored on disk under `DATA_DIR`
|
| 32 |
- HF mode: documents pushed/pulled from a Hugging Face dataset via OAuth
|
|
@@ -307,9 +307,44 @@ export function createApp() {
|
|
| 307 |
return join(DATA_DIR, "published", docName, "index.html");
|
| 308 |
}
|
| 309 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
if (existsSync(staticDir)) {
|
| 311 |
app.use(express.static(staticDir, { index: false }));
|
| 312 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
app.get("/editor", async (req, res) => {
|
| 314 |
if (oauthEnabled) {
|
| 315 |
const { resolveUser } = await import("./auth.js");
|
|
|
|
| 307 |
return join(DATA_DIR, "published", docName, "index.html");
|
| 308 |
}
|
| 309 |
|
| 310 |
+
function getPublishedAssetPath(docName: string, filename: string): string {
|
| 311 |
+
return join(DATA_DIR, "published", docName, filename);
|
| 312 |
+
}
|
| 313 |
+
|
| 314 |
if (existsSync(staticDir)) {
|
| 315 |
app.use(express.static(staticDir, { index: false }));
|
| 316 |
|
| 317 |
+
// ---- LLM-friendly endpoints --------------------------------------
|
| 318 |
+
// The publisher generates a Markdown twin of the article (`llms.txt`)
|
| 319 |
+
// following the https://llmstxt.org/ convention. We expose it at the
|
| 320 |
+
// Space root so external agents/crawlers (Claude, Perplexity, ...) can
|
| 321 |
+
// consume the article without having to parse the heavy HTML page.
|
| 322 |
+
// `robots.txt` advertises this endpoint.
|
| 323 |
+
|
| 324 |
+
app.get("/llms.txt", async (_req, res) => {
|
| 325 |
+
const llmsPath = getPublishedAssetPath(DEFAULT_DOC_NAME, "llms.txt");
|
| 326 |
+
if (!existsSync(llmsPath)) {
|
| 327 |
+
res.status(404).type("text/plain").send("Not yet published");
|
| 328 |
+
return;
|
| 329 |
+
}
|
| 330 |
+
res.type("text/markdown; charset=utf-8");
|
| 331 |
+
res.sendFile(llmsPath);
|
| 332 |
+
});
|
| 333 |
+
|
| 334 |
+
app.get("/robots.txt", (_req, res) => {
|
| 335 |
+
res
|
| 336 |
+
.type("text/plain; charset=utf-8")
|
| 337 |
+
.send(
|
| 338 |
+
[
|
| 339 |
+
"User-agent: *",
|
| 340 |
+
"Allow: /",
|
| 341 |
+
"",
|
| 342 |
+
"LLMs-Txt: /llms.txt",
|
| 343 |
+
"",
|
| 344 |
+
].join("\n"),
|
| 345 |
+
);
|
| 346 |
+
});
|
| 347 |
+
|
| 348 |
app.get("/editor", async (req, res) => {
|
| 349 |
if (oauthEnabled) {
|
| 350 |
const { resolveUser } = await import("./auth.js");
|
|
@@ -157,7 +157,7 @@ export async function pullPublishedAssets(
|
|
| 157 |
|
| 158 |
const safeName = sanitizeName(docName);
|
| 159 |
const base = `published/${safeName}`;
|
| 160 |
-
const files = ["index.html", "article.pdf", "thumb.jpg", "meta.json"];
|
| 161 |
|
| 162 |
const { mkdirSync, writeFileSync } = await import("fs");
|
| 163 |
const { join } = await import("path");
|
|
@@ -226,6 +226,7 @@ interface PublishedPayload {
|
|
| 226 |
pdf: Buffer | null;
|
| 227 |
thumbnail: Buffer | null;
|
| 228 |
meta: { title: string; description: string; authors: string[]; date: string; [key: string]: unknown };
|
|
|
|
| 229 |
}
|
| 230 |
|
| 231 |
export async function uploadPublishedAssets(
|
|
@@ -274,6 +275,14 @@ export async function uploadPublishedAssets(
|
|
| 274 |
thumbUrl = `${baseUrl}/thumb.jpg`;
|
| 275 |
}
|
| 276 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
await commit({
|
| 278 |
repo,
|
| 279 |
operations,
|
|
|
|
| 157 |
|
| 158 |
const safeName = sanitizeName(docName);
|
| 159 |
const base = `published/${safeName}`;
|
| 160 |
+
const files = ["index.html", "article.pdf", "thumb.jpg", "meta.json", "llms.txt"];
|
| 161 |
|
| 162 |
const { mkdirSync, writeFileSync } = await import("fs");
|
| 163 |
const { join } = await import("path");
|
|
|
|
| 226 |
pdf: Buffer | null;
|
| 227 |
thumbnail: Buffer | null;
|
| 228 |
meta: { title: string; description: string; authors: string[]; date: string; [key: string]: unknown };
|
| 229 |
+
llmsTxt?: string;
|
| 230 |
}
|
| 231 |
|
| 232 |
export async function uploadPublishedAssets(
|
|
|
|
| 275 |
thumbUrl = `${baseUrl}/thumb.jpg`;
|
| 276 |
}
|
| 277 |
|
| 278 |
+
if (payload.llmsTxt) {
|
| 279 |
+
operations.push({
|
| 280 |
+
operation: "addOrUpdate",
|
| 281 |
+
path: `${base}/llms.txt`,
|
| 282 |
+
content: new Blob([payload.llmsTxt], { type: "text/markdown; charset=utf-8" }),
|
| 283 |
+
});
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
await commit({
|
| 287 |
repo,
|
| 288 |
operations,
|
|
@@ -11,6 +11,7 @@ import * as Y from "yjs";
|
|
| 11 |
import { getDataDir, docPath, sanitizeName } from "../utils.js";
|
| 12 |
import { TiptapTransformer } from "@hocuspocus/transformer";
|
| 13 |
import { renderArticleHTML, type PublishMeta, type CitationData } from "./html-renderer.js";
|
|
|
|
| 14 |
import { formatBibliographyServer } from "./format-bibliography.js";
|
| 15 |
import { getServerExtensions } from "./extensions.js";
|
| 16 |
import { isPdfEnabled, generatePdfAndThumbnail } from "./pdf-generator.js";
|
|
@@ -435,6 +436,18 @@ export async function publishDocument(
|
|
| 435 |
fsMkdir(publishDir, { recursive: true });
|
| 436 |
fsWrite(join(publishDir, "index.html"), localHtml);
|
| 437 |
fsWrite(join(publishDir, "meta.json"), JSON.stringify(localMeta, null, 2));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 438 |
console.log("[publish] index.html written to", publishDir);
|
| 439 |
|
| 440 |
// --- Step 2: generate PDF + thumbnail from the served URL ---------------
|
|
@@ -487,11 +500,21 @@ export async function publishDocument(
|
|
| 487 |
if (pdf) hfMeta.pdfUrl = getPublishedAssetUrl(docName, "article.pdf");
|
| 488 |
const hfHtml = await renderArticleHTML(json, hfMeta, css, citationData, biblioHtml, embeds);
|
| 489 |
try {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 490 |
const urls = await uploadPublishedAssets(docName, {
|
| 491 |
html: hfHtml,
|
| 492 |
pdf,
|
| 493 |
thumbnail,
|
| 494 |
meta: hfMeta as any,
|
|
|
|
| 495 |
}, token);
|
| 496 |
return { ...urls, success: true };
|
| 497 |
} catch (err) {
|
|
|
|
| 11 |
import { getDataDir, docPath, sanitizeName } from "../utils.js";
|
| 12 |
import { TiptapTransformer } from "@hocuspocus/transformer";
|
| 13 |
import { renderArticleHTML, type PublishMeta, type CitationData } from "./html-renderer.js";
|
| 14 |
+
import { renderArticleMarkdown } from "./markdown-renderer.js";
|
| 15 |
import { formatBibliographyServer } from "./format-bibliography.js";
|
| 16 |
import { getServerExtensions } from "./extensions.js";
|
| 17 |
import { isPdfEnabled, generatePdfAndThumbnail } from "./pdf-generator.js";
|
|
|
|
| 436 |
fsMkdir(publishDir, { recursive: true });
|
| 437 |
fsWrite(join(publishDir, "index.html"), localHtml);
|
| 438 |
fsWrite(join(publishDir, "meta.json"), JSON.stringify(localMeta, null, 2));
|
| 439 |
+
|
| 440 |
+
// Co-located `llms.txt` (https://llmstxt.org/) for LLM agents and
|
| 441 |
+
// crawlers that struggle with the heavy published HTML. Cheap to
|
| 442 |
+
// generate (one walk over the same TipTap-JSON, no Playwright) so we
|
| 443 |
+
// always write it; serving is gated separately by `create-app.ts`.
|
| 444 |
+
let llmsTxt = "";
|
| 445 |
+
try {
|
| 446 |
+
llmsTxt = renderArticleMarkdown(json, localMeta, citationData, biblioHtml);
|
| 447 |
+
fsWrite(join(publishDir, "llms.txt"), llmsTxt);
|
| 448 |
+
} catch (err) {
|
| 449 |
+
console.warn("[publish] llms.txt generation failed:", (err as Error).message);
|
| 450 |
+
}
|
| 451 |
console.log("[publish] index.html written to", publishDir);
|
| 452 |
|
| 453 |
// --- Step 2: generate PDF + thumbnail from the served URL ---------------
|
|
|
|
| 500 |
if (pdf) hfMeta.pdfUrl = getPublishedAssetUrl(docName, "article.pdf");
|
| 501 |
const hfHtml = await renderArticleHTML(json, hfMeta, css, citationData, biblioHtml, embeds);
|
| 502 |
try {
|
| 503 |
+
// Re-render llms.txt with the HF meta so internal links (DOI, etc.)
|
| 504 |
+
// match what's in the uploaded HTML. Cheap, deterministic, and we
|
| 505 |
+
// already have everything in memory.
|
| 506 |
+
let hfLlmsTxt: string | undefined = undefined;
|
| 507 |
+
try {
|
| 508 |
+
hfLlmsTxt = renderArticleMarkdown(json, hfMeta, citationData, biblioHtml);
|
| 509 |
+
} catch (err) {
|
| 510 |
+
console.warn("[publish] HF llms.txt render failed:", (err as Error).message);
|
| 511 |
+
}
|
| 512 |
const urls = await uploadPublishedAssets(docName, {
|
| 513 |
html: hfHtml,
|
| 514 |
pdf,
|
| 515 |
thumbnail,
|
| 516 |
meta: hfMeta as any,
|
| 517 |
+
llmsTxt: hfLlmsTxt,
|
| 518 |
}, token);
|
| 519 |
return { ...urls, success: true };
|
| 520 |
} catch (err) {
|
|
@@ -0,0 +1,496 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Server-side Markdown renderer for the publisher pipeline.
|
| 3 |
+
*
|
| 4 |
+
* Walks the same TipTap-JSON document used by `html-renderer.ts` and
|
| 5 |
+
* produces a clean Markdown file conforming to the `llms.txt` convention
|
| 6 |
+
* (https://llmstxt.org/). The output is meant to be consumed by LLM
|
| 7 |
+
* agents and crawlers that struggle with the heavy published HTML
|
| 8 |
+
* (theme bootstrap, KaTeX/Mermaid runtimes, inlined CSS, iframe-wrapped
|
| 9 |
+
* D3 charts, ...).
|
| 10 |
+
*
|
| 11 |
+
* Conventions mirror the upstream `research-article-template` Astro
|
| 12 |
+
* plugin (`app/plugins/astro/generate-llms-txt.mjs`):
|
| 13 |
+
*
|
| 14 |
+
* - `<HtmlEmbed src="..." />` -> `*[Interactive visualization: <title - desc, else src>]*`
|
| 15 |
+
* - `<Note>...</Note>` -> blockquote
|
| 16 |
+
* - `<Quote author=...>` -> blockquote + attribution
|
| 17 |
+
* - `<Wide>`/`<FullWidth>` -> unwrap content
|
| 18 |
+
* - `<Sidenote>` -> blockquote (TipTap-JSON has no `slot="aside"`)
|
| 19 |
+
* - `<Reference caption=...>` -> content + caption
|
| 20 |
+
* - `<Accordion title=...>` -> bold title + content
|
| 21 |
+
* - `<Mermaid code=...>` -> fenced ```mermaid``` code block
|
| 22 |
+
* - `<HfUser username=...>` -> `[@u](https://huggingface.co/u)`
|
| 23 |
+
* - `<Citation key=...>` -> `[key]` (or `[N]` for IEEE)
|
| 24 |
+
* - `<Footnote content=...>` -> Pandoc `[^N]` reference + footnotes section
|
| 25 |
+
* - inline / block math -> `$...$` / `$$...$$`
|
| 26 |
+
*
|
| 27 |
+
* Output shape:
|
| 28 |
+
* # <title>
|
| 29 |
+
*
|
| 30 |
+
* > <subtitle / description>
|
| 31 |
+
*
|
| 32 |
+
* - **Authors**: ...
|
| 33 |
+
* - **Published**: ...
|
| 34 |
+
* - **DOI**: ...
|
| 35 |
+
*
|
| 36 |
+
* ---
|
| 37 |
+
*
|
| 38 |
+
* <body markdown>
|
| 39 |
+
*
|
| 40 |
+
* ## References
|
| 41 |
+
* ...
|
| 42 |
+
*
|
| 43 |
+
* ## Footnotes
|
| 44 |
+
* ...
|
| 45 |
+
*/
|
| 46 |
+
|
| 47 |
+
import type { PublishMeta, CitationData } from "./html-renderer.js";
|
| 48 |
+
|
| 49 |
+
/**
 * Loosely-typed TipTap-JSON node as walked by this renderer.
 * Every field is optional because the renderer tolerates partial nodes.
 */
type JSONNode = {
  /** Node type name switched on by the renderers (e.g. "paragraph", "text"). */
  type?: string;
  /** Node attributes; their shape depends on `type`. */
  attrs?: Record<string, any>;
  /** Inline marks (bold, italic, link, ...) attached to a text node. */
  marks?: { type: string; attrs?: Record<string, any> }[];
  /** Literal text of a text node. */
  text?: string;
  /** Child nodes. */
  content?: JSONNode[];
};
|
| 56 |
+
|
| 57 |
+
/** Mutable state threaded through one Markdown render pass. */
interface RenderCtx {
  /** Citation style and key ordering used to label inline citations. */
  citationData?: CitationData;

  /** Pre-formatted bibliography (HTML from citation-js) - we strip tags. */
  biblioHtml?: string;

  /** Footnote texts collected during the walk, emitted at the end. */
  footnotes: string[];
}
|
| 64 |
+
|
| 65 |
+
// ---------------------------------------------------------------------------
|
| 66 |
+
// Inline rendering (text + marks + inline atoms)
|
| 67 |
+
// ---------------------------------------------------------------------------
|
| 68 |
+
|
| 69 |
+
/**
 * Wrap `text` in the Markdown syntax for each TipTap mark attached to it.
 *
 * Supported marks: `bold`, `italic`, `strike`, `code`, `link`. Unknown marks
 * are ignored. A `code` mark is always applied first, whatever its position
 * in `marks`, so emphasis/link delimiters end up around the code span
 * instead of being displayed literally inside it.
 *
 * @param text  Raw text of the text node.
 * @param marks Marks of the text node (may be undefined or empty).
 * @returns The text with Markdown delimiters applied.
 */
function applyMarks(text: string, marks: JSONNode["marks"]): string {
  if (!marks?.length) return text;

  // CommonMark only recognises emphasis when the delimiters hug the text
  // (`**bold** `, not `**bold **`), so surrounding whitespace is moved
  // outside the delimiters. Whitespace-only text is left untouched.
  const wrap = (value: string, delimiter: string): string => {
    const match = /^(\s*)([\s\S]*?)(\s*)$/.exec(value);
    const lead = match?.[1] ?? "";
    const core = match?.[2] ?? value;
    const trail = match?.[3] ?? "";
    return core ? `${lead}${delimiter}${core}${delimiter}${trail}` : value;
  };

  // A code span must be fenced by more backticks than the longest backtick
  // run it contains; the padding spaces are stripped by Markdown renderers.
  const codeSpan = (value: string): string => {
    const runs = value.match(/`+/g) ?? [];
    const longest = runs.reduce((max, run) => Math.max(max, run.length), 0);
    if (longest === 0) return `\`${value}\``;
    const fence = "`".repeat(longest + 1);
    return `${fence} ${value} ${fence}`;
  };

  let out = marks.some((mark) => mark.type === "code") ? codeSpan(text) : text;

  for (const mark of marks) {
    switch (mark.type) {
      case "bold":
        out = wrap(out, "**");
        break;
      case "italic":
        out = wrap(out, "*");
        break;
      case "strike":
        out = wrap(out, "~~");
        break;
      case "link": {
        const href = mark.attrs?.href || "";
        // A link without a target degrades to its plain text.
        if (href) out = `[${out}](${href})`;
        break;
      }
      default:
        // "code" was handled up front; any other mark has no Markdown form.
        break;
    }
  }
  return out;
}
|
| 97 |
+
|
| 98 |
+
/**
 * Resolve the inline label for a citation key.
 *
 * Numeric styles ("ieee", "vancouver") yield `[N]`, where N is the 1-based
 * position of `key` in `ctx.citationData.orderedKeys`. In every other case
 * (no citation data, another style, or an unknown key) the result is
 * `fallbackLabel` when it is non-empty, otherwise `[key]`.
 */
function getCitationLabel(key: string, ctx: RenderCtx, fallbackLabel?: string): string {
  const data = ctx.citationData;

  if (data && (data.style === "ieee" || data.style === "vancouver")) {
    const position = data.orderedKeys.indexOf(key);
    if (position !== -1) {
      return `[${position + 1}]`;
    }
  }

  return fallbackLabel ? fallbackLabel : `[${key}]`;
}
|
| 107 |
+
|
| 108 |
+
/**
 * Render a sequence of inline nodes and concatenate the results.
 * Returns an empty string when `nodes` is undefined.
 */
function renderInline(nodes: JSONNode[] | undefined, ctx: RenderCtx): string {
  return nodes ? nodes.map((child) => renderInlineNode(child, ctx)).join("") : "";
}
|
| 116 |
+
|
| 117 |
+
/**
 * Render a single inline node to Markdown.
 *
 * - `text`       -> text with its marks applied
 * - `hardBreak`  -> Markdown hard line break
 * - `inlineMath` -> `$latex$` (empty when there is no LaTeX)
 * - `citation`   -> label from `getCitationLabel` (empty when there is no key)
 * - `glossary`   -> the bare term
 * - `footnote`   -> `[^N]`; the footnote text is pushed onto `ctx.footnotes`
 * - `image`      -> Markdown image, or just the alt text when `src` is missing
 * - anything else falls back to rendering its inline children
 *
 * @param node Inline TipTap-JSON node.
 * @param ctx  Render context; mutated by footnotes.
 */
function renderInlineNode(node: JSONNode, ctx: RenderCtx): string {
  switch (node.type) {
    case "text":
      return applyMarks(node.text || "", node.marks);
    case "hardBreak":
      // Markdown needs at least two trailing spaces before the newline for
      // a hard break; a single space is only a soft line break.
      return "  \n";
    case "inlineMath": {
      const latex = node.attrs?.latex || "";
      return latex ? `$${latex}$` : "";
    }
    case "citation": {
      const key = String(node.attrs?.key || "");
      if (!key) return "";
      return getCitationLabel(key, ctx, node.attrs?.label);
    }
    case "glossary": {
      const term = String(node.attrs?.term || "");
      return term;
    }
    case "footnote": {
      const content = String(node.attrs?.content || "");
      ctx.footnotes.push(content);
      // Footnote markers are 1-based and follow document order.
      return `[^${ctx.footnotes.length}]`;
    }
    case "image": {
      const src = String(node.attrs?.src || "");
      const alt = String(node.attrs?.alt || "");
      // The optional title goes inside the parentheses, double-quoted;
      // embedded double quotes are escaped so they do not end it early.
      const title = node.attrs?.title
        ? ` "${String(node.attrs.title).replace(/"/g, '\\"')}"`
        : "";
      const target = `[${alt}](${src}${title})`;
      return src ? `!${target}` : alt;
    }
    default:
      // Unknown inline: fall back to its text content if any.
      return renderInline(node.content, ctx);
  }
}
|
| 152 |
+
|
| 153 |
+
// ---------------------------------------------------------------------------
|
| 154 |
+
// Block rendering
|
| 155 |
+
// ---------------------------------------------------------------------------
|
| 156 |
+
|
| 157 |
+
/**
 * Render a sequence of block nodes, dropping blocks that render to an empty
 * string and separating the remaining ones with a blank line.
 */
function renderBlocks(nodes: JSONNode[] | undefined, ctx: RenderCtx): string {
  if (!nodes || nodes.length === 0) return "";

  return nodes
    .map((block) => renderBlock(block, ctx))
    .filter((chunk) => chunk.length > 0)
    .join("\n\n");
}
|
| 166 |
+
|
| 167 |
+
/** Prefix every line of `markdown` with `> ` (a bare `>` for empty lines). */
function toBlockquote(markdown: string): string {
  return markdown
    .split("\n")
    .map((line) => (line.length ? `> ${line}` : ">"))
    .join("\n");
}

/**
 * Wrap `code` in a backtick fence tagged with `info`. The fence is made
 * longer than any run of three or more backticks inside `code`, so embedded
 * fences cannot close the block early. Plain code gets the usual ``` fence.
 */
function fencedCodeBlock(code: string, info: string): string {
  const runs = code.match(/`{3,}/g) ?? [];
  const longest = runs.reduce((max, run) => Math.max(max, run.length), 2);
  const fence = "`".repeat(longest + 1);
  return `${fence}${info}\n${code}\n${fence}`;
}

/**
 * Render a single block-level TipTap-JSON node to Markdown.
 *
 * Standard blocks map to their Markdown equivalents. Custom components are
 * flattened: note/sidenote become blockquotes, quoteBlock a blockquote with
 * attribution, wide/fullWidth/stack/stackColumn are unwrapped, accordion
 * becomes a bold title followed by its content, htmlEmbed a textual
 * placeholder, mermaid a fenced block. `bibliography` renders to nothing
 * here because the references section is emitted after the walk. Unknown
 * blocks fall back to rendering their children.
 *
 * @param node Block node to render.
 * @param ctx  Render context (mutated when footnotes are encountered).
 * @returns Markdown for the block; may be an empty string.
 */
function renderBlock(node: JSONNode, ctx: RenderCtx): string {
  switch (node.type) {
    case "doc":
      return renderBlocks(node.content, ctx);

    case "paragraph":
      return renderInline(node.content, ctx).trim();

    case "heading": {
      // Clamp to the six heading levels Markdown supports; default to 1.
      const level = Math.min(Math.max(Number(node.attrs?.level) || 1, 1), 6);
      const inner = renderInline(node.content, ctx).trim();
      return `${"#".repeat(level)} ${inner}`;
    }

    case "blockquote":
    case "note":
    case "sidenote":
      // No `slot="aside"` in TipTap-JSON: sidenotes render as a blockquote,
      // exactly like notes and regular blockquotes.
      return toBlockquote(renderBlocks(node.content, ctx));

    case "horizontalRule":
      return "---";

    case "codeBlock": {
      const lang = String(node.attrs?.language || node.attrs?.lang || "");
      const code = (node.content || []).map((c) => c.text || "").join("");
      return fencedCodeBlock(code, lang);
    }

    case "bulletList":
      return renderList(node, ctx, "-");

    case "orderedList":
      return renderList(node, ctx, "1.");

    case "listItem":
      // Should normally be reached via renderList, but if encountered
      // standalone we just render its blocks.
      return renderBlocks(node.content, ctx);

    case "blockMath": {
      const latex = String(node.attrs?.latex || "").trim();
      return latex ? `$$\n${latex}\n$$` : "";
    }

    case "table":
      return renderTable(node, ctx);

    // --- Custom block components ---

    case "accordion": {
      const title = String(node.attrs?.title || "Details");
      const inner = renderBlocks(node.content, ctx);
      return `**${title}**\n\n${inner}`;
    }

    case "quoteBlock": {
      const quoted = toBlockquote(renderBlocks(node.content, ctx));
      const author = String(node.attrs?.author || "").trim();
      const source = String(node.attrs?.source || "").trim();
      const attribution = [author, source].filter(Boolean).join(", ");
      return attribution ? `${quoted}\n>\n> -- ${attribution}` : quoted;
    }

    case "wide":
    case "fullWidth":
    case "stack":
    case "stackColumn":
      // Pure layout wrappers: keep only their content.
      return renderBlocks(node.content, ctx);

    case "reference": {
      const inner = renderBlocks(node.content, ctx);
      const caption = String(node.attrs?.caption || "").trim();
      return caption ? `${inner}\n\n*Figure: ${caption}*` : inner;
    }

    case "htmlEmbed": {
      const src = String(node.attrs?.src || "").trim();
      const title = String(node.attrs?.title || "").trim();
      const desc = String(node.attrs?.desc || "").trim();
      // Prefer the human-readable title/description; fall back to the
      // source path, then to a generic word.
      const labelParts = [title, desc].filter(Boolean);
      const label = labelParts.length ? labelParts.join(" - ") : src || "embed";
      return `*[Interactive visualization: ${label}]*`;
    }

    case "hfUser": {
      const username = String(node.attrs?.username || "").trim();
      if (!username) return "";
      const url =
        String(node.attrs?.url || "").trim() ||
        `https://huggingface.co/${encodeURIComponent(username)}`;
      const name = String(node.attrs?.name || "").trim() || `@${username}`;
      return `[${name}](${url})`;
    }

    case "rawHtml": {
      const html = String(node.attrs?.html || "");
      return stripHtmlToText(html).trim();
    }

    case "mermaid": {
      const code = String(node.attrs?.code || "").trim();
      return code ? fencedCodeBlock(code, "mermaid") : "";
    }

    case "bibliography":
      // Emitted by `appendBibliographySection` from the post-walk step.
      return "";

    default:
      // Unknown block: fall back to its content, or empty.
      return renderBlocks(node.content, ctx);
  }
}
|
| 311 |
+
|
| 312 |
+
/**
 * Render a bullet or ordered list node.
 *
 * Each item's blocks are rendered, the first line is prefixed with the list
 * marker, and the following lines are indented to the marker's content
 * column so they stay inside the item. Blank lines are emitted empty rather
 * than as whitespace-only lines.
 *
 * @param node   The `bulletList` / `orderedList` node.
 * @param ctx    Render context.
 * @param marker `"1."` for ordered lists (items are numbered sequentially);
 *               any other value is used verbatim as the bullet.
 */
function renderList(
  node: JSONNode,
  ctx: RenderCtx,
  marker: string,
): string {
  const items = node.content || [];
  const ordered = marker === "1.";

  // Honour a custom first number when the node carries one.
  // NOTE(review): assumes the editor schema keeps TipTap's `start` attribute
  // on orderedList nodes - confirm. Absent or invalid values start at 1.
  const rawStart: unknown = node.attrs?.start;
  const firstNumber =
    ordered && typeof rawStart === "number" && Number.isInteger(rawStart) && rawStart >= 0
      ? rawStart
      : 1;

  const lines: string[] = [];
  items.forEach((item, idx) => {
    const prefix = ordered ? `${firstNumber + idx}.` : marker;
    // Continuation lines align with the text that follows "<prefix> ".
    const indent = " ".repeat(prefix.length + 1);
    const [head = "", ...rest] = renderBlocks(item.content, ctx).split("\n");
    lines.push(`${prefix} ${head}`);
    for (const line of rest) {
      lines.push(line ? `${indent}${line}` : "");
    }
  });
  return lines.join("\n");
}
|
| 331 |
+
|
| 332 |
+
/**
 * Render a table node as a pipe-delimited Markdown table.
 *
 * Cell content is flattened to a single line with `|` escaped. The first row
 * containing a `tableHeader` cell is followed by the `---` separator; when
 * no row has one, an empty header row is synthesised so the table is still
 * valid Markdown. Short rows are padded with empty cells.
 */
function renderTable(node: JSONNode, ctx: RenderCtx): string {
  const tableRows = node.content || [];
  if (tableRows.length === 0) return "";

  // One-line, pipe-safe text for a single cell.
  const flattenCell = (cell: JSONNode): string =>
    renderBlocks(cell.content, ctx)
      .replace(/\n+/g, " ")
      .replace(/\|/g, "\\|")
      .trim();

  const grid: string[][] = tableRows.map((row) =>
    (row.content || []).map((cell) => flattenCell(cell)),
  );
  const headerRowIndex = tableRows.findIndex((row) =>
    (row.content || []).some((cell) => cell.type === "tableHeader"),
  );

  // Pad every row to the widest one.
  const colCount = Math.max(...grid.map((cells) => cells.length));
  for (const cells of grid) {
    while (cells.length < colCount) cells.push("");
  }

  const toLine = (cells: string[]): string => `| ${cells.join(" | ")} |`;
  const separator = toLine(new Array(colCount).fill("---"));

  const lines: string[] = [];
  if (headerRowIndex === -1) {
    // No explicit header row: start with a blank one plus the separator.
    lines.push(toLine(new Array(colCount).fill(" ")), separator);
    for (const cells of grid) lines.push(toLine(cells));
  } else {
    grid.forEach((cells, r) => {
      lines.push(toLine(cells));
      if (r === headerRowIndex) lines.push(separator);
    });
  }

  return lines.join("\n");
}
|
| 379 |
+
|
| 380 |
+
// ---------------------------------------------------------------------------
|
| 381 |
+
// Header (frontmatter) + bibliography + footnotes
|
| 382 |
+
// ---------------------------------------------------------------------------
|
| 383 |
+
|
| 384 |
+
/**
 * Build the frontmatter header of the Markdown document:
 * an H1 title, an optional blockquoted description (falling back to the
 * subtitle), an optional bullet list with authors / publication date / DOI,
 * and a closing `---` rule.
 *
 * @param meta Publication metadata.
 * @returns The header, ending with `---` and a newline.
 */
function buildHeader(meta: PublishMeta): string {
  // Titles may contain real newlines or the literal two characters `\n`;
  // both are collapsed so the heading stays on one line.
  const titleClean = (meta.title || "Untitled")
    .replace(/\\n/g, " ")
    .replace(/\n/g, " ")
    .replace(/\s{2,}/g, " ")
    .trim();

  const parts = [`# ${titleClean}\n`];

  const desc = (meta.description || meta.subtitle || "").trim();
  if (desc) {
    // Quote every line so a multi-line description stays inside the
    // blockquote instead of leaking into the body.
    const quoted = desc
      .split(/\r?\n/)
      .map((line) => line.trim())
      .map((line) => (line ? `> ${line}` : ">"))
      .join("\n");
    parts.push(`${quoted}\n`);
  }

  const metaLines: string[] = [];
  // Tolerate metadata without an authors array.
  const authors = (meta.authors ?? []).map((a) => a.name).filter(Boolean);
  if (authors.length) metaLines.push(`- **Authors**: ${authors.join(", ")}`);
  if (meta.date) metaLines.push(`- **Published**: ${meta.date}`);
  if (meta.doi) {
    // Accept either a bare DOI or an already-resolved URL.
    const doiUrl = meta.doi.startsWith("http")
      ? meta.doi
      : `https://doi.org/${meta.doi}`;
    metaLines.push(`- **DOI**: ${doiUrl}`);
  }
  if (metaLines.length) parts.push(metaLines.join("\n") + "\n");

  parts.push("---\n");
  return parts.join("\n");
}
|
| 411 |
+
|
| 412 |
+
/**
 * Build the `## References` section from the pre-formatted bibliography
 * HTML held in `ctx.biblioHtml`, with the markup stripped. Returns an empty
 * string when there is no bibliography or nothing is left after stripping.
 */
function appendBibliographySection(ctx: RenderCtx): string {
  const references = ctx.biblioHtml ? stripHtmlToText(ctx.biblioHtml).trim() : "";
  return references ? `## References\n\n${references}` : "";
}
|
| 420 |
+
|
| 421 |
+
/**
 * Build the `## Footnotes` section from the footnote texts collected in
 * `ctx.footnotes`. Each entry becomes a `[^N]: text` definition (N is
 * 1-based, in collection order) with markup stripped and newlines collapsed
 * to spaces. Returns an empty string when no footnotes were collected.
 */
function appendFootnotesSection(ctx: RenderCtx): string {
  if (ctx.footnotes.length === 0) return "";

  const definitions: string[] = [];
  for (const [position, raw] of ctx.footnotes.entries()) {
    const flattened = stripHtmlToText(raw).trim().replace(/\n+/g, " ");
    definitions.push(`[^${position + 1}]: ${flattened}`);
  }

  return `## Footnotes\n\n${definitions.join("\n\n")}`;
}
|
| 429 |
+
|
| 430 |
+
// ---------------------------------------------------------------------------
|
| 431 |
+
// Helpers
|
| 432 |
+
// ---------------------------------------------------------------------------
|
| 433 |
+
|
| 434 |
+
/**
 * Strip HTML tags while keeping anchor hrefs as Markdown links and emitting
 * a line break after `<br>` and after each block-level closing tag
 * (p, div, li, tr, h1-h6). Tuned for citation-js HTML output and for
 * `rawHtml` user content - not a general-purpose sanitiser.
 *
 * After the tags are removed, the common entities (nbsp, amp, lt, gt, quot,
 * apos, #39) are decoded in a single pass, so an escaped entity such as an
 * escaped "lt" is decoded exactly once. Every line is trimmed and runs of
 * blank lines are collapsed to one.
 *
 * @param html HTML fragment (may be empty).
 * @returns Plain text / light Markdown.
 */
export function stripHtmlToText(html: string): string {
  if (!html) return "";

  // Entity name (without the leading "&" and trailing ";") -> character.
  const entities: Record<string, string> = {
    nbsp: " ",
    amp: "&",
    lt: "<",
    gt: ">",
    quot: '"',
    apos: "'",
    "#39": "'",
  };

  const withoutTags = html
    // Anchors with a double-quoted href become Markdown links.
    .replace(/<a\s+[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, "[$2]($1)")
    .replace(/<br\s*\/?>/gi, "\n")
    // Keep the closing tag (removed below) and add a newline after it.
    .replace(/<\/(p|div|li|tr|h[1-6])>/gi, "$&\n")
    .replace(/<[^>]+>/g, "");

  // Single pass: text produced by one replacement is never rescanned, which
  // rules out double-decoding.
  const decoded = withoutTags.replace(
    /&(nbsp|amp|lt|gt|quot|apos|#39);/g,
    (match: string, name: string) => entities[name] ?? match,
  );

  return decoded
    .split("\n")
    .map((line) => line.trim())
    // Drop a blank line that directly follows another blank line.
    .filter((line, i, all) => !(line === "" && all[i - 1] === ""))
    .join("\n");
}
|
| 462 |
+
|
| 463 |
+
// ---------------------------------------------------------------------------
|
| 464 |
+
// Public API
|
| 465 |
+
// ---------------------------------------------------------------------------
|
| 466 |
+
|
| 467 |
+
/**
 * Render a TipTap-JSON document into an `llms.txt`-compatible Markdown string.
 *
 * The output is assembled from up to four sections, separated by blank lines:
 * header, body, "References" and "Footnotes". Empty sections are dropped,
 * runs of three or more newlines collapse to a single blank line, and the
 * result always ends with exactly one newline.
 *
 * @param json TipTap document; its top-level `content` is rendered as the body.
 * @param meta Publish metadata used to build the header.
 * @param citationData Optional citation data made available to the renderers.
 * @param serverBiblioHtml The same HTML produced by `formatBibliographyServer()`
 *        for the HTML pipeline; it is reused here (stripped to text) so the
 *        agent gets the same reference list a human would see.
 * @returns The complete Markdown document.
 */
export function renderArticleMarkdown(
  json: Record<string, unknown>,
  meta: PublishMeta,
  citationData?: CitationData,
  serverBiblioHtml?: string,
): string {
  const ctx: RenderCtx = { citationData, biblioHtml: serverBiblioHtml, footnotes: [] };

  // The body is rendered before the trailing sections are built.
  // NOTE(review): presumably rendering is what fills `ctx.footnotes` - confirm
  // before reordering these calls.
  const root = json as JSONNode;
  const bodyMd = renderBlocks(root.content, ctx);
  const headerMd = buildHeader(meta).trimEnd();
  const referencesMd = appendBibliographySection(ctx);
  const footnotesMd = appendFootnotesSection(ctx);

  const joined = [headerMd, bodyMd, referencesMd, footnotesMd]
    .filter((section) => Boolean(section))
    .join("\n\n");

  return `${joined.replace(/\n{3,}/g, "\n\n").trim()}\n`;
}
|
|
@@ -0,0 +1,371 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { describe, it, expect } from "vitest";
|
| 2 |
+
import {
|
| 3 |
+
renderArticleMarkdown,
|
| 4 |
+
stripHtmlToText,
|
| 5 |
+
} from "../src/publisher/markdown-renderer.js";
|
| 6 |
+
import type { PublishMeta, CitationData } from "../src/publisher/html-renderer.js";
|
| 7 |
+
|
| 8 |
+
/** Publish-metadata fixture shared by the `renderArticleMarkdown` suites. */
const META: PublishMeta = {
  title: "Test Article",
  subtitle: "A subtitle",
  description: "A short description for SEO",
  authors: [
    {
      name: "Alice",
      affiliationIndices: [1],
      affiliationNames: ["MIT"],
    },
    {
      name: "Bob",
      affiliationIndices: [2],
      affiliationNames: ["HF"],
    },
  ],
  affiliations: [
    { name: "MIT" },
    { name: "HF" },
  ],
  date: "2026-04-30",
  // Bare DOI (no URL scheme).
  doi: "10.1234/abcd.efgh",
};
|
| 20 |
+
|
| 21 |
+
/**
 * Wrap a list of block nodes in a TipTap `doc` root node.
 *
 * The nodes are typed as `unknown[]` rather than `any[]`: the helper never
 * inspects them, so there is no reason to switch type checking off.
 */
const doc = (content: unknown[]) => ({ type: "doc", content });
|
| 22 |
+
|
| 23 |
+
// Header rendering: title, description/subtitle blockquote and metadata list.
describe("renderArticleMarkdown - header", () => {
  // A document holding one empty paragraph, so only the header is of interest.
  const emptyDoc = () => doc([{ type: "paragraph" }]);

  it("emits an llms.txt-style header with title, description, authors, date and DOI", () => {
    const md = renderArticleMarkdown(emptyDoc(), META);
    const expectedFragments = [
      "# Test Article",
      "> A short description for SEO",
      "- **Authors**: Alice, Bob",
      "- **Published**: 2026-04-30",
      "- **DOI**: https://doi.org/10.1234/abcd.efgh",
      "---",
    ];
    for (const fragment of expectedFragments) {
      expect(md).toContain(fragment);
    }
  });

  it("falls back to subtitle when description is empty", () => {
    const withoutDescription = { ...META, description: "" };
    const md = renderArticleMarkdown(emptyDoc(), withoutDescription);
    expect(md).toContain("> A subtitle");
  });

  it("collapses multi-line titles", () => {
    const multiLineTitle = { ...META, title: "Line one\\nLine two" };
    const md = renderArticleMarkdown(emptyDoc(), multiLineTitle);
    expect(md).toContain("# Line one Line two");
    expect(md).not.toContain("\\n");
  });
});
|
| 51 |
+
|
| 52 |
+
// Standard TipTap block nodes: headings, marks, lists, code, math and tables.
describe("renderArticleMarkdown - block nodes", () => {
  // Small TipTap-JSON builders, local to this suite.
  const text = (value: string, marks?: unknown[]) =>
    marks === undefined
      ? { type: "text", text: value }
      : { type: "text", text: value, marks };
  const paragraph = (...content: unknown[]) => ({ type: "paragraph", content });
  const heading = (level: number, label: string) => ({
    type: "heading",
    attrs: { level },
    content: [text(label)],
  });
  const listItem = (label: string) => ({
    type: "listItem",
    content: [paragraph(text(label))],
  });
  const cell = (type: "tableHeader" | "tableCell", label: string) => ({
    type,
    content: [paragraph(text(label))],
  });
  const row = (...cells: unknown[]) => ({ type: "tableRow", content: cells });
  // Render the given top-level nodes with the shared metadata fixture.
  const render = (...nodes: unknown[]) => renderArticleMarkdown(doc(nodes), META);

  it("renders headings with the correct markdown level", () => {
    const md = render(heading(2, "Hello"), heading(3, "Sub"));
    expect(md).toContain("## Hello");
    expect(md).toContain("### Sub");
  });

  it("applies bold/italic/code/link marks", () => {
    const space = () => text(" ");
    const md = render(
      paragraph(
        text("bold", [{ type: "bold" }]),
        space(),
        text("italic", [{ type: "italic" }]),
        space(),
        text("code", [{ type: "code" }]),
        space(),
        text("link", [{ type: "link", attrs: { href: "https://example.com" } }]),
      ),
    );
    expect(md).toContain("**bold**");
    expect(md).toContain("*italic*");
    expect(md).toContain("`code`");
    expect(md).toContain("[link](https://example.com)");
  });

  it("renders bullet and ordered lists", () => {
    const bullets = { type: "bulletList", content: [listItem("one"), listItem("two")] };
    const numbered = { type: "orderedList", content: [listItem("first"), listItem("second")] };
    const md = render(bullets, numbered);
    expect(md).toContain("- one");
    expect(md).toContain("- two");
    expect(md).toContain("1. first");
    expect(md).toContain("2. second");
  });

  it("renders code blocks with language fence", () => {
    const codeBlock = {
      type: "codeBlock",
      attrs: { language: "ts" },
      content: [text("const x = 1;")],
    };
    const md = render(codeBlock);
    expect(md).toContain("```ts");
    expect(md).toContain("const x = 1;");
    expect(md).toContain("```");
  });

  it("renders inline and block math", () => {
    const inline = { type: "inlineMath", attrs: { latex: "E = mc^2" } };
    const block = { type: "blockMath", attrs: { latex: "\\int_0^1 x dx" } };
    const md = render(paragraph(text("Energy: "), inline), block);
    expect(md).toContain("$E = mc^2$");
    expect(md).toContain("$$\n\\int_0^1 x dx\n$$");
  });

  it("renders tables with a header row separator", () => {
    const table = {
      type: "table",
      content: [
        row(cell("tableHeader", "Col A"), cell("tableHeader", "Col B")),
        row(cell("tableCell", "1"), cell("tableCell", "2")),
      ],
    };
    const md = render(table);
    expect(md).toContain("| Col A | Col B |");
    expect(md).toContain("| --- | --- |");
    expect(md).toContain("| 1 | 2 |");
  });
});
|
| 183 |
+
|
| 184 |
+
// Editor-specific nodes and how each one degrades to plain Markdown.
describe("renderArticleMarkdown - custom components", () => {
  // A paragraph node holding a single plain-text run.
  const paragraph = (value: string) => ({
    type: "paragraph",
    content: [{ type: "text", text: value }],
  });
  // Render one top-level node with the shared metadata fixture.
  const renderNode = (node: unknown) => renderArticleMarkdown(doc([node]), META);

  it("collapses HtmlEmbed to a single inline placeholder with title and src", () => {
    const embed = {
      type: "htmlEmbed",
      attrs: { src: "d3-chart.html", title: "Citations over time", desc: "" },
    };
    const md = renderNode(embed);
    expect(md).toContain("*[Interactive visualization: Citations over time]*");
    expect(md).not.toContain("<iframe");
  });

  it("renders Note as a blockquote", () => {
    const note = { type: "note", content: [paragraph("Heads up.")] };
    expect(renderNode(note)).toContain("> Heads up.");
  });

  it("renders Accordion with bold title and inner content", () => {
    const accordion = {
      type: "accordion",
      attrs: { title: "More details" },
      content: [paragraph("Inside.")],
    };
    const md = renderNode(accordion);
    expect(md).toContain("**More details**");
    expect(md).toContain("Inside.");
  });

  it("renders QuoteBlock with attribution", () => {
    const quote = {
      type: "quoteBlock",
      attrs: { author: "Ada Lovelace", source: "Notes" },
      content: [paragraph("The future is open.")],
    };
    const md = renderNode(quote);
    expect(md).toContain("> The future is open.");
    expect(md).toContain("> -- Ada Lovelace, Notes");
  });

  it("renders HfUser as a markdown link to huggingface.co/<u>", () => {
    const user = {
      type: "hfUser",
      attrs: { username: "tfrere", name: "Thibaud Frere" },
    };
    expect(renderNode(user)).toContain("[Thibaud Frere](https://huggingface.co/tfrere)");
  });

  it("renders Mermaid as a fenced ```mermaid block", () => {
    const diagram = {
      type: "mermaid",
      attrs: { code: "graph TD\n A --> B" },
    };
    const md = renderNode(diagram);
    expect(md).toContain("```mermaid");
    expect(md).toContain("graph TD");
    expect(md).toContain("A --> B");
  });

  it("unwraps Wide / FullWidth / Stack containers", () => {
    const wide = { type: "wide", content: [paragraph("Wide content.")] };
    const md = renderNode(wide);
    expect(md).toContain("Wide content.");
    expect(md).not.toContain("[wide]");
  });
});
|
| 292 |
+
|
| 293 |
+
// Inline citations, footnote collection and the trailing References section.
describe("renderArticleMarkdown - citations and footnotes", () => {
  // A plain text run.
  const text = (value: string) => ({ type: "text", text: value });

  it("renders citations as keys for APA and as numeric tags for IEEE", () => {
    const json = doc([
      {
        type: "paragraph",
        content: [
          text("See "),
          { type: "citation", attrs: { key: "smith2024", label: "Smith (2024)" } },
          text("."),
        ],
      },
    ]);
    // Same single-entry bibliography, differing only in citation style.
    const citationsWithStyle = (style: CitationData["style"]): CitationData => ({
      entries: [{ id: "smith2024" }],
      orderedKeys: ["smith2024"],
      style,
    });
    const apa = citationsWithStyle("apa");
    const ieee = citationsWithStyle("ieee");

    expect(renderArticleMarkdown(json, META, apa)).toContain("Smith (2024)");
    expect(renderArticleMarkdown(json, META, ieee)).toContain("[1]");
  });

  it("collects footnotes and emits a footnotes section", () => {
    const footnote = (content: string) => ({ type: "footnote", attrs: { content } });
    const body = {
      type: "paragraph",
      content: [
        text("Body"),
        footnote("First note"),
        text(" more "),
        footnote("Second note"),
      ],
    };
    const md = renderArticleMarkdown(doc([body]), META);
    const expectedFragments = [
      "[^1]",
      "[^2]",
      "## Footnotes",
      "[^1]: First note",
      "[^2]: Second note",
    ];
    for (const fragment of expectedFragments) {
      expect(md).toContain(fragment);
    }
  });

  it("appends a References section from the formatted bibliography", () => {
    const biblio = '<div class="csl-entry">Smith, J. (2024). <i>Test Paper</i>. Journal.</div>';
    const body = { type: "paragraph", content: [text("Body")] };
    const md = renderArticleMarkdown(doc([body]), META, undefined, biblio);
    expect(md).toContain("## References");
    expect(md).toContain("Smith, J. (2024).");
    expect(md).toContain("Test Paper");
    expect(md).not.toContain("<div");
  });
});
|
| 355 |
+
|
| 356 |
+
// Direct coverage of the HTML-to-text helper used for bibliography and footnotes.
describe("stripHtmlToText", () => {
  it("converts <a href> to a markdown link", () => {
    expect(stripHtmlToText('<a href="https://example.com">click</a>')).toBe(
      "[click](https://example.com)",
    );
  });

  it("decodes common HTML entities", () => {
    // The input must contain real entities; with an already-decoded string
    // the expectation holds even if no decoding happens at all.
    expect(stripHtmlToText("Tom &amp; Jerry &lt;3")).toBe("Tom & Jerry <3");
  });

  it("collapses block tags into newlines and removes the rest", () => {
    const html = "<p>One.</p><p>Two.</p>";
    expect(stripHtmlToText(html).trim()).toBe("One.\nTwo.");
  });
});
|