import { uploadFile, downloadFile, createRepo, commit, type CommitFile, type RepoDesignation } from "@huggingface/hub";
import { sanitizeName } from "./utils.js";

const SPACE_ID = process.env.SPACE_ID || "";
const SPACE_HOST = process.env.SPACE_HOST || "";

// Derive dataset ID: explicit env > "<SPACE_ID>-data"
const HF_DATASET_ID = process.env.HF_DATASET_ID || (SPACE_ID === "" ? "" : `${SPACE_ID}-data`);

const repo: RepoDesignation = { type: "dataset", name: HF_DATASET_ID };

// Optional token taken from the environment; user OAuth tokens received at
// runtime take precedence over it.
const ENV_TOKEN = process.env.HF_TOKEN || "";

// Most recent non-empty token handed to `setUserToken`, held in memory so
// background operations can authenticate. Starts out as the env token.
let _cachedToken = ENV_TOKEN;

/** Dataset ID this module reads from and writes to ("" when none is configured). */
export function getDatasetId(): string {
  return HF_DATASET_ID;
}

/** Remember `token` for later background operations; empty values are ignored. */
export function setUserToken(token: string): void {
  if (!token) return;
  _cachedToken = token;
}

/**
 * Token cascade: explicit argument, then the cached user token, then the
 * env token. Returns the first non-empty candidate.
 */
function getToken(explicit?: string): string {
  for (const candidate of [explicit, _cachedToken]) {
    if (candidate) return candidate;
  }
  return ENV_TOKEN;
}

/**
 * Public wrapper around the same token cascade used internally. The
 * dataset-proxy route needs to forward the right token to HF when
 * fetching assets on behalf of a viewer.
 */
export function resolveToken(explicit?: string): string {
  return getToken(explicit);
}

/** True when both a dataset ID and some token (cached or env) are available. */
export function isHfStorageEnabled(): boolean {
  if (!HF_DATASET_ID) return false;
  return Boolean(_cachedToken || ENV_TOKEN);
}

// ============================================================
// Storage status tracker
// ============================================================
//
// Every silent failure point in the persistence pipeline used to
// disappear into `console.error` and the editor would happily
// keep showing "Saved". This tracker is the single source of
// truth for "what's the actual state of my data right now":
// every write/error path updates it, and the /api/storage/status
// endpoint surfaces it to the frontend SyncIndicator so the user
// sees the truth, not a comforting lie.
//
// Stages we care about:
//   - `dataset-create` : ensureDatasetExists couldn't create the
//     backing repo (most common: missing manage-repos scope)
//   - `local-save` : writeFileSync of the doc's local `.yjs` file
//     failed (disk full, readonly FS, permission denied)
//   - `cloud-push` : uploadFile into the dataset failed (HF
//     down, token expired, network blip)
//
// We deliberately keep this in-memory: it's per-container,
// non-critical, and the frontend polls every few seconds anyway.

export type StorageErrorStage = "dataset-create" | "local-save" | "cloud-push";

export interface StorageError {
  stage: StorageErrorStage;
  message: string;
  statusCode?: number;
  at: number;
  /** Doc name when the failure is per-doc (local-save, cloud-push). */
  docName?: string;
}

export interface StorageStatus {
  enabled: boolean;
  datasetId: string;
  datasetReady: boolean;
  /** ms epoch of the last successful local writeFileSync. */
  lastLocalSaveAt: number | null;
  /** ms epoch of the last successful HF dataset push. */
  lastCloudPushAt: number | null;
  /** True while a debounced push timer is armed. */
  pendingPush: boolean;
  /** Last error in the pipeline, or null if everything's fine. */
  lastError: StorageError | null;
}

const status: StorageStatus = {
  enabled: false,
  datasetId: HF_DATASET_ID,
  datasetReady: false,
  lastLocalSaveAt: null,
  lastCloudPushAt: null,
  pendingPush: false,
  lastError: null,
};

/** Best-effort one-line message for anything that was thrown. */
function errorMessage(err: unknown): string {
  return (err as Error | null | undefined)?.message || String(err);
}

/** Numeric HTTP status carried by a thrown value (`statusCode`, else `status`), if any. */
function errorStatusCode(err: unknown): number | undefined {
  if (typeof err !== "object" || err === null) return undefined;
  const carrier = err as { statusCode?: unknown; status?: unknown };
  const code = carrier.statusCode ?? carrier.status;
  return typeof code === "number" ? code : undefined;
}

/** Snapshot of the tracker for the status endpoint. */
export function getStorageStatus(): StorageStatus {
  // Refresh `enabled` on every read - it depends on whether we've
  // received a user token yet, which can flip mid-session.
  return { ...status, enabled: isHfStorageEnabled() };
}

/** Record a successful local save of `docName`. */
export function recordLocalSave(docName: string): void {
  status.lastLocalSaveAt = Date.now();
  // Local save success clears a prior local-save error for the
  // same doc, but leaves cloud-push / dataset-create errors alone
  // since those are independent.
  if (status.lastError?.stage === "local-save" && status.lastError.docName === docName) {
    status.lastError = null;
  }
}

/** Record a failed local save of `docName` as the tracker's last error. */
export function recordLocalSaveError(docName: string, err: unknown): void {
  status.lastError = {
    stage: "local-save",
    message: errorMessage(err),
    at: Date.now(),
    docName,
  };
}

/** Record a successful dataset push; clears a prior cloud-push error for the same doc. */
function recordCloudPush(docName: string): void {
  status.lastCloudPushAt = Date.now();
  if (status.lastError?.stage === "cloud-push" && status.lastError.docName === docName) {
    status.lastError = null;
  }
}

/** Record a failed dataset push of `docName` as the tracker's last error. */
function recordCloudPushError(docName: string, err: unknown, statusCode?: number): void {
  status.lastError = {
    stage: "cloud-push",
    message: errorMessage(err),
    statusCode,
    at: Date.now(),
    docName,
  };
}

/** Mark the dataset as ready; clears a prior dataset-create error. */
function recordDatasetReady(): void {
  status.datasetReady = true;
  if (status.lastError?.stage === "dataset-create") {
    status.lastError = null;
  }
}

/** Record a dataset-creation failure as the tracker's last error. */
function recordDatasetError(err: unknown, statusCode?: number): void {
  status.lastError = {
    stage: "dataset-create",
    message: errorMessage(err),
    statusCode,
    at: Date.now(),
  };
}

/**
 * Public-facing base URL the editor is reachable at, used to build
 * absolute proxy URLs for assets that need to render outside the
 * Space's own page (og:image consumed by social card unfurlers,
 * "Download PDF" link from external sites, ...). In dev we don't
 * have SPACE_HOST so we fall back to localhost.
 */
function getPublicBaseUrl(): string {
  if (SPACE_HOST) return `https://${SPACE_HOST}`;
  const port = process.env.PORT || "8080";
  return `http://localhost:${port}`;
}

// Set once creation succeeded (or the repo was found to exist already) so
// later calls to ensureDatasetExists return immediately.
let _datasetReady = false;

/**
 * Ensure the backing dataset exists on HF.
 *
 * The dataset is created **private by default**: most editor
 * deployments live in organisations where the working drafts must
 * not leak, and even for solo public projects the published assets
 * (images, PDFs, thumbnails) flow back through the editor's own
 * `/d/...` reverse proxy anyway, so anonymous viewers never need
 * direct dataset access. Keeping it private also avoids running
 * into orgs that block `private: false` creations by policy.
 *
 * Safe to call multiple times; treats 409 as success and caches a
 * "ready" flag so we don't retry on every push. A non-409 failure
 * is logged with the HTTP status when available (the most common
 * cause is a missing `manage-repos` scope on the user's OAuth
 * grant) and the flag is *not* set, so the next push will retry.
 *
 * Never throws: failures are logged and recorded in the status tracker.
 * Does nothing when no dataset ID or no token is available.
 *
 * @param token Optional explicit token; falls back to the module's token cascade.
 */
export async function ensureDatasetExists(token?: string): Promise<void> {
  if (_datasetReady || !HF_DATASET_ID) return;
  const accessToken = getToken(token);
  if (!accessToken) return;

  try {
    await createRepo({ repo, private: true, accessToken });
    console.log(`[hf-storage] created private dataset ${HF_DATASET_ID}`);
  } catch (err: unknown) {
    const statusCode = errorStatusCode(err);
    const message = errorMessage(err);
    if (statusCode === 409 || message.includes("already")) {
      console.log(`[hf-storage] dataset ${HF_DATASET_ID} already exists`);
    } else {
      // Surface enough detail to triage from Space logs: status code
      // (403 = scope/permission, 422 = invalid name, 5xx = HF down)
      // and a one-line message. We deliberately omit the full error
      // object to avoid noisy stack dumps on every retry.
      console.error(
        `[hf-storage] failed to create dataset ${HF_DATASET_ID}` +
          ` (status=${statusCode ?? "unknown"}): ${message}`,
      );
      recordDatasetError(err, statusCode);
      return;
    }
  }

  _datasetReady = true;
  recordDatasetReady();
}

// ---------- Images ----------

/**
 * Upload an image to `images/<filename>` in the dataset.
 *
 * @param buffer   Raw image bytes.
 * @param filename Name to store the image under; used verbatim in the path.
 *                 NOTE(review): not sanitized here - confirm callers only pass
 *                 server-generated names.
 * @param token    Optional explicit token; falls back to the module's token cascade.
 * @returns The relative proxy URL (`/d/images/<filename>`) of the uploaded image.
 * @throws Whatever `uploadFile` throws; upload errors are not caught here.
 */
export async function uploadImageToHf(
  buffer: Buffer,
  filename: string,
  token?: string,
): Promise<string> {
  await ensureDatasetExists(token);
  const path = `images/${filename}`;
  await uploadFile({
    repo,
    file: { path, content: new Blob([new Uint8Array(buffer)]) },
    accessToken: getToken(token),
    commitTitle: `upload image ${filename}`,
  });
  // Relative URL through the editor's reverse proxy. The dataset is
  // private and the raw `huggingface.co/datasets/...` URL would 401
  // for anonymous viewers - the proxy attaches a server-side token
  // and forwards. Relative is fine because images are only ever
  // rendered from inside the Space's own pages.
  return `/d/${path}`;
}

// ---------- Documents ----------

/** Latest unsaved state of a document plus the debounce timer that will push it. */
interface DirtyDoc {
  state: Buffer;
  timer: ReturnType<typeof setTimeout>;
}

const dirtyDocs = new Map<string, DirtyDoc>();
const DEBOUNCE_MS = 10_000;

/**
 * Queue `state` to be pushed for `docName` after DEBOUNCE_MS. A call for a
 * doc that is already queued replaces its state and restarts its timer.
 */
export function schedulePush(docName: string, state: Buffer): void {
  const existing = dirtyDocs.get(docName);
  if (existing) clearTimeout(existing.timer);
  const timer = setTimeout(() => {
    // pushDocument handles upload failures itself; `void` marks the
    // fire-and-forget as intentional.
    void pushDocument(docName);
  }, DEBOUNCE_MS);
  dirtyDocs.set(docName, { state, timer });
  status.pendingPush = true;
}

/**
 * Upload the queued state of `docName` to `articles/<safeName>.yjs`.
 *
 * Upload failures are logged and recorded in the status tracker, not
 * thrown. The queue entry is removed before the upload starts, so a failed
 * push is not retried until the next `schedulePush` for that doc.
 */
async function pushDocument(docName: string): Promise<void> {
  const entry = dirtyDocs.get(docName);
  if (!entry) return;
  // When invoked directly (flushAll) the debounce timer is still armed;
  // cancel it so it neither fires a redundant callback nor keeps the
  // process alive. Clearing a timer that has already fired is a no-op.
  clearTimeout(entry.timer);
  dirtyDocs.delete(docName);
  // Update `pendingPush` based on whether OTHER docs still have a
  // timer armed. A single editor only ever touches one doc, so in
  // practice this is always false after `delete`, but the multi-doc
  // case must not lie either.
  status.pendingPush = dirtyDocs.size > 0;

  await ensureDatasetExists();
  const safeName = sanitizeName(docName);
  const path = `articles/${safeName}.yjs`;
  try {
    await uploadFile({
      repo,
      file: { path, content: new Blob([new Uint8Array(entry.state)]) },
      accessToken: getToken(),
      commitTitle: `save ${safeName}`,
    });
    console.log(`[hf-storage] pushed ${path}`);
    recordCloudPush(docName);
  } catch (err: unknown) {
    const statusCode = errorStatusCode(err);
    console.error(
      `[hf-storage] failed to push ${path}` +
        ` (status=${statusCode ?? "unknown"}): ${errorMessage(err)}`,
    );
    recordCloudPushError(docName, err, statusCode);
  }
}

/**
 * Download `articles/<safeName>.yjs` from the dataset.
 *
 * @returns The file's bytes, or null when the file is missing or the
 *          download failed. Failures are logged so they can be told apart
 *          from "not found" in the logs; the return value is null either way.
 */
export async function pullDocument(docName: string): Promise<Uint8Array | null> {
  const safeName = sanitizeName(docName);
  const path = `articles/${safeName}.yjs`;
  try {
    const res = await downloadFile({
      repo,
      path,
      accessToken: getToken(),
    });
    if (!res) return null;
    return new Uint8Array(await res.arrayBuffer());
  } catch (err: unknown) {
    console.warn(
      `[hf-storage] failed to pull ${path}` +
        ` (status=${errorStatusCode(err) ?? "unknown"}): ${errorMessage(err)}`,
    );
    return null;
  }
}

/** Push every queued document now, without waiting for the debounce timers. */
export async function flushAll(): Promise<void> {
  const names = [...dirtyDocs.keys()];
  await Promise.allSettled(names.map((n) => pushDocument(n)));
}

// Extracts the HTTP status from error messages of the form "... status 404 ...".
const STATUS_IN_MESSAGE = /status (\d+)/;

/**
 * Pull published assets (HTML, PDF, thumbnail, meta) from the HF dataset
 * and write them to the local data directory so the server can serve them.
 * Called on startup to restore published state after a container rebuild.
 *
 * @param docName  Document whose published assets should be restored.
 * @param localDir Local data directory; files land in `<localDir>/published/<safeName>/`.
 * @returns True when at least one asset was written locally.
 */
export async function pullPublishedAssets(
  docName: string,
  localDir: string,
): Promise<boolean> {
  if (!HF_DATASET_ID) {
    console.log("[hf-storage] pullPublished skipped: no dataset ID");
    return false;
  }
  const token = getToken();
  const accessToken = token || undefined;
  console.log(`[hf-storage] pullPublished for "${docName}" from dataset ${HF_DATASET_ID} (token: ${token ? "yes" : "none - public access"})`);

  const safeName = sanitizeName(docName);
  const base = `published/${safeName}`;
  const files = ["index.html", "article.pdf", "thumb.jpg", "meta.json", "llms.txt"];

  const { mkdirSync, writeFileSync } = await import("fs");
  const { join } = await import("path");
  const outDir = join(localDir, "published", safeName);

  // Write one downloaded asset into outDir and return its size in bytes.
  const saveAsset = (file: string, data: ArrayBuffer): number => {
    mkdirSync(outDir, { recursive: true });
    const buf = Buffer.from(data);
    writeFileSync(join(outDir, file), buf);
    return buf.length;
  };

  let foundAny = false;
  for (const file of files) {
    try {
      const res = await downloadFile({
        repo,
        path: `${base}/${file}`,
        accessToken,
      });
      if (!res) {
        console.log(`[hf-storage] ${base}/${file}: not found via Hub lib`);
        continue;
      }
      const bytes = saveAsset(file, await res.arrayBuffer());
      console.log(`[hf-storage] pulled ${base}/${file} (${bytes} bytes)`);
      foundAny = true;
    } catch (err: unknown) {
      const msg = errorMessage(err);
      // Prefer the status attached to the error object; fall back to
      // parsing it out of the message text.
      const statusText = msg.match(STATUS_IN_MESSAGE)?.[1];
      const httpStatus = errorStatusCode(err) ?? (statusText ? Number(statusText) : undefined);
      // A 404 from the Hub lib is the definitive "this file isn't on
      // the dataset" answer (we used a valid token and got through to
      // the repo metadata). Retrying with a tokenless `fetch()` would
      // either re-confirm 404 or, on a private dataset, return a
      // misleading 401 - either way it spams logs. Skip the fallback
      // entirely in that case.
      if (httpStatus === 404) {
        console.log(`[hf-storage] ${base}/${file}: not found (404)`);
        continue;
      }
      console.warn(
        `[hf-storage] Hub lib failed for ${base}/${file}: ${msg}`,
      );
      // For other errors (network glitch, lib bug, ...) fall back to
      // a direct HTTP fetch. Critically, forward the same access
      // token we used above - the dataset may be private (org-owned
      // and grant-protected) and an anonymous fetch would 401 even
      // though the user does have access.
      try {
        const url = `https://huggingface.co/datasets/${HF_DATASET_ID}/resolve/main/${base}/${file}`;
        console.log(`[hf-storage] trying direct fetch: ${url}`);
        const httpRes = await fetch(
          url,
          accessToken
            ? { headers: { Authorization: `Bearer ${accessToken}` } }
            : undefined,
        );
        if (!httpRes.ok) {
          console.log(`[hf-storage] direct fetch ${file}: ${httpRes.status}`);
          continue;
        }
        const bytes = saveAsset(file, await httpRes.arrayBuffer());
        console.log(
          `[hf-storage] pulled ${base}/${file} via direct fetch (${bytes} bytes)`,
        );
        foundAny = true;
      } catch (fetchErr: unknown) {
        console.warn(
          `[hf-storage] direct fetch also failed for ${file}:`,
          errorMessage(fetchErr),
        );
      }
    }
  }

  console.log(`[hf-storage] pullPublished result: ${foundAny ? "restored" : "nothing found"}`);
  return foundAny;
}

// ---------- Published Assets ----------

/**
 * Build the proxied URL for a published asset.
 *
 * Returns an **absolute** URL because consumers are sometimes
 * external (Twitter / Slack / Discord card unfurlers reading
 * og:image, or external links to the PDF download), and a relative
 * `/d/...` wouldn't resolve there. Inside the Space's own pages a
 * relative URL would have worked equally well, but emitting the
 * absolute form keeps the contract uniform.
 */
export function getPublishedAssetUrl(docName: string, filename: string): string {
  const safeName = sanitizeName(docName);
  return `${getPublicBaseUrl()}/d/published/${safeName}/${filename}`;
}

/** Everything that gets written under `published/<safeName>/` for one publish. */
interface PublishedPayload {
  html: string;
  pdf: Buffer | null;
  thumbnail: Buffer | null;
  meta: { title: string; description: string; authors: string[]; date: string; [key: string]: unknown };
  llmsTxt?: string;
}

/**
 * Upload a document's published assets to `published/<safeName>/` in a
 * single commit. `index.html` and `meta.json` are always written;
 * `article.pdf`, `thumb.jpg` and `llms.txt` only when present in `payload`.
 *
 * The returned URLs go through the editor's `/d/...` proxy (see
 * `getPublishedAssetUrl`) rather than pointing at `huggingface.co`
 * directly: the dataset is created private, so raw dataset URLs would
 * 401 for anonymous viewers.
 *
 * @param token Optional explicit token; falls back to the module's token cascade.
 * @returns Proxied URLs of the uploaded assets; `pdfUrl` / `thumbUrl` are
 *          null when the payload carried no PDF / thumbnail.
 * @throws Whatever `commit` throws; commit errors are not caught here.
 */
export async function uploadPublishedAssets(
  docName: string,
  payload: PublishedPayload,
  token?: string,
): Promise<{ htmlUrl: string; pdfUrl: string | null; thumbUrl: string | null }> {
  await ensureDatasetExists(token);
  const accessToken = getToken(token);
  const safeName = sanitizeName(docName);
  const base = `published/${safeName}`;

  const operations: CommitFile[] = [
    {
      operation: "addOrUpdate",
      path: `${base}/index.html`,
      content: new Blob([payload.html]),
    },
    {
      operation: "addOrUpdate",
      path: `${base}/meta.json`,
      content: new Blob([JSON.stringify(payload.meta, null, 2)]),
    },
  ];

  let pdfUrl: string | null = null;
  let thumbUrl: string | null = null;

  if (payload.pdf) {
    operations.push({
      operation: "addOrUpdate",
      path: `${base}/article.pdf`,
      content: new Blob([new Uint8Array(payload.pdf)]),
    });
    pdfUrl = getPublishedAssetUrl(docName, "article.pdf");
  }

  if (payload.thumbnail) {
    operations.push({
      operation: "addOrUpdate",
      path: `${base}/thumb.jpg`,
      content: new Blob([new Uint8Array(payload.thumbnail)]),
    });
    thumbUrl = getPublishedAssetUrl(docName, "thumb.jpg");
  }

  if (payload.llmsTxt) {
    operations.push({
      operation: "addOrUpdate",
      path: `${base}/llms.txt`,
      content: new Blob([payload.llmsTxt], { type: "text/markdown; charset=utf-8" }),
    });
  }

  await commit({
    repo,
    operations,
    title: `publish ${safeName}`,
    accessToken,
  });

  console.log(`[hf-storage] published ${safeName} (${operations.length} files, single commit)`);
  return { htmlUrl: getPublishedAssetUrl(docName, "index.html"), pdfUrl, thumbUrl };
}