carbon-tokenization / backend /src /hf-storage.ts
tfrere's picture
tfrere HF Staff
feat(storage): first-class data - no silent failures in the persistence pipeline
7a42df5
Raw
History Blame Contribute Delete
16.8 kB
import { uploadFile, downloadFile, createRepo, commit, type CommitFile, type RepoDesignation } from "@huggingface/hub";
import { sanitizeName } from "./utils.js";
// Space identity read from the environment; each value is "" when the
// corresponding variable is not set.
const SPACE_ID = process.env.SPACE_ID ?? "";
const SPACE_HOST = process.env.SPACE_HOST ?? "";
// Dataset ID resolution order: a non-empty HF_DATASET_ID wins, otherwise
// "<space-id>-data" is derived from the Space ID, otherwise it stays "".
const HF_DATASET_ID = process.env.HF_DATASET_ID
  ? process.env.HF_DATASET_ID
  : SPACE_ID
    ? `${SPACE_ID}-data`
    : "";
// Repo handle passed to the Hub calls further down in this module.
const repo: RepoDesignation = { type: "dataset", name: HF_DATASET_ID };
// Optional token from the environment; user OAuth tokens supplied at
// runtime take precedence over it.
const ENV_TOKEN = process.env.HF_TOKEN ?? "";
// Most recent known-good token, held in memory for background work.
// Starts out as the environment token.
let _cachedToken = ENV_TOKEN;
/** Returns the resolved dataset ID; an empty string means none is configured. */
export function getDatasetId(): string {
  const datasetId = HF_DATASET_ID;
  return datasetId;
}
/**
 * Stores `token` as the cached token used by calls that pass no explicit
 * token. Falsy values are ignored, so a previously cached token is never
 * cleared through this function.
 */
export function setUserToken(token: string): void {
  if (!token) return;
  _cachedToken = token;
}
/**
 * Token cascade: the explicit argument wins, then the cached token, then
 * the environment token (which may itself be an empty string).
 */
function getToken(explicit?: string): string {
  if (explicit) return explicit;
  if (_cachedToken) return _cachedToken;
  return ENV_TOKEN;
}
/**
 * Exported entry point to the internal token cascade. It exists for the
 * dataset-proxy route, which has to forward the right token to HF when it
 * fetches assets on behalf of a viewer.
 *
 * @param explicit Token to prefer over the cached / environment tokens.
 * @returns The first non-empty token of the cascade, or "" when there is none.
 */
export function resolveToken(explicit?: string): string {
  const token = getToken(explicit);
  return token;
}
/** True when a dataset ID is configured and at least one token is available. */
export function isHfStorageEnabled(): boolean {
  if (!HF_DATASET_ID) return false;
  const anyToken = _cachedToken || ENV_TOKEN;
  return Boolean(anyToken);
}
// ============================================================
// Storage status tracker
// ============================================================
//
// Every silent failure point in the persistence pipeline used to
// disappear into `console.error` and the editor would happily
// keep showing "Saved". This tracker is the single source of
// truth for "what's the actual state of my data right now":
// every write/error path updates it, and the /api/storage/status
// endpoint surfaces it to the frontend SyncIndicator so the user
// sees the truth, not a comforting lie.
//
// Stages we care about:
// - `dataset-create` : ensureDatasetExists couldn't create the
// backing repo (most common: missing manage-repos scope)
// - `local-save` : writeFileSync into data/<name>.yjs failed
// (disk full, readonly FS, permission denied)
// - `cloud-push` : uploadFile into the dataset failed (HF
// down, token expired, network blip)
//
// We deliberately keep this in-memory: it's per-container,
// non-critical, and the frontend polls every few seconds anyway.
/** Pipeline stage that a recorded storage error belongs to. */
export type StorageErrorStage = "dataset-create" | "local-save" | "cloud-push";
/** Details of one failure captured by the storage status tracker. */
export interface StorageError {
/** Step of the persistence pipeline that failed. */
stage: StorageErrorStage;
/** Message of the thrown error, or its string form when it has no message. */
message: string;
/** HTTP status carried by the error, when one was available. */
statusCode?: number;
/** ms epoch (Date.now()) at which the error was recorded. */
at: number;
/** Doc name when the failure is per-doc (local-save, cloud-push). */
docName?: string;
}
/** Snapshot of the persistence pipeline as returned by getStorageStatus(). */
export interface StorageStatus {
/** Result of isHfStorageEnabled() at the time the snapshot was taken. */
enabled: boolean;
/** Resolved dataset ID; empty string when none is configured. */
datasetId: string;
/** True once the backing dataset has been created or found to exist already. */
datasetReady: boolean;
/** ms epoch of the last successful local writeFileSync. */
lastLocalSaveAt: number | null;
/** ms epoch of the last successful HF dataset push. */
lastCloudPushAt: number | null;
/** True while a debounced push timer is armed. */
pendingPush: boolean;
/** Last error in the pipeline, or null if everything's fine. */
lastError: StorageError | null;
}
// Module-level tracker state. The record* helpers mutate it in place and
// getStorageStatus() returns a shallow copy of it.
const status: StorageStatus = {
// Placeholder only: getStorageStatus() recomputes `enabled` on every read.
enabled: false,
datasetId: HF_DATASET_ID,
datasetReady: false,
lastLocalSaveAt: null,
lastCloudPushAt: null,
pendingPush: false,
lastError: null,
};
/**
 * Returns a shallow copy of the tracker state. `enabled` is recomputed on
 * each call because it depends on whether a user token has arrived yet,
 * which can change mid-session.
 */
export function getStorageStatus(): StorageStatus {
  const snapshot: StorageStatus = { ...status };
  snapshot.enabled = isHfStorageEnabled();
  return snapshot;
}
/**
 * Marks a successful local save of `docName`.
 *
 * Only a previous local-save error for the same doc is cleared. Errors from
 * the cloud-push and dataset-create stages are independent and stay in place.
 */
export function recordLocalSave(docName: string): void {
  status.lastLocalSaveAt = Date.now();
  const previous = status.lastError;
  if (previous === null) return;
  if (previous.stage !== "local-save" || previous.docName !== docName) return;
  status.lastError = null;
}
/** Records a failed local save of `docName` as the tracker's last error. */
export function recordLocalSaveError(docName: string, err: unknown): void {
  const message = (err as Error)?.message || String(err);
  const failure: StorageError = {
    stage: "local-save",
    message,
    at: Date.now(),
    docName,
  };
  status.lastError = failure;
}
/**
 * Marks a successful dataset push of `docName` and clears a previous
 * cloud-push error for that same doc. Other errors are left untouched.
 */
function recordCloudPush(docName: string): void {
  status.lastCloudPushAt = Date.now();
  const previous = status.lastError;
  if (previous === null) return;
  if (previous.stage !== "cloud-push" || previous.docName !== docName) return;
  status.lastError = null;
}
/**
 * Records a failed dataset push of `docName` as the tracker's last error.
 *
 * @param statusCode HTTP status of the failed request, if the caller has one.
 */
function recordCloudPushError(docName: string, err: unknown, statusCode?: number): void {
  const message = (err as Error)?.message || String(err);
  const failure: StorageError = {
    stage: "cloud-push",
    message,
    statusCode,
    at: Date.now(),
    docName,
  };
  status.lastError = failure;
}
/** Flags the dataset as ready and clears a previous dataset-create error. */
function recordDatasetReady(): void {
  status.datasetReady = true;
  const previous = status.lastError;
  const staleCreateError = previous !== null && previous.stage === "dataset-create";
  if (staleCreateError) status.lastError = null;
}
/**
 * Records a failed dataset creation as the tracker's last error. No doc
 * name is attached because this failure is not per-document.
 */
function recordDatasetError(err: unknown, statusCode?: number): void {
  const message = (err as Error)?.message || String(err);
  const failure: StorageError = {
    stage: "dataset-create",
    message,
    statusCode,
    at: Date.now(),
  };
  status.lastError = failure;
}
/**
 * Base URL under which the editor is publicly reachable. It is used to
 * build absolute proxy URLs for assets that must render outside the
 * Space's own page (og:image read by social card unfurlers, "Download PDF"
 * links on external sites, ...).
 *
 * Without SPACE_HOST (development) the result points at localhost on
 * PORT, defaulting to 8080.
 */
function getPublicBaseUrl(): string {
  if (!SPACE_HOST) {
    return "http://localhost:" + (process.env.PORT || "8080");
  }
  return "https://" + SPACE_HOST;
}
// Process-wide "dataset is known to exist" flag; once set, later calls to
// ensureDatasetExists() return immediately.
let _datasetReady = false;
/**
 * Make sure the backing dataset exists on HF, creating it when needed.
 *
 * Creation is **private by default**. Most editor deployments sit inside
 * organisations whose working drafts must not leak, and even solo public
 * projects serve their published assets (images, PDFs, thumbnails) through
 * the editor's own `/d/...` reverse proxy, so anonymous viewers never need
 * direct dataset access. Private creation also sidesteps organisations
 * that reject `private: false` by policy.
 *
 * The call is safe to repeat. A 409 (or an "already" message) counts as
 * success, and success is cached so pushes do not retry creation. Any
 * other failure is logged with its HTTP status when available (the usual
 * culprit is a missing `manage-repos` scope on the user's OAuth grant),
 * recorded in the status tracker, and leaves the flag unset so the next
 * push tries again.
 *
 * @param token Optional token that takes precedence over the cached one.
 */
export async function ensureDatasetExists(token?: string): Promise<void> {
  if (!HF_DATASET_ID || _datasetReady) return;
  const accessToken = getToken(token);
  if (!accessToken) return;
  try {
    await createRepo({ repo, private: true, accessToken });
    console.log(`[hf-storage] created private dataset ${HF_DATASET_ID}`);
  } catch (err: any) {
    const statusCode = err?.statusCode ?? err?.status;
    const message = String(err?.message || err);
    const alreadyExists = statusCode === 409 || message.includes("already");
    if (!alreadyExists) {
      // Keep the log triage-friendly: status code (403 = scope/permission,
      // 422 = invalid name, 5xx = HF down) plus a one-line message. The
      // full error object is left out on purpose so retries do not flood
      // the Space logs with stack dumps.
      console.error(
        `[hf-storage] failed to create dataset ${HF_DATASET_ID} (status=${statusCode ?? "unknown"}): ${message}`,
      );
      recordDatasetError(err, statusCode);
      return;
    }
    console.log(`[hf-storage] dataset ${HF_DATASET_ID} already exists`);
  }
  _datasetReady = true;
  recordDatasetReady();
}
// ---------- Images ----------
/**
 * Upload an image into the dataset under `images/<filename>`.
 *
 * @param buffer   Raw image bytes.
 * @param filename Name of the file inside the `images/` folder.
 * @param token    Optional token that takes precedence over the cached one.
 * @returns Relative proxy URL (`/d/images/<filename>`) for the uploaded image.
 */
export async function uploadImageToHf(
  buffer: Buffer,
  filename: string,
  token?: string,
): Promise<string> {
  await ensureDatasetExists(token);
  const repoPath = "images/" + filename;
  const content = new Blob([new Uint8Array(buffer)]);
  await uploadFile({
    repo,
    file: { path: repoPath, content },
    accessToken: getToken(token),
    commitTitle: "upload image " + filename,
  });
  // The URL goes through the editor's reverse proxy: the dataset is private,
  // so a raw `huggingface.co/datasets/...` link would 401 for anonymous
  // viewers, whereas the proxy attaches a server-side token and forwards.
  // A relative URL is enough because images are only ever rendered from
  // inside the Space's own pages.
  return "/d/" + repoPath;
}
// ---------- Documents ----------
// Docs waiting for a debounced push, keyed by doc name. Each entry holds the
// latest state to upload and the timer that will trigger the upload.
const dirtyDocs = new Map<string, { state: Buffer; timer: NodeJS.Timeout }>();
// Quiet period after the last change before a doc is pushed to the dataset.
const DEBOUNCE_MS = 10_000;
/**
 * Queue `state` for a debounced push of `docName`.
 *
 * Scheduling again for the same doc cancels the armed timer and replaces
 * the pending state, so only the most recent state is uploaded once
 * DEBOUNCE_MS has passed without further calls.
 *
 * @param docName Document the state belongs to.
 * @param state   Serialized document state to upload.
 */
export function schedulePush(docName: string, state: Buffer): void {
  const existing = dirtyDocs.get(docName);
  if (existing) clearTimeout(existing.timer);
  const timer = setTimeout(() => {
    // The timer callback cannot await, so attach a rejection handler:
    // a rejection escaping pushDocument would otherwise surface as an
    // unhandled promise rejection instead of a recorded storage error.
    pushDocument(docName).catch((err: unknown) => {
      console.error(`[hf-storage] unexpected failure while pushing "${docName}":`, err);
      recordCloudPushError(docName, err);
    });
  }, DEBOUNCE_MS);
  dirtyDocs.set(docName, { state, timer });
  status.pendingPush = true;
}
/**
 * Upload the pending state of `docName` to `articles/<safeName>.yjs`.
 *
 * Does nothing when the doc has no pending entry. Upload failures are
 * logged and recorded in the status tracker rather than thrown.
 *
 * NOTE: the entry is removed from the queue before the upload starts and
 * is not re-queued on failure, so a failed state is only retried when the
 * next schedulePush() for that doc arrives.
 */
async function pushDocument(docName: string): Promise<void> {
  const entry = dirtyDocs.get(docName);
  if (!entry) return;
  // When the push is triggered by flushAll() the debounce timer is still
  // armed. Disarm it so it neither keeps the event loop alive nor fires a
  // pointless second push. From the timer's own callback this is a no-op.
  clearTimeout(entry.timer);
  dirtyDocs.delete(docName);
  // `pendingPush` reflects whether OTHER docs still have a timer armed. A
  // single editor only touches one doc, so this is normally false after
  // the delete, but the multi-doc case must be reported truthfully too.
  status.pendingPush = dirtyDocs.size > 0;
  await ensureDatasetExists();
  const safeName = sanitizeName(docName);
  const path = `articles/${safeName}.yjs`;
  try {
    await uploadFile({
      repo,
      file: { path, content: new Blob([new Uint8Array(entry.state)]) },
      accessToken: getToken(),
      commitTitle: `save ${safeName}`,
    });
    console.log(`[hf-storage] pushed ${path}`);
    recordCloudPush(docName);
  } catch (err: unknown) {
    // Hub errors expose the HTTP status as `statusCode` (or `status`).
    const failure = err as { statusCode?: number; status?: number } | null | undefined;
    const statusCode = failure?.statusCode ?? failure?.status;
    console.error(
      `[hf-storage] failed to push ${path}` +
        ` (status=${statusCode ?? "unknown"}): ${(err as Error)?.message || err}`,
    );
    recordCloudPushError(docName, err, statusCode);
  }
}
/**
 * Download the stored state of `docName` from `articles/<safeName>.yjs`.
 *
 * @returns The file's bytes, or `null` when no dataset is configured, the
 *          file does not exist, or the download failed. Failures are
 *          logged so that a `null` caused by an outage or an expired token
 *          can be told apart from "no such document" in the logs.
 */
export async function pullDocument(docName: string): Promise<Uint8Array | null> {
  const safeName = sanitizeName(docName);
  const path = `articles/${safeName}.yjs`;
  // Without a dataset ID there is nothing to download from.
  if (!HF_DATASET_ID) return null;
  try {
    const res = await downloadFile({
      repo,
      path,
      accessToken: getToken(),
    });
    if (!res) return null;
    return new Uint8Array(await res.arrayBuffer());
  } catch (err: unknown) {
    const failure = err as
      | { statusCode?: number; status?: number; message?: string }
      | null
      | undefined;
    const statusCode = failure?.statusCode ?? failure?.status;
    const message = failure?.message || String(err);
    // A 404 is the expected "nothing stored yet" outcome, so it is logged
    // at info level. Anything else is a real failure and gets a warning.
    const log = statusCode === 404 ? console.log : console.warn;
    log(
      `[hf-storage] failed to pull ${path}` +
        ` (status=${statusCode ?? "unknown"}): ${message}`,
    );
    return null;
  }
}
/**
 * Push every doc that currently has a pending entry, without waiting for
 * its debounce timer, and resolve once all pushes have settled.
 */
export async function flushAll(): Promise<void> {
  const inFlight: Promise<void>[] = [];
  // Iterate over a copy of the keys: pushDocument() removes entries from
  // the map as it starts.
  for (const name of Array.from(dirtyDocs.keys())) {
    inFlight.push(pushDocument(name));
  }
  await Promise.allSettled(inFlight);
}
/**
 * Pull published assets (HTML, PDF, thumbnail, meta, llms.txt) of `docName`
 * from the HF dataset and write them to `<localDir>/published/<safeName>/`
 * so the server can serve them. Called on startup to restore published
 * state after a container rebuild.
 *
 * Each file is first requested through the Hub library. A 404 is treated
 * as final for that file. Any other failure triggers one direct HTTP fetch
 * that carries the same access token.
 *
 * @param docName  Document name; sanitized before being used in paths.
 * @param localDir Local data directory that receives the `published/` tree.
 * @returns `true` when at least one file was written, `false` otherwise
 *          (including when no dataset ID is configured).
 */
export async function pullPublishedAssets(
  docName: string,
  localDir: string,
): Promise<boolean> {
  if (!HF_DATASET_ID) {
    console.log("[hf-storage] pullPublished skipped: no dataset ID");
    return false;
  }
  const token = getToken();
  const accessToken = token || undefined;
  console.log(`[hf-storage] pullPublished for "${docName}" from dataset ${HF_DATASET_ID} (token: ${token ? "yes" : "none - public access"})`);
  const safeName = sanitizeName(docName);
  const base = `published/${safeName}`;
  const files = ["index.html", "article.pdf", "thumb.jpg", "meta.json", "llms.txt"];
  const { mkdirSync, writeFileSync } = await import("fs");
  const { join } = await import("path");
  const outDir = join(localDir, "published", safeName);

  // Writes one downloaded asset and returns its size in bytes. The output
  // directory is created lazily, so a run that finds nothing leaves no
  // empty directory behind.
  const persist = (file: string, data: ArrayBuffer): number => {
    mkdirSync(outDir, { recursive: true });
    const buf = Buffer.from(data);
    writeFileSync(join(outDir, file), buf);
    return buf.length;
  };

  let foundAny = false;
  for (const file of files) {
    const remotePath = `${base}/${file}`;
    try {
      const res = await downloadFile({
        repo,
        path: remotePath,
        accessToken,
      });
      if (!res) {
        console.log(`[hf-storage] ${remotePath}: not found via Hub lib`);
        continue;
      }
      const bytes = persist(file, await res.arrayBuffer());
      console.log(`[hf-storage] pulled ${remotePath} (${bytes} bytes)`);
      foundAny = true;
    } catch (err: unknown) {
      const failure = err as
        | { statusCode?: unknown; status?: unknown; message?: string }
        | null
        | undefined;
      const msg = failure?.message || String(err);
      // Prefer the structured status on the error object, as the other
      // Hub calls in this file do. The message regex is only a fallback
      // for errors that carry the status in their text alone.
      const httpStatus =
        failure?.statusCode ?? failure?.status ?? msg.match(/status (\d+)/)?.[1];
      // A 404 from the Hub lib is the definitive "this file isn't on the
      // dataset" answer. Retrying with a direct fetch would either confirm
      // the 404 or, on a private dataset, return a misleading 401; either
      // way it only spams the logs. Skip the fallback in that case.
      if (String(httpStatus) === "404") {
        console.log(`[hf-storage] ${remotePath}: not found (404)`);
        continue;
      }
      console.warn(
        `[hf-storage] Hub lib failed for ${remotePath}: ${msg}`,
      );
      // For other errors (network glitch, lib bug, ...) fall back to a
      // direct HTTP fetch. Forward the same access token used above: the
      // dataset may be private, and an anonymous fetch would 401 even
      // though the user does have access.
      try {
        const url = `https://huggingface.co/datasets/${HF_DATASET_ID}/resolve/main/${remotePath}`;
        console.log(`[hf-storage] trying direct fetch: ${url}`);
        const httpRes = await fetch(
          url,
          accessToken
            ? { headers: { Authorization: `Bearer ${accessToken}` } }
            : undefined,
        );
        if (!httpRes.ok) {
          console.log(`[hf-storage] direct fetch ${file}: ${httpRes.status}`);
          continue;
        }
        const bytes = persist(file, await httpRes.arrayBuffer());
        console.log(
          `[hf-storage] pulled ${remotePath} via direct fetch (${bytes} bytes)`,
        );
        foundAny = true;
      } catch (fetchErr: unknown) {
        console.warn(
          `[hf-storage] direct fetch also failed for ${file}:`,
          (fetchErr as Error)?.message,
        );
      }
    }
  }
  console.log(`[hf-storage] pullPublished result: ${foundAny ? "restored" : "nothing found"}`);
  return foundAny;
}
// ---------- Published Assets ----------
/**
 * Proxied URL of one published asset of `docName`.
 *
 * The URL is **absolute** because some consumers live outside the Space
 * (Twitter / Slack / Discord card unfurlers reading og:image, external
 * links to the PDF download) and could not resolve a relative `/d/...`
 * path. Pages inside the Space would work with a relative URL as well,
 * but always returning the absolute form keeps the contract uniform.
 *
 * @param docName  Document name; sanitized before it enters the URL.
 * @param filename File inside the doc's published folder.
 */
export function getPublishedAssetUrl(docName: string, filename: string): string {
  const safeName = sanitizeName(docName);
  const segments = [getPublicBaseUrl(), "d", "published", safeName, filename];
  return segments.join("/");
}
/** Everything uploadPublishedAssets() can write for one published document. */
interface PublishedPayload {
/** Rendered article; uploaded as `index.html`. */
html: string;
/** PDF bytes, uploaded as `article.pdf`; skipped when null. */
pdf: Buffer | null;
/** Thumbnail bytes, uploaded as `thumb.jpg`; skipped when null. */
thumbnail: Buffer | null;
/** Article metadata; serialized as pretty-printed JSON into `meta.json`. */
meta: { title: string; description: string; authors: string[]; date: string; [key: string]: unknown };
/** Optional text uploaded as `llms.txt`; skipped when missing or empty. */
llmsTxt?: string;
}
/**
 * Upload the published assets of `docName` to `published/<safeName>/` in a
 * single commit.
 *
 * `index.html` and `meta.json` are always written. `article.pdf`,
 * `thumb.jpg` and `llms.txt` are added only when the payload provides them.
 *
 * The returned URLs go through the editor's `/d/...` reverse proxy (see
 * getPublishedAssetUrl). The dataset is created private, so raw
 * `huggingface.co/datasets/.../resolve/main/...` links would answer 401
 * to anonymous viewers.
 *
 * @param docName Document name; sanitized before being used in paths.
 * @param payload Assets to publish.
 * @param token   Optional token that takes precedence over the cached one.
 * @returns Proxied URLs of the HTML, and of the PDF / thumbnail when they
 *          were part of the payload (otherwise `null`).
 * @throws Whatever `commit` throws; upload failures are not caught here.
 */
export async function uploadPublishedAssets(
  docName: string,
  payload: PublishedPayload,
  token?: string,
): Promise<{ htmlUrl: string; pdfUrl: string | null; thumbUrl: string | null }> {
  await ensureDatasetExists(token);
  const accessToken = getToken(token);
  const safeName = sanitizeName(docName);
  const base = `published/${safeName}`;
  const operations: CommitFile[] = [
    {
      operation: "addOrUpdate",
      path: `${base}/index.html`,
      content: new Blob([payload.html]),
    },
    {
      operation: "addOrUpdate",
      path: `${base}/meta.json`,
      content: new Blob([JSON.stringify(payload.meta, null, 2)]),
    },
  ];
  let pdfUrl: string | null = null;
  let thumbUrl: string | null = null;
  if (payload.pdf) {
    operations.push({
      operation: "addOrUpdate",
      path: `${base}/article.pdf`,
      content: new Blob([new Uint8Array(payload.pdf)]),
    });
    pdfUrl = getPublishedAssetUrl(docName, "article.pdf");
  }
  if (payload.thumbnail) {
    operations.push({
      operation: "addOrUpdate",
      path: `${base}/thumb.jpg`,
      content: new Blob([new Uint8Array(payload.thumbnail)]),
    });
    thumbUrl = getPublishedAssetUrl(docName, "thumb.jpg");
  }
  if (payload.llmsTxt) {
    operations.push({
      operation: "addOrUpdate",
      path: `${base}/llms.txt`,
      content: new Blob([payload.llmsTxt], { type: "text/markdown; charset=utf-8" }),
    });
  }
  // One commit for all files, so a publish is never half-applied on the dataset.
  await commit({
    repo,
    operations,
    title: `publish ${safeName}`,
    accessToken,
  });
  console.log(`[hf-storage] published ${safeName} (${operations.length} files, single commit)`);
  return { htmlUrl: getPublishedAssetUrl(docName, "index.html"), pdfUrl, thumbUrl };
}