| import { uploadFile, downloadFile, createRepo, commit, type CommitFile, type RepoDesignation } from "@huggingface/hub"; |
| import { sanitizeName } from "./utils.js"; |
|
|
/**
 * Value of the `SPACE_ID` environment variable, or "" when it is unset.
 * Used below only to derive a default dataset id.
 * NOTE(review): presumably injected by the Hugging Face Spaces runtime — confirm.
 */
const SPACE_ID = process.env.SPACE_ID || "";

/**
 * Value of the `SPACE_HOST` environment variable, or "" when it is unset.
 * When non-empty it becomes the HTTPS host of public asset URLs.
 */
const SPACE_HOST = process.env.SPACE_HOST || "";

/** Access token read from the `HF_TOKEN` environment variable ("" when unset). */
const ENV_TOKEN = process.env.HF_TOKEN || "";

/**
 * Token used when a caller does not pass one explicitly. Starts out as
 * `ENV_TOKEN` and is overwritten by `setUserToken` with any non-empty token.
 */
let _cachedToken = ENV_TOKEN;

/**
 * Id of the dataset used for storage. An explicit `HF_DATASET_ID` environment
 * variable wins; otherwise it falls back to `<SPACE_ID>-data`, and to "" when
 * neither variable is set (which disables storage).
 */
const HF_DATASET_ID =
  process.env.HF_DATASET_ID || (SPACE_ID ? `${SPACE_ID}-data` : "");

/** Hub designation of that dataset, passed to the `@huggingface/hub` calls in this module. */
const repo: RepoDesignation = { type: "dataset", name: HF_DATASET_ID };
|
|
/**
 * Returns the id of the dataset this module reads from and writes to.
 *
 * @returns The configured dataset id, or "" when none could be derived from the environment.
 */
export function getDatasetId(): string {
  const datasetId = HF_DATASET_ID;
  return datasetId;
}
|
|
/**
 * Replaces the module-wide cached token with a user-supplied one.
 *
 * @param token - New access token. Empty values are ignored, so the
 *   previously cached token is never cleared through this function.
 */
export function setUserToken(token: string): void {
  if (!token) return;
  _cachedToken = token;
}
|
|
/**
 * Picks the access token to use for a Hub call.
 *
 * Precedence: the explicit argument, then the cached token, then the token
 * from the environment.
 *
 * @param explicit - Token supplied by the caller, if any.
 * @returns The first non-empty candidate, or "" when none is available.
 */
function getToken(explicit?: string): string {
  if (explicit) return explicit;
  if (_cachedToken) return _cachedToken;
  return ENV_TOKEN;
}
|
|
| |
| |
| |
| |
| |
/**
 * Exported wrapper around the module's token lookup.
 *
 * @param explicit - Token supplied by the caller, if any; it takes precedence
 *   over the cached and environment tokens.
 * @returns The token that would be used for a Hub call, or "" when none is available.
 */
export function resolveToken(explicit?: string): string {
  const resolved = getToken(explicit);
  return resolved;
}
|
|
/**
 * Reports whether Hub storage can be used at all.
 *
 * @returns `true` only when a dataset id is configured AND a token is
 *   available (cached or from the environment).
 */
export function isHfStorageEnabled(): boolean {
  if (!HF_DATASET_ID) return false;
  const hasToken = Boolean(_cachedToken || ENV_TOKEN);
  return hasToken;
}
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
/** Step of the storage pipeline in which a recorded error occurred. */
export type StorageErrorStage =
  | "dataset-create"
  | "local-save"
  | "cloud-push";
|
|
/** Description of the most recent storage failure, as exposed through the storage status. */
export interface StorageError {
  /** Pipeline step that failed. */
  stage: StorageErrorStage;
  /** Error message (the thrown error's `message`, or its string form). */
  message: string;
  /** HTTP-style status code taken from the thrown error, when it carried one. */
  statusCode?: number;
  /** `Date.now()` timestamp (milliseconds) at which the error was recorded. */
  at: number;
  /** Document the error relates to; set for local-save and cloud-push errors only. */
  docName?: string;
}
|
|
/** Snapshot of the storage layer's health, as returned by `getStorageStatus`. */
export interface StorageStatus {
  /** Whether Hub storage is usable (dataset id and token both present). */
  enabled: boolean;
  /** Id of the dataset in use ("" when none is configured). */
  datasetId: string;
  /** True once the dataset has been created or confirmed to exist. */
  datasetReady: boolean;
  /** `Date.now()` timestamp of the last successful local save, or null if none yet. */
  lastLocalSaveAt: number | null;
  /** `Date.now()` timestamp of the last successful push to the Hub, or null if none yet. */
  lastCloudPushAt: number | null;
  /** True while at least one document is waiting in the debounce queue. */
  pendingPush: boolean;
  /** Most recent unresolved error, or null when there is none. */
  lastError: StorageError | null;
}
|
|
/**
 * Mutable module-level status record, updated by the `record*` helpers and the
 * push queue. `enabled` is only a placeholder here: `getStorageStatus`
 * recomputes it on every read.
 */
const status: StorageStatus = {
  datasetId: HF_DATASET_ID,
  enabled: false,
  datasetReady: false,
  pendingPush: false,
  lastLocalSaveAt: null,
  lastCloudPushAt: null,
  lastError: null,
};
|
|
/**
 * Returns a snapshot of the current storage status.
 *
 * The result is a shallow copy, so callers cannot mutate the module's record
 * through its top-level fields (the `lastError` object itself is shared).
 * `enabled` is computed at call time because the token can change after
 * module load.
 */
export function getStorageStatus(): StorageStatus {
  const snapshot: StorageStatus = { ...status };
  snapshot.enabled = isHfStorageEnabled();
  return snapshot;
}
|
|
/**
 * Records a successful local save of a document.
 *
 * @param docName - Document that was saved.
 */
export function recordLocalSave(docName: string): void {
  status.lastLocalSaveAt = Date.now();

  // Only clear the error this save actually resolves: a local-save failure for
  // the same document. Errors from other stages or documents stay visible.
  const previous = status.lastError;
  const resolvesPrevious =
    previous?.stage === "local-save" && previous.docName === docName;
  if (resolvesPrevious) {
    status.lastError = null;
  }
}
|
|
/**
 * Records a failed local save as the current `lastError`, replacing whatever
 * error was stored before.
 *
 * @param docName - Document whose save failed.
 * @param err - Thrown value; its `message` is used when present, otherwise its string form.
 */
export function recordLocalSaveError(docName: string, err: unknown): void {
  const message = (err as Error)?.message || String(err);
  const failure: StorageError = {
    stage: "local-save",
    message,
    at: Date.now(),
    docName,
  };
  status.lastError = failure;
}
|
|
/**
 * Records a successful push of a document to the Hub.
 *
 * @param docName - Document that was pushed.
 */
function recordCloudPush(docName: string): void {
  status.lastCloudPushAt = Date.now();

  // Clear only a cloud-push error belonging to this same document.
  const previous = status.lastError;
  const resolvesPrevious =
    previous?.stage === "cloud-push" && previous.docName === docName;
  if (resolvesPrevious) {
    status.lastError = null;
  }
}
|
|
/**
 * Records a failed push to the Hub as the current `lastError`, replacing
 * whatever error was stored before.
 *
 * @param docName - Document whose push failed.
 * @param err - Thrown value; its `message` is used when present, otherwise its string form.
 * @param statusCode - Status code extracted from the error by the caller, if any.
 */
function recordCloudPushError(docName: string, err: unknown, statusCode?: number): void {
  const message = (err as Error)?.message || String(err);
  const failure: StorageError = {
    stage: "cloud-push",
    message,
    statusCode,
    at: Date.now(),
    docName,
  };
  status.lastError = failure;
}
|
|
/**
 * Marks the dataset as ready in the status record and clears a stored
 * dataset-creation error, if that is what `lastError` currently holds.
 */
function recordDatasetReady(): void {
  status.datasetReady = true;

  const previous = status.lastError;
  if (previous && previous.stage === "dataset-create") {
    status.lastError = null;
  }
}
|
|
/**
 * Records a dataset-creation failure as the current `lastError`. No document
 * name is attached because the failure is not tied to a single document.
 *
 * @param err - Thrown value; its `message` is used when present, otherwise its string form.
 * @param statusCode - Status code extracted from the error by the caller, if any.
 */
function recordDatasetError(err: unknown, statusCode?: number): void {
  const message = (err as Error)?.message || String(err);
  const failure: StorageError = {
    stage: "dataset-create",
    message,
    statusCode,
    at: Date.now(),
  };
  status.lastError = failure;
}
|
|
| |
| |
| |
| |
| |
| |
| |
/**
 * Builds the origin under which this server is publicly reachable.
 *
 * @returns `https://<SPACE_HOST>` when a Space host is configured; otherwise
 *   `http://localhost:<PORT>`, with the port defaulting to 8080.
 */
function getPublicBaseUrl(): string {
  if (!SPACE_HOST) {
    const localPort = process.env.PORT || "8080";
    return `http://localhost:${localPort}`;
  }
  return `https://${SPACE_HOST}`;
}
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
/** Set once the dataset has been created or confirmed to exist, so later calls return immediately. */
let _datasetReady = false;

/**
 * Makes sure the storage dataset exists, creating it as a private dataset on
 * first use.
 *
 * Does nothing when no dataset id is configured, when no token is available,
 * or when an earlier call already succeeded. Failures are logged and recorded
 * in the storage status instead of being thrown, so callers proceed (and may
 * fail later on their own Hub call).
 *
 * @param token - Optional token that overrides the cached/environment token.
 */
export async function ensureDatasetExists(token?: string): Promise<void> {
  if (!HF_DATASET_ID || _datasetReady) return;

  const accessToken = getToken(token);
  if (!accessToken) return;

  try {
    await createRepo({ repo, private: true, accessToken });
    console.log(`[hf-storage] created private dataset ${HF_DATASET_ID}`);
  } catch (err: unknown) {
    const failure = err as
      | { statusCode?: number; status?: number; message?: unknown }
      | null
      | undefined;
    const statusCode = failure?.statusCode ?? failure?.status;
    const message = String(failure?.message || err);

    // A 409, or a message saying the repo "already" exists, means the dataset
    // is there and usable; anything else is a real failure.
    const alreadyThere = statusCode === 409 || message.includes("already");
    if (!alreadyThere) {
      console.error(
        `[hf-storage] failed to create dataset ${HF_DATASET_ID}` +
          ` (status=${statusCode ?? "unknown"}): ${message}`,
      );
      recordDatasetError(err, statusCode);
      return;
    }
    console.log(`[hf-storage] dataset ${HF_DATASET_ID} already exists`);
  }

  _datasetReady = true;
  recordDatasetReady();
}
|
|
| |
|
|
/**
 * Uploads an image to the dataset under `images/<filename>`.
 *
 * @param buffer - Raw image bytes.
 * @param filename - File name used inside the dataset's `images/` folder.
 * @param token - Optional token that overrides the cached/environment token.
 * @returns The site-relative path `/d/images/<filename>`.
 *   NOTE(review): presumably resolved by a `/d/` route elsewhere in the app — confirm.
 * @throws Whatever `uploadFile` rejects with; upload errors are not caught here.
 */
export async function uploadImageToHf(
  buffer: Buffer,
  filename: string,
  token?: string,
): Promise<string> {
  await ensureDatasetExists(token);

  const repoPath = `images/${filename}`;
  // Copy the bytes into a fresh Uint8Array before wrapping them in a Blob.
  const content = new Blob([new Uint8Array(buffer)]);
  const accessToken = getToken(token);

  await uploadFile({
    repo,
    file: { path: repoPath, content },
    accessToken,
    commitTitle: `upload image ${filename}`,
  });

  return `/d/${repoPath}`;
}
|
|
| |
|
|
/** Delay between the last scheduled change to a document and its push to the Hub: 10 seconds. */
const DEBOUNCE_MS = 10 * 1000;

/**
 * Documents waiting to be pushed, keyed by document name. Each entry holds the
 * latest state to upload and the debounce timer that will trigger the push.
 */
const dirtyDocs = new Map<string, { state: Buffer; timer: NodeJS.Timeout }>();
|
|
/**
 * Queues a document state for a debounced push to the Hub.
 *
 * Each call for the same document cancels the previous timer and restarts the
 * debounce window, so only the most recent state is uploaded.
 *
 * @param docName - Document to push.
 * @param state - Serialized document state to upload when the timer fires.
 */
export function schedulePush(docName: string, state: Buffer): void {
  const existing = dirtyDocs.get(docName);
  if (existing) clearTimeout(existing.timer);

  const timer = setTimeout(() => {
    // The push runs detached from any caller. Without a rejection handler an
    // unexpected failure would surface as an unhandled promise rejection.
    void pushDocument(docName).catch((err: unknown) => {
      console.error(`[hf-storage] unexpected error while pushing ${docName}:`, err);
    });
  }, DEBOUNCE_MS);

  dirtyDocs.set(docName, { state, timer });
  status.pendingPush = true;
}
|
|
/**
 * Uploads the queued state of one document to `articles/<safeName>.yjs`.
 *
 * The queue entry is removed before the upload starts. A failed upload is
 * logged and recorded in the storage status but is not re-queued or rethrown.
 *
 * @param docName - Document whose queued state should be pushed. A no-op when
 *   nothing is queued for it.
 */
async function pushDocument(docName: string): Promise<void> {
  const entry = dirtyDocs.get(docName);
  if (!entry) return;

  // Cancel the debounce timer. When this runs via flushAll() the timer is
  // still pending; left alone it would keep the event loop alive and could
  // fire early against a newer entry scheduled for the same document.
  // Clearing a timer that has already fired is a harmless no-op.
  clearTimeout(entry.timer);
  dirtyDocs.delete(docName);

  // Reflects the queue only: an upload still in flight is not "pending".
  status.pendingPush = dirtyDocs.size > 0;

  await ensureDatasetExists();

  const safeName = sanitizeName(docName);
  const path = `articles/${safeName}.yjs`;

  try {
    await uploadFile({
      repo,
      file: { path, content: new Blob([new Uint8Array(entry.state)]) },
      accessToken: getToken(),
      commitTitle: `save ${safeName}`,
    });
    console.log(`[hf-storage] pushed ${path}`);
    recordCloudPush(docName);
  } catch (err: any) {
    const statusCode = err?.statusCode ?? err?.status;
    console.error(
      `[hf-storage] failed to push ${path}` +
        ` (status=${statusCode ?? "unknown"}): ${(err as Error)?.message || err}`,
    );
    recordCloudPushError(docName, err, statusCode);
  }
}
|
|
/**
 * Downloads the stored state of a document from `articles/<safeName>.yjs`.
 *
 * This is best-effort: any failure yields `null` rather than an exception.
 * Failures are logged so that an auth or network problem can be told apart
 * from a document that simply has not been saved yet.
 *
 * @param docName - Document to fetch.
 * @returns The stored bytes, or `null` when no dataset is configured, the file
 *   does not exist, or the download failed.
 */
export async function pullDocument(docName: string): Promise<Uint8Array | null> {
  // Without a dataset there is nothing to download; skip the Hub request.
  if (!HF_DATASET_ID) return null;

  const safeName = sanitizeName(docName);
  const path = `articles/${safeName}.yjs`;

  try {
    const res = await downloadFile({
      repo,
      path,
      accessToken: getToken(),
    });
    if (!res) return null;
    return new Uint8Array(await res.arrayBuffer());
  } catch (err: unknown) {
    const detail = err instanceof Error ? err.message : String(err);
    console.warn(`[hf-storage] failed to pull ${path}: ${detail}`);
    return null;
  }
}
|
|
/**
 * Pushes every queued document immediately instead of waiting for its
 * debounce timer. Resolves once all pushes have settled; individual failures
 * do not reject the returned promise.
 */
export async function flushAll(): Promise<void> {
  // Snapshot the keys first: each push removes its own entry from the queue.
  const queued = Array.from(dirtyDocs.keys());
  const pushes = queued.map((name) => pushDocument(name));
  await Promise.allSettled(pushes);
}
|
|
| |
| |
| |
| |
| |
/**
 * Restores the published assets of a document from the dataset to local disk.
 *
 * For each known asset under `published/<safeName>/` the Hub library is tried
 * first. If that fails with anything other than a 404, the file is requested
 * directly from the dataset's `resolve/main` URL. Files that are found are
 * written to `<localDir>/published/<safeName>/`.
 *
 * @param docName - Document whose published assets should be restored.
 * @param localDir - Root directory under which `published/<safeName>/` is created.
 * @returns `true` when at least one file was written, `false` otherwise
 *   (including when no dataset id is configured).
 */
export async function pullPublishedAssets(
  docName: string,
  localDir: string,
): Promise<boolean> {
  if (!HF_DATASET_ID) {
    console.log("[hf-storage] pullPublished skipped: no dataset ID");
    return false;
  }

  const token = getToken();
  // An empty token is passed on as "no credentials", not as an empty string.
  const accessToken = token || undefined;
  console.log(`[hf-storage] pullPublished for "${docName}" from dataset ${HF_DATASET_ID} (token: ${token ? "yes" : "none - public access"})`);

  const safeName = sanitizeName(docName);
  const base = `published/${safeName}`;
  const files = ["index.html", "article.pdf", "thumb.jpg", "meta.json", "llms.txt"];

  const { mkdirSync, writeFileSync } = await import("fs");
  const { join } = await import("path");
  const outDir = join(localDir, "published", safeName);

  // Writes one downloaded body into the output directory and returns its size in bytes.
  const saveLocally = async (
    file: string,
    body: { arrayBuffer(): Promise<ArrayBuffer> },
  ): Promise<number> => {
    mkdirSync(outDir, { recursive: true });
    const buf = Buffer.from(await body.arrayBuffer());
    writeFileSync(join(outDir, file), buf);
    return buf.length;
  };

  // Null-safe message extraction: a rejection value is not guaranteed to be an Error.
  const messageOf = (e: unknown): string =>
    (e as { message?: string } | null | undefined)?.message || "";

  const statusPattern = /status (\d+)/;
  let foundAny = false;

  for (const file of files) {
    try {
      const res = await downloadFile({
        repo,
        path: `${base}/${file}`,
        accessToken,
      });
      if (!res) {
        console.log(`[hf-storage] ${base}/${file}: not found via Hub lib`);
        continue;
      }

      const size = await saveLocally(file, res);
      console.log(`[hf-storage] pulled ${base}/${file} (${size} bytes)`);
      foundAny = true;
    } catch (err: unknown) {
      const msg = messageOf(err);
      // Prefer the structured status on the error object, as the other Hub
      // calls in this module do; fall back to parsing it from the message.
      const structured = err as { statusCode?: unknown; status?: unknown } | null | undefined;
      const errorCode =
        structured?.statusCode ?? structured?.status ?? msg.match(statusPattern)?.[1];

      // A plain 404 means the asset was never published: no fallback needed.
      if (String(errorCode) === "404") {
        console.log(`[hf-storage] ${base}/${file}: not found (404)`);
        continue;
      }

      console.warn(
        `[hf-storage] Hub lib failed for ${base}/${file}: ${msg}`,
      );

      // Fallback: request the file straight from the dataset's resolve URL.
      try {
        const url = `https://huggingface.co/datasets/${HF_DATASET_ID}/resolve/main/${base}/${file}`;
        console.log(`[hf-storage] trying direct fetch: ${url}`);
        const httpRes = await fetch(
          url,
          accessToken
            ? { headers: { Authorization: `Bearer ${accessToken}` } }
            : undefined,
        );
        if (!httpRes.ok) {
          console.log(`[hf-storage] direct fetch ${file}: ${httpRes.status}`);
          continue;
        }
        const size = await saveLocally(file, httpRes);
        console.log(
          `[hf-storage] pulled ${base}/${file} via direct fetch (${size} bytes)`,
        );
        foundAny = true;
      } catch (fetchErr: unknown) {
        console.warn(
          `[hf-storage] direct fetch also failed for ${file}:`,
          messageOf(fetchErr),
        );
      }
    }
  }

  console.log(`[hf-storage] pullPublished result: ${foundAny ? "restored" : "nothing found"}`);
  return foundAny;
}
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
/**
 * Builds the public URL of a published asset as served by this application.
 *
 * @param docName - Document the asset belongs to; sanitized before use in the path.
 * @param filename - Asset file name, appended to the path as given.
 * @returns `<public base URL>/d/published/<safeName>/<filename>`.
 */
export function getPublishedAssetUrl(docName: string, filename: string): string {
  const safeName = sanitizeName(docName);
  const origin = getPublicBaseUrl();
  return `${origin}/d/published/${safeName}/${filename}`;
}
|
|
/** Everything that makes up one published article, as handed to `uploadPublishedAssets`. */
interface PublishedPayload {
  /** Rendered article, uploaded as `index.html`. */
  html: string;
  /** PDF bytes, uploaded as `article.pdf`; `null` skips the file. */
  pdf: Buffer | null;
  /** Thumbnail bytes, uploaded as `thumb.jpg`; `null` skips the file. */
  thumbnail: Buffer | null;
  /** Article metadata, serialized to `meta.json`. Extra keys are allowed. */
  meta: {
    title: string;
    description: string;
    authors: string[];
    date: string;
    [key: string]: unknown;
  };
  /** Optional text uploaded as `llms.txt`; omitted or empty skips the file. */
  llmsTxt?: string;
}
|
|
/**
 * Publishes an article by uploading all of its assets to
 * `published/<safeName>/` in a single commit.
 *
 * `index.html` and `meta.json` are always written; `article.pdf`, `thumb.jpg`
 * and `llms.txt` only when the payload provides them.
 *
 * @param docName - Document being published; sanitized before use in paths.
 * @param payload - Assets to upload.
 * @param token - Optional token that overrides the cached/environment token.
 * @returns Dataset `resolve/main` URLs of the uploaded HTML, PDF and
 *   thumbnail (`null` for assets that were not part of the payload).
 * @throws Whatever `commit` rejects with; commit errors are not caught here.
 */
export async function uploadPublishedAssets(
  docName: string,
  payload: PublishedPayload,
  token?: string,
): Promise<{ htmlUrl: string; pdfUrl: string | null; thumbUrl: string | null }> {
  await ensureDatasetExists(token);

  const accessToken = getToken(token);
  const safeName = sanitizeName(docName);
  const base = `published/${safeName}`;
  const baseUrl = `https://huggingface.co/datasets/${HF_DATASET_ID}/resolve/main/${base}`;

  const operations: CommitFile[] = [];
  // Queues one file of the article folder for the commit.
  const stage = (file: string, content: Blob): void => {
    operations.push({ operation: "addOrUpdate", path: `${base}/${file}`, content });
  };

  stage("index.html", new Blob([payload.html]));
  stage("meta.json", new Blob([JSON.stringify(payload.meta, null, 2)]));

  let pdfUrl: string | null = null;
  if (payload.pdf) {
    stage("article.pdf", new Blob([new Uint8Array(payload.pdf)]));
    pdfUrl = `${baseUrl}/article.pdf`;
  }

  let thumbUrl: string | null = null;
  if (payload.thumbnail) {
    stage("thumb.jpg", new Blob([new Uint8Array(payload.thumbnail)]));
    thumbUrl = `${baseUrl}/thumb.jpg`;
  }

  if (payload.llmsTxt) {
    stage(
      "llms.txt",
      new Blob([payload.llmsTxt], { type: "text/markdown; charset=utf-8" }),
    );
  }

  // One commit for all files, so the published folder is updated together.
  await commit({
    repo,
    operations,
    title: `publish ${safeName}`,
    accessToken,
  });

  console.log(`[hf-storage] published ${safeName} (${operations.length} files, single commit)`);

  const htmlUrl = `${baseUrl}/index.html`;
  return { htmlUrl, pdfUrl, thumbUrl };
}
|
|