/** * HF Dataset ↔ SQLite DB sync. * On startup: pull raqim.db from a private HF Dataset if present. * On interval / SIGTERM: push raqim.db back to the dataset. * * Uses git (already installed in the Docker image) to push/pull, * which is the most reliable method for HF Hub repos. */ import { execSync, spawnSync } from "child_process"; import fs from "fs"; import path from "path"; import { fileURLToPath } from "url"; import { DatabaseSync } from "node:sqlite"; const __dirname = path.dirname(fileURLToPath(import.meta.url)); const HF_TOKEN = process.env.HF_TOKEN; const HF_USERNAME = process.env.HF_USERNAME; const DATASET_NAME = "raqim-db"; const DB_PATH = process.env.DB_PATH || "/data/raqim.db"; const CLONE_DIR = "/tmp/raqim-db-repo"; function run(cmd, opts = {}) { try { execSync(cmd, { stdio: "inherit", ...opts }); return true; } catch { return false; } } function gitUrl() { return `https://${HF_USERNAME}:${HF_TOKEN}@huggingface.co/datasets/${HF_USERNAME}/${DATASET_NAME}`; } async function ensureDataset() { if (!HF_TOKEN || !HF_USERNAME) return; await fetch("https://huggingface.co/api/repos/create", { method: "POST", headers: { Authorization: `Bearer ${HF_TOKEN}`, "Content-Type": "application/json", }, body: JSON.stringify({ type: "dataset", name: DATASET_NAME, private: true }), }).catch(() => {}); } export async function pullDb() { if (!HF_TOKEN || !HF_USERNAME) { console.log("[db-sync] No HF credentials — skipping pull."); return; } // /data is HF Spaces' persistent volume — it survives container restarts. // Only restore from the HF Dataset backup when there is no local DB at all // (i.e. first boot on a freshly created or fully reset Space). // Overwriting an existing /data/raqim.db would roll back any data that was // written after the last backup push, causing files to disappear. if (fs.existsSync(DB_PATH)) { console.log("[db-sync] Local DB already exists — skipping restore to preserve recent data."); return; } await ensureDataset(); // Ensure DB dir exists fs.mkdirSync(path.dirname(DB_PATH), { recursive: true }); // Clone or update the dataset repo if (fs.existsSync(CLONE_DIR)) { run(`git -C "${CLONE_DIR}" pull --depth=1 origin main`); } else { run(`git clone --depth=1 "${gitUrl()}" "${CLONE_DIR}"`); } const srcDb = path.join(CLONE_DIR, "raqim.db"); if (fs.existsSync(srcDb)) { fs.copyFileSync(srcDb, DB_PATH); // Remove any stale WAL/SHM files — they belong to a different DB instance. for (const suffix of ["-wal", "-shm"]) { const stale = DB_PATH + suffix; if (fs.existsSync(stale)) { fs.rmSync(stale); console.log(`[db-sync] Removed stale ${suffix} file.`); } } console.log("[db-sync] ✓ DB restored from HF Dataset (first boot)."); } else { console.log("[db-sync] No existing DB in dataset — starting fresh."); } } export async function pushDb() { if (!HF_TOKEN || !HF_USERNAME) return; if (!fs.existsSync(DB_PATH)) return; // Ensure repo is cloned if (!fs.existsSync(CLONE_DIR)) { await ensureDataset(); run(`git clone --depth=1 "${gitUrl()}" "${CLONE_DIR}"`); } // Checkpoint WAL into the main DB file before copying so the backup // is a self-contained, consistent snapshot without a dangling WAL. try { const tmpDb = new DatabaseSync(DB_PATH); tmpDb.exec("PRAGMA wal_checkpoint(TRUNCATE)"); tmpDb.close(); } catch (e) { console.warn("[db-sync] WAL checkpoint failed (non-fatal):", e.message); } fs.copyFileSync(DB_PATH, path.join(CLONE_DIR, "raqim.db")); run(`git -C "${CLONE_DIR}" config user.email "sync@raqim.app"`); run(`git -C "${CLONE_DIR}" config user.name "RAQIM Sync"`); run(`git -C "${CLONE_DIR}" add raqim.db`); const committed = run(`git -C "${CLONE_DIR}" commit -m "DB sync $(date '+%Y-%m-%d %H:%M')"`); if (committed) { const pushed = run(`git -C "${CLONE_DIR}" push origin HEAD:main`); if (pushed) console.log("[db-sync] ✓ DB saved to HF Dataset."); else console.warn("[db-sync] ✗ Push failed — will retry next interval."); } } // ── Called standalone: node scripts/db-hf-sync.mjs pull|push ──────────── const action = process.argv[2]; if (action === "pull") { await pullDb(); } else if (action === "push") { await pushDb(); }