rag-kb-system / scripts /export-hf-dataset.ts
duqing2026's picture
同步 hf
9ed89c8
import Database from 'better-sqlite3';
import fs from 'fs';
import path from 'path';
/**
 * One row of the `documents` table as selected by the export query in `main`.
 * Only `id`, `title` and `slug` are required; every other column may be absent,
 * and most may be `null`.
 */
interface DocRow {
  id: string;
  // NOTE(review): presumably the numeric id of the document on Yuque; confirm against the sync code.
  yuque_id?: number;
  title: string;
  /** Used (after sanitising) as the base name of the exported `.md` file. */
  slug: string;
  url?: string | null;
  /** Used as the sub-directory under `files/`; falsy values are exported under `UNKNOWN`. */
  namespace?: string | null;
  /** Text written as the body of the exported `.md` file; left out of the metadata index. */
  content_preview?: string | null;
  word_count?: number | null;
  // NOTE(review): timestamps are plain numbers; the unit (seconds vs. milliseconds) is not visible here.
  updated_at?: number | null;
  created_at?: number | null;
  // NOTE(review): stored as a single string; the encoding (JSON, comma-separated, ...) is not visible here.
  tags?: string | null;
  sort_order?: number | null;
}
/**
 * One row of the `knowledge_bases` table as selected by the export query in
 * `main`; rows are written unchanged to `metadata/knowledge_bases.json`.
 */
interface KbRow {
  namespace: string;
  name: string;
  description?: string | null;
  /** Sort key of the export query (newest first). */
  synced_at: number;
  // NOTE(review): presumably a pagination cursor kept by the sync job; not interpreted by this script.
  last_offset?: number | null;
}
/**
 * Turns a document slug into a string usable as a single file name component:
 * every `/` and `\` becomes `_`, and every run of whitespace becomes one `-`.
 * All other characters are kept as they are.
 */
function safeSlug(s: string) {
  // Path separators first, so a slug can never introduce extra directory levels.
  const withoutSeparators = s.replace(/[\\/]/g, '_');
  const withoutWhitespace = withoutSeparators.replace(/\s+/g, '-');
  return withoutWhitespace;
}
/**
 * Creates directory `p`, including any missing parent directories, unless
 * something already exists at that path (in which case nothing is done).
 */
function ensureDir(p: string) {
  const alreadyPresent = fs.existsSync(p);
  if (alreadyPresent) {
    return;
  }
  fs.mkdirSync(p, { recursive: true });
}
/**
 * Serialises `data` as JSON with two-space indentation and writes it to
 * `filePath` as UTF-8, replacing any existing file.
 */
function writeJson(filePath: string, data: unknown) {
  const serialized = JSON.stringify(data, null, 2);
  fs.writeFileSync(filePath, serialized, { encoding: 'utf8' });
}
/**
 * Exports the contents of `rag-kb.db` (resolved against the current working
 * directory) into the sibling directory `../hf_dataset_rag-kb-system`:
 *
 * - `files/<namespace>/<slug>.md`: the `content_preview` of each document
 *   (an empty file when the preview is missing);
 * - `metadata/documents.json`: per-document metadata without the preview text;
 * - `metadata/knowledge_bases.json`: every row of `knowledge_bases`;
 * - `dataset_summary.json`: generation time, directory names and namespaces.
 *
 * The namespace is used as a relative path as-is, so a namespace containing
 * `/` produces nested directories. Documents without a namespace go under
 * `UNKNOWN`.
 *
 * The returned promise rejects if the database file does not exist, a query
 * fails, or any file cannot be written.
 */
async function main() {
  const cwd = process.cwd();
  const dbPath = path.join(cwd, 'rag-kb.db');
  const outDir = path.join(cwd, '..', 'hf_dataset_rag-kb-system');
  const filesDir = path.join(outDir, 'files');
  const metaDir = path.join(outDir, 'metadata');

  // Read everything from the database before touching the output directory,
  // so a missing or broken database leaves no half-created export behind.
  // `fileMustExist` stops the driver from silently creating an empty database
  // when the path is wrong; `readonly` guarantees the export cannot modify it.
  const db = new Database(dbPath, { readonly: true, fileMustExist: true });
  let docs: DocRow[];
  let kbs: KbRow[];
  try {
    docs = db
      .prepare(
        `SELECT id, yuque_id, title, slug, url, namespace, content_preview, word_count, updated_at, created_at, tags, sort_order
         FROM documents
         ORDER BY namespace ASC, sort_order ASC, synced_at DESC`
      )
      .all() as DocRow[];
    kbs = db
      .prepare(
        `SELECT namespace, name, description, synced_at, last_offset
         FROM knowledge_bases
         ORDER BY synced_at DESC`
      )
      .all() as KbRow[];
  } finally {
    // Always release the database handle, even when a query throws.
    db.close();
  }

  ensureDir(outDir);
  ensureDir(filesDir);
  ensureDir(metaDir);

  // Metadata index: every selected column except the (potentially large) preview text.
  const index: Array<Omit<DocRow, 'content_preview'>> = [];
  for (const d of docs) {
    // `||` on purpose: an empty-string namespace is treated like a missing one.
    const ns = d.namespace || 'UNKNOWN';
    const nsDir = path.join(filesDir, ns);
    ensureDir(nsDir);

    const slug = safeSlug(d.slug);
    const filePath = path.join(nsDir, `${slug}.md`);
    const content = d.content_preview || '';
    fs.writeFileSync(filePath, content, 'utf8');

    // The index keeps the raw slug and namespace, not the sanitised/defaulted ones.
    index.push({
      id: d.id,
      yuque_id: d.yuque_id,
      title: d.title,
      slug: d.slug,
      url: d.url,
      namespace: d.namespace,
      word_count: d.word_count,
      updated_at: d.updated_at,
      created_at: d.created_at,
      tags: d.tags,
      sort_order: d.sort_order,
    });
  }

  writeJson(path.join(metaDir, 'documents.json'), {
    count: index.length,
    documents: index,
  });
  writeJson(path.join(metaDir, 'knowledge_bases.json'), {
    count: kbs.length,
    knowledge_bases: kbs,
  });

  const summary = {
    generated_at: new Date().toISOString(),
    files_dir: 'files',
    metadata_dir: 'metadata',
    // Distinct namespaces in first-seen order, with the same `UNKNOWN` fallback as the file layout.
    namespaces: Array.from(new Set(index.map((d) => d.namespace || 'UNKNOWN'))),
  };
  writeJson(path.join(outDir, 'dataset_summary.json'), summary);

  console.log(`Exported ${index.length} documents to: ${outDir}`);
}
// Script entry point: run the export; on any failure print the error and
// exit with status 1 so callers (shell, CI) see that the export failed.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});