rag-kb-system / scripts /yuque-ingest.ts
duqing2026's picture
对接语雀 token,同步所有文档,并同样布局展示页面和交互功能,并未实现对话知识库功能
2c10495
import { Document } from "@langchain/core/documents";
import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
import { HNSWLib } from "@langchain/community/vectorstores/hnswlib";
import * as dotenv from "dotenv";
import path from "path";
import fs from "fs";
import { HttpsProxyAgent } from "https-proxy-agent";
import nodeFetch from "node-fetch";
// Import getEmbeddings from vector-store
import { getEmbeddings } from "../src/lib/vector-store";
// Load environment variables: first from .env.local, then from the default .env file.
dotenv.config({ path: ".env.local" });
dotenv.config();

// Proxy support (optional): when HTTPS_PROXY is set, the global fetch is
// replaced by node-fetch with every request routed through a proxy agent.
const proxyUrl = process.env.HTTPS_PROXY;
if (proxyUrl) {
  const proxyAgent = new HttpsProxyAgent(proxyUrl);
  (global as any).fetch = (url: any, init: any) =>
    nodeFetch(url, { ...init, agent: proxyAgent }) as any;
}

// Directory (relative to the working directory) where the vector index is persisted.
const VECTOR_STORE_PATH = path.join(process.cwd(), "vector_store");
// Root of the Yuque REST API; endpoints are appended to this prefix.
const YUQUE_BASE_URL = "https://www.yuque.com/api/v2";
/**
 * Shape of a Yuque document record as consumed by this script.
 * `id` is used to request the document detail; `slug` and `title` are copied
 * into the metadata of the resulting document.
 */
interface YuqueDoc {
id: number;
slug: string;
title: string;
body?: string; // only present in the detail endpoint response
}
/**
 * Concurrency-limiting helper: applies `iteratorFn` to every element of
 * `array`, keeping at most `poolLimit` invocations in flight at a time.
 *
 * @param poolLimit  Maximum number of concurrently running invocations. When it
 *                   exceeds `array.length`, every item is started immediately.
 * @param array      Items to process.
 * @param iteratorFn Async worker called with the current item and the whole array.
 * @returns The worker results, in the same order as `array`.
 * @throws Rejects with the worker's error if an invocation rejects; items that
 *         have not been started when the rejection is observed are never started.
 */
async function asyncPool<T, R>(
  poolLimit: number,
  array: T[],
  iteratorFn: (item: T, array: T[]) => Promise<R>,
): Promise<R[]> {
  const results: Promise<R>[] = [];
  // Trackers for in-flight tasks; each one removes itself when its task fulfils.
  const executing = new Set<Promise<void>>();

  for (const item of array) {
    // Defer the call by one microtask so a synchronous throw inside iteratorFn
    // becomes a rejection instead of escaping the loop.
    const task = Promise.resolve().then(() => iteratorFn(item, array));
    results.push(task);

    // Throttling is only needed when there are at least as many items as slots.
    if (poolLimit <= array.length) {
      const tracker: Promise<void> = task.then(() => {
        executing.delete(tracker);
      });
      executing.add(tracker);
      if (executing.size >= poolLimit) {
        // Wait for a free slot (or surface the first failure).
        await Promise.race(executing);
      }
    }
  }

  return Promise.all(results);
}
/**
 * Minimal Yuque data loader for a single repository (`namespace`).
 * It lists the repository's documents and fetches individual document bodies
 * through the Yuque REST API, authenticating with the given token.
 */
class SimpleYuqueLoader {
  // Number of documents requested per list call.
  // NOTE(review): assumes the list endpoint accepts `offset`/`limit` with a
  // maximum page size of 100 — confirm against the Yuque API documentation.
  private static readonly LIST_PAGE_SIZE = 100;

  constructor(private token: string, private namespace: string) {}

  /**
   * Performs an authenticated GET against `YUQUE_BASE_URL + endpoint`.
   *
   * @param endpoint Path (and optional query string) starting with "/".
   * @returns The parsed JSON response body.
   * @throws Error when the HTTP status is not OK; the message includes the
   *         status code, status text and the requested URL.
   */
  private async fetchAPI(endpoint: string) {
    const url = `${YUQUE_BASE_URL}${endpoint}`;
    const response = await fetch(url, {
      headers: {
        "X-Auth-Token": this.token,
        "User-Agent": "rag-kb-system-demo",
        "Content-Type": "application/json",
      },
    });
    if (!response.ok) {
      throw new Error(`Yuque API Error: ${response.status} ${response.statusText} - ${url}`);
    }
    return await response.json();
  }

  /**
   * Fetches the metadata list of all documents in the repository, one page at
   * a time.
   *
   * Never throws: on a request failure the error is logged and whatever was
   * collected so far is returned (an empty array if the first page failed).
   *
   * @returns Document metadata records, de-duplicated by `id`.
   */
  async loadDocList(): Promise<YuqueDoc[]> {
    console.log(`Fetching docs list from namespace: ${this.namespace}...`);
    const pageSize = SimpleYuqueLoader.LIST_PAGE_SIZE;
    const collected: YuqueDoc[] = [];
    const seenIds = new Set<number>();
    try {
      for (let offset = 0; ; offset += pageSize) {
        const listData = await this.fetchAPI(
          `/repos/${this.namespace}/docs?offset=${offset}&limit=${pageSize}`,
        );
        // A response without a `data` array is treated as an empty page rather
        // than being returned as-is (which would hand `undefined` to callers).
        const page: YuqueDoc[] = Array.isArray(listData?.data) ? listData.data : [];

        let added = 0;
        for (const doc of page) {
          if (!seenIds.has(doc.id)) {
            seenIds.add(doc.id);
            collected.push(doc);
            added++;
          }
        }

        // Stop on a short page (last page) or when a page brings nothing new;
        // the latter prevents an endless loop if the server ignores `offset`.
        if (page.length < pageSize || added === 0) break;
      }
    } catch (e) {
      console.error(`Failed to fetch doc list for ${this.namespace}:`, e);
    }
    return collected;
  }

  /**
   * Fetches the full detail of one document and wraps its body in a `Document`.
   *
   * @param docInfo Metadata record of the document to fetch (looked up by `id`).
   * @returns A `Document` whose content is the document body, or `null` when
   *          the body is empty/missing or the request failed (failure is logged).
   */
  async fetchDocDetail(docInfo: YuqueDoc): Promise<Document | null> {
    try {
      const detailData = await this.fetchAPI(`/repos/${this.namespace}/docs/${docInfo.id}`);
      const docDetail = detailData.data;
      const content = docDetail.body || "";
      if (content) {
        return new Document({
          pageContent: content,
          metadata: {
            source: `yuque://${this.namespace}/${docInfo.slug}`,
            title: docInfo.title,
            id: docInfo.id,
            yuque_slug: docInfo.slug,
          },
        });
      }
    } catch (e) {
      console.error(`Failed to fetch doc ${docInfo.title}:`, e);
    }
    return null;
  }
}
/**
 * Script entry point: reads YUQUE_TOKEN and YUQUE_NAMESPACE (comma-separated
 * list of repositories) from the environment, downloads every document of each
 * repository, splits the contents into chunks, and appends them to the HNSW
 * vector store under VECTOR_STORE_PATH.
 *
 * Exits the process with code 1 when the required environment variables are
 * missing. Documents whose body is empty or whose download failed are skipped.
 */
const run = async () => {
  const token = process.env.YUQUE_TOKEN;
  const namespacesEnv = process.env.YUQUE_NAMESPACE;
  if (!token || !namespacesEnv) {
    console.error("Error: Please set YUQUE_TOKEN and YUQUE_NAMESPACE in .env.local");
    process.exit(1);
  }
  const namespaces = namespacesEnv.split(",").map((s) => s.trim()).filter(Boolean);

  // Initialise the embedding model and the text splitter.
  const embeddings = getEmbeddings();
  const splitter = new RecursiveCharacterTextSplitter({
    chunkSize: 1000,
    chunkOverlap: 200,
  });

  // 1. Collect the document list of every repository.
  const allDocInfos: { namespace: string; doc: YuqueDoc }[] = [];
  for (const ns of namespaces) {
    const loader = new SimpleYuqueLoader(token, ns);
    const docs = await loader.loadDocList();
    console.log(`Found ${docs.length} docs in ${ns}`);
    for (const doc of docs) {
      allDocInfos.push({ namespace: ns, doc });
    }
  }
  const totalDocs = allDocInfos.length;
  console.log(`\nTotal documents to process: ${totalDocs}`);
  if (totalDocs === 0) return;

  // 2. Batch processing.
  const BATCH_SIZE = 50; // documents per batch
  const CONCURRENCY = 10; // concurrent detail requests within a batch
  const totalBatches = Math.ceil(totalDocs / BATCH_SIZE);

  // The store is loaded (or created) once, on the first batch that has
  // content, and then kept in memory: it already contains every previously
  // added batch, so re-reading the index from disk each time is unnecessary.
  let vectorStore: HNSWLib | undefined;
  let skippedDocs = 0;

  for (let i = 0; i < totalDocs; i += BATCH_SIZE) {
    const batchInfos = allDocInfos.slice(i, i + BATCH_SIZE);
    const currentBatchNum = Math.floor(i / BATCH_SIZE) + 1;
    console.log(`\nProcessing Batch ${currentBatchNum}/${totalBatches} (${batchInfos.length} docs)...`);

    // 2.1 Fetch document details concurrently. Using the pool's ordered result
    // keeps documents in list order regardless of which request finishes first.
    const fetched: (Document | null)[] = await asyncPool(CONCURRENCY, batchInfos, async (item) => {
      const loader = new SimpleYuqueLoader(token, item.namespace);
      const doc = await loader.fetchDocDetail(item.doc);
      // Simple progress indicator.
      process.stdout.write(".");
      return doc;
    });
    process.stdout.write("\n"); // end the progress line

    const docs = fetched.filter((d): d is Document => d !== null);
    skippedDocs += batchInfos.length - docs.length;
    if (docs.length === 0) continue;

    // 2.2 Split
    const chunks = await splitter.splitDocuments(docs);
    console.log(` > Generated ${chunks.length} chunks.`);

    // 2.3 Embed & store (append mode)
    if (vectorStore) {
      await vectorStore.addDocuments(chunks);
    } else if (fs.existsSync(path.join(VECTOR_STORE_PATH, "hnswlib.index"))) {
      // An index from an earlier run exists: append to it.
      vectorStore = await HNSWLib.load(VECTOR_STORE_PATH, embeddings);
      await vectorStore.addDocuments(chunks);
    } else {
      vectorStore = await HNSWLib.fromDocuments(chunks, embeddings);
    }
    // Persist after every batch so an interrupted run keeps completed batches.
    await vectorStore.save(VECTOR_STORE_PATH);
    console.log(` > Saved batch to vector store.`);
  }

  console.log(`\nIngestion complete! All ${totalDocs} documents processed.`);
  if (skippedDocs > 0) {
    console.warn(`${skippedDocs} documents were skipped (empty body or fetch failure).`);
  }
};

run().catch((err) => {
  console.error(err);
  // Report failure to the caller (shell, CI) instead of exiting with code 0.
  process.exitCode = 1;
});