// Yuque → HNSWLib ingestion script: fetches docs in batches, splits, embeds, and appends to an on-disk vector store.
| import { Document } from "@langchain/core/documents"; | |
| import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters"; | |
| import { HNSWLib } from "@langchain/community/vectorstores/hnswlib"; | |
| import * as dotenv from "dotenv"; | |
| import path from "path"; | |
| import fs from "fs"; | |
| import { HttpsProxyAgent } from "https-proxy-agent"; | |
| import nodeFetch from "node-fetch"; | |
| // Import getEmbeddings from vector-store | |
| import { getEmbeddings } from "../src/lib/vector-store"; | |
// Load environment variables: .env.local first, then .env as a fallback
// (dotenv does not overwrite keys that are already set, so .env.local wins).
dotenv.config({ path: ".env.local" });
dotenv.config();
// Optional proxy support: when HTTPS_PROXY is set, replace the global
// fetch with a node-fetch wrapper routed through the proxy agent, so all
// Yuque/embedding requests below go through the proxy.
// NOTE(review): the `as any` casts paper over the signature mismatch
// between node-fetch and the global fetch type — confirm this is intended.
if (process.env.HTTPS_PROXY) {
  const agent = new HttpsProxyAgent(process.env.HTTPS_PROXY);
  (global as any).fetch = (url: any, init: any) => {
    return nodeFetch(url, { ...init, agent }) as any;
  };
}
// On-disk directory of the HNSWLib index; presumably the same directory
// the app's vector store reads — verify against src/lib/vector-store.
const VECTOR_STORE_PATH = path.join(process.cwd(), "vector_store");
// Yuque Open API v2 base URL.
const YUQUE_BASE_URL = "https://www.yuque.com/api/v2";
// Shape of a Yuque document record as consumed by this script.
interface YuqueDoc {
  id: number;
  slug: string; // URL slug within the repo; used to build the source URI
  title: string;
  body?: string; // only the single-doc detail endpoint returns this
}
| // 并发控制工具函数 | |
| async function asyncPool(poolLimit: number, array: any[], iteratorFn: (item: any, array: any[]) => Promise<any>) { | |
| const ret = []; | |
| const executing: Promise<any>[] = []; | |
| for (const item of array) { | |
| const p = Promise.resolve().then(() => iteratorFn(item, array)); | |
| ret.push(p); | |
| if (poolLimit <= array.length) { | |
| const e: Promise<void> = p.then(() => { | |
| executing.splice(executing.indexOf(e), 1); | |
| }); | |
| executing.push(e); | |
| if (executing.length >= poolLimit) { | |
| await Promise.race(executing); | |
| } | |
| } | |
| } | |
| return Promise.all(ret); | |
| } | |
| // 简单的语雀数据加载器 | |
| class SimpleYuqueLoader { | |
| constructor(private token: string, private namespace: string) {} | |
| private async fetchAPI(endpoint: string) { | |
| const url = `${YUQUE_BASE_URL}${endpoint}`; | |
| const response = await fetch(url, { | |
| headers: { | |
| "X-Auth-Token": this.token, | |
| "User-Agent": "rag-kb-system-demo", | |
| "Content-Type": "application/json", | |
| }, | |
| }); | |
| if (!response.ok) { | |
| throw new Error(`Yuque API Error: ${response.status} ${response.statusText} - ${url}`); | |
| } | |
| return await response.json(); | |
| } | |
| // 获取所有文档元数据列表 | |
| async loadDocList(): Promise<YuqueDoc[]> { | |
| console.log(`Fetching docs list from namespace: ${this.namespace}...`); | |
| try { | |
| const listData = await this.fetchAPI(`/repos/${this.namespace}/docs`); | |
| return listData.data; | |
| } catch (e) { | |
| console.error(`Failed to fetch doc list for ${this.namespace}:`, e); | |
| return []; | |
| } | |
| } | |
| // 获取单个文档详情 | |
| async fetchDocDetail(docInfo: YuqueDoc): Promise<Document | null> { | |
| try { | |
| const detailData = await this.fetchAPI(`/repos/${this.namespace}/docs/${docInfo.id}`); | |
| const docDetail = detailData.data; | |
| const content = docDetail.body || ""; | |
| if (content) { | |
| return new Document({ | |
| pageContent: content, | |
| metadata: { | |
| source: `yuque://${this.namespace}/${docInfo.slug}`, | |
| title: docInfo.title, | |
| id: docInfo.id, | |
| yuque_slug: docInfo.slug | |
| } | |
| }); | |
| } | |
| } catch (e) { | |
| console.error(`Failed to fetch doc ${docInfo.title}:`, e); | |
| } | |
| return null; | |
| } | |
| } | |
/**
 * Entry point: pulls every document from the configured Yuque namespaces,
 * splits them into chunks, embeds them, and appends the vectors to an
 * HNSWLib index on disk — processed in batches so partial progress is
 * persisted and memory use stays bounded.
 *
 * Exits with code 1 if YUQUE_TOKEN or YUQUE_NAMESPACE is missing.
 */
const run = async () => {
  const token = process.env.YUQUE_TOKEN;
  const namespacesEnv = process.env.YUQUE_NAMESPACE;
  if (!token || !namespacesEnv) {
    console.error("Error: Please set YUQUE_TOKEN and YUQUE_NAMESPACE in .env.local");
    process.exit(1);
  }
  // YUQUE_NAMESPACE may hold several comma-separated "owner/repo" entries.
  const namespaces = namespacesEnv.split(",").map(s => s.trim()).filter(Boolean);
  // Initialize embeddings (provided by src/lib/vector-store — presumably
  // the same provider used at query time; verify there).
  const embeddings = getEmbeddings();
  const splitter = new RecursiveCharacterTextSplitter({
    chunkSize: 1000,
    chunkOverlap: 200,
  });
  // 1. Collect the doc metadata list from every namespace.
  let allDocInfos: { namespace: string, doc: YuqueDoc }[] = [];
  for (const ns of namespaces) {
    const loader = new SimpleYuqueLoader(token, ns);
    const docs = await loader.loadDocList();
    console.log(`Found ${docs.length} docs in ${ns}`);
    allDocInfos = allDocInfos.concat(docs.map(d => ({ namespace: ns, doc: d })));
  }
  const totalDocs = allDocInfos.length;
  console.log(`\nTotal documents to process: ${totalDocs}`);
  if (totalDocs === 0) return;
  // 2. Batch processing: each batch is fully fetched, split, embedded and
  //    saved before the next one starts.
  const BATCH_SIZE = 50; // documents per batch
  const CONCURRENCY = 10; // concurrent detail requests within a batch
  for (let i = 0; i < totalDocs; i += BATCH_SIZE) {
    const batchInfos = allDocInfos.slice(i, i + BATCH_SIZE);
    const currentBatchNum = Math.floor(i / BATCH_SIZE) + 1;
    const totalBatches = Math.ceil(totalDocs / BATCH_SIZE);
    console.log(`\nProcessing Batch ${currentBatchNum}/${totalBatches} (${batchInfos.length} docs)...`);
    // 2.1 Fetch document details concurrently (order of `docs` follows
    //     completion order, not input order — acceptable for ingestion).
    const docs: Document[] = [];
    await asyncPool(CONCURRENCY, batchInfos, async (item) => {
      const loader = new SimpleYuqueLoader(token, item.namespace);
      const doc = await loader.fetchDocDetail(item.doc);
      if (doc) docs.push(doc);
      // Simple progress indicator: one dot per processed document.
      process.stdout.write(".");
    });
    process.stdout.write("\n"); // end the progress line
    if (docs.length === 0) continue;
    // 2.2 Split into overlapping chunks for embedding.
    const chunks = await splitter.splitDocuments(docs);
    console.log(` > Generated ${chunks.length} chunks.`);
    // 2.3 Embed & store in append mode.
    let vectorStore: HNSWLib;
    // Reload the latest index on every batch: batches are appended
    // incrementally, so this batch must see what the previous one saved.
    if (fs.existsSync(path.join(VECTOR_STORE_PATH, "hnswlib.index"))) {
      vectorStore = await HNSWLib.load(VECTOR_STORE_PATH, embeddings);
      await vectorStore.addDocuments(chunks);
    } else {
      vectorStore = await HNSWLib.fromDocuments(chunks, embeddings);
    }
    await vectorStore.save(VECTOR_STORE_PATH);
    console.log(` > Saved batch to vector store.`);
    // GC hint: dereferencing large per-batch objects can help in long
    // loops, though Node reclaims them automatically once out of scope.
    // docs = null; chunks = null; vectorStore = null;
  }
  console.log(`\nIngestion complete! All ${totalDocs} documents processed.`);
};
// Kick off the script; any unhandled rejection is logged to stderr.
run().catch(console.error);