Alexend committed on
Commit
c7dd5c2
·
verified ·
1 Parent(s): 9d3d109

Create build_vector_store.py

Browse files
Files changed (1) hide show
  1. build_vector_store.py +44 -0
build_vector_store.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# ✅ build_vector_store.py
# Split web_data.txt into paragraphs, embed each paragraph, and build a
# FAISS retrieval index alongside a JSON file holding the paragraph texts.

import os
import json
import re

import faiss
from sentence_transformers import SentenceTransformer

# ✅ Configuration
TEXT_FILE = "web_data.txt"            # input: raw text, paragraphs separated by blank lines
VECTOR_FILE = "faiss_index.faiss"     # output: serialized FAISS index
DOCS_FILE = "docs.json"               # output: paragraph texts, in index order
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# ✅ Read web_data.txt and split it into paragraphs
print("📖 載入並分段 web_data.txt...")
with open(TEXT_FILE, "r", encoding="utf-8") as f:
    content = f.read()

# Split on blank lines. A "blank" line may still contain stray spaces or
# tabs, so match any whitespace between the two newlines instead of the
# literal "\n\n"; otherwise such paragraphs would be silently merged.
docs = [chunk.strip() for chunk in re.split(r"\n\s*\n", content) if chunk.strip()]

# Fail early with a clear message: with no paragraphs there is nothing to
# embed, and the dimension lookup below would raise an opaque IndexError
# only after the (slow) model load.
if not docs:
    raise ValueError(f"No non-empty paragraphs found in {TEXT_FILE!r}; nothing to index.")

# ✅ Load the embedding model
print("🔤 載入嵌入模型...")
model = SentenceTransformer(MODEL_NAME)

print("🔍 轉換成嵌入向量...")
embeddings = model.encode(docs, show_progress_bar=True)

# ✅ Build the FAISS index
print("🧠 建立 FAISS 索引...")
# The vector size is taken from the first embedding (docs is non-empty here).
dimension = embeddings[0].shape[0]
index = faiss.IndexFlatL2(dimension)
# Vectors are added in the same order as `docs`, so index row i maps to docs[i].
index.add(embeddings)

# ✅ Persist the index and the matching paragraphs
faiss.write_index(index, VECTOR_FILE)
with open(DOCS_FILE, "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII text readable in the JSON file.
    json.dump(docs, f, ensure_ascii=False, indent=2)

print("✅ 向量資料庫建立完成:")
print(f" - 向量檔:{VECTOR_FILE}")
print(f" - 文件對應檔:{DOCS_FILE}")
print(f" - 總段落數:{len(docs)}")