Alexend committed on
Commit
f0470a1
·
verified ·
1 Parent(s): 2b2cc60

Create build_vector_store.py

Browse files
Files changed (1) hide show
  1. build_vector_store.py +19 -32
build_vector_store.py CHANGED
@@ -1,44 +1,31 @@
1
- # build_vector_store.py
2
- # 將 web_data.txt 自動分段,嵌入向量並建立 FAISS 檢索庫
3
-
4
- import os
5
- import json
6
  import faiss
7
  from sentence_transformers import SentenceTransformer
 
8
 
9
- # ✅ 參數設定
10
- TEXT_FILE = "web_data.txt"
11
- VECTOR_FILE = "faiss_index.faiss"
12
- DOCS_FILE = "docs.json"
13
- MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
14
-
15
- # ✅ 讀取並切段 web_data.txt
16
- print("📖 載入並分段 web_data.txt...")
17
- with open(TEXT_FILE, "r", encoding="utf-8") as f:
18
- content = f.read()
19
 
20
- # 空行為斷點分段(可根據實際資料再微調)
21
- docs = [chunk.strip() for chunk in content.split("\n\n") if chunk.strip()]
22
 
23
- # ✅ 載入嵌入模型
24
- print("🔤 載入嵌入模型...")
25
- model = SentenceTransformer(MODEL_NAME)
26
 
27
- print("🔍 轉換成嵌入向量...")
28
- embeddings = model.encode(docs, show_progress_bar=True)
29
 
30
- # 建立 FAISS 索引
31
- print("🧠 建立 FAISS 索引...")
32
  dimension = embeddings[0].shape[0]
33
  index = faiss.IndexFlatL2(dimension)
34
  index.add(embeddings)
35
 
36
- # 儲存向量與對應段落
37
- faiss.write_index(index, VECTOR_FILE)
38
- with open(DOCS_FILE, "w", encoding="utf-8") as f:
39
- json.dump(docs, f, ensure_ascii=False, indent=2)
 
 
40
 
41
- print("✅ 向量資料庫建立完成")
42
- print(f" - 向量檔:{VECTOR_FILE}")
43
- print(f" - 文件對應檔:{DOCS_FILE}")
44
- print(f" - 總段落數:{len(docs)}")
 
1
+ # build_vector_store.py
 
 
 
 
2
  import faiss
3
  from sentence_transformers import SentenceTransformer
4
+ import json
5
 
6
+ # 載入爬蟲資料
7
+ with open("web_data.txt", "r", encoding="utf-8") as f:
8
+ documents = f.readlines()
 
 
 
 
 
 
 
9
 
10
+ # 去除空行與整理
11
+ documents = [line.strip() for line in documents if line.strip()]
12
 
13
+ # 初始化向量模型(中文支援佳)
14
+ encoder = SentenceTransformer("shibing624/text2vec-base-chinese")
 
15
 
16
+ # 建立向量資料
17
+ embeddings = encoder.encode(documents, show_progress_bar=True)
18
 
19
+ # 建立 FAISS Index
 
20
  dimension = embeddings[0].shape[0]
21
  index = faiss.IndexFlatL2(dimension)
22
  index.add(embeddings)
23
 
24
+ # 儲存
25
+ faiss.write_index(index, "vector_store.faiss")
26
+
27
+ # 儲存原始文件
28
+ with open("documents.json", "w", encoding="utf-8") as f:
29
+ json.dump(documents, f, ensure_ascii=False, indent=2)
30
 
31
+ print("✅ 向量資料庫建立完成")