Stust / build_vector_store.py
Alexend's picture
Create build_vector_store.py
f0470a1 verified
raw
history blame
849 Bytes
# build_vector_store.py
"""Build a FAISS vector store from scraped web text.

Reads newline-delimited documents from ``web_data.txt``, embeds them with a
Chinese-capable sentence-transformer model, writes the index to
``vector_store.faiss`` and the raw documents to ``documents.json``.
"""
import json

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer


def main() -> None:
    """Embed the scraped documents and persist the index plus raw texts."""
    # Load the scraped data, dropping blank lines in a single pass.
    with open("web_data.txt", "r", encoding="utf-8") as f:
        documents = [line.strip() for line in f if line.strip()]

    # Fail early with a clear message instead of an IndexError further down.
    if not documents:
        raise ValueError("web_data.txt contains no non-empty lines")

    # Embedding model with good Chinese support.
    encoder = SentenceTransformer("shibing624/text2vec-base-chinese")
    embeddings = encoder.encode(documents, show_progress_bar=True)

    # FAISS requires a contiguous float32 matrix; cast defensively in case
    # the encoder returns float64 or a list of per-sentence arrays.
    embeddings = np.ascontiguousarray(embeddings, dtype="float32")

    # Exact (brute-force) L2 index over the embedding dimension.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    faiss.write_index(index, "vector_store.faiss")

    # Keep the raw documents alongside the index so search hits can be
    # mapped back to their source text by position.
    with open("documents.json", "w", encoding="utf-8") as f:
        json.dump(documents, f, ensure_ascii=False, indent=2)

    print("✅ 向量資料庫建立完成")


if __name__ == "__main__":
    main()