# build_vector_store.py
import json

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
# Defaults used when this file is run as a script.
SOURCE_PATH = "web_data.txt"
INDEX_PATH = "vector_store.faiss"
DOCUMENTS_PATH = "documents.json"
# Sentence-embedding model; chosen by the original author for its good
# Chinese-language support.
MODEL_NAME = "shibing624/text2vec-base-chinese"


def load_documents(path: str) -> list:
    """Return the non-blank lines of *path* with surrounding whitespace removed.

    Args:
        path: UTF-8 text file holding the crawled data, one document per line.

    Returns:
        A list of stripped strings in file order; blank and whitespace-only
        lines are dropped. The list is empty when the file has no content.
    """
    with open(path, "r", encoding="utf-8") as f:
        # Iterate the file lazily rather than materialising readlines() first.
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line]


def build_vector_store(
    source_path: str = SOURCE_PATH,
    index_path: str = INDEX_PATH,
    documents_path: str = DOCUMENTS_PATH,
    model_name: str = MODEL_NAME,
) -> int:
    """Embed every document line and persist a FAISS index plus the raw texts.

    Row ``i`` of the written index corresponds to element ``i`` of the JSON
    list written to *documents_path*, so a search hit can be mapped back to
    its text by position.

    Args:
        source_path: Text file to read documents from (one per line).
        index_path: Destination of the serialized FAISS index.
        documents_path: Destination of the JSON list of document strings.
        model_name: Name passed to ``SentenceTransformer``.

    Returns:
        The number of documents that were indexed.

    Raises:
        ValueError: If *source_path* contains no non-blank lines, or if the
            encoder output is not a 2-D matrix.
    """
    documents = load_documents(source_path)
    if not documents:
        # Fail before the (expensive) model load; previously an empty input
        # surfaced later as an opaque IndexError on ``embeddings[0]``.
        raise ValueError(f"no non-blank lines found in {source_path!r}")

    encoder = SentenceTransformer(model_name)
    embeddings = encoder.encode(documents, show_progress_bar=True)

    # FAISS works on C-contiguous float32 matrices; normalise the encoder
    # output so a different dtype or memory layout cannot break index.add().
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"expected a 2-D embedding matrix, got shape {vectors.shape}"
        )

    # Exact (brute-force) L2 index sized to the embedding dimension.
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    faiss.write_index(index, index_path)

    # Keep the original texts next to the index so results can be looked up.
    with open(documents_path, "w", encoding="utf-8") as f:
        json.dump(documents, f, ensure_ascii=False, indent=2)

    return len(documents)


if __name__ == "__main__":
    build_vector_store()
    print("✅ 向量資料庫建立完成")