| | from langchain_community.vectorstores import FAISS |
| | from langchain_core.documents import Document |
| | from embeddings import EmbeddingsManager |
| | from typing import List |
| | from pathlib import Path |
| | import json |
| | |
| |
|
| | from data_loader import DataLoader |
| |
|
| | class VectorStoreManager: |
| | def __init__(self): |
| | self.embeddings_manager = EmbeddingsManager() |
| | self.embeddings = self.embeddings_manager.get_embeddings() |
| | self.vector_store = None |
| | |
| | current_dir = Path(__file__).parent.parent |
| | self.cache_dir = current_dir / "faiss_cache" |
| | self.cache_dir.mkdir(parents=True, exist_ok=True) |
| |
|
| | self.load_faiss_cache() |
| |
|
| | def load_faiss_cache(self): |
| | """LOAD FAISS 0.1s""" |
| | index_path = self.cache_dir / "faiss_index" |
| | if index_path.exists(): |
| | print(" LOADING FAISS CACHE...") |
| | self.vector_store = FAISS.load_local( |
| | str(index_path), |
| | self.embeddings, |
| | allow_dangerous_deserialization=True |
| | ) |
| | print(f"LOADED {len(self.vector_store.docstore._dict)} chunks!") |
| | return True |
| | return False |
| |
|
| | def build_and_cache(self, docs): |
| | """EMBED + SAVE""" |
| | |
| | if self.vector_store is not None: |
| | return |
| | |
| | print(f" Embedding {len(docs)} chunks...") |
| | self.vector_store = FAISS.from_documents(docs, self.embeddings) |
| | |
| | |
| | index_path = self.cache_dir / "faiss_index" |
| | self.vector_store.save_local(str(index_path)) |
| | print(f"SAVED FAISS: {index_path}") |
| |
|
| |
|
| | |
| | def build_from_docs(self, docs: List[Document]): |
| | """Xây dựng FAISS từ list chunks""" |
| |
|
| | """ |
| | self.vector_store CHÍNH LÀ nơi lưu trữ: |
| | |
| | + Vector embedding (dạng số) của từng Document |
| | + Nội dung gốc (page_content) |
| | + Metadata (source_file, chunk_id, title, …) |
| | + Chỉ mục FAISS để tìm kiếm nhanh theo độ giống (cosine / L2) |
| | """ |
| |
|
| | self.vector_store = FAISS.from_documents(docs, self.embeddings) |
| | print(f"FAISS built with {len(docs)} medical chunks indexed!") |
| | |
| | def get_retriever(self, k: int = 3): |
| | """Trả về retriever với k docs""" |
| | if self.vector_store is None: |
| | raise ValueError("Vector store not built. Call build_from_docs first.") |
| | return self.vector_store.as_retriever(search_kwargs={"k": k}) |
| | |
| | def save_documents(self, docs): |
| | from pymongo import MongoClient |
| | output_dir = Path(__file__).parent.parent / "store" |
| | output_dir.mkdir(parents=True, exist_ok=True) |
| |
|
| | records = [] |
| | for i, doc in enumerate(docs): |
| | records.append({ |
| | "doc_id": f"doc_{i}", |
| | "page_content": doc.page_content, |
| | "metadata": doc.metadata |
| | }) |
| |
|
| | |
| | with open(output_dir / "documents.json", "w", encoding="utf-8") as f: |
| | json.dump(records, f, ensure_ascii=False, indent=2) |
| |
|
| | |
| | client = MongoClient("mongodb://localhost:27017/") |
| | col = client["medchat"]["store"] |
| | col.delete_many({}) |
| | col.insert_many(records) |
| | client.close() |
| |
|
| | print(f"Saved {len(records)} documents") |
| | |
| | def save_embeddings(self, docs): |
| |
|
| | texts = [doc.page_content for doc in docs] |
| | vectors = self.embeddings.embed_documents(texts) |
| |
|
| | output_dir = Path(r"D:\Storage\rag_project\store") |
| |
|
| | records = [] |
| | for i, vec in enumerate(vectors): |
| | records.append({ |
| | "doc_id": f"doc_{i}", |
| | "embedding": vec |
| | }) |
| |
|
| | |
| | with open(output_dir / "embeddings.json", "w", encoding="utf-8") as f: |
| | json.dump(records, f) |
| |
|
| | |
| | client = MongoClient("mongodb://localhost:27017/") |
| | col = client["medchat"]["embeddings"] |
| | col.delete_many({}) |
| | col.insert_many(records) |
| | client.close() |
| |
|
| | print(f" Saved {len(records)} embeddings") |
| |
|
| |
|
| | |
| | if __name__ == "__main__": |
| | print("TEST VECTOR STORE...") |
| |
|
| | |
| | docs = [ |
| | Document( |
| | page_content="Sốt ở trẻ em là tình trạng thân nhiệt tăng trên 38 độ.", |
| | metadata={"title": "Sốt trẻ em"} |
| | ), |
| | Document( |
| | page_content="Tiêu chảy cấp ở trẻ thường do virus hoặc vi khuẩn.", |
| | metadata={"title": "Tiêu chảy"} |
| | ), |
| | Document( |
| | page_content="Gãy xương tay cần được cố định và đưa đến cơ sở y tế.", |
| | metadata={"title": "Chấn thương"} |
| | ), |
| | ] |
| |
|
| | vs = VectorStoreManager() |
| | vs.build_from_docs(docs) |
| | print(f"Đã build xong docs") |
| |
|
| | retriever = vs.get_retriever(k=2) |
| | results = retriever.get_relevant_documents("vi khuẩn") |
| |
|
| | print("\n KẾT QUẢ TRUY XUẤT:") |
| | for i, doc in enumerate(results): |
| | print(f"{i+1}. {doc.metadata['title']} | {doc.page_content[:60]}...") |
| |
|
| | print("\n VECTOR STORE OK!") |
| |
|
| | all_docs = DataLoader.load_all_chunks() |
| |
|
| | vs.build_from_docs(docs) |
| | |
| | print(f"Đã build xong all_docs") |
| | vs.save_documents(all_docs) |
| | vs.save_embeddings(all_docs) |
| |
|
| | print("\n HOÀN THÀNH LƯU DOCUMENT VÀ EMBEDING HOÀN TẤT") |