# chatbot1/vectorstore.py — commit 6548bf5 (author: Nguyen5)
"""
BƯỚC 4: VECTORSTORE (FAISS in-memory)
-------------------------------------
Tạo FAISS index từ các CHUNK văn bản.
- Không ghi file .faiss nào, tất cả nằm trong RAM.
- Embeddings được lấy từ get_embeddings() (Bước 3).
"""
from langchain_community.vectorstores import FAISS
from embeddings import get_embeddings
def build_vectorstore(chunks):
    """
    Build an in-memory FAISS vector store from pre-split document chunks.

    Parameters
    ----------
    chunks : list[Document]
        Documents that have already been split (output of Step 2).

    Returns
    -------
    FAISS
        An in-memory FAISS vector store; nothing is written to disk.

    Raises
    ------
    ValueError
        If ``chunks`` is empty. Without this guard, FAISS.from_documents
        fails later with an opaque low-level error.
    """
    if not chunks:
        raise ValueError("build_vectorstore() received no chunks; nothing to index.")
    print(">>> Initialising embedding model for FAISS index ...")
    # Embedding model comes from Step 3 (embeddings.get_embeddings).
    embeddings = get_embeddings()
    print(f">>> Building FAISS index from {len(chunks)} chunks ...")
    vs = FAISS.from_documents(chunks, embeddings)
    print(">>> FAISS index built.\n")
    return vs
if __name__ == "__main__":
    # Smoke test for the whole pipeline:
    # load -> split -> FAISS index -> similarity_search
    from load_documents import load_documents
    from split_documents import split_documents

    print("=== TEST: load_documents -> split_documents -> FAISS.similarity_search ===\n")

    # 1) Load source documents (PDF + HTML) from HuggingFace
    docs = load_documents()
    print(f"Loaded {len(docs)} raw documents.")

    # 2) Split into chunks
    chunks = split_documents(docs)
    print(f"Split into {len(chunks)} chunks.\n")

    # 3) Build the in-memory FAISS vector store
    vectorstore = build_vectorstore(chunks)

    # 4) Exercise similarity_search with a sample (German) query
    query = "Fristen für die Prüfungsanmeldung im Bachelorstudium"
    print("Test query:")
    print(" ", query, "\n")
    results = vectorstore.similarity_search(query, k=3)

    print("Top-3 ähnliche Chunks aus dem VectorStore:")
    for i, doc in enumerate(results, start=1):
        print(f"\n=== RESULT {i} ===")
        # Truncate long chunks so the console output stays readable.
        print(doc.page_content[:400], "...")
        print("Metadata:", doc.metadata)