""" BƯỚC 4: VECTORSTORE (FAISS in-memory) ------------------------------------- Tạo FAISS index từ các CHUNK văn bản. - Không ghi file .faiss nào, tất cả nằm trong RAM. - Embeddings được lấy từ get_embeddings() (Bước 3). """ from langchain_community.vectorstores import FAISS from embeddings import get_embeddings def build_vectorstore(chunks): """ Nhận danh sách Document (đã split) và trả về FAISS VectorStore. """ print(">>> Initialising embedding model for FAISS index ...") embeddings = get_embeddings() print(f">>> Building FAISS index from {len(chunks)} chunks ...") vs = FAISS.from_documents(chunks, embeddings) print(">>> FAISS index built.\n") return vs if __name__ == "__main__": # Test toàn pipeline: load -> split -> FAISS -> similarity_search from load_documents import load_documents from split_documents import split_documents print("=== TEST: load_documents -> split_documents -> FAISS.similarity_search ===\n") # 1) Load tài liệu (PDF + HTML) từ HuggingFace docs = load_documents() # 2) Split thành chunks from pprint import pprint print(f"Loaded {len(docs)} raw documents.") chunks = split_documents(docs) print(f"Split into {len(chunks)} chunks.\n") # 3) Xây FAISS vectorstore vectorstore = build_vectorstore(chunks) # 4) Test similarity_search query = "Fristen für die Prüfungsanmeldung im Bachelorstudium" print("Test query:") print(" ", query, "\n") results = vectorstore.similarity_search(query, k=3) print("Top-3 ähnliche Chunks aus dem VectorStore:") for i, doc in enumerate(results, start=1): print(f"\n=== RESULT {i} ===") print(doc.page_content[:400], "...") print("Metadata:", doc.metadata)