"""
STEP 4: VECTORSTORE (FAISS in-memory)
-------------------------------------
Build a FAISS index from the text CHUNKS.
- No .faiss file is written to disk; everything lives in RAM.
- Embeddings are obtained from get_embeddings() (Step 3).
"""
from langchain_community.vectorstores import FAISS
from embeddings import get_embeddings
def build_vectorstore(chunks):
    """Build an in-memory FAISS vector store from pre-split document chunks.

    Parameters
    ----------
    chunks : list
        LangChain ``Document`` objects produced by the splitting step.

    Returns
    -------
    FAISS
        A FAISS vector store held entirely in RAM (nothing written to disk).
    """
    print(">>> Initialising embedding model for FAISS index ...")
    embedding_model = get_embeddings()

    print(f">>> Building FAISS index from {len(chunks)} chunks ...")
    store = FAISS.from_documents(chunks, embedding_model)
    print(">>> FAISS index built.\n")

    return store
if __name__ == "__main__":
    # Smoke-test the whole pipeline: load -> split -> FAISS -> similarity_search.
    # NOTE: removed an unused `from pprint import pprint` import.
    from load_documents import load_documents
    from split_documents import split_documents

    print("=== TEST: load_documents -> split_documents -> FAISS.similarity_search ===\n")

    # 1) Load the source documents (PDF + HTML) from HuggingFace.
    docs = load_documents()
    print(f"Loaded {len(docs)} raw documents.")

    # 2) Split documents into chunks.
    chunks = split_documents(docs)
    print(f"Split into {len(chunks)} chunks.\n")

    # 3) Build the in-memory FAISS vector store.
    vectorstore = build_vectorstore(chunks)

    # 4) Exercise similarity_search with a sample (German) query.
    query = "Fristen für die Prüfungsanmeldung im Bachelorstudium"
    print("Test query:")
    print(" ", query, "\n")

    results = vectorstore.similarity_search(query, k=3)
    print("Top-3 ähnliche Chunks aus dem VectorStore:")
    for i, doc in enumerate(results, start=1):
        print(f"\n=== RESULT {i} ===")
        # Truncate long chunk bodies so the console output stays readable.
        print(doc.page_content[:400], "...")
        print("Metadata:", doc.metadata)
|