|
|
""" |
|
|
BƯỚC 4: VECTORSTORE (FAISS in-memory) |
|
|
------------------------------------- |
|
|
Tạo FAISS index từ các CHUNK văn bản. |
|
|
- Không ghi file .faiss nào, tất cả nằm trong RAM. |
|
|
- Embeddings được lấy từ get_embeddings() (Bước 3). |
|
|
""" |
|
|
|
|
|
from langchain_community.vectorstores import FAISS |
|
|
from embeddings import get_embeddings |
|
|
|
|
|
def build_vectorstore(chunks):
    """
    Build an in-memory FAISS vector store from a list of split Documents.

    Args:
        chunks: list of langchain ``Document`` objects (already split in Step 2).

    Returns:
        A ``FAISS`` VectorStore holding one embedding per chunk (RAM only,
        nothing is written to disk).

    Raises:
        ValueError: if ``chunks`` is empty — FAISS cannot build an empty
            index, and failing early avoids loading the embedding model
            for nothing.
    """
    # Validate BEFORE initialising the (potentially expensive) embedding
    # model; FAISS.from_documents fails with an opaque error on empty input.
    if not chunks:
        raise ValueError("build_vectorstore() requires at least one chunk")

    print(">>> Initialising embedding model for FAISS index ...")
    embeddings = get_embeddings()

    print(f">>> Building FAISS index from {len(chunks)} chunks ...")
    vs = FAISS.from_documents(chunks, embeddings)
    print(">>> FAISS index built.\n")
    return vs
|
|
|
|
|
if __name__ == "__main__": |
|
|
|
|
|
from load_documents import load_documents |
|
|
from split_documents import split_documents |
|
|
|
|
|
print("=== TEST: load_documents -> split_documents -> FAISS.similarity_search ===\n") |
|
|
|
|
|
|
|
|
docs = load_documents() |
|
|
|
|
|
|
|
|
from pprint import pprint |
|
|
print(f"Loaded {len(docs)} raw documents.") |
|
|
chunks = split_documents(docs) |
|
|
print(f"Split into {len(chunks)} chunks.\n") |
|
|
|
|
|
|
|
|
vectorstore = build_vectorstore(chunks) |
|
|
|
|
|
|
|
|
query = "Fristen für die Prüfungsanmeldung im Bachelorstudium" |
|
|
print("Test query:") |
|
|
print(" ", query, "\n") |
|
|
|
|
|
results = vectorstore.similarity_search(query, k=3) |
|
|
|
|
|
print("Top-3 ähnliche Chunks aus dem VectorStore:") |
|
|
for i, doc in enumerate(results, start=1): |
|
|
print(f"\n=== RESULT {i} ===") |
|
|
print(doc.page_content[:400], "...") |
|
|
print("Metadata:", doc.metadata) |
|
|
|
|
|
|
|
|
|