| import os |
| import faiss |
| import numpy as np |
| from pypdf import PdfReader |
| from sentence_transformers import SentenceTransformer |
|
|
| |
| |
| |
# In-memory search state, replaced wholesale by ingest_documents():
#   index     - FAISS L2 index over the embedded chunks (None until first ingest)
#   documents - raw text chunks, row-aligned with the FAISS index
#   metadata  - per-chunk source info ({"source": ..., "page": ...}), row-aligned too
index = None
documents = []
metadata = []


# Sentence-embedding model, loaded once at import time.
# NOTE(review): this downloads model weights on first use — import has side effects.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
|
|
| |
| |
| |
def ingest_documents(files):
    """Extract text from uploaded files, embed it, and rebuild the FAISS index.

    Accepts PDF and plain-text uploads — objects exposing ``.filename`` and a
    binary file-like ``.file`` (e.g. FastAPI ``UploadFile``). Files with any
    other extension are silently skipped. Replaces the module-level ``index``,
    ``documents``, and ``metadata`` with the new corpus (previous contents are
    discarded, not appended to).

    Args:
        files: iterable of uploaded-file objects.

    Returns:
        int: number of text chunks indexed.

    Raises:
        ValueError: if no readable text was found in any file.
    """
    global index, documents, metadata

    texts = []
    meta = []

    for file in files:
        filename = file.filename
        # Match extensions case-insensitively so "REPORT.PDF" is not skipped.
        lowered = filename.lower()

        if lowered.endswith(".pdf"):
            reader = PdfReader(file.file)
            for i, page in enumerate(reader.pages):
                text = page.extract_text()
                # Skip pages with no extractable text (e.g. scanned images).
                if text:
                    texts.append(text)
                    meta.append({
                        "source": filename,
                        "page": i + 1  # human-friendly 1-based page number
                    })

        elif lowered.endswith(".txt"):
            content = file.file.read().decode("utf-8")
            # Skip whitespace-only files: embedding an empty chunk is useless.
            if content.strip():
                texts.append(content)
                meta.append({
                    "source": filename,
                    "page": "N/A"
                })

    if not texts:
        raise ValueError("No readable text found.")

    # FAISS requires float32; coerce explicitly rather than relying on
    # SentenceTransformer.encode's default dtype.
    embeddings = np.asarray(embedder.encode(texts), dtype="float32")

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    documents = texts
    metadata = meta

    return len(texts)
|
|
| |
| |
| |
def search_knowledge(query, top_k=5):
    """Return up to ``top_k`` indexed chunks nearest to ``query``.

    Args:
        query: natural-language search string.
        top_k: maximum number of results to return (default 5).

    Returns:
        list[dict]: each with keys ``text``, ``distance`` (L2, smaller is
        closer), and ``metadata``; empty if nothing has been ingested yet.
    """
    if index is None:
        return []

    # Never ask FAISS for more neighbours than there are vectors: it pads
    # the result with index -1, and documents[-1] would silently return the
    # *last* chunk instead of a real match.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    query_vec = np.asarray(embedder.encode([query]), dtype="float32")
    distances, indices = index.search(query_vec, k)

    results = []
    for idx, dist in zip(indices[0], distances[0]):
        if idx < 0:  # defensive: skip any -1 padding FAISS may still emit
            continue
        results.append({
            "text": documents[idx],
            "distance": float(dist),
            "metadata": metadata[idx]
        })

    return results
|
|