|
|
import os |
|
|
import faiss |
|
|
import numpy as np |
|
|
from pypdf import PdfReader |
|
|
from sentence_transformers import SentenceTransformer |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# In-memory vector-store state. All three containers are rebuilt together on
# every call to ingest_documents(); they must stay positionally aligned.

# FAISS L2 index over the chunk embeddings; None until the first ingest.
index = None


# Raw text chunks (one per PDF page or whole .txt file), aligned with the
# vectors stored in `index`.
documents = []


# Per-chunk provenance dicts ({"source": filename, "page": ...}), aligned
# with `documents`.
metadata = []


# Sentence-embedding model, loaded once at import time and shared by both
# ingestion and search so query/document vectors live in the same space.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def ingest_documents(files):
    """Extract text from uploaded PDF/TXT files, embed it, and rebuild the index.

    Replaces the module-level ``index``, ``documents`` and ``metadata`` with
    the contents of *files*; anything ingested previously is discarded.

    Args:
        files: Iterable of upload objects exposing ``.filename`` (str) and
            ``.file`` (binary file-like object) — e.g. FastAPI ``UploadFile``.
            Files with other extensions are silently skipped (best-effort).

    Returns:
        int: Number of text chunks (PDF pages / whole TXT files) indexed.

    Raises:
        ValueError: If no readable text was found in any file.
    """
    global index, documents, metadata

    texts = []
    meta = []

    for file in files:
        filename = file.filename
        # Case-insensitive extension check so "Report.PDF" is not skipped.
        lowered = filename.lower()

        if lowered.endswith(".pdf"):
            reader = PdfReader(file.file)
            # One chunk per page so search results can cite a page number.
            for i, page in enumerate(reader.pages):
                text = page.extract_text()
                if text:  # skip image-only / empty pages
                    texts.append({
                        # placeholder replaced below; see append of text
                    }) if False else texts.append(text)
                    meta.append({
                        "source": filename,
                        "page": i + 1
                    })

        elif lowered.endswith(".txt"):
            content = file.file.read().decode("utf-8")
            texts.append(content)
            meta.append({
                "source": filename,
                "page": "N/A"
            })

    if not texts:
        raise ValueError("No readable text found.")

    embeddings = embedder.encode(texts)
    # FAISS requires a contiguous float32 matrix; don't rely on the encoder's
    # default dtype.
    embeddings = np.ascontiguousarray(embeddings, dtype="float32")

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    documents = texts
    metadata = meta

    return len(texts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def search_knowledge(query, top_k=5):
    """Return the ``top_k`` indexed chunks most similar to *query*.

    Args:
        query: Natural-language query string.
        top_k: Maximum number of results to return (default 5).

    Returns:
        list[dict]: Each dict has ``text``, ``distance`` (L2 — lower means
        more similar) and ``metadata`` keys. Empty list if nothing has been
        ingested yet.
    """
    if index is None:
        return []

    query_vec = embedder.encode([query])
    # FAISS expects a contiguous float32 matrix.
    query_vec = np.ascontiguousarray(query_vec, dtype="float32")

    # Never ask for more neighbours than the index holds: FAISS pads the
    # result with index -1, which Python would silently resolve to
    # documents[-1] / metadata[-1] — i.e. a bogus duplicate of the last chunk.
    k = min(top_k, index.ntotal)
    distances, indices = index.search(query_vec, k)

    results = []
    for idx, dist in zip(indices[0], distances[0]):
        if idx < 0:  # defensive: skip any padding entries
            continue
        results.append({
            "text": documents[idx],
            "distance": float(dist),
            "metadata": metadata[idx]
        })

    return results
|
|
|