File size: 1,884 Bytes
775a7d0 9d21791 775a7d0 9d21791 775a7d0 9d21791 775a7d0 9d21791 775a7d0 9d21791 775a7d0 9d21791 775a7d0 9d21791 775a7d0 9d21791 775a7d0 9d21791 775a7d0 9d21791 775a7d0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
import os
import faiss
import numpy as np
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
# -----------------------
# Global in-memory state
# -----------------------
# FAISS index over the embeddings of `documents`; stays None until
# ingest_documents() builds one (search_knowledge() returns [] while None).
index = None
# Text chunks that were embedded; ingest_documents() replaces this list with
# the texts it added to `index`, in the same order.
documents = []
# One {"source", "page"} dict per entry in `documents`, in the same order.
metadata = []
# Embedding model shared by ingestion and search; constructed once at import.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# -----------------------
# Ingest uploaded files
# -----------------------
def _extract_chunks(file):
    """Return a list of ``(text, page)`` pairs read from one uploaded file.

    PDFs yield one pair per page that contains non-blank text (1-based page
    number); ``.txt`` files yield a single pair with page ``"N/A"``. Files
    with any other extension, a missing filename, or only blank text yield
    an empty list.
    """
    # Match the extension case-insensitively and tolerate a missing filename.
    name = (file.filename or "").lower()

    if name.endswith(".pdf"):
        chunks = []
        reader = PdfReader(file.file)
        for page_number, page in enumerate(reader.pages, start=1):
            text = page.extract_text()
            # Skip pages with no extractable text (e.g. scanned images).
            if text and text.strip():
                chunks.append((text, page_number))
        return chunks

    if name.endswith(".txt"):
        # Replace undecodable bytes instead of aborting the whole ingestion
        # because of one stray non-UTF-8 byte.
        content = file.file.read().decode("utf-8", errors="replace")
        return [(content, "N/A")] if content.strip() else []

    return []


def ingest_documents(files):
    """Embed the uploaded files and rebuild the global search index.

    Each call replaces the previous index, documents and metadata; it does
    not append to them.

    Args:
        files: Iterable of upload objects exposing ``filename`` and a binary
            file-like ``file`` attribute. Only ``.pdf`` and ``.txt`` files
            (case-insensitive) are read; others are ignored.

    Returns:
        The number of text chunks that were indexed.

    Raises:
        ValueError: If no file produced any non-blank text.
    """
    global index, documents, metadata

    texts = []
    meta = []
    for file in files:
        for text, page in _extract_chunks(file):
            texts.append(text)
            meta.append({
                "source": file.filename,
                "page": page,
            })

    if not texts:
        raise ValueError("No readable text found.")

    # FAISS requires a C-contiguous float32 matrix.
    embeddings = np.ascontiguousarray(embedder.encode(texts), dtype=np.float32)

    # Build the index locally and publish all three globals together, so a
    # failure part-way through never leaves a new index paired with the old
    # documents/metadata.
    new_index = faiss.IndexFlatL2(embeddings.shape[1])
    new_index.add(embeddings)

    index, documents, metadata = new_index, texts, meta
    return len(texts)
# -----------------------
# Search
# -----------------------
def search_knowledge(query, top_k=5):
    """Return the indexed chunks closest to ``query``.

    Args:
        query: Text to search for.
        top_k: Maximum number of results to return.

    Returns:
        A list of at most ``top_k`` dicts, nearest first, each with keys
        ``"text"``, ``"distance"`` (L2 distance reported by the index, as a
        float) and ``"metadata"``. The list is empty when ``top_k`` is not
        positive or nothing has been ingested yet.
    """
    if top_k <= 0:
        return []
    if index is None or index.ntotal == 0:
        return []

    # Never ask for more neighbours than exist: FAISS pads missing results
    # with label -1, which would silently index the *last* document.
    k = min(top_k, index.ntotal)

    # FAISS requires a C-contiguous float32 query matrix.
    query_vec = np.ascontiguousarray(embedder.encode([query]), dtype=np.float32)
    distances, indices = index.search(query_vec, k)

    return [
        {
            "text": documents[idx],
            "distance": float(dist),
            "metadata": metadata[idx],
        }
        for idx, dist in zip(indices[0], distances[0])
        if idx >= 0  # defensive: drop any remaining "no result" padding
    ]
|