File size: 1,884 Bytes
775a7d0
 
 
 
9d21791
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
775a7d0
 
 
 
9d21791
 
 
 
 
775a7d0
 
 
 
 
9d21791
775a7d0
 
 
9d21791
 
 
 
 
 
 
 
 
 
 
 
 
775a7d0
 
 
 
 
 
9d21791
775a7d0
9d21791
775a7d0
9d21791
775a7d0
9d21791
 
 
 
775a7d0
 
 
9d21791
775a7d0
 
9d21791
 
775a7d0
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import os
import faiss
import numpy as np
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer

# -----------------------
# Global in-memory state
# -----------------------
# FAISS index over the embeddings of `documents`; stays None until
# ingest_documents() has built one, which search_knowledge() treats as
# "nothing ingested yet".
index = None
# Text chunks currently indexed (one per PDF page or per .txt file), stored in
# the order their embeddings were added so FAISS row ids index this list.
documents = []
# One dict per entry in `documents`, at the same position, with the keys
# "source" (file name) and "page" (1-based page number, or "N/A" for .txt).
metadata = []

# Sentence-embedding model, constructed once at import time and shared by
# ingestion and search so documents and queries are encoded by the same model.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# -----------------------
# Ingest uploaded files
# -----------------------
def ingest_documents(files):
    """Rebuild the in-memory search index from a batch of uploaded files.

    Each PDF contributes one text chunk per page that yields text; each
    ``.txt`` file contributes a single chunk. Extensions are matched
    case-insensitively; files with any other extension, or without a file
    name, are ignored. Empty and whitespace-only chunks are skipped.

    On success the module-level ``index``, ``documents`` and ``metadata`` are
    replaced together (previous contents are discarded, not appended to). If
    anything fails, the previous state is left untouched.

    Args:
        files: Iterable of upload objects exposing ``.filename`` (str or
            None) and ``.file`` (a binary file-like object) -- presumably
            web-framework upload objects; confirm against the caller.

    Returns:
        The number of text chunks that were indexed.

    Raises:
        ValueError: If no file yields any readable text.
        UnicodeDecodeError: If a ``.txt`` file is not valid UTF-8.
    """
    global index, documents, metadata

    texts = []
    meta = []

    for file in files:
        filename = file.filename
        # An upload without a name cannot be classified by extension.
        if not filename:
            continue
        lowered = filename.lower()

        if lowered.endswith(".pdf"):
            reader = PdfReader(file.file)
            for page_number, page in enumerate(reader.pages, start=1):
                text = page.extract_text()
                # Pages with no extractable text would only add noise
                # vectors to the index.
                if text and text.strip():
                    texts.append(text)
                    meta.append({
                        "source": filename,
                        "page": page_number,
                    })

        elif lowered.endswith(".txt"):
            content = file.file.read().decode("utf-8")
            # Same rule as for PDF pages: never index an empty chunk.
            if content.strip():
                texts.append(content)
                meta.append({
                    "source": filename,
                    "page": "N/A",
                })

    if not texts:
        raise ValueError("No readable text found.")

    # FAISS requires a C-contiguous float32 matrix.
    embeddings = np.ascontiguousarray(embedder.encode(texts), dtype=np.float32)

    # Build the new index in a local first so a failure here cannot leave the
    # globals pointing at an index that disagrees with documents/metadata.
    new_index = faiss.IndexFlatL2(embeddings.shape[1])
    new_index.add(embeddings)

    index = new_index
    documents = texts
    metadata = meta

    return len(texts)

# -----------------------
# Search
# -----------------------
def search_knowledge(query, top_k=5):
    """Return the indexed text chunks closest to ``query``.

    Args:
        query: Query text; it is encoded with the module-level ``embedder``.
        top_k: Maximum number of results to return. Values <= 0 yield an
            empty list.

    Returns:
        A list of at most ``top_k`` dicts, in the order FAISS returns them,
        each with the keys ``"text"`` (the stored chunk), ``"distance"``
        (the FAISS distance as a float) and ``"metadata"`` (the chunk's
        metadata dict). Empty when nothing has been ingested yet.
    """
    # Asking for zero (or fewer) neighbours has a trivial answer; do not hand
    # a non-positive k to FAISS.
    if top_k <= 0:
        return []

    if index is None:
        return []

    # Never request more neighbours than there are vectors: FAISS pads the
    # missing slots with label -1, which would silently index documents[-1].
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    query_vec = embedder.encode([query])
    distances, indices = index.search(query_vec, k)

    results = []
    for idx, dist in zip(indices[0], distances[0]):
        # Defensive: skip any "no result" padding label.
        if idx < 0:
            continue
        position = int(idx)
        results.append({
            "text": documents[position],
            "distance": float(dist),
            "metadata": metadata[position],
        })

    return results