File size: 3,208 Bytes
c055550
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a8bb36
 
c055550
 
 
1a8bb36
c055550
1a8bb36
 
 
 
c055550
 
 
 
 
 
 
1a8bb36
 
 
 
 
 
 
c055550
 
 
 
 
 
 
 
 
 
 
 
 
1a8bb36
 
 
c055550
 
 
 
 
1a8bb36
c055550
 
1a8bb36
 
c055550
 
 
 
 
1a8bb36
c055550
 
1a8bb36
c055550
 
 
 
 
 
 
1a8bb36
c055550
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import os
import re
import faiss
import numpy as np
import gradio as gr
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
from docx import Document

# -------------------- LOAD MODEL --------------------
# Sentence-embedding model used for both document and query vectors.
# all-MiniLM-L6-v2 is a small, fast general-purpose model; downloaded on
# first run (requires network access unless cached locally).
model = SentenceTransformer("all-MiniLM-L6-v2")

# -------------------- TEXT EXTRACTION --------------------
def extract_text(file_path):
    """Extract plain text from a .pdf, .docx, or .txt file.

    Args:
        file_path: path to the document; the format is chosen by extension.

    Returns:
        The extracted text with surrounding whitespace stripped, or "" for
        unsupported extensions.
    """
    text = ""
    if file_path.endswith(".pdf"):
        reader = PdfReader(file_path)
        for page in reader.pages:
            # Extract once per page — the original called extract_text()
            # twice (once to test, once to append), doubling the work.
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    elif file_path.endswith(".docx"):
        doc = Document(file_path)
        for para in doc.paragraphs:
            text += para.text + "\n"
    elif file_path.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    return text.strip()

# -------------------- CHUNKING --------------------
def chunk_text(text, chunk_size=300):
    """Split *text* into whitespace-tokenized chunks of at most chunk_size words.

    Args:
        text: source string; split on arbitrary whitespace.
        chunk_size: maximum number of words per chunk.

    Returns:
        A list of space-joined word chunks (empty list for empty text).
    """
    tokens = text.split()
    chunks = []
    for start in range(0, len(tokens), chunk_size):
        chunks.append(" ".join(tokens[start:start + chunk_size]))
    return chunks

# -------------------- LOAD DOCUMENTS (ROOT DIRECTORY) --------------------
def load_documents():
    """Scan the current working directory and chunk every supported document.

    Picks up .pdf, .docx, and .txt files (skipping project files such as
    requirements.txt), extracts their text, and splits it into word chunks.

    Returns:
        (docs, sources): parallel lists — chunk text and the filename each
        chunk came from. Never empty: a fallback chunk is substituted when
        no usable content is found, so the FAISS index is never built on an
        empty corpus.
    """
    docs = []
    sources = []

    # Project files to exclude even if their extension matches.
    # NOTE: the original also skipped "app.py", which can never match the
    # extension filter; kept in the set for clarity, it is harmless.
    skip = {"requirements.txt", "app.py"}

    for file in os.listdir("."):
        if not file.endswith((".pdf", ".docx", ".txt")) or file in skip:
            continue

        content = extract_text(file)
        for chunk in chunk_text(content):
            chunk = chunk.strip()
            if len(chunk) > 20:  # drop near-empty fragments
                docs.append(chunk)
                sources.append(file)

    # ABSOLUTE SAFETY FALLBACK
    if not docs:
        docs = [
            "Artificial intelligence and databases are important computer science topics."
        ]
        sources = ["fallback.txt"]

    return docs, sources

# Module-level corpus: parallel lists of chunk text and source filenames.
documents, sources = load_documents()

# -------------------- BUILD FAISS INDEX --------------------
# Embed every chunk, then L2-normalize so that inner product == cosine
# similarity when searched via IndexFlatIP below.
embeddings = model.encode(documents, convert_to_numpy=True).astype("float32")
faiss.normalize_L2(embeddings)

# Exact (non-approximate) inner-product index over the normalized vectors.
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

# -------------------- SEARCH FUNCTION --------------------
def semantic_search(query, top_k=3, min_score=0.35):
    """Find document chunks semantically similar to *query*.

    Args:
        query: free-text search string.
        top_k: number of nearest neighbours to retrieve from the index
            (previously hard-coded to 3).
        min_score: cosine-similarity cutoff; hits below it are discarded
            (previously hard-coded to 0.35).

    Returns:
        A formatted multi-result string, or an explanatory message when the
        query is blank or nothing clears the threshold.
    """
    if not query.strip():
        return "Please enter a query."

    # Embed and normalize the query the same way the corpus was indexed,
    # so inner-product scores are cosine similarities.
    query_vec = model.encode([query]).astype("float32")
    faiss.normalize_L2(query_vec)

    scores, indices = index.search(query_vec, top_k)

    # Accumulate parts and join once instead of quadratic string +=.
    parts = []
    for rank, (score, idx) in enumerate(zip(scores[0], indices[0]), start=1):
        if score >= min_score:
            parts.append(
                f"Rank: {rank}\n"
                f"Source: {sources[idx]}\n"
                f"Similarity Score: {score:.4f}\n"
                f"Text: {documents[idx][:300]}\n\n"
            )

    if not parts:
        return "No strong semantic matches found."

    return "".join(parts)

# -------------------- GRADIO UI --------------------
# Single-textbox Gradio app: query in, formatted matches out.
iface = gr.Interface(
    fn=semantic_search,
    inputs=gr.Textbox(label="Enter your query"),
    outputs=gr.Textbox(label="Search Results"),
    title="Semantic Document Search",
    description="Search documents based on meaning using FAISS and Sentence Transformers"
)

# Launch blocks the process and serves the UI (default: http://127.0.0.1:7860).
iface.launch()