File size: 3,332 Bytes
5702037
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import os
import re
import faiss
import numpy as np
import gradio as gr
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
from docx import Document

# -------------------- LOAD MODEL --------------------
# Sentence-embedding model, loaded once at module import so every search
# request reuses the same weights instead of re-loading per call.
model = SentenceTransformer("all-MiniLM-L6-v2")

# -------------------- TEXT EXTRACTION --------------------
def extract_text(file_path):
    """Return the plain text content of a PDF, DOCX, or TXT file.

    The extension check is case-insensitive (``.PDF``, ``.Txt`` etc. are
    accepted).  Files with an unsupported extension yield an empty string.

    Args:
        file_path: Path to the document on disk.

    Returns:
        The extracted text with surrounding whitespace stripped; ``""``
        when nothing could be extracted.
    """
    text = ""
    # Normalize the extension so matching is case-insensitive; the original
    # endswith() checks silently rejected upper-case extensions.
    suffix = os.path.splitext(file_path)[1].lower()
    if suffix == ".pdf":
        reader = PdfReader(file_path)
        pages = []
        for page in reader.pages:
            # extract_text() is expensive and may return None for
            # image-only pages — call it once and keep the result.
            page_text = page.extract_text()
            if page_text:
                pages.append(page_text)
        # Join at the end instead of += in the loop (one pass, no
        # quadratic string rebuilding on large PDFs).
        text = "\n".join(pages) + "\n" if pages else ""
    elif suffix == ".docx":
        doc = Document(file_path)
        text = "".join(para.text + "\n" for para in doc.paragraphs)
    elif suffix == ".txt":
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    return text.strip()

# -------------------- CHUNKING --------------------
def chunk_text(text, chunk_size=300):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words.

    Args:
        text: The document text to split.
        chunk_size: Maximum number of words per chunk (default 300).

    Returns:
        A list of space-joined word chunks; empty list for empty/blank text.
    """
    tokens = text.split()
    chunks = []
    for start in range(0, len(tokens), chunk_size):
        chunks.append(" ".join(tokens[start:start + chunk_size]))
    return chunks

# -------------------- PROCESS UPLOADED FILE --------------------
def process_uploaded_file(uploaded_file):
    """Extract and chunk the text of an uploaded file.

    Args:
        uploaded_file: A Gradio file object exposing a ``.name`` path,
            or ``None`` when nothing was uploaded.

    Returns:
        ``(documents, sources)`` — parallel lists of text chunks (longer
        than 20 characters after stripping) and their source filename —
        or ``(None, None)`` when no usable text could be extracted.
    """
    # Guard clauses: nothing uploaded, or nothing extractable.
    if uploaded_file is None:
        return None, None

    text = extract_text(uploaded_file.name)
    if text.strip() == "":
        return None, None

    # Keep only chunks with real content; very short fragments add noise.
    documents = []
    for chunk in chunk_text(text):
        trimmed = chunk.strip()
        if len(trimmed) > 20:
            documents.append(trimmed)

    # One source entry per kept chunk, all pointing at the same upload.
    sources = [uploaded_file.name] * len(documents)
    return documents, sources

# -------------------- SEMANTIC SEARCH --------------------
def semantic_search(uploaded_file, query):
    """Return the top chunks of the uploaded document most similar to *query*.

    Builds an in-memory FAISS inner-product index over L2-normalized chunk
    embeddings (inner product of unit vectors == cosine similarity) and
    returns up to 3 matches scoring at least 0.35.

    Args:
        uploaded_file: Gradio file object (or ``None``).
        query: Free-text search query.

    Returns:
        A formatted, human-readable result string (also used for error
        messages, since the Gradio output is a single textbox).
    """
    if uploaded_file is None:
        return "Please upload a document."

    if query.strip() == "":
        return "Please enter a query."

    documents, sources = process_uploaded_file(uploaded_file)

    if not documents:
        return "Could not extract readable text from the uploaded file."

    # Embed all chunks and normalize so inner product == cosine similarity.
    embeddings = model.encode(documents, convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(embeddings)

    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)

    query_vec = model.encode([query], convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(query_vec)

    # BUG FIX: never request more neighbors than indexed vectors.  FAISS
    # pads missing results with idx == -1, and sources[-1]/documents[-1]
    # would silently read the *last* chunk instead of skipping.
    top_k = min(3, len(documents))
    D, I = index.search(query_vec, top_k)

    sections = []
    for rank, idx in enumerate(I[0]):
        if idx < 0:
            # Defensive: padded (invalid) result slot.
            continue
        score = float(D[0][rank])
        if score >= 0.35:  # similarity cutoff for a "strong" match
            sections.append(
                f"Rank: {rank + 1}\n"
                f"Source: {sources[idx]}\n"
                f"Similarity Score: {score:.4f}\n"
                f"Text: {documents[idx][:300]}\n\n"
            )

    if not sections:
        return "No strong semantic matches found."

    # join() instead of += accumulation: single pass, no quadratic rebuild.
    return "".join(sections)

# -------------------- GRADIO UI --------------------
# Single-function web UI: one file upload + one query textbox in,
# one results textbox out, wired directly to semantic_search().
iface = gr.Interface(
    fn=semantic_search,
    inputs=[
        gr.File(label="Upload Document (PDF / DOCX / TXT)"),
        gr.Textbox(label="Enter your query")
    ],
    outputs=gr.Textbox(label="Search Results"),
    title="Semantic Document Search (Upload-Based)",
    description="Upload a document and search its content based on meaning using FAISS and embeddings"
)

# Starts the local web server; blocks until the app is stopped.
iface.launch()