"""Upload-based semantic document search.

A document (PDF / DOCX / TXT) is uploaded, split into word chunks, embedded
with a SentenceTransformer, and indexed in FAISS; a query is embedded the
same way and the top-scoring chunks above a similarity threshold are shown.
"""

import os
import re

import faiss
import numpy as np
import gradio as gr
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
from docx import Document

# -------------------- LOAD MODEL --------------------
# MiniLM sentence encoder; loaded once at import time and reused per request.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Minimum cosine similarity (inner product of L2-normalized vectors) for a
# chunk to be reported as a match, and how many neighbors to retrieve.
SIMILARITY_THRESHOLD = 0.35
TOP_K = 3


# -------------------- TEXT EXTRACTION --------------------
def extract_text(file_path):
    """Return the plain text of a PDF, DOCX, or TXT file.

    Unknown extensions yield an empty string. The extension check is
    case-insensitive so `REPORT.PDF` is handled like `report.pdf`.
    """
    text = ""
    lower_path = file_path.lower()
    if lower_path.endswith(".pdf"):
        reader = PdfReader(file_path)
        for page in reader.pages:
            # extract_text() is expensive and may return None for
            # image-only pages — call it once and test the result.
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    elif lower_path.endswith(".docx"):
        doc = Document(file_path)
        text = "\n".join(para.text for para in doc.paragraphs)
    elif lower_path.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    return text.strip()


# -------------------- CHUNKING --------------------
def chunk_text(text, chunk_size=300):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words."""
    words = text.split()
    return [
        " ".join(words[i:i + chunk_size])
        for i in range(0, len(words), chunk_size)
    ]


# -------------------- PROCESS UPLOADED FILE --------------------
def process_uploaded_file(uploaded_file):
    """Extract and chunk an uploaded file.

    Returns ``(documents, sources)`` — parallel lists of chunk texts and the
    originating file name — or ``(None, None)`` when nothing readable was
    extracted. Chunks of 20 characters or fewer are dropped as noise.
    """
    if uploaded_file is None:
        return None, None

    file_path = uploaded_file.name
    content = extract_text(file_path)
    if content.strip() == "":
        return None, None

    documents = []
    sources = []
    for chunk in chunk_text(content):
        stripped = chunk.strip()
        if len(stripped) > 20:
            documents.append(stripped)
            sources.append(uploaded_file.name)
    return documents, sources


# -------------------- SEMANTIC SEARCH --------------------
def semantic_search(uploaded_file, query):
    """Embed the uploaded document's chunks and return the best matches for *query*.

    Builds a fresh FAISS inner-product index per call (the app is stateless),
    searches for the top chunks, and formats those whose cosine similarity
    meets SIMILARITY_THRESHOLD. Returns a human-readable result string.
    """
    if uploaded_file is None:
        return "Please upload a document."
    if query.strip() == "":
        return "Please enter a query."

    documents, sources = process_uploaded_file(uploaded_file)
    if documents is None or len(documents) == 0:
        return "Could not extract readable text from the uploaded file."

    # Build embeddings; L2-normalize so inner product == cosine similarity.
    embeddings = model.encode(documents, convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(embeddings)

    # Build FAISS index
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)

    # Encode query
    query_vec = model.encode([query], convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(query_vec)

    # Never ask for more neighbors than we have chunks; FAISS pads missing
    # results with index -1, which would otherwise wrap to documents[-1].
    k = min(TOP_K, len(documents))
    D, I = index.search(query_vec, k)

    result = ""
    for rank, idx in enumerate(I[0]):
        if idx == -1:  # defensive: padding entry from FAISS
            continue
        if D[0][rank] >= SIMILARITY_THRESHOLD:
            result += (
                f"Rank: {rank + 1}\n"
                f"Source: {sources[idx]}\n"
                f"Similarity Score: {D[0][rank]:.4f}\n"
                f"Text: {documents[idx][:300]}\n\n"
            )

    if result == "":
        return "No strong semantic matches found."
    return result


# -------------------- GRADIO UI --------------------
iface = gr.Interface(
    fn=semantic_search,
    inputs=[
        gr.File(label="Upload Document (PDF / DOCX / TXT)"),
        gr.Textbox(label="Enter your query"),
    ],
    outputs=gr.Textbox(label="Search Results"),
    title="Semantic Document Search (Upload-Based)",
    description="Upload a document and search its content based on meaning using FAISS and embeddings",
)

if __name__ == "__main__":
    # Guarded so importing this module (e.g. for testing) does not start a server.
    iface.launch()