"""Semantic document search over local PDF/DOCX/TXT files.

Embeds word-chunks of every supported document in the working directory
with a sentence-transformer model, indexes them in a FAISS inner-product
index (cosine similarity via L2-normalized vectors), and serves a Gradio
search UI.
"""

import os

import faiss
import gradio as gr
import numpy as np
from docx import Document
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

# -------------------- LOAD MODEL --------------------
model = SentenceTransformer("all-MiniLM-L6-v2")


# -------------------- TEXT EXTRACTION --------------------
def extract_text(file_path):
    """Return the plain text of a .pdf, .docx, or .txt file.

    Unsupported extensions yield an empty string. The result is stripped
    of leading/trailing whitespace.
    """
    parts = []
    if file_path.endswith(".pdf"):
        reader = PdfReader(file_path)
        for page in reader.pages:
            # Call extract_text() only once per page: it is expensive and
            # the original code invoked it twice (truthiness check + append).
            page_text = page.extract_text()
            if page_text:
                parts.append(page_text)
    elif file_path.endswith(".docx"):
        doc = Document(file_path)
        parts.extend(para.text for para in doc.paragraphs)
    elif file_path.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            parts.append(f.read())
    # join() instead of repeated += avoids quadratic string building.
    return "\n".join(parts).strip()


# -------------------- CHUNKING --------------------
def chunk_text(text, chunk_size=300):
    """Split *text* into chunks of at most *chunk_size* words each."""
    words = text.split()
    return [
        " ".join(words[i:i + chunk_size])
        for i in range(0, len(words), chunk_size)
    ]


# -------------------- LOAD DOCUMENTS (ROOT DIRECTORY) --------------------
def load_documents():
    """Collect text chunks (and their source filenames) from the cwd.

    Returns:
        (docs, sources): parallel lists — docs[i] is a text chunk taken
        from the file named sources[i]. Chunks of 20 characters or fewer
        are discarded as noise. If nothing usable is found, a single
        fallback chunk is returned so the FAISS index is never empty.
    """
    docs = []
    sources = []
    skip = {"requirements.txt", "app.py"}  # project files, not content
    for file in os.listdir("."):
        if not file.endswith((".pdf", ".docx", ".txt")):
            continue
        if file in skip:
            continue
        content = extract_text(file)
        for chunk in chunk_text(content):
            chunk = chunk.strip()
            if len(chunk) > 20:
                docs.append(chunk)
                sources.append(file)

    # ABSOLUTE SAFETY FALLBACK — an empty corpus would make model.encode
    # and index construction fail.
    if not docs:
        docs = [
            "Artificial intelligence and databases are important computer science topics."
        ]
        sources = ["fallback.txt"]

    return docs, sources


documents, sources = load_documents()

# -------------------- BUILD FAISS INDEX --------------------
# L2-normalized vectors + inner-product index == cosine similarity search.
embeddings = model.encode(documents, convert_to_numpy=True).astype("float32")
faiss.normalize_L2(embeddings)
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)


# -------------------- SEARCH FUNCTION --------------------
def semantic_search(query, top_k=3, min_score=0.35):
    """Return a formatted report of the best-matching chunks for *query*.

    Args:
        query: free-text search string.
        top_k: number of nearest neighbours to retrieve (default 3,
            matching the original hard-coded value).
        min_score: cosine-similarity cutoff below which hits are
            suppressed (default 0.35, matching the original).

    Returns:
        A human-readable results string, or a short message when the
        query is blank or no hit clears the threshold.
    """
    if not query.strip():
        return "Please enter a query."

    query_vec = model.encode([query]).astype("float32")
    faiss.normalize_L2(query_vec)
    scores, indices = index.search(query_vec, top_k)

    entries = []
    for rank, idx in enumerate(indices[0]):
        score = scores[0][rank]
        if score >= min_score:
            entries.append(
                f"Rank: {rank + 1}\n"
                f"Source: {sources[idx]}\n"
                f"Similarity Score: {score:.4f}\n"
                f"Text: {documents[idx][:300]}\n\n"
            )

    if not entries:
        return "No strong semantic matches found."
    return "".join(entries)


# -------------------- GRADIO UI --------------------
iface = gr.Interface(
    fn=semantic_search,
    inputs=gr.Textbox(label="Enter your query"),
    outputs=gr.Textbox(label="Search Results"),
    title="Semantic Document Search",
    description="Search documents based on meaning using FAISS and Sentence Transformers",
)

# Guard the launch so importing this module (e.g. from tests or another
# app) does not start a web server as a side effect.
if __name__ == "__main__":
    iface.launch()