# semantic / app.py
# Author: indhupamula (commit 5702037, verified)
import os
import re
import faiss
import numpy as np
import gradio as gr
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
from docx import Document
# -------------------- LOAD MODEL --------------------
# all-MiniLM-L6-v2 is a compact sentence-embedding model (384-dim vectors).
# Loaded once at module import so every search request reuses the same weights.
model = SentenceTransformer("all-MiniLM-L6-v2")
# -------------------- TEXT EXTRACTION --------------------
def extract_text(file_path):
    """Extract plain text from a PDF, DOCX, or TXT file.

    Args:
        file_path: Path to the document on disk.

    Returns:
        The extracted text with leading/trailing whitespace stripped, or an
        empty string for unsupported extensions or unreadable content.
    """
    # Compare extensions case-insensitively so ".PDF" / ".TXT" also work.
    ext = os.path.splitext(file_path)[1].lower()
    text = ""
    if ext == ".pdf":
        reader = PdfReader(file_path)
        for page in reader.pages:
            # extract_text() may return None for image-only pages;
            # call it once per page instead of twice as before.
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    elif ext == ".docx":
        doc = Document(file_path)
        for para in doc.paragraphs:
            text += para.text + "\n"
    elif ext == ".txt":
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    return text.strip()
# -------------------- CHUNKING --------------------
def chunk_text(text, chunk_size=300):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words.

    Args:
        text: Raw document text.
        chunk_size: Maximum number of words per chunk (default 300).

    Returns:
        A list of chunk strings; empty list when *text* has no words.
    """
    words = text.split()
    chunks = []
    for start in range(0, len(words), chunk_size):
        window = words[start:start + chunk_size]
        chunks.append(" ".join(window))
    return chunks
# -------------------- PROCESS UPLOADED FILE --------------------
def process_uploaded_file(uploaded_file):
    """Turn an uploaded file into (documents, sources) lists for indexing.

    Args:
        uploaded_file: Gradio file object exposing a ``.name`` path, or None.

    Returns:
        Tuple ``(documents, sources)`` where ``documents`` are cleaned text
        chunks and ``sources`` repeats the file path per chunk, or
        ``(None, None)`` when there is no file or no extractable text.
    """
    if uploaded_file is None:
        return None, None
    file_path = uploaded_file.name
    content = extract_text(file_path)
    # Truthiness check replaces `content.strip() == ""`.
    if not content.strip():
        return None, None
    # Keep only chunks with enough substance (>20 chars) to embed meaningfully.
    documents = [c.strip() for c in chunk_text(content) if len(c.strip()) > 20]
    # One source entry per chunk, all pointing at the same uploaded file.
    sources = [file_path] * len(documents)
    return documents, sources
# -------------------- SEMANTIC SEARCH --------------------
def semantic_search(uploaded_file, query):
    """Search an uploaded document for the chunks most similar to *query*.

    Builds a fresh FAISS inner-product index over L2-normalized chunk
    embeddings (inner product == cosine similarity) and returns up to the
    top-3 chunks scoring >= 0.35, formatted as a report string.

    Args:
        uploaded_file: Gradio file object (or None).
        query: Free-text search query.

    Returns:
        A human-readable results string, or an explanatory message when the
        input is missing or no strong match is found.
    """
    if uploaded_file is None:
        return "Please upload a document."
    if not query.strip():
        return "Please enter a query."
    documents, sources = process_uploaded_file(uploaded_file)
    if not documents:
        return "Could not extract readable text from the uploaded file."
    # Embed chunks and normalize so inner product equals cosine similarity.
    embeddings = model.encode(documents, convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(embeddings)
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    # Encode the query the same way.
    query_vec = model.encode([query], convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(query_vec)
    # BUGFIX: never request more neighbours than documents exist — FAISS
    # pads missing results with index -1, which would silently mis-index
    # documents[-1] / sources[-1] below.
    top_k = min(3, len(documents))
    D, I = index.search(query_vec, top_k)
    result = ""
    for rank, idx in enumerate(I[0]):
        # Guard against -1 padding and apply the similarity threshold.
        if idx >= 0 and D[0][rank] >= 0.35:
            result += (
                f"Rank: {rank + 1}\n"
                f"Source: {sources[idx]}\n"
                f"Similarity Score: {D[0][rank]:.4f}\n"
                f"Text: {documents[idx][:300]}\n\n"
            )
    if not result:
        return "No strong semantic matches found."
    return result
# -------------------- GRADIO UI --------------------
# Single-function interface: (uploaded file, query text) -> results string.
# The whole pipeline (extract -> chunk -> embed -> search) runs per request.
iface = gr.Interface(
    fn=semantic_search,
    inputs=[
        gr.File(label="Upload Document (PDF / DOCX / TXT)"),
        gr.Textbox(label="Enter your query")
    ],
    outputs=gr.Textbox(label="Search Results"),
    title="Semantic Document Search (Upload-Based)",
    description="Upload a document and search its content based on meaning using FAISS and embeddings"
)
# Starts the local web server (blocking call).
iface.launch()