# semantic / app.py
# Author: indhupamula (commit 5702037, verified)
import os
import re
import faiss
import numpy as np
import gradio as gr
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
from docx import Document
# -------------------- LOAD MODEL --------------------
# all-MiniLM-L6-v2 is a compact sentence-embedding model (384-dim vectors).
# Loaded once at module import so every search request reuses the same weights.
model = SentenceTransformer("all-MiniLM-L6-v2")
# -------------------- TEXT EXTRACTION --------------------
def extract_text(file_path):
    """Extract plain text from a PDF, DOCX, or TXT file.

    Args:
        file_path: Path to the document on disk.

    Returns:
        The extracted text with leading/trailing whitespace stripped, or an
        empty string for unsupported extensions or unreadable content.
    """
    # Compare extensions case-insensitively so ".PDF" / ".TXT" also work.
    ext = os.path.splitext(file_path)[1].lower()
    text = ""
    if ext == ".pdf":
        reader = PdfReader(file_path)
        for page in reader.pages:
            # extract_text() may return None for image-only pages;
            # call it once per page instead of twice as before.
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    elif ext == ".docx":
        doc = Document(file_path)
        for para in doc.paragraphs:
            text += para.text + "\n"
    elif ext == ".txt":
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    return text.strip()
# -------------------- CHUNKING --------------------
def chunk_text(text, chunk_size=300):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words.

    Args:
        text: Raw document text.
        chunk_size: Maximum number of words per chunk (default 300).

    Returns:
        A list of chunk strings; empty list when *text* has no words.
    """
    words = text.split()
    chunks = []
    for start in range(0, len(words), chunk_size):
        window = words[start:start + chunk_size]
        chunks.append(" ".join(window))
    return chunks
# -------------------- PROCESS UPLOADED FILE --------------------
def process_uploaded_file(uploaded_file):
    """Turn an uploaded file into (documents, sources) lists for indexing.

    Args:
        uploaded_file: Gradio file object exposing a ``.name`` path, or None.

    Returns:
        Tuple ``(documents, sources)`` where ``documents`` are cleaned text
        chunks and ``sources`` repeats the file path per chunk, or
        ``(None, None)`` when there is no file or no extractable text.
    """
    if uploaded_file is None:
        return None, None
    file_path = uploaded_file.name
    content = extract_text(file_path)
    # Truthiness check replaces `content.strip() == ""`.
    if not content.strip():
        return None, None
    # Keep only chunks with enough substance (>20 chars) to embed meaningfully.
    documents = [c.strip() for c in chunk_text(content) if len(c.strip()) > 20]
    # One source entry per chunk, all pointing at the same uploaded file.
    sources = [file_path] * len(documents)
    return documents, sources
# -------------------- SEMANTIC SEARCH --------------------
def semantic_search(uploaded_file, query):
    """Search an uploaded document for the chunks most similar to *query*.

    Builds a fresh FAISS inner-product index over L2-normalized chunk
    embeddings (inner product == cosine similarity) and returns up to the
    top-3 chunks scoring >= 0.35, formatted as a report string.

    Args:
        uploaded_file: Gradio file object (or None).
        query: Free-text search query.

    Returns:
        A human-readable results string, or an explanatory message when the
        input is missing or no strong match is found.
    """
    if uploaded_file is None:
        return "Please upload a document."
    if not query.strip():
        return "Please enter a query."
    documents, sources = process_uploaded_file(uploaded_file)
    if not documents:
        return "Could not extract readable text from the uploaded file."
    # Embed chunks and normalize so inner product equals cosine similarity.
    embeddings = model.encode(documents, convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(embeddings)
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    # Encode the query the same way.
    query_vec = model.encode([query], convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(query_vec)
    # BUGFIX: never request more neighbours than documents exist — FAISS
    # pads missing results with index -1, which would silently mis-index
    # documents[-1] / sources[-1] below.
    top_k = min(3, len(documents))
    D, I = index.search(query_vec, top_k)
    result = ""
    for rank, idx in enumerate(I[0]):
        # Guard against -1 padding and apply the similarity threshold.
        if idx >= 0 and D[0][rank] >= 0.35:
            result += (
                f"Rank: {rank + 1}\n"
                f"Source: {sources[idx]}\n"
                f"Similarity Score: {D[0][rank]:.4f}\n"
                f"Text: {documents[idx][:300]}\n\n"
            )
    if not result:
        return "No strong semantic matches found."
    return result
# -------------------- GRADIO UI --------------------
# Single-function interface: (uploaded file, query text) -> results string.
# The whole pipeline (extract -> chunk -> embed -> search) runs per request.
iface = gr.Interface(
    fn=semantic_search,
    inputs=[
        gr.File(label="Upload Document (PDF / DOCX / TXT)"),
        gr.Textbox(label="Enter your query")
    ],
    outputs=gr.Textbox(label="Search Results"),
    title="Semantic Document Search (Upload-Based)",
    description="Upload a document and search its content based on meaning using FAISS and embeddings"
)
# Starts the local web server (blocking call).
iface.launch()