# Semantic Document Search — Gradio app (FAISS + Sentence Transformers)
import os
import re
import faiss
import numpy as np
import gradio as gr
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
from docx import Document
# -------------------- LOAD MODEL --------------------
# Shared sentence-embedding model, used both to embed the document chunks
# at startup and to embed each incoming query in semantic_search.
model = SentenceTransformer("all-MiniLM-L6-v2")
# -------------------- TEXT EXTRACTION --------------------
def extract_text(file_path):
    """Extract plain text from a .pdf, .docx, or .txt file.

    Unsupported extensions yield an empty string.  The result is stripped
    of leading/trailing whitespace.
    """
    text = ""
    if file_path.endswith(".pdf"):
        reader = PdfReader(file_path)
        for page in reader.pages:
            # extract_text() can return None/"" for image-only pages.
            # Call it once per page (the original called it twice).
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    elif file_path.endswith(".docx"):
        doc = Document(file_path)
        text = "\n".join(para.text for para in doc.paragraphs)
    elif file_path.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    return text.strip()
# -------------------- CHUNKING --------------------
def chunk_text(text, chunk_size=300):
    """Split text into chunks of at most `chunk_size` whitespace-separated words."""
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        chunks.append(" ".join(words[start:start + chunk_size]))
        start += chunk_size
    return chunks
# -------------------- LOAD DOCUMENTS (ROOT DIRECTORY) --------------------
def load_documents():
    """Scan the current directory for .pdf/.docx/.txt files and chunk them.

    Returns (docs, sources): parallel lists of chunk text and the filename
    each chunk came from.  Falls back to a single placeholder chunk when no
    usable content is found, so downstream indexing never sees empty input.
    """
    docs, sources = [], []
    skip = {"requirements.txt", "app.py"}
    for name in os.listdir("."):
        if not name.endswith((".pdf", ".docx", ".txt")):
            continue
        if name in skip:
            continue
        for piece in chunk_text(extract_text(name)):
            piece = piece.strip()
            # drop tiny fragments that carry no real content
            if len(piece) > 20:
                docs.append(piece)
                sources.append(name)
    # ABSOLUTE SAFETY FALLBACK
    if not docs:
        docs = [
            "Artificial intelligence and databases are important computer science topics."
        ]
        sources = ["fallback.txt"]
    return docs, sources
documents, sources = load_documents()
# -------------------- BUILD FAISS INDEX --------------------
# Embed every chunk, L2-normalize the vectors, and index them with an
# inner-product index so that search scores are cosine similarities.
embeddings = model.encode(documents, convert_to_numpy=True).astype("float32")
faiss.normalize_L2(embeddings)
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)
# -------------------- SEARCH FUNCTION --------------------
def semantic_search(query):
    """Return the top-3 document chunks most similar to `query`.

    Matches with cosine similarity below 0.35 are dropped; returns a
    human-readable message when the query is blank or nothing matches.
    """
    if query.strip() == "":
        return "Please enter a query."
    query_vec = model.encode([query]).astype("float32")
    faiss.normalize_L2(query_vec)  # cosine similarity via inner product
    scores, indices = index.search(query_vec, 3)
    parts = []
    for rank, idx in enumerate(indices[0]):
        # FAISS pads with idx == -1 when the index holds fewer than k
        # vectors; without this guard, sources[-1]/documents[-1] would
        # silently report the *last* document.
        if idx < 0:
            continue
        score = scores[0][rank]
        if score >= 0.35:
            parts.append(
                f"Rank: {rank + 1}\n"
                f"Source: {sources[idx]}\n"
                f"Similarity Score: {score:.4f}\n"
                f"Text: {documents[idx][:300]}\n\n"
            )
    if not parts:
        return "No strong semantic matches found."
    return "".join(parts)
# -------------------- GRADIO UI --------------------
# Wire the search function to a simple text-in / text-out interface.
query_input = gr.Textbox(label="Enter your query")
results_output = gr.Textbox(label="Search Results")

iface = gr.Interface(
    fn=semantic_search,
    inputs=query_input,
    outputs=results_output,
    title="Semantic Document Search",
    description="Search documents based on meaning using FAISS and Sentence Transformers",
)
iface.launch()