# project10 / app.py
# (Hugging Face Space file; uploaded by indhupamula, revision 1a8bb36 verified)
import os
import re
import faiss
import numpy as np
import gradio as gr
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
from docx import Document
# -------------------- LOAD MODEL --------------------
# Sentence-transformer encoder used for both the document chunks and the
# incoming queries; fetched from the Hugging Face hub on first run.
model = SentenceTransformer("all-MiniLM-L6-v2")
# -------------------- TEXT EXTRACTION --------------------
def extract_text(file_path):
    """Return the plain text of a .pdf, .docx or .txt file.

    Page/paragraph breaks become newlines and the result is stripped of
    leading/trailing whitespace.  Unsupported extensions yield "".
    """
    parts = []
    if file_path.endswith(".pdf"):
        reader = PdfReader(file_path)
        for page in reader.pages:
            # extract_text() can return None/"" on image-only pages; call it
            # once per page (the original called it twice) and keep only
            # non-empty results.
            page_text = page.extract_text()
            if page_text:
                parts.append(page_text)
    elif file_path.endswith(".docx"):
        doc = Document(file_path)
        for para in doc.paragraphs:
            parts.append(para.text)
    elif file_path.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read().strip()
    # join once instead of repeated `+=` (quadratic in the worst case).
    return "\n".join(parts).strip()
# -------------------- CHUNKING --------------------
# -------------------- CHUNKING --------------------
def chunk_text(text, chunk_size=300):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words."""
    words = text.split()
    chunks = []
    for start in range(0, len(words), chunk_size):
        chunks.append(" ".join(words[start:start + chunk_size]))
    return chunks
# -------------------- LOAD DOCUMENTS (ROOT DIRECTORY) --------------------
# -------------------- LOAD DOCUMENTS (ROOT DIRECTORY) --------------------
def load_documents():
    """Scan the working directory for .pdf/.docx/.txt files and chunk them.

    Returns ``(docs, sources)``: parallel lists of text chunks and the
    filename each chunk came from.  Falls back to a single placeholder
    chunk so the FAISS index is never built from an empty list.
    """
    skip = {"requirements.txt", "app.py"}
    docs, sources = [], []
    for name in os.listdir("."):
        if not name.endswith((".pdf", ".docx", ".txt")) or name in skip:
            continue
        for chunk in chunk_text(extract_text(name)):
            cleaned = chunk.strip()
            # Ignore near-empty fragments (stray whitespace, tiny headers).
            if len(cleaned) > 20:
                docs.append(cleaned)
                sources.append(name)
    # ABSOLUTE SAFETY FALLBACK
    if not docs:
        docs = [
            "Artificial intelligence and databases are important computer science topics."
        ]
        sources = ["fallback.txt"]
    return docs, sources
documents, sources = load_documents()
# -------------------- BUILD FAISS INDEX --------------------
# Encode every chunk, L2-normalize, and index with inner product: on
# unit-norm vectors the IP score equals cosine similarity.
embeddings = model.encode(documents, convert_to_numpy=True).astype("float32")
faiss.normalize_L2(embeddings)
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)
# -------------------- SEARCH FUNCTION --------------------
# -------------------- SEARCH FUNCTION --------------------
def semantic_search(query):
    """Return a text report of the top-3 chunks matching *query*.

    Only hits with cosine similarity >= 0.35 are shown.  A friendly
    message is returned for blank queries or when nothing clears the
    threshold.
    """
    if query.strip() == "":
        return "Please enter a query."
    query_vec = model.encode([query]).astype("float32")
    faiss.normalize_L2(query_vec)
    D, I = index.search(query_vec, 3)
    hits = []
    for rank, idx in enumerate(I[0]):
        # BUG FIX: FAISS pads I with -1 when the index holds fewer than k
        # vectors; indexing with -1 would silently alias the LAST document.
        if idx < 0:
            continue
        if D[0][rank] >= 0.35:
            hits.append(
                f"Rank: {rank + 1}\n"
                f"Source: {sources[idx]}\n"
                f"Similarity Score: {D[0][rank]:.4f}\n"
                f"Text: {documents[idx][:300]}\n\n"
            )
    if not hits:
        return "No strong semantic matches found."
    # join once instead of repeated string +=.
    return "".join(hits)
# -------------------- GRADIO UI --------------------
# -------------------- GRADIO UI --------------------
# Single textbox in, single textbox out: the query goes straight to
# semantic_search and the formatted ranking text is shown verbatim.
iface = gr.Interface(
    fn=semantic_search,
    inputs=gr.Textbox(label="Enter your query"),
    outputs=gr.Textbox(label="Search Results"),
    title="Semantic Document Search",
    description="Search documents based on meaning using FAISS and Sentence Transformers"
)
# Blocks until the server stops; Spaces runs this as the app entry point.
iface.launch()