File size: 3,332 Bytes
5702037
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import os
import re
import faiss
import numpy as np
import gradio as gr
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
from docx import Document

# -------------------- LOAD MODEL --------------------
# Sentence-embedding model, loaded once at module import so every search
# request reuses the same weights instead of re-loading per call.
model = SentenceTransformer("all-MiniLM-L6-v2")

# -------------------- TEXT EXTRACTION --------------------
def extract_text(file_path):
    """Return the plain text content of a PDF, DOCX, or TXT file.

    The extension check is case-insensitive (``.PDF``, ``.Txt`` etc. are
    accepted).  Files with an unsupported extension yield an empty string.

    Args:
        file_path: Path to the document on disk.

    Returns:
        The extracted text with surrounding whitespace stripped; ``""``
        when nothing could be extracted.
    """
    text = ""
    # Normalize the extension so matching is case-insensitive; the original
    # endswith() checks silently rejected upper-case extensions.
    suffix = os.path.splitext(file_path)[1].lower()
    if suffix == ".pdf":
        reader = PdfReader(file_path)
        pages = []
        for page in reader.pages:
            # extract_text() is expensive and may return None for
            # image-only pages — call it once and keep the result.
            page_text = page.extract_text()
            if page_text:
                pages.append(page_text)
        # Join at the end instead of += in the loop (one pass, no
        # quadratic string rebuilding on large PDFs).
        text = "\n".join(pages) + "\n" if pages else ""
    elif suffix == ".docx":
        doc = Document(file_path)
        text = "".join(para.text + "\n" for para in doc.paragraphs)
    elif suffix == ".txt":
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    return text.strip()

# -------------------- CHUNKING --------------------
def chunk_text(text, chunk_size=300):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words.

    Args:
        text: The document text to split.
        chunk_size: Maximum number of words per chunk (default 300).

    Returns:
        A list of space-joined word chunks; empty list for empty/blank text.
    """
    tokens = text.split()
    chunks = []
    for start in range(0, len(tokens), chunk_size):
        chunks.append(" ".join(tokens[start:start + chunk_size]))
    return chunks

# -------------------- PROCESS UPLOADED FILE --------------------
def process_uploaded_file(uploaded_file):
    """Extract and chunk the text of an uploaded file.

    Args:
        uploaded_file: A Gradio file object exposing a ``.name`` path,
            or ``None`` when nothing was uploaded.

    Returns:
        ``(documents, sources)`` — parallel lists of text chunks (longer
        than 20 characters after stripping) and their source filename —
        or ``(None, None)`` when no usable text could be extracted.
    """
    # Guard clauses: nothing uploaded, or nothing extractable.
    if uploaded_file is None:
        return None, None

    text = extract_text(uploaded_file.name)
    if text.strip() == "":
        return None, None

    # Keep only chunks with real content; very short fragments add noise.
    documents = []
    for chunk in chunk_text(text):
        trimmed = chunk.strip()
        if len(trimmed) > 20:
            documents.append(trimmed)

    # One source entry per kept chunk, all pointing at the same upload.
    sources = [uploaded_file.name] * len(documents)
    return documents, sources

# -------------------- SEMANTIC SEARCH --------------------
def semantic_search(uploaded_file, query):
    """Return the top chunks of the uploaded document most similar to *query*.

    Builds an in-memory FAISS inner-product index over L2-normalized chunk
    embeddings (inner product of unit vectors == cosine similarity) and
    returns up to 3 matches scoring at least 0.35.

    Args:
        uploaded_file: Gradio file object (or ``None``).
        query: Free-text search query.

    Returns:
        A formatted, human-readable result string (also used for error
        messages, since the Gradio output is a single textbox).
    """
    if uploaded_file is None:
        return "Please upload a document."

    if query.strip() == "":
        return "Please enter a query."

    documents, sources = process_uploaded_file(uploaded_file)

    if not documents:
        return "Could not extract readable text from the uploaded file."

    # Embed all chunks and normalize so inner product == cosine similarity.
    embeddings = model.encode(documents, convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(embeddings)

    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)

    query_vec = model.encode([query], convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(query_vec)

    # BUG FIX: never request more neighbors than indexed vectors.  FAISS
    # pads missing results with idx == -1, and sources[-1]/documents[-1]
    # would silently read the *last* chunk instead of skipping.
    top_k = min(3, len(documents))
    D, I = index.search(query_vec, top_k)

    sections = []
    for rank, idx in enumerate(I[0]):
        if idx < 0:
            # Defensive: padded (invalid) result slot.
            continue
        score = float(D[0][rank])
        if score >= 0.35:  # similarity cutoff for a "strong" match
            sections.append(
                f"Rank: {rank + 1}\n"
                f"Source: {sources[idx]}\n"
                f"Similarity Score: {score:.4f}\n"
                f"Text: {documents[idx][:300]}\n\n"
            )

    if not sections:
        return "No strong semantic matches found."

    # join() instead of += accumulation: single pass, no quadratic rebuild.
    return "".join(sections)

# -------------------- GRADIO UI --------------------
# Single-function web UI: one file upload + one query textbox in,
# one results textbox out, wired directly to semantic_search().
iface = gr.Interface(
    fn=semantic_search,
    inputs=[
        gr.File(label="Upload Document (PDF / DOCX / TXT)"),
        gr.Textbox(label="Enter your query")
    ],
    outputs=gr.Textbox(label="Search Results"),
    title="Semantic Document Search (Upload-Based)",
    description="Upload a document and search its content based on meaning using FAISS and embeddings"
)

# Starts the local web server; blocks until the app is stopped.
iface.launch()