Spaces:

indhupamula
/

project10

Sleeping

App Files Files Community

indhupamula committed on Dec 20, 2025

Commit

c055550

verified ·

1 Parent(s): 4c10cc4

Create app.py

Browse files

Files changed (1) hide show

app.py +102 -0

app.py ADDED Viewed

	@@ -0,0 +1,102 @@

import os
import re
from pathlib import Path

import faiss
import gradio as gr
import numpy as np
from docx import Document
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
# -------------------- LOAD MODEL --------------------
# A single sentence-embedding model is loaded once at import time and shared:
# it embeds the document chunks when the index is built and every query later.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)
# -------------------- TEXT EXTRACTION --------------------
def extract_text(file_path):
    """Return the plain text contained in a PDF, DOCX, or TXT file.

    The format is chosen from the file extension, compared
    case-insensitively so ``REPORT.PDF`` is handled like ``report.pdf``.

    Args:
        file_path: Location of the file, as a string or ``os.PathLike``.

    Returns:
        The extracted text with leading/trailing whitespace removed, or an
        empty string when the extension is unsupported or no text was found.
    """
    path = os.fspath(file_path)
    lowered = path.lower()

    if lowered.endswith(".pdf"):
        # Extract each page exactly once (PDF text extraction is expensive);
        # pages that yield no text are skipped.
        page_texts = (page.extract_text() for page in PdfReader(path).pages)
        text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)
    elif lowered.endswith(".docx"):
        text = "".join(f"{para.text}\n" for para in Document(path).paragraphs)
    elif lowered.endswith(".txt"):
        # "utf-8-sig" drops a leading byte-order mark so it does not end up
        # glued to the first word; errors="replace" keeps one badly encoded
        # file from raising UnicodeDecodeError and aborting the whole load.
        with open(path, encoding="utf-8-sig", errors="replace") as handle:
            text = handle.read()
    else:
        text = ""

    return text.strip()
# -------------------- CHUNKING --------------------
def chunk_text(text, chunk_size=300, overlap=0):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words.

    Args:
        text: The text to split. Runs of whitespace are collapsed because the
            words are re-joined with single spaces.
        chunk_size: Maximum number of words per chunk; must be at least 1.
        overlap: Number of trailing words of one chunk repeated at the start
            of the next. The default of 0 produces disjoint chunks.

    Returns:
        A list of chunk strings in document order; empty when *text* holds
        no words.

    Raises:
        ValueError: If *chunk_size* is below 1, or *overlap* is negative or
            not smaller than *chunk_size*.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a chunk reaches the final word, any further window would only
        # repeat words already emitted (this matters when overlap > 0).
        if start + chunk_size >= len(words):
            break
    return chunks
# -------------------- LOAD DOCUMENTS --------------------
def load_documents(folder="documents", min_chunk_chars=20):
    """Read every supported file in *folder* and split it into text chunks.

    Only regular files whose names end in ``.pdf``, ``.docx`` or ``.txt``
    (case-insensitive) are read; sub-directories are not descended into.

    Args:
        folder: Directory to scan.
        min_chunk_chars: Chunks must be longer than this many characters to
            be kept; shorter fragments are discarded.

    Returns:
        A ``(docs, sources)`` tuple of two parallel lists: ``docs[i]`` is a
        chunk of text and ``sources[i]`` is the name of the file it came
        from. Both lists are empty when *folder* is not an existing
        directory or contains nothing usable.
    """
    folder_path = Path(folder)
    if not folder_path.is_dir():
        return [], []

    supported = (".pdf", ".docx", ".txt")
    docs = []
    chunk_sources = []
    # Directory listing order is arbitrary; sorting keeps chunk positions
    # (and therefore the search index built from them) stable across runs.
    for path in sorted(folder_path.iterdir()):
        if not path.name.lower().endswith(supported):
            continue
        # A directory may carry a matching name (e.g. "scans.pdf/"); skip it
        # rather than handing it to the file parsers.
        if not path.is_file():
            continue
        for chunk in chunk_text(extract_text(str(path))):
            chunk = chunk.strip()
            if len(chunk) > min_chunk_chars:
                docs.append(chunk)
                chunk_sources.append(path.name)
    return docs, chunk_sources
# Load and chunk the corpus once at startup; without any chunks there is
# nothing to index, so stop immediately with a clear error.
documents, sources = load_documents()
if not documents:
    raise RuntimeError("No documents found in the documents folder.")
# -------------------- BUILD FAISS INDEX --------------------
# Vectors are L2-normalised in place before being added to an inner-product
# index, so the scores returned by a search are cosine similarities.
embeddings = model.encode(documents, convert_to_numpy=True)
embeddings = embeddings.astype("float32")
faiss.normalize_L2(embeddings)
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)
# -------------------- SEARCH FUNCTION --------------------
def semantic_search(query, top_k=3, min_score=0.35):
    """Return a formatted report of the chunks most similar to *query*.

    The query is embedded with the module-level ``model``, L2-normalised and
    looked up in the module-level FAISS ``index``.

    Args:
        query: Free-text search string.
        top_k: Maximum number of nearest chunks to retrieve.
        min_score: Minimum similarity score a chunk needs to be reported.

    Returns:
        One text entry per reported chunk (rank, source file, score and the
        first 300 characters of the chunk), or the message
        ``"No strong semantic matches found."`` when nothing qualifies.
    """
    no_match = "No strong semantic matches found."

    # A blank query carries no meaning to match against.
    if not query or not query.strip():
        return no_match

    # Never request more neighbours than the index holds.
    k = min(top_k, index.ntotal)
    if k < 1:
        return no_match

    query_vec = model.encode([query], convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(query_vec)
    scores, ids = index.search(query_vec, k)

    entries = []
    for rank, (score, idx) in enumerate(zip(scores[0], ids[0]), start=1):
        # FAISS marks an empty result slot with label -1; as a Python index
        # that would silently select the last document, so reject it.
        if idx >= 0 and score >= min_score:
            entries.append(
                f"Rank: {rank}\n"
                f"Source: {sources[idx]}\n"
                f"Similarity Score: {score:.4f}\n"
                f"Text: {documents[idx][:300]}\n\n"
            )

    if not entries:
        return no_match
    return "".join(entries)
# -------------------- GRADIO UI --------------------
# One text box in, one text box out: the query goes to semantic_search and
# its formatted report is displayed as-is.
iface = gr.Interface(
    fn=semantic_search,
    inputs=gr.Textbox(label="Enter your query"),
    outputs=gr.Textbox(label="Search Results"),
    title="Semantic Document Search",
    description="Search documents based on meaning using FAISS and embeddings",
)

# Start the web server only when this file is executed as a script, so that
# importing the module (e.g. to reuse semantic_search) does not launch the UI.
if __name__ == "__main__":
    iface.launch()