# Hugging Face Spaces app — semantic document search (scrape header "Spaces: / Sleeping" removed)
| import os | |
| import re | |
| import faiss | |
| import numpy as np | |
| import gradio as gr | |
| from sentence_transformers import SentenceTransformer | |
| from PyPDF2 import PdfReader | |
| from docx import Document | |
# -------------------- LOAD MODEL --------------------
# Shared sentence-embedding model; encodes both document chunks and queries.
model = SentenceTransformer("all-MiniLM-L6-v2")
# -------------------- TEXT EXTRACTION --------------------
def extract_text(file_path):
    """Return the plain text of a .pdf, .docx, or .txt file.

    Extension matching is case-insensitive ("REPORT.PDF" works like
    "report.pdf"). Unsupported extensions yield an empty string.

    :param file_path: path to the document on disk
    :return: extracted text with surrounding whitespace stripped
    """
    text = ""
    lower_path = file_path.lower()
    if lower_path.endswith(".pdf"):
        reader = PdfReader(file_path)
        for page in reader.pages:
            # extract_text() may return None for image-only pages; call it
            # once per page instead of twice as before.
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    elif lower_path.endswith(".docx"):
        doc = Document(file_path)
        for para in doc.paragraphs:
            text += para.text + "\n"
    elif lower_path.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    return text.strip()
# -------------------- CHUNKING --------------------
def chunk_text(text, chunk_size=300):
    """Split *text* into consecutive chunks of at most *chunk_size* words."""
    words = text.split()
    chunks = []
    for start in range(0, len(words), chunk_size):
        chunks.append(" ".join(words[start:start + chunk_size]))
    return chunks
# -------------------- PROCESS UPLOADED FILE --------------------
def process_uploaded_file(uploaded_file):
    """Extract, chunk, and filter the text of an uploaded file.

    :param uploaded_file: object with a ``.name`` path attribute, or None
    :return: (documents, sources) parallel lists, or (None, None) when no
        usable text could be extracted
    """
    if uploaded_file is None:
        return None, None
    raw_text = extract_text(uploaded_file.name)
    if not raw_text.strip():
        return None, None
    # Keep only chunks with enough content to be worth indexing.
    documents = [
        piece.strip()
        for piece in chunk_text(raw_text)
        if len(piece.strip()) > 20
    ]
    sources = [uploaded_file.name] * len(documents)
    return documents, sources
# -------------------- SEMANTIC SEARCH --------------------
def semantic_search(uploaded_file, query):
    """Return the document chunks most semantically similar to *query*.

    Builds a cosine-similarity FAISS index over the uploaded document's
    chunks and formats up to three matches scoring >= 0.35; returns a
    human-readable message string when input is missing or nothing matches.

    :param uploaded_file: object with a ``.name`` path attribute, or None
    :param query: free-text search query
    :return: formatted result string
    """
    if uploaded_file is None:
        return "Please upload a document."
    if query.strip() == "":
        return "Please enter a query."
    documents, sources = process_uploaded_file(uploaded_file)
    if documents is None or len(documents) == 0:
        return "Could not extract readable text from the uploaded file."
    # Embed chunks and L2-normalize so inner product == cosine similarity.
    embeddings = model.encode(documents, convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(embeddings)
    # Build FAISS index (inner product on normalized vectors).
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    # Encode query the same way as the corpus.
    query_vec = model.encode([query], convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(query_vec)
    # Never request more neighbors than there are chunks: FAISS pads the
    # result with index -1 otherwise, and sources[-1]/documents[-1] would
    # silently mis-attribute the last chunk.
    top_k = min(3, len(documents))
    D, I = index.search(query_vec, top_k)
    result = ""
    for rank, idx in enumerate(I[0]):
        if idx < 0:  # defensive: skip any FAISS padding entries
            continue
        if D[0][rank] >= 0.35:
            result += (
                f"Rank: {rank + 1}\n"
                f"Source: {sources[idx]}\n"
                f"Similarity Score: {D[0][rank]:.4f}\n"
                f"Text: {documents[idx][:300]}\n\n"
            )
    if result == "":
        return "No strong semantic matches found."
    return result
# -------------------- GRADIO UI --------------------
iface = gr.Interface(
    fn=semantic_search,
    inputs=[
        gr.File(label="Upload Document (PDF / DOCX / TXT)"),
        gr.Textbox(label="Enter your query"),
    ],
    outputs=gr.Textbox(label="Search Results"),
    title="Semantic Document Search (Upload-Based)",
    description="Upload a document and search its content based on meaning using FAISS and embeddings",
)

# Launch only when run as a script so importing this module (e.g. for
# testing) does not start a web server.
if __name__ == "__main__":
    iface.launch()