# Semantic Document Search — Gradio app (FAISS + Sentence Transformers)
import os
import re
import faiss
import numpy as np
import gradio as gr
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
from docx import Document
# -------------------- LOAD MODEL --------------------
# Shared sentence-embedding model, used both to embed the document chunks
# at startup and to embed each incoming query in semantic_search.
model = SentenceTransformer("all-MiniLM-L6-v2")
# -------------------- TEXT EXTRACTION --------------------
def extract_text(file_path):
    """Extract plain text from a .pdf, .docx, or .txt file.

    Unsupported extensions yield an empty string.  The result is stripped
    of leading/trailing whitespace.
    """
    text = ""
    if file_path.endswith(".pdf"):
        reader = PdfReader(file_path)
        for page in reader.pages:
            # extract_text() can return None/"" for image-only pages.
            # Call it once per page (the original called it twice).
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    elif file_path.endswith(".docx"):
        doc = Document(file_path)
        text = "\n".join(para.text for para in doc.paragraphs)
    elif file_path.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    return text.strip()
# -------------------- CHUNKING --------------------
def chunk_text(text, chunk_size=300):
    """Split text into chunks of at most `chunk_size` whitespace-separated words."""
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        chunks.append(" ".join(words[start:start + chunk_size]))
        start += chunk_size
    return chunks
# -------------------- LOAD DOCUMENTS (ROOT DIRECTORY) --------------------
def load_documents():
    """Scan the current directory for .pdf/.docx/.txt files and chunk them.

    Returns (docs, sources): parallel lists of chunk text and the filename
    each chunk came from.  Falls back to a single placeholder chunk when no
    usable content is found, so downstream indexing never sees empty input.
    """
    docs, sources = [], []
    skip = {"requirements.txt", "app.py"}
    for name in os.listdir("."):
        if not name.endswith((".pdf", ".docx", ".txt")):
            continue
        if name in skip:
            continue
        for piece in chunk_text(extract_text(name)):
            piece = piece.strip()
            # drop tiny fragments that carry no real content
            if len(piece) > 20:
                docs.append(piece)
                sources.append(name)
    # ABSOLUTE SAFETY FALLBACK
    if not docs:
        docs = [
            "Artificial intelligence and databases are important computer science topics."
        ]
        sources = ["fallback.txt"]
    return docs, sources
documents, sources = load_documents()
# -------------------- BUILD FAISS INDEX --------------------
# Embed every chunk, L2-normalize the vectors, and index them with an
# inner-product index so that search scores are cosine similarities.
embeddings = model.encode(documents, convert_to_numpy=True).astype("float32")
faiss.normalize_L2(embeddings)
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)
# -------------------- SEARCH FUNCTION --------------------
def semantic_search(query):
    """Return the top-3 document chunks most similar to `query`.

    Matches with cosine similarity below 0.35 are dropped; returns a
    human-readable message when the query is blank or nothing matches.
    """
    if query.strip() == "":
        return "Please enter a query."
    query_vec = model.encode([query]).astype("float32")
    faiss.normalize_L2(query_vec)  # cosine similarity via inner product
    scores, indices = index.search(query_vec, 3)
    parts = []
    for rank, idx in enumerate(indices[0]):
        # FAISS pads with idx == -1 when the index holds fewer than k
        # vectors; without this guard, sources[-1]/documents[-1] would
        # silently report the *last* document.
        if idx < 0:
            continue
        score = scores[0][rank]
        if score >= 0.35:
            parts.append(
                f"Rank: {rank + 1}\n"
                f"Source: {sources[idx]}\n"
                f"Similarity Score: {score:.4f}\n"
                f"Text: {documents[idx][:300]}\n\n"
            )
    if not parts:
        return "No strong semantic matches found."
    return "".join(parts)
# -------------------- GRADIO UI --------------------
# Wire the search function to a simple text-in / text-out interface.
query_input = gr.Textbox(label="Enter your query")
results_output = gr.Textbox(label="Search Results")

iface = gr.Interface(
    fn=semantic_search,
    inputs=query_input,
    outputs=results_output,
    title="Semantic Document Search",
    description="Search documents based on meaning using FAISS and Sentence Transformers",
)
iface.launch()