indhupamula committed on
Commit
c055550
·
verified ·
1 Parent(s): 4c10cc4

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -0
app.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import faiss
4
+ import numpy as np
5
+ import gradio as gr
6
+ from sentence_transformers import SentenceTransformer
7
+ from PyPDF2 import PdfReader
8
+ from docx import Document
9
+
10
# -------------------- LOAD MODEL --------------------
# Sentence-embedding model, loaded once at module import time and shared by
# the index build below and by semantic_search().
model = SentenceTransformer("all-MiniLM-L6-v2")
12
+
13
# -------------------- TEXT EXTRACTION --------------------
def extract_text(file_path):
    """Return the plain text content of a .pdf, .docx, or .txt file.

    The result is stripped of leading/trailing whitespace. Files with any
    other (or no) extension yield an empty string.
    """
    text = ""
    # Compare extensions case-insensitively so .PDF / .Docx etc. also match.
    lower_path = file_path.lower()
    if lower_path.endswith(".pdf"):
        reader = PdfReader(file_path)
        for page in reader.pages:
            # extract_text() can return None (e.g. image-only pages);
            # call it once per page instead of twice.
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    elif lower_path.endswith(".docx"):
        doc = Document(file_path)
        for para in doc.paragraphs:
            text += para.text + "\n"
    elif lower_path.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    return text.strip()
29
+
30
# -------------------- CHUNKING --------------------
def chunk_text(text, chunk_size=300):
    """Split *text* into chunks of at most chunk_size whitespace-separated words."""
    words = text.split()
    chunks = []
    for start in range(0, len(words), chunk_size):
        piece = words[start:start + chunk_size]
        chunks.append(" ".join(piece))
    return chunks
37
+
38
# -------------------- LOAD DOCUMENTS --------------------
def load_documents(folder="documents"):
    """Scan *folder* for .pdf/.docx/.txt files and return (docs, sources).

    docs is a list of text chunks (each > 20 chars after stripping) and
    sources is the parallel list of originating file names. Returns two
    empty lists if the folder does not exist.
    """
    docs = []
    sources = []

    if not os.path.exists(folder):
        return [], []

    # sorted() makes chunk/index ordering deterministic across platforms;
    # lower() keeps the filter consistent with extract_text's extension check.
    for file in sorted(os.listdir(folder)):
        if file.lower().endswith((".pdf", ".docx", ".txt")):
            path = os.path.join(folder, file)
            content = extract_text(path)
            chunks = chunk_text(content)

            for chunk in chunks:
                chunk = chunk.strip()
                if len(chunk) > 20:  # drop near-empty fragments
                    docs.append(chunk)
                    sources.append(file)

    return docs, sources
58
+
59
# Build the corpus once at startup; fail fast if there is nothing to search.
documents, sources = load_documents()

if len(documents) == 0:
    raise RuntimeError("No documents found in the documents folder.")

# -------------------- BUILD FAISS INDEX --------------------
# Embed every chunk, L2-normalize, and use an inner-product index so that
# search scores are cosine similarities in [-1, 1].
embeddings = model.encode(documents, convert_to_numpy=True).astype("float32")
faiss.normalize_L2(embeddings)

index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)
70
+
71
# -------------------- SEARCH FUNCTION --------------------
def semantic_search(query, top_k=3, min_score=0.35):
    """Return a formatted report of the top_k chunks most similar to *query*.

    Results scoring below min_score (cosine similarity) are omitted; if none
    remain, a "no matches" message is returned instead.
    """
    query_vec = model.encode([query]).astype("float32")
    faiss.normalize_L2(query_vec)  # scores become cosine similarity

    D, I = index.search(query_vec, top_k)

    results = []
    for rank, idx in enumerate(I[0]):
        # FAISS pads with idx == -1 when the index holds fewer than top_k
        # vectors; -1 would otherwise silently select the LAST document.
        if idx < 0 or D[0][rank] < min_score:
            continue
        results.append(
            f"Rank: {rank+1}\n"
            f"Source: {sources[idx]}\n"
            f"Similarity Score: {D[0][rank]:.4f}\n"
            f"Text: {documents[idx][:300]}\n\n"
        )

    if not results:
        return "No strong semantic matches found."

    return "".join(results)
92
+
93
# -------------------- GRADIO UI --------------------
# Single-textbox-in, single-textbox-out interface over semantic_search.
query_box = gr.Textbox(label="Enter your query")
results_box = gr.Textbox(label="Search Results")

iface = gr.Interface(
    fn=semantic_search,
    inputs=query_box,
    outputs=results_box,
    title="Semantic Document Search",
    description="Search documents based on meaning using FAISS and embeddings",
)

iface.launch()