heerjtdev committed on
Commit
1049876
·
verified ·
1 Parent(s): d443dc8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -67
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  # import gradio as gr
2
  # import fitz # PyMuPDF
3
  # from langchain_text_splitters import RecursiveCharacterTextSplitter
@@ -10,22 +11,32 @@
10
  # class VectorSystem:
11
  # def __init__(self):
12
  # self.vector_store = None
13
- # # Use a lightweight CPU-friendly model
14
  # self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
15
 
16
- # def process_pdf(self, file_obj):
17
- # """Extracts text from PDF and builds the Vector Index"""
18
  # if file_obj is None:
19
  # return "No file uploaded."
20
 
21
  # try:
22
- # # 1. Extract Text
23
- # doc = fitz.open(file_obj.name)
24
  # text = ""
25
- # for page in doc:
26
- # text += page.get_text()
27
 
28
- # # 2. Split Text into Chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  # text_splitter = RecursiveCharacterTextSplitter(
30
  # chunk_size=800,
31
  # chunk_overlap=150,
@@ -34,28 +45,25 @@
34
  # chunks = text_splitter.split_text(text)
35
 
36
  # if not chunks:
37
- # return "Could not extract text. Is the PDF scanned images?"
38
 
39
  # # 3. Build Vector Index (FAISS)
40
  # self.vector_store = FAISS.from_texts(chunks, self.embeddings)
41
 
42
- # return f"βœ… Success! Indexed {len(chunks)} text chunks from the PDF."
43
 
44
  # except Exception as e:
45
- # return f"Error processing PDF: {str(e)}"
46
 
47
  # def retrieve_evidence(self, question, student_answer):
48
- # """Finds relevant text chunks based on the Question"""
49
  # if not self.vector_store:
50
- # return "⚠️ Please upload and process a PDF first."
51
 
52
  # if not question:
53
  # return "⚠️ Please enter a Question."
54
 
55
- # # We search primarily using the Question to find the 'Ground Truth' in the text.
56
  # docs = self.vector_store.similarity_search(question, k=3)
57
 
58
- # # Format the output
59
  # output_text = "### πŸ” Relevant Context Found:\n\n"
60
  # for i, doc in enumerate(docs):
61
  # output_text += f"**Chunk {i+1}:**\n> {doc.page_content}\n\n"
@@ -69,28 +77,26 @@
69
  # # --- Gradio UI ---
70
 
71
  # with gr.Blocks(title="EduGenius Context Retriever") as demo:
72
- # gr.Markdown("# πŸŽ“ EduGenius: PDF Context Retriever")
73
- # gr.Markdown("Upload a chapter, ask a question, and see exactly which part of the text proves the answer right or wrong.")
74
 
75
  # with gr.Row():
76
  # with gr.Column(scale=1):
77
- # # Step 1: Upload
78
- # pdf_input = gr.File(label="1. Upload PDF Chapter", file_types=[".pdf"])
79
- # upload_btn = gr.Button("Process PDF", variant="primary")
80
  # upload_status = gr.Textbox(label="Status", interactive=False)
81
 
82
  # with gr.Column(scale=2):
83
- # # Step 2: Query
84
  # question_input = gr.Textbox(label="2. Question", placeholder="e.g., What causes the chemical reaction?")
85
  # answer_input = gr.Textbox(label="Student Answer (Optional Context)", placeholder="e.g., The heat causes it...")
86
  # search_btn = gr.Button("Find Relevant Evidence", variant="secondary")
87
 
88
- # # Output
89
  # evidence_output = gr.Markdown(label="Relevant Text Chunks")
90
 
91
  # # Event Handlers
92
  # upload_btn.click(
93
- # fn=system.process_pdf,
94
  # inputs=[pdf_input],
95
  # outputs=[upload_status]
96
  # )
@@ -101,67 +107,66 @@
101
  # outputs=[evidence_output]
102
  # )
103
 
104
- # # Launch
105
  # if __name__ == "__main__":
106
  # demo.launch()
107
 
108
 
109
 
110
 
111
-
112
-
113
  import gradio as gr
114
  import fitz # PyMuPDF
115
  from langchain_text_splitters import RecursiveCharacterTextSplitter
116
  from langchain_community.vectorstores import FAISS
117
  from langchain_huggingface import HuggingFaceEmbeddings
118
- import os
119
-
120
- # --- Backend Logic ---
121
 
122
  class VectorSystem:
123
  def __init__(self):
124
  self.vector_store = None
125
  self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
 
 
126
 
127
  def process_file(self, file_obj):
128
- """Extracts text from PDF OR TXT and builds the Vector Index"""
129
  if file_obj is None:
130
  return "No file uploaded."
131
 
132
  try:
 
133
  text = ""
134
  file_path = file_obj.name
135
 
136
- # --- LOGIC BRANCH: Detect File Type ---
137
  if file_path.lower().endswith('.pdf'):
138
- # Handle PDF
139
  doc = fitz.open(file_path)
140
- for page in doc:
141
- text += page.get_text()
142
  elif file_path.lower().endswith('.txt'):
143
- # Handle Text File
144
- with open(file_path, 'r', encoding='utf-8') as f:
145
- text = f.read()
146
  else:
147
  return "❌ Error: Only .pdf and .txt files are supported."
148
- # --------------------------------------
149
 
150
- # 2. Split Text into Chunks (Logic is identical for both)
151
  text_splitter = RecursiveCharacterTextSplitter(
152
  chunk_size=800,
153
  chunk_overlap=150,
154
  separators=["\n\n", "\n", ".", " ", ""]
155
  )
156
- chunks = text_splitter.split_text(text)
 
157
 
158
- if not chunks:
159
  return "Could not extract text. Is the file empty?"
160
 
161
- # 3. Build Vector Index (FAISS)
162
- self.vector_store = FAISS.from_texts(chunks, self.embeddings)
 
163
 
164
- return f"βœ… Success! Indexed {len(chunks)} text chunks."
 
 
 
 
 
 
165
 
166
  except Exception as e:
167
  return f"Error processing file: {str(e)}"
@@ -173,50 +178,59 @@ class VectorSystem:
173
  if not question:
174
  return "⚠️ Please enter a Question."
175
 
176
- docs = self.vector_store.similarity_search(question, k=3)
 
 
 
 
177
 
178
- output_text = "### πŸ” Relevant Context Found:\n\n"
179
- for i, doc in enumerate(docs):
180
- output_text += f"**Chunk {i+1}:**\n> {doc.page_content}\n\n"
181
 
182
- output_text += "---\n*These are the most relevant segments to grade the answer against.*"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  return output_text
184
 
185
  # Initialize System
186
  system = VectorSystem()
187
 
188
  # --- Gradio UI ---
189
-
190
  with gr.Blocks(title="EduGenius Context Retriever") as demo:
191
- gr.Markdown("# πŸŽ“ EduGenius: Context Retriever")
192
- gr.Markdown("Upload a Chapter (PDF or TXT), ask a question, and see exactly which part of the text proves the answer right or wrong.")
193
 
194
  with gr.Row():
195
  with gr.Column(scale=1):
196
- # UPDATED: Added ".txt" to file_types and changed label
197
  pdf_input = gr.File(label="1. Upload File (PDF or TXT)", file_types=[".pdf", ".txt"])
198
  upload_btn = gr.Button("Process File", variant="primary")
199
  upload_status = gr.Textbox(label="Status", interactive=False)
200
 
201
  with gr.Column(scale=2):
202
  question_input = gr.Textbox(label="2. Question", placeholder="e.g., What causes the chemical reaction?")
203
- answer_input = gr.Textbox(label="Student Answer (Optional Context)", placeholder="e.g., The heat causes it...")
204
- search_btn = gr.Button("Find Relevant Evidence", variant="secondary")
205
 
206
  evidence_output = gr.Markdown(label="Relevant Text Chunks")
207
 
208
- # Event Handlers
209
- upload_btn.click(
210
- fn=system.process_file, # Note: Function name changed
211
- inputs=[pdf_input],
212
- outputs=[upload_status]
213
- )
214
-
215
- search_btn.click(
216
- fn=system.retrieve_evidence,
217
- inputs=[question_input, answer_input],
218
- outputs=[evidence_output]
219
- )
220
 
221
  if __name__ == "__main__":
222
  demo.launch()
 
1
+
2
  # import gradio as gr
3
  # import fitz # PyMuPDF
4
  # from langchain_text_splitters import RecursiveCharacterTextSplitter
 
11
  # class VectorSystem:
12
  # def __init__(self):
13
  # self.vector_store = None
 
14
  # self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
15
 
16
+ # def process_file(self, file_obj):
17
+ # """Extracts text from PDF OR TXT and builds the Vector Index"""
18
  # if file_obj is None:
19
  # return "No file uploaded."
20
 
21
  # try:
 
 
22
  # text = ""
23
+ # file_path = file_obj.name
 
24
 
25
+ # # --- LOGIC BRANCH: Detect File Type ---
26
+ # if file_path.lower().endswith('.pdf'):
27
+ # # Handle PDF
28
+ # doc = fitz.open(file_path)
29
+ # for page in doc:
30
+ # text += page.get_text()
31
+ # elif file_path.lower().endswith('.txt'):
32
+ # # Handle Text File
33
+ # with open(file_path, 'r', encoding='utf-8') as f:
34
+ # text = f.read()
35
+ # else:
36
+ # return "❌ Error: Only .pdf and .txt files are supported."
37
+ # # --------------------------------------
38
+
39
+ # # 2. Split Text into Chunks (Logic is identical for both)
40
  # text_splitter = RecursiveCharacterTextSplitter(
41
  # chunk_size=800,
42
  # chunk_overlap=150,
 
45
  # chunks = text_splitter.split_text(text)
46
 
47
  # if not chunks:
48
+ # return "Could not extract text. Is the file empty?"
49
 
50
  # # 3. Build Vector Index (FAISS)
51
  # self.vector_store = FAISS.from_texts(chunks, self.embeddings)
52
 
53
+ # return f"βœ… Success! Indexed {len(chunks)} text chunks."
54
 
55
  # except Exception as e:
56
+ # return f"Error processing file: {str(e)}"
57
 
58
  # def retrieve_evidence(self, question, student_answer):
 
59
  # if not self.vector_store:
60
+ # return "⚠️ Please upload and process a file first."
61
 
62
  # if not question:
63
  # return "⚠️ Please enter a Question."
64
 
 
65
  # docs = self.vector_store.similarity_search(question, k=3)
66
 
 
67
  # output_text = "### πŸ” Relevant Context Found:\n\n"
68
  # for i, doc in enumerate(docs):
69
  # output_text += f"**Chunk {i+1}:**\n> {doc.page_content}\n\n"
 
77
  # # --- Gradio UI ---
78
 
79
  # with gr.Blocks(title="EduGenius Context Retriever") as demo:
80
+ # gr.Markdown("# πŸŽ“ EduGenius: Context Retriever")
81
+ # gr.Markdown("Upload a Chapter (PDF or TXT), ask a question, and see exactly which part of the text proves the answer right or wrong.")
82
 
83
  # with gr.Row():
84
  # with gr.Column(scale=1):
85
+ # # UPDATED: Added ".txt" to file_types and changed label
86
+ # pdf_input = gr.File(label="1. Upload File (PDF or TXT)", file_types=[".pdf", ".txt"])
87
+ # upload_btn = gr.Button("Process File", variant="primary")
88
  # upload_status = gr.Textbox(label="Status", interactive=False)
89
 
90
  # with gr.Column(scale=2):
 
91
  # question_input = gr.Textbox(label="2. Question", placeholder="e.g., What causes the chemical reaction?")
92
  # answer_input = gr.Textbox(label="Student Answer (Optional Context)", placeholder="e.g., The heat causes it...")
93
  # search_btn = gr.Button("Find Relevant Evidence", variant="secondary")
94
 
 
95
  # evidence_output = gr.Markdown(label="Relevant Text Chunks")
96
 
97
  # # Event Handlers
98
  # upload_btn.click(
99
+ # fn=system.process_file, # Note: Function name changed
100
  # inputs=[pdf_input],
101
  # outputs=[upload_status]
102
  # )
 
107
  # outputs=[evidence_output]
108
  # )
109
 
 
110
  # if __name__ == "__main__":
111
  # demo.launch()
112
 
113
 
114
 
115
 
 
 
116
  import gradio as gr
117
  import fitz # PyMuPDF
118
  from langchain_text_splitters import RecursiveCharacterTextSplitter
119
  from langchain_community.vectorstores import FAISS
120
  from langchain_huggingface import HuggingFaceEmbeddings
 
 
 
121
 
122
class VectorSystem:
    """Owns the FAISS vector index plus an ordered list of the raw text chunks.

    The ordered chunk list lets a search hit be mapped back to its
    neighbouring chunks by integer index.
    """

    def __init__(self):
        # No index until process_file() has run successfully.
        self.vector_store = None
        # Lightweight CPU-friendly sentence-embedding model.
        self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        # Every chunk, in document order, so neighbours can be fetched by id.
        self.all_chunks = []
128
 
129
def process_file(self, file_obj):
    """Extract text from a PDF or TXT upload, chunk it, and build the FAISS index.

    Args:
        file_obj: Gradio file wrapper exposing a ``.name`` path, or None.

    Returns:
        str: A human-readable status message for the UI.
    """
    if file_obj is None:
        return "No file uploaded."

    try:
        # 1. Extract text according to the file extension.
        text = ""
        file_path = file_obj.name

        if file_path.lower().endswith('.pdf'):
            # FIX: close the PyMuPDF document when done (handle was leaked
            # before); join pages instead of quadratic string concatenation.
            with fitz.open(file_path) as doc:
                text = "".join(page.get_text() for page in doc)
        elif file_path.lower().endswith('.txt'):
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
        else:
            return "❌ Error: Only .pdf and .txt files are supported."

        # 2. Split text into overlapping chunks (document order preserved).
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=800,
            chunk_overlap=150,
            separators=["\n\n", "\n", ".", " ", ""]
        )
        # Keep the ordered chunk list so neighbours can be looked up by id.
        self.all_chunks = text_splitter.split_text(text)

        if not self.all_chunks:
            return "Could not extract text. Is the file empty?"

        # 3. Build the FAISS index, tagging each vector with its list index
        #    so a search hit can be mapped back to its neighbours later.
        metadatas = [{"id": i} for i in range(len(self.all_chunks))]

        self.vector_store = FAISS.from_texts(
            self.all_chunks,
            self.embeddings,
            metadatas=metadatas
        )

        # FIX: repaired mojibake emoji (was "βœ…", UTF-8 read as Latin-1).
        return f"✅ Success! Indexed {len(self.all_chunks)} chunks."

    except Exception as e:
        # Surface the failure to the UI instead of crashing the app.
        return f"Error processing file: {str(e)}"
 
178
  if not question:
179
  return "⚠️ Please enter a Question."
180
 
181
+ # NEW: use 'similarity_search_with_score' to see the numbers
182
+ # Lower Score = Better Match (L2 Distance)
183
+ results = self.vector_store.similarity_search_with_score(question, k=3)
184
+
185
+ output_text = "### πŸ” Expanded Context Analysis:\n"
186
 
187
+ for i, (doc, score) in enumerate(results):
188
+ # Get the ID of the matched chunk
189
+ chunk_id = doc.metadata['id']
190
 
191
+ # Retrieve Previous and Next chunks from our saved list
192
+ # We use max/min to ensure we don't crash if it's the first or last chunk
193
+ prev_chunk = self.all_chunks[chunk_id - 1] if chunk_id > 0 else "[Start of Text]"
194
+ next_chunk = self.all_chunks[chunk_id + 1] if chunk_id < len(self.all_chunks) - 1 else "[End of Text]"
195
+
196
+ output_text += f"\n#### 🎯 Match #{i+1} (Distance Score: {score:.4f})\n"
197
+ output_text += f"*A lower score means a closer match.*\n\n"
198
+
199
+ # Display Preceding Context (Greyed out to show it's context)
200
+ output_text += f"> **Preceding Context:** ...{prev_chunk[-200:]}\n"
201
+
202
+ # Display The Actual Match (Bold)
203
+ output_text += f"> **MATCH:** {doc.page_content}\n"
204
+
205
+ # Display Succeeding Context
206
+ output_text += f"> **Succeeding Context:** {next_chunk[:200]}...\n"
207
+ output_text += "---\n"
208
+
209
  return output_text
210
 
211
# Initialize the shared backend instance used by both button handlers.
system = VectorSystem()

# --- Gradio UI ---
with gr.Blocks(title="EduGenius Context Retriever") as demo:
    # FIX: repaired mojibake emoji in the heading (was "πŸŽ“", UTF-8 read as Latin-1).
    gr.Markdown("# 🎓 EduGenius: Smart Context Retriever")
    gr.Markdown("Upload a Chapter. This version finds the best match AND shows you the text immediately before and after it.")

    with gr.Row():
        with gr.Column(scale=1):
            # Left column: file upload + processing status.
            pdf_input = gr.File(label="1. Upload File (PDF or TXT)", file_types=[".pdf", ".txt"])
            upload_btn = gr.Button("Process File", variant="primary")
            upload_status = gr.Textbox(label="Status", interactive=False)

        with gr.Column(scale=2):
            # Right column: query inputs and search trigger.
            question_input = gr.Textbox(label="2. Question", placeholder="e.g., What causes the chemical reaction?")
            answer_input = gr.Textbox(label="Student Answer (Optional)", placeholder="e.g., The heat causes it...")
            search_btn = gr.Button("Find Context + Neighbors", variant="secondary")

    evidence_output = gr.Markdown(label="Relevant Text Chunks")

    # Wire the buttons to the backend methods.
    upload_btn.click(
        fn=system.process_file,
        inputs=[pdf_input],
        outputs=[upload_status],
    )
    search_btn.click(
        fn=system.retrieve_evidence,
        inputs=[question_input, answer_input],
        outputs=[evidence_output],
    )

if __name__ == "__main__":
    demo.launch()