IW2025 committed on
Commit
7caf6aa
·
verified ·
1 Parent(s): fc0df0e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +118 -23
app.py CHANGED
@@ -79,13 +79,14 @@ class CurriculumChatbot:
79
  self.llm = HuggingFacePipeline(pipeline=pipe)
80
 
81
  # Create QA prompt template for the LLM pipeline (Llama 3.1-8B, per the status messages below)
82
- qa_template = """Based on the following curriculum content, please answer this question clearly and educationally:
83
 
 
84
  {filled_context}
85
 
86
  Question: {question}
87
 
88
- Answer:"""
89
 
90
  self.qa_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
91
  input_variables=["question", "filled_context"],
@@ -107,6 +108,21 @@ Answer:"""
107
  template=slide_template
108
  ))
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  print("✅ Llama 3.1-8B loaded successfully!")
111
  except Exception as e:
112
  print(f"Warning: Could not load Llama 3.1-8B: {e}")
@@ -155,23 +171,68 @@ Answer:"""
155
  def chat(self, query):
156
  """Comprehensive chat function with LLM answers and slide navigation"""
157
  # First, try to find relevant curriculum content
158
- results = self.vector_db.similarity_search(query, k=3)
159
 
160
  # Check if query is curriculum-related
161
  curriculum_relevance_score = 0
162
  if results:
163
  # Relevance score = number of results with non-empty page content (no similarity scores are used)
164
  curriculum_relevance_score = len([r for r in results if r.page_content.strip()])
 
 
 
 
 
 
 
165
 
166
- # ALWAYS generate LLM answer (never fallback to raw text)
167
- if self.qa_chain:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  try:
169
  if curriculum_relevance_score > 0:
170
- # Use curriculum context
171
  context = "\n\n".join([result.page_content for result in results])
172
  filled_context = f"Curriculum Context:\n{context}\n\nPlease answer based on this curriculum content."
173
  else:
174
- # No curriculum context - general programming answer
175
  filled_context = "Note: This question is not covered in the current curriculum. Please provide a general programming answer."
176
 
177
  answer = self.qa_chain.run(question=query, filled_context=filled_context)
@@ -180,10 +241,17 @@ Answer:"""
180
  answer = answer.strip()
181
  if "<|eot_id|>" in answer:
182
  answer = answer.split("<|eot_id|>")[-1].strip()
183
-
184
- # Remove any prompt artifacts
185
  if answer.startswith("Answer:"):
186
  answer = answer[7:].strip()
 
 
 
 
 
 
 
 
 
187
 
188
  # Add warning if not in curriculum
189
  if curriculum_relevance_score == 0:
@@ -191,22 +259,21 @@ Answer:"""
191
 
192
  except Exception as e:
193
  print(f"Error generating answer: {e}")
194
- # Even if LLM fails, try to provide a helpful response
195
  if curriculum_relevance_score > 0:
196
- answer = f"Based on the curriculum content, here's what I found:\n\n{results[0].page_content}\n\n*Note: I'm having trouble generating a custom answer right now, but here's the relevant curriculum content.*"
197
  else:
198
  answer = "I'm sorry, I couldn't generate an answer at the moment. Please try rephrasing your question."
199
  else:
200
- # If no LLM available, still provide helpful response
201
  if curriculum_relevance_score > 0:
202
- answer = f"Based on the curriculum content:\n\n{results[0].page_content}\n\n*Note: AI generation is not available, but here's the relevant curriculum content.*"
203
  else:
204
  answer = "I couldn't find relevant content in the curriculum for this question. Please try rephrasing or ask about a different programming topic."
205
 
206
  # Get the most relevant slide and its neighboring pages
207
  relevant_slides = []
208
  if curriculum_relevance_score > 0:
209
- # Get the most relevant result
210
  best_result = results[0]
211
  filename = best_result.metadata["filename"]
212
  page_number = best_result.metadata["page_number"]
@@ -218,19 +285,47 @@ Answer:"""
218
  total_pages = len(doc)
219
  doc.close()
220
 
221
- # Try to find a better page if the current one is a title slide
222
  target_page = page_number
223
- page_text = self.pdf_pages[filename].get(page_number, "")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
 
225
- # If current page has very little text (likely a title slide), look for content pages
226
- if len(page_text.strip()) < 100: # Title slides usually have little text
227
- # Look for pages with more content in the same PDF
 
 
 
 
228
  for page_num in range(1, total_pages + 1):
229
  if page_num in self.pdf_pages[filename]:
230
- text = self.pdf_pages[filename][page_num]
231
- if len(text.strip()) > 200: # Look for content-rich pages
232
- target_page = page_num
233
- break
 
 
 
 
 
 
 
 
234
 
235
  # Get the target page and neighboring pages (2 before, 2 after)
236
  start_page = max(1, target_page - 2)
 
79
  self.llm = HuggingFacePipeline(pipeline=pipe)
80
 
81
  # Create QA prompt template for the LLM pipeline (Llama 3.1-8B, per the status messages below)
82
+ qa_template = """You are a helpful programming tutor. Answer the following question based on the curriculum content provided.
83
 
84
+ Curriculum Content:
85
  {filled_context}
86
 
87
  Question: {question}
88
 
89
+ Provide a clear, educational answer explaining the concept:"""
90
 
91
  self.qa_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
92
  input_variables=["question", "filled_context"],
 
108
  template=slide_template
109
  ))
110
 
111
+ # Create focused answer prompt template
112
+ focused_qa_template = """You are a helpful programming tutor. Answer the question based on the specific slide content provided.
113
+
114
+ Slide Content:
115
+ {slide_content}
116
+
117
+ Question: {question}
118
+
119
+ Provide a clear, educational answer based on this slide:"""
120
+
121
+ self.focused_qa_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
122
+ input_variables=["question", "slide_content"],
123
+ template=focused_qa_template
124
+ ))
125
+
126
  print("✅ Llama 3.1-8B loaded successfully!")
127
  except Exception as e:
128
  print(f"Warning: Could not load Llama 3.1-8B: {e}")
 
171
  def chat(self, query):
172
  """Comprehensive chat function with LLM answers and slide navigation"""
173
  # First, try to find relevant curriculum content
174
+ results = self.vector_db.similarity_search(query, k=5) # Get more results for better selection
175
 
176
  # Check if query is curriculum-related
177
  curriculum_relevance_score = 0
178
  if results:
179
  # Relevance score = number of results with non-empty page content (no similarity scores are used)
180
  curriculum_relevance_score = len([r for r in results if r.page_content.strip()])
181
+
182
+ # Debug: Print what we found
183
+ print(f"Query: {query}")
184
+ print(f"Found {len(results)} relevant results:")
185
+ for i, result in enumerate(results[:3]):
186
+ print(f" {i+1}. {result.metadata['filename']} - Page {result.metadata['page_number']}")
187
+ print(f" Content: {result.page_content[:100]}...")
188
 
189
+ # Find the most relevant slide content first
190
+ best_slide_content = ""
191
+ if curriculum_relevance_score > 0:
192
+ # Get the most relevant result
193
+ best_result = results[0]
194
+ best_slide_content = best_result.page_content
195
+
196
+ # If the best slide has little content, try to find a better one
197
+ if len(best_slide_content.strip()) < 100:
198
+ for result in results[1:]:
199
+ if len(result.page_content.strip()) > len(best_slide_content.strip()):
200
+ best_slide_content = result.page_content
201
+ best_result = result
202
+
203
+ # Generate focused LLM answer using the most relevant slide
204
+ if self.focused_qa_chain and curriculum_relevance_score > 0:
205
+ try:
206
+ answer = self.focused_qa_chain.run(question=query, slide_content=best_slide_content)
207
+
208
+ # Clean up the answer
209
+ answer = answer.strip()
210
+ if "<|eot_id|>" in answer:
211
+ answer = answer.split("<|eot_id|>")[-1].strip()
212
+
213
+ # Remove any prompt artifacts
214
+ if answer.startswith("Answer:"):
215
+ answer = answer[7:].strip()
216
+ if answer.startswith("Provide a clear, educational answer based on this slide:"):
217
+ answer = answer[58:].strip()
218
+
219
+ # Check if the answer is too short or just repeats the question
220
+ if len(answer.strip()) < 50 or answer.lower().startswith("how does that work"):
221
+ # Generate a better answer using the slide content
222
+ answer = f"Based on the curriculum slide:\n\n{best_slide_content}\n\nThis slide explains the concept clearly. Let me provide additional context: Loops are programming constructs that allow you to repeat code multiple times efficiently."
223
+
224
+ except Exception as e:
225
+ print(f"Error generating focused answer: {e}")
226
+ # Fallback to slide content with explanation
227
+ answer = f"Based on the curriculum slide:\n\n{best_slide_content}\n\nThis slide contains the relevant information about your question."
228
+
229
+ elif self.qa_chain:
230
  # Use the general QA chain when the focused chain is unavailable or no curriculum content matched
231
  try:
232
  if curriculum_relevance_score > 0:
 
233
  context = "\n\n".join([result.page_content for result in results])
234
  filled_context = f"Curriculum Context:\n{context}\n\nPlease answer based on this curriculum content."
235
  else:
 
236
  filled_context = "Note: This question is not covered in the current curriculum. Please provide a general programming answer."
237
 
238
  answer = self.qa_chain.run(question=query, filled_context=filled_context)
 
241
  answer = answer.strip()
242
  if "<|eot_id|>" in answer:
243
  answer = answer.split("<|eot_id|>")[-1].strip()
 
 
244
  if answer.startswith("Answer:"):
245
  answer = answer[7:].strip()
246
+ if answer.startswith("Provide a clear, educational answer explaining the concept:"):
247
+ answer = answer[58:].strip()
248
+
249
+ # Check if the answer is too short
250
+ if len(answer.strip()) < 50:
251
+ if curriculum_relevance_score > 0:
252
+ answer = f"Based on the curriculum content:\n\n{best_slide_content}\n\nThis slide explains the concept clearly."
253
+ else:
254
+ answer = "I'm sorry, I couldn't generate a proper answer. Please try rephrasing your question."
255
 
256
  # Add warning if not in curriculum
257
  if curriculum_relevance_score == 0:
 
259
 
260
  except Exception as e:
261
  print(f"Error generating answer: {e}")
 
262
  if curriculum_relevance_score > 0:
263
+ answer = f"Based on the curriculum slide:\n\n{best_slide_content}\n\nThis slide contains the relevant information about your question."
264
  else:
265
  answer = "I'm sorry, I couldn't generate an answer at the moment. Please try rephrasing your question."
266
  else:
267
+ # If no LLM available
268
  if curriculum_relevance_score > 0:
269
+ answer = f"Based on the curriculum slide:\n\n{best_slide_content}\n\n*Note: AI generation is not available, but here's the relevant curriculum content.*"
270
  else:
271
  answer = "I couldn't find relevant content in the curriculum for this question. Please try rephrasing or ask about a different programming topic."
272
 
273
  # Get the most relevant slide and its neighboring pages
274
  relevant_slides = []
275
  if curriculum_relevance_score > 0:
276
+ # Start from the top-ranked result; other results from the same file are compared below
277
  best_result = results[0]
278
  filename = best_result.metadata["filename"]
279
  page_number = best_result.metadata["page_number"]
 
285
  total_pages = len(doc)
286
  doc.close()
287
 
288
+ # Find the best content page by analyzing all results
289
  target_page = page_number
290
+ best_content_score = 0
291
+
292
+ # Check all search results for the best content page
293
+ for result in results:
294
+ if result.metadata["filename"] == filename:
295
+ page_num = result.metadata["page_number"]
296
+ page_text = self.pdf_pages[filename].get(page_num, "")
297
+ text_length = len(page_text.strip())
298
+
299
+ # Score based on text length and relevance
300
+ content_score = text_length
301
+ if text_length > 100: # Prefer content pages over title slides
302
+ content_score += 500
303
+
304
+ if content_score > best_content_score:
305
+ best_content_score = content_score
306
+ target_page = page_num
307
 
308
+ # If we still have a title slide, look for better content in the same PDF
309
+ page_text = self.pdf_pages[filename].get(target_page, "")
310
+ if len(page_text.strip()) < 150: # Still a title slide
311
+ # Search for pages with the query terms
312
+ query_terms = query.lower().split()
313
+ best_match_score = 0
314
+
315
  for page_num in range(1, total_pages + 1):
316
  if page_num in self.pdf_pages[filename]:
317
+ text = self.pdf_pages[filename][page_num].lower()
318
+ text_length = len(text.strip())
319
+
320
+ # Count how many query terms appear in this page
321
+ match_score = sum(1 for term in query_terms if term in text)
322
+
323
+ # Prefer pages with both query terms and good content
324
+ if match_score > 0 and text_length > 200:
325
+ total_score = match_score * 1000 + text_length
326
+ if total_score > best_match_score:
327
+ best_match_score = total_score
328
+ target_page = page_num
329
 
330
  # Get the target page and neighboring pages (2 before, 2 after)
331
  start_page = max(1, target_page - 2)