IW2025 committed (verified)
Commit fc75dd1 · 1 Parent(s): f9e1010

Upload app.py

Files changed (1):
app.py  +51 -262
app.py CHANGED
@@ -103,12 +103,11 @@ class CurriculumChatbot:
             self.llm = HuggingFacePipeline(pipeline=pipe)
 
             # Warm and engaging prompt templates
-            qa_template = """You are a friendly and encouraging programming tutor. A student has asked: {question}
+            qa_template = """Answer this question: {question}
 
-Here's the relevant curriculum content to help answer their question:
-{filled_context}
+Using this information: {filled_context}
 
-Please provide a warm, encouraging answer that directly answers their question using the curriculum content. Use a conversational tone and explain the concepts clearly."""
+Provide a helpful, friendly answer."""
 
             self.qa_prompt = PromptTemplate(
                 input_variables=["question", "filled_context"],
@@ -131,12 +130,11 @@ Please select the most relevant slide (filename.pdf - Page X) that would best he
             self.slide_selection_chain = self.slide_selection_prompt | self.llm
 
             # Warm and detailed focused QA template
-            focused_qa_template = """You are a friendly and encouraging programming tutor. A student has asked: {question}
+            focused_qa_template = """Answer this question: {question}
 
-Here's the curriculum slide content to help answer their question:
-{slide_content}
+Using this information: {slide_content}
 
-Please provide a warm, encouraging answer that directly answers their question about the slide content. Use a conversational tone and explain the concepts clearly."""
+Provide a helpful, friendly answer."""
 
             self.focused_qa_prompt = PromptTemplate(
                 input_variables=["question", "slide_content"],
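
The slide_selection_chain kept as context in this hunk asks the model to reply with a "filename.pdf - Page X" string, and the old chat() method (removed in the last hunk below) parsed that reply with a small regex. That parsing step, isolated as a standalone sketch (illustration only; the example filename is made up):

import re

def parse_slide_choice(slide_response):
    """Return (filename, page_number) from a 'filename.pdf - Page X' reply, or None."""
    slide_response = slide_response.strip()
    if "<|eot_id|>" in slide_response:
        slide_response = slide_response.split("<|eot_id|>")[-1].strip()
    match = re.search(r'(.+\.pdf)\s*-\s*Page\s*(\d+)', slide_response)
    if match:
        return match.group(1), int(match.group(2))
    return None

print(parse_slide_choice("intro_to_python.pdf - Page 7"))  # ('intro_to_python.pdf', 7)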
@@ -145,6 +143,8 @@ Please provide a warm, encouraging answer that directly answers their question a
             self.focused_qa_chain = self.focused_qa_prompt | self.llm
 
             print("✅ Optimized model loaded successfully!")
+            print(f"🔍 LLM object: {self.llm}")
+            print(f"🔍 Focused QA chain: {self.focused_qa_chain}")
         except Exception as e:
             print(f"Warning: Could not load optimized model: {e}")
             print("Falling back to basic search mode...")
@@ -191,283 +191,72 @@ Please provide a warm, encouraging answer that directly answers their question a
         return "\n".join(slides_text)
 
     def chat(self, query):
-        """Comprehensive chat function with LLM answers and slide navigation"""
-        # Check cache first for faster responses
-        if query in self.response_cache:
-            print("✅ Using cached response")
-            return self.response_cache[query]
+        """Simplified chat function with vector search, LLM analysis, and slide display"""
 
-        # First, try to find relevant curriculum content
-        results = self.vector_db.similarity_search(query, k=3) # Reduced from 5 to 3 for speed
+        # 1. Vector Search - Find relevant slides
+        results = self.vector_db.similarity_search(query, k=3)
 
-        # Check if query is curriculum-related
-        curriculum_relevance_score = 0
-        if results:
-            # Calculate relevance score based on similarity and content relevance
-            relevant_results = []
-            for result in results:
-                content = result.page_content.lower()
-                query_terms = query.lower().split()
-
-                # Check if any query terms appear in the content
-                term_matches = sum(1 for term in query_terms if len(term) > 2 and term in content)
-
-                # Only consider results that have some relevance to the query
-                if term_matches > 0 or len(content.strip()) > 50:
-                    relevant_results.append(result)
-
-            curriculum_relevance_score = len(relevant_results)
-            results = relevant_results # Use only relevant results
-
-        # Debug: Print what we found
-        print(f"Query: {query}")
-        print(f"Found {len(results)} relevant results:")
-        for i, result in enumerate(results[:3]):
-            print(f" {i+1}. {result.metadata['filename']} - Page {result.metadata['page_number']}")
-            print(f" Content: {result.page_content[:100]}...")
+        if not results:
+            return "I couldn't find relevant content in the curriculum for this question.", None, None, []
 
-        # Use LLM to analyze top 5 slides and select the best one for teaching
-        best_slide_content = ""
-        best_result = None
-        if curriculum_relevance_score > 0 and self.slide_selection_chain and not self.fast_mode:
-            try:
-                # Prepare slide contents for LLM analysis
-                slide_contents = []
-                for i, result in enumerate(results[:5]): # Top 5 results
-                    filename = result.metadata["filename"]
-                    page_num = result.metadata["page_number"]
-                    content = result.page_content
-                    slide_contents.append(f"Slide {i+1}: {filename} - Page {page_num}\nContent: {content}\n")
-
-                slide_contents_text = "\n".join(slide_contents)
-
-                # Use LLM to select the best slide
-                slide_response = self.slide_selection_chain.invoke({
-                    "question": query,
-                    "slide_contents": slide_contents_text
-                })
-
-                # Extract filename and page from response
-                slide_response = slide_response.strip()
-                if "<|eot_id|>" in slide_response:
-                    slide_response = slide_response.split("<|eot_id|>")[-1].strip()
-
-                # Parse the response to get filename and page
-                match = re.search(r'(.+\.pdf)\s*-\s*Page\s*(\d+)', slide_response)
-                if match:
-                    filename = match.group(1)
-                    page_num = int(match.group(2))
-
-                    # Find the corresponding result
-                    for result in results:
-                        if (result.metadata["filename"] == filename and
-                            result.metadata["page_number"] == page_num):
-                            best_result = result
-                            best_slide_content = result.page_content
-                            break
-
-                    # If LLM selection failed, fall back to first result
-                    if not best_result:
-                        best_result = results[0]
-                        best_slide_content = results[0].page_content
-                else:
-                    # Fallback to first result if parsing failed
-                    best_result = results[0]
-                    best_slide_content = results[0].page_content
-
-            except Exception as e:
-                print(f"Error in LLM slide selection: {e}")
-                # Fallback to first result
-                best_result = results[0]
-                best_slide_content = results[0].page_content
-        else:
-            # Fallback without LLM
-            if curriculum_relevance_score > 0:
-                best_result = results[0]
-                best_slide_content = results[0].page_content
+        # Debug: Show what we found
+        print(f"Query: {query}")
+        print(f"Found {len(results)} relevant slides:")
+        for i, result in enumerate(results):
+            print(f" {i+1}. {result.metadata['filename']} - Page {result.metadata['page_number']}")
 
-        # Generate focused LLM answer using the most relevant slide
-        if self.focused_qa_chain and curriculum_relevance_score > 0 and not self.fast_mode:
+        # 2. LLM Check - Analyze slides and generate answer
+        best_result = results[0]
+        best_slide_content = best_result.page_content
+
+        if self.focused_qa_chain and not self.fast_mode:
             try:
+                print(f"🔍 Calling LLM with question: {query}")
+
                 answer = self.focused_qa_chain.invoke({
                     "question": query,
                     "slide_content": best_slide_content
                 })
 
-                # Debug: Print what the LLM returned
-                print(f"LLM Raw Response: {answer[:200]}...")
+                print(f"LLM Response: {answer[:100]}...")
 
                 # Clean up the answer
                 answer = answer.strip()
                 if "<|eot_id|>" in answer:
                     answer = answer.split("<|eot_id|>")[-1].strip()
 
-                # Remove any prompt artifacts
-                if answer.startswith("Answer:"):
-                    answer = answer[7:].strip()
-                if answer.startswith("Provide a clear, educational answer based on this slide:"):
-                    answer = answer[58:].strip()
-
-                # If LLM response is too short or problematic, show slide content with explanation
-                if len(answer.strip()) < 30:
-                    slide_info = f"📄 **Slide Reference:** {best_result.metadata['filename']} - Page {best_result.metadata['page_number']}"
-                    answer = f"{slide_info}\n\n**Slide Content:**\n{best_slide_content}\n\n*Note: Here's the relevant curriculum content to help answer your question.*"
-
-            except Exception as e:
-                print(f"Error generating focused answer: {e}")
-                # Show slide content with explanation
-                slide_info = f"📄 **Slide Reference:** {best_result.metadata['filename']} - Page {best_result.metadata['page_number']}"
-                answer = f"{slide_info}\n\n**Slide Content:**\n{best_slide_content}\n\n*Note: Here's the relevant curriculum content to help answer your question.*"
-
-        elif self.qa_chain and not self.fast_mode:
-            # Fallback to general LLM if focused chain fails
-            try:
-                if curriculum_relevance_score > 0:
-                    context = "\n\n".join([result.page_content for result in results])
-                    filled_context = f"Here's the relevant curriculum content from the student's course materials:\n{context}\n\nPlease provide a warm, encouraging answer that directly uses this curriculum content to help the student understand the concept."
-                else:
-                    filled_context = "Note: This question is not covered in the current curriculum. Please provide a friendly, general programming answer that encourages the student's curiosity."
-
-                answer = self.qa_chain.invoke({
-                    "question": query,
-                    "filled_context": filled_context
-                })
-
-                # Clean up the answer
-                answer = answer.strip()
-                if "<|eot_id|>" in answer:
-                    answer = answer.split("<|eot_id|>")[-1].strip()
-                if answer.startswith("Answer:"):
-                    answer = answer[7:].strip()
-                if answer.startswith("Provide a clear, educational answer explaining the concept:"):
-                    answer = answer[58:].strip()
-
-                # If answer is too short, show slide content
-                if len(answer.strip()) < 30:
-                    if curriculum_relevance_score > 0:
-                        slide_info = f"📄 **Slide Reference:** {best_result.metadata['filename']} - Page {best_result.metadata['page_number']}"
-                        answer = f"{slide_info}\n\n**Slide Content:**\n{best_slide_content}\n\n*Note: Here's the relevant curriculum content to help answer your question.*"
-                    else:
-                        answer = "I'm sorry, I couldn't generate a proper answer right now. Please try rephrasing your question - sometimes a different way of asking helps! 😊"
-
-                # Add warning if not in curriculum
-                if curriculum_relevance_score == 0:
-                    answer = "💡 **Note: This topic isn't covered in your current curriculum, but here's a helpful answer:**\n\n" + answer
-
             except Exception as e:
                 print(f"Error generating answer: {e}")
-                if curriculum_relevance_score > 0:
-                    slide_info = f"📄 **Slide Reference:** {best_result.metadata['filename']} - Page {best_result.metadata['page_number']}"
-                    answer = f"{slide_info}\n\n**Slide Content:**\n{best_slide_content}\n\n*Note: Here's the relevant curriculum content to help answer your question.*"
-                else:
-                    answer = "I'm sorry, I couldn't generate an answer at the moment. Please try rephrasing your question - sometimes a different approach helps! 😊"
+                answer = f"📄 **Slide Reference:** {best_result.metadata['filename']} - Page {best_result.metadata['page_number']}\n\n**Slide Content:**\n{best_slide_content}"
         else:
-            # If no LLM available
-            if curriculum_relevance_score > 0:
-                slide_info = f"📄 **Slide Reference:** {best_result.metadata['filename']} - Page {best_result.metadata['page_number']}"
-                answer = f"{slide_info}\n\n**Great question! Here's what your curriculum teaches:**\n\n{best_slide_content}\n\n*Note: AI generation is not available, but here's the relevant curriculum content to help you learn!* 🌟"
-            else:
-                answer = "I couldn't find any programming-related content in the curriculum for this question. This appears to be about something outside the scope of your programming course. Try asking about programming concepts like variables, loops, functions, or other topics covered in your curriculum! 😊"
+            # Fallback to slide content
+            answer = f"📄 **Slide Reference:** {best_result.metadata['filename']} - Page {best_result.metadata['page_number']}\n\n**Slide Content:**\n{best_slide_content}"
 
-        # Get the most relevant slide and its neighboring pages
+        # 3. Slide Output - Get relevant slides
         relevant_slides = []
-        if curriculum_relevance_score > 0:
-            # Get multiple relevant results to find the best one
-            best_result = results[0]
-            filename = best_result.metadata["filename"]
-            page_number = best_result.metadata["page_number"]
-
-            # Get the specific PDF and its pages
-            if filename in self.pdf_files:
-                pdf_path = self.pdf_files[filename]
-                doc = fitz.open(pdf_path)
-                total_pages = len(doc)
-                doc.close()
-
-                # Find the best content page by analyzing all results
-                target_page = page_number
-                best_content_score = 0
-
-                # Check all search results for the best content page
-                for result in results:
-                    if result.metadata["filename"] == filename:
-                        page_num = result.metadata["page_number"]
-                        page_text = self.pdf_pages[filename].get(page_num, "")
-                        text_length = len(page_text.strip())
-
-                        # Score based on text length and relevance
-                        content_score = text_length
-                        if text_length > 100: # Prefer content pages over title slides
-                            content_score += 500
-
-                        if content_score > best_content_score:
-                            best_content_score = content_score
-                            target_page = page_num
-
-                # If we still have a title slide, look for better content in the same PDF
-                page_text = self.pdf_pages[filename].get(target_page, "")
-                if len(page_text.strip()) < 150: # Still a title slide
-                    # Search for pages with the query terms
-                    query_terms = query.lower().split()
-                    best_match_score = 0
-
-                    for page_num in range(1, total_pages + 1):
-                        if page_num in self.pdf_pages[filename]:
-                            text = self.pdf_pages[filename][page_num].lower()
-                            text_length = len(text.strip())
-
-                            # Count how many query terms appear in this page
-                            match_score = sum(1 for term in query_terms if term in text)
-
-                            # Prefer pages with both query terms and good content
-                            if match_score > 0 and text_length > 200:
-                                total_score = match_score * 1000 + text_length
-                                if total_score > best_match_score:
-                                    best_match_score = total_score
-                                    target_page = page_num
-
-                # Get the target page and neighboring pages (2 before, 2 after)
-                start_page = max(1, target_page - 2)
-                end_page = min(total_pages, target_page + 2)
-
-                # Use a set to track unique slides and avoid duplicates
-                seen_slides = set()
-
-                for page_num in range(start_page, end_page + 1):
-                    img = self.get_pdf_page_image(pdf_path, page_num)
-                    if img:
-                        slide_key = f"{filename}-{page_num}"
-                        if slide_key not in seen_slides:
-                            seen_slides.add(slide_key)
-                            if page_num == target_page:
-                                # Highlight the most relevant page
-                                label = f"📌 {filename} - Page {page_num} (Most Relevant)"
-                            else:
-                                label = f"{filename} - Page {page_num}"
-                            relevant_slides.append((img, label))
-
-                recommended_slide = relevant_slides[0][0] if relevant_slides else None
-                recommended_label = relevant_slides[0][1] if relevant_slides else None
-            else:
-                # Fallback if filename not found
-                recommended_slide = None
-                recommended_label = None
-        else:
-            # If no curriculum content, provide a helpful response
-            relevant_slides = []
-            recommended_slide = None
-            recommended_label = None
-
-        # Cache the response for future use
-        self.response_cache[query] = (answer, recommended_slide, recommended_label, relevant_slides)
+        filename = best_result.metadata["filename"]
+        page_number = best_result.metadata["page_number"]
 
-        # Limit cache size to prevent memory issues
-        if len(self.response_cache) > 50:
-            # Remove oldest entries
-            oldest_key = next(iter(self.response_cache))
-            del self.response_cache[oldest_key]
+        if filename in self.pdf_files:
+            pdf_path = self.pdf_files[filename]
+            doc = fitz.open(pdf_path)
+            total_pages = len(doc)
+            doc.close()
+
+            # Get the target page and neighboring pages (2 before, 2 after)
+            start_page = max(1, page_number - 2)
+            end_page = min(total_pages, page_number + 2)
+
+            for page_num in range(start_page, end_page + 1):
+                img = self.get_pdf_page_image(pdf_path, page_num)
+                if img:
+                    if page_num == page_number:
+                        label = f"📌 {filename} - Page {page_num} (Most Relevant)"
+                    else:
+                        label = f"{filename} - Page {page_num}"
+                    relevant_slides.append((img, label))
 
-        return answer, recommended_slide, recommended_label, relevant_slides
+        return answer, relevant_slides[0][0] if relevant_slides else None, relevant_slides[0][1] if relevant_slides else None, relevant_slides
 
 # --- Gradio UI ---
 chatbot = CurriculumChatbot(fast_mode=False) # Enable AI mode by default
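
Besides the relevance scoring and slide-selection logic, the removed chat() also dropped its response cache: a plain dict used as a bounded FIFO store, evicting the oldest key once it held more than 50 entries. That pattern, pulled out into a standalone sketch (names mirror the removed code; illustration only, not part of the commit):

class ResponseCache:
    """Bounded FIFO cache; dicts preserve insertion order in Python 3.7+."""

    def __init__(self, max_size=50):
        self.max_size = max_size
        self._cache = {}

    def get(self, query):
        return self._cache.get(query)

    def put(self, query, response):
        self._cache[query] = response
        if len(self._cache) > self.max_size:
            # Remove the oldest entry (the first key ever inserted)
            oldest_key = next(iter(self._cache))
            del self._cache[oldest_key]

cache = ResponseCache(max_size=2)
cache.put("q1", "a1")
cache.put("q2", "a2")
cache.put("q3", "a3")  # evicts "q1"
print(cache.get("q1"), cache.get("q3"))  # None a3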
 
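
Both the old and the new chat() call a get_pdf_page_image helper whose definition is outside this diff. A hypothetical sketch of what such a helper could look like with PyMuPDF (fitz) and Pillow, assuming the 1-based page numbers used above; the repo's actual implementation may differ:

import fitz  # PyMuPDF
from PIL import Image

def get_pdf_page_image(pdf_path, page_number, zoom=2.0):
    """Render one PDF page to a PIL image, or return None on failure."""
    try:
        doc = fitz.open(pdf_path)
        page = doc.load_page(page_number - 1)  # fitz pages are 0-based
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
        doc.close()
        return img
    except Exception as e:
        print(f"Could not render {pdf_path} page {page_number}: {e}")
        return None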