IW2025 committed on
Commit
ce130ce
·
verified ·
1 Parent(s): fc75dd1

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +198 -108
app.py CHANGED
@@ -70,88 +70,56 @@ class CurriculumChatbot:
70
  )
71
 
72
  def _setup_llm(self):
73
- # Initialize all LLM-related attributes to None first
74
- self.llm = None
75
- self.qa_chain = None
76
- self.slide_selection_chain = None
77
- self.focused_qa_chain = None
78
-
79
  try:
80
- # Use a smaller, faster model for better performance
81
- # Llama 3.1 8B is quite large and slow - let's use a smaller model
82
- model_name = "microsoft/DialoGPT-medium" # Much faster, smaller model
83
-
84
- # Get token from secrets
85
- import os
86
- token = os.environ.get("IW_Token")
87
- if not token:
88
- raise ValueError("IW_Token not found in environment variables")
89
-
90
  pipe = pipeline(
91
  "text-generation",
92
- model=model_name,
93
- max_new_tokens=100, # Reduced for faster responses
94
- temperature=0.3,
 
95
  do_sample=True,
 
96
  top_p=0.9,
97
- repetition_penalty=1.1,
98
- device_map="auto" if torch.cuda.is_available() else None,
99
- token=token,
100
- # Performance optimizations
101
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
102
  )
 
103
  self.llm = HuggingFacePipeline(pipeline=pipe)
104
 
105
- # Warm and engaging prompt templates
106
- qa_template = """Answer this question: {question}
 
 
 
107
 
108
- Using this information: {filled_context}
109
 
110
- Provide a helpful, friendly answer."""
111
 
112
- self.qa_prompt = PromptTemplate(
113
  input_variables=["question", "filled_context"],
114
  template=qa_template
115
- )
116
- self.qa_chain = self.qa_prompt | self.llm
117
 
118
- # Enhanced slide selection template
119
- slide_selection_template = """As a helpful programming tutor, a student has asked: {question}
120
 
121
- Here are the available curriculum slides that might help answer their question:
122
- {slide_contents}
123
-
124
- Please select the most relevant slide (filename.pdf - Page X) that would best help explain this concept to the student. Choose the slide that has the most detailed and relevant content for their question."""
125
-
126
- self.slide_selection_prompt = PromptTemplate(
127
- input_variables=["question", "slide_contents"],
128
- template=slide_selection_template
129
- )
130
- self.slide_selection_chain = self.slide_selection_prompt | self.llm
131
-
132
- # Warm and detailed focused QA template
133
- focused_qa_template = """Answer this question: {question}
134
 
135
- Using this information: {slide_content}
136
 
137
- Provide a helpful, friendly answer."""
138
 
139
- self.focused_qa_prompt = PromptTemplate(
140
  input_variables=["question", "slide_content"],
141
  template=focused_qa_template
142
- )
143
- self.focused_qa_chain = self.focused_qa_prompt | self.llm
144
 
145
- print("✅ Optimized model loaded successfully!")
146
- print(f"🔍 LLM object: {self.llm}")
147
- print(f"🔍 Focused QA chain: {self.focused_qa_chain}")
148
  except Exception as e:
149
- print(f"Warning: Could not load optimized model: {e}")
150
- print("Falling back to basic search mode...")
151
- self.llm = None
152
- self.qa_chain = None
153
- self.slide_selection_chain = None
154
- self.focused_qa_chain = None
155
 
156
  def get_pdf_page_image(self, pdf_path, page_num):
157
  try:
@@ -191,72 +159,194 @@ Provide a helpful, friendly answer."""
191
  return "\n".join(slides_text)
192
 
193
  def chat(self, query):
194
- """Simplified chat function with vector search, LLM analysis, and slide display"""
195
-
196
- # 1. Vector Search - Find relevant slides
197
- results = self.vector_db.similarity_search(query, k=3)
198
-
199
- if not results:
200
- return "I couldn't find relevant content in the curriculum for this question.", None, None, []
201
 
202
- # Debug: Show what we found
203
- print(f"Query: {query}")
204
- print(f"Found {len(results)} relevant slides:")
205
- for i, result in enumerate(results):
206
- print(f" {i+1}. {result.metadata['filename']} - Page {result.metadata['page_number']}")
 
 
 
 
 
 
 
207
 
208
- # 2. LLM Check - Analyze slides and generate answer
209
- best_result = results[0]
210
- best_slide_content = best_result.page_content
 
 
 
 
 
 
 
 
 
 
211
 
212
- if self.focused_qa_chain and not self.fast_mode:
 
213
  try:
214
- print(f"🔍 Calling LLM with question: {query}")
215
 
216
- answer = self.focused_qa_chain.invoke({
217
- "question": query,
218
- "slide_content": best_slide_content
219
- })
220
 
221
- print(f"LLM Response: {answer[:100]}...")
 
 
 
 
222
 
223
- # Clean up the answer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  answer = answer.strip()
225
  if "<|eot_id|>" in answer:
226
  answer = answer.split("<|eot_id|>")[-1].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
 
228
  except Exception as e:
229
  print(f"Error generating answer: {e}")
230
- answer = f"📄 **Slide Reference:** {best_result.metadata['filename']} - Page {best_result.metadata['page_number']}\n\n**Slide Content:**\n{best_slide_content}"
 
 
 
 
231
  else:
232
- # Fallback to slide content
233
- answer = f"📄 **Slide Reference:** {best_result.metadata['filename']} - Page {best_result.metadata['page_number']}\n\n**Slide Content:**\n{best_slide_content}"
 
 
 
234
 
235
- # 3. Slide Output - Get relevant slides
236
  relevant_slides = []
237
- filename = best_result.metadata["filename"]
238
- page_number = best_result.metadata["page_number"]
239
-
240
- if filename in self.pdf_files:
241
- pdf_path = self.pdf_files[filename]
242
- doc = fitz.open(pdf_path)
243
- total_pages = len(doc)
244
- doc.close()
245
 
246
- # Get the target page and neighboring pages (2 before, 2 after)
247
- start_page = max(1, page_number - 2)
248
- end_page = min(total_pages, page_number + 2)
249
-
250
- for page_num in range(start_page, end_page + 1):
251
- img = self.get_pdf_page_image(pdf_path, page_num)
252
- if img:
253
- if page_num == page_number:
254
- label = f"📌 {filename} - Page {page_num} (Most Relevant)"
255
- else:
256
- label = f"{filename} - Page {page_num}"
257
- relevant_slides.append((img, label))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
 
259
- return answer, relevant_slides[0][0] if relevant_slides else None, relevant_slides[0][1] if relevant_slides else None, relevant_slides
260
 
261
  # --- Gradio UI ---
262
  chatbot = CurriculumChatbot(fast_mode=False) # Enable AI mode by default
 
70
  )
71
 
72
  def _setup_llm(self):
73
+ """Setup LLM with HuggingFace pipeline"""
 
 
 
 
 
74
  try:
75
+ # Load the model
 
 
 
 
 
 
 
 
 
76
  pipe = pipeline(
77
  "text-generation",
78
+ model="microsoft/DialoGPT-medium",
79
+ torch_dtype=torch.float16,
80
+ device_map="auto",
81
+ max_length=512,
82
  do_sample=True,
83
+ temperature=0.7,
84
  top_p=0.9,
85
+ repetition_penalty=1.1
 
 
 
 
86
  )
87
+
88
  self.llm = HuggingFacePipeline(pipeline=pipe)
89
 
90
+ # Create QA prompt template for DialoGPT
91
+ qa_template = """You are a helpful programming tutor. Answer the following question based on the curriculum content provided.
92
+
93
+ Curriculum Content:
94
+ {filled_context}
95
 
96
+ Question: {question}
97
 
98
+ Provide a clear, educational answer explaining the concept:"""
99
 
100
+ self.qa_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
101
  input_variables=["question", "filled_context"],
102
  template=qa_template
103
+ ))
 
104
 
105
+ # Create focused answer prompt template
106
+ focused_qa_template = """You are a helpful programming tutor. Answer the question based on the specific slide content provided.
107
 
108
+ Slide Content:
109
+ {slide_content}
 
 
 
 
 
 
 
 
 
 
 
110
 
111
+ Question: {question}
112
 
113
+ Provide a clear, educational answer based on this slide:"""
114
 
115
+ self.focused_qa_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
116
  input_variables=["question", "slide_content"],
117
  template=focused_qa_template
118
+ ))
 
119
 
120
+ print("✅ Llama 3.1-8B loaded successfully!")
 
 
121
  except Exception as e:
122
+ print(f"Warning: Could not load Llama 3.1-8B: {e}")
 
 
 
 
 
123
 
124
  def get_pdf_page_image(self, pdf_path, page_num):
125
  try:
 
159
  return "\n".join(slides_text)
160
 
161
  def chat(self, query):
162
+ """Comprehensive chat function with LLM answers and slide navigation"""
163
+ # First, try to find relevant curriculum content
164
+ results = self.vector_db.similarity_search(query, k=5) # Get more results for better selection
 
 
 
 
165
 
166
+ # Check if query is curriculum-related
167
+ curriculum_relevance_score = 0
168
+ if results:
169
+ # Calculate relevance score based on similarity
170
+ curriculum_relevance_score = len([r for r in results if r.page_content.strip()])
171
+
172
+ # Debug: Print what we found
173
+ print(f"Query: {query}")
174
+ print(f"Found {len(results)} relevant results:")
175
+ for i, result in enumerate(results[:3]):
176
+ print(f" {i+1}. {result.metadata['filename']} - Page {result.metadata['page_number']}")
177
+ print(f" Content: {result.page_content[:100]}...")
178
 
179
+ # Find the most relevant slide content first
180
+ best_slide_content = ""
181
+ if curriculum_relevance_score > 0:
182
+ # Get the most relevant result
183
+ best_result = results[0]
184
+ best_slide_content = best_result.page_content
185
+
186
+ # If the best slide has little content, try to find a better one
187
+ if len(best_slide_content.strip()) < 100:
188
+ for result in results[1:]:
189
+ if len(result.page_content.strip()) > len(best_slide_content.strip()):
190
+ best_slide_content = result.page_content
191
+ best_result = result
192
 
193
+ # Generate focused LLM answer using the most relevant slide
194
+ if self.focused_qa_chain and curriculum_relevance_score > 0:
195
  try:
196
+ answer = self.focused_qa_chain.run(question=query, slide_content=best_slide_content)
197
 
198
+ # Clean up the answer
199
+ answer = answer.strip()
200
+ if "<|eot_id|>" in answer:
201
+ answer = answer.split("<|eot_id|>")[-1].strip()
202
 
203
+ # Remove any prompt artifacts
204
+ if answer.startswith("Answer:"):
205
+ answer = answer[7:].strip()
206
+ if answer.startswith("Provide a clear, educational answer based on this slide:"):
207
+ answer = answer[58:].strip()
208
 
209
+ # Check if the answer is too short or just repeats the question
210
+ if len(answer.strip()) < 50 or answer.lower().startswith("how does that work"):
211
+ # Generate a better answer using the slide content
212
+ answer = f"Based on the curriculum slide:\n\n{best_slide_content}\n\nThis slide explains the concept clearly. Let me provide additional context: Loops are programming constructs that allow you to repeat code multiple times efficiently."
213
+
214
+ except Exception as e:
215
+ print(f"Error generating focused answer: {e}")
216
+ # Fallback to slide content with explanation
217
+ answer = f"Based on the curriculum slide:\n\n{best_slide_content}\n\nThis slide contains the relevant information about your question."
218
+
219
+ elif self.qa_chain:
220
+ # Fallback to general LLM if focused chain fails
221
+ try:
222
+ if curriculum_relevance_score > 0:
223
+
224
+ context = "\n\n".join([result.page_content for result in results])
225
+ filled_context = f"Curriculum Context:\n{context}\n\nPlease answer based on this curriculum content."
226
+ else:
227
+
228
+ filled_context = "Note: This question is not covered in the current curriculum. Please provide a general programming answer."
229
+
230
+ answer = self.qa_chain.run(question=query, filled_context=filled_context)
231
  answer = answer.strip()
232
  if "<|eot_id|>" in answer:
233
  answer = answer.split("<|eot_id|>")[-1].strip()
234
+
235
+
236
+ if answer.startswith("Answer:"):
237
+ answer = answer[7:].strip()
238
+ if answer.startswith("Provide a clear, educational answer explaining the concept:"):
239
+ answer = answer[58:].strip()
240
+
241
+ # Check if the answer is too short
242
+ if len(answer.strip()) < 50:
243
+ if curriculum_relevance_score > 0:
244
+ answer = f"Based on the curriculum content:\n\n{best_slide_content}\n\nThis slide explains the concept clearly."
245
+ else:
246
+ answer = "I'm sorry, I couldn't generate a proper answer. Please try rephrasing your question."
247
+
248
+ # Add warning if not in curriculum
249
+ if curriculum_relevance_score == 0:
250
+ answer = "💡 **Note: This topic isn't covered in your current curriculum, but here's a helpful answer:**\n\n" + answer
251
 
252
  except Exception as e:
253
  print(f"Error generating answer: {e}")
254
+
255
+ if curriculum_relevance_score > 0:
256
+ answer = f"Based on the curriculum slide:\n\n{best_slide_content}\n\nThis slide contains the relevant information about your question."
257
+ else:
258
+ answer = "I'm sorry, I couldn't generate an answer at the moment. Please try rephrasing your question."
259
  else:
260
+ # If no LLM available
261
+ if curriculum_relevance_score > 0:
262
+ answer = f"Based on the curriculum slide:\n\n{best_slide_content}\n\n*Note: AI generation is not available, but here's the relevant curriculum content.*"
263
+ else:
264
+ answer = "I couldn't find relevant content in the curriculum for this question. Please try rephrasing or ask about a different programming topic."
265
 
266
+ # Get the most relevant slide and its neighboring pages
267
  relevant_slides = []
268
+ if curriculum_relevance_score > 0:
269
+ # Get multiple relevant results to find the best one
270
+ best_result = results[0]
271
+ filename = best_result.metadata["filename"]
272
+ page_number = best_result.metadata["page_number"]
 
 
 
273
 
274
+ # Get the specific PDF and its pages
275
+ if filename in self.pdf_files:
276
+ pdf_path = self.pdf_files[filename]
277
+ doc = fitz.open(pdf_path)
278
+ total_pages = len(doc)
279
+ doc.close()
280
+
281
+ # Find the best content page by analyzing all results
282
+ target_page = page_number
283
+ best_content_score = 0
284
+
285
+ # Check all search results for the best content page
286
+ for result in results:
287
+ if result.metadata["filename"] == filename:
288
+ page_num = result.metadata["page_number"]
289
+ page_text = self.pdf_pages[filename].get(page_num, "")
290
+ text_length = len(page_text.strip())
291
+
292
+ # Score based on text length and relevance
293
+ content_score = text_length
294
+ if text_length > 100: # Prefer content pages over title slides
295
+ content_score += 500
296
+
297
+ if content_score > best_content_score:
298
+ best_content_score = content_score
299
+ target_page = page_num
300
+
301
+ # If we still have a title slide, look for better content in the same PDF
302
+ page_text = self.pdf_pages[filename].get(target_page, "")
303
+ if len(page_text.strip()) < 150: # Still a title slide
304
+ # Search for pages with the query terms
305
+ query_terms = query.lower().split()
306
+ best_match_score = 0
307
+
308
+ for page_num in range(1, total_pages + 1):
309
+ if page_num in self.pdf_pages[filename]:
310
+ text = self.pdf_pages[filename][page_num].lower()
311
+ text_length = len(text.strip())
312
+
313
+ # Count how many query terms appear in this page
314
+ match_score = sum(1 for term in query_terms if term in text)
315
+
316
+ # Prefer pages with both query terms and good content
317
+ if match_score > 0 and text_length > 200:
318
+ total_score = match_score * 1000 + text_length
319
+ if total_score > best_match_score:
320
+ best_match_score = total_score
321
+ target_page = page_num
322
+
323
+ # Get the target page and neighboring pages (2 before, 2 after)
324
+ start_page = max(1, target_page - 2)
325
+ end_page = min(total_pages, target_page + 2)
326
+
327
+ for page_num in range(start_page, end_page + 1):
328
+ img = self.get_pdf_page_image(pdf_path, page_num)
329
+ if img:
330
+ if page_num == target_page:
331
+ # Highlight the most relevant page
332
+ label = f"📌 {filename} - Page {page_num} (Most Relevant)"
333
+ else:
334
+ label = f"{filename} - Page {page_num}"
335
+ relevant_slides.append((img, label))
336
+
337
+ recommended_slide = relevant_slides[0][0] if relevant_slides else None
338
+ recommended_label = relevant_slides[0][1] if relevant_slides else None
339
+ else:
340
+ # Fallback if filename not found
341
+ recommended_slide = None
342
+ recommended_label = None
343
+ else:
344
+ # If no curriculum content, provide a helpful response
345
+ relevant_slides = []
346
+ recommended_slide = None
347
+ recommended_label = None
348
 
349
+ return answer, recommended_slide, recommended_label, relevant_slides
350
 
351
  # --- Gradio UI ---
352
  chatbot = CurriculumChatbot(fast_mode=False) # Enable AI mode by default