IW2025 committed on
Commit
eea9911
·
verified ·
1 Parent(s): ce130ce

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +109 -69
app.py CHANGED
@@ -7,6 +7,7 @@ from langchain_community.vectorstores import Chroma
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  from langchain_huggingface import HuggingFacePipeline
9
  from langchain.prompts import PromptTemplate
 
10
  from transformers import pipeline
11
  import torch
12
  import base64
@@ -72,6 +73,12 @@ class CurriculumChatbot:
72
  def _setup_llm(self):
73
  """Setup LLM with HuggingFace pipeline"""
74
  try:
 
 
 
 
 
 
75
  # Load the model
76
  pipe = pipeline(
77
  "text-generation",
@@ -87,6 +94,23 @@ class CurriculumChatbot:
87
 
88
  self.llm = HuggingFacePipeline(pipeline=pipe)
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  # Create QA prompt template for DialoGPT
91
  qa_template = """You are a helpful programming tutor. Answer the following question based on the curriculum content provided.
92
 
@@ -95,7 +119,7 @@ Curriculum Content:
95
 
96
  Question: {question}
97
 
98
- Provide a clear, educational answer explaining the concept:"""
99
 
100
  self.qa_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
101
  input_variables=["question", "filled_context"],
@@ -110,16 +134,24 @@ Slide Content:
110
 
111
  Question: {question}
112
 
113
- Provide a clear, educational answer based on this slide:"""
114
 
115
  self.focused_qa_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
116
  input_variables=["question", "slide_content"],
117
  template=focused_qa_template
118
  ))
119
 
120
- print("βœ… Llama 3.1-8B loaded successfully!")
 
 
 
121
  except Exception as e:
122
- print(f"Warning: Could not load Llama 3.1-8B: {e}")
 
 
 
 
 
123
 
124
  def get_pdf_page_image(self, pdf_path, page_num):
125
  try:
@@ -159,42 +191,90 @@ Provide a clear, educational answer based on this slide:"""
159
  return "\n".join(slides_text)
160
 
161
  def chat(self, query):
162
- """Comprehensive chat function with LLM answers and slide navigation"""
163
- # First, try to find relevant curriculum content
164
- results = self.vector_db.similarity_search(query, k=5) # Get more results for better selection
165
 
166
- # Check if query is curriculum-related
167
  curriculum_relevance_score = 0
 
 
 
168
  if results:
169
- # Calculate relevance score based on similarity
170
- curriculum_relevance_score = len([r for r in results if r.page_content.strip()])
171
 
172
  # Debug: Print what we found
173
  print(f"Query: {query}")
174
- print(f"Found {len(results)} relevant results:")
175
- for i, result in enumerate(results[:3]):
176
  print(f" {i+1}. {result.metadata['filename']} - Page {result.metadata['page_number']}")
177
  print(f" Content: {result.page_content[:100]}...")
178
-
179
- # Find the most relevant slide content first
180
- best_slide_content = ""
181
- if curriculum_relevance_score > 0:
182
- # Get the most relevant result
183
- best_result = results[0]
184
- best_slide_content = best_result.page_content
185
 
186
- # If the best slide has little content, try to find a better one
187
- if len(best_slide_content.strip()) < 100:
188
- for result in results[1:]:
189
- if len(result.page_content.strip()) > len(best_slide_content.strip()):
190
- best_slide_content = result.page_content
191
- best_result = result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
 
193
  # Generate focused LLM answer using the most relevant slide
194
  if self.focused_qa_chain and curriculum_relevance_score > 0:
195
  try:
 
 
 
196
  answer = self.focused_qa_chain.run(question=query, slide_content=best_slide_content)
197
 
 
 
198
  # Clean up the answer
199
  answer = answer.strip()
200
  if "<|eot_id|>" in answer:
@@ -209,7 +289,7 @@ Provide a clear, educational answer based on this slide:"""
209
  # Check if the answer is too short or just repeats the question
210
  if len(answer.strip()) < 50 or answer.lower().startswith("how does that work"):
211
  # Generate a better answer using the slide content
212
- answer = f"Based on the curriculum slide:\n\n{best_slide_content}\n\nThis slide explains the concept clearly. Let me provide additional context: Loops are programming constructs that allow you to repeat code multiple times efficiently."
213
 
214
  except Exception as e:
215
  print(f"Error generating focused answer: {e}")
@@ -265,9 +345,8 @@ Provide a clear, educational answer based on this slide:"""
265
 
266
  # Get the most relevant slide and its neighboring pages
267
  relevant_slides = []
268
- if curriculum_relevance_score > 0:
269
- # Get multiple relevant results to find the best one
270
- best_result = results[0]
271
  filename = best_result.metadata["filename"]
272
  page_number = best_result.metadata["page_number"]
273
 
@@ -278,47 +357,8 @@ Provide a clear, educational answer based on this slide:"""
278
  total_pages = len(doc)
279
  doc.close()
280
 
281
- # Find the best content page by analyzing all results
282
  target_page = page_number
283
- best_content_score = 0
284
-
285
- # Check all search results for the best content page
286
- for result in results:
287
- if result.metadata["filename"] == filename:
288
- page_num = result.metadata["page_number"]
289
- page_text = self.pdf_pages[filename].get(page_num, "")
290
- text_length = len(page_text.strip())
291
-
292
- # Score based on text length and relevance
293
- content_score = text_length
294
- if text_length > 100: # Prefer content pages over title slides
295
- content_score += 500
296
-
297
- if content_score > best_content_score:
298
- best_content_score = content_score
299
- target_page = page_num
300
-
301
- # If we still have a title slide, look for better content in the same PDF
302
- page_text = self.pdf_pages[filename].get(target_page, "")
303
- if len(page_text.strip()) < 150: # Still a title slide
304
- # Search for pages with the query terms
305
- query_terms = query.lower().split()
306
- best_match_score = 0
307
-
308
- for page_num in range(1, total_pages + 1):
309
- if page_num in self.pdf_pages[filename]:
310
- text = self.pdf_pages[filename][page_num].lower()
311
- text_length = len(text.strip())
312
-
313
- # Count how many query terms appear in this page
314
- match_score = sum(1 for term in query_terms if term in text)
315
-
316
- # Prefer pages with both query terms and good content
317
- if match_score > 0 and text_length > 200:
318
- total_score = match_score * 1000 + text_length
319
- if total_score > best_match_score:
320
- best_match_score = total_score
321
- target_page = page_num
322
 
323
  # Get the target page and neighboring pages (2 before, 2 after)
324
  start_page = max(1, target_page - 2)
 
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  from langchain_huggingface import HuggingFacePipeline
9
  from langchain.prompts import PromptTemplate
10
+ from langchain.chains import LLMChain
11
  from transformers import pipeline
12
  import torch
13
  import base64
 
73
  def _setup_llm(self):
74
  """Setup LLM with HuggingFace pipeline"""
75
  try:
76
+ # Initialize LLM attributes
77
+ self.llm = None
78
+ self.qa_chain = None
79
+ self.focused_qa_chain = None
80
+ self.content_selection_chain = None
81
+
82
  # Load the model
83
  pipe = pipeline(
84
  "text-generation",
 
94
 
95
  self.llm = HuggingFacePipeline(pipeline=pipe)
96
 
97
+ # Create content selection prompt template
98
+ content_selection_template = """You are an expert at analyzing curriculum content. Given a user's question and multiple slide contents, determine which slide is most relevant.
99
+
100
+ User Question: {question}
101
+
102
+ Available Slide Contents:
103
+ {slide_contents}
104
+
105
+ Analyze each slide and respond with ONLY the number (1, 2, 3, etc.) of the most relevant slide for the user's question. If no slide is relevant, respond with "0".
106
+
107
+ Most relevant slide number:"""
108
+
109
+ self.content_selection_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
110
+ input_variables=["question", "slide_contents"],
111
+ template=content_selection_template
112
+ ))
113
+
114
  # Create QA prompt template for DialoGPT
115
  qa_template = """You are a helpful programming tutor. Answer the following question based on the curriculum content provided.
116
 
 
119
 
120
  Question: {question}
121
 
122
+ Provide a clear, educational answer explaining the concept. Be specific and detailed in your explanation:"""
123
 
124
  self.qa_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
125
  input_variables=["question", "filled_context"],
 
134
 
135
  Question: {question}
136
 
137
+ Provide a clear, educational answer based on this slide. Be specific and detailed, focusing on the exact concept or topic the user is asking about:"""
138
 
139
  self.focused_qa_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
140
  input_variables=["question", "slide_content"],
141
  template=focused_qa_template
142
  ))
143
 
144
+ print("βœ… LLM loaded successfully!")
145
+ print(f"πŸ” LLM object: {self.llm}")
146
+ print(f"πŸ” Content selection chain: {self.content_selection_chain}")
147
+ print(f"πŸ” Focused QA chain: {self.focused_qa_chain}")
148
  except Exception as e:
149
+ print(f"Warning: Could not load LLM: {e}")
150
+ print("Falling back to basic search mode...")
151
+ self.llm = None
152
+ self.qa_chain = None
153
+ self.focused_qa_chain = None
154
+ self.content_selection_chain = None
155
 
156
  def get_pdf_page_image(self, pdf_path, page_num):
157
  try:
 
191
  return "\n".join(slides_text)
192
 
193
  def chat(self, query):
194
+ """Comprehensive chat function with LLM-powered content selection and answers"""
195
+ # First, try to find relevant curriculum content using vector search
196
+ results = self.vector_db.similarity_search(query, k=5) # Get top 5 results for LLM analysis
197
 
 
198
  curriculum_relevance_score = 0
199
+ best_slide_content = ""
200
+ best_result = None
201
+
202
  if results:
203
+ curriculum_relevance_score = len(results)
 
204
 
205
  # Debug: Print what we found
206
  print(f"Query: {query}")
207
+ print(f"Found {len(results)} relevant results for LLM analysis:")
208
+ for i, result in enumerate(results):
209
  print(f" {i+1}. {result.metadata['filename']} - Page {result.metadata['page_number']}")
210
  print(f" Content: {result.page_content[:100]}...")
 
 
 
 
 
 
 
211
 
212
+ # Use LLM to select the most relevant content
213
+ if self.content_selection_chain and curriculum_relevance_score > 0:
214
+ try:
215
+ # Prepare slide contents for LLM analysis
216
+ slide_contents = []
217
+ for i, result in enumerate(results):
218
+ slide_contents.append(f"Slide {i+1}: {result.page_content[:500]}...")
219
+
220
+ slide_contents_text = "\n\n".join(slide_contents)
221
+
222
+ print(f"πŸ” Using LLM to select most relevant content...")
223
+
224
+ # Get LLM's selection
225
+ selection_response = self.content_selection_chain.run(
226
+ question=query,
227
+ slide_contents=slide_contents_text
228
+ )
229
+
230
+ print(f"LLM Selection Response: {selection_response}")
231
+
232
+ # Parse the selection (expecting a number)
233
+ try:
234
+ # Extract number from response
235
+ import re
236
+ numbers = re.findall(r'\d+', selection_response)
237
+ if numbers:
238
+ selected_index = int(numbers[0]) - 1 # Convert to 0-based index
239
+ if 0 <= selected_index < len(results):
240
+ best_result = results[selected_index]
241
+ best_slide_content = best_result.page_content
242
+ print(f"βœ… LLM selected slide {selected_index + 1}")
243
+ else:
244
+ print(f"⚠️ LLM selection out of range: {selected_index + 1}")
245
+ # Fallback to first result
246
+ best_result = results[0]
247
+ best_slide_content = best_result.page_content
248
+ else:
249
+ print("⚠️ No number found in LLM response, using first result")
250
+ best_result = results[0]
251
+ best_slide_content = best_result.page_content
252
+ except Exception as e:
253
+ print(f"Error parsing LLM selection: {e}")
254
+ # Fallback to first result
255
+ best_result = results[0]
256
+ best_slide_content = best_result.page_content
257
+
258
+ except Exception as e:
259
+ print(f"Error in LLM content selection: {e}")
260
+ # Fallback to simple selection
261
+ best_result = results[0]
262
+ best_slide_content = best_result.page_content
263
+ else:
264
+ # Fallback to simple selection if no LLM
265
+ best_result = results[0]
266
+ best_slide_content = best_result.page_content
267
 
268
  # Generate focused LLM answer using the most relevant slide
269
  if self.focused_qa_chain and curriculum_relevance_score > 0:
270
  try:
271
+ print(f"πŸ” Calling LLM with question: {query}")
272
+ print(f"πŸ” LLM available: {self.focused_qa_chain is not None}")
273
+
274
  answer = self.focused_qa_chain.run(question=query, slide_content=best_slide_content)
275
 
276
+ print(f"LLM Response: {answer[:200]}...")
277
+
278
  # Clean up the answer
279
  answer = answer.strip()
280
  if "<|eot_id|>" in answer:
 
289
  # Check if the answer is too short or just repeats the question
290
  if len(answer.strip()) < 50 or answer.lower().startswith("how does that work"):
291
  # Generate a better answer using the slide content
292
+ answer = f"Based on the curriculum slide:\n\n{best_slide_content}\n\nThis slide explains the concept clearly. The curriculum content provides the foundation for understanding this programming concept."
293
 
294
  except Exception as e:
295
  print(f"Error generating focused answer: {e}")
 
345
 
346
  # Get the most relevant slide and its neighboring pages
347
  relevant_slides = []
348
+ if curriculum_relevance_score > 0 and best_result:
349
+ # Use the LLM-selected result
 
350
  filename = best_result.metadata["filename"]
351
  page_number = best_result.metadata["page_number"]
352
 
 
357
  total_pages = len(doc)
358
  doc.close()
359
 
360
+ # Use the LLM-selected page as the target
361
  target_page = page_number
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
 
363
  # Get the target page and neighboring pages (2 before, 2 after)
364
  start_page = max(1, target_page - 2)