Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -70,88 +70,56 @@ class CurriculumChatbot:
|
|
| 70 |
)
|
| 71 |
|
| 72 |
def _setup_llm(self):
|
| 73 |
-
|
| 74 |
-
self.llm = None
|
| 75 |
-
self.qa_chain = None
|
| 76 |
-
self.slide_selection_chain = None
|
| 77 |
-
self.focused_qa_chain = None
|
| 78 |
-
|
| 79 |
try:
|
| 80 |
-
#
|
| 81 |
-
# Llama 3.1 8B is quite large and slow - let's use a smaller model
|
| 82 |
-
model_name = "microsoft/DialoGPT-medium" # Much faster, smaller model
|
| 83 |
-
|
| 84 |
-
# Get token from secrets
|
| 85 |
-
import os
|
| 86 |
-
token = os.environ.get("IW_Token")
|
| 87 |
-
if not token:
|
| 88 |
-
raise ValueError("IW_Token not found in environment variables")
|
| 89 |
-
|
| 90 |
pipe = pipeline(
|
| 91 |
"text-generation",
|
| 92 |
-
model=
|
| 93 |
-
|
| 94 |
-
|
|
|
|
| 95 |
do_sample=True,
|
|
|
|
| 96 |
top_p=0.9,
|
| 97 |
-
repetition_penalty=1.1
|
| 98 |
-
device_map="auto" if torch.cuda.is_available() else None,
|
| 99 |
-
token=token,
|
| 100 |
-
# Performance optimizations
|
| 101 |
-
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
|
| 102 |
)
|
|
|
|
| 103 |
self.llm = HuggingFacePipeline(pipeline=pipe)
|
| 104 |
|
| 105 |
-
#
|
| 106 |
-
qa_template = """Answer
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
-
|
| 109 |
|
| 110 |
-
Provide a
|
| 111 |
|
| 112 |
-
self.
|
| 113 |
input_variables=["question", "filled_context"],
|
| 114 |
template=qa_template
|
| 115 |
-
)
|
| 116 |
-
self.qa_chain = self.qa_prompt | self.llm
|
| 117 |
|
| 118 |
-
#
|
| 119 |
-
|
| 120 |
|
| 121 |
-
|
| 122 |
-
{
|
| 123 |
-
|
| 124 |
-
Please select the most relevant slide (filename.pdf - Page X) that would best help explain this concept to the student. Choose the slide that has the most detailed and relevant content for their question."""
|
| 125 |
-
|
| 126 |
-
self.slide_selection_prompt = PromptTemplate(
|
| 127 |
-
input_variables=["question", "slide_contents"],
|
| 128 |
-
template=slide_selection_template
|
| 129 |
-
)
|
| 130 |
-
self.slide_selection_chain = self.slide_selection_prompt | self.llm
|
| 131 |
-
|
| 132 |
-
# Warm and detailed focused QA template
|
| 133 |
-
focused_qa_template = """Answer this question: {question}
|
| 134 |
|
| 135 |
-
|
| 136 |
|
| 137 |
-
Provide a
|
| 138 |
|
| 139 |
-
self.
|
| 140 |
input_variables=["question", "slide_content"],
|
| 141 |
template=focused_qa_template
|
| 142 |
-
)
|
| 143 |
-
self.focused_qa_chain = self.focused_qa_prompt | self.llm
|
| 144 |
|
| 145 |
-
print("✅
|
| 146 |
-
print(f"🔍 LLM object: {self.llm}")
|
| 147 |
-
print(f"🔍 Focused QA chain: {self.focused_qa_chain}")
|
| 148 |
except Exception as e:
|
| 149 |
-
print(f"Warning: Could not load
|
| 150 |
-
print("Falling back to basic search mode...")
|
| 151 |
-
self.llm = None
|
| 152 |
-
self.qa_chain = None
|
| 153 |
-
self.slide_selection_chain = None
|
| 154 |
-
self.focused_qa_chain = None
|
| 155 |
|
| 156 |
def get_pdf_page_image(self, pdf_path, page_num):
|
| 157 |
try:
|
|
@@ -191,72 +159,194 @@ Provide a helpful, friendly answer."""
|
|
| 191 |
return "\n".join(slides_text)
|
| 192 |
|
| 193 |
def chat(self, query):
|
| 194 |
-
"""
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
results = self.vector_db.similarity_search(query, k=3)
|
| 198 |
-
|
| 199 |
-
if not results:
|
| 200 |
-
return "I couldn't find relevant content in the curriculum for this question.", None, None, []
|
| 201 |
|
| 202 |
-
#
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
|
| 208 |
-
#
|
| 209 |
-
|
| 210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
|
| 212 |
-
|
|
|
|
| 213 |
try:
|
| 214 |
-
|
| 215 |
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
|
| 221 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
|
| 223 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
answer = answer.strip()
|
| 225 |
if "<|eot_id|>" in answer:
|
| 226 |
answer = answer.split("<|eot_id|>")[-1].strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
|
| 228 |
except Exception as e:
|
| 229 |
print(f"Error generating answer: {e}")
|
| 230 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
else:
|
| 232 |
-
#
|
| 233 |
-
|
|
|
|
|
|
|
|
|
|
| 234 |
|
| 235 |
-
#
|
| 236 |
relevant_slides = []
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
doc = fitz.open(pdf_path)
|
| 243 |
-
total_pages = len(doc)
|
| 244 |
-
doc.close()
|
| 245 |
|
| 246 |
-
# Get the
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
|
| 259 |
-
return answer,
|
| 260 |
|
| 261 |
# --- Gradio UI ---
|
| 262 |
chatbot = CurriculumChatbot(fast_mode=False) # Enable AI mode by default
|
|
|
|
| 70 |
)
|
| 71 |
|
| 72 |
def _setup_llm(self):
    """Build the HuggingFace text-generation pipeline and the two QA chains.

    On success ``self.llm``, ``self.qa_chain`` and ``self.focused_qa_chain``
    are populated together. On any failure the error is printed and all
    three attributes are left as ``None``, so callers can test them for
    truthiness instead of hitting ``AttributeError``.
    """
    model_name = "microsoft/DialoGPT-medium"

    # Define the attributes up front so they exist even when loading fails.
    self.llm = None
    self.qa_chain = None
    self.focused_qa_chain = None

    try:
        # Use half precision and automatic device placement only when a CUDA
        # device is present; otherwise load in float32 on the default device.
        use_cuda = torch.cuda.is_available()
        pipe = pipeline(
            "text-generation",
            model=model_name,
            torch_dtype=torch.float16 if use_cuda else torch.float32,
            device_map="auto" if use_cuda else None,
            max_length=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
        )
        llm = HuggingFacePipeline(pipeline=pipe)

        # General QA prompt: answers from several curriculum excerpts.
        qa_template = """You are a helpful programming tutor. Answer the following question based on the curriculum content provided.

Curriculum Content:
{filled_context}

Question: {question}

Provide a clear, educational answer explaining the concept:"""

        qa_chain = LLMChain(llm=llm, prompt=PromptTemplate(
            input_variables=["question", "filled_context"],
            template=qa_template,
        ))

        # Focused QA prompt: answers from a single slide's text.
        focused_qa_template = """You are a helpful programming tutor. Answer the question based on the specific slide content provided.

Slide Content:
{slide_content}

Question: {question}

Provide a clear, educational answer based on this slide:"""

        focused_qa_chain = LLMChain(llm=llm, prompt=PromptTemplate(
            input_variables=["question", "slide_content"],
            template=focused_qa_template,
        ))
    except Exception as e:
        # Best-effort: report the failure and keep the None placeholders.
        print(f"Warning: Could not load {model_name}: {e}")
    else:
        # Publish everything at once so a partial failure never leaves a
        # half-configured object behind.
        self.llm = llm
        self.qa_chain = qa_chain
        self.focused_qa_chain = focused_qa_chain
        print(f"✅ {model_name} loaded successfully!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
def get_pdf_page_image(self, pdf_path, page_num):
|
| 125 |
try:
|
|
|
|
| 159 |
return "\n".join(slides_text)
|
| 160 |
|
| 161 |
def chat(self, query):
    """Answer ``query`` from the curriculum and pick slides to display.

    The vector store is searched for the query, an answer is produced
    (focused chain, then general chain, then raw slide text, depending on
    what is available), and images of the best-matching page plus up to two
    pages on either side are collected.

    Args:
        query: The user's question as a string.

    Returns:
        A 4-tuple ``(answer, recommended_slide, recommended_label,
        relevant_slides)``. ``relevant_slides`` is a list of
        ``(image, label)`` pairs; ``recommended_slide`` and
        ``recommended_label`` describe the page marked "Most Relevant" and
        are ``None`` when no slide could be selected.
    """
    # Trailing instruction lines of the two prompt templates; some models
    # echo them back at the start of the completion.
    focused_tail = "Provide a clear, educational answer based on this slide:"
    general_tail = "Provide a clear, educational answer explaining the concept:"

    def _clean(raw, prompt_tail):
        """Strip end-of-turn markers and echoed prompt fragments."""
        text = raw.strip()
        if "<|eot_id|>" in text:
            text = text.split("<|eot_id|>")[-1].strip()
        for prefix in ("Answer:", prompt_tail):
            if text.startswith(prefix):
                # Slice by the real prefix length rather than a magic number.
                text = text[len(prefix):].strip()
        return text

    # The chains may be missing entirely when model loading failed.
    focused_chain = getattr(self, "focused_qa_chain", None)
    general_chain = getattr(self, "qa_chain", None)

    # Fetch more hits than strictly needed so a better page can be chosen.
    results = self.vector_db.similarity_search(query, k=5) or []

    # Number of hits that actually carry text; zero means "not covered".
    curriculum_relevance_score = sum(
        1 for r in results if r.page_content.strip()
    )

    # Debug output describing the top hits.
    print(f"Query: {query}")
    print(f"Found {len(results)} relevant results:")
    for i, result in enumerate(results[:3]):
        print(f" {i+1}. {result.metadata['filename']} - Page {result.metadata['page_number']}")
        print(f" Content: {result.page_content[:100]}...")

    # Choose the text the answer is based on: the top hit, unless it is
    # nearly empty (e.g. a title slide), in which case the longest hit wins.
    best_slide_content = ""
    if curriculum_relevance_score > 0:
        best_slide_content = results[0].page_content
        if len(best_slide_content.strip()) < 100:
            for result in results[1:]:
                if len(result.page_content.strip()) > len(best_slide_content.strip()):
                    best_slide_content = result.page_content

    slide_fallback = (
        f"Based on the curriculum slide:\n\n{best_slide_content}\n\n"
        "This slide contains the relevant information about your question."
    )

    if focused_chain and curriculum_relevance_score > 0:
        # Preferred path: answer from the single most relevant slide.
        try:
            answer = _clean(
                focused_chain.run(question=query, slide_content=best_slide_content),
                focused_tail,
            )
            # Too short or a canned echo: show the slide text instead.
            if len(answer) < 50 or answer.lower().startswith("how does that work"):
                answer = (
                    f"Based on the curriculum slide:\n\n{best_slide_content}\n\n"
                    "This slide explains the concept clearly."
                )
        except Exception as e:
            print(f"Error generating focused answer: {e}")
            answer = slide_fallback
    elif general_chain:
        # General chain: used when there is no focused chain or no hit.
        try:
            if curriculum_relevance_score > 0:
                context = "\n\n".join(r.page_content for r in results)
                filled_context = f"Curriculum Context:\n{context}\n\nPlease answer based on this curriculum content."
            else:
                filled_context = "Note: This question is not covered in the current curriculum. Please provide a general programming answer."

            answer = _clean(
                general_chain.run(question=query, filled_context=filled_context),
                general_tail,
            )

            if len(answer) < 50:
                if curriculum_relevance_score > 0:
                    answer = f"Based on the curriculum content:\n\n{best_slide_content}\n\nThis slide explains the concept clearly."
                else:
                    answer = "I'm sorry, I couldn't generate a proper answer. Please try rephrasing your question."

            # Flag answers that are not backed by curriculum content.
            if curriculum_relevance_score == 0:
                answer = "💡 **Note: This topic isn't covered in your current curriculum, but here's a helpful answer:**\n\n" + answer
        except Exception as e:
            print(f"Error generating answer: {e}")
            if curriculum_relevance_score > 0:
                answer = slide_fallback
            else:
                answer = "I'm sorry, I couldn't generate an answer at the moment. Please try rephrasing your question."
    else:
        # No LLM available at all.
        if curriculum_relevance_score > 0:
            answer = f"Based on the curriculum slide:\n\n{best_slide_content}\n\n*Note: AI generation is not available, but here's the relevant curriculum content.*"
        else:
            answer = "I couldn't find relevant content in the curriculum for this question. Please try rephrasing or ask about a different programming topic."

    # Slide navigation: the best page plus its neighbours.
    relevant_slides = []
    recommended_slide = None
    recommended_label = None

    if curriculum_relevance_score > 0:
        top_hit = results[0]
        filename = top_hit.metadata["filename"]
        page_number = top_hit.metadata["page_number"]

        if filename in self.pdf_files:
            pdf_path = self.pdf_files[filename]
            # Context manager guarantees the document is closed.
            with fitz.open(pdf_path) as doc:
                total_pages = len(doc)

            # Page number -> extracted text for this PDF (may be absent).
            pages = self.pdf_pages.get(filename, {})

            # Among hits from the same PDF prefer the page with the most
            # text, with a bonus for pages that are more than a title slide.
            target_page = page_number
            best_content_score = 0
            for result in results:
                if result.metadata["filename"] != filename:
                    continue
                page_num = result.metadata["page_number"]
                text_length = len(pages.get(page_num, "").strip())
                content_score = text_length + (500 if text_length > 100 else 0)
                if content_score > best_content_score:
                    best_content_score = content_score
                    target_page = page_num

            # Still looks like a title slide: scan the whole PDF for a page
            # that mentions the query terms and has substantial text.
            if len(pages.get(target_page, "").strip()) < 150:
                query_terms = query.lower().split()
                best_match_score = 0
                for page_num in range(1, total_pages + 1):
                    if page_num not in pages:
                        continue
                    text = pages[page_num].lower()
                    text_length = len(text.strip())
                    match_score = sum(1 for term in query_terms if term in text)
                    if match_score > 0 and text_length > 200:
                        total_score = match_score * 1000 + text_length
                        if total_score > best_match_score:
                            best_match_score = total_score
                            target_page = page_num

            # Target page with up to two pages before and after it.
            start_page = max(1, target_page - 2)
            end_page = min(total_pages, target_page + 2)
            for page_num in range(start_page, end_page + 1):
                img = self.get_pdf_page_image(pdf_path, page_num)
                if not img:
                    continue
                if page_num == target_page:
                    label = f"📌 {filename} - Page {page_num} (Most Relevant)"
                    # Recommend the highlighted page itself, not merely the
                    # first neighbour that happened to render.
                    recommended_slide, recommended_label = img, label
                else:
                    label = f"{filename} - Page {page_num}"
                relevant_slides.append((img, label))

            # Target page failed to render: fall back to the first slide.
            if recommended_slide is None and relevant_slides:
                recommended_slide, recommended_label = relevant_slides[0]

    return answer, recommended_slide, recommended_label, relevant_slides
|
| 350 |
|
| 351 |
# --- Gradio UI ---
|
| 352 |
chatbot = CurriculumChatbot(fast_mode=False) # Enable AI mode by default
|