Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -4,37 +4,51 @@ from pathlib import Path
|
|
| 4 |
import fitz # PyMuPDF
|
| 5 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 6 |
from langchain_community.vectorstores import Chroma
|
| 7 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 8 |
from langchain_huggingface import HuggingFacePipeline
|
| 9 |
from langchain.prompts import PromptTemplate
|
| 10 |
from transformers import pipeline
|
| 11 |
import torch
|
| 12 |
-
import base64
|
| 13 |
from PIL import Image
|
| 14 |
import io
|
| 15 |
import re
|
|
|
|
| 16 |
|
| 17 |
-
# ---
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
self.chunks = []
|
| 25 |
self.chunk_metadata = []
|
| 26 |
self.vector_db = None
|
| 27 |
self.embeddings = None
|
| 28 |
self.llm = None
|
| 29 |
self.qa_chain = None
|
| 30 |
-
self.
|
|
|
|
|
|
|
|
|
|
| 31 |
self._process_pdfs(slides_dir)
|
| 32 |
self._build_vector_db()
|
| 33 |
-
|
|
|
|
|
|
|
| 34 |
|
| 35 |
def _process_pdfs(self, slides_dir):
|
|
|
|
| 36 |
slides_path = Path(slides_dir)
|
| 37 |
pdf_files = list(slides_path.glob("*.pdf"))
|
|
|
|
| 38 |
for pdf_file in pdf_files:
|
| 39 |
self.pdf_files[pdf_file.name] = str(pdf_file)
|
| 40 |
doc = fitz.open(str(pdf_file))
|
|
@@ -46,6 +60,7 @@ class CurriculumChatbot:
|
|
| 46 |
pages[page_num + 1] = text.strip()
|
| 47 |
self.pdf_pages[pdf_file.name] = pages
|
| 48 |
doc.close()
|
|
|
|
| 49 |
# Add each page as a chunk
|
| 50 |
for page_num, text in pages.items():
|
| 51 |
self.chunks.append(text)
|
|
@@ -53,8 +68,11 @@ class CurriculumChatbot:
|
|
| 53 |
"filename": pdf_file.name,
|
| 54 |
"page_number": page_num
|
| 55 |
})
|
|
|
|
|
|
|
| 56 |
|
| 57 |
def _build_vector_db(self):
|
|
|
|
| 58 |
self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
| 59 |
self.vector_db = Chroma.from_texts(
|
| 60 |
texts=self.chunks,
|
|
@@ -62,107 +80,54 @@ class CurriculumChatbot:
|
|
| 62 |
metadatas=self.chunk_metadata,
|
| 63 |
persist_directory="./chroma_db"
|
| 64 |
)
|
|
|
|
| 65 |
|
| 66 |
def _setup_llm(self):
|
|
|
|
| 67 |
try:
|
| 68 |
-
# Use
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
-
# Get token from secrets
|
| 72 |
-
import os
|
| 73 |
token = os.environ.get("IW_Token")
|
| 74 |
if not token:
|
| 75 |
-
|
|
|
|
|
|
|
| 76 |
|
| 77 |
pipe = pipeline(
|
| 78 |
"text-generation",
|
| 79 |
model=model_name,
|
| 80 |
-
max_new_tokens=
|
| 81 |
temperature=0.3,
|
| 82 |
do_sample=True,
|
| 83 |
top_p=0.9,
|
| 84 |
repetition_penalty=1.1,
|
| 85 |
-
device_map="auto" if torch.cuda.is_available() else
|
| 86 |
-
token=token
|
|
|
|
|
|
|
| 87 |
)
|
| 88 |
self.llm = HuggingFacePipeline(pipeline=pipe)
|
| 89 |
|
| 90 |
-
|
| 91 |
-
qa_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
|
| 92 |
-
|
| 93 |
-
You are a helpful AI programming tutor. Answer questions about programming concepts clearly and educationally. If the question is about curriculum content, use the provided context. If not, provide a general programming answer.
|
| 94 |
-
|
| 95 |
-
<|eot_id|><|start_header_id|>user<|end_header_id|>
|
| 96 |
-
|
| 97 |
-
Question: {question}
|
| 98 |
-
|
| 99 |
-
{filled_context}
|
| 100 |
-
|
| 101 |
-
<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
|
| 102 |
-
|
| 103 |
self.qa_prompt = PromptTemplate(
|
| 104 |
-
input_variables=["question", "
|
| 105 |
template=qa_template
|
| 106 |
)
|
| 107 |
self.qa_chain = self.qa_prompt | self.llm
|
| 108 |
|
| 109 |
-
|
| 110 |
-
slide_selection_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
|
| 111 |
-
|
| 112 |
-
You are an AI that analyzes curriculum slides to find the best one for teaching a concept. Return ONLY the filename and page number.
|
| 113 |
-
|
| 114 |
-
<|eot_id|><|start_header_id|>user<|end_header_id|>
|
| 115 |
-
|
| 116 |
-
Question: {question}
|
| 117 |
-
|
| 118 |
-
Here are the top 5 most relevant slides from the curriculum:
|
| 119 |
-
|
| 120 |
-
{slide_contents}
|
| 121 |
-
|
| 122 |
-
Which slide is the BEST for teaching this concept to a student? Consider:
|
| 123 |
-
- Which slide has the most educational content?
|
| 124 |
-
- Which slide explains the concept most clearly?
|
| 125 |
-
- Which slide would be most helpful for learning?
|
| 126 |
-
|
| 127 |
-
Return only: "filename.pdf - Page X"
|
| 128 |
-
|
| 129 |
-
<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
|
| 130 |
-
|
| 131 |
-
self.slide_selection_prompt = PromptTemplate(
|
| 132 |
-
input_variables=["question", "slide_contents"],
|
| 133 |
-
template=slide_selection_template
|
| 134 |
-
)
|
| 135 |
-
self.slide_selection_chain = self.slide_selection_prompt | self.llm
|
| 136 |
-
|
| 137 |
-
# Create focused answer prompt template for Llama 3.1
|
| 138 |
-
focused_qa_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
|
| 139 |
-
|
| 140 |
-
You are a helpful AI programming tutor. Answer questions about programming concepts clearly and educationally based on the provided slide content.
|
| 141 |
-
|
| 142 |
-
<|eot_id|><|start_header_id|>user<|end_header_id|>
|
| 143 |
-
|
| 144 |
-
Slide Content:
|
| 145 |
-
{slide_content}
|
| 146 |
-
|
| 147 |
-
Question: {question}
|
| 148 |
-
|
| 149 |
-
<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
|
| 150 |
-
|
| 151 |
-
self.focused_qa_prompt = PromptTemplate(
|
| 152 |
-
input_variables=["question", "slide_content"],
|
| 153 |
-
template=focused_qa_template
|
| 154 |
-
)
|
| 155 |
-
self.focused_qa_chain = self.focused_qa_prompt | self.llm
|
| 156 |
-
|
| 157 |
-
print("β
Llama 3.1 8B loaded successfully!")
|
| 158 |
except Exception as e:
|
| 159 |
-
print(f"
|
| 160 |
-
print("Falling back to basic search mode...")
|
| 161 |
self.llm = None
|
| 162 |
-
self.qa_chain = None
|
| 163 |
-
self.slide_selection_chain = None
|
| 164 |
|
| 165 |
def get_pdf_page_image(self, pdf_path, page_num):
|
|
|
|
| 166 |
try:
|
| 167 |
doc = fitz.open(pdf_path)
|
| 168 |
if page_num <= len(doc):
|
|
@@ -180,333 +145,198 @@ Question: {question}
|
|
| 180 |
except Exception as e:
|
| 181 |
print(f"Error rendering PDF page: {str(e)}")
|
| 182 |
return None
|
| 183 |
-
|
| 184 |
-
def get_all_slides(self):
|
| 185 |
-
"""Get all available slides for display"""
|
| 186 |
-
all_slides = []
|
| 187 |
-
for filename, pages in self.pdf_pages.items():
|
| 188 |
-
for page_num in pages.keys():
|
| 189 |
-
img = self.get_pdf_page_image(self.pdf_files[filename], page_num)
|
| 190 |
-
if img:
|
| 191 |
-
all_slides.append((img, f"{filename} - Page {page_num}"))
|
| 192 |
-
return all_slides
|
| 193 |
-
|
| 194 |
-
def get_available_slides_text(self):
|
| 195 |
-
"""Get text representation of available slides for LLM"""
|
| 196 |
-
slides_text = []
|
| 197 |
-
for filename, pages in self.pdf_pages.items():
|
| 198 |
-
for page_num in pages.keys():
|
| 199 |
-
slides_text.append(f"{filename} - Page {page_num}")
|
| 200 |
-
return "\n".join(slides_text)
|
| 201 |
|
| 202 |
def chat(self, query):
|
| 203 |
-
"""
|
| 204 |
-
|
| 205 |
-
results = self.vector_db.similarity_search(query, k=5) # Get more results for better selection
|
| 206 |
|
| 207 |
-
# Check
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
curriculum_relevance_score = len([r for r in results if r.page_content.strip()])
|
| 212 |
-
|
| 213 |
-
# Debug: Print what we found
|
| 214 |
-
print(f"Query: {query}")
|
| 215 |
-
print(f"Found {len(results)} relevant results:")
|
| 216 |
-
for i, result in enumerate(results[:3]):
|
| 217 |
-
print(f" {i+1}. {result.metadata['filename']} - Page {result.metadata['page_number']}")
|
| 218 |
-
print(f" Content: {result.page_content[:100]}...")
|
| 219 |
|
| 220 |
-
#
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
# Prepare slide contents for LLM analysis
|
| 226 |
-
slide_contents = []
|
| 227 |
-
for i, result in enumerate(results[:5]): # Top 5 results
|
| 228 |
-
filename = result.metadata["filename"]
|
| 229 |
-
page_num = result.metadata["page_number"]
|
| 230 |
-
content = result.page_content
|
| 231 |
-
slide_contents.append(f"Slide {i+1}: {filename} - Page {page_num}\nContent: {content}\n")
|
| 232 |
-
|
| 233 |
-
slide_contents_text = "\n".join(slide_contents)
|
| 234 |
-
|
| 235 |
-
# Use LLM to select the best slide
|
| 236 |
-
slide_response = self.slide_selection_chain.invoke({
|
| 237 |
-
"question": query,
|
| 238 |
-
"slide_contents": slide_contents_text
|
| 239 |
-
})
|
| 240 |
-
|
| 241 |
-
# Extract filename and page from response
|
| 242 |
-
slide_response = slide_response.strip()
|
| 243 |
-
if "<|eot_id|>" in slide_response:
|
| 244 |
-
slide_response = slide_response.split("<|eot_id|>")[-1].strip()
|
| 245 |
-
|
| 246 |
-
# Parse the response to get filename and page
|
| 247 |
-
match = re.search(r'(.+\.pdf)\s*-\s*Page\s*(\d+)', slide_response)
|
| 248 |
-
if match:
|
| 249 |
-
filename = match.group(1)
|
| 250 |
-
page_num = int(match.group(2))
|
| 251 |
-
|
| 252 |
-
# Find the corresponding result
|
| 253 |
-
for result in results:
|
| 254 |
-
if (result.metadata["filename"] == filename and
|
| 255 |
-
result.metadata["page_number"] == page_num):
|
| 256 |
-
best_result = result
|
| 257 |
-
best_slide_content = result.page_content
|
| 258 |
-
break
|
| 259 |
-
|
| 260 |
-
# If LLM selection failed, fall back to first result
|
| 261 |
-
if not best_result:
|
| 262 |
-
best_result = results[0]
|
| 263 |
-
best_slide_content = results[0].page_content
|
| 264 |
-
else:
|
| 265 |
-
# Fallback to first result if parsing failed
|
| 266 |
-
best_result = results[0]
|
| 267 |
-
best_slide_content = results[0].page_content
|
| 268 |
-
|
| 269 |
-
except Exception as e:
|
| 270 |
-
print(f"Error in LLM slide selection: {e}")
|
| 271 |
-
# Fallback to first result
|
| 272 |
-
best_result = results[0]
|
| 273 |
-
best_slide_content = results[0].page_content
|
| 274 |
else:
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
best_slide_content = results[0].page_content
|
| 279 |
|
| 280 |
-
#
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
}
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
# Clean up the answer
|
| 292 |
-
answer = answer.strip()
|
| 293 |
-
if "<|eot_id|>" in answer:
|
| 294 |
-
answer = answer.split("<|eot_id|>")[-1].strip()
|
| 295 |
-
|
| 296 |
-
# Remove any prompt artifacts
|
| 297 |
-
if answer.startswith("Answer:"):
|
| 298 |
-
answer = answer[7:].strip()
|
| 299 |
-
if answer.startswith("Provide a clear, educational answer based on this slide:"):
|
| 300 |
-
answer = answer[58:].strip()
|
| 301 |
-
|
| 302 |
-
# Check if the answer is too short, just repeats the question, or contains the prompt
|
| 303 |
-
if (len(answer.strip()) < 50 or
|
| 304 |
-
answer.lower().startswith("how does that work") or
|
| 305 |
-
"slide content provided" in answer.lower() or
|
| 306 |
-
"provide a clear" in answer.lower() or
|
| 307 |
-
"answer the question based on" in answer.lower() or
|
| 308 |
-
"slide content:" in answer.lower()):
|
| 309 |
-
|
| 310 |
-
# Generate a proper answer using the slide content
|
| 311 |
-
slide_info = f"π **Slide Reference:** {best_result.metadata['filename']} - Page {best_result.metadata['page_number']}"
|
| 312 |
-
|
| 313 |
-
if "loops" in query.lower():
|
| 314 |
-
answer = f"{slide_info}\n\n**Slide Content:**\n{best_slide_content}\n\n**What are loops for?**\n\nLoops are programming constructs that solve the problem of repetition. As the slide explains, instead of writing hundreds of print statements to count from 1 to 100, loops allow you to accomplish the same task with just a few lines of code.\n\n**Key benefits of loops:**\nβ’ **Efficiency**: Reduce repetitive code\nβ’ **Scalability**: Handle large ranges (1 to 1000+) easily\nβ’ **Maintainability**: Easier to modify and debug\n\n**Types of loops:** The curriculum covers two main types of loops that you'll learn about."
|
| 315 |
-
else:
|
| 316 |
-
answer = f"{slide_info}\n\n**Slide Content:**\n{best_slide_content}\n\nThis slide explains the concept clearly. The content shows how programming constructs help solve real problems efficiently."
|
| 317 |
-
|
| 318 |
-
except Exception as e:
|
| 319 |
-
print(f"Error generating focused answer: {e}")
|
| 320 |
-
# Generate a proper answer using the slide content
|
| 321 |
-
slide_info = f"π **Slide Reference:** {best_result.metadata['filename']} - Page {best_result.metadata['page_number']}"
|
| 322 |
-
|
| 323 |
-
if "loops" in query.lower():
|
| 324 |
-
answer = f"{slide_info}\n\n**Slide Content:**\n{best_slide_content}\n\n**What are loops for?**\n\nLoops are programming constructs that solve the problem of repetition. As the slide explains, instead of writing hundreds of print statements to count from 1 to 100, loops allow you to accomplish the same task with just a few lines of code.\n\n**Key benefits of loops:**\nβ’ **Efficiency**: Reduce repetitive code\nβ’ **Scalability**: Handle large ranges (1 to 1000+) easily\nβ’ **Maintainability**: Easier to modify and debug\n\n**Types of loops:** The curriculum covers two main types of loops that you'll learn about."
|
| 325 |
-
else:
|
| 326 |
-
answer = f"{slide_info}\n\n**Slide Content:**\n{best_slide_content}\n\nThis slide contains the relevant information about your question."
|
| 327 |
-
|
| 328 |
-
elif self.qa_chain:
|
| 329 |
-
# Fallback to general LLM if focused chain fails
|
| 330 |
try:
|
| 331 |
-
if
|
| 332 |
-
context = "
|
| 333 |
-
filled_context = f"Curriculum Context:\n{context}\n\nPlease answer based on this curriculum content."
|
| 334 |
else:
|
| 335 |
-
|
| 336 |
|
| 337 |
-
|
| 338 |
-
"question": query,
|
| 339 |
-
"
|
| 340 |
})
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
if "<|eot_id|>" in answer:
|
| 345 |
-
answer = answer.split("<|eot_id|>")[-1].strip()
|
| 346 |
-
if answer.startswith("Answer:"):
|
| 347 |
-
answer = answer[7:].strip()
|
| 348 |
-
if answer.startswith("Provide a clear, educational answer explaining the concept:"):
|
| 349 |
-
answer = answer[58:].strip()
|
| 350 |
-
|
| 351 |
-
# Check if the answer is too short
|
| 352 |
-
if len(answer.strip()) < 50:
|
| 353 |
-
if curriculum_relevance_score > 0:
|
| 354 |
-
slide_info = f"π **Slide Reference:** {best_result.metadata['filename']} - Page {best_result.metadata['page_number']}"
|
| 355 |
-
answer = f"{slide_info}\n\n**Slide Content:**\n{best_slide_content}\n\nThis slide explains the concept clearly."
|
| 356 |
-
else:
|
| 357 |
-
answer = "I'm sorry, I couldn't generate a proper answer. Please try rephrasing your question."
|
| 358 |
-
|
| 359 |
-
# Add warning if not in curriculum
|
| 360 |
-
if curriculum_relevance_score == 0:
|
| 361 |
-
answer = "β οΈ **Note: This topic is not covered in the current curriculum.**\n\n" + answer
|
| 362 |
-
|
| 363 |
except Exception as e:
|
| 364 |
-
print(f"
|
| 365 |
-
|
| 366 |
-
slide_info = f"π **Slide Reference:** {best_result.metadata['filename']} - Page {best_result.metadata['page_number']}"
|
| 367 |
-
answer = f"{slide_info}\n\n**Slide Content:**\n{best_slide_content}\n\nThis slide contains the relevant information about your question."
|
| 368 |
-
else:
|
| 369 |
-
answer = "I'm sorry, I couldn't generate an answer at the moment. Please try rephrasing your question."
|
| 370 |
else:
|
| 371 |
-
#
|
| 372 |
-
if
|
| 373 |
-
|
| 374 |
-
answer = f"{slide_info}\n\n**Slide Content:**\n{best_slide_content}\n\n*Note: AI generation is not available, but here's the relevant curriculum content.*"
|
| 375 |
else:
|
| 376 |
-
answer = "
|
| 377 |
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
# Get the specific PDF and its pages
|
| 387 |
-
if filename in self.pdf_files:
|
| 388 |
-
pdf_path = self.pdf_files[filename]
|
| 389 |
-
doc = fitz.open(pdf_path)
|
| 390 |
-
total_pages = len(doc)
|
| 391 |
-
doc.close()
|
| 392 |
-
|
| 393 |
-
# Find the best content page by analyzing all results
|
| 394 |
-
target_page = page_number
|
| 395 |
-
best_content_score = 0
|
| 396 |
-
|
| 397 |
-
# Check all search results for the best content page
|
| 398 |
-
for result in results:
|
| 399 |
-
if result.metadata["filename"] == filename:
|
| 400 |
-
page_num = result.metadata["page_number"]
|
| 401 |
-
page_text = self.pdf_pages[filename].get(page_num, "")
|
| 402 |
-
text_length = len(page_text.strip())
|
| 403 |
-
|
| 404 |
-
# Score based on text length and relevance
|
| 405 |
-
content_score = text_length
|
| 406 |
-
if text_length > 100: # Prefer content pages over title slides
|
| 407 |
-
content_score += 500
|
| 408 |
-
|
| 409 |
-
if content_score > best_content_score:
|
| 410 |
-
best_content_score = content_score
|
| 411 |
-
target_page = page_num
|
| 412 |
-
|
| 413 |
-
# If we still have a title slide, look for better content in the same PDF
|
| 414 |
-
page_text = self.pdf_pages[filename].get(target_page, "")
|
| 415 |
-
if len(page_text.strip()) < 150: # Still a title slide
|
| 416 |
-
# Search for pages with the query terms
|
| 417 |
-
query_terms = query.lower().split()
|
| 418 |
-
best_match_score = 0
|
| 419 |
-
|
| 420 |
-
for page_num in range(1, total_pages + 1):
|
| 421 |
-
if page_num in self.pdf_pages[filename]:
|
| 422 |
-
text = self.pdf_pages[filename][page_num].lower()
|
| 423 |
-
text_length = len(text.strip())
|
| 424 |
-
|
| 425 |
-
# Count how many query terms appear in this page
|
| 426 |
-
match_score = sum(1 for term in query_terms if term in text)
|
| 427 |
-
|
| 428 |
-
# Prefer pages with both query terms and good content
|
| 429 |
-
if match_score > 0 and text_length > 200:
|
| 430 |
-
total_score = match_score * 1000 + text_length
|
| 431 |
-
if total_score > best_match_score:
|
| 432 |
-
best_match_score = total_score
|
| 433 |
-
target_page = page_num
|
| 434 |
-
|
| 435 |
-
# Get the target page and neighboring pages (2 before, 2 after)
|
| 436 |
-
start_page = max(1, target_page - 2)
|
| 437 |
-
end_page = min(total_pages, target_page + 2)
|
| 438 |
-
|
| 439 |
-
for page_num in range(start_page, end_page + 1):
|
| 440 |
-
img = self.get_pdf_page_image(pdf_path, page_num)
|
| 441 |
-
if img:
|
| 442 |
-
if page_num == target_page:
|
| 443 |
-
# Highlight the most relevant page
|
| 444 |
-
label = f"π {filename} - Page {page_num} (Most Relevant)"
|
| 445 |
-
else:
|
| 446 |
-
label = f"{filename} - Page {page_num}"
|
| 447 |
-
relevant_slides.append((img, label))
|
| 448 |
-
|
| 449 |
-
recommended_slide = relevant_slides[0][0] if relevant_slides else None
|
| 450 |
-
recommended_label = relevant_slides[0][1] if relevant_slides else None
|
| 451 |
-
else:
|
| 452 |
-
# Fallback if filename not found
|
| 453 |
-
recommended_slide = None
|
| 454 |
-
recommended_label = None
|
| 455 |
-
else:
|
| 456 |
-
# If no curriculum content, show a few slides from different PDFs
|
| 457 |
relevant_slides = []
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
|
| 466 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 467 |
|
| 468 |
-
|
| 469 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 470 |
|
| 471 |
-
def gradio_chat(query):
|
| 472 |
-
|
|
|
|
| 473 |
|
| 474 |
-
#
|
| 475 |
-
|
|
|
|
|
|
|
| 476 |
|
| 477 |
-
|
|
|
|
| 478 |
|
| 479 |
with gr.Blocks(title="Inclusive World Curriculum Assistant", theme=gr.themes.Soft()) as demo:
|
| 480 |
-
gr.Markdown("# π€ Inclusive World Curriculum Assistant\
|
| 481 |
|
| 482 |
with gr.Row():
|
| 483 |
-
#
|
| 484 |
with gr.Column(scale=1):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 485 |
gr.Markdown("### π¬ Chatbot")
|
| 486 |
-
gr.Markdown("**What questions do you have?**")
|
| 487 |
question = gr.Textbox(
|
| 488 |
label="Question Input",
|
| 489 |
-
placeholder="e.g., What are for loops? How do variables work?
|
| 490 |
lines=3
|
| 491 |
)
|
| 492 |
submit = gr.Button("π€ Ask AI", variant="primary", size="lg")
|
| 493 |
-
answer = gr.Markdown(label="
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
|
| 507 |
# Event handlers
|
| 508 |
-
submit.click(
|
| 509 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 510 |
|
| 511 |
if __name__ == "__main__":
|
| 512 |
demo.launch()
|
|
|
|
| 4 |
import fitz # PyMuPDF
|
| 5 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 6 |
from langchain_community.vectorstores import Chroma
|
|
|
|
| 7 |
from langchain_huggingface import HuggingFacePipeline
|
| 8 |
from langchain.prompts import PromptTemplate
|
| 9 |
from transformers import pipeline
|
| 10 |
import torch
|
|
|
|
| 11 |
from PIL import Image
|
| 12 |
import io
|
| 13 |
import re
|
| 14 |
+
import time
|
| 15 |
|
| 16 |
+
# --- Modular Curriculum Assistant for HuggingFace Spaces ---
|
| 17 |
|
| 18 |
+
class ModularCurriculumChatbot:
|
| 19 |
+
def __init__(self, slides_dir="Slides", mode="text_only"):
|
| 20 |
+
"""
|
| 21 |
+
Modes optimized for HuggingFace:
|
| 22 |
+
- "text_only": No PDF rendering (fastest for HuggingFace)
|
| 23 |
+
- "search_only": Only vector search, no LLM
|
| 24 |
+
- "llm_only": Only LLM processing, no search
|
| 25 |
+
- "no_cache": No response caching
|
| 26 |
+
- "fast_llm": Use smaller/faster model
|
| 27 |
+
- "full": Complete functionality (slowest)
|
| 28 |
+
"""
|
| 29 |
+
self.pdf_pages = {}
|
| 30 |
+
self.pdf_files = {}
|
| 31 |
self.chunks = []
|
| 32 |
self.chunk_metadata = []
|
| 33 |
self.vector_db = None
|
| 34 |
self.embeddings = None
|
| 35 |
self.llm = None
|
| 36 |
self.qa_chain = None
|
| 37 |
+
self.response_cache = {}
|
| 38 |
+
self.mode = mode
|
| 39 |
+
|
| 40 |
+
print(f"π Initializing in {mode} mode for HuggingFace...")
|
| 41 |
self._process_pdfs(slides_dir)
|
| 42 |
self._build_vector_db()
|
| 43 |
+
if "llm" in mode or mode == "full":
|
| 44 |
+
self._setup_llm()
|
| 45 |
+
print(f"β
{mode} mode ready!")
|
| 46 |
|
| 47 |
def _process_pdfs(self, slides_dir):
|
| 48 |
+
start_time = time.time()
|
| 49 |
slides_path = Path(slides_dir)
|
| 50 |
pdf_files = list(slides_path.glob("*.pdf"))
|
| 51 |
+
|
| 52 |
for pdf_file in pdf_files:
|
| 53 |
self.pdf_files[pdf_file.name] = str(pdf_file)
|
| 54 |
doc = fitz.open(str(pdf_file))
|
|
|
|
| 60 |
pages[page_num + 1] = text.strip()
|
| 61 |
self.pdf_pages[pdf_file.name] = pages
|
| 62 |
doc.close()
|
| 63 |
+
|
| 64 |
# Add each page as a chunk
|
| 65 |
for page_num, text in pages.items():
|
| 66 |
self.chunks.append(text)
|
|
|
|
| 68 |
"filename": pdf_file.name,
|
| 69 |
"page_number": page_num
|
| 70 |
})
|
| 71 |
+
|
| 72 |
+
print(f"π PDF processing: {time.time() - start_time:.2f}s")
|
| 73 |
|
| 74 |
def _build_vector_db(self):
|
| 75 |
+
start_time = time.time()
|
| 76 |
self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
| 77 |
self.vector_db = Chroma.from_texts(
|
| 78 |
texts=self.chunks,
|
|
|
|
| 80 |
metadatas=self.chunk_metadata,
|
| 81 |
persist_directory="./chroma_db"
|
| 82 |
)
|
| 83 |
+
print(f"π Vector DB build: {time.time() - start_time:.2f}s")
|
| 84 |
|
| 85 |
def _setup_llm(self):
|
| 86 |
+
start_time = time.time()
|
| 87 |
try:
|
| 88 |
+
# Use different models based on mode
|
| 89 |
+
if self.mode == "fast_llm":
|
| 90 |
+
model_name = "microsoft/DialoGPT-small" # Fastest for HuggingFace
|
| 91 |
+
max_tokens = 30
|
| 92 |
+
else:
|
| 93 |
+
model_name = "microsoft/DialoGPT-medium" # Original
|
| 94 |
+
max_tokens = 100
|
| 95 |
|
|
|
|
|
|
|
| 96 |
token = os.environ.get("IW_Token")
|
| 97 |
if not token:
|
| 98 |
+
print("β οΈ IW_Token not found - LLM disabled")
|
| 99 |
+
self.llm = None
|
| 100 |
+
return
|
| 101 |
|
| 102 |
pipe = pipeline(
|
| 103 |
"text-generation",
|
| 104 |
model=model_name,
|
| 105 |
+
max_new_tokens=max_tokens,
|
| 106 |
temperature=0.3,
|
| 107 |
do_sample=True,
|
| 108 |
top_p=0.9,
|
| 109 |
repetition_penalty=1.1,
|
| 110 |
+
device_map="auto" if torch.cuda.is_available() else "cpu",
|
| 111 |
+
token=token,
|
| 112 |
+
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
|
| 113 |
+
low_cpu_mem_usage=True
|
| 114 |
)
|
| 115 |
self.llm = HuggingFacePipeline(pipeline=pipe)
|
| 116 |
|
| 117 |
+
qa_template = """Q: {question}\nContext: {context}\nA:"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
self.qa_prompt = PromptTemplate(
|
| 119 |
+
input_variables=["question", "context"],
|
| 120 |
template=qa_template
|
| 121 |
)
|
| 122 |
self.qa_chain = self.qa_prompt | self.llm
|
| 123 |
|
| 124 |
+
print(f"π€ LLM setup ({model_name}): {time.time() - start_time:.2f}s")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
except Exception as e:
|
| 126 |
+
print(f"β LLM setup failed: {e}")
|
|
|
|
| 127 |
self.llm = None
|
|
|
|
|
|
|
| 128 |
|
| 129 |
def get_pdf_page_image(self, pdf_path, page_num):
|
| 130 |
+
"""PDF rendering - potentially slow operation"""
|
| 131 |
try:
|
| 132 |
doc = fitz.open(pdf_path)
|
| 133 |
if page_num <= len(doc):
|
|
|
|
| 145 |
except Exception as e:
|
| 146 |
print(f"Error rendering PDF page: {str(e)}")
|
| 147 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
|
| 149 |
def chat(self, query):
    """Answer *query* and return ``(answer, relevant_slides)``.

    Steps, each timed and printed:

    1. Cache lookup (skipped in "no_cache" mode); a hit is returned as stored.
    2. Vector search for the two closest chunks (skipped in "llm_only" mode).
    3. Answer generation. "search_only" echoes the top hit. "llm_only",
       "fast_llm" and "full" call ``self.qa_chain`` when ``self.llm`` is
       set. Every other case ("text_only", "no_cache", or no LLM loaded)
       falls back to a plain excerpt of the top hit.
    4. Slide rendering through ``_get_slides_for_file`` (skipped in
       "text_only" mode, which appends a note to the answer instead).
    5. Cache store, keeping at most 30 entries and dropping the oldest.

    Args:
        query: The user's question; used verbatim as the cache key.

    Returns:
        A tuple of the answer string and a list of slides as returned by
        ``_get_slides_for_file`` (empty in "text_only" mode).
    """
    total_start = time.time()

    # Check cache (unless no_cache mode)
    caching = self.mode != "no_cache"
    if caching and query in self.response_cache:
        print("✅ Using cached response")
        return self.response_cache[query]

    # Step 1: Vector Search
    search_start = time.time()
    if self.mode == "llm_only":
        results = []
    else:
        results = self.vector_db.similarity_search(query, k=2)
    # Measured in every mode so the breakdown print at the end can never
    # hit an unbound name when the search is skipped.
    search_time = time.time() - search_start
    print(f"π Search time: {search_time:.2f}s")

    # Step 2: LLM Processing
    llm_start = time.time()
    if self.mode == "search_only":
        # Search-only mode
        if results:
            best_result = results[0]
            answer = f"Search result: {best_result.page_content[:200]}..."
        else:
            answer = "No relevant content found"
    elif self.llm and (self.mode in ["llm_only", "fast_llm", "full"]):
        # LLM processing
        try:
            if results:
                context = f"Content: {results[0].page_content[:300]}"
            else:
                context = "No relevant content found"

            response = self.qa_chain.invoke({
                "question": query,
                "context": context
            })
            answer = response.strip()
            if answer.startswith("A:"):
                answer = answer[2:].strip()
        except Exception as e:
            # Keep the UI responsive even if generation fails.
            print(f"LLM error: {e}")
            answer = "LLM processing failed"
    else:
        # No LLM available
        if results:
            answer = f"Text result: {results[0].page_content[:200]}..."
        else:
            answer = "No relevant content found"

    llm_time = time.time() - llm_start
    print(f"π€ LLM time: {llm_time:.2f}s")

    # Step 3: PDF Rendering (potentially slow)
    render_start = time.time()
    if self.mode == "text_only":
        # Text-only mode - no image rendering (fastest for HuggingFace)
        relevant_slides = []
        answer += "\n\n[Text-only mode: No images rendered for speed]"
    else:
        # Full image rendering
        relevant_slides = self._get_slides_for_file(
            results[0].metadata["filename"] if results else None,
            results[0].metadata["page_number"] if results else 1
        )

    render_time = time.time() - render_start
    print(f"πΌοΈ Render time: {render_time:.2f}s")

    # Cache response (unless no_cache mode)
    if caching:
        self.response_cache[query] = (answer, relevant_slides)
        if len(self.response_cache) > 30:
            # dicts preserve insertion order, so the first key is the oldest.
            oldest_key = next(iter(self.response_cache))
            del self.response_cache[oldest_key]

    total_time = time.time() - total_start
    print(f"β±οΈ Total time: {total_time:.2f}s")
    print(f"π Breakdown - Search: {search_time:.2f}s, LLM: {llm_time:.2f}s, Render: {render_time:.2f}s")

    return answer, relevant_slides
|
| 233 |
|
| 234 |
+
def _get_slides_for_file(self, filename, target_page):
|
| 235 |
+
"""Get slides for display"""
|
| 236 |
+
slides = []
|
| 237 |
+
|
| 238 |
+
if not filename or filename not in self.pdf_files:
|
| 239 |
+
return slides
|
| 240 |
+
|
| 241 |
+
pdf_path = self.pdf_files[filename]
|
| 242 |
+
doc = fitz.open(pdf_path)
|
| 243 |
+
total_pages = len(doc)
|
| 244 |
+
doc.close()
|
| 245 |
+
|
| 246 |
+
# Get target page and 1 page before/after
|
| 247 |
+
start_page = max(1, target_page - 1)
|
| 248 |
+
end_page = min(total_pages, target_page + 1)
|
| 249 |
+
|
| 250 |
+
for page_num in range(start_page, end_page + 1):
|
| 251 |
+
img = self.get_pdf_page_image(pdf_path, page_num)
|
| 252 |
+
if img:
|
| 253 |
+
if page_num == target_page:
|
| 254 |
+
label = f"π {filename} - Page {page_num} (Most Relevant)"
|
| 255 |
+
else:
|
| 256 |
+
label = f"{filename} - Page {page_num}"
|
| 257 |
+
slides.append((img, label))
|
| 258 |
+
|
| 259 |
+
return slides
|
| 260 |
+
|
| 261 |
+
# --- Gradio UI with Mode Selection for HuggingFace ---
|
| 262 |
+
|
| 263 |
+
# Initialize chatbot with text_only mode (fastest for HuggingFace)
|
| 264 |
+
# Built eagerly at import time: the constructor parses the PDFs and builds the
# vector DB. "text_only" contains neither "llm" nor equals "full", so no LLM is
# loaded for this default instance.
chatbot = ModularCurriculumChatbot(mode="text_only")
|
| 265 |
|
| 266 |
+
def gradio_chat(query, mode):
    """Gradio callback: answer *query* using the chatbot for *mode*.

    Args:
        query: Text from the question box.
        mode: Mode name selected in the dropdown.

    Returns:
        ``(answer, relevant_slides)`` as produced by
        ``ModularCurriculumChatbot.chat``. For a blank query, a short prompt
        and an empty slide list are returned instead.
    """
    global chatbot

    # Nothing to answer: return before the (expensive) mode switch below and
    # before an empty string is sent to the search and stored in the cache.
    if not query or not query.strip():
        return "Please enter a question.", []

    # Reinitialize chatbot if mode changed. This re-runs the whole
    # constructor (PDF parsing, vector DB build, optional LLM load).
    if chatbot.mode != mode:
        print(f"π Switching to {mode} mode...")
        chatbot = ModularCurriculumChatbot(mode=mode)

    answer, relevant_slides = chatbot.chat(query)
    return answer, relevant_slides
|
| 277 |
|
| 278 |
# Page layout: a row with the mode picker next to the chat column, a second row
# for the slide gallery, then the event wiring. Nesting is reconstructed from
# the section comments, since the scraped view carries no indentation.
with gr.Blocks(title="Inclusive World Curriculum Assistant", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# π€ Inclusive World Curriculum Assistant\nPerformance-optimized for HuggingFace Spaces")

    with gr.Row():
        # Mode Selection
        with gr.Column(scale=1):
            gr.Markdown("### βοΈ Performance Mode")
            mode_choices = [
                "text_only",  # Fastest for HuggingFace
                "search_only",
                "fast_llm",
                "llm_only",
                "no_cache",
                "full"
            ]
            mode_select = gr.Dropdown(
                choices=mode_choices,
                value="text_only",  # Default to fastest mode
                label="Select Mode",
                info="text_only is fastest for HuggingFace"
            )
            gr.Markdown("""
**Recommended for HuggingFace:**
- **text_only**: Fastest (no PDF rendering)
- **search_only**: Vector search only
- **fast_llm**: Small model + search
- **full**: Complete (slowest)
""")

        # Chat Interface
        with gr.Column(scale=2):
            gr.Markdown("### π¬ Chatbot")
            question = gr.Textbox(
                label="Question Input",
                placeholder="e.g., What are for loops? How do variables work?",
                lines=3
            )
            submit = gr.Button("π€ Ask AI", variant="primary", size="lg")
            answer = gr.Markdown(label="Response")

    # Slides Display
    with gr.Row():
        gr.Markdown("### π Slides (if applicable)")
        gallery = gr.Gallery(
            label="Curriculum Slides",
            columns=1,
            rows=3,
            height="400px",
            object_fit="contain",
            show_label=False
        )

    # Event handlers: the button click and pressing Enter in the textbox run
    # the same callback with the same inputs and outputs.
    for register in (submit.click, question.submit):
        register(
            fn=gradio_chat,
            inputs=[question, mode_select],
            outputs=[answer, gallery]
        )
|
| 340 |
|
| 341 |
# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|