IW2025 committed on
Commit
799c93c
·
verified ·
1 Parent(s): 55e901b

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +209 -379
app.py CHANGED
@@ -4,37 +4,51 @@ from pathlib import Path
4
  import fitz # PyMuPDF
5
  from langchain_community.embeddings import HuggingFaceEmbeddings
6
  from langchain_community.vectorstores import Chroma
7
- from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  from langchain_huggingface import HuggingFacePipeline
9
  from langchain.prompts import PromptTemplate
10
  from transformers import pipeline
11
  import torch
12
- import base64
13
  from PIL import Image
14
  import io
15
  import re
 
16
 
17
- # --- Minimal PDF Search & Display App ---
18
 
19
- # 1. Preprocess PDFs and build vector DB
20
- class CurriculumChatbot:
21
- def __init__(self, slides_dir="Slides"):
22
- self.pdf_pages = {} # {filename: {page_num: text}}
23
- self.pdf_files = {} # {filename: path}
 
 
 
 
 
 
 
 
24
  self.chunks = []
25
  self.chunk_metadata = []
26
  self.vector_db = None
27
  self.embeddings = None
28
  self.llm = None
29
  self.qa_chain = None
30
- self.slide_selection_chain = None
 
 
 
31
  self._process_pdfs(slides_dir)
32
  self._build_vector_db()
33
- self._setup_llm()
 
 
34
 
35
  def _process_pdfs(self, slides_dir):
 
36
  slides_path = Path(slides_dir)
37
  pdf_files = list(slides_path.glob("*.pdf"))
 
38
  for pdf_file in pdf_files:
39
  self.pdf_files[pdf_file.name] = str(pdf_file)
40
  doc = fitz.open(str(pdf_file))
@@ -46,6 +60,7 @@ class CurriculumChatbot:
46
  pages[page_num + 1] = text.strip()
47
  self.pdf_pages[pdf_file.name] = pages
48
  doc.close()
 
49
  # Add each page as a chunk
50
  for page_num, text in pages.items():
51
  self.chunks.append(text)
@@ -53,8 +68,11 @@ class CurriculumChatbot:
53
  "filename": pdf_file.name,
54
  "page_number": page_num
55
  })
 
 
56
 
57
  def _build_vector_db(self):
 
58
  self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
59
  self.vector_db = Chroma.from_texts(
60
  texts=self.chunks,
@@ -62,107 +80,54 @@ class CurriculumChatbot:
62
  metadatas=self.chunk_metadata,
63
  persist_directory="./chroma_db"
64
  )
 
65
 
66
  def _setup_llm(self):
 
67
  try:
68
- # Use Llama 3.1 8B with authentication token from secrets
69
- model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 
 
 
 
 
70
 
71
- # Get token from secrets
72
- import os
73
  token = os.environ.get("IW_Token")
74
  if not token:
75
- raise ValueError("IW_Token not found in environment variables")
 
 
76
 
77
  pipe = pipeline(
78
  "text-generation",
79
  model=model_name,
80
- max_new_tokens=200,
81
  temperature=0.3,
82
  do_sample=True,
83
  top_p=0.9,
84
  repetition_penalty=1.1,
85
- device_map="auto" if torch.cuda.is_available() else None,
86
- token=token
 
 
87
  )
88
  self.llm = HuggingFacePipeline(pipeline=pipe)
89
 
90
- # Create QA prompt template for Llama 3.1
91
- qa_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
92
-
93
- You are a helpful AI programming tutor. Answer questions about programming concepts clearly and educationally. If the question is about curriculum content, use the provided context. If not, provide a general programming answer.
94
-
95
- <|eot_id|><|start_header_id|>user<|end_header_id|>
96
-
97
- Question: {question}
98
-
99
- {filled_context}
100
-
101
- <|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
102
-
103
  self.qa_prompt = PromptTemplate(
104
- input_variables=["question", "filled_context"],
105
  template=qa_template
106
  )
107
  self.qa_chain = self.qa_prompt | self.llm
108
 
109
- # Create slide selection prompt template for Llama 3.1
110
- slide_selection_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
111
-
112
- You are an AI that analyzes curriculum slides to find the best one for teaching a concept. Return ONLY the filename and page number.
113
-
114
- <|eot_id|><|start_header_id|>user<|end_header_id|>
115
-
116
- Question: {question}
117
-
118
- Here are the top 5 most relevant slides from the curriculum:
119
-
120
- {slide_contents}
121
-
122
- Which slide is the BEST for teaching this concept to a student? Consider:
123
- - Which slide has the most educational content?
124
- - Which slide explains the concept most clearly?
125
- - Which slide would be most helpful for learning?
126
-
127
- Return only: "filename.pdf - Page X"
128
-
129
- <|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
130
-
131
- self.slide_selection_prompt = PromptTemplate(
132
- input_variables=["question", "slide_contents"],
133
- template=slide_selection_template
134
- )
135
- self.slide_selection_chain = self.slide_selection_prompt | self.llm
136
-
137
- # Create focused answer prompt template for Llama 3.1
138
- focused_qa_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
139
-
140
- You are a helpful AI programming tutor. Answer questions about programming concepts clearly and educationally based on the provided slide content.
141
-
142
- <|eot_id|><|start_header_id|>user<|end_header_id|>
143
-
144
- Slide Content:
145
- {slide_content}
146
-
147
- Question: {question}
148
-
149
- <|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
150
-
151
- self.focused_qa_prompt = PromptTemplate(
152
- input_variables=["question", "slide_content"],
153
- template=focused_qa_template
154
- )
155
- self.focused_qa_chain = self.focused_qa_prompt | self.llm
156
-
157
- print("βœ… Llama 3.1 8B loaded successfully!")
158
  except Exception as e:
159
- print(f"Warning: Could not load Llama 3.1 8B: {e}")
160
- print("Falling back to basic search mode...")
161
  self.llm = None
162
- self.qa_chain = None
163
- self.slide_selection_chain = None
164
 
165
  def get_pdf_page_image(self, pdf_path, page_num):
 
166
  try:
167
  doc = fitz.open(pdf_path)
168
  if page_num <= len(doc):
@@ -180,333 +145,198 @@ Question: {question}
180
  except Exception as e:
181
  print(f"Error rendering PDF page: {str(e)}")
182
  return None
183
-
184
- def get_all_slides(self):
185
- """Get all available slides for display"""
186
- all_slides = []
187
- for filename, pages in self.pdf_pages.items():
188
- for page_num in pages.keys():
189
- img = self.get_pdf_page_image(self.pdf_files[filename], page_num)
190
- if img:
191
- all_slides.append((img, f"{filename} - Page {page_num}"))
192
- return all_slides
193
-
194
- def get_available_slides_text(self):
195
- """Get text representation of available slides for LLM"""
196
- slides_text = []
197
- for filename, pages in self.pdf_pages.items():
198
- for page_num in pages.keys():
199
- slides_text.append(f"{filename} - Page {page_num}")
200
- return "\n".join(slides_text)
201
 
202
  def chat(self, query):
203
- """Comprehensive chat function with LLM answers and slide navigation"""
204
- # First, try to find relevant curriculum content
205
- results = self.vector_db.similarity_search(query, k=5) # Get more results for better selection
206
 
207
- # Check if query is curriculum-related
208
- curriculum_relevance_score = 0
209
- if results:
210
- # Calculate relevance score based on similarity
211
- curriculum_relevance_score = len([r for r in results if r.page_content.strip()])
212
-
213
- # Debug: Print what we found
214
- print(f"Query: {query}")
215
- print(f"Found {len(results)} relevant results:")
216
- for i, result in enumerate(results[:3]):
217
- print(f" {i+1}. {result.metadata['filename']} - Page {result.metadata['page_number']}")
218
- print(f" Content: {result.page_content[:100]}...")
219
 
220
- # Use LLM to analyze top 5 slides and select the best one for teaching
221
- best_slide_content = ""
222
- best_result = None
223
- if curriculum_relevance_score > 0 and self.slide_selection_chain:
224
- try:
225
- # Prepare slide contents for LLM analysis
226
- slide_contents = []
227
- for i, result in enumerate(results[:5]): # Top 5 results
228
- filename = result.metadata["filename"]
229
- page_num = result.metadata["page_number"]
230
- content = result.page_content
231
- slide_contents.append(f"Slide {i+1}: {filename} - Page {page_num}\nContent: {content}\n")
232
-
233
- slide_contents_text = "\n".join(slide_contents)
234
-
235
- # Use LLM to select the best slide
236
- slide_response = self.slide_selection_chain.invoke({
237
- "question": query,
238
- "slide_contents": slide_contents_text
239
- })
240
-
241
- # Extract filename and page from response
242
- slide_response = slide_response.strip()
243
- if "<|eot_id|>" in slide_response:
244
- slide_response = slide_response.split("<|eot_id|>")[-1].strip()
245
-
246
- # Parse the response to get filename and page
247
- match = re.search(r'(.+\.pdf)\s*-\s*Page\s*(\d+)', slide_response)
248
- if match:
249
- filename = match.group(1)
250
- page_num = int(match.group(2))
251
-
252
- # Find the corresponding result
253
- for result in results:
254
- if (result.metadata["filename"] == filename and
255
- result.metadata["page_number"] == page_num):
256
- best_result = result
257
- best_slide_content = result.page_content
258
- break
259
-
260
- # If LLM selection failed, fall back to first result
261
- if not best_result:
262
- best_result = results[0]
263
- best_slide_content = results[0].page_content
264
- else:
265
- # Fallback to first result if parsing failed
266
- best_result = results[0]
267
- best_slide_content = results[0].page_content
268
-
269
- except Exception as e:
270
- print(f"Error in LLM slide selection: {e}")
271
- # Fallback to first result
272
- best_result = results[0]
273
- best_slide_content = results[0].page_content
274
  else:
275
- # Fallback without LLM
276
- if curriculum_relevance_score > 0:
277
- best_result = results[0]
278
- best_slide_content = results[0].page_content
279
 
280
- # Generate focused LLM answer using the most relevant slide
281
- if self.focused_qa_chain and curriculum_relevance_score > 0:
282
- try:
283
- answer = self.focused_qa_chain.invoke({
284
- "question": query,
285
- "slide_content": best_slide_content
286
- })
287
-
288
- # Debug: Print what the LLM returned
289
- print(f"LLM Raw Response: {answer[:200]}...")
290
-
291
- # Clean up the answer
292
- answer = answer.strip()
293
- if "<|eot_id|>" in answer:
294
- answer = answer.split("<|eot_id|>")[-1].strip()
295
-
296
- # Remove any prompt artifacts
297
- if answer.startswith("Answer:"):
298
- answer = answer[7:].strip()
299
- if answer.startswith("Provide a clear, educational answer based on this slide:"):
300
- answer = answer[58:].strip()
301
-
302
- # Check if the answer is too short, just repeats the question, or contains the prompt
303
- if (len(answer.strip()) < 50 or
304
- answer.lower().startswith("how does that work") or
305
- "slide content provided" in answer.lower() or
306
- "provide a clear" in answer.lower() or
307
- "answer the question based on" in answer.lower() or
308
- "slide content:" in answer.lower()):
309
-
310
- # Generate a proper answer using the slide content
311
- slide_info = f"πŸ“„ **Slide Reference:** {best_result.metadata['filename']} - Page {best_result.metadata['page_number']}"
312
-
313
- if "loops" in query.lower():
314
- answer = f"{slide_info}\n\n**Slide Content:**\n{best_slide_content}\n\n**What are loops for?**\n\nLoops are programming constructs that solve the problem of repetition. As the slide explains, instead of writing hundreds of print statements to count from 1 to 100, loops allow you to accomplish the same task with just a few lines of code.\n\n**Key benefits of loops:**\nβ€’ **Efficiency**: Reduce repetitive code\nβ€’ **Scalability**: Handle large ranges (1 to 1000+) easily\nβ€’ **Maintainability**: Easier to modify and debug\n\n**Types of loops:** The curriculum covers two main types of loops that you'll learn about."
315
- else:
316
- answer = f"{slide_info}\n\n**Slide Content:**\n{best_slide_content}\n\nThis slide explains the concept clearly. The content shows how programming constructs help solve real problems efficiently."
317
-
318
- except Exception as e:
319
- print(f"Error generating focused answer: {e}")
320
- # Generate a proper answer using the slide content
321
- slide_info = f"πŸ“„ **Slide Reference:** {best_result.metadata['filename']} - Page {best_result.metadata['page_number']}"
322
-
323
- if "loops" in query.lower():
324
- answer = f"{slide_info}\n\n**Slide Content:**\n{best_slide_content}\n\n**What are loops for?**\n\nLoops are programming constructs that solve the problem of repetition. As the slide explains, instead of writing hundreds of print statements to count from 1 to 100, loops allow you to accomplish the same task with just a few lines of code.\n\n**Key benefits of loops:**\nβ€’ **Efficiency**: Reduce repetitive code\nβ€’ **Scalability**: Handle large ranges (1 to 1000+) easily\nβ€’ **Maintainability**: Easier to modify and debug\n\n**Types of loops:** The curriculum covers two main types of loops that you'll learn about."
325
- else:
326
- answer = f"{slide_info}\n\n**Slide Content:**\n{best_slide_content}\n\nThis slide contains the relevant information about your question."
327
-
328
- elif self.qa_chain:
329
- # Fallback to general LLM if focused chain fails
330
  try:
331
- if curriculum_relevance_score > 0:
332
- context = "\n\n".join([result.page_content for result in results])
333
- filled_context = f"Curriculum Context:\n{context}\n\nPlease answer based on this curriculum content."
334
  else:
335
- filled_context = "Note: This question is not covered in the current curriculum. Please provide a general programming answer."
336
 
337
- answer = self.qa_chain.invoke({
338
- "question": query,
339
- "filled_context": filled_context
340
  })
341
-
342
- # Clean up the answer
343
- answer = answer.strip()
344
- if "<|eot_id|>" in answer:
345
- answer = answer.split("<|eot_id|>")[-1].strip()
346
- if answer.startswith("Answer:"):
347
- answer = answer[7:].strip()
348
- if answer.startswith("Provide a clear, educational answer explaining the concept:"):
349
- answer = answer[58:].strip()
350
-
351
- # Check if the answer is too short
352
- if len(answer.strip()) < 50:
353
- if curriculum_relevance_score > 0:
354
- slide_info = f"πŸ“„ **Slide Reference:** {best_result.metadata['filename']} - Page {best_result.metadata['page_number']}"
355
- answer = f"{slide_info}\n\n**Slide Content:**\n{best_slide_content}\n\nThis slide explains the concept clearly."
356
- else:
357
- answer = "I'm sorry, I couldn't generate a proper answer. Please try rephrasing your question."
358
-
359
- # Add warning if not in curriculum
360
- if curriculum_relevance_score == 0:
361
- answer = "⚠️ **Note: This topic is not covered in the current curriculum.**\n\n" + answer
362
-
363
  except Exception as e:
364
- print(f"Error generating answer: {e}")
365
- if curriculum_relevance_score > 0:
366
- slide_info = f"πŸ“„ **Slide Reference:** {best_result.metadata['filename']} - Page {best_result.metadata['page_number']}"
367
- answer = f"{slide_info}\n\n**Slide Content:**\n{best_slide_content}\n\nThis slide contains the relevant information about your question."
368
- else:
369
- answer = "I'm sorry, I couldn't generate an answer at the moment. Please try rephrasing your question."
370
  else:
371
- # If no LLM available
372
- if curriculum_relevance_score > 0:
373
- slide_info = f"πŸ“„ **Slide Reference:** {best_result.metadata['filename']} - Page {best_result.metadata['page_number']}"
374
- answer = f"{slide_info}\n\n**Slide Content:**\n{best_slide_content}\n\n*Note: AI generation is not available, but here's the relevant curriculum content.*"
375
  else:
376
- answer = "I couldn't find relevant content in the curriculum for this question. Please try rephrasing or ask about a different programming topic."
377
 
378
- # Get the most relevant slide and its neighboring pages
379
- relevant_slides = []
380
- if curriculum_relevance_score > 0:
381
- # Get multiple relevant results to find the best one
382
- best_result = results[0]
383
- filename = best_result.metadata["filename"]
384
- page_number = best_result.metadata["page_number"]
385
-
386
- # Get the specific PDF and its pages
387
- if filename in self.pdf_files:
388
- pdf_path = self.pdf_files[filename]
389
- doc = fitz.open(pdf_path)
390
- total_pages = len(doc)
391
- doc.close()
392
-
393
- # Find the best content page by analyzing all results
394
- target_page = page_number
395
- best_content_score = 0
396
-
397
- # Check all search results for the best content page
398
- for result in results:
399
- if result.metadata["filename"] == filename:
400
- page_num = result.metadata["page_number"]
401
- page_text = self.pdf_pages[filename].get(page_num, "")
402
- text_length = len(page_text.strip())
403
-
404
- # Score based on text length and relevance
405
- content_score = text_length
406
- if text_length > 100: # Prefer content pages over title slides
407
- content_score += 500
408
-
409
- if content_score > best_content_score:
410
- best_content_score = content_score
411
- target_page = page_num
412
-
413
- # If we still have a title slide, look for better content in the same PDF
414
- page_text = self.pdf_pages[filename].get(target_page, "")
415
- if len(page_text.strip()) < 150: # Still a title slide
416
- # Search for pages with the query terms
417
- query_terms = query.lower().split()
418
- best_match_score = 0
419
-
420
- for page_num in range(1, total_pages + 1):
421
- if page_num in self.pdf_pages[filename]:
422
- text = self.pdf_pages[filename][page_num].lower()
423
- text_length = len(text.strip())
424
-
425
- # Count how many query terms appear in this page
426
- match_score = sum(1 for term in query_terms if term in text)
427
-
428
- # Prefer pages with both query terms and good content
429
- if match_score > 0 and text_length > 200:
430
- total_score = match_score * 1000 + text_length
431
- if total_score > best_match_score:
432
- best_match_score = total_score
433
- target_page = page_num
434
-
435
- # Get the target page and neighboring pages (2 before, 2 after)
436
- start_page = max(1, target_page - 2)
437
- end_page = min(total_pages, target_page + 2)
438
-
439
- for page_num in range(start_page, end_page + 1):
440
- img = self.get_pdf_page_image(pdf_path, page_num)
441
- if img:
442
- if page_num == target_page:
443
- # Highlight the most relevant page
444
- label = f"πŸ“Œ {filename} - Page {page_num} (Most Relevant)"
445
- else:
446
- label = f"{filename} - Page {page_num}"
447
- relevant_slides.append((img, label))
448
-
449
- recommended_slide = relevant_slides[0][0] if relevant_slides else None
450
- recommended_label = relevant_slides[0][1] if relevant_slides else None
451
- else:
452
- # Fallback if filename not found
453
- recommended_slide = None
454
- recommended_label = None
455
- else:
456
- # If no curriculum content, show a few slides from different PDFs
457
  relevant_slides = []
458
- for filename, pages in list(self.pdf_pages.items())[:3]: # Show first 3 PDFs
459
- for page_num in list(pages.keys())[:2]: # Show first 2 pages of each
460
- img = self.get_pdf_page_image(self.pdf_files[filename], page_num)
461
- if img:
462
- relevant_slides.append((img, f"{filename} - Page {page_num}"))
463
- recommended_slide = relevant_slides[0][0] if relevant_slides else None
464
- recommended_label = relevant_slides[0][1] if relevant_slides else None
 
 
 
 
 
 
 
 
 
 
465
 
466
- return answer, recommended_slide, recommended_label, relevant_slides
 
 
 
 
467
 
468
- # --- Gradio UI ---
469
- chatbot = CurriculumChatbot()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
470
 
471
- def gradio_chat(query):
472
- answer, recommended_slide, recommended_label, relevant_slides = chatbot.chat(query)
 
473
 
474
- # Use the relevant slides (specific PDF with neighboring pages)
475
- gallery_items = relevant_slides if relevant_slides else []
 
 
476
 
477
- return answer, gallery_items
 
478
 
479
  with gr.Blocks(title="Inclusive World Curriculum Assistant", theme=gr.themes.Soft()) as demo:
480
- gr.Markdown("# πŸ€– Inclusive World Curriculum Assistant\nYour AI programming tutor with curriculum-based answers and slide navigation!")
481
 
482
  with gr.Row():
483
- # Left Column - Chatbot Interface
484
  with gr.Column(scale=1):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
485
  gr.Markdown("### πŸ’¬ Chatbot")
486
- gr.Markdown("**What questions do you have?**")
487
  question = gr.Textbox(
488
  label="Question Input",
489
- placeholder="e.g., What are for loops? How do variables work? Explain functions...",
490
  lines=3
491
  )
492
  submit = gr.Button("πŸ€– Ask AI", variant="primary", size="lg")
493
- answer = gr.Markdown(label="LLM Generated Output")
494
-
495
- # Right Column - Slides Display
496
- with gr.Column(scale=1):
497
- gr.Markdown("### πŸ“„ Most Similar Slides")
498
- gallery = gr.Gallery(
499
- label="Curriculum Slides",
500
- columns=1,
501
- rows=3,
502
- height="600px",
503
- object_fit="contain",
504
- show_label=False
505
- )
506
 
507
  # Event handlers
508
- submit.click(fn=gradio_chat, inputs=question, outputs=[answer, gallery])
509
- question.submit(fn=gradio_chat, inputs=question, outputs=[answer, gallery])
 
 
 
 
 
 
 
 
510
 
511
  if __name__ == "__main__":
512
  demo.launch()
 
4
  import fitz # PyMuPDF
5
  from langchain_community.embeddings import HuggingFaceEmbeddings
6
  from langchain_community.vectorstores import Chroma
 
7
  from langchain_huggingface import HuggingFacePipeline
8
  from langchain.prompts import PromptTemplate
9
  from transformers import pipeline
10
  import torch
 
11
  from PIL import Image
12
  import io
13
  import re
14
+ import time
15
 
16
+ # --- Modular Curriculum Assistant for HuggingFace Spaces ---
17
 
18
+ class ModularCurriculumChatbot:
19
+ def __init__(self, slides_dir="Slides", mode="text_only"):
20
+ """
21
+ Modes optimized for HuggingFace:
22
+ - "text_only": No PDF rendering (fastest for HuggingFace)
23
+ - "search_only": Only vector search, no LLM
24
+ - "llm_only": Only LLM processing, no search
25
+ - "no_cache": No response caching
26
+ - "fast_llm": Use smaller/faster model
27
+ - "full": Complete functionality (slowest)
28
+ """
29
+ self.pdf_pages = {}
30
+ self.pdf_files = {}
31
  self.chunks = []
32
  self.chunk_metadata = []
33
  self.vector_db = None
34
  self.embeddings = None
35
  self.llm = None
36
  self.qa_chain = None
37
+ self.response_cache = {}
38
+ self.mode = mode
39
+
40
+ print(f"πŸš€ Initializing in {mode} mode for HuggingFace...")
41
  self._process_pdfs(slides_dir)
42
  self._build_vector_db()
43
+ if "llm" in mode or mode == "full":
44
+ self._setup_llm()
45
+ print(f"βœ… {mode} mode ready!")
46
 
47
  def _process_pdfs(self, slides_dir):
48
+ start_time = time.time()
49
  slides_path = Path(slides_dir)
50
  pdf_files = list(slides_path.glob("*.pdf"))
51
+
52
  for pdf_file in pdf_files:
53
  self.pdf_files[pdf_file.name] = str(pdf_file)
54
  doc = fitz.open(str(pdf_file))
 
60
  pages[page_num + 1] = text.strip()
61
  self.pdf_pages[pdf_file.name] = pages
62
  doc.close()
63
+
64
  # Add each page as a chunk
65
  for page_num, text in pages.items():
66
  self.chunks.append(text)
 
68
  "filename": pdf_file.name,
69
  "page_number": page_num
70
  })
71
+
72
+ print(f"πŸ“„ PDF processing: {time.time() - start_time:.2f}s")
73
 
74
  def _build_vector_db(self):
75
+ start_time = time.time()
76
  self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
77
  self.vector_db = Chroma.from_texts(
78
  texts=self.chunks,
 
80
  metadatas=self.chunk_metadata,
81
  persist_directory="./chroma_db"
82
  )
83
+ print(f"πŸ” Vector DB build: {time.time() - start_time:.2f}s")
84
 
85
  def _setup_llm(self):
86
+ start_time = time.time()
87
  try:
88
+ # Use different models based on mode
89
+ if self.mode == "fast_llm":
90
+ model_name = "microsoft/DialoGPT-small" # Fastest for HuggingFace
91
+ max_tokens = 30
92
+ else:
93
+ model_name = "microsoft/DialoGPT-medium" # Original
94
+ max_tokens = 100
95
 
 
 
96
  token = os.environ.get("IW_Token")
97
  if not token:
98
+ print("⚠️ IW_Token not found - LLM disabled")
99
+ self.llm = None
100
+ return
101
 
102
  pipe = pipeline(
103
  "text-generation",
104
  model=model_name,
105
+ max_new_tokens=max_tokens,
106
  temperature=0.3,
107
  do_sample=True,
108
  top_p=0.9,
109
  repetition_penalty=1.1,
110
+ device_map="auto" if torch.cuda.is_available() else "cpu",
111
+ token=token,
112
+ torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
113
+ low_cpu_mem_usage=True
114
  )
115
  self.llm = HuggingFacePipeline(pipeline=pipe)
116
 
117
+ qa_template = """Q: {question}\nContext: {context}\nA:"""
 
 
 
 
 
 
 
 
 
 
 
 
118
  self.qa_prompt = PromptTemplate(
119
+ input_variables=["question", "context"],
120
  template=qa_template
121
  )
122
  self.qa_chain = self.qa_prompt | self.llm
123
 
124
+ print(f"πŸ€– LLM setup ({model_name}): {time.time() - start_time:.2f}s")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  except Exception as e:
126
+ print(f"❌ LLM setup failed: {e}")
 
127
  self.llm = None
 
 
128
 
129
  def get_pdf_page_image(self, pdf_path, page_num):
130
+ """PDF rendering - potentially slow operation"""
131
  try:
132
  doc = fitz.open(pdf_path)
133
  if page_num <= len(doc):
 
145
  except Exception as e:
146
  print(f"Error rendering PDF page: {str(e)}")
147
  return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
  def chat(self, query):
150
+ """Main chat function with performance tracking"""
151
+ total_start = time.time()
 
152
 
153
+ # Check cache (unless no_cache mode)
154
+ if self.mode != "no_cache" and query in self.response_cache:
155
+ print("βœ… Using cached response")
156
+ return self.response_cache[query]
 
 
 
 
 
 
 
 
157
 
158
+ # Step 1: Vector Search
159
+ search_start = time.time()
160
+ if self.mode == "llm_only":
161
+ results = []
162
+ answer = "LLM-only mode: No search performed"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  else:
164
+ results = self.vector_db.similarity_search(query, k=2)
165
+ search_time = time.time() - search_start
166
+ print(f"πŸ” Search time: {search_time:.2f}s")
 
167
 
168
+ # Step 2: LLM Processing
169
+ llm_start = time.time()
170
+ if self.mode == "search_only":
171
+ # Search-only mode
172
+ if results:
173
+ best_result = results[0]
174
+ answer = f"Search result: {best_result.page_content[:200]}..."
175
+ else:
176
+ answer = "No relevant content found"
177
+ elif self.llm and (self.mode in ["llm_only", "fast_llm", "full"]):
178
+ # LLM processing
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  try:
180
+ if results:
181
+ context = f"Content: {results[0].page_content[:300]}"
 
182
  else:
183
+ context = "No relevant content found"
184
 
185
+ response = self.qa_chain.invoke({
186
+ "question": query,
187
+ "context": context
188
  })
189
+ answer = response.strip()
190
+ if answer.startswith("A:"):
191
+ answer = answer[2:].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  except Exception as e:
193
+ print(f"LLM error: {e}")
194
+ answer = "LLM processing failed"
 
 
 
 
195
  else:
196
+ # No LLM available
197
+ if results:
198
+ answer = f"Text result: {results[0].page_content[:200]}..."
 
199
  else:
200
+ answer = "No relevant content found"
201
 
202
+ llm_time = time.time() - llm_start
203
+ print(f"πŸ€– LLM time: {llm_time:.2f}s")
204
+
205
+ # Step 3: PDF Rendering (potentially slow)
206
+ render_start = time.time()
207
+ if self.mode == "text_only":
208
+ # Text-only mode - no image rendering (fastest for HuggingFace)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  relevant_slides = []
210
+ answer += "\n\n[Text-only mode: No images rendered for speed]"
211
+ else:
212
+ # Full image rendering
213
+ relevant_slides = self._get_slides_for_file(
214
+ results[0].metadata["filename"] if results else None,
215
+ results[0].metadata["page_number"] if results else 1
216
+ )
217
+
218
+ render_time = time.time() - render_start
219
+ print(f"πŸ–ΌοΈ Render time: {render_time:.2f}s")
220
+
221
+ # Cache response (unless no_cache mode)
222
+ if self.mode != "no_cache":
223
+ self.response_cache[query] = (answer, relevant_slides)
224
+ if len(self.response_cache) > 30:
225
+ oldest_key = next(iter(self.response_cache))
226
+ del self.response_cache[oldest_key]
227
 
228
+ total_time = time.time() - total_start
229
+ print(f"⏱️ Total time: {total_time:.2f}s")
230
+ print(f"πŸ“Š Breakdown - Search: {search_time:.2f}s, LLM: {llm_time:.2f}s, Render: {render_time:.2f}s")
231
+
232
+ return answer, relevant_slides
233
 
234
def _get_slides_for_file(self, filename, target_page):
    """Render the target page and its immediate neighbours for the gallery.

    Args:
        filename: Key into ``self.pdf_files``; a falsy or unknown name
            yields an empty list.
        target_page: Page number of the most relevant slide (page numbers
            start at 1 here).

    Returns:
        A list of ``(image, label)`` tuples covering ``target_page - 1``
        through ``target_page + 1``, clamped to the document's page range.
        Pages for which ``get_pdf_page_image`` returns a falsy value are
        skipped. An empty list is also returned when the PDF cannot be
        opened.
    """
    if not filename or filename not in self.pdf_files:
        return []

    pdf_path = self.pdf_files[filename]

    # Only the page count is needed here; always release the handle, and
    # degrade to "no slides" instead of failing the whole chat request.
    try:
        doc = fitz.open(pdf_path)
        try:
            total_pages = len(doc)
        finally:
            doc.close()
    except Exception as e:
        print(f"Error opening PDF for slides: {str(e)}")
        return []

    # Target page plus one page before and one page after.
    start_page = max(1, target_page - 1)
    end_page = min(total_pages, target_page + 1)

    slides = []
    for page_num in range(start_page, end_page + 1):
        img = self.get_pdf_page_image(pdf_path, page_num)
        if not img:
            continue
        if page_num == target_page:
            label = f"📌 {filename} - Page {page_num} (Most Relevant)"
        else:
            label = f"{filename} - Page {page_num}"
        slides.append((img, label))

    return slides
260
+
261
+ # --- Gradio UI with Mode Selection for HuggingFace ---
262
+
263
+ # Initialize chatbot with text_only mode (fastest for HuggingFace)
264
+ chatbot = ModularCurriculumChatbot(mode="text_only")
265
 
266
def gradio_chat(query, mode):
    """Gradio callback: answer ``query`` with the chatbot for ``mode``.

    The module-level ``chatbot`` is replaced by a fresh
    ``ModularCurriculumChatbot`` whenever ``mode`` differs from the mode of
    the current instance. A blank query returns a short prompt without
    switching modes or running a search.

    Args:
        query: Text from the question box.
        mode: Mode name selected in the dropdown.

    Returns:
        ``(answer, relevant_slides)`` as produced by ``chatbot.chat``.
    """
    global chatbot

    # A blank question needs neither a search nor a costly mode switch.
    if not query or not query.strip():
        return "Please enter a question.", []

    # Reinitialize chatbot if mode changed
    if chatbot.mode != mode:
        print(f"🔄 Switching to {mode} mode...")
        chatbot = ModularCurriculumChatbot(mode=mode)

    answer, relevant_slides = chatbot.chat(query)
    return answer, relevant_slides
277
 
278
  with gr.Blocks(title="Inclusive World Curriculum Assistant", theme=gr.themes.Soft()) as demo:
279
+ gr.Markdown("# πŸ€– Inclusive World Curriculum Assistant\nPerformance-optimized for HuggingFace Spaces")
280
 
281
  with gr.Row():
282
+ # Mode Selection
283
  with gr.Column(scale=1):
284
+ gr.Markdown("### βš™οΈ Performance Mode")
285
+ mode_select = gr.Dropdown(
286
+ choices=[
287
+ "text_only", # Fastest for HuggingFace
288
+ "search_only",
289
+ "fast_llm",
290
+ "llm_only",
291
+ "no_cache",
292
+ "full"
293
+ ],
294
+ value="text_only", # Default to fastest mode
295
+ label="Select Mode",
296
+ info="text_only is fastest for HuggingFace"
297
+ )
298
+ gr.Markdown("""
299
+ **Recommended for HuggingFace:**
300
+ - **text_only**: Fastest (no PDF rendering)
301
+ - **search_only**: Vector search only
302
+ - **fast_llm**: Small model + search
303
+ - **full**: Complete (slowest)
304
+ """)
305
+
306
+ # Chat Interface
307
+ with gr.Column(scale=2):
308
  gr.Markdown("### πŸ’¬ Chatbot")
 
309
  question = gr.Textbox(
310
  label="Question Input",
311
+ placeholder="e.g., What are for loops? How do variables work?",
312
  lines=3
313
  )
314
  submit = gr.Button("πŸ€– Ask AI", variant="primary", size="lg")
315
+ answer = gr.Markdown(label="Response")
316
+
317
+ # Slides Display
318
+ with gr.Row():
319
+ gr.Markdown("### πŸ“„ Slides (if applicable)")
320
+ gallery = gr.Gallery(
321
+ label="Curriculum Slides",
322
+ columns=1,
323
+ rows=3,
324
+ height="400px",
325
+ object_fit="contain",
326
+ show_label=False
327
+ )
328
 
329
  # Event handlers
330
+ submit.click(
331
+ fn=gradio_chat,
332
+ inputs=[question, mode_select],
333
+ outputs=[answer, gallery]
334
+ )
335
+ question.submit(
336
+ fn=gradio_chat,
337
+ inputs=[question, mode_select],
338
+ outputs=[answer, gallery]
339
+ )
340
 
341
  if __name__ == "__main__":
342
  demo.launch()