"""Curriculum assistant: semantic search over slide PDFs with optional LLM answers.

Every page of every PDF in the slides directory is embedded into a Chroma
vector store.  A question is answered by retrieving the closest pages,
optionally asking a Llama 3.1 model to pick the best one and to write an
answer from it, and showing the matching slide images in a Gradio gallery.
"""

import base64
import io
import os
import re
from pathlib import Path

import fitz  # PyMuPDF
import gradio as gr
import torch
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.vectorstores import Chroma
from PIL import Image
from transformers import pipeline

# --- Minimal PDF Search & Display App ---

# End-of-turn marker of the Llama 3 chat format; it can leak into generations.
EOT_TOKEN = "<|eot_id|>"

# NOTE(review): the whitespace inside the three prompt templates below was
# reconstructed following the usual Llama 3 chat layout -- confirm it matches
# the intended prompts.
QA_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful AI programming tutor. Answer questions about programming concepts clearly and educationally. If the question is about curriculum content, use the provided context. If not, provide a general programming answer.
<|eot_id|><|start_header_id|>user<|end_header_id|>

Question: {question}

{filled_context}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

SLIDE_SELECTION_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an AI that analyzes curriculum slides to find the best one for teaching a concept. Return ONLY the filename and page number.
<|eot_id|><|start_header_id|>user<|end_header_id|>

Question: {question}

Here are the top 5 most relevant slides from the curriculum:
{slide_contents}

Which slide is the BEST for teaching this concept to a student? Consider:
- Which slide has the most educational content?
- Which slide explains the concept most clearly?
- Which slide would be most helpful for learning?

Return only: "filename.pdf - Page X"
<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

FOCUSED_QA_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful AI programming tutor. Answer questions about programming concepts clearly and educationally based on the provided slide content.
<|eot_id|><|start_header_id|>user<|end_header_id|>

Slide Content: {slide_content}

Question: {question}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

# Canned explanation used when the model output is unusable and the question
# mentions loops.
LOOPS_EXPLANATION = (
    "**What are loops for?**\n\n"
    "Loops are programming constructs that solve the problem of repetition. "
    "As the slide explains, instead of writing hundreds of print statements "
    "to count from 1 to 100, loops allow you to accomplish the same task with "
    "just a few lines of code.\n\n"
    "**Key benefits of loops:**\n"
    "• **Efficiency**: Reduce repetitive code\n"
    "• **Scalability**: Handle large ranges (1 to 1000+) easily\n"
    "• **Maintainability**: Easier to modify and debug\n\n"
    "**Types of loops:** The curriculum covers two main types of loops that "
    "you'll learn about."
)

# Substrings that indicate the model echoed its prompt instead of answering.
_PROMPT_LEAK_MARKERS = (
    "slide content provided",
    "provide a clear",
    "answer the question based on",
    "slide content:",
)

# Answers shorter than this are treated as failed generations.
_MIN_ANSWER_LENGTH = 50


def _clean_llm_output(text, prompt_echoes=()):
    """Strip chat-format residue and echoed prompt prefixes from *text*.

    Args:
        text: Raw string returned by an LLM chain.
        prompt_echoes: Extra prefixes (besides ``"Answer:"``) to remove when
            the text starts with them.

    Returns:
        The cleaned answer; may be empty if nothing usable remains.
    """
    text = text.strip()
    if EOT_TOKEN in text:
        # Keep the last non-empty segment: an output that merely *ends* with
        # the end-of-turn token must not collapse to an empty answer.
        segments = [part.strip() for part in text.split(EOT_TOKEN)]
        segments = [part for part in segments if part]
        text = segments[-1] if segments else ""
    for prefix in ("Answer:", *prompt_echoes):
        if text.startswith(prefix):
            # Slice by the real prefix length rather than a hand-counted one.
            text = text[len(prefix):].strip()
    return text


def _looks_unusable(answer):
    """Return True when *answer* is too short or is an echo of the prompt."""
    lowered = answer.lower()
    if len(answer.strip()) < _MIN_ANSWER_LENGTH:
        return True
    if lowered.startswith("how does that work"):
        return True
    return any(marker in lowered for marker in _PROMPT_LEAK_MARKERS)


def _match_slide_reference(response, results):
    """Find which of *results* the LLM *response* refers to.

    Each candidate's ``"<filename> - Page <n>"`` is searched for inside the
    response, so surrounding quotes or explanatory text do not break the
    lookup.  The candidate mentioned earliest wins.

    Returns:
        The matching result, or ``None`` if the response names none of them.
    """
    best_position = None
    best_match = None
    for result in results:
        pattern = (
            re.escape(str(result.metadata["filename"]))
            + r"\s*-\s*Page\s*"
            + re.escape(str(result.metadata["page_number"]))
            + r"(?!\d)"  # "Page 1" must not match inside "Page 12"
        )
        found = re.search(pattern, response, flags=re.IGNORECASE)
        if found and (best_position is None or found.start() < best_position):
            best_position = found.start()
            best_match = result
    return best_match


# 1. Preprocess PDFs and build vector DB
class CurriculumChatbot:
    """Search slide PDFs semantically and answer questions about them.

    Attributes set up by the constructor:
        pdf_pages: ``{filename: {page_number: text}}`` for pages with text
            (page numbers are 1-based).
        pdf_files: ``{filename: path}`` for every PDF found.
        chunks / chunk_metadata: parallel lists fed to the vector store, one
            entry per page.
        vector_db: the Chroma store, or ``None`` when no slide text exists.
        llm, qa_chain, slide_selection_chain, focused_qa_chain: LLM objects,
            all ``None`` when the model could not be loaded.
    """

    def __init__(self, slides_dir="Slides"):
        """Index the PDFs in *slides_dir* and try to load the LLM."""
        self.pdf_pages = {}  # {filename: {page_num: text}}
        self.pdf_files = {}  # {filename: path}
        self.chunks = []
        self.chunk_metadata = []
        self.vector_db = None
        self.embeddings = None
        self.llm = None
        self.qa_chain = None
        self.slide_selection_chain = None
        # Must exist even if _setup_llm fails, because chat() reads it.
        self.focused_qa_chain = None
        self._process_pdfs(slides_dir)
        self._build_vector_db()
        self._setup_llm()

    def _process_pdfs(self, slides_dir):
        """Extract the text of every page of every ``*.pdf`` in *slides_dir*.

        Pages without text are skipped.  Each remaining page becomes one
        chunk for the vector store.
        """
        # Sorted so chunk order is reproducible across runs.
        for pdf_file in sorted(Path(slides_dir).glob("*.pdf")):
            self.pdf_files[pdf_file.name] = str(pdf_file)
            pages = {}
            # Context manager closes the document even if extraction raises.
            with fitz.open(str(pdf_file)) as doc:
                for page_index in range(len(doc)):
                    text = doc[page_index].get_text().strip()
                    if text:
                        pages[page_index + 1] = text
            self.pdf_pages[pdf_file.name] = pages

            # Add each page as a chunk
            for page_num, text in pages.items():
                self.chunks.append(text)
                self.chunk_metadata.append({
                    "filename": pdf_file.name,
                    "page_number": page_num,
                })

    def _build_vector_db(self):
        """Embed all page chunks into a persisted Chroma collection.

        Leaves ``self.vector_db`` as ``None`` when there is nothing to index.
        """
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        if not self.chunks:
            print("Warning: no slide text found; curriculum search is disabled.")
            return
        # Deterministic ids let a restart overwrite existing entries in the
        # persisted collection instead of appending a duplicate of every page.
        # NOTE(review): a ./chroma_db written before this change still holds
        # randomly-keyed duplicates; delete the directory once to purge them.
        chunk_ids = [
            f"{meta['filename']}::page-{meta['page_number']}"
            for meta in self.chunk_metadata
        ]
        self.vector_db = Chroma.from_texts(
            texts=self.chunks,
            embedding=self.embeddings,
            metadatas=self.chunk_metadata,
            ids=chunk_ids,
            persist_directory="./chroma_db",
        )

    def _make_chain(self, template, input_variables):
        """Build an LLMChain over ``self.llm`` for *template*."""
        prompt = PromptTemplate(input_variables=input_variables, template=template)
        return LLMChain(llm=self.llm, prompt=prompt)

    def _setup_llm(self):
        """Load Llama 3.1 8B and build the three prompt chains.

        On any failure every LLM attribute is reset to ``None`` so the app
        keeps working in search-only mode.
        """
        try:
            # Use Llama 3.1 8B.
            # NOTE(review): no token is passed explicitly here; presumably
            # authentication comes from the environment -- confirm.
            model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
            pipe = pipeline(
                "text-generation",
                model=model_name,
                max_new_tokens=200,
                temperature=0.3,
                do_sample=True,
                top_p=0.9,
                repetition_penalty=1.1,
                device_map="auto" if torch.cuda.is_available() else None,
            )
            self.llm = HuggingFacePipeline(pipeline=pipe)

            self.qa_chain = self._make_chain(
                QA_TEMPLATE, ["question", "filled_context"]
            )
            self.slide_selection_chain = self._make_chain(
                SLIDE_SELECTION_TEMPLATE, ["question", "slide_contents"]
            )
            self.focused_qa_chain = self._make_chain(
                FOCUSED_QA_TEMPLATE, ["question", "slide_content"]
            )
            print("✅ Llama 3.1 8B loaded successfully!")
        except Exception as e:
            print(f"Warning: Could not load Llama 3.1 8B: {e}")
            print("Falling back to basic search mode...")
            self.llm = None
            self.qa_chain = None
            self.slide_selection_chain = None
            self.focused_qa_chain = None

    def get_pdf_page_image(self, pdf_path, page_num):
        """Render one PDF page to an RGB ``PIL.Image``.

        Args:
            pdf_path: Path of the PDF file.
            page_num: 1-based page number.

        Returns:
            The rendered image, or ``None`` if the page number is out of
            range or rendering fails (the error is printed).
        """
        try:
            with fitz.open(pdf_path) as doc:
                # Reject 0 and negatives too: they would index from the end.
                if not 1 <= page_num <= len(doc):
                    return None
                pix = doc[page_num - 1].get_pixmap(matrix=fitz.Matrix(1.5, 1.5))
                img_data = pix.tobytes("png")
            img = Image.open(io.BytesIO(img_data))
            if img.mode != "RGB":
                img = img.convert("RGB")
            return img
        except Exception as e:
            print(f"Error rendering PDF page: {str(e)}")
            return None

    def get_all_slides(self):
        """Get all available slides for display as ``(image, label)`` pairs."""
        all_slides = []
        for filename, pages in self.pdf_pages.items():
            for page_num in pages:
                img = self.get_pdf_page_image(self.pdf_files[filename], page_num)
                if img:
                    all_slides.append((img, f"{filename} - Page {page_num}"))
        return all_slides

    def get_available_slides_text(self):
        """Get text representation of available slides for LLM."""
        return "\n".join(
            f"{filename} - Page {page_num}"
            for filename, pages in self.pdf_pages.items()
            for page_num in pages
        )

    @staticmethod
    def _slide_answer(result, closing):
        """Format a fallback answer quoting the slide behind *result*."""
        slide_info = (
            f"📄 **Slide Reference:** {result.metadata['filename']}"
            f" - Page {result.metadata['page_number']}"
        )
        return (
            f"{slide_info}\n\n**Slide Content:**\n{result.page_content}"
            f"\n\n{closing}"
        )

    def _select_best_result(self, query, results):
        """Pick the slide to answer from, asking the LLM when available.

        Falls back to the top search hit whenever the LLM is missing, fails,
        or names a slide that is not among *results*.
        """
        if not self.slide_selection_chain:
            return results[0]
        try:
            # Prepare slide contents for LLM analysis (top 5 results)
            slide_contents_text = "\n".join(
                f"Slide {i + 1}: {result.metadata['filename']}"
                f" - Page {result.metadata['page_number']}\n"
                f"Content: {result.page_content}\n"
                for i, result in enumerate(results[:5])
            )
            slide_response = self.slide_selection_chain.run(
                question=query, slide_contents=slide_contents_text
            )
            slide_response = _clean_llm_output(slide_response)
            return _match_slide_reference(slide_response, results) or results[0]
        except Exception as e:
            print(f"Error in LLM slide selection: {e}")
            return results[0]

    def _focused_answer(self, query, best_result):
        """Answer *query* from a single slide, with slide-quoting fallbacks."""
        wants_loops = "loops" in query.lower()
        try:
            raw = self.focused_qa_chain.run(
                question=query, slide_content=best_result.page_content
            )
            # Debug: Print what the LLM returned
            print(f"LLM Raw Response: {raw[:200]}...")
            answer = _clean_llm_output(
                raw,
                ("Provide a clear, educational answer based on this slide:",),
            )
            if _looks_unusable(answer):
                closing = LOOPS_EXPLANATION if wants_loops else (
                    "This slide explains the concept clearly. The content "
                    "shows how programming constructs help solve real "
                    "problems efficiently."
                )
                answer = self._slide_answer(best_result, closing)
            return answer
        except Exception as e:
            print(f"Error generating focused answer: {e}")
            closing = LOOPS_EXPLANATION if wants_loops else (
                "This slide contains the relevant information about your question."
            )
            return self._slide_answer(best_result, closing)

    def _general_answer(self, query, results, best_result):
        """Answer *query* with the general chain; *best_result* may be None."""
        in_curriculum = best_result is not None
        try:
            if in_curriculum:
                context = "\n\n".join(result.page_content for result in results)
                filled_context = (
                    f"Curriculum Context:\n{context}\n\n"
                    "Please answer based on this curriculum content."
                )
            else:
                filled_context = (
                    "Note: This question is not covered in the current "
                    "curriculum. Please provide a general programming answer."
                )
            raw = self.qa_chain.run(question=query, filled_context=filled_context)
            answer = _clean_llm_output(
                raw,
                ("Provide a clear, educational answer explaining the concept:",),
            )
            if len(answer.strip()) < _MIN_ANSWER_LENGTH:
                if in_curriculum:
                    answer = self._slide_answer(
                        best_result, "This slide explains the concept clearly."
                    )
                else:
                    answer = (
                        "I'm sorry, I couldn't generate a proper answer. "
                        "Please try rephrasing your question."
                    )
            # Add warning if not in curriculum
            if not in_curriculum:
                answer = (
                    "⚠️ **Note: This topic is not covered in the current "
                    "curriculum.**\n\n" + answer
                )
            return answer
        except Exception as e:
            print(f"Error generating answer: {e}")
            if in_curriculum:
                return self._slide_answer(
                    best_result,
                    "This slide contains the relevant information about your question.",
                )
            return (
                "I'm sorry, I couldn't generate an answer at the moment. "
                "Please try rephrasing your question."
            )

    def _generate_answer(self, query, results, best_result):
        """Produce the answer text using the best strategy available."""
        if self.focused_qa_chain and best_result is not None:
            return self._focused_answer(query, best_result)
        if self.qa_chain:
            return self._general_answer(query, results, best_result)
        # If no LLM available
        if best_result is not None:
            return self._slide_answer(
                best_result,
                "*Note: AI generation is not available, but here's the "
                "relevant curriculum content.*",
            )
        return (
            "I couldn't find relevant content in the curriculum for this "
            "question. Please try rephrasing or ask about a different "
            "programming topic."
        )

    def _gallery_for_results(self, query, results):
        """Return images of the most relevant page and up to 2 pages each side.

        NOTE(review): the gallery is centred on the top search hit's PDF, not
        on the slide the LLM selected for the answer; kept as in the original.
        """
        top_result = results[0]
        filename = top_result.metadata["filename"]
        if filename not in self.pdf_files:
            return []
        pdf_path = self.pdf_files[filename]
        with fitz.open(pdf_path) as doc:
            total_pages = len(doc)
        pages = self.pdf_pages.get(filename, {})

        # Among hits from the same PDF, prefer the page with the most text;
        # pages over 100 characters get a bonus so title slides lose.
        target_page = top_result.metadata["page_number"]
        best_content_score = 0
        for result in results:
            if result.metadata["filename"] != filename:
                continue
            page_num = result.metadata["page_number"]
            text_length = len(pages.get(page_num, "").strip())
            content_score = text_length + (500 if text_length > 100 else 0)
            if content_score > best_content_score:
                best_content_score = content_score
                target_page = page_num

        # If we still have a title slide, look for better content in the same
        # PDF by counting query terms on each page.
        if len(pages.get(target_page, "").strip()) < 150:
            query_terms = query.lower().split()
            best_match_score = 0
            for page_num in range(1, total_pages + 1):
                if page_num not in pages:
                    continue
                text = pages[page_num].lower()
                text_length = len(text.strip())
                match_score = sum(1 for term in query_terms if term in text)
                # Prefer pages with both query terms and good content
                if match_score > 0 and text_length > 200:
                    total_score = match_score * 1000 + text_length
                    if total_score > best_match_score:
                        best_match_score = total_score
                        target_page = page_num

        # Get the target page and neighboring pages (2 before, 2 after)
        start_page = max(1, target_page - 2)
        end_page = min(total_pages, target_page + 2)
        slides = []
        for page_num in range(start_page, end_page + 1):
            img = self.get_pdf_page_image(pdf_path, page_num)
            if not img:
                continue
            if page_num == target_page:
                # Highlight the most relevant page
                label = f"📌 {filename} - Page {page_num} (Most Relevant)"
            else:
                label = f"{filename} - Page {page_num}"
            slides.append((img, label))
        return slides

    def _default_gallery(self):
        """Return the first 2 pages of the first 3 PDFs as a generic gallery."""
        slides = []
        for filename, pages in list(self.pdf_pages.items())[:3]:
            for page_num in list(pages)[:2]:
                img = self.get_pdf_page_image(self.pdf_files[filename], page_num)
                if img:
                    slides.append((img, f"{filename} - Page {page_num}"))
        return slides

    def chat(self, query):
        """Comprehensive chat function with LLM answers and slide navigation.

        Args:
            query: The user's question.

        Returns:
            ``(answer, recommended_slide, recommended_label, relevant_slides)``
            where ``answer`` is Markdown text, ``relevant_slides`` is a list
            of ``(image, label)`` pairs and the two ``recommended_*`` values
            are the first gallery entry (or ``None`` when it is empty).
        """
        if not query or not query.strip():
            return "Please enter a question.", None, None, []

        # First, try to find relevant curriculum content
        if self.vector_db is None:
            results = []
        else:
            results = self.vector_db.similarity_search(query, k=5)

        # The query counts as curriculum-related if any hit carries text.
        curriculum_relevance_score = sum(
            1 for result in results if result.page_content.strip()
        )
        in_curriculum = curriculum_relevance_score > 0

        # Debug: Print what we found
        print(f"Query: {query}")
        print(f"Found {len(results)} relevant results:")
        for i, result in enumerate(results[:3]):
            print(
                f" {i+1}. {result.metadata['filename']}"
                f" - Page {result.metadata['page_number']}"
            )
            print(f" Content: {result.page_content[:100]}...")

        best_result = (
            self._select_best_result(query, results) if in_curriculum else None
        )
        answer = self._generate_answer(query, results, best_result)

        if in_curriculum:
            relevant_slides = self._gallery_for_results(query, results)
        else:
            # If no curriculum content, show a few slides from different PDFs
            relevant_slides = self._default_gallery()

        if relevant_slides:
            recommended_slide, recommended_label = relevant_slides[0]
        else:
            recommended_slide, recommended_label = None, None
        return answer, recommended_slide, recommended_label, relevant_slides


# --- Gradio UI ---
chatbot = CurriculumChatbot()


def gradio_chat(query):
    """Gradio callback: return the answer text and the slide gallery items."""
    answer, recommended_slide, recommended_label, relevant_slides = chatbot.chat(query)
    # Use the relevant slides (specific PDF with neighboring pages)
    gallery_items = relevant_slides if relevant_slides else []
    return answer, gallery_items


with gr.Blocks(title="Inclusive World Curriculum Assistant", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "# 🤖 Inclusive World Curriculum Assistant\n"
        "Your AI programming tutor with curriculum-based answers and slide navigation!"
    )
    with gr.Row():
        # Left Column - Chatbot Interface
        with gr.Column(scale=1):
            gr.Markdown("### 💬 Chatbot")
            gr.Markdown("**What questions do you have?**")
            question = gr.Textbox(
                label="Question Input",
                placeholder="e.g., What are for loops? How do variables work? Explain functions...",
                lines=3,
            )
            submit = gr.Button("🤖 Ask AI", variant="primary", size="lg")
            answer = gr.Markdown(label="LLM Generated Output")

        # Right Column - Slides Display
        with gr.Column(scale=1):
            gr.Markdown("### 📄 Most Similar Slides")
            gallery = gr.Gallery(
                label="Curriculum Slides",
                columns=1,
                rows=3,
                height="600px",
                object_fit="contain",
                show_label=False,
            )

    # Event handlers
    submit.click(fn=gradio_chat, inputs=question, outputs=[answer, gallery])
    question.submit(fn=gradio_chat, inputs=question, outputs=[answer, gallery])

if __name__ == "__main__":
    demo.launch()