"""Gradio front end for a slide-deck "curriculum assistant".

At import time the module indexes every PDF in a slides directory (one chunk
per page) into a Chroma vector store, then serves a small Gradio UI that
answers a question with a templated explanation and shows the best-matching
slide together with its neighbouring pages.
"""
import base64
import io
import os
import re
from pathlib import Path

import fitz  # PyMuPDF
import gradio as gr
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from PIL import Image

# --- Improved Vector Search Curriculum Assistant ---
#
# NOTE(review): several emoji in the log/UI strings below look mis-encoded
# (mojibake). They are reproduced unchanged here; confirm the file's encoding
# and restore the intended characters if that is the case.

# Answer templates. They live at module level so the Markdown stays flush-left
# (indentation inside a triple-quoted string would be rendered as a code
# block). ``{selected_content}`` is filled in with ``str.format``.
_FOR_LOOP_ANSWER = """**For Loops** are a fundamental programming construct that allows you to repeat code a specific number of times.

Based on the curriculum content:
{selected_content}

**Key characteristics of for loops:**
- They use a counter variable to track iterations
- They have a defined start, end, and increment
- They are perfect for iterating through sequences like lists, ranges, or arrays
- They are more structured than while loops

**Example:**
```python
for i in range(5):
    print(i)  # Prints 0, 1, 2, 3, 4
```

For loops are essential when you know exactly how many times you want to repeat an action."""

_LOOP_ANSWER = """**Loops** are fundamental programming constructs that allow you to repeat code multiple times without having to write the same code repeatedly.

Based on the curriculum content:
{selected_content}

**Why loops are important:**
- Process large amounts of data efficiently
- Repeat actions a specific number of times
- Iterate through collections like lists and arrays
- Automate repetitive tasks

**Types of loops:**
- **For loops**: When you know the number of iterations
- **While loops**: When you don't know the number of iterations
- **Do-while loops**: Execute at least once, then check condition

Loops are essential for making programs efficient and handling repetitive tasks."""

_VARIABLE_ANSWER = """**Variables** are fundamental programming concepts that allow you to store and manipulate data.

Based on the curriculum content:
{selected_content}

**What are variables:**
- Containers that store data values
- Have names that you choose
- Can hold different types of data (numbers, text, etc.)
- Can be changed throughout your program

**Key concepts:**
- **Declaration**: Creating a variable with a name
- **Assignment**: Giving a variable a value
- **Data types**: Different kinds of data (integers, strings, etc.)
- **Scope**: Where a variable can be used

**Example:**
```python
name = "Alice"  # String variable
age = 25  # Integer variable
is_student = True  # Boolean variable
```

Variables are the building blocks of programming - they let you work with data in your programs."""

_GENERIC_ANSWER = """Based on the curriculum content:
{selected_content}

This slide explains the concept you asked about. The curriculum provides a solid foundation for understanding this programming topic.

**Key points:**
- This is fundamental programming knowledge
- Understanding this concept will help with more advanced topics
- Practice with examples to reinforce your learning
- Ask questions if you need clarification on any part

The curriculum is designed to build your programming skills step by step."""


class ImprovedCurriculumAssistant:
    """Index slide PDFs page-by-page and answer questions from them.

    Attributes:
        pdf_pages: ``{filename: {page_number: text}}`` for pages with text.
        pdf_files: ``{filename: path}`` for every PDF that was processed.
        chunks: Page texts, in the order they were indexed.
        chunk_metadata: One ``{"filename", "page_number"}`` dict per chunk.
        vector_db: The Chroma store, or ``None`` when there was no text to
            index.
        embeddings: The embedding model backing ``vector_db``.
    """

    # Words that earn a retrieved page a small bonus in _select_best_content.
    _PROGRAMMING_KEYWORDS = (
        'function', 'variable', 'loop', 'condition', 'class',
        'method', 'array', 'string', 'number',
    )

    def __init__(self, slides_dir="Slides"):
        """Extract text from every PDF in *slides_dir* and build the index."""
        self.pdf_pages = {}  # {filename: {page_num: text}}
        self.pdf_files = {}  # {filename: path}
        self.chunks = []
        self.chunk_metadata = []
        self.vector_db = None
        self.embeddings = None

        # Setup
        self._process_pdfs(slides_dir)
        self._build_vector_db()

    def _process_pdfs(self, slides_dir):
        """Extract the text of every page of every ``*.pdf`` in *slides_dir*.

        Pages whose text is empty after stripping are skipped. Each remaining
        page becomes one chunk with ``filename``/``page_number`` (1-based)
        metadata.
        """
        slides_path = Path(slides_dir)
        # Sorted so chunks are indexed in a reproducible order.
        pdf_files = sorted(slides_path.glob("*.pdf"))

        for pdf_file in pdf_files:
            self.pdf_files[pdf_file.name] = str(pdf_file)

            pages = {}
            # The context manager closes the document even if a page fails
            # to parse.
            with fitz.open(str(pdf_file)) as doc:
                for page_num, page in enumerate(doc, start=1):
                    text = page.get_text().strip()
                    if text:
                        pages[page_num] = text
            self.pdf_pages[pdf_file.name] = pages

            # Add each page as a chunk
            for page_num, text in pages.items():
                self.chunks.append(text)
                self.chunk_metadata.append({
                    "filename": pdf_file.name,
                    "page_number": page_num,
                })

        print(f"āœ… Processed {len(pdf_files)} PDF files with {len(self.chunks)} total pages")

    def _build_vector_db(self):
        """Build the Chroma vector store used for semantic search.

        Leaves ``self.vector_db`` as ``None`` when no text was extracted, so
        an empty slides directory does not crash start-up.
        """
        if not self.chunks:
            print("No slide text found; vector database was not built")
            return

        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        # The store is persisted on disk. Without explicit ids every launch
        # would append a fresh copy of all pages under random ids and searches
        # would return duplicates. Stable per-page ids make a rebuild
        # overwrite the previous entries instead.
        # NOTE(review): entries written by earlier runs under random ids (or
        # for PDFs that have since been removed) remain in ./chroma_db until
        # that directory is cleared.
        ids = [
            f"{meta['filename']}::page-{meta['page_number']}"
            for meta in self.chunk_metadata
        ]
        self.vector_db = Chroma.from_texts(
            texts=self.chunks,
            embedding=self.embeddings,
            metadatas=self.chunk_metadata,
            ids=ids,
            persist_directory="./chroma_db",
        )
        print("āœ… Vector database built successfully")

    def get_pdf_page_image(self, pdf_path, page_num, zoom=1.5):
        """Render one PDF page as an RGB ``PIL.Image``.

        Args:
            pdf_path: Path of the PDF file.
            page_num: 1-based page number.
            zoom: Scale factor applied to both axes when rasterising.

        Returns:
            The rendered image, or ``None`` when *page_num* is out of range
            or rendering fails (the error is printed, not raised).
        """
        try:
            with fitz.open(pdf_path) as doc:
                # The lower bound matters: without it page 0 would index
                # doc[-1] and silently render the last page.
                if not 1 <= page_num <= len(doc):
                    return None
                pix = doc[page_num - 1].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
                img_data = pix.tobytes("png")

            img = Image.open(io.BytesIO(img_data))
            if img.mode != 'RGB':
                img = img.convert('RGB')
            return img
        except Exception as e:
            # Best effort: a slide that cannot be rendered is simply omitted.
            print(f"Error rendering PDF page: {str(e)}")
            return None

    def _score_content(self, content, query_terms):
        """Return the heuristic relevance score of *content* for the query."""
        content_lower = content.lower()
        score = 0

        # Every contiguous run of query words found verbatim in the page
        # scores 10 points per word (single words included).
        for i in range(len(query_terms)):
            for j in range(i + 1, len(query_terms) + 1):
                phrase = " ".join(query_terms[i:j])
                if len(phrase) > 2 and phrase in content_lower:
                    score += len(phrase.split()) * 10

        # Check for individual term matches
        for term in query_terms:
            if len(term) > 2 and term in content_lower:
                score += 1

        # Bonus for content length (prefer detailed explanations)
        content_length = len(content.strip())
        score += content_length * 0.01

        # Penalty for very short content (likely title slides)
        if content_length < 100:
            score -= 50

        # Bonus for content that contains programming keywords
        for keyword in self._PROGRAMMING_KEYWORDS:
            if keyword in content_lower:
                score += 5

        return score

    def _select_best_content(self, results, query):
        """Pick the most useful search hit without calling an LLM.

        Args:
            results: Documents returned by the vector search.
            query: The user's question.

        Returns:
            ``(document, page_content)`` for the highest-scoring hit (the
            earliest one on ties), or ``(None, None)`` when *results* is
            empty.
        """
        if not results:
            return None, None

        query_terms = query.lower().split()
        scored_results = [
            (result, self._score_content(result.page_content, query_terms))
            for result in results
        ]

        # max() keeps the first of equally scored hits.
        best_result, best_score = max(scored_results, key=lambda pair: pair[1])
        print(f"āœ… Selected content with score: {best_score}")
        return best_result, best_result.page_content

    def _generate_educational_answer(self, query, selected_content):
        """Return a templated Markdown answer that embeds *selected_content*.

        The template is chosen by keyword: "for loop", then "loop", then
        "variable"; anything else gets the generic template.
        """
        query_lower = query.lower()

        if "loop" in query_lower:
            if "for loop" in query_lower:
                template = _FOR_LOOP_ANSWER
            else:
                template = _LOOP_ANSWER
        elif "variable" in query_lower:
            template = _VARIABLE_ANSWER
        else:
            template = _GENERIC_ANSWER

        return template.format(selected_content=selected_content)

    def _collect_slides(self, filename, page_number, context=2):
        """Render the selected page plus up to *context* pages either side.

        Returns:
            ``(slides, recommended_slide, recommended_label)`` where *slides*
            is a list of ``(image, label)`` pairs. The recommended slide is
            the selected page itself; if that page could not be rendered it
            falls back to the first slide that was. All three are empty/
            ``None`` when *filename* is unknown or nothing could be rendered.
        """
        pdf_path = self.pdf_files.get(filename)
        if pdf_path is None:
            return [], None, None

        with fitz.open(pdf_path) as doc:
            total_pages = len(doc)

        # Get the selected page and neighboring pages
        start_page = max(1, page_number - context)
        end_page = min(total_pages, page_number + context)

        slides = []
        recommended = None
        for page_num in range(start_page, end_page + 1):
            img = self.get_pdf_page_image(pdf_path, page_num)
            if img is None:
                continue
            if page_num == page_number:
                label = f"šŸ“Œ {filename} - Page {page_num} (Most Relevant)"
                recommended = (img, label)
            else:
                label = f"{filename} - Page {page_num}"
            slides.append((img, label))

        if recommended is None and slides:
            recommended = slides[0]
        if recommended is None:
            return slides, None, None
        return slides, recommended[0], recommended[1]

    def chat(self, query):
        """Answer *query* from the indexed slides.

        Returns:
            ``(answer, relevant_slides, recommended_slide,
            recommended_label)``: the Markdown answer, a list of
            ``(image, label)`` pairs, and the image and label of the most
            relevant slide (``None`` when there is none).
        """
        if not query or not query.strip():
            return "Please enter a question about the curriculum.", [], None, None

        print(f"\nšŸ” Processing query: {query}")

        # Step 1: Vector search to find relevant content
        # (vector_db is None when no slide text could be indexed.)
        results = []
        if self.vector_db is not None:
            results = self.vector_db.similarity_search(query, k=5)

        if not results:
            return "I couldn't find any relevant content in the curriculum for your question.", [], None, None

        print(f"šŸ“š Found {len(results)} relevant slides from vector search")

        # Step 2: Intelligent content selection
        selected_result, selected_content = self._select_best_content(results, query)
        if not selected_result:
            selected_result = results[0]
            selected_content = selected_result.page_content

        # Step 3: Generate educational answer
        answer = self._generate_educational_answer(query, selected_content)
        print(f"āœ… Generated educational answer: {answer[:100]}...")

        # Step 4: Get relevant slides for display
        relevant_slides, recommended_slide, recommended_label = self._collect_slides(
            selected_result.metadata["filename"],
            selected_result.metadata["page_number"],
        )

        return answer, relevant_slides, recommended_slide, recommended_label


# --- Gradio UI ---
assistant = ImprovedCurriculumAssistant()


def gradio_chat(query):
    """Gradio callback: return the answer and the slide gallery for *query*."""
    answer, relevant_slides, _recommended_slide, _recommended_label = assistant.chat(query)
    return answer, relevant_slides


with gr.Blocks(title="Improved Curriculum Assistant", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# šŸ¤– Improved Curriculum Assistant\nYour AI programming tutor with intelligent content selection!")

    with gr.Row():
        # Left Column - Chatbot Interface
        with gr.Column(scale=1):
            gr.Markdown("### šŸ’¬ Chatbot")
            gr.Markdown("**Ask questions about programming concepts:**")
            question = gr.Textbox(
                label="Question Input",
                placeholder="e.g., What are for loops? How do variables work? Explain functions...",
                lines=3
            )
            submit = gr.Button("šŸ¤– Ask AI", variant="primary", size="lg")
            answer = gr.Markdown(label="Generated Answer")

        # Right Column - Slides Display
        with gr.Column(scale=1):
            gr.Markdown("### šŸ“„ Most Relevant Slides")
            gallery = gr.Gallery(
                label="Curriculum Slides",
                columns=1,
                rows=3,
                height="600px",
                object_fit="contain",
                show_label=False
            )

    # Event handlers
    submit.click(fn=gradio_chat, inputs=[question], outputs=[answer, gallery])
    question.submit(fn=gradio_chat, inputs=[question], outputs=[answer, gallery])

if __name__ == "__main__":
    demo.launch()