# Source: Hugging Face Space "IW2025/InclusiveWorldChatbotSpace" (status: Sleeping)
# File size: 12,402 Bytes, revision 93fe96e

import gradio as gr
import os
from pathlib import Path
import fitz  # PyMuPDF
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
import base64
from PIL import Image
import io
import re

# --- Improved Vector Search Curriculum Assistant ---

class ImprovedCurriculumAssistant:
    """Answer curriculum questions from slide PDFs using vector search.

    On construction every ``*.pdf`` directly inside ``slides_dir`` is read page
    by page, each page with text becomes one searchable chunk, and the chunks
    are embedded into a Chroma vector store.  ``chat`` retrieves the closest
    pages for a query, re-ranks them with a keyword heuristic, fills a fixed
    answer template with the winning page's text and renders that page plus
    its neighbours as images.
    """

    # Words that earn a page a ranking bonus in ``_score_content``.
    _PROGRAMMING_KEYWORDS = (
        'function', 'variable', 'loop', 'condition', 'class',
        'method', 'array', 'string', 'number',
    )

    def __init__(self, slides_dir="Slides"):
        """Load the slide PDFs from ``slides_dir`` and index their text."""
        self.pdf_pages = {}  # {filename: {page_num: text}}
        self.pdf_files = {}  # {filename: path}
        self.chunks = []  # page texts, parallel to chunk_metadata
        self.chunk_metadata = []  # [{"filename": ..., "page_number": ...}]
        self.vector_db = None  # stays None when there is nothing to index
        self.embeddings = None

        # Setup
        self._process_pdfs(slides_dir)
        self._build_vector_db()

    def _process_pdfs(self, slides_dir):
        """Extract the text of every page of every PDF in ``slides_dir``.

        Fills ``pdf_files``, ``pdf_pages``, ``chunks`` and ``chunk_metadata``.
        Pages whose extracted text is empty are skipped; page numbers are
        1-based.
        """
        slides_path = Path(slides_dir)
        # Sorted so the chunk order does not depend on directory listing order.
        pdf_files = sorted(slides_path.glob("*.pdf"))

        for pdf_file in pdf_files:
            self.pdf_files[pdf_file.name] = str(pdf_file)
            pages = {}

            # The context manager closes the document even if extraction fails.
            with fitz.open(str(pdf_file)) as doc:
                for page_number, page in enumerate(doc, start=1):
                    text = page.get_text().strip()
                    if text:
                        pages[page_number] = text

            self.pdf_pages[pdf_file.name] = pages

            # Add each page as a chunk
            for page_number, text in pages.items():
                self.chunks.append(text)
                self.chunk_metadata.append({
                    "filename": pdf_file.name,
                    "page_number": page_number
                })

        print(f"✅ Processed {len(pdf_files)} PDF files with {len(self.chunks)} total pages")

    def _build_vector_db(self):
        """Embed the collected page texts into the persisted Chroma store.

        Does nothing (``vector_db`` stays ``None``) when no page text was
        collected, so an empty slides directory does not abort start-up.
        """
        if not self.chunks:
            print("⚠️ No slide text found - vector database was not built")
            return

        self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

        # Stable ids let a rebuild overwrite the entries already persisted in
        # ./chroma_db; without ids, fresh random ones are generated and every
        # restart appends another copy of each page.
        # NOTE(review): pages of PDFs that were removed from the slides
        # directory still remain in the persisted store - confirm whether the
        # directory should be cleared on start-up.
        chunk_ids = [
            f"{meta['filename']}::page-{meta['page_number']}"
            for meta in self.chunk_metadata
        ]
        self.vector_db = Chroma.from_texts(
            texts=self.chunks,
            embedding=self.embeddings,
            metadatas=self.chunk_metadata,
            ids=chunk_ids,
            persist_directory="./chroma_db"
        )
        print("✅ Vector database built successfully")

    def _page_count(self, pdf_path):
        """Return the number of pages in ``pdf_path``, or 0 if it cannot be opened."""
        try:
            with fitz.open(pdf_path) as doc:
                return len(doc)
        except Exception as e:
            print(f"Error opening PDF: {str(e)}")
            return 0

    def get_pdf_page_image(self, pdf_path, page_num, zoom=1.5):
        """Render one PDF page as an RGB PIL image.

        Args:
            pdf_path: Path of the PDF file.
            page_num: 1-based page number.
            zoom: Scale factor applied on both axes when rasterising.

        Returns:
            The rendered image, or ``None`` when ``page_num`` is outside
            ``1..page_count`` or rendering fails (the error is printed).
        """
        try:
            with fitz.open(pdf_path) as doc:
                # The lower bound matters: page 0 would become doc[-1], which
                # PyMuPDF presumably resolves to the last page.
                if not 1 <= page_num <= len(doc):
                    return None
                page = doc[page_num - 1]
                pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
                img_data = pix.tobytes("png")

            img = Image.open(io.BytesIO(img_data))
            if img.mode != 'RGB':
                img = img.convert('RGB')
            return img
        except Exception as e:
            print(f"Error rendering PDF page: {str(e)}")
            return None

    def _score_content(self, content, query_terms):
        """Return the heuristic relevance score of ``content`` for the query.

        ``query_terms`` are the lower-cased, whitespace-split query words.
        """
        content_lower = content.lower()
        score = 0

        # Every contiguous run of query words found verbatim in the page earns
        # 10 points per word.  Runs of length one are included, so a matching
        # single word is rewarded here and again in the loop below.
        term_count = len(query_terms)
        for start in range(term_count):
            for end in range(start + 1, term_count + 1):
                phrase = " ".join(query_terms[start:end])
                if len(phrase) > 2 and phrase in content_lower:
                    score += (end - start) * 10

        # Individual term matches (substring match, terms of 3+ characters).
        for term in query_terms:
            if len(term) > 2 and term in content_lower:
                score += 1

        # Bonus for content length (prefer detailed explanations)
        content_length = len(content.strip())
        score += content_length * 0.01

        # Penalty for very short content (likely title slides)
        if content_length < 100:
            score -= 50

        # Bonus for content that contains programming keywords
        for keyword in self._PROGRAMMING_KEYWORDS:
            if keyword in content_lower:
                score += 5

        return score

    def _select_best_content(self, results, query):
        """Pick the highest-scoring search result without using an LLM.

        Args:
            results: Search hits exposing a ``page_content`` string.
            query: The user's question.

        Returns:
            ``(best_result, best_result.page_content)``, or ``(None, None)``
            when ``results`` is empty.  Ties go to the earliest result.
        """
        if not results:
            return None, None

        query_terms = query.lower().split()
        scored_results = [
            (result, self._score_content(result.page_content, query_terms))
            for result in results
        ]

        # max() keeps the first of equally scored results, like a stable
        # descending sort would.
        best_result, best_score = max(scored_results, key=lambda pair: pair[1])

        print(f"✅ Selected content with score: {best_score}")
        return best_result, best_result.page_content

    def _generate_educational_answer(self, query, selected_content):
        """Fill a fixed answer template with the selected slide text.

        The template is chosen by substring match on the lower-cased query:
        "for loop", then "loop", then "variable", otherwise a generic one.
        """
        query_lower = query.lower()

        # "for loop" is tested first because it also contains "loop".
        if "for loop" in query_lower:
            return f"""**For Loops** are a fundamental programming construct that allows you to repeat code a specific number of times.

Based on the curriculum content:
{selected_content}

**Key characteristics of for loops:**
- They use a counter variable to track iterations
- They have a defined start, end, and increment
- They are perfect for iterating through sequences like lists, ranges, or arrays
- They are more structured than while loops

**Example:**
```python
for i in range(5):
    print(i)  # Prints 0, 1, 2, 3, 4
```

For loops are essential when you know exactly how many times you want to repeat an action."""

        if "loop" in query_lower:
            return f"""**Loops** are fundamental programming constructs that allow you to repeat code multiple times without having to write the same code repeatedly.

Based on the curriculum content:
{selected_content}

**Why loops are important:**
- Process large amounts of data efficiently
- Repeat actions a specific number of times
- Iterate through collections like lists and arrays
- Automate repetitive tasks

**Types of loops:**
- **For loops**: When you know the number of iterations
- **While loops**: When you don't know the number of iterations
- **Do-while loops**: Execute at least once, then check condition

Loops are essential for making programs efficient and handling repetitive tasks."""

        if "variable" in query_lower:
            return f"""**Variables** are fundamental programming concepts that allow you to store and manipulate data.

Based on the curriculum content:
{selected_content}

**What are variables:**
- Containers that store data values
- Have names that you choose
- Can hold different types of data (numbers, text, etc.)
- Can be changed throughout your program

**Key concepts:**
- **Declaration**: Creating a variable with a name
- **Assignment**: Giving a variable a value
- **Data types**: Different kinds of data (integers, strings, etc.)
- **Scope**: Where a variable can be used

**Example:**
```python
name = "Alice"        # String variable
age = 25             # Integer variable
is_student = True     # Boolean variable
```

Variables are the building blocks of programming - they let you work with data in your programs."""

        return f"""Based on the curriculum content:

{selected_content}

This slide explains the concept you asked about. The curriculum provides a solid foundation for understanding this programming topic.

**Key points:**
- This is fundamental programming knowledge
- Understanding this concept will help with more advanced topics
- Practice with examples to reinforce your learning
- Ask questions if you need clarification on any part

The curriculum is designed to build your programming skills step by step."""

    def _collect_slides(self, filename, page_number, window=2):
        """Render the selected page and up to ``window`` pages on each side.

        Returns:
            ``(slides, recommended_slide, recommended_label)`` where ``slides``
            is a list of ``(image, label)`` pairs in page order.  The
            recommended pair is the selected page; if that page could not be
            rendered it falls back to the first rendered slide, and to
            ``None`` when nothing was rendered or ``filename`` is unknown.
        """
        pdf_path = self.pdf_files.get(filename)
        if pdf_path is None:
            return [], None, None

        total_pages = self._page_count(pdf_path)
        first_page = max(1, page_number - window)
        last_page = min(total_pages, page_number + window)

        slides = []
        recommended_slide = None
        recommended_label = None
        for page_num in range(first_page, last_page + 1):
            img = self.get_pdf_page_image(pdf_path, page_num)
            if img is None:
                continue
            if page_num == page_number:
                label = f"📌 {filename} - Page {page_num} (Most Relevant)"
                recommended_slide, recommended_label = img, label
            else:
                label = f"{filename} - Page {page_num}"
            slides.append((img, label))

        if recommended_slide is None and slides:
            recommended_slide, recommended_label = slides[0]

        return slides, recommended_slide, recommended_label

    def chat(self, query):
        """Answer ``query`` from the indexed slides.

        Returns:
            A 4-tuple ``(answer, relevant_slides, recommended_slide,
            recommended_label)``.  ``relevant_slides`` is a list of
            ``(image, label)`` pairs; the last two items are ``None`` when no
            slide could be rendered.
        """
        print(f"\n🔍 Processing query: {query}")

        # A blank question would otherwise be embedded and matched against
        # arbitrary slides.
        if not query or not query.strip():
            return "Please enter a question about the curriculum.", [], None, None

        # Step 1: Vector search to find relevant content (the store is absent
        # when no slide text was indexed).
        results = []
        if self.vector_db is not None:
            results = self.vector_db.similarity_search(query, k=5)

        if not results:
            return "I couldn't find any relevant content in the curriculum for your question.", [], None, None

        print(f"📚 Found {len(results)} relevant slides from vector search")

        # Step 2: Intelligent content selection (never None for non-empty results)
        selected_result, selected_content = self._select_best_content(results, query)

        # Step 3: Generate educational answer
        answer = self._generate_educational_answer(query, selected_content)
        print(f"✅ Generated educational answer: {answer[:100]}...")

        # Step 4: Get relevant slides for display
        relevant_slides, recommended_slide, recommended_label = self._collect_slides(
            selected_result.metadata["filename"],
            selected_result.metadata["page_number"],
        )

        return answer, relevant_slides, recommended_slide, recommended_label

# --- Gradio UI ---
# Shared assistant instance, created once when this module is imported; its
# constructor reads the PDFs in the default "Slides" directory and builds the
# vector database, so this line does the start-up work.
assistant = ImprovedCurriculumAssistant()

def gradio_chat(query):
    """Gradio callback: answer ``query`` and return the slides to display.

    Args:
        query: Text from the question box.

    Returns:
        ``(answer_markdown, slides)`` where ``slides`` is the list of
        ``(image, label)`` pairs shown in the gallery.  A blank or missing
        query returns a short prompt and an empty gallery without running a
        search.
    """
    if not query or not query.strip():
        return "Please enter a question about the curriculum.", []

    # The recommended slide/label are part of chat()'s return value but are
    # not shown by this UI.
    reply, relevant_slides, _recommended_slide, _recommended_label = assistant.chat(query)
    return reply, relevant_slides

with gr.Blocks(title="Improved Curriculum Assistant", theme=gr.themes.Soft()) as demo:
    # Page header.
    gr.Markdown("# 🤖 Improved Curriculum Assistant\nYour AI programming tutor with intelligent content selection!")

    with gr.Row():
        # Left half: question box, submit button and the generated answer.
        with gr.Column(scale=1):
            gr.Markdown("### 💬 Chatbot")
            gr.Markdown("**Ask questions about programming concepts:**")

            question = gr.Textbox(
                lines=3,
                label="Question Input",
                placeholder="e.g., What are for loops? How do variables work? Explain functions...",
            )
            submit = gr.Button("🤖 Ask AI", variant="primary", size="lg")
            answer = gr.Markdown(label="Generated Answer")

        # Right half: gallery of the slides returned for the question.
        with gr.Column(scale=1):
            gr.Markdown("### 📄 Most Relevant Slides")
            gallery = gr.Gallery(
                label="Curriculum Slides",
                show_label=False,
                columns=1,
                rows=3,
                height="600px",
                object_fit="contain",
            )

    # Clicking the button and pressing Enter in the textbox run the same
    # callback with the same inputs and outputs.
    for register_handler in (submit.click, question.submit):
        register_handler(fn=gradio_chat, inputs=[question], outputs=[answer, gallery])

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()