File size: 12,402 Bytes
93fe96e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
import gradio as gr
import os
from pathlib import Path
import fitz  # PyMuPDF
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
import base64
from PIL import Image
import io
import re

# --- Improved Vector Search Curriculum Assistant ---

class ImprovedCurriculumAssistant:
    """Answer curriculum questions from slide PDFs using vector search.

    On construction every ``*.pdf`` in ``slides_dir`` is read with PyMuPDF,
    each non-empty page becomes one text chunk, and the chunks are embedded
    into a Chroma vector store.  ``chat`` then retrieves candidate pages,
    re-ranks them with a keyword heuristic, wraps the best page in a templated
    explanation and renders that page plus its neighbours as images.
    """

    # Topic words that earn a relevance bonus during re-ranking.
    _PROGRAMMING_KEYWORDS = (
        'function', 'variable', 'loop', 'condition', 'class',
        'method', 'array', 'string', 'number',
    )

    # Reply text used whenever no curriculum content can be offered.
    _NOT_FOUND_MESSAGE = (
        "I couldn't find any relevant content in the curriculum for your question."
    )

    def __init__(self, slides_dir="Slides"):
        """Load the slide PDFs from ``slides_dir`` and build the search index."""
        self.pdf_pages = {}  # {filename: {page_num: text}}
        self.pdf_files = {}  # {filename: path}
        self.chunks = []  # page texts, parallel to chunk_metadata
        self.chunk_metadata = []  # [{"filename": ..., "page_number": ...}]
        self.vector_db = None  # stays None when no text could be extracted
        self.embeddings = None

        # Setup
        self._process_pdfs(slides_dir)
        self._build_vector_db()

    def _process_pdfs(self, slides_dir):
        """Extract the text of every page of every PDF in ``slides_dir``.

        Pages whose text is empty after stripping are skipped.  Page numbers
        stored in ``pdf_pages`` and ``chunk_metadata`` are 1-based.
        """
        slides_path = Path(slides_dir)
        # glob() order is filesystem dependent; sort so chunk order is stable.
        pdf_files = sorted(slides_path.glob("*.pdf"))

        for pdf_file in pdf_files:
            self.pdf_files[pdf_file.name] = str(pdf_file)
            pages = {}

            # The context manager closes the document even if extraction fails.
            with fitz.open(str(pdf_file)) as doc:
                for page_num, page in enumerate(doc, start=1):
                    text = page.get_text().strip()
                    if text:
                        pages[page_num] = text

            self.pdf_pages[pdf_file.name] = pages

            # Add each page as a chunk
            for page_num, text in pages.items():
                self.chunks.append(text)
                self.chunk_metadata.append({
                    "filename": pdf_file.name,
                    "page_number": page_num
                })

        print(f"βœ… Processed {len(pdf_files)} PDF files with {len(self.chunks)} total pages")

    def _build_vector_db(self):
        """Embed all chunks into the Chroma store persisted at ``./chroma_db``.

        Does nothing (leaving ``vector_db`` as ``None``) when there are no
        chunks, because there is nothing to index.
        """
        if not self.chunks:
            print("No slide text found; skipping vector database build")
            return

        self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        # Stable per-page ids: the store is persisted on disk, so without them
        # every restart would add a fresh copy of each page and searches would
        # return duplicates.  NOTE(review): entries for PDFs that were removed
        # from the slides directory are not purged here.
        chunk_ids = [
            f"{meta['filename']}::page-{meta['page_number']}"
            for meta in self.chunk_metadata
        ]
        self.vector_db = Chroma.from_texts(
            texts=self.chunks,
            embedding=self.embeddings,
            metadatas=self.chunk_metadata,
            ids=chunk_ids,
            persist_directory="./chroma_db"
        )
        print("βœ… Vector database built successfully")

    def get_pdf_page_image(self, pdf_path, page_num):
        """Render one PDF page as an RGB PIL image.

        Args:
            pdf_path: Path of the PDF file to open.
            page_num: 1-based page number.

        Returns:
            The rendered image, or ``None`` when ``page_num`` is outside the
            document or rendering fails (the error is printed, not raised).
        """
        try:
            with fitz.open(pdf_path) as doc:
                # The lower bound matters: page 0 would otherwise index
                # doc[-1] and silently render the wrong page.
                if not 1 <= page_num <= len(doc):
                    return None
                page = doc[page_num - 1]
                pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5))
                img_data = pix.tobytes("png")

            img = Image.open(io.BytesIO(img_data))
            if img.mode != 'RGB':
                img = img.convert('RGB')
            return img
        except Exception as e:
            # Best effort: a page that cannot be rendered is simply omitted.
            print(f"Error rendering PDF page: {str(e)}")
            return None

    def _score_content(self, content, query_terms):
        """Return the heuristic relevance score of ``content`` for the query.

        ``query_terms`` is the lower-cased, whitespace-split query.
        """
        content_lower = content.lower()
        score = 0

        # Every contiguous run of query words found in the page earns
        # 10 points per word in the run.
        term_count = len(query_terms)
        for start in range(term_count):
            for stop in range(start + 1, term_count + 1):
                phrase = " ".join(query_terms[start:stop])
                if len(phrase) > 2 and phrase in content_lower:
                    score += (stop - start) * 10

        # Individual term matches
        for term in query_terms:
            if len(term) > 2 and term in content_lower:
                score += 1

        # Prefer detailed pages; penalise very short ones (likely title slides).
        content_length = len(content.strip())
        score += content_length * 0.01
        if content_length < 100:
            score -= 50

        # Bonus for pages that mention programming vocabulary
        for keyword in self._PROGRAMMING_KEYWORDS:
            if keyword in content_lower:
                score += 5

        return score

    def _select_best_content(self, results, query):
        """Pick the most relevant search result without using an LLM.

        Args:
            results: Objects exposing a ``page_content`` string.
            query: The user's question.

        Returns:
            ``(best_result, best_result.page_content)``, or ``(None, None)``
            when ``results`` is empty.  On equal scores the earliest result
            wins.
        """
        if not results:
            return None, None

        query_terms = query.lower().split()
        scored_results = [
            (result, self._score_content(result.page_content, query_terms))
            for result in results
        ]

        # max() returns the first of equally scored items.
        best_result, best_score = max(scored_results, key=lambda pair: pair[1])

        print(f"βœ… Selected content with score: {best_score}")
        return best_result, best_result.page_content

    def _generate_educational_answer(self, query, selected_content):
        """Wrap ``selected_content`` in a templated explanation.

        The template is chosen from the query text: "for loop", then "loop",
        then "variable"; any other query gets a generic template.
        """
        query_lower = query.lower()

        # "for loop" must be tested before the broader "loop" match.
        if "for loop" in query_lower:
            return f"""**For Loops** are a fundamental programming construct that allows you to repeat code a specific number of times.

Based on the curriculum content:
{selected_content}

**Key characteristics of for loops:**
- They use a counter variable to track iterations
- They have a defined start, end, and increment
- They are perfect for iterating through sequences like lists, ranges, or arrays
- They are more structured than while loops

**Example:**
```python
for i in range(5):
    print(i)  # Prints 0, 1, 2, 3, 4
```

For loops are essential when you know exactly how many times you want to repeat an action."""

        if "loop" in query_lower:
            return f"""**Loops** are fundamental programming constructs that allow you to repeat code multiple times without having to write the same code repeatedly.

Based on the curriculum content:
{selected_content}

**Why loops are important:**
- Process large amounts of data efficiently
- Repeat actions a specific number of times
- Iterate through collections like lists and arrays
- Automate repetitive tasks

**Types of loops:**
- **For loops**: When you know the number of iterations
- **While loops**: When you don't know the number of iterations
- **Do-while loops**: Execute at least once, then check condition

Loops are essential for making programs efficient and handling repetitive tasks."""

        if "variable" in query_lower:
            return f"""**Variables** are fundamental programming concepts that allow you to store and manipulate data.

Based on the curriculum content:
{selected_content}

**What are variables:**
- Containers that store data values
- Have names that you choose
- Can hold different types of data (numbers, text, etc.)
- Can be changed throughout your program

**Key concepts:**
- **Declaration**: Creating a variable with a name
- **Assignment**: Giving a variable a value
- **Data types**: Different kinds of data (integers, strings, etc.)
- **Scope**: Where a variable can be used

**Example:**
```python
name = "Alice"        # String variable
age = 25             # Integer variable
is_student = True     # Boolean variable
```

Variables are the building blocks of programming - they let you work with data in your programs."""

        return f"""Based on the curriculum content:

{selected_content}

This slide explains the concept you asked about. The curriculum provides a solid foundation for understanding this programming topic.

**Key points:**
- This is fundamental programming knowledge
- Understanding this concept will help with more advanced topics
- Practice with examples to reinforce your learning
- Ask questions if you need clarification on any part

The curriculum is designed to build your programming skills step by step."""

    def chat(self, query):
        """Answer ``query`` and collect the slides that support the answer.

        Returns:
            A 4-tuple ``(answer, relevant_slides, recommended_slide,
            recommended_label)``.  ``relevant_slides`` is a list of
            ``(image, label)`` pairs covering the selected page and up to two
            pages either side of it.  ``recommended_slide`` and
            ``recommended_label`` describe the selected page itself, falling
            back to the first rendered neighbour when that page could not be
            rendered, and are ``None`` when no slide is available.
        """
        print(f"\nπŸ” Processing query: {query}")

        # A blank question or a missing index (no slide text was found at
        # start-up) cannot produce a meaningful search.
        if not query or not query.strip() or self.vector_db is None:
            return self._NOT_FOUND_MESSAGE, [], None, None

        # Step 1: Vector search to find relevant content
        results = self.vector_db.similarity_search(query, k=5)

        if not results:
            return self._NOT_FOUND_MESSAGE, [], None, None

        print(f"πŸ“š Found {len(results)} relevant slides from vector search")

        # Step 2: Intelligent content selection (never None for non-empty results)
        selected_result, selected_content = self._select_best_content(results, query)

        # Step 3: Generate educational answer
        answer = self._generate_educational_answer(query, selected_content)
        print(f"βœ… Generated educational answer: {answer[:100]}...")

        # Step 4: Get relevant slides for display
        relevant_slides = []
        recommended_slide = None
        recommended_label = None

        filename = selected_result.metadata["filename"]
        page_number = selected_result.metadata["page_number"]
        pdf_path = self.pdf_files.get(filename)

        if pdf_path is not None:
            with fitz.open(pdf_path) as doc:
                total_pages = len(doc)

            # Get the selected page and neighboring pages
            start_page = max(1, page_number - 2)
            end_page = min(total_pages, page_number + 2)

            for page_num in range(start_page, end_page + 1):
                img = self.get_pdf_page_image(pdf_path, page_num)
                if img is None:
                    continue
                if page_num == page_number:
                    label = f"πŸ“Œ {filename} - Page {page_num} (Most Relevant)"
                    # The recommendation is the selected page itself, not
                    # whichever neighbour happens to come first.
                    recommended_slide, recommended_label = img, label
                else:
                    label = f"{filename} - Page {page_num}"
                relevant_slides.append((img, label))

            if recommended_slide is None and relevant_slides:
                recommended_slide, recommended_label = relevant_slides[0]

        return answer, relevant_slides, recommended_slide, recommended_label

# --- Gradio UI ---
# Single module-level assistant used by the request handler below.
# Constructing it runs the PDF text extraction and the vector-database build
# immediately, i.e. when this module is loaded, before the UI is defined.
assistant = ImprovedCurriculumAssistant()

def gradio_chat(query):
    """Gradio callback: answer ``query`` and return the slides to display.

    Delegates to the module-level ``assistant`` and forwards only the first
    two elements of its 4-tuple (answer text and slide gallery items); the
    recommended slide and its label are not used by the UI.
    """
    response_text, gallery_items, _slide, _slide_label = assistant.chat(query)
    return response_text, gallery_items

# Two-column page: the question box, submit button and answer on the left,
# the slide gallery on the right.
with gr.Blocks(title="Improved Curriculum Assistant", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# πŸ€– Improved Curriculum Assistant\nYour AI programming tutor with intelligent content selection!")
    
    with gr.Row():
        # Left Column - Chatbot Interface
        with gr.Column(scale=1):
            gr.Markdown("### πŸ’¬ Chatbot")
            gr.Markdown("**Ask questions about programming concepts:**")
            
            question = gr.Textbox(
                label="Question Input", 
                placeholder="e.g., What are for loops? How do variables work? Explain functions...", 
                lines=3
            )
            submit = gr.Button("πŸ€– Ask AI", variant="primary", size="lg")
            # Rendered as Markdown because the generated answers contain
            # Markdown markup (bold text, bullet lists, fenced code).
            answer = gr.Markdown(label="Generated Answer")
        
        # Right Column - Slides Display
        with gr.Column(scale=1):
            gr.Markdown("### πŸ“„ Most Relevant Slides")
            # Receives the (image, label) pairs returned by gradio_chat.
            gallery = gr.Gallery(
                label="Curriculum Slides", 
                columns=1, 
                rows=3, 
                height="600px", 
                object_fit="contain",
                show_label=False
            )
    
    # Event handlers: the button click and the textbox's own submit event
    # both run the same callback and update the same two outputs.
    submit.click(fn=gradio_chat, inputs=[question], outputs=[answer, gallery])
    question.submit(fn=gradio_chat, inputs=[question], outputs=[answer, gallery])

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()