# InclusiveWorldChatbotSpace / llm_app_fallback.py
# IW2025's picture
# Upload 30 files
# 93fe96e verified
import gradio as gr
import os
from pathlib import Path
import fitz # PyMuPDF
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
import base64
from PIL import Image
import io
import re
# --- Improved Vector Search Curriculum Assistant ---
class ImprovedCurriculumAssistant:
    """Curriculum Q&A helper backed by semantic search over slide PDFs.

    On construction every ``*.pdf`` in ``slides_dir`` is read page by page,
    each non-empty page becomes one searchable chunk, and the chunks are
    embedded into a Chroma vector store. ``chat`` answers a question with a
    templated explanation built around the best-matching page, plus rendered
    images of that page and its neighbours.
    """

    EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
    PERSIST_DIRECTORY = "./chroma_db"
    # Number of candidate pages requested from the vector store per query.
    SEARCH_RESULTS = 5
    # How many pages before/after the selected page are shown alongside it.
    NEIGHBOR_PAGES = 2
    NO_MATCH_MESSAGE = "I couldn't find any relevant content in the curriculum for your question."
    EMPTY_QUERY_MESSAGE = "Please enter a question about the curriculum."
    # Pages mentioning these words get a small score bonus in
    # ``_select_best_content``.
    PROGRAMMING_KEYWORDS = (
        'function', 'variable', 'loop', 'condition', 'class',
        'method', 'array', 'string', 'number',
    )

    def __init__(self, slides_dir="Slides"):
        """Load all slide PDFs from ``slides_dir`` and index their text."""
        self.pdf_pages = {}  # {filename: {page_num: text}}
        self.pdf_files = {}  # {filename: path}
        self.chunks = []  # one entry per non-empty page
        self.chunk_metadata = []  # parallel to ``chunks``
        self.vector_db = None  # stays None when there is nothing to index
        self.embeddings = None

        # Setup
        self._process_pdfs(slides_dir)
        self._build_vector_db()

    def _process_pdfs(self, slides_dir):
        """Extract the text of every page of every PDF in ``slides_dir``.

        Fills ``pdf_files``, ``pdf_pages``, ``chunks`` and ``chunk_metadata``.
        Pages whose extracted text is empty are skipped. Page numbers are
        1-based.
        """
        slides_path = Path(slides_dir)
        # glob() yields files in arbitrary order; sort so the chunk order is
        # reproducible from run to run.
        pdf_files = sorted(slides_path.glob("*.pdf"))

        for pdf_file in pdf_files:
            self.pdf_files[pdf_file.name] = str(pdf_file)

            pages = {}
            # The context manager closes the document even if extraction
            # raises part-way through.
            with fitz.open(str(pdf_file)) as doc:
                for page_num, page in enumerate(doc, start=1):
                    text = page.get_text().strip()
                    if text:
                        pages[page_num] = text
            self.pdf_pages[pdf_file.name] = pages

            # Add each page as a chunk
            for page_num, text in pages.items():
                self.chunks.append(text)
                self.chunk_metadata.append({
                    "filename": pdf_file.name,
                    "page_number": page_num
                })

        print(f"βœ… Processed {len(pdf_files)} PDF files with {len(self.chunks)} total pages")

    def _build_vector_db(self):
        """Build the Chroma vector database used for semantic search.

        Does nothing (``vector_db`` stays ``None``) when no page text was
        extracted, so an empty or missing slides directory does not try to
        index zero documents.
        """
        if not self.chunks:
            print("No slide text found; skipping vector database build")
            return

        self.embeddings = HuggingFaceEmbeddings(model_name=self.EMBEDDING_MODEL)
        # Deterministic ids make re-indexing into the persisted collection an
        # overwrite instead of an append; without them every restart would add
        # a duplicate copy of each page and duplicates would crowd the top-k.
        # NOTE(review): entries for PDFs that were later removed from the
        # slides directory are not purged from the persisted store.
        chunk_ids = [
            f"{meta['filename']}::page-{meta['page_number']}"
            for meta in self.chunk_metadata
        ]
        self.vector_db = Chroma.from_texts(
            texts=self.chunks,
            embedding=self.embeddings,
            metadatas=self.chunk_metadata,
            ids=chunk_ids,
            persist_directory=self.PERSIST_DIRECTORY
        )
        print("βœ… Vector database built successfully")

    def get_pdf_page_image(self, pdf_path, page_num):
        """Render one PDF page as an RGB PIL image.

        Args:
            pdf_path: Path of the PDF file.
            page_num: 1-based page number.

        Returns:
            The rendered image, or ``None`` when ``page_num`` is outside the
            document or rendering fails (the error is printed).
        """
        # Reject non-positive page numbers up front: ``doc[page_num - 1]``
        # would otherwise be a negative index rather than an out-of-range page.
        if page_num < 1:
            return None
        try:
            with fitz.open(pdf_path) as doc:
                if page_num > len(doc):
                    return None
                # 1.5x zoom on both axes for a more legible slide image.
                pix = doc[page_num - 1].get_pixmap(matrix=fitz.Matrix(1.5, 1.5))
                img_data = pix.tobytes("png")
            img = Image.open(io.BytesIO(img_data))
            if img.mode != 'RGB':
                img = img.convert('RGB')
            return img
        except Exception as e:
            # Best effort: a page that cannot be rendered is simply omitted.
            print(f"Error rendering PDF page: {str(e)}")
            return None

    def _count_pdf_pages(self, pdf_path):
        """Return the number of pages in the PDF at ``pdf_path``."""
        with fitz.open(pdf_path) as doc:
            return len(doc)

    def _score_content(self, content, query_terms):
        """Score how well ``content`` matches the lower-cased ``query_terms``.

        The score adds 10 points per word for every contiguous query phrase
        found in the content, 1 point per matching term, 0.01 per character of
        content, 5 per programming keyword present, and subtracts 50 for
        content shorter than 100 characters (likely a title slide).
        """
        content_lower = content.lower()
        score = 0

        # Every contiguous run of query terms counts as a phrase, including
        # single terms; longer matching phrases are worth more.
        term_count = len(query_terms)
        for start in range(term_count):
            for end in range(start + 1, term_count + 1):
                phrase_terms = query_terms[start:end]
                phrase = " ".join(phrase_terms)
                if len(phrase) > 2 and phrase in content_lower:
                    score += len(phrase_terms) * 10

        # Individual term matches
        for term in query_terms:
            if len(term) > 2 and term in content_lower:
                score += 1

        # Prefer detailed explanations over short slides.
        content_length = len(content.strip())
        score += content_length * 0.01
        if content_length < 100:
            score -= 50

        for keyword in self.PROGRAMMING_KEYWORDS:
            if keyword in content_lower:
                score += 5

        return score

    def _select_best_content(self, results, query):
        """Pick the most useful search result without calling an LLM.

        Args:
            results: Search hits exposing a ``page_content`` string.
            query: The user's question.

        Returns:
            ``(best_result, best_result.page_content)``, or ``(None, None)``
            when ``results`` is empty. Ties keep the earliest result.
        """
        if not results:
            return None, None

        query_terms = query.lower().split()
        scored_results = [
            (result, self._score_content(result.page_content, query_terms))
            for result in results
        ]
        # max() returns the first of equally scored results, matching a
        # stable descending sort.
        best_result, best_score = max(scored_results, key=lambda pair: pair[1])
        print(f"βœ… Selected content with score: {best_score}")
        return best_result, best_result.page_content

    def _generate_educational_answer(self, query, selected_content):
        """Wrap ``selected_content`` in a templated Markdown explanation.

        The template is chosen from keywords in the query: "for loop", then
        "loop", then "variable", otherwise a generic template.
        """
        query_lower = query.lower()

        if "for loop" in query_lower:
            return f"""**For Loops** are a fundamental programming construct that allows you to repeat code a specific number of times.
Based on the curriculum content:
{selected_content}
**Key characteristics of for loops:**
- They use a counter variable to track iterations
- They have a defined start, end, and increment
- They are perfect for iterating through sequences like lists, ranges, or arrays
- They are more structured than while loops
**Example:**
```python
for i in range(5):
    print(i) # Prints 0, 1, 2, 3, 4
```
For loops are essential when you know exactly how many times you want to repeat an action."""

        if "loop" in query_lower:
            return f"""**Loops** are fundamental programming constructs that allow you to repeat code multiple times without having to write the same code repeatedly.
Based on the curriculum content:
{selected_content}
**Why loops are important:**
- Process large amounts of data efficiently
- Repeat actions a specific number of times
- Iterate through collections like lists and arrays
- Automate repetitive tasks
**Types of loops:**
- **For loops**: When you know the number of iterations
- **While loops**: When you don't know the number of iterations
- **Do-while loops**: Execute at least once, then check condition
Loops are essential for making programs efficient and handling repetitive tasks."""

        if "variable" in query_lower:
            return f"""**Variables** are fundamental programming concepts that allow you to store and manipulate data.
Based on the curriculum content:
{selected_content}
**What are variables:**
- Containers that store data values
- Have names that you choose
- Can hold different types of data (numbers, text, etc.)
- Can be changed throughout your program
**Key concepts:**
- **Declaration**: Creating a variable with a name
- **Assignment**: Giving a variable a value
- **Data types**: Different kinds of data (integers, strings, etc.)
- **Scope**: Where a variable can be used
**Example:**
```python
name = "Alice" # String variable
age = 25 # Integer variable
is_student = True # Boolean variable
```
Variables are the building blocks of programming - they let you work with data in your programs."""

        return f"""Based on the curriculum content:
{selected_content}
This slide explains the concept you asked about. The curriculum provides a solid foundation for understanding this programming topic.
**Key points:**
- This is fundamental programming knowledge
- Understanding this concept will help with more advanced topics
- Practice with examples to reinforce your learning
- Ask questions if you need clarification on any part
The curriculum is designed to build your programming skills step by step."""

    def _collect_slides(self, selected_result):
        """Render the selected page and its neighbours.

        Returns:
            ``(slides, recommended)`` where ``slides`` is a list of
            ``(image, label)`` pairs in page order and ``recommended`` is the
            pair for the selected page (falling back to the first rendered
            slide if the selected page could not be rendered), or ``None``
            when nothing was rendered.
        """
        filename = selected_result.metadata.get("filename")
        page_number = selected_result.metadata.get("page_number")
        pdf_path = self.pdf_files.get(filename)
        if pdf_path is None or page_number is None:
            # The hit refers to a file that is not among the loaded PDFs.
            return [], None

        total_pages = self._count_pdf_pages(pdf_path)
        first_page = max(1, page_number - self.NEIGHBOR_PAGES)
        last_page = min(total_pages, page_number + self.NEIGHBOR_PAGES)

        slides = []
        recommended = None
        for page_num in range(first_page, last_page + 1):
            img = self.get_pdf_page_image(pdf_path, page_num)
            if img is None:
                continue
            if page_num == page_number:
                label = f"πŸ“Œ {filename} - Page {page_num} (Most Relevant)"
                recommended = (img, label)
            else:
                label = f"{filename} - Page {page_num}"
            slides.append((img, label))

        if recommended is None and slides:
            recommended = slides[0]
        return slides, recommended

    def chat(self, query):
        """Answer ``query`` from the curriculum.

        Returns:
            A 4-tuple ``(answer, relevant_slides, recommended_slide,
            recommended_label)``. ``relevant_slides`` is a list of
            ``(image, label)`` pairs; the recommended slide is the
            best-matching page itself. The last two are ``None`` when no
            slide could be shown.
        """
        if not query or not query.strip():
            return self.EMPTY_QUERY_MESSAGE, [], None, None

        print(f"\nπŸ” Processing query: {query}")

        # Step 1: Vector search to find relevant content
        if self.vector_db is None:
            # Nothing was indexed (no PDFs or no extractable text).
            return self.NO_MATCH_MESSAGE, [], None, None
        results = self.vector_db.similarity_search(query, k=self.SEARCH_RESULTS)
        if not results:
            return self.NO_MATCH_MESSAGE, [], None, None
        print(f"πŸ“š Found {len(results)} relevant slides from vector search")

        # Step 2: Intelligent content selection
        selected_result, selected_content = self._select_best_content(results, query)
        if not selected_result:
            selected_result = results[0]
            selected_content = selected_result.page_content

        # Step 3: Generate educational answer
        answer = self._generate_educational_answer(query, selected_content)
        print(f"βœ… Generated educational answer: {answer[:100]}...")

        # Step 4: Get relevant slides for display
        relevant_slides, recommended = self._collect_slides(selected_result)
        if recommended is None:
            return answer, relevant_slides, None, None
        recommended_slide, recommended_label = recommended
        return answer, relevant_slides, recommended_slide, recommended_label
# --- Gradio UI ---
# Module-level assistant shared by the UI callbacks. Constructing it runs the
# whole setup in __init__ (PDF text extraction from the default "Slides"
# directory, then the vector database build), so that work happens when this
# module is loaded.
assistant = ImprovedCurriculumAssistant()
def gradio_chat(query):
    """Gradio callback: answer ``query`` and return what the UI displays.

    Returns the Markdown answer and the list of slide images for the gallery.
    """
    # chat() also reports a recommended slide and its label; this UI has no
    # widget for them, so they are unpacked and dropped.
    reply, slides, _recommended_slide, _recommended_label = assistant.chat(query)
    return reply, slides
# Two-column page: question box and answer on the left, slide gallery on the right.
with gr.Blocks(title="Improved Curriculum Assistant", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# πŸ€– Improved Curriculum Assistant\nYour AI programming tutor with intelligent content selection!")

    with gr.Row():
        # Left column: chatbot interface
        with gr.Column(scale=1):
            gr.Markdown("### πŸ’¬ Chatbot")
            gr.Markdown("**Ask questions about programming concepts:**")
            question = gr.Textbox(
                label="Question Input",
                placeholder="e.g., What are for loops? How do variables work? Explain functions...",
                lines=3,
            )
            submit = gr.Button("πŸ€– Ask AI", variant="primary", size="lg")
            answer = gr.Markdown(label="Generated Answer")

        # Right column: slides display
        with gr.Column(scale=1):
            gr.Markdown("### πŸ“„ Most Relevant Slides")
            gallery = gr.Gallery(
                label="Curriculum Slides",
                columns=1,
                rows=3,
                height="600px",
                object_fit="contain",
                show_label=False,
            )

    # Clicking the button and pressing Enter in the textbox run the same
    # handler with the same inputs and outputs.
    for trigger in (submit.click, question.submit):
        trigger(fn=gradio_chat, inputs=[question], outputs=[answer, gallery])

if __name__ == "__main__":
    demo.launch()