Spaces:

IW2025
/

InclusiveWorldChatbotSpace

Sleeping

App Files Files Community

InclusiveWorldChatbotSpace / llm_app_fallback.py

IW2025

Upload 30 files

93fe96e verified 5 months ago

raw

history blame contribute delete

12.4 kB

	import gradio as gr
	import os
	from pathlib import Path
	import fitz # PyMuPDF
	from langchain_community.embeddings import HuggingFaceEmbeddings
	from langchain_community.vectorstores import Chroma
	import base64
	from PIL import Image
	import io
	import re

	# --- Improved Vector Search Curriculum Assistant ---

	class ImprovedCurriculumAssistant:
	    """Answer programming questions from a folder of PDF slide decks.

	    On construction every ``*.pdf`` in ``slides_dir`` is read page by page and
	    each non-empty page becomes one chunk in a Chroma vector store.
	    :meth:`chat` runs a similarity search, re-ranks the hits with a keyword
	    heuristic, wraps the best page in a templated explanation and renders the
	    surrounding pages as images for display.
	    """

	    # Words whose presence in a page earns a +5 relevance bonus each.
	    _PROGRAMMING_KEYWORDS = (
	        'function', 'variable', 'loop', 'condition', 'class',
	        'method', 'array', 'string', 'number',
	    )
	    # Number of pages shown on each side of the best-matching page.
	    _NEIGHBOR_PAGES = 2
	    # Zoom factor applied when rasterising a PDF page.
	    _RENDER_ZOOM = 1.5
	    # Reply used whenever the search cannot produce a page.
	    _NO_MATCH_MESSAGE = "I couldn't find any relevant content in the curriculum for your question."

	    def __init__(self, slides_dir="Slides"):
	        """Load the PDFs found in ``slides_dir`` and index their pages.

	        Args:
	            slides_dir: Directory that is searched (non-recursively) for
	                ``*.pdf`` files.
	        """
	        self.pdf_pages = {}  # {filename: {page_num: text}}
	        self.pdf_files = {}  # {filename: path}
	        self.chunks = []
	        self.chunk_metadata = []
	        self.vector_db = None
	        self.embeddings = None

	        # Setup
	        self._process_pdfs(slides_dir)
	        self._build_vector_db()

	    def _process_pdfs(self, slides_dir):
	        """Extract the text of every PDF in ``slides_dir``, one chunk per page.

	        Fills ``pdf_files``, ``pdf_pages``, ``chunks`` and ``chunk_metadata``.
	        Pages whose text is empty after stripping are skipped. Page numbers
	        are 1-based.
	        """
	        # Sorted so that chunk order does not depend on directory listing order.
	        pdf_files = sorted(Path(slides_dir).glob("*.pdf"))

	        for pdf_file in pdf_files:
	            self.pdf_files[pdf_file.name] = str(pdf_file)
	            doc = fitz.open(str(pdf_file))
	            try:
	                pages = {}
	                for page_index in range(len(doc)):
	                    text = doc[page_index].get_text().strip()
	                    if text:
	                        pages[page_index + 1] = text
	            finally:
	                # Release the file handle even if text extraction fails.
	                doc.close()

	            self.pdf_pages[pdf_file.name] = pages

	            # Add each page as a chunk
	            for page_num, text in pages.items():
	                self.chunks.append(text)
	                self.chunk_metadata.append({
	                    "filename": pdf_file.name,
	                    "page_number": page_num,
	                })

	        print(f"✅ Processed {len(pdf_files)} PDF files with {len(self.chunks)} total pages")

	    def _build_vector_db(self):
	        """Build the Chroma vector store used for semantic search.

	        Does nothing (``vector_db`` stays ``None``) when no page text was
	        extracted, so an empty slides folder does not abort start-up.
	        """
	        if not self.chunks:
	            print("⚠️ No slide text found - skipping vector database build")
	            return

	        self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

	        # The store is persisted on disk. Stable per-page ids make a restart
	        # overwrite the existing entries instead of appending a second copy of
	        # every page (which would fill the top-k results with duplicates).
	        chunk_ids = [
	            f"{meta['filename']}::page-{meta['page_number']}"
	            for meta in self.chunk_metadata
	        ]
	        self.vector_db = Chroma.from_texts(
	            texts=self.chunks,
	            embedding=self.embeddings,
	            metadatas=self.chunk_metadata,
	            ids=chunk_ids,
	            persist_directory="./chroma_db",
	        )
	        print("✅ Vector database built successfully")

	    def get_pdf_page_image(self, pdf_path, page_num):
	        """Render one PDF page as an RGB PIL image.

	        Args:
	            pdf_path: Path of the PDF file.
	            page_num: 1-based page number.

	        Returns:
	            The rendered image, or ``None`` when ``page_num`` is outside the
	            document or rendering fails (the error is printed).
	        """
	        try:
	            doc = fitz.open(pdf_path)
	            try:
	                # The lower bound matters: without it page 0 would be looked
	                # up as index -1 instead of being rejected.
	                if not 1 <= page_num <= len(doc):
	                    return None
	                zoom = fitz.Matrix(self._RENDER_ZOOM, self._RENDER_ZOOM)
	                pix = doc[page_num - 1].get_pixmap(matrix=zoom)
	                img = Image.open(io.BytesIO(pix.tobytes("png")))
	                if img.mode != 'RGB':
	                    img = img.convert('RGB')
	                return img
	            finally:
	                # Always close, including when rendering raises.
	                doc.close()
	        except Exception as e:
	            print(f"Error rendering PDF page: {str(e)}")
	            return None

	    def _score_content(self, query_terms, content):
	        """Return the heuristic relevance score of ``content`` for the query.

	        Scoring: 10 points per word for every contiguous run of query words
	        found verbatim, 1 point per matching query word, 0.01 per character of
	        content, minus 50 for pages under 100 characters (likely title
	        slides), plus 5 per programming keyword present.
	        """
	        content_lower = content.lower()
	        score = 0

	        # Check for exact phrase matches
	        term_count = len(query_terms)
	        for start in range(term_count):
	            for end in range(start + 1, term_count + 1):
	                phrase = " ".join(query_terms[start:end])
	                if len(phrase) > 2 and phrase in content_lower:
	                    score += (end - start) * 10

	        # Check for individual term matches
	        for term in query_terms:
	            if len(term) > 2 and term in content_lower:
	                score += 1

	        # Bonus for content length (prefer detailed explanations)
	        content_length = len(content.strip())
	        score += content_length * 0.01

	        # Penalty for very short content (likely title slides)
	        if content_length < 100:
	            score -= 50

	        # Bonus for content that contains programming keywords
	        for keyword in self._PROGRAMMING_KEYWORDS:
	            if keyword in content_lower:
	                score += 5

	        return score

	    def _select_best_content(self, results, query):
	        """Pick the highest-scoring search hit without using an LLM.

	        Args:
	            results: Search hits; each must expose ``page_content``.
	            query: The user's question.

	        Returns:
	            ``(best_result, best_result.page_content)``, or ``(None, None)``
	            when ``results`` is empty. Ties go to the earliest hit.
	        """
	        if not results:
	            return None, None

	        query_terms = query.lower().split()
	        scored_results = [
	            (result, self._score_content(query_terms, result.page_content))
	            for result in results
	        ]

	        # max() keeps the first of equal scores, matching a stable descending sort.
	        best_result, best_score = max(scored_results, key=lambda pair: pair[1])

	        print(f"✅ Selected content with score: {best_score}")
	        return best_result, best_result.page_content

	    def _generate_educational_answer(self, query, selected_content):
	        """Wrap ``selected_content`` in a templated explanation.

	        The template is chosen by substring match on the lower-cased query:
	        "for loop", then "loop", then "variable", otherwise a generic one.
	        Templates are assembled from explicit ``\\n``-terminated pieces so the
	        emitted Markdown is flush-left regardless of source indentation.
	        """
	        query_lower = query.lower()

	        if "loop" in query_lower:
	            if "for loop" in query_lower:
	                return (
	                    "For Loops are a fundamental programming construct that allows you to repeat code a specific number of times.\n"
	                    "\n"
	                    "Based on the curriculum content:\n"
	                    f"{selected_content}\n"
	                    "\n"
	                    "Key characteristics of for loops:\n"
	                    "- They use a counter variable to track iterations\n"
	                    "- They have a defined start, end, and increment\n"
	                    "- They are perfect for iterating through sequences like lists, ranges, or arrays\n"
	                    "- They are more structured than while loops\n"
	                    "\n"
	                    "Example:\n"
	                    "```python\n"
	                    "for i in range(5):\n"
	                    "    print(i) # Prints 0, 1, 2, 3, 4\n"
	                    "```\n"
	                    "\n"
	                    "For loops are essential when you know exactly how many times you want to repeat an action."
	                )
	            return (
	                "Loops are fundamental programming constructs that allow you to repeat code multiple times without having to write the same code repeatedly.\n"
	                "\n"
	                "Based on the curriculum content:\n"
	                f"{selected_content}\n"
	                "\n"
	                "Why loops are important:\n"
	                "- Process large amounts of data efficiently\n"
	                "- Repeat actions a specific number of times\n"
	                "- Iterate through collections like lists and arrays\n"
	                "- Automate repetitive tasks\n"
	                "\n"
	                "Types of loops:\n"
	                "- For loops: When you know the number of iterations\n"
	                "- While loops: When you don't know the number of iterations\n"
	                "- Do-while loops: Execute at least once, then check condition\n"
	                "\n"
	                "Loops are essential for making programs efficient and handling repetitive tasks."
	            )

	        if "variable" in query_lower:
	            return (
	                "Variables are fundamental programming concepts that allow you to store and manipulate data.\n"
	                "\n"
	                "Based on the curriculum content:\n"
	                f"{selected_content}\n"
	                "\n"
	                "What are variables:\n"
	                "- Containers that store data values\n"
	                "- Have names that you choose\n"
	                "- Can hold different types of data (numbers, text, etc.)\n"
	                "- Can be changed throughout your program\n"
	                "\n"
	                "Key concepts:\n"
	                "- Declaration: Creating a variable with a name\n"
	                "- Assignment: Giving a variable a value\n"
	                "- Data types: Different kinds of data (integers, strings, etc.)\n"
	                "- Scope: Where a variable can be used\n"
	                "\n"
	                "Example:\n"
	                "```python\n"
	                "name = \"Alice\" # String variable\n"
	                "age = 25 # Integer variable\n"
	                "is_student = True # Boolean variable\n"
	                "```\n"
	                "\n"
	                "Variables are the building blocks of programming - they let you work with data in your programs."
	            )

	        return (
	            "Based on the curriculum content:\n"
	            "\n"
	            f"{selected_content}\n"
	            "\n"
	            "This slide explains the concept you asked about. The curriculum provides a solid foundation for understanding this programming topic.\n"
	            "\n"
	            "Key points:\n"
	            "- This is fundamental programming knowledge\n"
	            "- Understanding this concept will help with more advanced topics\n"
	            "- Practice with examples to reinforce your learning\n"
	            "- Ask questions if you need clarification on any part\n"
	            "\n"
	            "The curriculum is designed to build your programming skills step by step."
	        )

	    def chat(self, query):
	        """Answer ``query`` and collect the slides to show next to the answer.

	        Args:
	            query: The user's question.

	        Returns:
	            A 4-tuple ``(answer, relevant_slides, recommended_slide,
	            recommended_label)``. ``relevant_slides`` is a list of
	            ``(image, label)`` pairs covering the best page and up to two
	            pages either side; ``recommended_slide`` / ``recommended_label``
	            are the image and label of the best page itself, or ``None`` when
	            it could not be rendered.
	        """
	        print(f"\n🔍 Processing query: {query}")

	        # A blank question has nothing to search for.
	        if not query or not query.strip():
	            return "Please enter a question about the curriculum.", [], None, None

	        # vector_db is None when no slide text was available at start-up.
	        if self.vector_db is None:
	            return self._NO_MATCH_MESSAGE, [], None, None

	        # Step 1: Vector search to find relevant content
	        results = self.vector_db.similarity_search(query, k=5)
	        if not results:
	            return self._NO_MATCH_MESSAGE, [], None, None

	        print(f"📚 Found {len(results)} relevant slides from vector search")

	        # Step 2: Intelligent content selection (never None for non-empty results)
	        selected_result, selected_content = self._select_best_content(results, query)

	        # Step 3: Generate educational answer
	        answer = self._generate_educational_answer(query, selected_content)
	        print(f"✅ Generated educational answer: {answer[:100]}...")

	        # Step 4: Get relevant slides for display
	        relevant_slides = []
	        recommended_slide = None
	        recommended_label = None

	        metadata = selected_result.metadata or {}
	        filename = metadata.get("filename")
	        page_number = metadata.get("page_number")
	        pdf_path = self.pdf_files.get(filename)

	        if pdf_path is not None and page_number is not None:
	            first_page = max(1, page_number - self._NEIGHBOR_PAGES)
	            last_page = page_number + self._NEIGHBOR_PAGES

	            # Pages past the end of the document come back as None and are
	            # skipped, so the page count does not have to be read separately.
	            for page_num in range(first_page, last_page + 1):
	                img = self.get_pdf_page_image(pdf_path, page_num)
	                if img is None:
	                    continue
	                if page_num == page_number:
	                    label = f"📌 {filename} - Page {page_num} (Most Relevant)"
	                    # The recommendation is the selected page itself, not
	                    # whichever neighbour happens to be first in the list.
	                    recommended_slide = img
	                    recommended_label = label
	                else:
	                    label = f"{filename} - Page {page_num}"
	                relevant_slides.append((img, label))

	        return answer, relevant_slides, recommended_slide, recommended_label

	# --- Gradio UI ---
	# Single module-level assistant shared by every request. It is created at
	# import time with the default "Slides" directory; the constructor reads the
	# PDFs and builds the vector database, so importing this module does that work.
	assistant = ImprovedCurriculumAssistant()

	def gradio_chat(query):
	    """Gradio callback: answer ``query`` with the shared assistant.

	    Returns only the answer text and the slide list; the recommended slide
	    and its label that ``assistant.chat`` also returns are not shown in the UI.
	    """
	    reply_text, slide_items, _top_slide, _top_label = assistant.chat(query)
	    return reply_text, slide_items

	# Two-column page: question and answer on the left, slide gallery on the right.
	with gr.Blocks(title="Improved Curriculum Assistant", theme=gr.themes.Soft()) as demo:
	    gr.Markdown("# 🤖 Improved Curriculum Assistant\nYour AI programming tutor with intelligent content selection!")

	    with gr.Row():
	        # Left column: question box, submit button and rendered answer.
	        with gr.Column(scale=1):
	            gr.Markdown("### 💬 Chatbot")
	            gr.Markdown("Ask questions about programming concepts:")

	            question = gr.Textbox(
	                label="Question Input",
	                placeholder="e.g., What are for loops? How do variables work? Explain functions...",
	                lines=3,
	            )
	            submit = gr.Button("🤖 Ask AI", variant="primary", size="lg")
	            answer = gr.Markdown(label="Generated Answer")

	        # Right column: gallery of the slides returned with the answer.
	        with gr.Column(scale=1):
	            gr.Markdown("### 📄 Most Relevant Slides")
	            gallery = gr.Gallery(
	                label="Curriculum Slides",
	                columns=1,
	                rows=3,
	                height="600px",
	                object_fit="contain",
	                show_label=False,
	            )

	    # Clicking the button and pressing Enter in the textbox run the same
	    # callback with the same inputs and outputs.
	    _chat_event = dict(fn=gradio_chat, inputs=[question], outputs=[answer, gallery])
	    submit.click(**_chat_event)
	    question.submit(**_chat_event)

	# Start the web server only when run as a script.
	if __name__ == "__main__":
	    demo.launch()