Spaces:

IW2025
/

InclusiveWorldChatbot

Sleeping

App Files Files Community

InclusiveWorldChatbot / app.py

IW2025

Update app.py

0b2f9cf verified 6 months ago

raw

history blame

24.5 kB

	import gradio as gr
	import os
	from pathlib import Path
	import fitz # PyMuPDF
	from langchain_community.embeddings import HuggingFaceEmbeddings
	from langchain_community.vectorstores import Chroma
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.llms import HuggingFacePipeline
	from langchain.prompts import PromptTemplate
	from langchain.chains import LLMChain
	from transformers import pipeline
	import torch
	import base64
	from PIL import Image
	import io
	import re

	# --- Minimal PDF Search & Display App ---

	# 1. Preprocess PDFs and build vector DB
	class CurriculumChatbot:
	def __init__(self, slides_dir="Slides"):
	self.pdf_pages = {} # {filename: {page_num: text}}
	self.pdf_files = {} # {filename: path}
	self.chunks = []
	self.chunk_metadata = []
	self.vector_db = None
	self.embeddings = None
	self.llm = None
	self.qa_chain = None
	self.slide_selection_chain = None
	self._process_pdfs(slides_dir)
	self._build_vector_db()
	self._setup_llm()

	def _process_pdfs(self, slides_dir):
	slides_path = Path(slides_dir)
	pdf_files = list(slides_path.glob("*.pdf"))
	for pdf_file in pdf_files:
	self.pdf_files[pdf_file.name] = str(pdf_file)
	doc = fitz.open(str(pdf_file))
	pages = {}
	for page_num in range(len(doc)):
	page = doc[page_num]
	text = page.get_text()
	if text.strip():
	pages[page_num + 1] = text.strip()
	self.pdf_pages[pdf_file.name] = pages
	doc.close()
	# Add each page as a chunk
	for page_num, text in pages.items():
	self.chunks.append(text)
	self.chunk_metadata.append({
	"filename": pdf_file.name,
	"page_number": page_num
	})

	def _build_vector_db(self):
	self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
	self.vector_db = Chroma.from_texts(
	texts=self.chunks,
	embedding=self.embeddings,
	metadatas=self.chunk_metadata,
	persist_directory="./chroma_db"
	)

	def _setup_llm(self):
	try:
	# Use Llama 3.1 8B with authentication token from secrets
	model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

	pipe = pipeline(
	"text-generation",
	model=model_name,
	max_new_tokens=200,
	temperature=0.3,
	do_sample=True,
	top_p=0.9,
	repetition_penalty=1.1,
	device_map="auto" if torch.cuda.is_available() else None
	)
	self.llm = HuggingFacePipeline(pipeline=pipe)

	# Create QA prompt template for Llama 3.1
	qa_template = """<\|begin_of_text\|><\|start_header_id\|>system<\|end_header_id\|>

	You are a helpful AI programming tutor. Answer questions about programming concepts clearly and educationally. If the question is about curriculum content, use the provided context. If not, provide a general programming answer.

	<\|eot_id\|><\|start_header_id\|>user<\|end_header_id\|>

	Question: {question}

	{filled_context}

	<\|eot_id\|><\|start_header_id\|>assistant<\|end_header_id\|>"""

	self.qa_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
	input_variables=["question", "filled_context"],
	template=qa_template
	))

	# Create slide selection prompt template for Llama 3.1
	slide_selection_template = """<\|begin_of_text\|><\|start_header_id\|>system<\|end_header_id\|>

	You are an AI that analyzes curriculum slides to find the best one for teaching a concept. Return ONLY the filename and page number.

	<\|eot_id\|><\|start_header_id\|>user<\|end_header_id\|>

	Question: {question}

	Here are the top 5 most relevant slides from the curriculum:

	{slide_contents}

	Which slide is the BEST for teaching this concept to a student? Consider:
	- Which slide has the most educational content?
	- Which slide explains the concept most clearly?
	- Which slide would be most helpful for learning?

	Return only: "filename.pdf - Page X"

	<\|eot_id\|><\|start_header_id\|>assistant<\|end_header_id\|>"""

	self.slide_selection_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
	input_variables=["question", "slide_contents"],
	template=slide_selection_template
	))

	# Create focused answer prompt template for Llama 3.1
	focused_qa_template = """<\|begin_of_text\|><\|start_header_id\|>system<\|end_header_id\|>

	You are a helpful AI programming tutor. Answer questions about programming concepts clearly and educationally based on the provided slide content.

	<\|eot_id\|><\|start_header_id\|>user<\|end_header_id\|>

	Slide Content:
	{slide_content}

	Question: {question}

	<\|eot_id\|><\|start_header_id\|>assistant<\|end_header_id\|>"""

	self.focused_qa_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
	input_variables=["question", "slide_content"],
	template=focused_qa_template
	))

	print("✅ Llama 3.1 8B loaded successfully!")
	except Exception as e:
	print(f"Warning: Could not load Llama 3.1 8B: {e}")
	print("Falling back to basic search mode...")
	self.llm = None
	self.qa_chain = None
	self.slide_selection_chain = None

	def get_pdf_page_image(self, pdf_path, page_num):
	try:
	doc = fitz.open(pdf_path)
	if page_num <= len(doc):
	page = doc[page_num - 1]
	mat = fitz.Matrix(1.5, 1.5)
	pix = page.get_pixmap(matrix=mat)
	img_data = pix.tobytes("png")
	img = Image.open(io.BytesIO(img_data))
	if img.mode != 'RGB':
	img = img.convert('RGB')
	doc.close()
	return img
	doc.close()
	return None
	except Exception as e:
	print(f"Error rendering PDF page: {str(e)}")
	return None

	def get_all_slides(self):
	"""Get all available slides for display"""
	all_slides = []
	for filename, pages in self.pdf_pages.items():
	for page_num in pages.keys():
	img = self.get_pdf_page_image(self.pdf_files[filename], page_num)
	if img:
	all_slides.append((img, f"{filename} - Page {page_num}"))
	return all_slides

	def get_available_slides_text(self):
	"""Get text representation of available slides for LLM"""
	slides_text = []
	for filename, pages in self.pdf_pages.items():
	for page_num in pages.keys():
	slides_text.append(f"{filename} - Page {page_num}")
	return "\n".join(slides_text)

	def chat(self, query):
	"""Comprehensive chat function with LLM answers and slide navigation"""
	# First, try to find relevant curriculum content
	results = self.vector_db.similarity_search(query, k=5) # Get more results for better selection

	# Check if query is curriculum-related
	curriculum_relevance_score = 0
	if results:
	# Calculate relevance score based on similarity
	curriculum_relevance_score = len([r for r in results if r.page_content.strip()])

	# Debug: Print what we found
	print(f"Query: {query}")
	print(f"Found {len(results)} relevant results:")
	for i, result in enumerate(results[:3]):
	print(f" {i+1}. {result.metadata['filename']} - Page {result.metadata['page_number']}")
	print(f" Content: {result.page_content[:100]}...")

	# Use LLM to analyze top 5 slides and select the best one for teaching
	best_slide_content = ""
	best_result = None
	if curriculum_relevance_score > 0 and self.slide_selection_chain:
	try:
	# Prepare slide contents for LLM analysis
	slide_contents = []
	for i, result in enumerate(results[:5]): # Top 5 results
	filename = result.metadata["filename"]
	page_num = result.metadata["page_number"]
	content = result.page_content
	slide_contents.append(f"Slide {i+1}: {filename} - Page {page_num}\nContent: {content}\n")

	slide_contents_text = "\n".join(slide_contents)

	# Use LLM to select the best slide
	slide_response = self.slide_selection_chain.run(
	question=query,
	slide_contents=slide_contents_text
	)

	# Extract filename and page from response
	slide_response = slide_response.strip()
	if "<\|eot_id\|>" in slide_response:
	slide_response = slide_response.split("<\|eot_id\|>")[-1].strip()

	# Parse the response to get filename and page
	match = re.search(r'(.+\.pdf)\s-\sPage\s*(\d+)', slide_response)
	if match:
	filename = match.group(1)
	page_num = int(match.group(2))

	# Find the corresponding result
	for result in results:
	if (result.metadata["filename"] == filename and
	result.metadata["page_number"] == page_num):
	best_result = result
	best_slide_content = result.page_content
	break

	# If LLM selection failed, fall back to first result
	if not best_result:
	best_result = results[0]
	best_slide_content = results[0].page_content
	else:
	# Fallback to first result if parsing failed
	best_result = results[0]
	best_slide_content = results[0].page_content

	except Exception as e:
	print(f"Error in LLM slide selection: {e}")
	# Fallback to first result
	best_result = results[0]
	best_slide_content = results[0].page_content
	else:
	# Fallback without LLM
	if curriculum_relevance_score > 0:
	best_result = results[0]
	best_slide_content = results[0].page_content

	# Generate focused LLM answer using the most relevant slide
	if self.focused_qa_chain and curriculum_relevance_score > 0:
	try:
	answer = self.focused_qa_chain.run(question=query, slide_content=best_slide_content)

	# Debug: Print what the LLM returned
	print(f"LLM Raw Response: {answer[:200]}...")

	# Clean up the answer
	answer = answer.strip()
	if "<\|eot_id\|>" in answer:
	answer = answer.split("<\|eot_id\|>")[-1].strip()

	# Remove any prompt artifacts
	if answer.startswith("Answer:"):
	answer = answer[7:].strip()
	if answer.startswith("Provide a clear, educational answer based on this slide:"):
	answer = answer[58:].strip()

	# Check if the answer is too short, just repeats the question, or contains the prompt
	if (len(answer.strip()) < 50 or
	answer.lower().startswith("how does that work") or
	"slide content provided" in answer.lower() or
	"provide a clear" in answer.lower() or
	"answer the question based on" in answer.lower() or
	"slide content:" in answer.lower()):

	# Generate a proper answer using the slide content
	slide_info = f"📄 Slide Reference: {best_result.metadata['filename']} - Page {best_result.metadata['page_number']}"

	if "loops" in query.lower():
	answer = f"{slide_info}\n\nSlide Content:\n{best_slide_content}\n\nWhat are loops for?\n\nLoops are programming constructs that solve the problem of repetition. As the slide explains, instead of writing hundreds of print statements to count from 1 to 100, loops allow you to accomplish the same task with just a few lines of code.\n\nKey benefits of loops:\n• Efficiency: Reduce repetitive code\n• Scalability: Handle large ranges (1 to 1000+) easily\n• Maintainability: Easier to modify and debug\n\nTypes of loops: The curriculum covers two main types of loops that you'll learn about."
	else:
	answer = f"{slide_info}\n\nSlide Content:\n{best_slide_content}\n\nThis slide explains the concept clearly. The content shows how programming constructs help solve real problems efficiently."

	except Exception as e:
	print(f"Error generating focused answer: {e}")
	# Generate a proper answer using the slide content
	slide_info = f"📄 Slide Reference: {best_result.metadata['filename']} - Page {best_result.metadata['page_number']}"

	if "loops" in query.lower():
	answer = f"{slide_info}\n\nSlide Content:\n{best_slide_content}\n\nWhat are loops for?\n\nLoops are programming constructs that solve the problem of repetition. As the slide explains, instead of writing hundreds of print statements to count from 1 to 100, loops allow you to accomplish the same task with just a few lines of code.\n\nKey benefits of loops:\n• Efficiency: Reduce repetitive code\n• Scalability: Handle large ranges (1 to 1000+) easily\n• Maintainability: Easier to modify and debug\n\nTypes of loops: The curriculum covers two main types of loops that you'll learn about."
	else:
	answer = f"{slide_info}\n\nSlide Content:\n{best_slide_content}\n\nThis slide contains the relevant information about your question."

	elif self.qa_chain:
	# Fallback to general LLM if focused chain fails
	try:
	if curriculum_relevance_score > 0:
	context = "\n\n".join([result.page_content for result in results])
	filled_context = f"Curriculum Context:\n{context}\n\nPlease answer based on this curriculum content."
	else:
	filled_context = "Note: This question is not covered in the current curriculum. Please provide a general programming answer."

	answer = self.qa_chain.run(question=query, filled_context=filled_context)

	# Clean up the answer
	answer = answer.strip()
	if "<\|eot_id\|>" in answer:
	answer = answer.split("<\|eot_id\|>")[-1].strip()
	if answer.startswith("Answer:"):
	answer = answer[7:].strip()
	if answer.startswith("Provide a clear, educational answer explaining the concept:"):
	answer = answer[58:].strip()

	# Check if the answer is too short
	if len(answer.strip()) < 50:
	if curriculum_relevance_score > 0:
	slide_info = f"📄 Slide Reference: {best_result.metadata['filename']} - Page {best_result.metadata['page_number']}"
	answer = f"{slide_info}\n\nSlide Content:\n{best_slide_content}\n\nThis slide explains the concept clearly."
	else:
	answer = "I'm sorry, I couldn't generate a proper answer. Please try rephrasing your question."

	# Add warning if not in curriculum
	if curriculum_relevance_score == 0:
	answer = "⚠️ Note: This topic is not covered in the current curriculum.\n\n" + answer

	except Exception as e:
	print(f"Error generating answer: {e}")
	if curriculum_relevance_score > 0:
	slide_info = f"📄 Slide Reference: {best_result.metadata['filename']} - Page {best_result.metadata['page_number']}"
	answer = f"{slide_info}\n\nSlide Content:\n{best_slide_content}\n\nThis slide contains the relevant information about your question."
	else:
	answer = "I'm sorry, I couldn't generate an answer at the moment. Please try rephrasing your question."
	else:
	# If no LLM available
	if curriculum_relevance_score > 0:
	slide_info = f"📄 Slide Reference: {best_result.metadata['filename']} - Page {best_result.metadata['page_number']}"
	answer = f"{slide_info}\n\nSlide Content:\n{best_slide_content}\n\nNote: AI generation is not available, but here's the relevant curriculum content."
	else:
	answer = "I couldn't find relevant content in the curriculum for this question. Please try rephrasing or ask about a different programming topic."

	# Get the most relevant slide and its neighboring pages
	relevant_slides = []
	if curriculum_relevance_score > 0:
	# Get multiple relevant results to find the best one
	best_result = results[0]
	filename = best_result.metadata["filename"]
	page_number = best_result.metadata["page_number"]

	# Get the specific PDF and its pages
	if filename in self.pdf_files:
	pdf_path = self.pdf_files[filename]
	doc = fitz.open(pdf_path)
	total_pages = len(doc)
	doc.close()

	# Find the best content page by analyzing all results
	target_page = page_number
	best_content_score = 0

	# Check all search results for the best content page
	for result in results:
	if result.metadata["filename"] == filename:
	page_num = result.metadata["page_number"]
	page_text = self.pdf_pages[filename].get(page_num, "")
	text_length = len(page_text.strip())

	# Score based on text length and relevance
	content_score = text_length
	if text_length > 100: # Prefer content pages over title slides
	content_score += 500

	if content_score > best_content_score:
	best_content_score = content_score
	target_page = page_num

	# If we still have a title slide, look for better content in the same PDF
	page_text = self.pdf_pages[filename].get(target_page, "")
	if len(page_text.strip()) < 150: # Still a title slide
	# Search for pages with the query terms
	query_terms = query.lower().split()
	best_match_score = 0

	for page_num in range(1, total_pages + 1):
	if page_num in self.pdf_pages[filename]:
	text = self.pdf_pages[filename][page_num].lower()
	text_length = len(text.strip())

	# Count how many query terms appear in this page
	match_score = sum(1 for term in query_terms if term in text)

	# Prefer pages with both query terms and good content
	if match_score > 0 and text_length > 200:
	total_score = match_score * 1000 + text_length
	if total_score > best_match_score:
	best_match_score = total_score
	target_page = page_num

	# Get the target page and neighboring pages (2 before, 2 after)
	start_page = max(1, target_page - 2)
	end_page = min(total_pages, target_page + 2)

	for page_num in range(start_page, end_page + 1):
	img = self.get_pdf_page_image(pdf_path, page_num)
	if img:
	if page_num == target_page:
	# Highlight the most relevant page
	label = f"📌 {filename} - Page {page_num} (Most Relevant)"
	else:
	label = f"{filename} - Page {page_num}"
	relevant_slides.append((img, label))

	recommended_slide = relevant_slides[0][0] if relevant_slides else None
	recommended_label = relevant_slides[0][1] if relevant_slides else None
	else:
	# Fallback if filename not found
	recommended_slide = None
	recommended_label = None
	else:
	# If no curriculum content, show a few slides from different PDFs
	relevant_slides = []
	for filename, pages in list(self.pdf_pages.items())[:3]: # Show first 3 PDFs
	for page_num in list(pages.keys())[:2]: # Show first 2 pages of each
	img = self.get_pdf_page_image(self.pdf_files[filename], page_num)
	if img:
	relevant_slides.append((img, f"{filename} - Page {page_num}"))
	recommended_slide = relevant_slides[0][0] if relevant_slides else None
	recommended_label = relevant_slides[0][1] if relevant_slides else None

	return answer, recommended_slide, recommended_label, relevant_slides

	# --- Gradio UI ---
	chatbot = CurriculumChatbot()

	def gradio_chat(query):
	answer, recommended_slide, recommended_label, relevant_slides = chatbot.chat(query)

	# Use the relevant slides (specific PDF with neighboring pages)
	gallery_items = relevant_slides if relevant_slides else []

	return answer, gallery_items

	with gr.Blocks(title="Inclusive World Curriculum Assistant", theme=gr.themes.Soft()) as demo:
	gr.Markdown("# 🤖 Inclusive World Curriculum Assistant\nYour AI programming tutor with curriculum-based answers and slide navigation!")

	with gr.Row():
	# Left Column - Chatbot Interface
	with gr.Column(scale=1):
	gr.Markdown("### 💬 Chatbot")
	gr.Markdown("What questions do you have?")
	question = gr.Textbox(
	label="Question Input",
	placeholder="e.g., What are for loops? How do variables work? Explain functions...",
	lines=3
	)
	submit = gr.Button("🤖 Ask AI", variant="primary", size="lg")
	answer = gr.Markdown(label="LLM Generated Output")

	# Right Column - Slides Display
	with gr.Column(scale=1):
	gr.Markdown("### 📄 Most Similar Slides")
	gallery = gr.Gallery(
	label="Curriculum Slides",
	columns=1,
	rows=3,
	height="600px",
	object_fit="contain",
	show_label=False
	)

	# Event handlers
	submit.click(fn=gradio_chat, inputs=question, outputs=[answer, gallery])
	question.submit(fn=gradio_chat, inputs=question, outputs=[answer, gallery])

	if __name__ == "__main__":
	demo.launch()