Spaces:

IW2025
/

InclusiveWorldChatbot

Sleeping

App Files Files Community

IW2025 committed on Aug 1, 2025

Commit

764f397

verified ·

1 Parent(s): 32ddee3

Upload llm_app.py

Browse files

Files changed (1) hide show

llm_app.py +361 -0

llm_app.py ADDED Viewed

	@@ -0,0 +1,361 @@

+import gradio as gr
+import os
+from pathlib import Path
+import fitz  # PyMuPDF
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores import Chroma
+from langchain.prompts import PromptTemplate
+from langchain.chains import LLMChain
+import requests
+import json
+import base64
+from PIL import Image
+import io
+import re
+from dotenv import load_dotenv
+# Load environment variables from .env file
+load_dotenv()
+# --- LLM-Powered Curriculum Assistant ---
+class LLMCurriculumAssistant:
+    def __init__(self, slides_dir="Slides"):
+        self.pdf_pages = {}  # {filename: {page_num: text}}
+        self.pdf_files = {}  # {filename: path}
+        self.chunks = []
+        self.chunk_metadata = []
+        self.vector_db = None
+        self.embeddings = None
+        self.llm = None
+        self.content_selection_chain = None
+        self.answer_chain = None
+        # Setup
+        self._process_pdfs(slides_dir)
+        self._build_vector_db()
+        self._setup_llm()
+    def _process_pdfs(self, slides_dir):
+        """Process PDFs and extract text"""
+        slides_path = Path(slides_dir)
+        pdf_files = list(slides_path.glob("*.pdf"))
+        for pdf_file in pdf_files:
+            self.pdf_files[pdf_file.name] = str(pdf_file)
+            doc = fitz.open(str(pdf_file))
+            pages = {}
+            for page_num in range(len(doc)):
+                page = doc[page_num]
+                text = page.get_text()
+                if text.strip():
+                    pages[page_num + 1] = text.strip()
+            self.pdf_pages[pdf_file.name] = pages
+            doc.close()
+            # Add each page as a chunk
+            for page_num, text in pages.items():
+                self.chunks.append(text)
+                self.chunk_metadata.append({
+                    "filename": pdf_file.name,
+                    "page_number": page_num
+                })
+        print(f"✅ Processed {len(pdf_files)} PDF files with {len(self.chunks)} total pages")
+    def _build_vector_db(self):
+        """Build vector database for semantic search"""
+        self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+        self.vector_db = Chroma.from_texts(
+            texts=self.chunks,
+            embedding=self.embeddings,
+            metadatas=self.chunk_metadata,
+            persist_directory="./chroma_db"
+        )
+        print("✅ Vector database built successfully")
+    def _setup_llm(self):
+        """Setup DeepSeek LLM"""
+        try:
+            # Initialize DeepSeek client
+            self.deepseek_api_key = os.environ.get("DEEPSEEK_API_KEY")
+            self.deepseek_base_url = "https://api.deepseek.com/v1/chat/completions"
+            # Create content selection prompt
+            content_selection_template = """Hi! I'm helping a student find the best curriculum slide for their question.
+The student asked: "{question}"
+Here are some slides that might be relevant:
+{slide_contents}
+Could you help me pick the slide that best answers their specific question? Look for:
+- Slides that specifically mention what they're asking about
+- Slides with clear explanations and examples
+- Slides that match the exact terms they used (like "for loops" vs just "loops")
+Just respond with the slide number (1, 2, 3, etc.) that you think is most helpful. If none really fit, say "0".
+Thanks! Slide number:"""
+            self.content_selection_prompt = PromptTemplate(
+                input_variables=["question", "slide_contents"],
+                template=content_selection_template
+            )
+            # Create answer generation prompt
+            answer_template = """Hey there! I'm helping a student understand a programming concept. They asked:
+"{question}"
+Here's what the curriculum slide says about it:
+{slide_content}
+Could you help me explain this to them in a friendly, educational way? I'd like you to:
+- Break it down in simple terms
+- Use examples if the slide has them
+- Make it step-by-step and easy to follow
+- Add some helpful context if the slide is brief
+- Use bullet points or lists to make it clear
+- Make sure your answer directly addresses what they asked
+Thanks for your help! Here's what I'd tell the student:"""
+            self.answer_prompt = PromptTemplate(
+                input_variables=["question", "slide_content"],
+                template=answer_template
+            )
+            print("✅ LLM setup successful!")
+        except Exception as e:
+            print(f"❌ Error setting up LLM: {e}")
+            self.deepseek_api_key = None
+            self.content_selection_prompt = None
+            self.answer_prompt = None
+    def get_pdf_page_image(self, pdf_path, page_num):
+        """Get PDF page as image"""
+        try:
+            doc = fitz.open(pdf_path)
+            if page_num <= len(doc):
+                page = doc[page_num - 1]
+                mat = fitz.Matrix(1.5, 1.5)
+                pix = page.get_pixmap(matrix=mat)
+                img_data = pix.tobytes("png")
+                img = Image.open(io.BytesIO(img_data))
+                if img.mode != 'RGB':
+                    img = img.convert('RGB')
+                doc.close()
+                return img
+            doc.close()
+            return None
+        except Exception as e:
+            print(f"Error rendering PDF page: {str(e)}")
+            return None
+    def chat(self, query):
+        """Main chat function with LLM-powered content selection and answer generation"""
+        print(f"\n🔍 Processing query: {query}")
+        # Step 1: Vector search to find relevant content
+        results = self.vector_db.similarity_search(query, k=5)
+        if not results:
+            return "I couldn't find any relevant content in the curriculum for your question.", [], None, None
+        print(f"📚 Found {len(results)} relevant slides from vector search")
+        # Step 2: LLM content selection
+        selected_content = None
+        selected_result = None
+        if self.deepseek_api_key and self.content_selection_prompt:
+            try:
+                # Prepare slide contents for LLM analysis
+                slide_contents = []
+                for i, result in enumerate(results):
+                    filename = result.metadata['filename']
+                    page_num = result.metadata['page_number']
+                    content = result.page_content[:800]
+                    slide_contents.append(f"Slide {i+1} ({filename} - Page {page_num}):\n{content}")
+                slide_contents_text = "\n\n".join(slide_contents)
+                print("🤖 Using DeepSeek to select most relevant content...")
+                # Format the prompt
+                prompt = self.content_selection_prompt.format(
+                    question=query,
+                    slide_contents=slide_contents_text
+                )
+                # Get DeepSeek's selection
+                headers = {
+                    "Authorization": f"Bearer {self.deepseek_api_key}",
+                    "Content-Type": "application/json"
+                }
+                data = {
+                    "model": "deepseek-chat",
+                    "messages": [{"role": "user", "content": prompt}],
+                    "max_tokens": 1500,
+                    "temperature": 0.7
+                }
+                response = requests.post(self.deepseek_base_url, headers=headers, json=data)
+                response.raise_for_status()
+                selection_response = response.json()["choices"][0]["message"]["content"]
+                print(f"DeepSeek Selection Response: {selection_response}")
+                # Parse the selection
+                try:
+                    numbers = re.findall(r'\d+', selection_response)
+                    if numbers:
+                        selected_index = int(numbers[0]) - 1
+                        if 0 <= selected_index < len(results):
+                            selected_result = results[selected_index]
+                            selected_content = selected_result.page_content
+                            print(f"✅ LLM selected slide {selected_index + 1}")
+                        else:
+                            print(f"⚠️ LLM selection out of range: {selected_index + 1}")
+                            selected_result = results[0]
+                            selected_content = selected_result.page_content
+                    else:
+                        print("⚠️ No number found in LLM response, using first result")
+                        selected_result = results[0]
+                        selected_content = selected_result.page_content
+                except Exception as e:
+                    print(f"Error parsing LLM selection: {e}")
+                    selected_result = results[0]
+                    selected_content = selected_result.page_content
+            except Exception as e:
+                print(f"Error in LLM content selection: {e}")
+                selected_result = results[0]
+                selected_content = selected_result.page_content
+        else:
+            # Fallback to first result
+            selected_result = results[0]
+            selected_content = selected_result.page_content
+        # Step 3: LLM answer generation
+        answer = ""
+        if self.deepseek_api_key and self.answer_prompt and selected_content:
+            try:
+                print("🤖 Generating DeepSeek answer...")
+                # Format the prompt
+                prompt = self.answer_prompt.format(
+                    question=query,
+                    slide_content=selected_content
+                )
+                # Get DeepSeek's answer
+                headers = {
+                    "Authorization": f"Bearer {self.deepseek_api_key}",
+                    "Content-Type": "application/json"
+                }
+                data = {
+                    "model": "deepseek-chat",
+                    "messages": [{"role": "user", "content": prompt}],
+                    "max_tokens": 1500,
+                    "temperature": 0.7
+                }
+                response = requests.post(self.deepseek_base_url, headers=headers, json=data)
+                response.raise_for_status()
+                answer = response.json()["choices"][0]["message"]["content"].strip()
+                print(f"✅ DeepSeek answer generated: {answer[:100]}...")
+            except Exception as e:
+                print(f"Error generating DeepSeek answer: {e}")
+                answer = f"Based on the curriculum slide:\n\n{selected_content}\n\nThis slide contains relevant information about your question."
+        else:
+            answer = f"Based on the curriculum slide:\n\n{selected_content}\n\nThis slide contains relevant information about your question."
+        # Step 4: Get relevant slides for display
+        relevant_slides = []
+        if selected_result:
+            filename = selected_result.metadata["filename"]
+            page_number = selected_result.metadata["page_number"]
+            if filename in self.pdf_files:
+                pdf_path = self.pdf_files[filename]
+                doc = fitz.open(pdf_path)
+                total_pages = len(doc)
+                doc.close()
+                # Get the selected page and neighboring pages
+                start_page = max(1, page_number - 2)
+                end_page = min(total_pages, page_number + 2)
+                for page_num in range(start_page, end_page + 1):
+                    img = self.get_pdf_page_image(pdf_path, page_num)
+                    if img:
+                        if page_num == page_number:
+                            label = f"📌 {filename} - Page {page_num} (Most Relevant)"
+                        else:
+                            label = f"{filename} - Page {page_num}"
+                        relevant_slides.append((img, label))
+                recommended_slide = relevant_slides[0][0] if relevant_slides else None
+                recommended_label = relevant_slides[0][1] if relevant_slides else None
+            else:
+                recommended_slide = None
+                recommended_label = None
+        else:
+            recommended_slide = None
+            recommended_label = None
+        return answer, relevant_slides, recommended_slide, recommended_label
+# --- Gradio UI ---
+# Single shared assistant, created at import time. Its constructor processes
+# the PDFs in the default "Slides" directory, builds the vector database and
+# sets up the LLM prompts before the UI below is defined.
+assistant = LLMCurriculumAssistant()
+def gradio_chat(query):
+    """Gradio chat interface"""
+    answer, relevant_slides, recommended_slide, recommended_label = assistant.chat(query)
+    return answer, relevant_slides
+with gr.Blocks(title="LLM Curriculum Assistant", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🤖 LLM Curriculum Assistant\nYour AI programming tutor with LLM-powered content selection and answers!")
+    with gr.Row():
+        # Left Column - Chatbot Interface
+        with gr.Column(scale=1):
+            gr.Markdown("### 💬 Chatbot")
+            gr.Markdown("**Ask questions about programming concepts:**")
+            question = gr.Textbox(
+                label="Question Input",
+                placeholder="e.g., What are for loops? How do variables work? Explain functions...",
+                lines=3
+            )
+            submit = gr.Button("🤖 Ask AI", variant="primary", size="lg")
+            answer = gr.Markdown(label="LLM Generated Answer")
+        # Right Column - Slides Display
+        with gr.Column(scale=1):
+            gr.Markdown("### 📄 Most Relevant Slides")
+            gallery = gr.Gallery(
+                label="Curriculum Slides",
+                columns=1,
+                rows=3,
+                height="600px",
+                object_fit="contain",
+                show_label=False
+            )
+    # Event handlers
+    submit.click(fn=gradio_chat, inputs=[question], outputs=[answer, gallery])
+    question.submit(fn=gradio_chat, inputs=[question], outputs=[answer, gallery])
+if __name__ == "__main__":
+    demo.launch()