Spaces:

IW2025
/

InclusiveWorldChatbot

Sleeping

App Files Community

IW2025 committed on Jul 29, 2025

Commit

ce130ce

verified ·

1 Parent(s): fc75dd1

Upload app.py

Browse files

Files changed (1) hide show

app.py +198 -108

app.py CHANGED Viewed

@@ -70,88 +70,56 @@ class CurriculumChatbot:
         )
     def _setup_llm(self):
-        # Initialize all LLM-related attributes to None first
-        self.llm = None
-        self.qa_chain = None
-        self.slide_selection_chain = None
-        self.focused_qa_chain = None
         try:
-            # Use a smaller, faster model for better performance
-            # Llama 3.1 8B is quite large and slow - let's use a smaller model
-            model_name = "microsoft/DialoGPT-medium"  # Much faster, smaller model
-            # Get token from secrets
-            import os
-            token = os.environ.get("IW_Token")
-            if not token:
-                raise ValueError("IW_Token not found in environment variables")
             pipe = pipeline(
                 "text-generation",
-                model=model_name,
-                max_new_tokens=100,  # Reduced for faster responses
-                temperature=0.3,
                 do_sample=True,
                 top_p=0.9,
-                repetition_penalty=1.1,
-                device_map="auto" if torch.cuda.is_available() else None,
-                token=token,
-                # Performance optimizations
-                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
             )
             self.llm = HuggingFacePipeline(pipeline=pipe)
-            # Warm and engaging prompt templates
-            qa_template = """Answer this question: {question}
-Using this information: {filled_context}
-Provide a helpful, friendly answer."""
-            self.qa_prompt = PromptTemplate(
                 input_variables=["question", "filled_context"],
                 template=qa_template
-            )
-            self.qa_chain = self.qa_prompt | self.llm
-            # Enhanced slide selection template
-            slide_selection_template = """As a helpful programming tutor, a student has asked: {question}
-Here are the available curriculum slides that might help answer their question:
-{slide_contents}
-Please select the most relevant slide (filename.pdf - Page X) that would best help explain this concept to the student. Choose the slide that has the most detailed and relevant content for their question."""
-            self.slide_selection_prompt = PromptTemplate(
-                input_variables=["question", "slide_contents"],
-                template=slide_selection_template
-            )
-            self.slide_selection_chain = self.slide_selection_prompt | self.llm
-            # Warm and detailed focused QA template
-            focused_qa_template = """Answer this question: {question}
-Using this information: {slide_content}
-Provide a helpful, friendly answer."""
-            self.focused_qa_prompt = PromptTemplate(
                 input_variables=["question", "slide_content"],
                 template=focused_qa_template
-            )
-            self.focused_qa_chain = self.focused_qa_prompt | self.llm
-            print("✅ Optimized model loaded successfully!")
-            print(f"🔍 LLM object: {self.llm}")
-            print(f"🔍 Focused QA chain: {self.focused_qa_chain}")
         except Exception as e:
-            print(f"Warning: Could not load optimized model: {e}")
-            print("Falling back to basic search mode...")
-            self.llm = None
-            self.qa_chain = None
-            self.slide_selection_chain = None
-            self.focused_qa_chain = None
     def get_pdf_page_image(self, pdf_path, page_num):
         try:
@@ -191,72 +159,194 @@ Provide a helpful, friendly answer."""
         return "\n".join(slides_text)
     def chat(self, query):
-        """Simplified chat function with vector search, LLM analysis, and slide display"""
-        # 1. Vector Search - Find relevant slides
-        results = self.vector_db.similarity_search(query, k=3)
-        if not results:
-            return "I couldn't find relevant content in the curriculum for this question.", None, None, []
-        # Debug: Show what we found
-        print(f"Query: {query}")
-        print(f"Found {len(results)} relevant slides:")
-        for i, result in enumerate(results):
-            print(f"  {i+1}. {result.metadata['filename']} - Page {result.metadata['page_number']}")
-        # 2. LLM Check - Analyze slides and generate answer
-        best_result = results[0]
-        best_slide_content = best_result.page_content
-        if self.focused_qa_chain and not self.fast_mode:
             try:
-                print(f"🔍 Calling LLM with question: {query}")
-                answer = self.focused_qa_chain.invoke({
-                    "question": query,
-                    "slide_content": best_slide_content
-                })
-                print(f"LLM Response: {answer[:100]}...")
-                # Clean up the answer
                 answer = answer.strip()
                 if "<|eot_id|>" in answer:
                     answer = answer.split("<|eot_id|>")[-1].strip()
             except Exception as e:
                 print(f"Error generating answer: {e}")
-                answer = f"📄 **Slide Reference:** {best_result.metadata['filename']} - Page {best_result.metadata['page_number']}\n\n**Slide Content:**\n{best_slide_content}"
         else:
-            # Fallback to slide content
-            answer = f"📄 **Slide Reference:** {best_result.metadata['filename']} - Page {best_result.metadata['page_number']}\n\n**Slide Content:**\n{best_slide_content}"
-        # 3. Slide Output - Get relevant slides
         relevant_slides = []
-        filename = best_result.metadata["filename"]
-        page_number = best_result.metadata["page_number"]
-        if filename in self.pdf_files:
-            pdf_path = self.pdf_files[filename]
-            doc = fitz.open(pdf_path)
-            total_pages = len(doc)
-            doc.close()
-            # Get the target page and neighboring pages (2 before, 2 after)
-            start_page = max(1, page_number - 2)
-            end_page = min(total_pages, page_number + 2)
-            for page_num in range(start_page, end_page + 1):
-                img = self.get_pdf_page_image(pdf_path, page_num)
-                if img:
-                    if page_num == page_number:
-                        label = f"📌 {filename} - Page {page_num} (Most Relevant)"
-                    else:
-                        label = f"{filename} - Page {page_num}"
-                    relevant_slides.append((img, label))
-        return answer, relevant_slides[0][0] if relevant_slides else None, relevant_slides[0][1] if relevant_slides else None, relevant_slides
 # --- Gradio UI ---
 chatbot = CurriculumChatbot(fast_mode=False)  # Enable AI mode by default

         )
     def _setup_llm(self):
+        """Setup LLM with HuggingFace pipeline"""
         try:
+            # Load the model
             pipe = pipeline(
                 "text-generation",
+                model="microsoft/DialoGPT-medium",
+                torch_dtype=torch.float16,
+                device_map="auto",
+                max_length=512,
                 do_sample=True,
+                temperature=0.7,
                 top_p=0.9,
+                repetition_penalty=1.1
             )
             self.llm = HuggingFacePipeline(pipeline=pipe)
+            # Create QA prompt template for DialoGPT
+            qa_template = """You are a helpful programming tutor. Answer the following question based on the curriculum content provided.
+Curriculum Content:
+{filled_context}
+Question: {question}
+Provide a clear, educational answer explaining the concept:"""
+            self.qa_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
                 input_variables=["question", "filled_context"],
                 template=qa_template
+            ))
+            # Create focused answer prompt template
+            focused_qa_template = """You are a helpful programming tutor. Answer the question based on the specific slide content provided.
+Slide Content:
+{slide_content}
+Question: {question}
+Provide a clear, educational answer based on this slide:"""
+            self.focused_qa_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
                 input_variables=["question", "slide_content"],
                 template=focused_qa_template
+            ))
+            print("✅ Llama 3.1-8B loaded successfully!")
         except Exception as e:
+            print(f"Warning: Could not load Llama 3.1-8B: {e}")
     def get_pdf_page_image(self, pdf_path, page_num):
         try:
         return "\n".join(slides_text)
     def chat(self, query):
+        """Comprehensive chat function with LLM answers and slide navigation"""
+        # First, try to find relevant curriculum content
+        results = self.vector_db.similarity_search(query, k=5)  # Get more results for better selection
+        # Check if query is curriculum-related
+        curriculum_relevance_score = 0
+        if results:
+            # Calculate relevance score based on similarity
+            curriculum_relevance_score = len([r for r in results if r.page_content.strip()])
+            # Debug: Print what we found
+            print(f"Query: {query}")
+            print(f"Found {len(results)} relevant results:")
+            for i, result in enumerate(results[:3]):
+                print(f"  {i+1}. {result.metadata['filename']} - Page {result.metadata['page_number']}")
+                print(f"     Content: {result.page_content[:100]}...")
+        # Find the most relevant slide content first
+        best_slide_content = ""
+        if curriculum_relevance_score > 0:
+            # Get the most relevant result
+            best_result = results[0]
+            best_slide_content = best_result.page_content
+            # If the best slide has little content, try to find a better one
+            if len(best_slide_content.strip()) < 100:
+                for result in results[1:]:
+                    if len(result.page_content.strip()) > len(best_slide_content.strip()):
+                        best_slide_content = result.page_content
+                        best_result = result
+        # Generate focused LLM answer using the most relevant slide
+        if self.focused_qa_chain and curriculum_relevance_score > 0:
             try:
+                answer = self.focused_qa_chain.run(question=query, slide_content=best_slide_content)
+                # Clean up the answer
+                answer = answer.strip()
+                if "<|eot_id|>" in answer:
+                    answer = answer.split("<|eot_id|>")[-1].strip()
+                # Remove any prompt artifacts
+                if answer.startswith("Answer:"):
+                    answer = answer[7:].strip()
+                if answer.startswith("Provide a clear, educational answer based on this slide:"):
+                    answer = answer[58:].strip()
+                # Check if the answer is too short or just repeats the question
+                if len(answer.strip()) < 50 or answer.lower().startswith("how does that work"):
+                    # Generate a better answer using the slide content
+                    answer = f"Based on the curriculum slide:\n\n{best_slide_content}\n\nThis slide explains the concept clearly. Let me provide additional context: Loops are programming constructs that allow you to repeat code multiple times efficiently."
+            except Exception as e:
+                print(f"Error generating focused answer: {e}")
+                # Fallback to slide content with explanation
+                answer = f"Based on the curriculum slide:\n\n{best_slide_content}\n\nThis slide contains the relevant information about your question."
+        elif self.qa_chain:
+            # Fallback to general LLM if focused chain fails
+            try:
+                if curriculum_relevance_score > 0:
+                    context = "\n\n".join([result.page_content for result in results])
+                    filled_context = f"Curriculum Context:\n{context}\n\nPlease answer based on this curriculum content."
+                else:
+                    filled_context = "Note: This question is not covered in the current curriculum. Please provide a general programming answer."
+                answer = self.qa_chain.run(question=query, filled_context=filled_context)
                 answer = answer.strip()
                 if "<|eot_id|>" in answer:
                     answer = answer.split("<|eot_id|>")[-1].strip()
+                if answer.startswith("Answer:"):
+                    answer = answer[7:].strip()
+                if answer.startswith("Provide a clear, educational answer explaining the concept:"):
+                    answer = answer[58:].strip()
+                # Check if the answer is too short
+                if len(answer.strip()) < 50:
+                    if curriculum_relevance_score > 0:
+                        answer = f"Based on the curriculum content:\n\n{best_slide_content}\n\nThis slide explains the concept clearly."
+                    else:
+                        answer = "I'm sorry, I couldn't generate a proper answer. Please try rephrasing your question."
+                # Add warning if not in curriculum
+                if curriculum_relevance_score == 0:
+                    answer = "💡 **Note: This topic isn't covered in your current curriculum, but here's a helpful answer:**\n\n" + answer
             except Exception as e:
                 print(f"Error generating answer: {e}")
+                if curriculum_relevance_score > 0:
+                    answer = f"Based on the curriculum slide:\n\n{best_slide_content}\n\nThis slide contains the relevant information about your question."
+                else:
+                    answer = "I'm sorry, I couldn't generate an answer at the moment. Please try rephrasing your question."
         else:
+            # If no LLM available
+            if curriculum_relevance_score > 0:
+                answer = f"Based on the curriculum slide:\n\n{best_slide_content}\n\n*Note: AI generation is not available, but here's the relevant curriculum content.*"
+            else:
+                answer = "I couldn't find relevant content in the curriculum for this question. Please try rephrasing or ask about a different programming topic."
+        # Get the most relevant slide and its neighboring pages
         relevant_slides = []
+        if curriculum_relevance_score > 0:
+            # Get multiple relevant results to find the best one
+            best_result = results[0]
+            filename = best_result.metadata["filename"]
+            page_number = best_result.metadata["page_number"]
+            # Get the specific PDF and its pages
+            if filename in self.pdf_files:
+                pdf_path = self.pdf_files[filename]
+                doc = fitz.open(pdf_path)
+                total_pages = len(doc)
+                doc.close()
+                # Find the best content page by analyzing all results
+                target_page = page_number
+                best_content_score = 0
+                # Check all search results for the best content page
+                for result in results:
+                    if result.metadata["filename"] == filename:
+                        page_num = result.metadata["page_number"]
+                        page_text = self.pdf_pages[filename].get(page_num, "")
+                        text_length = len(page_text.strip())
+                        # Score based on text length and relevance
+                        content_score = text_length
+                        if text_length > 100:  # Prefer content pages over title slides
+                            content_score += 500
+                        if content_score > best_content_score:
+                            best_content_score = content_score
+                            target_page = page_num
+                # If we still have a title slide, look for better content in the same PDF
+                page_text = self.pdf_pages[filename].get(target_page, "")
+                if len(page_text.strip()) < 150:  # Still a title slide
+                    # Search for pages with the query terms
+                    query_terms = query.lower().split()
+                    best_match_score = 0
+                    for page_num in range(1, total_pages + 1):
+                        if page_num in self.pdf_pages[filename]:
+                            text = self.pdf_pages[filename][page_num].lower()
+                            text_length = len(text.strip())
+                            # Count how many query terms appear in this page
+                            match_score = sum(1 for term in query_terms if term in text)
+                            # Prefer pages with both query terms and good content
+                            if match_score > 0 and text_length > 200:
+                                total_score = match_score * 1000 + text_length
+                                if total_score > best_match_score:
+                                    best_match_score = total_score
+                                    target_page = page_num
+                # Get the target page and neighboring pages (2 before, 2 after)
+                start_page = max(1, target_page - 2)
+                end_page = min(total_pages, target_page + 2)
+                for page_num in range(start_page, end_page + 1):
+                    img = self.get_pdf_page_image(pdf_path, page_num)
+                    if img:
+                        if page_num == target_page:
+                            # Highlight the most relevant page
+                            label = f"📌 {filename} - Page {page_num} (Most Relevant)"
+                        else:
+                            label = f"{filename} - Page {page_num}"
+                        relevant_slides.append((img, label))
+                recommended_slide = relevant_slides[0][0] if relevant_slides else None
+                recommended_label = relevant_slides[0][1] if relevant_slides else None
+            else:
+                # Fallback if filename not found
+                recommended_slide = None
+                recommended_label = None
+        else:
+            # If no curriculum content, provide a helpful response
+            relevant_slides = []
+            recommended_slide = None
+            recommended_label = None
+        return answer, recommended_slide, recommended_label, relevant_slides
 # --- Gradio UI ---
 chatbot = CurriculumChatbot(fast_mode=False)  # Enable AI mode by default