Update app.py
app.py
CHANGED
@@ -79,13 +79,14 @@ class CurriculumChatbot:
             self.llm = HuggingFacePipeline(pipeline=pipe)
 
             # Create QA prompt template for DialoGPT
-            qa_template = """
+            qa_template = """You are a helpful programming tutor. Answer the following question based on the curriculum content provided.
 
+Curriculum Content:
 {filled_context}
 
 Question: {question}
 
-Answer:"""
+Provide a clear, educational answer explaining the concept:"""
 
             self.qa_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
                 input_variables=["question", "filled_context"],
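The hunk above replaces the bare template with a tutor-style prompt. As a minimal, self-contained sketch of what this PromptTemplate renders at run time (LangChain's classic prompt API; the template is condensed and the question/context values are invented for illustration):

    from langchain.prompts import PromptTemplate

    # Same shape as qa_template in the hunk above, condensed for illustration.
    qa_prompt = PromptTemplate(
        input_variables=["question", "filled_context"],
        template=(
            "You are a helpful programming tutor.\n\n"
            "Curriculum Content:\n{filled_context}\n\n"
            "Question: {question}\n\n"
            "Provide a clear, educational answer explaining the concept:"
        ),
    )
    print(qa_prompt.format(
        question="What is a for loop?",
        filled_context="Slide 12: a for loop repeats a block once per item in a sequence.",
    ))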
@@ -107,6 +108,21 @@ Answer:"""
                 template=slide_template
             ))
 
+            # Create focused answer prompt template
+            focused_qa_template = """You are a helpful programming tutor. Answer the question based on the specific slide content provided.
+
+Slide Content:
+{slide_content}
+
+Question: {question}
+
+Provide a clear, educational answer based on this slide:"""
+
+            self.focused_qa_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
+                input_variables=["question", "slide_content"],
+                template=focused_qa_template
+            ))
+
             print("✅ Llama 3.1-8B loaded successfully!")
         except Exception as e:
             print(f"Warning: Could not load Llama 3.1-8B: {e}")
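This hunk adds a second chain that answers from a single slide instead of the full retrieved context. A self-contained sketch of the same wiring, substituting LangChain's FakeListLLM for the HuggingFacePipeline so it runs without loading a model (the slide text and canned response are invented):

    from langchain.chains import LLMChain
    from langchain.llms.fake import FakeListLLM
    from langchain.prompts import PromptTemplate

    # FakeListLLM returns canned responses, standing in for the real pipeline.
    llm = FakeListLLM(responses=["A while loop repeats while its condition stays true."])
    focused_qa_chain = LLMChain(llm=llm, prompt=PromptTemplate(
        input_variables=["question", "slide_content"],
        template="Slide Content:\n{slide_content}\n\nQuestion: {question}\n\nAnswer:",
    ))
    print(focused_qa_chain.run(
        question="What is a while loop?",
        slide_content="Slide 14: a while loop repeats as long as its condition holds.",
    ))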
@@ -155,23 +171,68 @@ Answer:"""
     def chat(self, query):
         """Comprehensive chat function with LLM answers and slide navigation"""
         # First, try to find relevant curriculum content
-        results = self.vector_db.similarity_search(query, k=
+        results = self.vector_db.similarity_search(query, k=5)  # Get more results for better selection
 
         # Check if query is curriculum-related
         curriculum_relevance_score = 0
         if results:
             # Calculate relevance score based on similarity
             curriculum_relevance_score = len([r for r in results if r.page_content.strip()])
+
+        # Debug: Print what we found
+        print(f"Query: {query}")
+        print(f"Found {len(results)} relevant results:")
+        for i, result in enumerate(results[:3]):
+            print(f"  {i+1}. {result.metadata['filename']} - Page {result.metadata['page_number']}")
+            print(f"     Content: {result.page_content[:100]}...")
 
-        #
-        if self.qa_chain:
+        # Find the most relevant slide content first
+        best_slide_content = ""
+        if curriculum_relevance_score > 0:
+            # Get the most relevant result
+            best_result = results[0]
+            best_slide_content = best_result.page_content
+
+            # If the best slide has little content, try to find a better one
+            if len(best_slide_content.strip()) < 100:
+                for result in results[1:]:
+                    if len(result.page_content.strip()) > len(best_slide_content.strip()):
+                        best_slide_content = result.page_content
+                        best_result = result
+
+        # Generate focused LLM answer using the most relevant slide
+        if self.focused_qa_chain and curriculum_relevance_score > 0:
+            try:
+                answer = self.focused_qa_chain.run(question=query, slide_content=best_slide_content)
+
+                # Clean up the answer
+                answer = answer.strip()
+                if "<|eot_id|>" in answer:
+                    answer = answer.split("<|eot_id|>")[-1].strip()
+
+                # Remove any prompt artifacts
+                if answer.startswith("Answer:"):
+                    answer = answer[7:].strip()
+                if answer.startswith("Provide a clear, educational answer based on this slide:"):
+                    answer = answer[len("Provide a clear, educational answer based on this slide:"):].strip()
+
+                # Check if the answer is too short or just repeats the question
+                if len(answer.strip()) < 50 or answer.lower().startswith("how does that work"):
+                    # Generate a better answer using the slide content
+                    answer = f"Based on the curriculum slide:\n\n{best_slide_content}\n\nThis slide explains the concept clearly. Let me provide additional context: Loops are programming constructs that allow you to repeat code multiple times efficiently."
+
+            except Exception as e:
+                print(f"Error generating focused answer: {e}")
+                # Fallback to slide content with explanation
+                answer = f"Based on the curriculum slide:\n\n{best_slide_content}\n\nThis slide contains the relevant information about your question."
+
+        elif self.qa_chain:
+            # Fallback to general LLM if focused chain fails
             try:
                 if curriculum_relevance_score > 0:
-                    # Use curriculum context
                     context = "\n\n".join([result.page_content for result in results])
                     filled_context = f"Curriculum Context:\n{context}\n\nPlease answer based on this curriculum content."
                 else:
-                    # No curriculum context - general programming answer
                     filled_context = "Note: This question is not covered in the current curriculum. Please provide a general programming answer."
 
                 answer = self.qa_chain.run(question=query, filled_context=filled_context)
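One caveat in the hunk above: curriculum_relevance_score only counts non-empty results, so any retrieval hit at all is treated as curriculum-related. A hedged alternative that uses the store's distance scores instead, assuming the vector store exposes similarity_search_with_score (FAISS and Chroma both do; for FAISS a lower distance means a closer match, and the 0.8 cutoff is a placeholder to tune):

    def curriculum_relevance(vector_db, query, k=5, max_distance=0.8):
        # Keep only hits that are close enough and actually carry text.
        scored = vector_db.similarity_search_with_score(query, k=k)
        hits = [doc for doc, distance in scored
                if distance <= max_distance and doc.page_content.strip()]
        return len(hits), hits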
@@ -180,10 +241,17 @@ Answer:"""
                 answer = answer.strip()
                 if "<|eot_id|>" in answer:
                     answer = answer.split("<|eot_id|>")[-1].strip()
-
-                # Remove any prompt artifacts
                 if answer.startswith("Answer:"):
                     answer = answer[7:].strip()
+                if answer.startswith("Provide a clear, educational answer explaining the concept:"):
+                    answer = answer[len("Provide a clear, educational answer explaining the concept:"):].strip()
+
+                # Check if the answer is too short
+                if len(answer.strip()) < 50:
+                    if curriculum_relevance_score > 0:
+                        answer = f"Based on the curriculum content:\n\n{best_slide_content}\n\nThis slide explains the concept clearly."
+                    else:
+                        answer = "I'm sorry, I couldn't generate a proper answer. Please try rephrasing your question."
 
                 # Add warning if not in curriculum
                 if curriculum_relevance_score == 0:
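Fixed-offset slicing such as a hard-coded answer[58:] silently breaks if the template wording changes. A small helper using str.removeprefix (Python 3.9+) keeps the cleanup in sync with the template text; the echo strings mirror the prompts in this commit:

    # Known prompt echoes the model may repeat back before its answer.
    PROMPT_ECHOES = (
        "Answer:",
        "Provide a clear, educational answer explaining the concept:",
        "Provide a clear, educational answer based on this slide:",
    )

    def strip_prompt_echo(answer: str) -> str:
        answer = answer.strip()
        for echo in PROMPT_ECHOES:
            answer = answer.removeprefix(echo).strip()
        return answer

    print(strip_prompt_echo("Answer: A loop repeats a block of code."))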
@@ -191,22 +259,21 @@ Answer:"""
 
             except Exception as e:
                 print(f"Error generating answer: {e}")
-                # Even if LLM fails, try to provide a helpful response
                 if curriculum_relevance_score > 0:
-                    answer = f"Based on the curriculum
+                    answer = f"Based on the curriculum slide:\n\n{best_slide_content}\n\nThis slide contains the relevant information about your question."
                 else:
                     answer = "I'm sorry, I couldn't generate an answer at the moment. Please try rephrasing your question."
         else:
-            # If no LLM available
+            # If no LLM available
             if curriculum_relevance_score > 0:
-                answer = f"Based on the curriculum
+                answer = f"Based on the curriculum slide:\n\n{best_slide_content}\n\n*Note: AI generation is not available, but here's the relevant curriculum content.*"
             else:
                 answer = "I couldn't find relevant content in the curriculum for this question. Please try rephrasing or ask about a different programming topic."
 
         # Get the most relevant slide and its neighboring pages
         relevant_slides = []
         if curriculum_relevance_score > 0:
-            # Get
+            # Get multiple relevant results to find the best one
             best_result = results[0]
             filename = best_result.metadata["filename"]
             page_number = best_result.metadata["page_number"]
@@ -218,19 +285,47 @@ Answer:"""
             total_pages = len(doc)
             doc.close()
 
-            #
+            # Find the best content page by analyzing all results
             target_page = page_number
-
+            best_content_score = 0
+
+            # Check all search results for the best content page
+            for result in results:
+                if result.metadata["filename"] == filename:
+                    page_num = result.metadata["page_number"]
+                    page_text = self.pdf_pages[filename].get(page_num, "")
+                    text_length = len(page_text.strip())
+
+                    # Score based on text length and relevance
+                    content_score = text_length
+                    if text_length > 100:  # Prefer content pages over title slides
+                        content_score += 500
+
+                    if content_score > best_content_score:
+                        best_content_score = content_score
+                        target_page = page_num
 
-            # If
-
-
+            # If we still have a title slide, look for better content in the same PDF
+            page_text = self.pdf_pages[filename].get(target_page, "")
+            if len(page_text.strip()) < 150:  # Still a title slide
+                # Search for pages with the query terms
+                query_terms = query.lower().split()
+                best_match_score = 0
+
                 for page_num in range(1, total_pages + 1):
                     if page_num in self.pdf_pages[filename]:
-                        text = self.pdf_pages[filename][page_num]
-
-
-
+                        text = self.pdf_pages[filename][page_num].lower()
+                        text_length = len(text.strip())
+
+                        # Count how many query terms appear in this page
+                        match_score = sum(1 for term in query_terms if term in text)
+
+                        # Prefer pages with both query terms and good content
+                        if match_score > 0 and text_length > 200:
+                            total_score = match_score * 1000 + text_length
+                            if total_score > best_match_score:
+                                best_match_score = total_score
+                                target_page = page_num
 
             # Get the target page and neighboring pages (2 before, 2 after)
             start_page = max(1, target_page - 2)
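The page-selection logic in the final hunk boils down to a heuristic: prefer pages with enough text to be content slides, then fall back to scanning the whole PDF for pages that mention the query terms. A standalone sketch of the same idea, where pages maps page number to extracted text (the 100/150/200 thresholds and 500/1000 weights are the ones used above):

    def pick_target_page(pages: dict[int, str], candidates: list[int], query: str) -> int:
        # Pass 1: among retrieved candidate pages, prefer text-heavy ones.
        best_page, best_score = candidates[0], 0
        for page_num in candidates:
            text = pages.get(page_num, "").strip()
            score = len(text) + (500 if len(text) > 100 else 0)
            if score > best_score:
                best_page, best_score = page_num, score
        # Pass 2: if the winner still looks like a title slide, scan all pages.
        if len(pages.get(best_page, "").strip()) < 150:
            terms = query.lower().split()
            best_match = 0
            for page_num, text in pages.items():
                lowered = text.lower()
                matches = sum(1 for term in terms if term in lowered)
                if matches > 0 and len(lowered.strip()) > 200:
                    total = matches * 1000 + len(lowered.strip())
                    if total > best_match:
                        best_match, best_page = total, page_num
        return best_page

    # Example: page 1 is a title slide, page 3 actually explains loops.
    pages = {1: "Loops", 2: "Agenda", 3: "A for loop repeats a block of code. " * 10}
    print(pick_target_page(pages, [1], "for loop"))  # -> 3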