Spaces:

IW2025
/

InclusiveWorldChatbot

Sleeping

App Files Files Community

IW2025 committed on Jul 29, 2025

Commit

eea9911

verified ·

1 Parent(s): ce130ce

Upload app.py

Browse files

Files changed (1) hide show

app.py +109 -69

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ from langchain_community.vectorstores import Chroma
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_huggingface import HuggingFacePipeline
 from langchain.prompts import PromptTemplate
 from transformers import pipeline
 import torch
 import base64
@@ -72,6 +73,12 @@ class CurriculumChatbot:
     def _setup_llm(self):
         """Setup LLM with HuggingFace pipeline"""
         try:
             # Load the model
             pipe = pipeline(
                 "text-generation",
@@ -87,6 +94,23 @@ class CurriculumChatbot:
             self.llm = HuggingFacePipeline(pipeline=pipe)
             # Create QA prompt template for DialoGPT
             qa_template = """You are a helpful programming tutor. Answer the following question based on the curriculum content provided.
@@ -95,7 +119,7 @@ Curriculum Content:
 Question: {question}
-Provide a clear, educational answer explaining the concept:"""
             self.qa_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
                 input_variables=["question", "filled_context"],
@@ -110,16 +134,24 @@ Slide Content:
 Question: {question}
-Provide a clear, educational answer based on this slide:"""
             self.focused_qa_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
                 input_variables=["question", "slide_content"],
                 template=focused_qa_template
             ))
-            print("✅ Llama 3.1-8B loaded successfully!")
         except Exception as e:
-            print(f"Warning: Could not load Llama 3.1-8B: {e}")
     def get_pdf_page_image(self, pdf_path, page_num):
         try:
@@ -159,42 +191,90 @@ Provide a clear, educational answer based on this slide:"""
         return "\n".join(slides_text)
     def chat(self, query):
-        """Comprehensive chat function with LLM answers and slide navigation"""
-        # First, try to find relevant curriculum content
-        results = self.vector_db.similarity_search(query, k=5)  # Get more results for better selection
-        # Check if query is curriculum-related
         curriculum_relevance_score = 0
         if results:
-            # Calculate relevance score based on similarity
-            curriculum_relevance_score = len([r for r in results if r.page_content.strip()])
             # Debug: Print what we found
             print(f"Query: {query}")
-            print(f"Found {len(results)} relevant results:")
-            for i, result in enumerate(results[:3]):
                 print(f"  {i+1}. {result.metadata['filename']} - Page {result.metadata['page_number']}")
                 print(f"     Content: {result.page_content[:100]}...")
-        # Find the most relevant slide content first
-        best_slide_content = ""
-        if curriculum_relevance_score > 0:
-            # Get the most relevant result
-            best_result = results[0]
-            best_slide_content = best_result.page_content
-            # If the best slide has little content, try to find a better one
-            if len(best_slide_content.strip()) < 100:
-                for result in results[1:]:
-                    if len(result.page_content.strip()) > len(best_slide_content.strip()):
-                        best_slide_content = result.page_content
-                        best_result = result
         # Generate focused LLM answer using the most relevant slide
         if self.focused_qa_chain and curriculum_relevance_score > 0:
             try:
                 answer = self.focused_qa_chain.run(question=query, slide_content=best_slide_content)
                 # Clean up the answer
                 answer = answer.strip()
                 if "<|eot_id|>" in answer:
@@ -209,7 +289,7 @@ Provide a clear, educational answer based on this slide:"""
                 # Check if the answer is too short or just repeats the question
                 if len(answer.strip()) < 50 or answer.lower().startswith("how does that work"):
                     # Generate a better answer using the slide content
-                    answer = f"Based on the curriculum slide:\n\n{best_slide_content}\n\nThis slide explains the concept clearly. Let me provide additional context: Loops are programming constructs that allow you to repeat code multiple times efficiently."
             except Exception as e:
                 print(f"Error generating focused answer: {e}")
@@ -265,9 +345,8 @@ Provide a clear, educational answer based on this slide:"""
         # Get the most relevant slide and its neighboring pages
         relevant_slides = []
-        if curriculum_relevance_score > 0:
-            # Get multiple relevant results to find the best one
-            best_result = results[0]
             filename = best_result.metadata["filename"]
             page_number = best_result.metadata["page_number"]
@@ -278,47 +357,8 @@ Provide a clear, educational answer based on this slide:"""
                 total_pages = len(doc)
                 doc.close()
-                # Find the best content page by analyzing all results
                 target_page = page_number
-                best_content_score = 0
-                # Check all search results for the best content page
-                for result in results:
-                    if result.metadata["filename"] == filename:
-                        page_num = result.metadata["page_number"]
-                        page_text = self.pdf_pages[filename].get(page_num, "")
-                        text_length = len(page_text.strip())
-                        # Score based on text length and relevance
-                        content_score = text_length
-                        if text_length > 100:  # Prefer content pages over title slides
-                            content_score += 500
-                        if content_score > best_content_score:
-                            best_content_score = content_score
-                            target_page = page_num
-                # If we still have a title slide, look for better content in the same PDF
-                page_text = self.pdf_pages[filename].get(target_page, "")
-                if len(page_text.strip()) < 150:  # Still a title slide
-                    # Search for pages with the query terms
-                    query_terms = query.lower().split()
-                    best_match_score = 0
-                    for page_num in range(1, total_pages + 1):
-                        if page_num in self.pdf_pages[filename]:
-                            text = self.pdf_pages[filename][page_num].lower()
-                            text_length = len(text.strip())
-                            # Count how many query terms appear in this page
-                            match_score = sum(1 for term in query_terms if term in text)
-                            # Prefer pages with both query terms and good content
-                            if match_score > 0 and text_length > 200:
-                                total_score = match_score * 1000 + text_length
-                                if total_score > best_match_score:
-                                    best_match_score = total_score
-                                    target_page = page_num
                 # Get the target page and neighboring pages (2 before, 2 after)
                 start_page = max(1, target_page - 2)

 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_huggingface import HuggingFacePipeline
 from langchain.prompts import PromptTemplate
+from langchain.chains import LLMChain
 from transformers import pipeline
 import torch
 import base64
     def _setup_llm(self):
         """Setup LLM with HuggingFace pipeline"""
         try:
+            # Initialize LLM attributes
+            self.llm = None
+            self.qa_chain = None
+            self.focused_qa_chain = None
+            self.content_selection_chain = None
             # Load the model
             pipe = pipeline(
                 "text-generation",
             self.llm = HuggingFacePipeline(pipeline=pipe)
+            # Create content selection prompt template
+            content_selection_template = """You are an expert at analyzing curriculum content. Given a user's question and multiple slide contents, determine which slide is most relevant.
+User Question: {question}
+Available Slide Contents:
+{slide_contents}
+Analyze each slide and respond with ONLY the number (1, 2, 3, etc.) of the most relevant slide for the user's question. If no slide is relevant, respond with "0".
+Most relevant slide number:"""
+            self.content_selection_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
+                input_variables=["question", "slide_contents"],
+                template=content_selection_template
+            ))
             # Create QA prompt template for DialoGPT
             qa_template = """You are a helpful programming tutor. Answer the following question based on the curriculum content provided.
 Question: {question}
+Provide a clear, educational answer explaining the concept. Be specific and detailed in your explanation:"""
             self.qa_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
                 input_variables=["question", "filled_context"],
 Question: {question}
+Provide a clear, educational answer based on this slide. Be specific and detailed, focusing on the exact concept or topic the user is asking about:"""
             self.focused_qa_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
                 input_variables=["question", "slide_content"],
                 template=focused_qa_template
             ))
+            print("✅ LLM loaded successfully!")
+            print(f"🔍 LLM object: {self.llm}")
+            print(f"🔍 Content selection chain: {self.content_selection_chain}")
+            print(f"🔍 Focused QA chain: {self.focused_qa_chain}")
         except Exception as e:
+            print(f"Warning: Could not load LLM: {e}")
+            print("Falling back to basic search mode...")
+            self.llm = None
+            self.qa_chain = None
+            self.focused_qa_chain = None
+            self.content_selection_chain = None
     def get_pdf_page_image(self, pdf_path, page_num):
         try:
         return "\n".join(slides_text)
     def chat(self, query):
+        """Comprehensive chat function with LLM-powered content selection and answers"""
+        # First, try to find relevant curriculum content using vector search
+        results = self.vector_db.similarity_search(query, k=5)  # Get top 5 results for LLM analysis
         curriculum_relevance_score = 0
+        best_slide_content = ""
+        best_result = None
         if results:
+            curriculum_relevance_score = len(results)
             # Debug: Print what we found
             print(f"Query: {query}")
+            print(f"Found {len(results)} relevant results for LLM analysis:")
+            for i, result in enumerate(results):
                 print(f"  {i+1}. {result.metadata['filename']} - Page {result.metadata['page_number']}")
                 print(f"     Content: {result.page_content[:100]}...")
+            # Use LLM to select the most relevant content
+            if self.content_selection_chain and curriculum_relevance_score > 0:
+                try:
+                    # Prepare slide contents for LLM analysis
+                    slide_contents = []
+                    for i, result in enumerate(results):
+                        slide_contents.append(f"Slide {i+1}: {result.page_content[:500]}...")
+                    slide_contents_text = "\n\n".join(slide_contents)
+                    print(f"🔍 Using LLM to select most relevant content...")
+                    # Get LLM's selection
+                    selection_response = self.content_selection_chain.run(
+                        question=query,
+                        slide_contents=slide_contents_text
+                    )
+                    print(f"LLM Selection Response: {selection_response}")
+                    # Parse the selection (expecting a number)
+                    try:
+                        # Extract number from response
+                        import re
+                        numbers = re.findall(r'\d+', selection_response)
+                        if numbers:
+                            selected_index = int(numbers[0]) - 1  # Convert to 0-based index
+                            if 0 <= selected_index < len(results):
+                                best_result = results[selected_index]
+                                best_slide_content = best_result.page_content
+                                print(f"✅ LLM selected slide {selected_index + 1}")
+                            else:
+                                print(f"⚠️ LLM selection out of range: {selected_index + 1}")
+                                # Fallback to first result
+                                best_result = results[0]
+                                best_slide_content = best_result.page_content
+                        else:
+                            print("⚠️ No number found in LLM response, using first result")
+                            best_result = results[0]
+                            best_slide_content = best_result.page_content
+                    except Exception as e:
+                        print(f"Error parsing LLM selection: {e}")
+                        # Fallback to first result
+                        best_result = results[0]
+                        best_slide_content = best_result.page_content
+                except Exception as e:
+                    print(f"Error in LLM content selection: {e}")
+                    # Fallback to simple selection
+                    best_result = results[0]
+                    best_slide_content = best_result.page_content
+            else:
+                # Fallback to simple selection if no LLM
+                best_result = results[0]
+                best_slide_content = best_result.page_content
         # Generate focused LLM answer using the most relevant slide
         if self.focused_qa_chain and curriculum_relevance_score > 0:
             try:
+                print(f"🔍 Calling LLM with question: {query}")
+                print(f"🔍 LLM available: {self.focused_qa_chain is not None}")
                 answer = self.focused_qa_chain.run(question=query, slide_content=best_slide_content)
+                print(f"LLM Response: {answer[:200]}...")
                 # Clean up the answer
                 answer = answer.strip()
                 if "<|eot_id|>" in answer:
                 # Check if the answer is too short or just repeats the question
                 if len(answer.strip()) < 50 or answer.lower().startswith("how does that work"):
                     # Generate a better answer using the slide content
+                    answer = f"Based on the curriculum slide:\n\n{best_slide_content}\n\nThis slide explains the concept clearly. The curriculum content provides the foundation for understanding this programming concept."
             except Exception as e:
                 print(f"Error generating focused answer: {e}")
         # Get the most relevant slide and its neighboring pages
         relevant_slides = []
+        if curriculum_relevance_score > 0 and best_result:
+            # Use the LLM-selected result
             filename = best_result.metadata["filename"]
             page_number = best_result.metadata["page_number"]
                 total_pages = len(doc)
                 doc.close()
+                # Use the LLM-selected page as the target
                 target_page = page_number
                 # Get the target page and neighboring pages (2 before, 2 after)
                 start_page = max(1, target_page - 2)