Spaces:

IW2025
/

InclusiveWorldChatbot

Sleeping

App Files Community

IW2025 committed on Jul 27, 2025

Commit

fc0df0e

verified ·

1 Parent(s): 5eade97

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -34

app.py CHANGED Viewed

@@ -66,59 +66,41 @@ class CurriculumChatbot:
     def _setup_llm(self):
         try:
-            # Use Llama 3.1-8B for better question answering
-            model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
             pipe = pipeline(
                 "text-generation",
                 model=model_name,
-                max_new_tokens=300,
-                temperature=0.3,
                 do_sample=True,
-                top_p=0.9,
-                repetition_penalty=1.1,
-                device_map="auto" if torch.cuda.is_available() else None
             )
             self.llm = HuggingFacePipeline(pipeline=pipe)
-            # Create QA prompt template
-            qa_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
-You are a helpful AI programming tutor. You MUST ALWAYS provide a clear, educational answer to every question. Never say you cannot answer or that you don't know.
-If the question is about curriculum content, use the provided context to give a detailed, educational explanation. If the curriculum content doesn't perfectly match the question, adapt your answer to be relevant while using the curriculum information.
-If the question is not covered in the curriculum, provide a comprehensive general programming answer based on your knowledge.
-Always be educational, clear, and helpful.
-<|eot_id|><|start_header_id|>user<|end_header_id|>
 Question: {question}
-{filled_context}
-<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
             self.qa_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
                 input_variables=["question", "filled_context"],
                 template=qa_template
             ))
-            # Create slide selection prompt template
-            slide_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
-You are an AI that identifies the most relevant slide page for a given question. Return ONLY the filename and page number in this exact format: "filename.pdf - Page X"
-<|eot_id|><|start_header_id|>user<|end_header_id|>
-Question: {question}
 Available slides:
 {available_slides}
-Which slide is most relevant? Return only: "filename.pdf - Page X"
-<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
             self.slide_selection_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
                 input_variables=["question", "available_slides"],
@@ -199,6 +181,10 @@ Which slide is most relevant? Return only: "filename.pdf - Page X"
                 if "<|eot_id|>" in answer:
                     answer = answer.split("<|eot_id|>")[-1].strip()
                 # Add warning if not in curriculum
                 if curriculum_relevance_score == 0:
                     answer = "⚠️ **Note: This topic is not covered in the current curriculum.**\n\n" + answer
@@ -232,14 +218,28 @@ Which slide is most relevant? Return only: "filename.pdf - Page X"
                 total_pages = len(doc)
                 doc.close()
                 # Get the target page and neighboring pages (2 before, 2 after)
-                start_page = max(1, page_number - 2)
-                end_page = min(total_pages, page_number + 2)
                 for page_num in range(start_page, end_page + 1):
                     img = self.get_pdf_page_image(pdf_path, page_num)
                     if img:
-                        if page_num == page_number:
                             # Highlight the most relevant page
                             label = f"📌 {filename} - Page {page_num} (Most Relevant)"
                         else:

     def _setup_llm(self):
         try:
+            # Use a smaller, faster model for Hugging Face Spaces
+            model_name = "microsoft/DialoGPT-medium"  # Smaller model for faster inference
             pipe = pipeline(
                 "text-generation",
                 model=model_name,
+                max_new_tokens=200,
+                temperature=0.7,
                 do_sample=True,
+                pad_token_id=50256
             )
             self.llm = HuggingFacePipeline(pipeline=pipe)
+            # Create QA prompt template for DialoGPT
+            qa_template = """Based on the following curriculum content, please answer this question clearly and educationally:
+{filled_context}
 Question: {question}
+Answer:"""
             self.qa_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
                 input_variables=["question", "filled_context"],
                 template=qa_template
             ))
+            # Create slide selection prompt template for DialoGPT
+            slide_template = """Given this question: {question}
 Available slides:
 {available_slides}
+Which slide is most relevant? Return only the filename and page number like this: "filename.pdf - Page X"
+Answer:"""
             self.slide_selection_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
                 input_variables=["question", "available_slides"],
                 if "<|eot_id|>" in answer:
                     answer = answer.split("<|eot_id|>")[-1].strip()
+                # Remove any prompt artifacts
+                if answer.startswith("Answer:"):
+                    answer = answer[7:].strip()
                 # Add warning if not in curriculum
                 if curriculum_relevance_score == 0:
                     answer = "⚠️ **Note: This topic is not covered in the current curriculum.**\n\n" + answer
                 total_pages = len(doc)
                 doc.close()
+                # Try to find a better page if the current one is a title slide
+                target_page = page_number
+                page_text = self.pdf_pages[filename].get(page_number, "")
+                # If current page has very little text (likely a title slide), look for content pages
+                if len(page_text.strip()) < 100:  # Title slides usually have little text
+                    # Look for pages with more content in the same PDF
+                    for page_num in range(1, total_pages + 1):
+                        if page_num in self.pdf_pages[filename]:
+                            text = self.pdf_pages[filename][page_num]
+                            if len(text.strip()) > 200:  # Look for content-rich pages
+                                target_page = page_num
+                                break
                 # Get the target page and neighboring pages (2 before, 2 after)
+                start_page = max(1, target_page - 2)
+                end_page = min(total_pages, target_page + 2)
                 for page_num in range(start_page, end_page + 1):
                     img = self.get_pdf_page_image(pdf_path, page_num)
                     if img:
+                        if page_num == target_page:
                             # Highlight the most relevant page
                             label = f"📌 {filename} - Page {page_num} (Most Relevant)"
                         else: