IW2025 committed on
Commit
fc0df0e
·
verified ·
1 Parent(s): 5eade97

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -34
app.py CHANGED
@@ -66,59 +66,41 @@ class CurriculumChatbot:
66
 
67
  def _setup_llm(self):
68
  try:
69
- # Use Llama 3.1-8B for better question answering
70
- model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
71
  pipe = pipeline(
72
  "text-generation",
73
  model=model_name,
74
- max_new_tokens=300,
75
- temperature=0.3,
76
  do_sample=True,
77
- top_p=0.9,
78
- repetition_penalty=1.1,
79
- device_map="auto" if torch.cuda.is_available() else None
80
  )
81
  self.llm = HuggingFacePipeline(pipeline=pipe)
82
 
83
- # Create QA prompt template
84
- qa_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
85
 
86
- You are a helpful AI programming tutor. You MUST ALWAYS provide a clear, educational answer to every question. Never say you cannot answer or that you don't know.
87
-
88
- If the question is about curriculum content, use the provided context to give a detailed, educational explanation. If the curriculum content doesn't perfectly match the question, adapt your answer to be relevant while using the curriculum information.
89
-
90
- If the question is not covered in the curriculum, provide a comprehensive general programming answer based on your knowledge.
91
-
92
- Always be educational, clear, and helpful.
93
-
94
- <|eot_id|><|start_header_id|>user<|end_header_id|>
95
 
96
  Question: {question}
97
 
98
- {filled_context}
99
-
100
- <|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
101
 
102
  self.qa_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
103
  input_variables=["question", "filled_context"],
104
  template=qa_template
105
  ))
106
 
107
- # Create slide selection prompt template
108
- slide_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
109
-
110
- You are an AI that identifies the most relevant slide page for a given question. Return ONLY the filename and page number in this exact format: "filename.pdf - Page X"
111
-
112
- <|eot_id|><|start_header_id|>user<|end_header_id|>
113
-
114
- Question: {question}
115
 
116
  Available slides:
117
  {available_slides}
118
 
119
- Which slide is most relevant? Return only: "filename.pdf - Page X"
120
 
121
- <|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
122
 
123
  self.slide_selection_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
124
  input_variables=["question", "available_slides"],
@@ -199,6 +181,10 @@ Which slide is most relevant? Return only: "filename.pdf - Page X"
199
  if "<|eot_id|>" in answer:
200
  answer = answer.split("<|eot_id|>")[-1].strip()
201
 
 
 
 
 
202
  # Add warning if not in curriculum
203
  if curriculum_relevance_score == 0:
204
  answer = "⚠️ **Note: This topic is not covered in the current curriculum.**\n\n" + answer
@@ -232,14 +218,28 @@ Which slide is most relevant? Return only: "filename.pdf - Page X"
232
  total_pages = len(doc)
233
  doc.close()
234
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  # Get the target page and neighboring pages (2 before, 2 after)
236
- start_page = max(1, page_number - 2)
237
- end_page = min(total_pages, page_number + 2)
238
 
239
  for page_num in range(start_page, end_page + 1):
240
  img = self.get_pdf_page_image(pdf_path, page_num)
241
  if img:
242
- if page_num == page_number:
243
  # Highlight the most relevant page
244
  label = f"📌 {filename} - Page {page_num} (Most Relevant)"
245
  else:
 
66
 
67
  def _setup_llm(self):
68
  try:
69
+ # Use a smaller, faster model for Hugging Face Spaces
70
+ model_name = "microsoft/DialoGPT-medium" # Smaller model for faster inference
71
  pipe = pipeline(
72
  "text-generation",
73
  model=model_name,
74
+ max_new_tokens=200,
75
+ temperature=0.7,
76
  do_sample=True,
77
+ pad_token_id=50256
 
 
78
  )
79
  self.llm = HuggingFacePipeline(pipeline=pipe)
80
 
81
+ # Create QA prompt template for DialoGPT
82
+ qa_template = """Based on the following curriculum content, please answer this question clearly and educationally:
83
 
84
+ {filled_context}
 
 
 
 
 
 
 
 
85
 
86
  Question: {question}
87
 
88
+ Answer:"""
 
 
89
 
90
  self.qa_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
91
  input_variables=["question", "filled_context"],
92
  template=qa_template
93
  ))
94
 
95
+ # Create slide selection prompt template for DialoGPT
96
+ slide_template = """Given this question: {question}
 
 
 
 
 
 
97
 
98
  Available slides:
99
  {available_slides}
100
 
101
+ Which slide is most relevant? Return only the filename and page number like this: "filename.pdf - Page X"
102
 
103
+ Answer:"""
104
 
105
  self.slide_selection_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
106
  input_variables=["question", "available_slides"],
 
181
  if "<|eot_id|>" in answer:
182
  answer = answer.split("<|eot_id|>")[-1].strip()
183
 
184
+ # Remove any prompt artifacts
185
+ if answer.startswith("Answer:"):
186
+ answer = answer[7:].strip()
187
+
188
  # Add warning if not in curriculum
189
  if curriculum_relevance_score == 0:
190
  answer = "⚠️ **Note: This topic is not covered in the current curriculum.**\n\n" + answer
 
218
  total_pages = len(doc)
219
  doc.close()
220
 
221
+ # Try to find a better page if the current one is a title slide
222
+ target_page = page_number
223
+ page_text = self.pdf_pages[filename].get(page_number, "")
224
+
225
+ # If current page has very little text (likely a title slide), look for content pages
226
+ if len(page_text.strip()) < 100: # Title slides usually have little text
227
+ # Look for pages with more content in the same PDF
228
+ for page_num in range(1, total_pages + 1):
229
+ if page_num in self.pdf_pages[filename]:
230
+ text = self.pdf_pages[filename][page_num]
231
+ if len(text.strip()) > 200: # Look for content-rich pages
232
+ target_page = page_num
233
+ break
234
+
235
  # Get the target page and neighboring pages (2 before, 2 after)
236
+ start_page = max(1, target_page - 2)
237
+ end_page = min(total_pages, target_page + 2)
238
 
239
  for page_num in range(start_page, end_page + 1):
240
  img = self.get_pdf_page_image(pdf_path, page_num)
241
  if img:
242
+ if page_num == target_page:
243
  # Highlight the most relevant page
244
  label = f"📌 {filename} - Page {page_num} (Most Relevant)"
245
  else: