Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -66,59 +66,41 @@ class CurriculumChatbot:
|
|
| 66 |
|
| 67 |
def _setup_llm(self):
|
| 68 |
try:
|
| 69 |
-
# Use
|
| 70 |
-
model_name = "
|
| 71 |
pipe = pipeline(
|
| 72 |
"text-generation",
|
| 73 |
model=model_name,
|
| 74 |
-
max_new_tokens=
|
| 75 |
-
temperature=0.
|
| 76 |
do_sample=True,
|
| 77 |
-
|
| 78 |
-
repetition_penalty=1.1,
|
| 79 |
-
device_map="auto" if torch.cuda.is_available() else None
|
| 80 |
)
|
| 81 |
self.llm = HuggingFacePipeline(pipeline=pipe)
|
| 82 |
|
| 83 |
-
# Create QA prompt template
|
| 84 |
-
qa_template = """
|
| 85 |
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
If the question is about curriculum content, use the provided context to give a detailed, educational explanation. If the curriculum content doesn't perfectly match the question, adapt your answer to be relevant while using the curriculum information.
|
| 89 |
-
|
| 90 |
-
If the question is not covered in the curriculum, provide a comprehensive general programming answer based on your knowledge.
|
| 91 |
-
|
| 92 |
-
Always be educational, clear, and helpful.
|
| 93 |
-
|
| 94 |
-
<|eot_id|><|start_header_id|>user<|end_header_id|>
|
| 95 |
|
| 96 |
Question: {question}
|
| 97 |
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
|
| 101 |
|
| 102 |
self.qa_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
|
| 103 |
input_variables=["question", "filled_context"],
|
| 104 |
template=qa_template
|
| 105 |
))
|
| 106 |
|
| 107 |
-
# Create slide selection prompt template
|
| 108 |
-
slide_template = """
|
| 109 |
-
|
| 110 |
-
You are an AI that identifies the most relevant slide page for a given question. Return ONLY the filename and page number in this exact format: "filename.pdf - Page X"
|
| 111 |
-
|
| 112 |
-
<|eot_id|><|start_header_id|>user<|end_header_id|>
|
| 113 |
-
|
| 114 |
-
Question: {question}
|
| 115 |
|
| 116 |
Available slides:
|
| 117 |
{available_slides}
|
| 118 |
|
| 119 |
-
Which slide is most relevant? Return only: "filename.pdf - Page X"
|
| 120 |
|
| 121 |
-
|
| 122 |
|
| 123 |
self.slide_selection_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
|
| 124 |
input_variables=["question", "available_slides"],
|
|
@@ -199,6 +181,10 @@ Which slide is most relevant? Return only: "filename.pdf - Page X"
|
|
| 199 |
if "<|eot_id|>" in answer:
|
| 200 |
answer = answer.split("<|eot_id|>")[-1].strip()
|
| 201 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
# Add warning if not in curriculum
|
| 203 |
if curriculum_relevance_score == 0:
|
| 204 |
answer = "⚠️ **Note: This topic is not covered in the current curriculum.**\n\n" + answer
|
|
@@ -232,14 +218,28 @@ Which slide is most relevant? Return only: "filename.pdf - Page X"
|
|
| 232 |
total_pages = len(doc)
|
| 233 |
doc.close()
|
| 234 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
# Get the target page and neighboring pages (2 before, 2 after)
|
| 236 |
-
start_page = max(1,
|
| 237 |
-
end_page = min(total_pages,
|
| 238 |
|
| 239 |
for page_num in range(start_page, end_page + 1):
|
| 240 |
img = self.get_pdf_page_image(pdf_path, page_num)
|
| 241 |
if img:
|
| 242 |
-
if page_num ==
|
| 243 |
# Highlight the most relevant page
|
| 244 |
label = f"📌 {filename} - Page {page_num} (Most Relevant)"
|
| 245 |
else:
|
|
|
|
| 66 |
|
| 67 |
def _setup_llm(self):
|
| 68 |
try:
|
| 69 |
+
# Use a smaller, faster model for Hugging Face Spaces
|
| 70 |
+
model_name = "microsoft/DialoGPT-medium" # Smaller model for faster inference
|
| 71 |
pipe = pipeline(
|
| 72 |
"text-generation",
|
| 73 |
model=model_name,
|
| 74 |
+
max_new_tokens=200,
|
| 75 |
+
temperature=0.7,
|
| 76 |
do_sample=True,
|
| 77 |
+
pad_token_id=50256
|
|
|
|
|
|
|
| 78 |
)
|
| 79 |
self.llm = HuggingFacePipeline(pipeline=pipe)
|
| 80 |
|
| 81 |
+
# Create QA prompt template for DialoGPT
|
| 82 |
+
qa_template = """Based on the following curriculum content, please answer this question clearly and educationally:
|
| 83 |
|
| 84 |
+
{filled_context}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
Question: {question}
|
| 87 |
|
| 88 |
+
Answer:"""
|
|
|
|
|
|
|
| 89 |
|
| 90 |
self.qa_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
|
| 91 |
input_variables=["question", "filled_context"],
|
| 92 |
template=qa_template
|
| 93 |
))
|
| 94 |
|
| 95 |
+
# Create slide selection prompt template for DialoGPT
|
| 96 |
+
slide_template = """Given this question: {question}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
Available slides:
|
| 99 |
{available_slides}
|
| 100 |
|
| 101 |
+
Which slide is most relevant? Return only the filename and page number like this: "filename.pdf - Page X"
|
| 102 |
|
| 103 |
+
Answer:"""
|
| 104 |
|
| 105 |
self.slide_selection_chain = LLMChain(llm=self.llm, prompt=PromptTemplate(
|
| 106 |
input_variables=["question", "available_slides"],
|
|
|
|
| 181 |
if "<|eot_id|>" in answer:
|
| 182 |
answer = answer.split("<|eot_id|>")[-1].strip()
|
| 183 |
|
| 184 |
+
# Remove any prompt artifacts
|
| 185 |
+
if answer.startswith("Answer:"):
|
| 186 |
+
answer = answer[7:].strip()
|
| 187 |
+
|
| 188 |
# Add warning if not in curriculum
|
| 189 |
if curriculum_relevance_score == 0:
|
| 190 |
answer = "⚠️ **Note: This topic is not covered in the current curriculum.**\n\n" + answer
|
|
|
|
| 218 |
total_pages = len(doc)
|
| 219 |
doc.close()
|
| 220 |
|
| 221 |
+
# Try to find a better page if the current one is a title slide
|
| 222 |
+
target_page = page_number
|
| 223 |
+
page_text = self.pdf_pages[filename].get(page_number, "")
|
| 224 |
+
|
| 225 |
+
# If current page has very little text (likely a title slide), look for content pages
|
| 226 |
+
if len(page_text.strip()) < 100: # Title slides usually have little text
|
| 227 |
+
# Look for pages with more content in the same PDF
|
| 228 |
+
for page_num in range(1, total_pages + 1):
|
| 229 |
+
if page_num in self.pdf_pages[filename]:
|
| 230 |
+
text = self.pdf_pages[filename][page_num]
|
| 231 |
+
if len(text.strip()) > 200: # Look for content-rich pages
|
| 232 |
+
target_page = page_num
|
| 233 |
+
break
|
| 234 |
+
|
| 235 |
# Get the target page and neighboring pages (2 before, 2 after)
|
| 236 |
+
start_page = max(1, target_page - 2)
|
| 237 |
+
end_page = min(total_pages, target_page + 2)
|
| 238 |
|
| 239 |
for page_num in range(start_page, end_page + 1):
|
| 240 |
img = self.get_pdf_page_image(pdf_path, page_num)
|
| 241 |
if img:
|
| 242 |
+
if page_num == target_page:
|
| 243 |
# Highlight the most relevant page
|
| 244 |
label = f"📌 {filename} - Page {page_num} (Most Relevant)"
|
| 245 |
else:
|