Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -342,36 +342,120 @@ def extract_question_ids_from_qpms(text: str):
|
|
| 342 |
print("⚠️ No question IDs extracted; will send NA placeholder.")
|
| 343 |
return fallback_matches
|
| 344 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
def build_as_cot_prompt_with_expected_ids(expected_ids, qpms_text=None):
|
| 346 |
"""
|
| 347 |
Construct the AS transcription prompt injecting the expected IDs block and graph detection instructions,
|
| 348 |
modifying it to include a Chain-of-Thought (CoT) section using a <think> tag, and
|
| 349 |
requiring mathematical expressions to be enclosed in LaTeX dollar delimiters ($...$).
|
|
|
|
| 350 |
"""
|
| 351 |
if not expected_ids:
|
| 352 |
ids_block = "{NA}"
|
| 353 |
else:
|
| 354 |
ids_block = "{\n" + "\n".join(expected_ids) + "\n}"
|
| 355 |
|
| 356 |
-
|
| 357 |
-
if qpms_text:
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
"
|
| 361 |
-
"
|
| 362 |
-
"
|
|
|
|
|
|
|
| 363 |
)
|
| 364 |
|
| 365 |
prompt = f"""You are a high-quality handwritten transcription assistant, performing transcription with a Chain-of-Thought process.
|
| 366 |
|
| 367 |
INPUT: This PDF contains a student's handwritten answer sheet.
|
| 368 |
-
{
|
| 369 |
TASK:
|
| 370 |
1. **THINKING:** Before transcribing each answer, you must document your thought process using the **<think>** tag.
|
| 371 |
- Identify the question ID. If inferred, note why.
|
| 372 |
- Detail any ambiguities encountered (e.g., unclear numbers, symbols, or structure).
|
| 373 |
- Explain how you resolved ambiguities, specifically if you referred to the QP+MS transcript.
|
| 374 |
-
- If you *did* refer to
|
|
|
|
| 375 |
|
| 376 |
*Example Thinking:*
|
| 377 |
<think>
|
|
@@ -381,19 +465,12 @@ TASK:
|
|
| 381 |
- Re-examined the handwriting carefully: The student's handwriting strongly appears to be '$2x$' and not '$21x$'.
|
| 382 |
- DECISION: Transcribe exactly what the student wrote: '$2x$'.
|
| 383 |
</think>
|
| 384 |
-
*Example Thinking 2 (Ambiguity Resolved by MS):*
|
| 385 |
-
<think>
|
| 386 |
-
- Found Question INFERRED: 1(b) based on proximity to 1(a).
|
| 387 |
-
- Noticed the final answer looked like '3.6', but the decimal point was very faint and could be '36'.
|
| 388 |
-
- Referred to QP+MS: Expected answer is '$3.8$'. Re-examined the student's writing: it appears to be a poorly written '$3.8$' which I initially misread as '$3.6$'.
|
| 389 |
-
- DECISION: Corrected my transcription to '$3.8$' based on re-evaluation and MS context.
|
| 390 |
-
</think>
|
| 391 |
|
| 392 |
-
2. **TRANSCRIPTION:** Transcribe the student's answers
|
| 393 |
- Attempt to assign each answer to a question ID if the student has labelled it (e.g., "1", "1a", "2(b)", "3").
|
| 394 |
-
- If the student hasn't labelled answers, segment contiguous answer blocks and attempt to infer question IDs from context —
|
| 395 |
- **Enclose all mathematical expressions and single variables in LaTeX dollar delimiters ($...$).**
|
| 396 |
-
-
|
| 397 |
- If a diagram/graph is omitted, write **[Graph omitted]**.
|
| 398 |
- Unreadable parts: **[illegible]**.
|
| 399 |
- Unanswered: **[No response]**.
|
|
@@ -421,6 +498,7 @@ Graph found in:
|
|
| 421 |
|
| 422 |
return prompt
|
| 423 |
|
|
|
|
| 424 |
def extract_graph_questions_from_ms(text: str):
|
| 425 |
"""Extract graph questions and page numbers from MS transcript."""
|
| 426 |
clean_text = text.replace("\u00A0", " ").replace("\t", " ")
|
|
|
|
| 342 |
print("⚠️ No question IDs extracted; will send NA placeholder.")
|
| 343 |
return fallback_matches
|
| 344 |
|
| 345 |
+
# def build_as_cot_prompt_with_expected_ids(expected_ids, qpms_text=None):
|
| 346 |
+
# """
|
| 347 |
+
# Construct the AS transcription prompt injecting the expected IDs block and graph detection instructions,
|
| 348 |
+
# modifying it to include a Chain-of-Thought (CoT) section using a <think> tag, and
|
| 349 |
+
# requiring mathematical expressions to be enclosed in LaTeX dollar delimiters ($...$).
|
| 350 |
+
# """
|
| 351 |
+
# if not expected_ids:
|
| 352 |
+
# ids_block = "{NA}"
|
| 353 |
+
# else:
|
| 354 |
+
# ids_block = "{\n" + "\n".join(expected_ids) + "\n}"
|
| 355 |
+
|
| 356 |
+
# qpms_guidance = ""
|
| 357 |
+
# if qpms_text:
|
| 358 |
+
# qpms_guidance = (
|
| 359 |
+
# "\nYou are also provided with the full transcript of the Question Paper and Markscheme (QP+MS). "
|
| 360 |
+
# "Use this transcript primarily to resolve **ambiguous handwriting** (e.g., if a number could be '$-1.6$' or '$1.6$'). "
|
| 361 |
+
# "If you are confident in your transcription without referring to the QP+MS, use your judgment. "
|
| 362 |
+
# "**Always prioritize accuracy and context from the QP+MS transcript when in doubt about a specific ambiguous character or expression.**\n"
|
| 363 |
+
# )
|
| 364 |
+
|
| 365 |
+
# prompt = f"""You are a high-quality handwritten transcription assistant, performing transcription with a Chain-of-Thought process.
|
| 366 |
+
|
| 367 |
+
# INPUT: This PDF contains a student's handwritten answer sheet.
|
| 368 |
+
# {qpms_guidance}
|
| 369 |
+
# TASK:
|
| 370 |
+
# 1. **THINKING:** Before transcribing each answer, you must document your thought process using the **<think>** tag.
|
| 371 |
+
# - Identify the question ID. If inferred, note why.
|
| 372 |
+
# - Detail any ambiguities encountered (e.g., unclear numbers, symbols, or structure).
|
| 373 |
+
# - Explain how you resolved ambiguities, specifically if you referred to the QP+MS transcript.
|
| 374 |
+
# - If you *did* refer to the QP+MS but decided to keep your original transcription, state this clearly.
|
| 375 |
+
# - If you initially label an answer as 2.a but later realize it aligns better with 2.b based on the marking scheme, you should reassign it to 2.b and briefly explain your reasoning in the <think> tag to maintain clarity and consistency.
|
| 376 |
+
|
| 377 |
+
# *Example Thinking:*
|
| 378 |
+
# <think>
|
| 379 |
+
# - Found Question 3(a).
|
| 380 |
+
# - Noticed '2x' was written ambiguously; it could be '2x' or '21x'.
|
| 381 |
+
# - Referred to QP+MS: The expected answer involves '$21x$'.
|
| 382 |
+
# - Re-examined the handwriting carefully: The student's handwriting strongly appears to be '$2x$' and not '$21x$'.
|
| 383 |
+
# - DECISION: Transcribe exactly what the student wrote: '$2x$'.
|
| 384 |
+
# </think>
|
| 385 |
+
# *Example Thinking 2 (Ambiguity Resolved by MS):*
|
| 386 |
+
# <think>
|
| 387 |
+
# - Found Question INFERRED: 1(b) based on proximity to 1(a).
|
| 388 |
+
# - Noticed the final answer looked like '3.6', but the decimal point was very faint and could be '36'.
|
| 389 |
+
# - Referred to QP+MS: Expected answer is '$3.8$'. Re-examined the student's writing: it appears to be a poorly written '$3.8$' which I initially misread as '$3.6$'.
|
| 390 |
+
# - DECISION: Corrected my transcription to '$3.8$' based on re-evaluation and MS context.
|
| 391 |
+
# </think>
|
| 392 |
+
|
| 393 |
+
# 2. **TRANSCRIPTION:** Transcribe the student's answers in accordance with the markscheme provided. Preserve step order and line breaks.
|
| 394 |
+
# - Attempt to assign each answer to a question ID if the student has labelled it (e.g., "1", "1a", "2(b)", "3").
|
| 395 |
+
# - If the student hasn't labelled answers, segment contiguous answer blocks and attempt to infer question IDs from context — but mark inferred IDs clearly as "**INFERRED: <id>**".
|
| 396 |
+
# - **Enclose all mathematical expressions and single variables in LaTeX dollar delimiters ($...$).**
|
| 397 |
+
# - *Example:* "The area is $A = \pi r^2$ so $3x+5 = 11$ thus $x=2$."
|
| 398 |
+
# - If a diagram/graph is omitted, write **[Graph omitted]**.
|
| 399 |
+
# - Unreadable parts: **[illegible]**.
|
| 400 |
+
# - Unanswered: **[No response]**.
|
| 401 |
+
# - Do NOT recreate diagrams.
|
| 402 |
+
|
| 403 |
+
# Ensure consistency and determinism in formatting so subsequent models can grade directly from this aligned format.
|
| 404 |
+
|
| 405 |
+
# Expected questions (if missing, write NA):
|
| 406 |
+
# {ids_block}
|
| 407 |
+
# -----------------------
|
| 408 |
+
# OUTPUT FORMAT:
|
| 409 |
+
# <think>...</think>
|
| 410 |
+
# Question <id>
|
| 411 |
+
# AS:<transcribed answer or placeholder>
|
| 412 |
+
# <think>...</think>
|
| 413 |
+
# Question <id>
|
| 414 |
+
# AS:<transcribed answer or placeholder>
|
| 415 |
+
# ...
|
| 416 |
+
|
| 417 |
+
# ==== GRAPH FOUND ANSWERS ====
|
| 418 |
+
# Graph found in:
|
| 419 |
+
# - Answer <number> → Page <number>
|
| 420 |
+
# (one per line)
|
| 421 |
+
# ==== END GRAPH FOUND ===="""
|
| 422 |
+
|
| 423 |
+
# return prompt
|
| 424 |
+
|
| 425 |
def build_as_cot_prompt_with_expected_ids(expected_ids, qpms_text=None):
|
| 426 |
"""
|
| 427 |
Construct the AS transcription prompt injecting the expected IDs block and graph detection instructions,
|
| 428 |
modifying it to include a Chain-of-Thought (CoT) section using a <think> tag, and
|
| 429 |
requiring mathematical expressions to be enclosed in LaTeX dollar delimiters ($...$).
|
| 430 |
+
The full qpms_text, when provided, is embedded directly in the prompt and not skipped.
|
| 431 |
"""
|
| 432 |
if not expected_ids:
|
| 433 |
ids_block = "{NA}"
|
| 434 |
else:
|
| 435 |
ids_block = "{\n" + "\n".join(expected_ids) + "\n}"
|
| 436 |
|
| 437 |
+
qpms_section = ""
|
| 438 |
+
if qpms_text is not None:
|
| 439 |
+
# Include the full QP+MS transcript exactly (strip only leading/trailing whitespace)
|
| 440 |
+
qpms_section = (
|
| 441 |
+
"\nYou are also provided with the full transcript of the Question Paper and Markscheme (QP+MS) below."
|
| 442 |
+
"\nUse it primarily to resolve ambiguous handwriting and to confirm expected answers when needed."
|
| 443 |
+
"\n--- BEGIN QP+MS TRANSCRIPT ---\n"
|
| 444 |
+
f"{qpms_text.strip()}\n"
|
| 445 |
+
"--- END QP+MS TRANSCRIPT ---\n"
|
| 446 |
)
|
| 447 |
|
| 448 |
prompt = f"""You are a high-quality handwritten transcription assistant, performing transcription with a Chain-of-Thought process.
|
| 449 |
|
| 450 |
INPUT: This PDF contains a student's handwritten answer sheet.
|
| 451 |
+
{qpms_section}
|
| 452 |
TASK:
|
| 453 |
1. **THINKING:** Before transcribing each answer, you must document your thought process using the **<think>** tag.
|
| 454 |
- Identify the question ID. If inferred, note why.
|
| 455 |
- Detail any ambiguities encountered (e.g., unclear numbers, symbols, or structure).
|
| 456 |
- Explain how you resolved ambiguities, specifically if you referred to the QP+MS transcript.
|
| 457 |
+
- If you *did* refer to QP+MS but decided to keep your original transcription, state this clearly.
|
| 458 |
+
- If you initially label an answer as 2.a but later realize it aligns better with 2.b based on the marking scheme, reassign it to 2.b and briefly explain your reasoning in the <think> tag.
|
| 459 |
|
| 460 |
*Example Thinking:*
|
| 461 |
<think>
|
|
|
|
| 465 |
- Re-examined the handwriting carefully: The student's handwriting strongly appears to be '$2x$' and not '$21x$'.
|
| 466 |
- DECISION: Transcribe exactly what the student wrote: '$2x$'.
|
| 467 |
</think>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 468 |
|
| 469 |
+
2. **TRANSCRIPTION:** Transcribe the student's answers in accordance with the markscheme provided. Preserve step order and line breaks.
|
| 470 |
- Attempt to assign each answer to a question ID if the student has labelled it (e.g., "1", "1a", "2(b)", "3").
|
| 471 |
+
- If the student hasn't labelled answers, segment contiguous answer blocks and attempt to infer question IDs from context — mark inferred IDs clearly as "**INFERRED: <id>**".
|
| 472 |
- **Enclose all mathematical expressions and single variables in LaTeX dollar delimiters ($...$).**
|
| 473 |
+
- Example: "The area is $A = \pi r^2$ so $3x+5 = 11$ thus $x=2$."
|
| 474 |
- If a diagram/graph is omitted, write **[Graph omitted]**.
|
| 475 |
- Unreadable parts: **[illegible]**.
|
| 476 |
- Unanswered: **[No response]**.
|
|
|
|
| 498 |
|
| 499 |
return prompt
|
| 500 |
|
| 501 |
+
|
| 502 |
def extract_graph_questions_from_ms(text: str):
|
| 503 |
"""Extract graph questions and page numbers from MS transcript."""
|
| 504 |
clean_text = text.replace("\u00A0", " ").replace("\t", " ")
|