Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -395,39 +395,83 @@ def extract_question_ids_from_qpms(text: str):
|
|
| 395 |
print("⚠️ No question IDs extracted; will send NA placeholder.")
|
| 396 |
return fallback_matches
|
| 397 |
|
| 398 |
-
def
|
| 399 |
"""
|
| 400 |
-
Construct the AS transcription prompt injecting the expected IDs block and graph detection instructions
|
|
|
|
|
|
|
| 401 |
"""
|
| 402 |
if not expected_ids:
|
| 403 |
ids_block = "{NA}"
|
| 404 |
else:
|
| 405 |
ids_block = "{\n" + "\n".join(expected_ids) + "\n}"
|
| 406 |
-
|
|
|
|
| 407 |
if qpms_text:
|
| 408 |
-
|
| 409 |
"\nYou are also provided with the full transcript of the Question Paper and Markscheme (QP+MS). "
|
| 410 |
-
"
|
| 411 |
-
"
|
| 412 |
-
"Always prioritize accuracy and context from the QP+MS transcript when in doubt.\n"
|
| 413 |
)
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 422 |
Ensure consistency and determinism in formatting so subsequent models can grade directly from this aligned format.
|
|
|
|
| 423 |
Expected questions (if missing, write NA):
|
| 424 |
{ids_block}
|
| 425 |
-----------------------
|
| 426 |
OUTPUT FORMAT:
|
|
|
|
| 427 |
Question <id>
|
| 428 |
-
AS:
|
| 429 |
-
<
|
| 430 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 431 |
return prompt
|
| 432 |
|
| 433 |
def extract_graph_questions_from_ms(text: str):
|
|
|
|
| 395 |
print("⚠️ No question IDs extracted; will send NA placeholder.")
|
| 396 |
return fallback_matches
|
| 397 |
|
| 398 |
+
def build_as_cot_prompt_with_expected_ids(expected_ids, qpms_text=None) -> str:
    """
    Build the answer-sheet (AS) transcription prompt with a Chain-of-Thought section.

    The prompt asks the model to document its reasoning inside <think> tags
    before each transcribed answer, to wrap mathematical expressions in LaTeX
    dollar delimiters ($...$), and to finish with a "GRAPH FOUND ANSWERS"
    section listing where graphs were detected.

    Args:
        expected_ids: Iterable of expected question IDs. Each item is converted
            with str() and placed on its own line inside a brace-delimited
            block. When empty or None, the block is the placeholder "{NA}".
        qpms_text: Used only as a truthiness flag. When truthy, a paragraph
            telling the model how to use the QP+MS transcript is added to the
            prompt. The transcript text itself is NOT embedded by this function.

    Returns:
        The fully formatted prompt string.
    """
    if not expected_ids:
        ids_block = "{NA}"
    else:
        # str() keeps the output identical for string IDs while tolerating
        # numeric IDs, which a bare "\n".join would reject with a TypeError.
        ids_block = "{\n" + "\n".join(str(qid) for qid in expected_ids) + "\n}"

    qpms_guidance = ""
    if qpms_text:
        qpms_guidance = (
            "\nYou are also provided with the full transcript of the Question Paper and Markscheme (QP+MS). "
            "Use this transcript primarily to resolve **ambiguous handwriting** (e.g., if a number could be '$-1.6$' or '$1.6$'). "
            "If you are confident in your transcription without referring to the QP+MS, use your judgment. "
            "**Always prioritize accuracy and context from the QP+MS transcript when in doubt about a specific ambiguous character or expression.**\n"
        )

    # NOTE: the LaTeX example below writes "\\pi" so the emitted prompt contains
    # a literal "\pi"; a bare "\p" is an invalid escape sequence in a non-raw
    # string literal and triggers a SyntaxWarning on recent Python versions.
    prompt = f"""You are a high-quality handwritten transcription assistant, performing transcription with a Chain-of-Thought process.

INPUT: This PDF contains a student's handwritten answer sheet.
{qpms_guidance}
TASK:
1. **THINKING:** Before transcribing each answer, you must document your thought process using the **<think>** tag.
- Identify the question ID. If inferred, note why.
- Detail any ambiguities encountered (e.g., unclear numbers, symbols, or structure).
- Explain how you resolved ambiguities, specifically if you referred to the QP+MS transcript.
- If you *did* refer to the QP+MS but decided to keep your original transcription, state this clearly.

*Example Thinking:*
<think>
- Found Question 3(a).
- Noticed '2x' was written ambiguously; it could be '2x' or '21x'.
- Referred to QP+MS: The expected answer involves '$21x$'.
- Re-examined the handwriting carefully: The student's handwriting strongly appears to be '$2x$' and not '$21x$'.
- DECISION: Transcribe exactly what the student wrote: '$2x$'.
</think>
*Example Thinking 2 (Ambiguity Resolved by MS):*
<think>
- Found Question INFERRED: 1(b) based on proximity to 1(a).
- Noticed the final answer looked like '3.6', but the decimal point was very faint and could be '36'.
- Referred to QP+MS: Expected answer is '$3.8$'. Re-examined the student's writing: it appears to be a poorly written '$3.8$' which I initially misread as '$3.6$'.
- DECISION: Corrected my transcription to '$3.8$' based on re-evaluation and MS context.
</think>

2. **TRANSCRIPTION:** Transcribe the student's answers exactly (as text). Preserve step order and line breaks.
- Attempt to assign each answer to a question ID if the student has labelled it (e.g., "1", "1a", "2(b)", "3").
- If the student hasn't labelled answers, segment contiguous answer blocks and attempt to infer question IDs from context — but mark inferred IDs clearly as "**INFERRED: <id>**".
- **Enclose all mathematical expressions and single variables in LaTeX dollar delimiters ($...$).**
- *Example:* "The area is $A = \\pi r^2$ so $3x+5 = 11$ thus $x=2$."
- If a diagram/graph is omitted, write **[Graph omitted]**.
- Unreadable parts: **[illegible]**.
- Unanswered: **[No response]**.
- Do NOT recreate diagrams.

Ensure consistency and determinism in formatting so subsequent models can grade directly from this aligned format.

Expected questions (if missing, write NA):
{ids_block}
-----------------------
OUTPUT FORMAT:
<think>...</think>
Question <id>
AS:<transcribed answer or placeholder>
<think>...</think>
Question <id>
AS:<transcribed answer or placeholder>
...

==== GRAPH FOUND ANSWERS ====
Graph found in:
- Answer <number> → Page <number>
(one per line)
==== END GRAPH FOUND ===="""

    return prompt
|
| 476 |
|
| 477 |
def extract_graph_questions_from_ms(text: str):
|