atz21 committed on
Commit
9b0a372
·
verified ·
1 Parent(s): f999bc3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -18
app.py CHANGED
@@ -395,39 +395,83 @@ def extract_question_ids_from_qpms(text: str):
395
  print("⚠️ No question IDs extracted; will send NA placeholder.")
396
  return fallback_matches
397
 
398
def build_as_prompt_with_expected_ids(expected_ids, qpms_text=None) -> str:
    """
    Construct the AS transcription prompt injecting the expected IDs block and graph detection instructions.

    Args:
        expected_ids: Question-ID strings to list under "Expected questions".
            Any falsy value (None, empty list) produces the "{NA}" placeholder.
        qpms_text: Only its truthiness is checked. When truthy, a paragraph
            telling the model how to use the QP+MS transcript is appended to
            the INPUT line; the text itself is not embedded in the prompt.

    Returns:
        The complete prompt string.
    """
    # One ID per line wrapped in braces, or the NA placeholder when nothing was extracted.
    if not expected_ids:
        ids_block = "{NA}"
    else:
        ids_block = "{\n" + "\n".join(expected_ids) + "\n}"

    refer_text = ""
    if qpms_text:
        refer_text = (
            "\nYou are also provided with the full transcript of the Question Paper and Markscheme (QP+MS). "
            "If you encounter ambiguous handwriting (for example, if a number could be '-1.6' or '1.6'), refer to the QP+MS transcript to infer the student's intended answer. "
            "However, if you are confident in your transcription, you may use your own judgment. "
            "Always prioritize accuracy and context from the QP+MS transcript when in doubt.\n"
        )

    # NOTE(review): the prompt lines are written flush-left because the original
    # indentation inside the literal could not be recovered — confirm against app.py.
    # The "\n" escapes in the final line are intentional: they expand to real newlines.
    prompt = f"""You are a high-quality handwritten transcription assistant.
INPUT: This PDF contains a student's handwritten answer sheet.{refer_text}
TASK: Transcribe the student's answers exactly (as text). Preserve step order and line breaks. Attempt to assign each answer to a question ID if the student has labelled it (e.g., "1", "1a", "2(b)", "3"). If the student hasn't labelled answers, segment contiguous answer blocks and attempt to infer question IDs from context — but mark inferred IDs clearly as "INFERRED: <id>"
Enclose all mathematical expressions in Markdown fenced code blocks (``` triple backticks).
If a diagram/graph is omitted, write [Graph omitted].
Unreadable parts: [illegible].
Unanswered: [No response].
Do NOT recreate diagrams.
Ensure consistency and determinism in formatting so subsequent models can grade directly from this aligned format.
Expected questions (if missing, write NA):
{ids_block}
-----------------------
OUTPUT FORMAT:
Question <id>
AS:
<transcribed answer or placeholder>
==== GRAPH FOUND ANSWERS ====\nGraph found in:\n- Answer <number> → Page <number>\n(one per line)\n==== END GRAPH FOUND ===="""
    return prompt
432
 
433
  def extract_graph_questions_from_ms(text: str):
 
395
  print("⚠️ No question IDs extracted; will send NA placeholder.")
396
  return fallback_matches
397
 
398
def build_as_cot_prompt_with_expected_ids(expected_ids, qpms_text=None) -> str:
    """
    Construct the AS transcription prompt injecting the expected IDs block and graph detection instructions,
    modifying it to include a Chain-of-Thought (CoT) section using a <think> tag, and
    requiring mathematical expressions to be enclosed in LaTeX dollar delimiters ($...$).

    Args:
        expected_ids: Question-ID strings to list under "Expected questions".
            Any falsy value (None, empty list) produces the "{NA}" placeholder.
        qpms_text: Only its truthiness is checked. When truthy, guidance on
            using the QP+MS transcript is inserted after the INPUT line; the
            text itself is not embedded in the prompt.

    Returns:
        The complete prompt string.
    """
    # One ID per line wrapped in braces, or the NA placeholder when nothing was extracted.
    if not expected_ids:
        ids_block = "{NA}"
    else:
        ids_block = "{\n" + "\n".join(expected_ids) + "\n}"

    qpms_guidance = ""
    if qpms_text:
        qpms_guidance = (
            "\nYou are also provided with the full transcript of the Question Paper and Markscheme (QP+MS). "
            "Use this transcript primarily to resolve **ambiguous handwriting** (e.g., if a number could be '$-1.6$' or '$1.6$'). "
            "If you are confident in your transcription without referring to the QP+MS, use your judgment. "
            "**Always prioritize accuracy and context from the QP+MS transcript when in doubt about a specific ambiguous character or expression.**\n"
        )

    # NOTE(review): the prompt lines are written flush-left because the original
    # indentation inside the literal could not be recovered — confirm against app.py.
    # The LaTeX example uses "\\pi": a bare "\p" is an invalid escape sequence in a
    # non-raw string literal. The emitted text is still a single backslash + "pi".
    prompt = f"""You are a high-quality handwritten transcription assistant, performing transcription with a Chain-of-Thought process.

INPUT: This PDF contains a student's handwritten answer sheet.
{qpms_guidance}
TASK:
1. **THINKING:** Before transcribing each answer, you must document your thought process using the **<think>** tag.
- Identify the question ID. If inferred, note why.
- Detail any ambiguities encountered (e.g., unclear numbers, symbols, or structure).
- Explain how you resolved ambiguities, specifically if you referred to the QP+MS transcript.
- If you *did* refer to the QP+MS but decided to keep your original transcription, state this clearly.

*Example Thinking:*
<think>
- Found Question 3(a).
- Noticed '2x' was written ambiguously; it could be '2x' or '21x'.
- Referred to QP+MS: The expected answer involves '$21x$'.
- Re-examined the handwriting carefully: The student's handwriting strongly appears to be '$2x$' and not '$21x$'.
- DECISION: Transcribe exactly what the student wrote: '$2x$'.
</think>
*Example Thinking 2 (Ambiguity Resolved by MS):*
<think>
- Found Question INFERRED: 1(b) based on proximity to 1(a).
- Noticed the final answer looked like '3.6', but the decimal point was very faint and could be '36'.
- Referred to QP+MS: Expected answer is '$3.8$'. Re-examined the student's writing: it appears to be a poorly written '$3.8$' which I initially misread as '$3.6$'.
- DECISION: Corrected my transcription to '$3.8$' based on re-evaluation and MS context.
</think>

2. **TRANSCRIPTION:** Transcribe the student's answers exactly (as text). Preserve step order and line breaks.
- Attempt to assign each answer to a question ID if the student has labelled it (e.g., "1", "1a", "2(b)", "3").
- If the student hasn't labelled answers, segment contiguous answer blocks and attempt to infer question IDs from context — but mark inferred IDs clearly as "**INFERRED: <id>**".
- **Enclose all mathematical expressions and single variables in LaTeX dollar delimiters ($...$).**
- *Example:* "The area is $A = \\pi r^2$ so $3x+5 = 11$ thus $x=2$."
- If a diagram/graph is omitted, write **[Graph omitted]**.
- Unreadable parts: **[illegible]**.
- Unanswered: **[No response]**.
- Do NOT recreate diagrams.

Ensure consistency and determinism in formatting so subsequent models can grade directly from this aligned format.

Expected questions (if missing, write NA):
{ids_block}
-----------------------
OUTPUT FORMAT:
<think>...</think>
Question <id>
AS:<transcribed answer or placeholder>
<think>...</think>
Question <id>
AS:<transcribed answer or placeholder>
...

==== GRAPH FOUND ANSWERS ====
Graph found in:
- Answer <number> → Page <number>
(one per line)
==== END GRAPH FOUND ===="""

    return prompt
476
 
477
  def extract_graph_questions_from_ms(text: str):