atz21 committed on
Commit
bf851f8
·
verified ·
1 Parent(s): f1f0ca1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -19
app.py CHANGED
@@ -342,36 +342,120 @@ def extract_question_ids_from_qpms(text: str):
342
  print("⚠️ No question IDs extracted; will send NA placeholder.")
343
  return fallback_matches
344
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
  def build_as_cot_prompt_with_expected_ids(expected_ids, qpms_text=None):
346
  """
347
  Construct the AS transcription prompt injecting the expected IDs block and graph detection instructions,
348
  modifying it to include a Chain-of-Thought (CoT) section using a <think> tag, and
349
  requiring mathematical expressions to be enclosed in LaTeX dollar delimiters ($...$).
 
350
  """
351
  if not expected_ids:
352
  ids_block = "{NA}"
353
  else:
354
  ids_block = "{\n" + "\n".join(expected_ids) + "\n}"
355
 
356
- qpms_guidance = ""
357
- if qpms_text:
358
- qpms_guidance = (
359
- "\nYou are also provided with the full transcript of the Question Paper and Markscheme (QP+MS). "
360
- "Use this transcript primarily to resolve **ambiguous handwriting** (e.g., if a number could be '$-1.6$' or '$1.6$'). "
361
- "If you are confident in your transcription without referring to the QP+MS, use your judgment. "
362
- "**Always prioritize accuracy and context from the QP+MS transcript when in doubt about a specific ambiguous character or expression.**\n"
 
 
363
  )
364
 
365
  prompt = f"""You are a high-quality handwritten transcription assistant, performing transcription with a Chain-of-Thought process.
366
 
367
  INPUT: This PDF contains a student's handwritten answer sheet.
368
- {qpms_guidance}
369
  TASK:
370
  1. **THINKING:** Before transcribing each answer, you must document your thought process using the **<think>** tag.
371
  - Identify the question ID. If inferred, note why.
372
  - Detail any ambiguities encountered (e.g., unclear numbers, symbols, or structure).
373
  - Explain how you resolved ambiguities, specifically if you referred to the QP+MS transcript.
374
- - If you *did* refer to the QP+MS but decided to keep your original transcription, state this clearly.
 
375
 
376
  *Example Thinking:*
377
  <think>
@@ -381,19 +465,12 @@ TASK:
381
  - Re-examined the handwriting carefully: The student's handwriting strongly appears to be '$2x$' and not '$21x$'.
382
  - DECISION: Transcribe exactly what the student wrote: '$2x$'.
383
  </think>
384
- *Example Thinking 2 (Ambiguity Resolved by MS):*
385
- <think>
386
- - Found Question INFERRED: 1(b) based on proximity to 1(a).
387
- - Noticed the final answer looked like '3.6', but the decimal point was very faint and could be '36'.
388
- - Referred to QP+MS: Expected answer is '$3.8$'. Re-examined the student's writing: it appears to be a poorly written '$3.8$' which I initially misread as '$3.6$'.
389
- - DECISION: Corrected my transcription to '$3.8$' based on re-evaluation and MS context.
390
- </think>
391
 
392
- 2. **TRANSCRIPTION:** Transcribe the student's answers exactly (as text). Preserve step order and line breaks.
393
  - Attempt to assign each answer to a question ID if the student has labelled it (e.g., "1", "1a", "2(b)", "3").
394
- - If the student hasn't labelled answers, segment contiguous answer blocks and attempt to infer question IDs from context — but mark inferred IDs clearly as "**INFERRED: <id>**".
395
  - **Enclose all mathematical expressions and single variables in LaTeX dollar delimiters ($...$).**
396
- - *Example:* "The area is $A = \pi r^2$ so $3x+5 = 11$ thus $x=2$."
397
  - If a diagram/graph is omitted, write **[Graph omitted]**.
398
  - Unreadable parts: **[illegible]**.
399
  - Unanswered: **[No response]**.
@@ -421,6 +498,7 @@ Graph found in:
421
 
422
  return prompt
423
 
 
424
  def extract_graph_questions_from_ms(text: str):
425
  """Extract graph questions and page numbers from MS transcript."""
426
  clean_text = text.replace("\u00A0", " ").replace("\t", " ")
 
342
  print("⚠️ No question IDs extracted; will send NA placeholder.")
343
  return fallback_matches
344
 
345
+ # def build_as_cot_prompt_with_expected_ids(expected_ids, qpms_text=None):
346
+ # """
347
+ # Construct the AS transcription prompt injecting the expected IDs block and graph detection instructions,
348
+ # modifying it to include a Chain-of-Thought (CoT) section using a <think> tag, and
349
+ # requiring mathematical expressions to be enclosed in LaTeX dollar delimiters ($...$).
350
+ # """
351
+ # if not expected_ids:
352
+ # ids_block = "{NA}"
353
+ # else:
354
+ # ids_block = "{\n" + "\n".join(expected_ids) + "\n}"
355
+
356
+ # qpms_guidance = ""
357
+ # if qpms_text:
358
+ # qpms_guidance = (
359
+ # "\nYou are also provided with the full transcript of the Question Paper and Markscheme (QP+MS). "
360
+ # "Use this transcript primarily to resolve **ambiguous handwriting** (e.g., if a number could be '$-1.6$' or '$1.6$'). "
361
+ # "If you are confident in your transcription without referring to the QP+MS, use your judgment. "
362
+ # "**Always prioritize accuracy and context from the QP+MS transcript when in doubt about a specific ambiguous character or expression.**\n"
363
+ # )
364
+
365
+ # prompt = f"""You are a high-quality handwritten transcription assistant, performing transcription with a Chain-of-Thought process.
366
+
367
+ # INPUT: This PDF contains a student's handwritten answer sheet.
368
+ # {qpms_guidance}
369
+ # TASK:
370
+ # 1. **THINKING:** Before transcribing each answer, you must document your thought process using the **<think>** tag.
371
+ # - Identify the question ID. If inferred, note why.
372
+ # - Detail any ambiguities encountered (e.g., unclear numbers, symbols, or structure).
373
+ # - Explain how you resolved ambiguities, specifically if you referred to the QP+MS transcript.
374
+ # - If you *did* refer to the QP+MS but decided to keep your original transcription, state this clearly.
375
+ # - If you initially label an answer as 2.a but later realize it aligns better with 2.b based on the marking scheme, you should reassign it to 2.b and briefly explain your reasoning in the <think> tag to maintain clarity and consistency.
376
+
377
+ # *Example Thinking:*
378
+ # <think>
379
+ # - Found Question 3(a).
380
+ # - Noticed '2x' was written ambiguously; it could be '2x' or '21x'.
381
+ # - Referred to QP+MS: The expected answer involves '$21x$'.
382
+ # - Re-examined the handwriting carefully: The student's handwriting strongly appears to be '$2x$' and not '$21x$'.
383
+ # - DECISION: Transcribe exactly what the student wrote: '$2x$'.
384
+ # </think>
385
+ # *Example Thinking 2 (Ambiguity Resolved by MS):*
386
+ # <think>
387
+ # - Found Question INFERRED: 1(b) based on proximity to 1(a).
388
+ # - Noticed the final answer looked like '3.6', but the decimal point was very faint and could be '36'.
389
+ # - Referred to QP+MS: Expected answer is '$3.8$'. Re-examined the student's writing: it appears to be a poorly written '$3.8$' which I initially misread as '$3.6$'.
390
+ # - DECISION: Corrected my transcription to '$3.8$' based on re-evaluation and MS context.
391
+ # </think>
392
+
393
+ # 2. **TRANSCRIPTION:** Transcribe the student's answers in accordance with the markscheme provided. Preserve step order and line breaks.
394
+ # - Attempt to assign each answer to a question ID if the student has labelled it (e.g., "1", "1a", "2(b)", "3").
395
+ # - If the student hasn't labelled answers, segment contiguous answer blocks and attempt to infer question IDs from context — but mark inferred IDs clearly as "**INFERRED: <id>**".
396
+ # - **Enclose all mathematical expressions and single variables in LaTeX dollar delimiters ($...$).**
397
+ # - *Example:* "The area is $A = \pi r^2$ so $3x+5 = 11$ thus $x=2$."
398
+ # - If a diagram/graph is omitted, write **[Graph omitted]**.
399
+ # - Unreadable parts: **[illegible]**.
400
+ # - Unanswered: **[No response]**.
401
+ # - Do NOT recreate diagrams.
402
+
403
+ # Ensure consistency and determinism in formatting so subsequent models can grade directly from this aligned format.
404
+
405
+ # Expected questions (if missing, write NA):
406
+ # {ids_block}
407
+ # -----------------------
408
+ # OUTPUT FORMAT:
409
+ # <think>...</think>
410
+ # Question <id>
411
+ # AS:<transcribed answer or placeholder>
412
+ # <think>...</think>
413
+ # Question <id>
414
+ # AS:<transcribed answer or placeholder>
415
+ # ...
416
+
417
+ # ==== GRAPH FOUND ANSWERS ====
418
+ # Graph found in:
419
+ # - Answer <number> → Page <number>
420
+ # (one per line)
421
+ # ==== END GRAPH FOUND ===="""
422
+
423
+ # return prompt
424
+
425
  def build_as_cot_prompt_with_expected_ids(expected_ids, qpms_text=None):
426
  """
427
  Construct the AS transcription prompt injecting the expected IDs block and graph detection instructions,
428
  modifying it to include a Chain-of-Thought (CoT) section using a <think> tag, and
429
  requiring mathematical expressions to be enclosed in LaTeX dollar delimiters ($...$).
430
+ The full qpms_text, when provided, is embedded directly in the prompt and not skipped.
431
  """
432
  if not expected_ids:
433
  ids_block = "{NA}"
434
  else:
435
  ids_block = "{\n" + "\n".join(expected_ids) + "\n}"
436
 
437
+ qpms_section = ""
438
+ if qpms_text is not None:
439
+ # Include the full QP+MS transcript exactly (strip only leading/trailing whitespace)
440
+ qpms_section = (
441
+ "\nYou are also provided with the full transcript of the Question Paper and Markscheme (QP+MS) below."
442
+ "\nUse it primarily to resolve ambiguous handwriting and to confirm expected answers when needed."
443
+ "\n--- BEGIN QP+MS TRANSCRIPT ---\n"
444
+ f"{qpms_text.strip()}\n"
445
+ "--- END QP+MS TRANSCRIPT ---\n"
446
  )
447
 
448
  prompt = f"""You are a high-quality handwritten transcription assistant, performing transcription with a Chain-of-Thought process.
449
 
450
  INPUT: This PDF contains a student's handwritten answer sheet.
451
+ {qpms_section}
452
  TASK:
453
  1. **THINKING:** Before transcribing each answer, you must document your thought process using the **<think>** tag.
454
  - Identify the question ID. If inferred, note why.
455
  - Detail any ambiguities encountered (e.g., unclear numbers, symbols, or structure).
456
  - Explain how you resolved ambiguities, specifically if you referred to the QP+MS transcript.
457
+ - If you *did* refer to QP+MS but decided to keep your original transcription, state this clearly.
458
+ - If you initially label an answer as 2.a but later realize it aligns better with 2.b based on the marking scheme, reassign it to 2.b and briefly explain your reasoning in the <think> tag.
459
 
460
  *Example Thinking:*
461
  <think>
 
465
  - Re-examined the handwriting carefully: The student's handwriting strongly appears to be '$2x$' and not '$21x$'.
466
  - DECISION: Transcribe exactly what the student wrote: '$2x$'.
467
  </think>
 
 
 
 
 
 
 
468
 
469
+ 2. **TRANSCRIPTION:** Transcribe the student's answers in accordance with the markscheme provided. Preserve step order and line breaks.
470
  - Attempt to assign each answer to a question ID if the student has labelled it (e.g., "1", "1a", "2(b)", "3").
471
+ - If the student hasn't labelled answers, segment contiguous answer blocks and attempt to infer question IDs from context — mark inferred IDs clearly as "**INFERRED: <id>**".
472
  - **Enclose all mathematical expressions and single variables in LaTeX dollar delimiters ($...$).**
473
+ - Example: "The area is $A = \pi r^2$ so $3x+5 = 11$ thus $x=2$."
474
  - If a diagram/graph is omitted, write **[Graph omitted]**.
475
  - Unreadable parts: **[illegible]**.
476
  - Unanswered: **[No response]**.
 
498
 
499
  return prompt
500
 
501
+
502
  def extract_graph_questions_from_ms(text: str):
503
  """Extract graph questions and page numbers from MS transcript."""
504
  clean_text = text.replace("\u00A0", " ").replace("\t", " ")