neurolearn

Sleeping

App Files Files Community

atz21 committed on Oct 30, 2025

Commit

9b0a372

verified ·

1 Parent(s): f999bc3

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -18

app.py CHANGED Viewed

@@ -395,39 +395,83 @@ def extract_question_ids_from_qpms(text: str):
         print("⚠️ No question IDs extracted; will send NA placeholder.")
     return fallback_matches
-def build_as_prompt_with_expected_ids(expected_ids, qpms_text=None):
     """
-    Construct the AS transcription prompt injecting the expected IDs block and graph detection instructions.
     """
     if not expected_ids:
         ids_block = "{NA}"
     else:
         ids_block = "{\n" + "\n".join(expected_ids) + "\n}"
-    refer_text = ""
     if qpms_text:
-        refer_text = (
             "\nYou are also provided with the full transcript of the Question Paper and Markscheme (QP+MS). "
-            "If you encounter ambiguous handwriting (for example, if a number could be '-1.6' or '1.6'), refer to the QP+MS transcript to infer the student's intended answer. "
-            "However, if you are confident in your transcription, you may use your own judgment. "
-            "Always prioritize accuracy and context from the QP+MS transcript when in doubt.\n"
         )
-    prompt = f"""You are a high-quality handwritten transcription assistant.
-INPUT: This PDF contains a student's handwritten answer sheet.{refer_text}
-TASK: Transcribe the student's answers exactly (as text). Preserve step order and line breaks. Attempt to assign each answer to a question ID if the student has labelled it (e.g., "1", "1a", "2(b)", "3"). If the student hasn't labelled answers, segment contiguous answer blocks and attempt to infer question IDs from context — but mark inferred IDs clearly as "INFERRED: <id>"
-Enclose all mathematical expressions in Markdown fenced code blocks (``` triple backticks).
-If a diagram/graph is omitted, write [Graph omitted].
-Unreadable parts: [illegible].
-Unanswered: [No response].
-Do NOT recreate diagrams.
 Ensure consistency and determinism in formatting so subsequent models can grade directly from this aligned format.
 Expected questions (if missing, write NA):
 {ids_block}
 -----------------------
 OUTPUT FORMAT:
 Question <id>
-AS:
-<transcribed answer or placeholder>
-==== GRAPH FOUND ANSWERS ====\nGraph found in:\n- Answer <number> → Page <number>\n(one per line)\n==== END GRAPH FOUND ===="""
     return prompt
 def extract_graph_questions_from_ms(text: str):

         print("⚠️ No question IDs extracted; will send NA placeholder.")
     return fallback_matches
def build_as_cot_prompt_with_expected_ids(expected_ids, qpms_text=None) -> str:
    """
    Construct the answer-sheet (AS) transcription prompt with a Chain-of-Thought section.

    The prompt instructs the model to document its reasoning inside <think> tags
    before each answer, to wrap mathematical expressions in LaTeX dollar
    delimiters ($...$), and to finish with a "GRAPH FOUND ANSWERS" section.

    Args:
        expected_ids: Iterable of question-ID strings to list in the
            "Expected questions" block. When empty or None, the placeholder
            "{NA}" is injected instead.
        qpms_text: Used only as a truthiness flag. When truthy, guidance about
            consulting the QP+MS transcript for ambiguous handwriting is added.
            The text itself is NOT embedded in the returned prompt.

    Returns:
        The fully formatted prompt string.
    """
    if not expected_ids:
        ids_block = "{NA}"
    else:
        # One ID per line, wrapped in literal braces.
        ids_block = "{\n" + "\n".join(expected_ids) + "\n}"

    qpms_guidance = ""
    if qpms_text:
        qpms_guidance = (
            "\nYou are also provided with the full transcript of the Question Paper and Markscheme (QP+MS). "
            "Use this transcript primarily to resolve **ambiguous handwriting** (e.g., if a number could be '$-1.6$' or '$1.6$'). "
            "If you are confident in your transcription without referring to the QP+MS, use your judgment. "
            "**Always prioritize accuracy and context from the QP+MS transcript when in doubt about a specific ambiguous character or expression.**\n"
        )

    # NOTE: the backslash in the LaTeX example below is doubled on purpose.
    # A single backslash followed by "p" is an invalid escape sequence in a
    # non-raw (f-)string and triggers a SyntaxWarning on modern Python; the
    # doubled form yields the exact same runtime text.
    prompt = f"""You are a high-quality handwritten transcription assistant, performing transcription with a Chain-of-Thought process.
INPUT: This PDF contains a student's handwritten answer sheet.
{qpms_guidance}
TASK:
1. **THINKING:** Before transcribing each answer, you must document your thought process using the **<think>** tag.
    - Identify the question ID. If inferred, note why.
    - Detail any ambiguities encountered (e.g., unclear numbers, symbols, or structure).
    - Explain how you resolved ambiguities, specifically if you referred to the QP+MS transcript.
    - If you *did* refer to the QP+MS but decided to keep your original transcription, state this clearly.
    *Example Thinking:*
    <think>
    - Found Question 3(a).
    - Noticed '2x' was written ambiguously; it could be '2x' or '21x'.
    - Referred to QP+MS: The expected answer involves '$21x$'.
    - Re-examined the handwriting carefully: The student's handwriting strongly appears to be '$2x$' and not '$21x$'.
    - DECISION: Transcribe exactly what the student wrote: '$2x$'.
    </think>
    *Example Thinking 2 (Ambiguity Resolved by MS):*
    <think>
    - Found Question INFERRED: 1(b) based on proximity to 1(a).
    - Noticed the final answer looked like '3.6', but the decimal point was very faint and could be '36'.
    - Referred to QP+MS: Expected answer is '$3.8$'. Re-examined the student's writing: it appears to be a poorly written '$3.8$' which I initially misread as '$3.6$'.
    - DECISION: Corrected my transcription to '$3.8$' based on re-evaluation and MS context.
    </think>
2. **TRANSCRIPTION:** Transcribe the student's answers exactly (as text). Preserve step order and line breaks.
    - Attempt to assign each answer to a question ID if the student has labelled it (e.g., "1", "1a", "2(b)", "3").
    - If the student hasn't labelled answers, segment contiguous answer blocks and attempt to infer question IDs from context — but mark inferred IDs clearly as "**INFERRED: <id>**".
    - **Enclose all mathematical expressions and single variables in LaTeX dollar delimiters ($...$).**
        - *Example:* "The area is $A = \\pi r^2$ so $3x+5 = 11$ thus $x=2$."
    - If a diagram/graph is omitted, write **[Graph omitted]**.
    - Unreadable parts: **[illegible]**.
    - Unanswered: **[No response]**.
    - Do NOT recreate diagrams.
Ensure consistency and determinism in formatting so subsequent models can grade directly from this aligned format.
Expected questions (if missing, write NA):
{ids_block}
-----------------------
OUTPUT FORMAT:
<think>...</think>
Question <id>
AS:<transcribed answer or placeholder>
<think>...</think>
Question <id>
AS:<transcribed answer or placeholder>
...
==== GRAPH FOUND ANSWERS ====
Graph found in:
- Answer <number> → Page <number>
(one per line)
==== END GRAPH FOUND ===="""
    return prompt
 def extract_graph_questions_from_ms(text: str):