atz21 committed on
Commit
643922c
·
verified ·
1 Parent(s): 983d970

Update prompts.py

Browse files
Files changed (1) hide show
  1. prompts.py +1413 -231
prompts.py CHANGED
@@ -1,270 +1,1452 @@
1
- """
2
- Prompts for AI Grading System
3
- Contains all system prompts for transcription and grading
4
- """
5
-
6
- # ---------------- TRANSCRIPTION PROMPTS ----------------
7
- QP_MS_TRANSCRIPTION_PROMPT = {
8
- "role": "system",
9
- "content": """You are a high-quality OCR/Transcription assistant.
10
- INPUT: This file is a PDF that first contains the Question Paper and immediately after it the Markscheme.
11
- TASK:
12
- 1. Transcribe EXACTLY all the questions FIRST (with their total marks).
13
- 2. After ALL questions, transcribe the Markscheme exactly, preserving M/A/R notation in brackets.
14
- 3. Always number the questions sequentially (Question 1, Question 2, Question 3, …) **in the order they appear in the PDF**, even if the PDF shows a different number or leaves it blank. Do NOT skip or leave Question: blank. Never start a question other than question 1 (even if it is labelled in pdf as 8 name it 1).
15
- 4. If a question or sub-question is labelled with a letter (e.g., "Q1.a", "Q2(b)", "1 (c)(i)"), transcribe it as "Question 1.a", "Question 2.b", "Question 1.c.i" etc., exactly preserving the hierarchy of sub-question identifiers.
16
- 5. After the markscheme, DETECT and FLAG all questions in the markscheme where a graph/diagram is expected. For each, output the question number and the page number in the format below.
17
- FORMAT:
18
- ==== PAPER TOTAL MARKS ====
19
- <total marks>
20
- ==== QUESTIONS BEGIN ====
21
- Question 1.a
22
- Total Marks: <number>
23
- QP: <question text>
24
- --QUESTION-END--
25
- Question 1.b
26
- Total Marks: <number>
27
- QP: <question text>
28
- --QUESTION-END--
29
- Question 2
30
- Total Marks: <number>
31
- QP: <question text>
32
- --QUESTION-END--
33
- (repeat for all questions in order of appearance)
34
- ==== QUESTIONS END ====
35
- ==== MARKSCHEME BEGIN ====
36
- Answer 1.a:
37
- <exact MS for Q1.a with notations M1, A1, R1 etc>
38
- Answer 1.b:
39
- <exact MS for Q1.b with notations>
40
- Answer 2 :
41
- <exact MS for Q2 with notations>
42
- (repeat for all answers)
43
- ==== MARKSCHEME END ====
44
- ==== GRAPH EXPECTED QUESTIONS ====
45
- Graph expected in:
46
- - Question <number> → Page <number>
47
- (one per line)
48
- ==== END GRAPH EXPECTED ====
49
- """
50
- }
51
-
52
- # ---------------- GRADING PROMPTS ----------------
53
-
54
- # Common grading rules for all subjects
55
- COMMON_GRADING_RULES = """You are an official examiner. Apply the following grading rules precisely and consistently.
56
- ### Mark Abbreviations:
57
- - **M**: Method marks – awarded for correct mathematical procedures, approaches, or techniques
58
- - **A**: Accuracy/Answer marks – awarded for correct final or intermediate answers
59
- - **R**: Reasoning marks – awarded for justifications, explanations, or logical deductions
60
- - **AG**: Answer Given – the answer is provided in the question; award no marks for simply stating it
61
- - **FT**: Follow Through – marks awarded when a student correctly applies a method using their own previous (incorrect) answer
62
- - **MR**: Misread – penalty applied when student misreads a value from the question (deduct from first applicable A-mark only, once per question)
63
- ---
64
- ## Grading Rules
65
- ### Core Principles:
66
- 1. **Award marks using official annotations** (e.g., M1, A2, R1).
67
- 2. **Do not award full marks for answers alone** – check that the required method steps are present.
68
- 3. **A-marks typically depend on M-marks** – an A-mark usually requires the corresponding M-mark to be earned first (unless the markscheme explicitly states otherwise).
69
- 4. **Accept equivalent forms** unless the markscheme specifies exact form (e.g., "simplified form only").
70
- 5. **Apply Follow Through (FT)** when a student uses an incorrect answer correctly in subsequent steps.
71
- 6. **Misread (MR) Penalty**: If a student misreads a numerical value from the question:
72
- - Deduct from the **first applicable A-mark** in that question only
73
- - Apply MR penalty **once per question** (not per sub-question)
74
- - M-marks can still be awarded if the method is correct
75
- - Annotate as: `\\textcolor{red}{A0 (MR applied)}`
76
- ### Formatting & LaTeX Constraints (CRITICAL):
77
- - **Red Text**: Use LaTeX syntax for lost marks or errors. Do NOT use HTML.
78
- - Correct: `\\textcolor{red}{M0}`
79
- - Incorrect: `<span style="color:red">M0</span>`
80
- - **Math Delimiters**: Ensure ALL mathematical expressions, variables, and numbers are enclosed in single dollar signs.
81
- - Correct: `$x^2 + y^2 = 4$`
82
- - Incorrect: x^2 + y^2 = 4
83
- - **Table Integrity**: Ensure table cells contain NO line breaks. Keep descriptions concise on a single line.
84
- - **Highlighting**:
85
- - In the "Awarded" column, if a mark is 0 or lost, format it as `\\textcolor{red}{M0}` or `\\textcolor{red}{A0}`.
86
- - In the "Examiner Notes", if referring to a specific error, you may wrap it in `\\textcolor{red}{...}`.
87
- ### Graph/Diagram Questions:
88
- - When graph/diagram images are provided, describe visual evidence in the "Examiner Notes" column
89
- - Examples: "Correct parabola shape, y-intercept matches", "Line has wrong gradient", "Asymptote missing"
90
- ---
91
- ## Output Format
92
- Produce the following structure for each question/sub-question:
93
- ### Question <1.a>
94
- **Markscheme vs Student Answer**
95
- | Mark ID | Markscheme Expectation | Student's Response | Awarded | Examiner Notes |
96
- |---------|------------------------|-------------------|---------|----------------|
97
- | M1 | Use product rule: $u'v + uv'$ | Student wrote: $u'v + uv'$ | M1 | Correct method applied |
98
- | A1 | $2xe^x + e^x$ | Student answer: $x e^x$ | \\textcolor{red}{A0} | Missing the factor of 2 |
99
- **Total: X/Y**
100
- ---
101
- *(Repeat for all questions)*
102
- ---
103
- ### Examiner's Summary Report
104
- **IMPORTANT**: Group all sub-questions under their parent question. Sum the marks for all sub-parts (e.g., 1.a, 1.b, 1.c) and report as a single entry for Question 1.
105
- **Format Rules for Summary Report**:
106
- - If a question has sub-parts (1.a, 1.b, etc.), group them as "Question 1" with combined marks
107
- - If a question has no sub-parts (just "Question 2"), report it directly
108
- - Assign ONE overall remark per grouped question based on the predominant error type across all sub-parts
109
- - **CRITICAL**: If a student writes "NA", "N/A", "Not Applicable", or similar for a question, assign remark **E** and award 0 marks. **Only when remark **E** is used do we subtract the question's marks from the adjusted total; all other remarks (including **D**) are counted in the total.
110
- - **CRITICAL**: Calculate adjusted total by excluding marks from questions with remark **E** (NA questions)
111
- - Example: If paper total is 63 marks, but Question 8 (6 marks) is marked NA by student:
112
- - Adjusted total = 63 - 6 = 57 marks
113
- - Report as: **Total: <obtained>/<adjusted_total>** (e.g., "Total: 45/57" not "45/63")
114
- | Question Number | Marks | Remark | Feedback |
115
- |-----------------|-------|--------|----------|
116
- | 1 | 10/12 | A | Strong answer, only minor mistake |
117
- | 2 | 0/8 | E | Student wrote "NA" - question not applicable |
118
- | 3 | 7/10 | C | Adequate, but lacked depth/clarity |
119
- | ... | ... | ... | ... |
120
- **Total: <obtained_marks>/<adjusted_max_marks>**
121
- ---
122
- ## Remark Codes (assign ONE per grouped question):
123
- - **A**: All Good – mostly full marks across sub-parts, no major errors
124
- - **B**: Silly Mistake – minor arithmetic/algebraic slips (e.g., $2 + 3 = 6$, sign error in final step)
125
- - **C**: Conceptual Error – wrong formula, incorrect method, fundamental misunderstanding in one or more sub-parts
126
- - **D**: Hard Question - Assigned when the student leaves the question blank, crosses it out, or makes no meaningful attempt.
127
- - **E**: Not Applicable - Assigned only when the question is explicitly marked as "Not Applicable" (NA).
128
-
129
- 3. **Graph images** (if applicable) for questions involving diagrams
130
-
131
- - Match student answers to question IDs from the QP+MS transcript.
132
- - Grade according to the **verbatim markscheme**, but accept mathematically/conceptually equivalent answers (justify in "Examiner Notes").
133
- - For graph questions, use provided images as visual context and describe what you observe.
134
- - Ensure mark IDs in your grading table match those in the markscheme.
135
- - Be consistent: if a student makes the same type of error multiple times, apply the same penalty logic each time.
136
- """
137
 
138
- # Science-specific grading guidelines (from Cambridge IGCSE Mark Scheme)
139
- SCIENCE_SPECIFIC_GUIDELINES = """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
- ## Acronyms and Shorthand
 
 
 
142
 
143
- | Acronym / shorthand | Explanation |
144
- |--------------------|-------------|
145
- | **A mark** | Final answer mark for a fully correct answer including the unit. |
146
- | **C mark** | Compensatory mark awarded when the A mark is not. |
147
- | **B mark** | Independent mark not dependent on other marks. |
148
- | **M mark** | Method mark that must be scored before any linked A mark. |
149
- | **( ) Brackets** | Words not required; contradicting bracketed content negates the mark. |
150
- | **Underlining** | Underlined word or correct synonym must appear; exact word needed for technical terms. |
151
- | **/** or **OR** | Any listed alternative gains credit. |
152
- | **owtte** | Or words to that effect. |
153
- | **ignore** | Incorrect/irrelevant point disregarded and not treated as contradictory. |
154
- | **insufficient** | Not worthy of credit on its own. |
155
- | **CON** | Contradicts a correct point; mark not awarded. |
156
- | **ecf [part]** | Error carried forward if used correctly in later steps. |
157
- | **cao** | Correct answer only. |
158
 
159
- ---
 
 
 
 
 
 
 
 
 
160
 
161
- # Science-Specific Marking Rules (Condensed)
 
162
 
163
- 1. **Keyword Use**
164
- Credit awarded only when keywords are used in correct scientific context.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
 
166
- 2. **Contradictions**
167
- Contradicted points receive no credit.
168
- Irrelevant wrong science is ignored.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
- 3. **Spelling**
171
- Must clearly distinguish between similar syllabus terms (e.g. ethane/ethene, glucagon/glycogen).
172
 
173
- 4. **Error Carried Forward (ECF)**
174
- Incorrect earlier values may receive later credit if used logically and scientifically correctly.
175
 
176
- 5. **List Rule**
177
- - Treat responses as continuous prose.
178
- - Incorrect responses count toward required number; “ignore” items do not.
179
- - Contradictory responses cancel credit.
180
- - Extra responses beyond the required number may be ignored if scientifically wrong.
 
 
 
 
 
 
 
 
 
 
181
 
182
- 6. **Calculation Guidance**
183
- - Full credit for correct answers even without working unless “show working” is required.
184
- - Accept values that round correctly to expected significant figures.
185
- - Standard-form coefficient flexibility allowed if convertible.
186
- - Missing/incorrect units usually invalidate the final calculation mark unless separately credited.
 
 
 
187
 
188
- 7. **Chemical-Equation Guidance**
189
- - Accept multiples/fractions of coefficients unless stated otherwise.
190
- - Ignore state symbols unless required.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
- # Maths grading prompt
196
- MATHS_GRADING_PROMPT = {
197
- "role": "system",
198
- "content": COMMON_GRADING_RULES
199
- }
200
 
201
- # Science grading prompt (includes science-specific guidelines)
202
- SCIENCE_GRADING_PROMPT = {
203
- "role": "system",
204
- "content": COMMON_GRADING_RULES + SCIENCE_SPECIFIC_GUIDELINES
205
- }
206
 
207
- # Economics-specific grading guidelines
208
- ECONOMICS_SPECIFIC_GUIDELINES = """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
 
210
- ## Economics Answering & Marking Guidelines
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
 
212
- ### Core Principles:
213
- 1. **Use correct economic concepts**: Credit answers only when terms (e.g., opportunity cost, demand, inflation) are used accurately and in context.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
 
215
- 2. **Reward developed reasoning, not lists**: A point must show cause → effect (e.g., "higher demand → higher price → higher output"). Lists without explanation earn limited credit.
216
 
217
- 3. **Both sides needed for 'Discuss'**: Award high marks only when the answer presents advantages and disadvantages with economic reasoning.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
 
219
- 4. **Apply the list rule**: For "State two…", only the first two non-contradictory, relevant points count.
 
 
 
 
 
 
220
 
221
- 5. **Diagram marks must match requirements**: Diagrams must include:
222
- - Correctly labelled axes
223
- - Labelled curves
224
- - Correct shifts/movements
225
- - Equilibrium points
226
 
227
- 6. **Do not credit contradictory statements**: If an answer contradicts itself, remove credit for that point.
 
 
 
 
 
 
 
 
228
 
229
- 7. **Allow valid alternative economics**: If the logic is correct and consistent with economic theory, accept it even if wording differs from the markscheme.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
 
231
- ### Example Marking Standards:
 
 
 
 
 
232
 
233
- **Explain question example:**
234
- Question: Explain why a fall in income may reduce the demand for new cars. (2 marks)
 
235
 
236
- - **Good answer (full marks)**: A fall in income reduces consumers' purchasing power (1), making new cars less affordable, so quantity demanded decreases (1).
237
- - **Weak answer**: "People will buy fewer cars." (No reasoning → 0–1 mark.)
238
 
239
- ### Economics-Specific Mark Types:
240
- - **Knowledge marks**: For correct identification of economic concepts
241
- - **Application marks**: For applying economic theory to specific contexts
242
- - **Analysis marks**: For explaining economic relationships and cause-effect chains
243
- - **Evaluation marks**: For weighing up arguments, considering limitations, making judgments
 
 
 
 
 
 
 
 
 
 
 
244
 
245
- """
246
 
247
- # Economics grading prompt
248
- ECONOMICS_GRADING_PROMPT = {
249
- "role": "system",
250
- "content": COMMON_GRADING_RULES + ECONOMICS_SPECIFIC_GUIDELINES
251
- }
252
 
253
- # Function to get the appropriate grading prompt based on subject
254
def get_grading_prompt(subject="maths"):
    """
    Get the appropriate grading prompt based on the subject.

    Args:
        subject (str): "maths", "science", or "economics". Matching is
            case-insensitive and ignores surrounding whitespace. ``None``,
            an empty value, or any unrecognised subject falls back to maths.

    Returns:
        dict: The grading prompt dictionary (a ``{"role", "content"}``
        system message) for the selected subject.
    """
    # Normalise defensively: the original called ``subject.lower()`` directly,
    # which raised AttributeError for None and other non-string values.
    key = str(subject or "maths").strip().lower()
    if key == "science":
        return SCIENCE_GRADING_PROMPT
    if key == "economics":
        return ECONOMICS_GRADING_PROMPT
    # Maths is the default for everything else, as in the original else-branch.
    return MATHS_GRADING_PROMPT
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ import subprocess
5
+ import time
6
+ import shutil
7
+ import img2pdf
8
+ import gradio as gr
9
+ from google import genai # NEW SDK
10
+ from pdf2image import convert_from_path
11
+ from PIL import Image, ImageDraw, ImageFont
12
+ import cv2
13
+ import numpy as np
14
+ from PyPDF2 import PdfReader, PdfWriter
15
+ from prompts import QP_MS_TRANSCRIPTION_PROMPT, get_grading_prompt
16
+ from supabase import create_client, Client
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
+ # ---------------- CONFIG ----------------
19
+ # Multi-API Key Configuration for handling RESOURCE_EXHAUSTED errors
20
class GeminiClientManager:
    """Manages multiple Gemini API keys with manual rotation between them.

    One ``genai.Client`` is created per configured key. The class only tracks
    which client is active; it does not detect quota errors itself, so callers
    must invoke :meth:`rotate_to_next_key` when a request fails.
    """

    # Environment variables consulted, in priority order (first = primary key).
    ENV_VAR_NAMES = ("GEMINI_API_KEY_1", "GEMINI_API_KEY_2", "GEMINI_API_KEY_3")

    def __init__(self):
        """Load the API keys from the environment and build one client per key.

        Raises:
            ValueError: If none of the environment variables holds a usable key.
        """
        raw_keys = (os.getenv(name) for name in self.ENV_VAR_NAMES)
        # Drop unset entries and blank/whitespace-only values, which would
        # otherwise produce a client that can never authenticate.
        self.api_keys = [key.strip() for key in raw_keys if key and key.strip()]

        if not self.api_keys:
            raise ValueError("❌ No API keys found! Please set at least GEMINI_API_KEY_1")

        print(f"✅ Loaded {len(self.api_keys)} Gemini API key(s)")

        # Index into self.clients / self.api_keys (0 = primary).
        self.current_key_index = 0

        # Create clients for all keys up front so rotation is just an index change.
        self.clients = [genai.Client(api_key=key) for key in self.api_keys]

    def get_current_client(self):
        """Return the client for the currently active API key."""
        return self.clients[self.current_key_index]

    def rotate_to_next_key(self):
        """Advance to the next API key, wrapping around after the last one.

        Returns:
            bool: True if the active key changed, False when only one key is
            configured and rotation is impossible.
        """
        if len(self.api_keys) == 1:
            print("⚠️ Only one API key available, cannot rotate")
            return False

        old_index = self.current_key_index
        self.current_key_index = (self.current_key_index + 1) % len(self.api_keys)
        print(f"🔄 Rotating from API key #{old_index + 1} to API key #{self.current_key_index + 1}")
        return True

    def reset_to_primary(self):
        """Make the first (primary) API key active again; no-op if it already is."""
        if self.current_key_index != 0:
            print("🔙 Resetting to primary API key #1")
            self.current_key_index = 0
65
 
66
# Initialize the client manager
client_manager = GeminiClientManager()
# For backward compatibility. NOTE: this is a one-time snapshot of the active
# client; it does not follow later calls to client_manager.rotate_to_next_key().
client = client_manager.get_current_client()
GRID_ROWS, GRID_COLS = 20, 14

# Supabase configuration
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_KEY = os.getenv("SUPABASE_SERVICE_KEY")
SUPABASE_BUCKET = "examfiles"

# Initialize Supabase client (only if credentials are available).
# supabase_client stays None when credentials are missing or initialization
# fails; the upload helpers check for that and skip uploading.
supabase_client = None
if SUPABASE_URL and SUPABASE_SERVICE_KEY:
    try:
        supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)
        print("✅ Supabase client initialized successfully")
    except Exception as e:
        # Best effort: storage is optional, so a failure here must not stop startup.
        print(f"⚠️ Supabase initialization failed: {e}")
else:
    print("⚠️ Supabase credentials not found - file upload to storage disabled")
86
 
87
+ # ---------------- PROMPTS ----------------
88
+ # Prompts are now imported from prompts.py
89
 
90
+ # ---------------- SUPABASE HELPERS ----------------
91
def upload_file_to_supabase(local_path, file_type="unknown", timestamp=None):
    """
    Upload a file to Supabase Storage.

    The object is stored as ``<timestamp>/<basename of local_path>`` in the
    ``SUPABASE_BUCKET`` bucket, overwriting any existing object (upsert).

    Args:
        local_path (str): Local file path
        file_type (str): Type of file (qp, ms, ans, graded, imprinted); used
            only in log messages
        timestamp (str): Unix timestamp for folder organization (optional;
            defaults to the current time)

    Returns:
        str: URL of the uploaded object built from ``SUPABASE_URL`` using the
        public-object path, or None if Supabase is not configured or the
        upload failed. Whether the URL is actually reachable depends on the
        bucket being public, which cannot be verified from here.
    """
    if not supabase_client:
        print("⚠️ Supabase not configured - skipping upload")
        return None

    try:
        if timestamp is None:
            timestamp = str(int(time.time()))

        original_name = os.path.basename(local_path)
        # Use original filename without prefix for cleaner storage
        remote_path = f"{timestamp}/{original_name}"

        print(f"📤 Uploading {file_type} to Supabase: {remote_path}")

        with open(local_path, "rb") as f:
            supabase_client.storage.from_(SUPABASE_BUCKET).upload(
                remote_path,
                f,
                file_options={"upsert": "true"}
            )

        # Strip a trailing slash so the URL never contains "//storage".
        base_url = SUPABASE_URL.rstrip("/")
        public_url = f"{base_url}/storage/v1/object/public/{SUPABASE_BUCKET}/{remote_path}"
        print(f"✅ Uploaded successfully: {public_url}")
        return public_url

    except Exception as e:
        # Deliberate best effort: a storage failure is reported but must not
        # abort the caller's run.
        print(f"❌ Supabase upload failed for {file_type}: {e}")
        return None
131
 
132
def process_and_upload_input_files(qp_file_obj, ms_file_obj, ans_file_obj):
    """
    Process uploaded files and upload them to Supabase using a shared timestamp.

    Args:
        qp_file_obj: Gradio file object (or plain path string) for Question Paper
        ms_file_obj: Gradio file object (or plain path string) for Markscheme
        ans_file_obj: Gradio file object (or plain path string) for Answer Sheet

    Returns:
        tuple: (qp_path, ms_path, ans_path, upload_urls_dict, timestamp).
        Each path is None when the corresponding input is missing; each URL in
        ``upload_urls_dict`` ("qp_url", "ms_url", "ans_url") is None when the
        file was not uploaded.
    """
    print("\n" + "="*60)
    print("📁 PROCESSING INPUT FILES")
    print("="*60)

    # Generate single timestamp for this entire run so all three files land
    # in the same storage folder.
    run_timestamp = str(int(time.time()))
    print(f"🕐 Run timestamp: {run_timestamp}")

    upload_urls = {
        "qp_url": None,
        "ms_url": None,
        "ans_url": None
    }

    def _local_path(file_obj):
        # File objects expose the temp-file path as ``.name``; a plain path
        # string has no such attribute and is returned as-is.
        return getattr(file_obj, "name", file_obj) if file_obj else None

    qp_path = _local_path(qp_file_obj)
    ms_path = _local_path(ms_file_obj)
    ans_path = _local_path(ans_file_obj)

    # Upload to Supabase if configured (all files use same timestamp)
    if supabase_client:
        if qp_path:
            upload_urls["qp_url"] = upload_file_to_supabase(qp_path, "qp", run_timestamp)
        if ms_path:
            upload_urls["ms_url"] = upload_file_to_supabase(ms_path, "ms", run_timestamp)
        if ans_path:
            upload_urls["ans_url"] = upload_file_to_supabase(ans_path, "ans", run_timestamp)

    print("="*60 + "\n")

    return qp_path, ms_path, ans_path, upload_urls, run_timestamp
175
 
 
 
176
 
 
 
177
 
178
+ # ---------------- HELPERS ----------------
179
def parse_md_table(md):
    """Parse a Markdown pipe table into a list of data rows.

    The first two non-blank lines are assumed to be the header and the
    ``|---|`` separator and are skipped.

    Args:
        md (str): Markdown table text.

    Returns:
        list[list[str]]: One list of stripped cell strings per data row.
        Empty interior cells are kept as ``""`` so that columns stay aligned;
        rows whose cells are all empty are dropped. Returns ``[]`` when there
        are fewer than three non-blank lines.
    """
    lines = [line.strip() for line in md.splitlines() if line.strip()]
    if len(lines) < 3:
        return []

    rows = []
    for line in lines[2:]:  # skip header + separator
        # Remove only the outer border pipes; an empty first or last cell
        # ("| | x |") must survive as "".
        if line.startswith("|"):
            line = line[1:]
        if line.endswith("|"):
            line = line[:-1]
        cells = [cell.strip() for cell in line.split("|")]
        # The original filtered out every empty cell, which shifted later
        # columns left whenever a cell was blank.
        if any(cells):
            rows.append(cells)
    return rows
193
 
194
def convert_html_color_spans(md_text):
    """Convert HTML color spans to LaTeX textcolor commands.

    ``<span style="color:red">text</span>`` becomes ``\\textcolor{red}{text}``.
    Matching is case-insensitive, accepts single- or double-quoted ``style``
    attributes, and ignores a trailing ``;`` or further declarations after the
    colour. Hex colours (``#rgb`` / ``#rrggbb``) are emitted with xcolor's
    ``HTML`` model, since ``\\textcolor{#ff0000}`` is not a valid colour name.

    Args:
        md_text (str): Markdown text possibly containing colour spans.

    Returns:
        str: The text with every matching span replaced; other text unchanged.
    """
    pattern = (
        r'<span\s+style=(["\'])\s*color\s*:\s*([^;"\']+?)\s*(?:;[^"\']*)?\1\s*>'
        r'\s*(.*?)\s*</span>'
    )

    def repl(m):
        color = m.group(2).strip()
        text = m.group(3)
        hex_match = re.fullmatch(r'#([0-9a-fA-F]{3}|[0-9a-fA-F]{6})', color)
        if hex_match:
            digits = hex_match.group(1)
            if len(digits) == 3:
                # Expand CSS shorthand (#f00 -> FF0000); xcolor needs six digits.
                digits = ''.join(ch * 2 for ch in digits)
            return fr'\textcolor[HTML]{{{digits.upper()}}}{{{text}}}'
        return fr'\textcolor{{{color}}}{{{text}}}'

    # A function replacement is used so backslashes in the output are not
    # interpreted as regex template escapes.
    return re.sub(pattern, repl, md_text, flags=re.IGNORECASE)
202
 
203
def cleanup_markdown_for_latex(md_text):
    """Clean up markdown text for better LaTeX conversion.

    Two things happen:

    1. A blank line is forced between the bold
       ``**Markscheme vs Student Answer**`` header and a table that follows it.
    2. Common Unicode math symbols are replaced by LaTeX commands (safety net).
       Outside math mode the command is wrapped in ``$...$``; inside an
       existing ``$...$`` / ``$$...$$`` span it is inserted bare, because
       wrapping it again would close the surrounding math span early.

    Args:
        md_text (str): Markdown text.

    Returns:
        str: The cleaned markdown.
    """
    # Ensure spacing between bold headers and tables
    md_text = re.sub(
        r'(\*\*Markscheme vs Student Answer\*\*)\s*(\|)', r'\1\n\n\2', md_text
    )

    # Keys are written as escapes so the table survives encoding mishaps.
    replacements = {
        '\u222b': r'\int ',         # integral sign
        '\u00b2': '^2',             # superscript two
        '\u00b3': '^3',             # superscript three
        '\u00bd': r'\frac{1}{2}',   # one half
        '\u00bc': r'\frac{1}{4}',   # one quarter
        '\u221e': r'\infty',        # infinity
        '\u2264': r'\leq',          # less-than or equal
        '\u2265': r'\geq',          # greater-than or equal
        '\u2260': r'\neq',          # not equal
        '\u00b1': r'\pm',           # plus-minus
        '\u00d7': r'\times',        # multiplication sign
        '\u00f7': r'\div',          # division sign
        # \sqrt requires an argument, so a lone radical sign maps to \surd.
        '\u221a': r'\surd',         # square-root sign
        '\u2211': r'\sum',          # n-ary summation
        '\u220f': r'\prod',         # n-ary product
        '\u2202': r'\partial',      # partial differential
        '\u03c0': r'\pi',
        '\u03b8': r'\theta',
        '\u03b1': r'\alpha',
        '\u03b2': r'\beta',
        '\u03b3': r'\gamma',
        '\u03b4': r'\delta',
        '\u03b5': r'\epsilon',
        '\u03bb': r'\lambda',
        '\u03bc': r'\mu',
        '\u03c3': r'\sigma',
        '\u0394': r'\Delta',
        '\u03a3': r'\Sigma',
        '\u03a9': r'\Omega',
    }

    # Outside math: wrap each command in its own inline-math span.
    text_table = {ord(ch): f'${latex}$' for ch, latex in replacements.items()}
    # Inside math: insert the bare command. Control words get a trailing space
    # so that e.g. "a<=b" becomes "a\leq b" rather than the undefined "\leqb".
    math_table = {
        ord(ch): (
            latex
            if not latex.startswith('\\') or latex.endswith((' ', '}'))
            else latex + ' '
        )
        for ch, latex in replacements.items()
    }

    # Display math, or inline math following Pandoc's rule: the opening "$"
    # is followed by a non-space, the closing "$" is preceded by a non-space
    # and not followed by a digit (so "$5 and $6" is not treated as math).
    math_span = re.compile(
        r'(\$\$.+?\$\$|\$(?=\S)[^$\n]+?(?<=\S)\$(?!\d))', re.DOTALL
    )

    # With a single capturing group, split() alternates text / math pieces:
    # odd indices are math spans.
    pieces = math_span.split(md_text)
    return ''.join(
        piece.translate(math_table if index % 2 else text_table)
        for index, piece in enumerate(pieces)
    )
245
 
246
def escape_latex_special_chars(text):
    """Return *text* with LaTeX-special characters escaped.

    Text that contains a ``$`` or a backslash is assumed to already hold math
    or LaTeX commands and is returned untouched.
    """
    # Leave anything that already looks like LaTeX/math alone.
    already_latex = any(marker in text for marker in ('$', '\\'))
    if already_latex:
        return text

    # Single-pass character mapping; none of the replacement strings is
    # re-scanned, matching the result of the sequential replacements.
    escape_table = str.maketrans({
        '%': r'\%',
        '&': r'\&',
        '#': r'\#',
        '_': r'\_',
        '{': r'\{',
        '}': r'\}',
        '~': r'\textasciitilde{}',
        '^': r'\textasciicircum{}',
    })
    return text.translate(escape_table)
267
 
268
+ def save_as_pdf(text, filename="output.pdf"):
269
+ """
270
+ Convert Markdown text to PDF using Pandoc with pdflatex.
271
+ Extracts the Examiner's Summary Report and places it at the top with enhanced formatting.
272
+ Converts HTML color spans to LaTeX textcolor commands.
273
+
274
+ Args:
275
+ text (str): Markdown content to convert
276
+ filename (str): Output PDF filename
277
+
278
+ Returns:
279
+ str: Path to the generated PDF file
280
+
281
+ Raises:
282
+ Exception: If Pandoc or pdflatex is not available, or conversion fails
283
+ """
284
+ base_name = os.path.splitext(filename)[0]
285
+ temp_md_file = f"{base_name}_input.md"
286
+ temp_tex_file = f"{base_name}_temp.tex"
287
+
288
+ print("\n" + "="*60)
289
+ print("οΏ½ MARKDOWoN TO PDF CONVERSION PROCESS")
290
+ print("="*60)
291
+
292
+ try:
293
+ # Step 1: Extract Summary Report Table
294
+ print("\n[STEP 1/6] Extracting Examiner's Summary Report...")
295
+ summary_pattern = re.compile(
296
+ r"### Examiner's Summary Report\s*\n\n(\|.*?\|)\s*\n\n\*\*Total:\s*(.*?)\*\*",
297
+ re.DOTALL
298
+ )
299
+ summary_match = summary_pattern.search(text)
300
+
301
+ if summary_match:
302
+ summary_table_md = summary_match.group(1)
303
+ summary_total = summary_match.group(2)
304
+ text = summary_pattern.sub("", text)
305
+ print(f" βœ… SUCCESS: Extracted summary report with total: {summary_total}")
306
+ else:
307
+ summary_table_md = ""
308
+ summary_total = ""
309
+ print(" ⚠️ WARNING: No Examiner's Summary Report found in markdown")
310
+
311
+ # Step 2: Clean up markdown
312
+ print("\n[STEP 2/6] Cleaning markdown and converting HTML to LaTeX...")
313
+ text = cleanup_markdown_for_latex(text)
314
+ text = convert_html_color_spans(text)
315
+ print(" βœ… SUCCESS: Markdown cleaned and HTML color spans converted")
316
+
317
+ # Save cleaned markdown
318
+ with open(temp_md_file, 'w', encoding='utf-8') as f:
319
+ f.write(text)
320
+ print(f" πŸ“ Saved cleaned markdown to: {temp_md_file}")
321
+
322
+ # Step 3: Convert MD to LaTeX via Pandoc
323
+ print("\n[STEP 3/6] Converting markdown to LaTeX using Pandoc...")
324
+ pandoc_cmd = [
325
+ "pandoc",
326
+ "--from=markdown",
327
+ "--to=latex",
328
+ "--standalone",
329
+ temp_md_file,
330
+ "-o", temp_tex_file
331
+ ]
332
+ print(f" πŸ”§ Running: {' '.join(pandoc_cmd)}")
333
+
334
+ result = subprocess.run(pandoc_cmd, capture_output=True, check=False)
335
+
336
+ if result.returncode != 0:
337
+ try:
338
+ stderr = result.stderr.decode('utf-8', errors='replace')
339
+ except:
340
+ stderr = str(result.stderr)
341
+ print(f" ❌ FAILED: Pandoc returned error code {result.returncode}")
342
+ print(f" Error details: {stderr[:500]}")
343
+ raise Exception(f"Pandoc conversion failed: {stderr}")
344
+
345
+ if not os.path.exists(temp_tex_file):
346
+ print(f" ❌ FAILED: LaTeX file not created at {temp_tex_file}")
347
+ raise Exception("Pandoc did not create the expected LaTeX file")
348
+
349
+ print(f" βœ… SUCCESS: LaTeX file created at {temp_tex_file}")
350
+
351
+ # Step 4: Modify the generated LaTeX
352
+ print("\n[STEP 4/6] Enhancing LaTeX document...")
353
+ with open(temp_tex_file, "r", encoding="utf-8") as f:
354
+ tex = f.read()
355
+
356
+ tex = tex.replace(
357
+ r"\documentclass{article}",
358
+ r"\documentclass[12pt]{extarticle}"
359
+ )
360
+
361
+ insert_packages = r"""\usepackage[a4paper, margin=1in]{geometry}
362
+ \usepackage{xcolor}
363
+ \usepackage{colortbl}
364
+ \usepackage{booktabs}
365
+ \usepackage{array}
366
+ \usepackage{longtable}
367
+ \renewcommand{\arraystretch}{1.4}
368
+ \newcolumntype{L}[1]{>{\raggedright\arraybackslash}p{#1}}"""
369
+
370
+ tex = tex.replace(r"\begin{document}", insert_packages + "\n\\begin{document}")
371
+ print(" βœ… SUCCESS: Enhanced document class and added packages")
372
+
373
+ # Step 5: Build enhanced LaTeX table for summary
374
+ if summary_table_md:
375
+ print("\n[STEP 5/6] Building enhanced summary table...")
376
+ summary_rows = parse_md_table(summary_table_md)
377
+ print(f" πŸ“Š Parsed {len(summary_rows)} rows from summary table")
378
+
379
+ summary_latex = r"""\section*{Examiner's Summary Report}
380
+ \begin{center}
381
+ \rowcolors{2}{gray!10}{white}
382
+ \begin{tabular}{|c|c|c|L{8cm}|}
383
+ \hline
384
+ \rowcolor{gray!30}
385
+ \textbf{Question} & \textbf{Marks} & \textbf{Remark} & \textbf{Feedback} \\ \hline
386
  """
387
+ for row in summary_rows:
388
+ if len(row) >= 4:
389
+ feedback = row[3]
390
+ if not ('$' in feedback or '\\textcolor' in feedback):
391
+ feedback = feedback.replace('%', r'\%').replace('&', r'\&').replace('#', r'\#')
392
+
393
+ summary_latex += f"{row[0]} & {row[1]} & {row[2]} & {feedback} \\\\ \\hline\n"
394
+
395
+ summary_latex += r"\end{tabular}"
396
+ summary_latex += "\n\\end{center}\n\n"
397
+ summary_latex += f"\\vspace{{0.5cm}}\\noindent\\textbf{{\\Large Overall Score: {summary_total}}}\n\n"
398
+ summary_latex += "\\hrulefill\n\\vspace{1cm}\n\n"
399
+ summary_latex += "\\newpage\n\n"
400
+
401
+ tex = tex.replace(
402
+ r"\begin{document}",
403
+ r"\begin{document}" + "\n\n" + summary_latex
404
+ )
405
+ print(" βœ… SUCCESS: Summary table with zebra striping injected at document top")
406
+ else:
407
+ print("\n[STEP 5/6] Skipping summary table (not found)")
408
+
409
+ with open(temp_tex_file, "w", encoding="utf-8") as f:
410
+ f.write(tex)
411
+
412
+ # Step 6: Compile PDF with pdflatex
413
+ print("\n[STEP 6/6] Compiling PDF with pdflatex...")
414
+ pdflatex_cmd = [
415
+ "pdflatex",
416
+ "-interaction=nonstopmode",
417
+ f"-output-directory={os.path.dirname(os.path.abspath(temp_tex_file)) or '.'}",
418
+ temp_tex_file
419
+ ]
420
+
421
+ print(" πŸ”§ Running pdflatex (pass 1/2)...")
422
+ result1 = subprocess.run(pdflatex_cmd, capture_output=True, check=False)
423
+
424
+ print(" πŸ”§ Running pdflatex (pass 2/2)...")
425
+ result2 = subprocess.run(pdflatex_cmd, capture_output=True, check=False)
426
+
427
+ temp_pdf = temp_tex_file.replace(".tex", ".pdf")
428
+
429
+ if not os.path.exists(temp_pdf):
430
+ print(f" ❌ FAILED: PDF not created at {temp_pdf}")
431
+
432
+ try:
433
+ stderr = result2.stderr.decode('utf-8', errors='replace')
434
+ except:
435
+ stderr = str(result2.stderr)
436
+
437
+ log_file = temp_tex_file.replace(".tex", ".log")
438
+ if os.path.exists(log_file):
439
+ print(f" πŸ“‹ Checking LaTeX log file: {log_file}")
440
+ try:
441
+ with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
442
+ log_content = f.read()
443
+ error_lines = [line for line in log_content.split('\n') if '!' in line]
444
+ if error_lines:
445
+ print(f" ❌ LaTeX Errors found ({len(error_lines)} lines):")
446
+ for err_line in error_lines[:10]:
447
+ print(f" {err_line}")
448
+ stderr += "\n\nLaTeX Errors:\n" + "\n".join(error_lines[:10])
449
+ except Exception as log_err:
450
+ print(f" ⚠️ Could not read log file: {log_err}")
451
+
452
+ raise Exception(f"pdflatex failed to create PDF. Error: {stderr[:1000]}")
453
+
454
+ print(f" βœ… SUCCESS: PDF compiled at {temp_pdf}")
455
+
456
+ # Move output PDF to final filename
457
+ if os.path.exists(filename):
458
+ os.remove(filename)
459
+ os.rename(temp_pdf, filename)
460
+ print(f" πŸ“¦ Moved to final location: {filename}")
461
+
462
+ # Clean up temporary files
463
+ print("\n[CLEANUP] Removing temporary files...")
464
+ cleaned_count = 0
465
+ for ext in [".md", ".tex", ".aux", ".log", ".out"]:
466
+ temp_file = base_name + ext
467
+ if os.path.exists(temp_file):
468
+ os.remove(temp_file)
469
+ cleaned_count += 1
470
+ for prefix in ["_input", "_temp"]:
471
+ temp_file = base_name + prefix + ext
472
+ if os.path.exists(temp_file):
473
+ os.remove(temp_file)
474
+ cleaned_count += 1
475
+ print(f" 🧹 Cleaned up {cleaned_count} temporary files")
476
+
477
+ print("\n" + "="*60)
478
+ print("βœ… PDF CONVERSION COMPLETED SUCCESSFULLY")
479
+ print(f"πŸ“„ Output file: {filename}")
480
+ print("="*60 + "\n")
481
+
482
+ return filename
483
+
484
+ except subprocess.CalledProcessError as e:
485
+ print(f"\n❌ SUBPROCESS ERROR: {e}")
486
+ print(f" STDOUT: {e.stdout}")
487
+ print(f" STDERR: {e.stderr}")
488
+ print("="*60 + "\n")
489
+ raise Exception(f"PDF conversion failed: {e.stderr}")
490
+
491
+ except FileNotFoundError as e:
492
+ print(f"\n❌ FILE NOT FOUND ERROR: {e}")
493
+ print("="*60)
494
+ print("⚠️ REQUIRED TOOLS MISSING")
495
+ print("Please install the following:")
496
+ print(" β€’ pandoc")
497
+ print(" β€’ texlive (or MiKTeX on Windows)")
498
+ print(" β€’ texlive-latex-extra (for extarticle class)")
499
+ print("="*60 + "\n")
500
+ raise Exception(
501
+ "Pandoc or pdflatex not found. Please install:\n"
502
+ " - pandoc\n"
503
+ " - texlive (or MiKTeX on Windows)\n"
504
+ " - texlive-latex-extra (for extarticle class)"
505
+ )
506
+
507
+ except Exception as e:
508
+ print(f"\n❌ UNEXPECTED ERROR: {e}")
509
+ import traceback
510
+ traceback.print_exc()
511
+ print("="*60 + "\n")
512
+ raise
513
+
514
def compress_pdf(input_path, output_path=None, max_size=20*1024*1024):
    """Compress a PDF with Ghostscript when it is larger than ``max_size``.

    Args:
        input_path: Path of the PDF to inspect.
        output_path: Where to write the compressed copy. Defaults to
            ``<input stem>_compressed<ext>`` next to the input.
        max_size: Size threshold in bytes; files at or below it are left alone.

    Returns:
        ``output_path`` when compression ran and the result fits within
        ``max_size``; otherwise ``input_path`` (file missing, already small
        enough, Ghostscript failed, or the result is still too large).
    """
    if output_path is None:
        base, ext = os.path.splitext(input_path)
        output_path = f"{base}_compressed{ext}"

    try:
        size = os.path.getsize(input_path)
    except OSError:
        # Missing or unreadable input: best effort, hand the path back untouched.
        return input_path

    if size <= max_size:
        print(f"ℹ️ Not compressing {input_path} ({size/1024/1024:.2f} MB <= {max_size/1024/1024} MB)")
        return input_path

    print(f"πŸ”Ž Compressing {input_path} ({size/1024/1024:.2f} MB) -> {output_path}")
    gs_cmd = [
        "gs", "-sDEVICE=pdfwrite",
        "-dCompatibilityLevel=1.4",
        "-dPDFSETTINGS=/ebook",
        "-dNOPAUSE", "-dQUIET", "-dBATCH",
        f"-sOutputFile={output_path}", input_path
    ]
    try:
        subprocess.run(gs_cmd, check=True)
        new_size = os.path.getsize(output_path)
    except (OSError, subprocess.SubprocessError) as e:
        # OSError covers a missing `gs` binary or missing output file;
        # SubprocessError covers a non-zero Ghostscript exit status.
        print("❌ Compression error:", e)
        return input_path

    print(f"βœ… Compression done. New size: {new_size/1024/1024:.2f} MB")
    if new_size <= max_size:
        return output_path

    print("⚠️ Compressed file still larger than threshold; returning original")
    return input_path
548
 
549
def upload_to_gemini(path, display_name=None, poll_interval=2, timeout=600):
    """Upload a file through the current Gemini client and wait for processing.

    Args:
        path: Local path of the file to upload.
        display_name: Accepted for interface compatibility; not used by this
            function.
        poll_interval: Seconds to sleep between state checks.
        timeout: Maximum seconds to wait while the file state is
            ``PROCESSING``; ``None`` waits indefinitely.

    Returns:
        The uploaded file object returned by the client once it has left the
        ``PROCESSING`` state.

    Raises:
        TimeoutError: If the file is still ``PROCESSING`` after ``timeout``.
        Exception: If the file ends in the ``FAILED`` state, or any error
            raised by the client (logged, then re-raised).
    """
    print(f"πŸ“€ Uploading {path} to Gemini...")
    try:
        current_client = client_manager.get_current_client()
        uploaded_file = current_client.files.upload(file=path)

        # Wait for processing to complete, but never spin forever on a stuck file.
        print(f"⏳ Waiting for file processing: {uploaded_file.name}")
        deadline = None if timeout is None else time.monotonic() + timeout
        while uploaded_file.state.name == "PROCESSING":
            if deadline is not None and time.monotonic() >= deadline:
                raise TimeoutError(
                    f"File processing timed out after {timeout}s: {uploaded_file.name}"
                )
            time.sleep(poll_interval)
            uploaded_file = current_client.files.get(name=uploaded_file.name)

        if uploaded_file.state.name == "FAILED":
            raise Exception(f"File processing failed: {uploaded_file.name}")

        print(f"βœ… Uploaded and processed: {uploaded_file.name}")
        return uploaded_file
    except Exception as e:
        print(f"❌ Upload failed for {path}: {e}")
        raise
573
+
574
def merge_pdfs(paths, output_path):
    """Append the pages of every PDF in ``paths``, in order, and write them to ``output_path``.

    Returns ``output_path`` so the call can be chained.
    """
    combined = PdfWriter()
    for source_pdf in paths:
        for sheet in PdfReader(source_pdf).pages:
            combined.add_page(sheet)

    with open(output_path, "wb") as sink:
        combined.write(sink)

    return output_path
583
 
584
def _as_content_part(item):
    """Open ``item`` with PIL when it is a path string; otherwise return it unchanged."""
    return Image.open(item) if isinstance(item, str) else item


def _request_text(active_client, model, contents):
    """Run one generate_content call, reset key rotation on success and return the text."""
    response = active_client.models.generate_content(
        model=model,
        contents=contents
    )
    raw_text = response.text
    print(f"πŸ“₯ Received response (chars): {len(raw_text)}")

    # Success! Reset to primary key for next request
    client_manager.reset_to_primary()
    return raw_text


def gemini_generate_content(prompt_text, file_upload_obj=None, image_obj=None, model_name="gemini-2.5-pro", fallback_model="gemini-2.5-flash"):
    """Send a prompt (plus optional file and images) to Gemini and return the response text.

    Args:
        prompt_text: Prompt string; always the first content part.
        file_upload_obj: Optional uploaded-file object appended after the prompt.
        image_obj: Optional image or list of images. Path strings are opened
            with ``Image.open``; any other object is passed through as-is.
        model_name: Model tried first with each API key.
        fallback_model: Model tried with the same key when the primary call
            fails and no key rotation is possible (quota error) or the error
            is not a quota error.

    Returns:
        The ``text`` attribute of the first successful response.

    Raises:
        Exception: When the primary and fallback calls fail and no further
            API key can be tried. The underlying error is chained as
            ``__cause__``.
    """
    contents = [prompt_text]

    if file_upload_obj:
        contents.append(file_upload_obj)

    if image_obj:
        items = image_obj if isinstance(image_obj, list) else [image_obj]
        contents.extend(_as_content_part(item) for item in items)

    print("πŸ“‘ Sending request to Gemini (prompt length:", len(prompt_text), "chars )")

    # Try with all available API keys
    max_attempts = len(client_manager.api_keys)
    attempt = 0

    while attempt < max_attempts:
        current_client = client_manager.get_current_client()
        current_key_num = client_manager.current_key_index + 1

        try:
            print(f"πŸ”‘ Using API key #{current_key_num} with model {model_name}")
            return _request_text(current_client, model_name, contents)
        except Exception as e:
            error_str = str(e)
            print(f"❌ Generation failed with API key #{current_key_num}: {e}")
            # A quota error is recognised purely from the error text.
            quota_error = "429" in error_str or "RESOURCE_EXHAUSTED" in error_str

        if quota_error:
            print(f"⚠️ Quota exhausted for API key #{current_key_num}")
            # Prefer another key over the fallback model when quota ran out.
            if client_manager.rotate_to_next_key():
                attempt += 1
                print(f"πŸ”„ Retrying with next API key (attempt {attempt + 1}/{max_attempts})...")
                continue

        # Either no other key is available (quota case) or the failure was not
        # quota related: try the fallback model with the same key.
        print(f"⚑ Trying fallback model: {fallback_model}")
        try:
            return _request_text(current_client, fallback_model, contents)
        except Exception as e2:
            print(f"❌ Fallback also failed: {e2}")
            if quota_error:
                raise Exception(f"All API keys exhausted. Error: {e2}") from e2
            # If we have more keys, try them
            if attempt < max_attempts - 1:
                client_manager.rotate_to_next_key()
                attempt += 1
                print(f"πŸ”„ Trying next API key (attempt {attempt + 1}/{max_attempts})...")
                continue
            raise Exception(f"All attempts failed. Last error: {e2}") from e2

    # If we exhausted all attempts
    raise Exception(f"❌ All {max_attempts} API key(s) exhausted. Please check your quota or try again later.")
686
 
 
687
 
688
+ # ---------------- PARSERS ----------------
689
def extract_question_ids_from_qpms(text: str):
    """Extract the ordered, de-duplicated question IDs from a QP+MS transcript.

    Lines of the form ``Question <id>`` / ``Question: <id>`` are preferred.
    When none exist, lines that start with a number (``1.``, ``2)``,
    ``3(a)``, ...) are used instead.

    Args:
        text: Transcript text.

    Returns:
        List of ID strings in first-appearance order. An ID repeated later in
        the transcript is reported once. Empty when nothing matches.
    """
    print("πŸ”Ž Extracting question IDs from QP+MS transcript using regex...")

    clean_text = text.replace("\u00A0", " ").replace("\t", " ")

    primary_matches = re.findall(
        r"^\s*Question\s*[:\s]\s*([\dA-Za-z.()]+)", clean_text, re.MULTILINE
    )
    # dict.fromkeys keeps first-seen order while dropping repeats.
    primary_matches = list(dict.fromkeys(primary_matches))
    if primary_matches:
        print(f"βœ… Extracted {len(primary_matches)} question IDs from explicit 'Question X' lines.")
        print("IDs:", primary_matches)
        return primary_matches

    fallback_matches = re.findall(
        r"^\s*(\d+(?:[.)]|\([a-zA-Z0-9]+\))?[a-zA-Z0-9]*)", clean_text, re.MULTILINE
    )
    fallback_matches = list(dict.fromkeys(fallback_matches))
    if fallback_matches:
        print(f"βœ… Extracted {len(fallback_matches)} question IDs (fallback numbered lists).")
        print("IDs:", fallback_matches)
    else:
        print("⚠️ No question IDs extracted; will send NA placeholder.")
    return fallback_matches
708
 
709
def build_as_cot_prompt_with_expected_ids(expected_ids, qpms_text=None):
    """
    Build the answer-sheet (AS) transcription prompt.

    The prompt asks for a <think> section before each answer, LaTeX dollar
    delimiters around mathematics, explicit NA / no-response placeholders and
    a trailing graph-detection block. ``expected_ids`` is rendered as a
    brace-wrapped list (``{NA}`` when empty); when ``qpms_text`` is not None
    the stripped QP+MS transcript is embedded as reference material.
    """
    ids_block = "{\n" + "\n".join(expected_ids) + "\n}" if expected_ids else "{NA}"

    if qpms_text is None:
        qpms_section = ""
    else:
        qpms_section = "".join([
            "\nYou are also provided with the full transcript of the Question Paper and Markscheme (QP+MS) below.",
            "\nUse it primarily to resolve ambiguous handwriting and to confirm expected answers when needed.",
            "\n--- BEGIN QP+MS TRANSCRIPT ---\n",
            f"{qpms_text.strip()}\n",
            "--- END QP+MS TRANSCRIPT ---\n",
        ])

    return f"""You are a high-quality handwritten transcription assistant, performing transcription with a Chain-of-Thought process.
INPUT: This PDF contains a student's handwritten answer sheet.
{qpms_section}
TASK:
1. **THINKING:** Before transcribing each answer, document your thought process inside a **<think>** tag.
- Identify the question ID. If inferred, note why.
- Detail any ambiguities (unclear numbers, symbols, or structures).
- Explain how ambiguities were resolved, including whether the QP+MS transcript was consulted.
- If QP+MS was consulted but you chose not to change the transcription, state this.
- If the initial question label was incorrect (e.g., 2.a vs 2.b), correct it and briefly explain the reasoning in <think>.
*Example Thinking:*
<think>
- Found Question 3(a).
- The term could be '$2x$' or '21x'.
- Markscheme uses '$21x$', but handwriting matches '$2x$'.
- Decision: transcribe '$2x$'.
</think>

2. **TRANSCRIPTION:** Transcribe the student's answers directly and faithfully.
- Assign each answer to a labelled question ID when present.
- For unlabeled answers, segment logically and mark inferred IDs as "**INFERRED: <id>**".
- **Mathematical expressions and standalone variables must appear inside LaTeX dollar delimiters ($...$).**
- If a diagram/graph is omitted, write **[Graph omitted]**.
- If handwriting is unreadable: **[illegible]**.

**ANSWER-INTERPRETATION RULES:**
- If the student writes β€œNA”, β€œN/A”, β€œNot Applicable”, or clear equivalents β†’ record exactly as **NA**.
- If the student leaves the space blank, crosses it out, makes no meaningful attempt, or provides no answer β†’ record **[No response]**.

Ensure deterministic formatting so subsequent models can grade directly from this aligned format.

Expected questions (if missing, write NA):
{ids_block}
-----------------------
OUTPUT FORMAT:
<think>...</think>
Question <id>
AS:<transcribed answer or placeholder>
<think>...</think>
Question <id>
AS:<transcribed answer or placeholder>
...
==== GRAPH FOUND ANSWERS ====
Graph found in:
- Answer <number> β†’ Page <number>
(one per line)
==== END GRAPH FOUND ===="""
781
 
 
 
 
 
 
782
 
783
+
784
def extract_graph_questions_from_ms(text: str):
    """Extract graph questions and page numbers from the MS transcript.

    Reads the block between ``==== GRAPH EXPECTED QUESTIONS ====`` and
    ``==== END GRAPH EXPECTED ====`` and parses lines shaped like
    ``- Question <id> <arrow> Page <n>``.

    Args:
        text: QP+MS transcript text.

    Returns:
        Dict mapping question ID (str) to page number (int); empty when the
        block is missing or holds no parsable line.
    """
    clean_text = text.replace("\u00A0", " ").replace("\t", " ")
    match = re.search(r"==== GRAPH EXPECTED QUESTIONS ====\s*(.*?)\s*==== END GRAPH EXPECTED ====",
                      clean_text, re.S)
    graph_dict = {}
    if not match:
        return graph_dict

    # The model does not always reproduce the arrow glyph from the prompt, so
    # accept the Unicode arrow, its mis-decoded form and the ASCII spellings.
    arrow = "|".join(re.escape(a) for a in ("\u2192", "β†’", "->", "=>"))
    line_pattern = re.compile(
        r"- Question\s+([\dA-Za-z.()]+)\s*(?:" + arrow + r")\s*Page\s*(\d+)"
    )

    for line in match.group(1).splitlines():
        q_match = line_pattern.match(line.strip())
        if q_match:
            q_id, page = q_match.groups()
            graph_dict[q_id] = int(page)
    return graph_dict
800
+
801
def extract_graph_answers_from_as(text: str):
    """Extract graph answers and page numbers from the AS transcript.

    Reads the block between ``==== GRAPH FOUND ANSWERS ====`` and
    ``==== END GRAPH FOUND ====`` and parses lines shaped like
    ``- Answer <id> <arrow> Page <n>``.

    Args:
        text: Answer-sheet transcript text.

    Returns:
        Dict mapping answer ID (str) to page number (int); empty when the
        block is missing or holds no parsable line.
    """
    clean_text = text.replace("\u00A0", " ").replace("\t", " ")
    block = re.search(r"==== GRAPH FOUND ANSWERS ====\s*(.*?)\s*==== END GRAPH FOUND ====",
                      clean_text, re.S)
    graph_dict = {}
    if not block:
        return graph_dict

    # The model does not always reproduce the arrow glyph from the prompt, so
    # accept the Unicode arrow, its mis-decoded form and the ASCII spellings.
    arrow = "|".join(re.escape(a) for a in ("\u2192", "β†’", "->", "=>"))
    line_pattern = re.compile(
        r"- Answer\s+([\dA-Za-z.()]+)\s*(?:" + arrow + r")\s*Page\s*(\d+)"
    )

    for line in block.group(1).splitlines():
        match = line_pattern.match(line.strip())
        if match:
            ans_id, page = match.groups()
            graph_dict[ans_id] = int(page)
    return graph_dict
816
+
817
def extract_marks_from_grading(grading_text):
    """
    Parse the grading markdown and extract marks per question from the Awarded column only.

    The text is split on ``### Question`` headings. Each question block ends
    at the next level 1-3 markdown heading, so tables that follow the last
    question (for example the summary report) are not scanned. Optional angle
    brackets around the ID (``### Question <1.a>``) are dropped.

    Args:
        grading_text: Full grading markdown.

    Returns:
        ``{"grading": [{"question": <id>, "marks_awarded": [<codes>]}, ...]}``
        with one entry per question heading, in document order.
    """
    print("πŸ”Ž Extracting awarded marks from grading output...")
    grading_json = {"grading": []}

    section_end = re.compile(r"^#{1,3}\s", re.MULTILINE)
    id_pattern = re.compile(r"([0-9]+(?:[a-zA-Z]|\([^)]+\)|(?:\.[a-zA-Z0-9]+))*)")
    mark_code = re.compile(r"\b([MABCR]\d+)\b")

    question_blocks = re.split(r"###\s*Question\s+", grading_text)
    for block in question_blocks[1:]:
        # Stop at the next heading so a following section's tables are not
        # attributed to this question.
        boundary = section_end.search(block)
        if boundary:
            block = block[:boundary.start()]

        stripped_lines = block.strip().splitlines()
        first_line = stripped_lines[0].strip() if stripped_lines else ""
        candidate = first_line.lstrip("<").strip()
        q_id_match = id_pattern.match(candidate)
        if q_id_match:
            q_id = q_id_match.group(1).strip()
        else:
            q_id = candidate.split()[0].rstrip(">") if candidate else ""

        # Extract marks only from the "Awarded" column (4th column in the table)
        awarded = []
        for line in block.split('\n'):
            if '|' not in line:
                continue
            parts = [p.strip() for p in line.split('|')]
            # Data rows only: at least 5 cells and not the |---| separator row.
            if len(parts) >= 5 and not parts[1].startswith('-'):
                # Index 4 because the leading '|' yields an empty first cell.
                awarded.extend(mark_code.findall(parts[4]))

        grading_json["grading"].append({
            "question": q_id,
            "marks_awarded": awarded
        })
    print("βœ… Extracted grading marks for", len(grading_json["grading"]), "question blocks.")
    print(json.dumps(grading_json, indent=2))
    return grading_json
853
+
854
def check_and_correct_total_marks(grading_text):
    """
    Verify the total in the Examiner's Summary Report against the sum of the
    individual question totals, and rewrite it when they disagree.

    Each ``### Question <id>`` block is searched on its own for its first
    ``**Total: a/b**`` line. A block with no total is reported and skipped,
    so it cannot pick up the next question's or the summary's total.

    Args:
        grading_text (str): The full grading markdown text

    Returns:
        tuple: (corrected_text, calculated_awarded, calculated_possible, was_corrected)
    """
    print("\n" + "="*60)
    print("πŸ” VERIFYING TOTAL MARKS IN SUMMARY REPORT")
    print("="*60)

    question_marks = {}
    calculated_total_awarded = 0
    calculated_total_possible = 0

    # Matches both "### Question <1.a>" and "### Question 1.a"; the ID may mix
    # dotted and parenthesised parts, e.g. 1.a, 2(b), 3(c)(ii).
    question_heading = re.compile(
        r"### Question\s*<?([0-9]+(?:\.[a-z0-9]+|\([a-z0-9]+\))*)>?",
        re.IGNORECASE
    )
    next_heading = re.compile(r"^#{1,3}\s", re.MULTILINE)
    block_total = re.compile(r"\*\*Total:\s*(\d+)/(\d+)\*\*", re.IGNORECASE)

    headings = list(question_heading.finditer(grading_text))
    for index, heading in enumerate(headings):
        # A block ends at the next question or the next level 1-3 heading,
        # whichever comes first.
        block_end = headings[index + 1].start() if index + 1 < len(headings) else len(grading_text)
        following = next_heading.search(grading_text, heading.end())
        if following:
            block_end = min(block_end, following.start())

        question_id = heading.group(1).strip()
        total = block_total.search(grading_text, heading.end(), block_end)
        if not total:
            print(f"⚠️ Warning: No '**Total: x/y**' line found for Question {question_id}; skipping.")
            continue

        awarded = int(total.group(1))
        possible = int(total.group(2))
        question_marks[question_id] = {'awarded': awarded, 'possible': possible}
        calculated_total_awarded += awarded
        calculated_total_possible += possible

    print(f"\nExtracted marks from {len(question_marks)} questions:")
    for q_id, marks in question_marks.items():
        print(f" Question {q_id}: {marks['awarded']}/{marks['possible']}")

    print("\nπŸ“ˆ Calculated totals from individual questions:")
    print(f" Awarded: {calculated_total_awarded}")
    print(f" Possible: {calculated_total_possible}")

    # Find the summary report section
    summary_report_start = grading_text.find("### Examiner's Summary Report")
    if summary_report_start == -1:
        print("⚠️ Warning: Could not find '### Examiner's Summary Report' section.")
        return grading_text, calculated_total_awarded, calculated_total_possible, False

    summary_section = grading_text[summary_report_start:]
    summary_total_pattern = re.compile(r"(\*\*Total:\s*)(\d+)/(\d+)(\*\*)")
    summary_match = summary_total_pattern.search(summary_section)

    if not summary_match:
        print("⚠️ Warning: Could not find overall total in summary report.")
        return grading_text, calculated_total_awarded, calculated_total_possible, False

    original_summary_awarded = int(summary_match.group(2))
    original_summary_possible = int(summary_match.group(3))
    print(f"\nπŸ“‹ Original summary report total: {original_summary_awarded}/{original_summary_possible}")

    # Check for discrepancies
    corrected_report_text = grading_text
    total_mismatch = False

    if calculated_total_awarded != original_summary_awarded:
        print("\n❌ DISCREPANCY FOUND in awarded marks!")
        print(f" Calculated: {calculated_total_awarded}")
        print(f" Reported: {original_summary_awarded}")
        total_mismatch = True

    if calculated_total_possible != original_summary_possible:
        print("\n❌ DISCREPANCY FOUND in possible marks!")
        print(f" Calculated: {calculated_total_possible}")
        print(f" Reported: {original_summary_possible}")
        total_mismatch = True

    if total_mismatch:
        print("\nπŸ”§ CORRECTING summary total:")
        print(f" FROM: {original_summary_awarded}/{original_summary_possible}")
        print(f" TO: {calculated_total_awarded}/{calculated_total_possible}")

        # Correct only in the summary section; question totals stay untouched.
        corrected_summary_section = summary_total_pattern.sub(
            rf"\g<1>{calculated_total_awarded}/{calculated_total_possible}\g<4>",
            summary_section,
            count=1
        )

        corrected_report_text = grading_text[:summary_report_start] + corrected_summary_section
        print("βœ… Total marks corrected successfully!")
    else:
        print("\nβœ… Total marks are CORRECT - no correction needed!")

    print("="*60 + "\n")

    return corrected_report_text, calculated_total_awarded, calculated_total_possible, total_mismatch
958
+
959
+ # ---------------- MAPPING/IMPRINT HELPERS ----------------
960
def ask_gemini_for_mapping_batch(image_paths, grading_json, expected_ids=None, rows=GRID_ROWS, cols=GRID_COLS):
    """
    Send multiple grid-numbered page images to Gemini in one request and parse
    the returned question-to-cell mapping.

    Args:
        image_paths: Paths of the page images for this batch.
        grading_json: Grading structure embedded in the prompt as JSON.
        expected_ids: Optional list of question IDs listed in the prompt.
        rows: Grid row count stated in the prompt.
        cols: Grid column count stated in the prompt.

    Returns:
        The parsed JSON array from the response (the prompt asks for items
        like ``{"page": 1, "question": "1(a)", "cell_number": 15}``), or ``[]``
        when no array is found or it cannot be parsed.

    Raises:
        Whatever the fallback model call raises when it also fails.
    """
    ids_block = "{NA}"
    if expected_ids:
        ids_block = "{\n" + "\n".join(expected_ids) + "\n}"

    prompt = f"""You are an exam marker. Your role is to identify where each question begins on each page.
The pages are divided into a {rows} x {cols} grid. Each cell has a RUNNING NUMBER label.
For each question in the grading JSON, return the cell NUMBER where the FIRST STEP of that question begins.
⚠ IMPORTANT RULES:
- Do not place marks inside another question's answer area.
- Prefer placing the marks in a BLANK cell immediately to the RIGHT of the answer step. If no blank cell is available to the right, then place in a blank cell to the LEFT.
- Never place marks above or below the answer.
- Each question should have unique cell number
- If a question serial number is visible in the answer image, you must mandatorily identify the corresponding question using the grading JSON.
IMPORTANT: For your help i have provided u questions that u can expect in the images:
{ids_block}
Return JSON only, like:
[{{"page": 1, "question": "1(a)", "cell_number": 15}}, ...]
Grading JSON:
{json.dumps(grading_json, indent=2)}"""

    images = []
    try:
        for path in image_paths:
            images.append(Image.open(path))

        print(f"πŸ“‘ Sending batch mapping request for {len(image_paths)} pages to Gemini...")

        contents = [prompt] + images
        # NOTE(review): this uses the module-level `client`, not
        # client_manager like the other Gemini helpers - confirm intended.
        try:
            response = client.models.generate_content(
                model="gemini-2.5-flash",
                contents=contents
            )
            raw_text = response.text
        except Exception as primary_error:
            # `except Exception` (not a bare except) so Ctrl-C / SystemExit
            # are not swallowed into a silent retry.
            print(f"⚠️ Trying fallback model for mapping... ({primary_error})")
            response = client.models.generate_content(
                model="gemini-2.5-flash-preview-09-2025",
                contents=contents
            )
            raw_text = response.text
    finally:
        # Release the image file handles whether or not the request worked.
        for image in images:
            image.close()

    # An empty response text would otherwise crash len() below.
    raw_text = raw_text or ""

    print("πŸ“₯ Batch mapping response (chars):", len(raw_text))
    print("πŸ”Ž Gemini raw batch output:")
    print(raw_text)

    try:
        match = re.search(r'(\[.*\])', raw_text, re.DOTALL)
        if match:
            mapping = json.loads(match.group(1))
            print(f"βœ… Parsed Gemini batch mapping for {len(image_paths)} pages")
            return mapping
        print("❌ Failed to find JSON array in response")
        return []
    except Exception as e:
        print(f"❌ Failed to parse Gemini JSON mapping: {e}")
        return []
1020
+
1021
def normalize_question_id(qid):
    """
    Normalize a question ID to dotted form for matching.

    Every parenthesised part becomes a dotted part:
    - "1(a)" -> "1.a"
    - "2(c).i" -> "2.c.i"
    - "1(a)(ii)" -> "1.a.ii"
    - "3.d.ii" -> "3.d.ii" (already normalized)

    Args:
        qid: Question ID. Falsy values (None, "") are returned unchanged;
            other non-string values are converted with ``str``.

    Returns:
        The normalized ID string, or ``qid`` itself when it is falsy.
    """
    if not qid:
        return qid

    normalized = str(qid).strip()

    # "(x)" -> ".x" for every group, so nested and multi-letter parts work too.
    normalized = re.sub(r"\s*\(\s*([A-Za-z0-9]+)\s*\)", r".\1", normalized)

    # Collapse accidental ".." runs left by the substitution.
    normalized = re.sub(r"\.{2,}", ".", normalized)

    return normalized
1039
+
1040
def _marks_for_question(grading_entries, qid, normalized_qid):
    """Return the awarded-mark codes for a mapped question, or [] when nothing matches."""
    lowered = normalized_qid.lower()
    # Tried in order: exact normalized, case-insensitive, original spelling,
    # then both sides normalized.
    matchers = (
        lambda stored: stored == normalized_qid,
        lambda stored: stored.lower() == lowered,
        lambda stored: stored == qid,
        lambda stored: normalize_question_id(stored).lower() == lowered,
    )
    for matches in matchers:
        for entry in grading_entries:
            if matches(str(entry["question"])) and entry["marks_awarded"]:
                return entry["marks_awarded"]
    return []


def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, expected_ids=None, rows=GRID_ROWS, cols=GRID_COLS):
    """
    Render the answer PDF to images, ask Gemini where each question starts on
    a numbered grid, draw the awarded marks at those cells and write the
    annotated pages to a new PDF.

    Args:
        pdf_path: Answer-sheet PDF to annotate.
        grading_json: ``{"grading": [{"question": ..., "marks_awarded": [...]}]}``.
        output_pdf: Path of the annotated PDF to write.
        expected_ids: Optional question IDs passed on to the mapping prompt.
        rows: Number of grid rows drawn on each page.
        cols: Number of grid columns drawn on each page.

    Returns:
        The path returned by ``compress_pdf(output_pdf)``.
    """
    print("πŸ“„ Converting answer PDF to images for imprinting...")
    pages = convert_from_path(pdf_path, dpi=100)
    annotated_page_paths = []
    temp_grid_images = []

    # The label font does not depend on the page, so load it once.
    try:
        num_font = ImageFont.truetype("arial.ttf", 20)
    except Exception:
        num_font = ImageFont.load_default()

    for p_index, page in enumerate(pages):
        img = page.convert("RGB")
        w, h = img.size
        cell_w, cell_h = w / cols, h / rows

        draw = ImageDraw.Draw(img)

        # Write the running cell number at the centre of every grid cell.
        cell_num = 1
        for r in range(rows):
            for c in range(cols):
                x = int(c * cell_w + cell_w / 2)
                y = int(r * cell_h + cell_h / 2)
                text = str(cell_num)
                bbox = draw.textbbox((0, 0), text, font=num_font)
                tw = bbox[2] - bbox[0]
                th = bbox[3] - bbox[1]
                draw.text((x - tw/2, y - th/2), text, fill="black", font=num_font)
                cell_num += 1

        temp_path = f"page_{p_index+1}_grid.png"
        img.save(temp_path, "PNG")
        temp_grid_images.append(temp_path)
        print("πŸ›° Created grid image:", temp_path)

    print("πŸ“‘ Sending page images to Gemini in batches for mapping...")
    batch_size = 10
    all_mappings = []

    for start in range(0, len(temp_grid_images), batch_size):
        batch_paths = temp_grid_images[start:start+batch_size]
        batch_mapping = ask_gemini_for_mapping_batch(batch_paths, grading_json, expected_ids, rows, cols)
        for entry in batch_mapping:
            if not isinstance(entry, dict):
                continue
            page_in_batch = entry.get("page")
            # The model only sees this batch's images, so its page numbers
            # presumably restart at 1 per batch; shift them to document page
            # numbers. Values outside 1..len(batch) are left untouched.
            if start and isinstance(page_in_batch, int) and 1 <= page_in_batch <= len(batch_paths):
                entry = {**entry, "page": page_in_batch + start}
            all_mappings.append(entry)
        print(f"βœ… Processed batch {start//batch_size + 1}: pages {start+1}-{start+len(batch_paths)}")

    grading_entries = grading_json.get("grading", [])

    print("πŸ–Š Annotating pages with marks...")
    for p_index, page in enumerate(pages):
        page_num = p_index + 1
        page_img = page.convert("RGB")
        img_cv = np.array(page_img)
        img_cv = cv2.cvtColor(img_cv, cv2.COLOR_RGB2BGR)
        h, w, _ = img_cv.shape
        cell_w_px, cell_h_px = w / cols, h / rows

        page_mappings = [m for m in all_mappings if m.get("page") == page_num]

        for item in page_mappings:
            qid = item.get("question")
            cell_number = item.get("cell_number")
            if qid is None or cell_number is None:
                continue

            # The mapping comes from model output: tolerate numeric IDs and
            # reject cell numbers that are not valid grid positions.
            qid = str(qid)
            try:
                cell_number = int(cell_number)
            except (TypeError, ValueError):
                print(f"⚠️ Ignoring non-numeric cell '{cell_number}' for question '{qid}' on page {page_num}")
                continue
            if not 1 <= cell_number <= rows * cols:
                print(f"⚠️ Ignoring out-of-range cell {cell_number} for question '{qid}' on page {page_num}")
                continue

            # Normalize the question ID from Gemini mapping
            normalized_qid = normalize_question_id(qid)
            marks_list = _marks_for_question(grading_entries, qid, normalized_qid)

            marks_text = ",".join(marks_list) if marks_list else "?"

            if marks_text == "?":
                print(f"⚠️ No marks found for question '{qid}' (normalized: '{normalized_qid}') on page {page_num}")

            row = (cell_number - 1) // cols
            col = (cell_number - 1) % cols

            # Anchor the text in the right-hand quarter of the cell, vertically centred.
            x_c = int((col + 1) * cell_w_px - cell_w_px / 4)
            y_c = int((row + 0.5) * cell_h_px)

            font_scale = max(1.0, min(2.0, cell_h_px / 40.0))
            thickness = max(2, int(font_scale * 2))
            cv2.putText(img_cv, marks_text, (x_c, y_c), cv2.FONT_HERSHEY_SIMPLEX,
                        font_scale, (0, 0, 255), thickness, cv2.LINE_AA)
            print(f"πŸ–Š Marks annotated for page {page_num}, question {qid}: {marks_text}")

        annotated_path = f"annotated_page_{page_num}.png"
        cv2.imwrite(annotated_path, img_cv)
        annotated_page_paths.append(annotated_path)
        print("βœ… Annotated page saved:", annotated_path)

    print("πŸ“‘ Merging annotated pages into final PDF...")
    with open(output_pdf, "wb") as f:
        f.write(img2pdf.convert(annotated_page_paths))

    compressed = compress_pdf(output_pdf)
    print("πŸ“‘ Imprinted PDF saved to:", compressed)
    return compressed
1151
+
1152
def extract_pdf_pages_as_images(pdf_path, page_numbers, prefix):
    """
    Render selected pages (1-based) of a PDF to PNG files.

    Duplicates are removed and pages are processed in ascending order. When
    the page count can be read, out-of-range numbers are skipped. Each page is
    rendered on its own, so pages between the requested ones are never
    rasterised.

    Args:
        pdf_path: PDF to read.
        page_numbers: Iterable of 1-based page numbers.
        prefix: Output files are named ``<prefix>_page_<n>.png``.

    Returns:
        List of written PNG paths; empty when nothing could be extracted.
    """
    if not page_numbers:
        print("⚠️ No page numbers provided for extraction")
        return []

    unique_pages = sorted(set(page_numbers))

    # First, get the total page count to validate requested pages
    try:
        from PyPDF2 import PdfReader
        total_pages = len(PdfReader(pdf_path).pages)
        print(f"πŸ“„ PDF has {total_pages} total pages")

        # Filter out invalid page numbers
        valid_pages = [p for p in unique_pages if 1 <= p <= total_pages]
        invalid_pages = [p for p in unique_pages if p not in valid_pages]

        if invalid_pages:
            print(f"⚠️ Skipping invalid page numbers (out of range): {invalid_pages}")

        if not valid_pages:
            print(f"❌ No valid pages to extract from {pdf_path}")
            return []

        unique_pages = valid_pages
    except Exception as e:
        # Validation is best effort; fall through and let rendering decide.
        print(f"⚠️ Could not validate page numbers: {e}. Proceeding with extraction...")

    out_paths = []
    for page_num in unique_pages:
        try:
            rendered = convert_from_path(pdf_path, dpi=200, first_page=page_num, last_page=page_num)
        except Exception as e:
            print(f"❌ Failed to convert PDF pages to images: {e}")
            continue

        if not rendered:
            print(f"⚠️ Page {page_num} not found in extracted images. Skipping...")
            continue

        try:
            out_path = f"{prefix}_page_{page_num}.png"
            rendered[0].save(out_path, "PNG")
            print(f"πŸ“€ Extracted graph page {page_num} from {pdf_path} as {out_path}")
            out_paths.append(out_path)
        except Exception as e:
            print(f"❌ Failed to save page {page_num}: {e}")
            continue

    return out_paths
1212
+
1213
+ # ---------------- PIPELINE ----------------
1214
def align_and_grade_pipeline(qp_path, ms_path, ans_path, subject="Maths", imprint=False, run_timestamp=None):
    """
    Transcribe, grade and (optionally) imprint one student submission.

    Stages, in order:
      1. Each input PDF is passed through compress_pdf; QP and MS are merged.
      2. The merged QP+MS and the answer sheet are uploaded to Gemini.
      3. QP+MS is transcribed, then the answer sheet is transcribed using the
         question IDs found in the QP+MS transcript. Pages flagged as
         containing graphs are rendered to images for both documents.
      4. Both transcripts (plus any graph images) are sent for grading, the
         totals are checked via check_and_correct_total_marks, and the result
         is saved as a PDF.
      5. If ``imprint`` is true, marks are imprinted on the answer sheet.
      6. If ``supabase_client`` is truthy, output PDFs are uploaded.

    Intermediate artefacts are written to ``debug_*`` files in the current
    working directory.

    Args:
        qp_path: Path to the Question Paper PDF.
        ms_path: Path to the Markscheme PDF.
        ans_path: Path to the Answer Sheet PDF.
        subject: Subject name; lower-cased and handed to get_grading_prompt.
        imprint: Whether to produce an imprinted copy of the answer sheet.
        run_timestamp: Forwarded to upload_file_to_supabase for output files.

    Returns:
        A 6-tuple ``(qpms_text, as_text, grading_text, grading_pdf_path,
        imprinted_pdf_path, output_urls)``. On any exception the tuple is
        ``("❌ Error: <message>", None, None, None, None, {})``.
    """
    transcription_model = "gemini-2.5-flash"
    transcription_fallback = "gemini-2.5-flash-preview-09-2025"

    def _write_debug(filename, text):
        # Persist an intermediate artefact so a run can be inspected afterwards.
        with open(filename, "w", encoding="utf-8") as handle:
            handle.write(text)

    try:
        print("πŸ” Starting pipeline...")
        qp_path = compress_pdf(qp_path)
        ms_path = compress_pdf(ms_path)
        ans_path = compress_pdf(ans_path)

        # The merged file sits next to the (compressed) question paper.
        qp_stem = os.path.splitext(qp_path)[0]
        merged_qpms_path = qp_stem + "_merged_qp_ms.pdf"
        merge_pdfs([qp_path, ms_path], merged_qpms_path)
        print("πŸ“Ž Merged QP + MS ->", merged_qpms_path)

        print("πŸ”Ό Uploading files to Gemini...")
        merged_uploaded = upload_to_gemini(merged_qpms_path)
        ans_uploaded = upload_to_gemini(ans_path)
        print("βœ… Upload complete.")

        # ---- Stage 1.i: QP + MS transcription ----
        print("1.i) Transcribing QP+MS (questions first, then full markscheme, with graph detection)...")
        ms_graph_suffix = "\nAt the end, also list all questions in the markscheme where a graph is expected, in the format:\nGraph expected in:\n- Question <number> β†’ Page <number>\n(One per line, after ==== MARKSCHEME END ====)"
        qpms_prompt = QP_MS_TRANSCRIPTION_PROMPT["content"] + ms_graph_suffix
        qpms_text = gemini_generate_content(
            qpms_prompt,
            file_upload_obj=merged_uploaded,
            model_name=transcription_model,
            fallback_model=transcription_fallback,
        )
        print("πŸ“„ QP+MS transcription received. Saving debug file: debug_qpms_transcript.txt")
        _write_debug("debug_qpms_transcript.txt", qpms_text)

        ms_graph_mapping = extract_graph_questions_from_ms(qpms_text)
        print("πŸ–ΌοΈ Graph-expected questions in MS:", ms_graph_mapping)
        ms_graph_pages = list(ms_graph_mapping.values())
        ms_graph_images = (
            extract_pdf_pages_as_images(merged_qpms_path, ms_graph_pages, prefix="qpms_graph")
            if ms_graph_pages
            else []
        )

        # Fall back to a single placeholder ID when none could be parsed.
        extracted_ids = extract_question_ids_from_qpms(qpms_text) or ["NA"]

        # ---- Stage 1.ii: answer-sheet transcription ----
        print("1.ii) Building AS transcription prompt with expected question IDs and graph detection, sending to Gemini...")
        as_graph_suffix = "\nAt the end, also list all answers where a graph is found, in the format:\nGraph found in:\n- Answer <number> β†’ Page <number>\n(One per line, after all answers)"
        as_prompt = build_as_cot_prompt_with_expected_ids(extracted_ids, qpms_text) + as_graph_suffix
        as_text = gemini_generate_content(
            as_prompt,
            file_upload_obj=ans_uploaded,
            model_name=transcription_model,
            fallback_model=transcription_fallback,
        )
        print("πŸ“ AS transcription received. Saving debug file: debug_as_transcript.txt")
        _write_debug("debug_as_transcript.txt", as_text)

        as_graph_mapping = extract_graph_answers_from_as(as_text)
        print("πŸ–ΌοΈ Graph-attempted answers in AS:", as_graph_mapping)
        as_graph_pages = list(as_graph_mapping.values())
        as_graph_images = (
            extract_pdf_pages_as_images(ans_path, as_graph_pages, prefix="as_graph")
            if as_graph_pages
            else []
        )

        # ---- Stage 2: grading ----
        print("2) Preparing grading input and sending to Gemini for grading...")
        qpms_section = "=== QP+MS TRANSCRIPT BEGIN ===\n" + qpms_text + "\n=== QP+MS TRANSCRIPT END ===\n\n"
        as_section = "=== ANSWER SHEET TRANSCRIPT BEGIN ===\n" + as_text + "\n=== ANSWER SHEET TRANSCRIPT END ===\n"
        grading_input = qpms_section + as_section
        if ms_graph_images or as_graph_images:
            # Let the grader know that page images accompany the text.
            grading_input += "\n\n---\nSome questions require graphs. I've attached the relevant graph pages from QP+MS and from the Answer Sheet. Use them as visual context when grading.\n---\n"

        grading_prompt_system = get_grading_prompt(subject.lower())["content"]
        grading_images = ms_graph_images + as_graph_images
        full_grading_prompt = grading_prompt_system + "\n\nPlease grade the following transcripts:\n" + grading_input
        grading_text = gemini_generate_content(
            full_grading_prompt,
            image_obj=grading_images or None,
            model_name="gemini-2.5-pro",
            fallback_model="gemini-2.5-flash",
        )
        print("🧾 Grading output received. Saving debug file: debug_grading.md")
        _write_debug("debug_grading.md", grading_text)

        # Verify and correct total marks if needed; the computed totals
        # themselves are not used further here.
        grading_text, _awarded, _possible, was_corrected = check_and_correct_total_marks(grading_text)
        if was_corrected:
            print("πŸ“ Saving corrected grading to debug file: debug_grading_corrected.md")
            _write_debug("debug_grading_corrected.md", grading_text)

        # ---- Outputs ----
        base_name = os.path.splitext(os.path.basename(ans_path))[0]
        grading_pdf_path = save_as_pdf(grading_text, f"{base_name}_graded.pdf")
        print("πŸ“„ Grading PDF saved:", grading_pdf_path)

        grading_json = extract_marks_from_grading(grading_text)
        with open("debug_grading_json.json", "w", encoding="utf-8") as json_handle:
            json.dump(grading_json, json_handle, indent=2, ensure_ascii=False)
        print("πŸ”§ Grading marks extraction complete.")

        imprinted_pdf_path = None
        if imprint:
            print("✍ Imprint option enabled. Starting imprinting process...")
            imprinted_pdf_path = imprint_marks_using_mapping(
                ans_path, grading_json, f"{base_name}_imprinted.pdf", extracted_ids
            )
            print("βœ… Imprinting finished. Imprinted PDF at:", imprinted_pdf_path)

        # Upload output files to Supabase (using same timestamp as input files).
        output_urls = dict.fromkeys(("graded_pdf_url", "imprinted_pdf_url"))
        if supabase_client:
            print("\nπŸ“€ Uploading output files to Supabase...")
            if grading_pdf_path:
                output_urls["graded_pdf_url"] = upload_file_to_supabase(grading_pdf_path, "graded", run_timestamp)
            if imprinted_pdf_path:
                output_urls["imprinted_pdf_url"] = upload_file_to_supabase(imprinted_pdf_path, "imprinted", run_timestamp)

        print("🏁 Pipeline finished successfully.")
        return qpms_text, as_text, grading_text, grading_pdf_path, imprinted_pdf_path, output_urls

    except Exception as e:
        # Top-level boundary: report the failure and hand the caller a tuple
        # of the same arity as the success path.
        print("❌ Pipeline error:", e)
        import traceback
        traceback.print_exc()
        return f"❌ Error: {e}", None, None, None, None, {}
1338
+
1339
+ # ---------------- GRADIO UI ----------------
1340
with gr.Blocks(title="AI Grading (Pandoc + pdflatex)") as demo:
    # ---- Header ----
    gr.Markdown("## πŸ“˜ AI Grading β€” Using Pandoc + pdflatex for PDF Generation")
    gr.Markdown("**βœ… Now using Pandoc with pdflatex for professional-quality PDF outputs!**")

    # Show whether cloud storage is active for this session.
    gr.Markdown(
        "**☁️ Supabase Storage: Enabled** - All files will be uploaded to cloud storage"
        if supabase_client
        else "**⚠️ Supabase Storage: Disabled** - Files will only be processed locally"
    )

    # ---- Inputs ----
    with gr.Row():
        qp_file = gr.File(label="πŸ“„ Upload Question Paper (PDF)")
        ms_file = gr.File(label="πŸ“„ Upload Markscheme (PDF)")
        ans_file = gr.File(label="πŸ“ Upload Student Answer Sheet (PDF)")

    with gr.Row():
        subject_dropdown = gr.Dropdown(
            choices=["Maths", "Science", "Economics"],
            value="Maths",
            label="πŸ“š Subject",
            info="Select the subject to apply appropriate grading guidelines",
        )
        imprint_toggle = gr.Checkbox(label="✍ Imprint Marks on Student Answer Sheet", value=False)

    run_button = gr.Button("πŸš€ Run Pipeline")

    # ---- Outputs ----
    # The URL panel only exists when Supabase is enabled; everything that
    # refers to file_urls_box below is guarded by the same condition.
    if supabase_client:
        with gr.Accordion("☁️ Uploaded File URLs", open=False):
            file_urls_box = gr.Textbox(label="Cloud Storage URLs", lines=8, interactive=False)

    with gr.Row():
        qpms_box = gr.Textbox(label="πŸ“‘ QP+MS Transcript", lines=12)
        as_box = gr.Textbox(label="πŸ“ AS Transcript", lines=12)

    grading_output_box = gr.Textbox(label="🧾 Grading (Markdown)", lines=20)
    grading_pdf_file = gr.File(label="πŸ“₯ Download Grading PDF")
    imprint_pdf_file = gr.File(label="πŸ“₯ Download Imprinted PDF (Optional)")

    def run_pipeline(qp_file_obj, ms_file_obj, ans_file_obj, subject_choice, imprint_flag):
        """
        Click handler: validate the uploads, run the grading pipeline, and
        return one value per output component.

        Returns five values (QP+MS transcript, AS transcript, grading text,
        grading PDF path, imprinted PDF path), plus a sixth URL-summary string
        when ``supabase_client`` is truthy.
        """
        # Guard: all three PDFs are mandatory.
        if not all((qp_file_obj, ms_file_obj, ans_file_obj)):
            rejection = ("❌ Please upload all three files", "", "", None, None)
            return (rejection + ("",)) if supabase_client else rejection

        # Process and upload input files (generates shared timestamp).
        qp_path, ms_path, ans_path, input_urls, run_timestamp = process_and_upload_input_files(
            qp_file_obj, ms_file_obj, ans_file_obj
        )

        # Run the grading pipeline (pass timestamp to keep all files together).
        (
            qpms_text,
            as_text,
            grading_text,
            grading_pdf_path,
            imprinted_pdf_path,
            output_urls,
        ) = align_and_grade_pipeline(
            qp_path,
            ms_path,
            ans_path,
            subject=subject_choice,
            imprint=imprint_flag,
            run_timestamp=run_timestamp,
        )

        # Build the human-readable list of cloud URLs.
        urls_summary = ""
        if supabase_client:
            input_rows = (
                ("Question Paper", "qp_url"),
                ("Markscheme", "ms_url"),
                ("Answer Sheet", "ans_url"),
            )
            output_rows = (
                ("Graded PDF", "graded_pdf_url"),
                ("Imprinted PDF", "imprinted_pdf_url"),
            )

            urls_summary = f"πŸ“€ UPLOADED FILES (Timestamp: {run_timestamp}):\n\n"
            urls_summary += "INPUT FILES:\n"
            for label, key in input_rows:
                if input_urls.get(key):
                    urls_summary += f"β€’ {label}: {input_urls[key]}\n"

            urls_summary += "\nOUTPUT FILES:\n"
            for label, key in output_rows:
                if output_urls.get(key):
                    urls_summary += f"β€’ {label}: {output_urls[key]}\n"

            urls_summary += f"\nπŸ“ All files stored in: examfiles/{run_timestamp}/\n"

            if not (any(input_urls.values()) or any(output_urls.values())):
                urls_summary += "\n⚠️ No files were uploaded to Supabase"

        # None transcripts (pipeline failure path) are shown as empty strings.
        results = (
            qpms_text or "",
            as_text or "",
            grading_text or "",
            grading_pdf_path,
            imprinted_pdf_path,
        )
        return (results + (urls_summary,)) if supabase_client else results

    # Set up the click handler; the URL box is only an output when it exists.
    click_outputs = [qpms_box, as_box, grading_output_box, grading_pdf_file, imprint_pdf_file]
    if supabase_client:
        click_outputs = click_outputs + [file_urls_box]

    run_button.click(
        fn=run_pipeline,
        inputs=[qp_file, ms_file, ans_file, subject_dropdown, imprint_toggle],
        outputs=click_outputs,
    )

if __name__ == "__main__":
    demo.launch()