atz21 committed on
Commit
6ef90e5
·
verified ·
1 Parent(s): 17962e4

Update prompts.py

Browse files
Files changed (1) hide show
  1. prompts.py +231 -1413
prompts.py CHANGED
@@ -1,1452 +1,270 @@
1
- import os
2
- import re
3
- import json
4
- import subprocess
5
- import time
6
- import shutil
7
- import img2pdf
8
- import gradio as gr
9
- from google import genai # NEW SDK
10
- from pdf2image import convert_from_path
11
- from PIL import Image, ImageDraw, ImageFont
12
- import cv2
13
- import numpy as np
14
- from PyPDF2 import PdfReader, PdfWriter
15
- from prompts import QP_MS_TRANSCRIPTION_PROMPT, get_grading_prompt
16
- from supabase import create_client, Client
17
-
18
- # ---------------- CONFIG ----------------
19
- # Multi-API Key Configuration for handling RESOURCE_EXHAUSTED errors
20
class GeminiClientManager:
    """Hold one Gemini client per configured API key and track the active one.

    Keys are read from the environment variables ``GEMINI_API_KEY_1`` to
    ``GEMINI_API_KEY_3``; unset or empty variables are ignored. Index 0 is
    treated as the primary key.
    """

    def __init__(self):
        """Load the API keys from the environment and build their clients.

        Raises:
            ValueError: If none of the three environment variables is set.
        """
        env_names = ("GEMINI_API_KEY_1", "GEMINI_API_KEY_2", "GEMINI_API_KEY_3")
        # filter(None, ...) discards both unset (None) and empty values.
        self.api_keys = list(filter(None, map(os.getenv, env_names)))

        if len(self.api_keys) == 0:
            raise ValueError("❌ No API keys found! Please set at least GEMINI_API_KEY_1")

        print(f"✅ Loaded {len(self.api_keys)} Gemini API key(s)")

        # Position of the key currently in use (0 = primary).
        self.current_key_index = 0

        # One client per key, in the same order as ``api_keys``.
        self.clients = [genai.Client(api_key=api_key) for api_key in self.api_keys]

    def get_current_client(self):
        """Return the client that belongs to the currently active key."""
        active_index = self.current_key_index
        return self.clients[active_index]

    def rotate_to_next_key(self):
        """Advance to the next key, wrapping around after the last one.

        Returns:
            bool: False when exactly one key is configured (nothing to rotate
            to), True after the active index has been advanced.
        """
        key_count = len(self.api_keys)
        if key_count == 1:
            print("⚠️ Only one API key available, cannot rotate")
            return False

        previous_index = self.current_key_index
        self.current_key_index = (previous_index + 1) % key_count
        print(f"🔄 Rotating from API key #{previous_index + 1} to API key #{self.current_key_index + 1}")
        return True

    def reset_to_primary(self):
        """Make the first key active again; silent when it already is."""
        if self.current_key_index == 0:
            return
        print("🔙 Resetting to primary API key #1")
        self.current_key_index = 0
65
-
66
# Shared key-rotating Gemini client manager, created at import time.
client_manager = GeminiClientManager()
client = client_manager.get_current_client()  # For backward compatibility
GRID_ROWS = 20
GRID_COLS = 14

# Supabase configuration
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_KEY = os.getenv("SUPABASE_SERVICE_KEY")
SUPABASE_BUCKET = "examfiles"

# Supabase client; stays None when credentials are missing or when
# initialization raises.
supabase_client = None
if not (SUPABASE_URL and SUPABASE_SERVICE_KEY):
    print("⚠️ Supabase credentials not found - file upload to storage disabled")
else:
    try:
        supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)
        print("✅ Supabase client initialized successfully")
    except Exception as init_error:
        print(f"⚠️ Supabase initialization failed: {init_error}")
86
-
87
- # ---------------- PROMPTS ----------------
88
- # Prompts are now imported from prompts.py
89
-
90
- # ---------------- SUPABASE HELPERS ----------------
91
def upload_file_to_supabase(local_path, file_type="unknown", timestamp=None):
    """
    Upload one local file to the Supabase storage bucket.

    The object is stored as ``<timestamp>/<basename of local_path>`` with
    upsert enabled. Failures are reported on stdout and never raised.

    Args:
        local_path (str): Path of the file to upload.
        file_type (str): Label used only in log messages (qp, ms, ans, ...).
        timestamp (str): Folder name inside the bucket; the current Unix time
            is used when omitted.

    Returns:
        str: Public URL of the uploaded object, or None when Supabase is not
        configured or the upload failed.
    """
    if not supabase_client:
        print("⚠️ Supabase not configured - skipping upload")
        return None

    try:
        folder = str(int(time.time())) if timestamp is None else timestamp

        # The original file name is kept as-is for cleaner storage paths.
        remote_path = f"{folder}/{os.path.basename(local_path)}"

        print(f"📤 Uploading {file_type} to Supabase: {remote_path}")

        with open(local_path, "rb") as handle:
            bucket = supabase_client.storage.from_(SUPABASE_BUCKET)
            bucket.upload(remote_path, handle, file_options={"upsert": "true"})

        public_url = f"{SUPABASE_URL}/storage/v1/object/public/{SUPABASE_BUCKET}/{remote_path}"
        print(f"✅ Uploaded successfully: {public_url}")
        return public_url

    except Exception as upload_error:
        # Best-effort: an upload problem must not abort the caller.
        print(f"❌ Supabase upload failed for {file_type}: {upload_error}")
        return None
131
-
132
def process_and_upload_input_files(qp_file_obj, ms_file_obj, ans_file_obj):
    """
    Resolve the local paths of the uploaded files and mirror them to Supabase.

    All files of one call share a single timestamp folder in the bucket.

    Args:
        qp_file_obj: Gradio file object for the Question Paper (or None).
        ms_file_obj: Gradio file object for the Markscheme (or None).
        ans_file_obj: Gradio file object for the Answer Sheet (or None).

    Returns:
        tuple: (qp_path, ms_path, ans_path, upload_urls_dict, timestamp) where
        ``upload_urls_dict`` has the keys ``qp_url``, ``ms_url`` and
        ``ans_url`` (None for anything that was not uploaded).
    """
    banner = "=" * 60
    print("\n" + banner)
    print("📁 PROCESSING INPUT FILES")
    print(banner)

    # One timestamp for the whole run so every upload lands in the same folder.
    run_timestamp = str(int(time.time()))
    print(f"🕐 Run timestamp: {run_timestamp}")

    file_objects = {"qp": qp_file_obj, "ms": ms_file_obj, "ans": ans_file_obj}
    # A missing file object simply yields a None path.
    local_paths = {
        kind: (file_obj.name if file_obj else None)
        for kind, file_obj in file_objects.items()
    }
    upload_urls = {f"{kind}_url": None for kind in local_paths}

    if supabase_client:
        for kind, path in local_paths.items():
            if path:
                upload_urls[f"{kind}_url"] = upload_file_to_supabase(path, kind, run_timestamp)

    print(banner + "\n")

    return (
        local_paths["qp"],
        local_paths["ms"],
        local_paths["ans"],
        upload_urls,
        run_timestamp,
    )
175
-
176
-
177
-
178
- # ---------------- HELPERS ----------------
179
def parse_md_table(md):
    """Parse a Markdown table into a list of rows (header excluded).

    The first two non-blank lines (header and separator) are skipped. Only the
    single leading and trailing pipe of each line is removed, so an empty
    interior cell is kept as ``""`` and the remaining columns stay aligned.
    Rows whose cells are all empty are dropped.

    Args:
        md (str): Markdown table text.

    Returns:
        list[list[str]]: One list of stripped cell strings per data row; an
        empty list when there are fewer than three non-blank lines.
    """
    table_lines = [line.strip() for line in md.split("\n") if line.strip()]
    if len(table_lines) < 3:
        return []

    rows = []
    for line in table_lines[2:]:  # skip header + separator
        # Remove exactly one outer pipe on each side; stripping every pipe or
        # filtering empty strings would also discard empty cells.
        if line.startswith("|"):
            line = line[1:]
        if line.endswith("|"):
            line = line[:-1]
        cells = [cell.strip() for cell in line.split("|")]
        if any(cells):
            rows.append(cells)
    return rows
193
-
194
def convert_html_color_spans(md_text):
    """Convert HTML color spans to LaTeX textcolor commands.

    ``<span style="color: NAME">TEXT</span>`` becomes
    ``\\textcolor{NAME}{TEXT}``. Matching is case-insensitive and works on a
    single line (``.`` does not cross newlines).

    Args:
        md_text (str): Markdown text that may contain color spans.

    Returns:
        str: The text with every matching span replaced.
    """
    pattern = r'<span\s+style="color:\s*([^"]+)">\s*(.*?)\s*</span>'

    def repl(m):
        # Keep only the color value: a trailing ';' or further CSS
        # declarations after it would end up inside the color name.
        color = m.group(1).split(';', 1)[0].strip()
        text = m.group(2)
        return fr'\textcolor{{{color}}}{{{text}}}'

    return re.sub(pattern, repl, md_text, flags=re.IGNORECASE)
202
-
203
def cleanup_markdown_for_latex(md_text):
    """Clean up markdown text for better LaTeX conversion.

    Two things happen:

    1. A blank line is forced between the bold
       ``**Markscheme vs Student Answer**`` header and a table that follows it.
    2. Common Unicode math symbols are replaced by LaTeX commands. Outside
       math the command is wrapped in ``$...$``; inside an existing ``$...$``
       or ``$$...$$`` span the bare command is emitted, because a second pair
       of dollar signs there would close the surrounding math mode.

    Args:
        md_text (str): Markdown text to clean.

    Returns:
        str: The cleaned text.
    """
    # Ensure spacing between bold headers and tables
    md_text = re.sub(r'(\*\*Markscheme vs Student Answer\*\*)\s*(\|)', r'\1\n\n\2', md_text)

    # Convert common unicode math symbols to LaTeX (safety net)
    replacements = {
        '∫': r'\int ',
        '²': '^2',
        '³': '^3',
        '½': r'\frac{1}{2}',
        '¼': r'\frac{1}{4}',
        '∞': r'\infty',
        '≤': r'\leq',
        '≥': r'\geq',
        '≠': r'\neq',
        '±': r'\pm',
        '×': r'\times',
        '÷': r'\div',
        '√': r'\sqrt',
        '∑': r'\sum',
        '∏': r'\prod',
        '∂': r'\partial',
        'π': r'\pi',
        'θ': r'\theta',
        'α': r'\alpha',
        'β': r'\beta',
        'γ': r'\gamma',
        'δ': r'\delta',
        'ε': r'\epsilon',
        'λ': r'\lambda',
        'μ': r'\mu',
        'σ': r'\sigma',
        'Δ': r'\Delta',
        'Σ': r'\Sigma',
        'Ω': r'\Omega'
    }

    symbol_re = re.compile('[' + ''.join(re.escape(ch) for ch in replacements) + ']')
    # Display math, or inline math following Pandoc's dollar rule: the opening
    # '$' is followed by a non-space, the closing '$' is preceded by a
    # non-space and not followed by a digit (so "$5 and $6" is not math).
    math_span_re = re.compile(
        r'(\$\$.+?\$\$|\$(?=\S)[^$\n]*?(?<=\S)\$(?!\d))',
        re.DOTALL,
    )

    def wrap_in_math(match):
        return f'${replacements[match.group(0)]}$'

    def bare_command(match):
        latex = replacements[match.group(0)].rstrip()
        following = match.string[match.end():match.end() + 1]
        # "\pi" directly followed by a letter would read as one unknown
        # command ("\pir"), so separate them with a space.
        if latex[-1].isalpha() and following.isalpha():
            return latex + ' '
        return latex

    # re.split with one capturing group alternates: even indexes are plain
    # text, odd indexes are the captured math spans.
    segments = math_span_re.split(md_text)
    for index, segment in enumerate(segments):
        handler = bare_command if index % 2 else wrap_in_math
        segments[index] = symbol_re.sub(handler, segment)

    return ''.join(segments)
245
-
246
def escape_latex_special_chars(text):
    """Escape LaTeX special characters in plain text.

    Text that contains ``$`` or a backslash is returned untouched, on the
    assumption that it already holds math or LaTeX commands.

    Args:
        text (str): Text to escape.

    Returns:
        str: The text with ``% & # _ { } ~ ^`` replaced by their escaped forms.
    """
    # Don't escape if already in math mode or LaTeX command
    if '$' in text or '\\' in text:
        return text

    escape_table = str.maketrans({
        '%': r'\%',
        '&': r'\&',
        '#': r'\#',
        '_': r'\_',
        '{': r'\{',
        '}': r'\}',
        '~': r'\textasciitilde{}',
        '^': r'\textasciicircum{}',
    })
    return text.translate(escape_table)
267
-
268
def save_as_pdf(text, filename="output.pdf"):
    """
    Convert Markdown text to PDF using Pandoc (Markdown -> LaTeX) and pdflatex.

    The "Examiner's Summary Report" table and its total are cut out of the
    Markdown, rebuilt as a colored LaTeX table and injected right after
    ``\\begin{document}``. HTML color spans are converted to ``\\textcolor``.
    Intermediate files are written next to ``filename`` and removed after a
    successful run.

    Args:
        text (str): Markdown content to convert
        filename (str): Output PDF filename

    Returns:
        str: Path to the generated PDF file

    Raises:
        Exception: If Pandoc or pdflatex is not available, or conversion fails
    """
    base_name = os.path.splitext(filename)[0]
    temp_md_file = f"{base_name}_input.md"
    temp_tex_file = f"{base_name}_temp.tex"
    # pdflatex names its outputs after the .tex file; derive them from the
    # stem instead of str.replace, which would also rewrite a ".tex" that
    # happens to appear earlier in the path.
    temp_stem = os.path.splitext(temp_tex_file)[0]
    temp_pdf = temp_stem + ".pdf"
    log_file = temp_stem + ".log"
    rule = "=" * 60

    print("\n" + rule)
    print("📄 MARKDOWN TO PDF CONVERSION PROCESS")
    print(rule)

    try:
        # Step 1: Extract Summary Report Table
        print("\n[STEP 1/6] Extracting Examiner's Summary Report...")
        summary_pattern = re.compile(
            r"### Examiner's Summary Report\s*\n\n(\|.*?\|)\s*\n\n\*\*Total:\s*(.*?)\*\*",
            re.DOTALL
        )
        summary_match = summary_pattern.search(text)

        if summary_match:
            summary_table_md = summary_match.group(1)
            summary_total = summary_match.group(2)
            text = summary_pattern.sub("", text)
            print(f" ✅ SUCCESS: Extracted summary report with total: {summary_total}")
        else:
            summary_table_md = ""
            summary_total = ""
            print(" ⚠️ WARNING: No Examiner's Summary Report found in markdown")

        # Step 2: Clean up markdown
        print("\n[STEP 2/6] Cleaning markdown and converting HTML to LaTeX...")
        text = cleanup_markdown_for_latex(text)
        text = convert_html_color_spans(text)
        print(" ✅ SUCCESS: Markdown cleaned and HTML color spans converted")

        # Save cleaned markdown
        with open(temp_md_file, 'w', encoding='utf-8') as f:
            f.write(text)
        print(f" 📝 Saved cleaned markdown to: {temp_md_file}")

        # Step 3: Convert MD to LaTeX via Pandoc
        print("\n[STEP 3/6] Converting markdown to LaTeX using Pandoc...")
        pandoc_cmd = [
            "pandoc",
            "--from=markdown",
            "--to=latex",
            "--standalone",
            temp_md_file,
            "-o", temp_tex_file
        ]
        print(f" 🔧 Running: {' '.join(pandoc_cmd)}")

        # check=False: the return code is inspected below so the error text
        # can be reported.
        result = subprocess.run(pandoc_cmd, capture_output=True, check=False)

        if result.returncode != 0:
            # stderr is bytes here; errors='replace' makes decoding infallible.
            stderr = result.stderr.decode('utf-8', errors='replace')
            print(f" ❌ FAILED: Pandoc returned error code {result.returncode}")
            print(f" Error details: {stderr[:500]}")
            raise Exception(f"Pandoc conversion failed: {stderr}")

        if not os.path.exists(temp_tex_file):
            print(f" ❌ FAILED: LaTeX file not created at {temp_tex_file}")
            raise Exception("Pandoc did not create the expected LaTeX file")

        print(f" ✅ SUCCESS: LaTeX file created at {temp_tex_file}")

        # Step 4: Modify the generated LaTeX
        print("\n[STEP 4/6] Enhancing LaTeX document...")
        with open(temp_tex_file, "r", encoding="utf-8") as f:
            tex = f.read()

        tex = tex.replace(
            r"\documentclass{article}",
            r"\documentclass[12pt]{extarticle}"
        )

        # NOTE(review): \rowcolors (used below) comes from xcolor's "table"
        # option; confirm the Pandoc template loads xcolor that way.
        insert_packages = "\n".join([
            r"\usepackage[a4paper, margin=1in]{geometry}",
            r"\usepackage{xcolor}",
            r"\usepackage{colortbl}",
            r"\usepackage{booktabs}",
            r"\usepackage{array}",
            r"\usepackage{longtable}",
            r"\renewcommand{\arraystretch}{1.4}",
            r"\newcolumntype{L}[1]{>{\raggedright\arraybackslash}p{#1}}",
        ])

        # Only the real preamble boundary (first occurrence) is patched.
        tex = tex.replace(r"\begin{document}", insert_packages + "\n\\begin{document}", 1)
        print(" ✅ SUCCESS: Enhanced document class and added packages")

        # Step 5: Build enhanced LaTeX table for summary
        if summary_table_md:
            print("\n[STEP 5/6] Building enhanced summary table...")
            summary_rows = parse_md_table(summary_table_md)
            print(f" 📊 Parsed {len(summary_rows)} rows from summary table")

            summary_parts = [
                "\\section*{Examiner's Summary Report}\n",
                "\\begin{center}\n",
                "\\rowcolors{2}{gray!10}{white}\n",
                "\\begin{tabular}{|c|c|c|L{8cm}|}\n",
                "\\hline\n",
                "\\rowcolor{gray!30}\n",
                "\\textbf{Question} & \\textbf{Marks} & \\textbf{Remark} & \\textbf{Feedback} \\\\ \\hline\n",
            ]
            for row in summary_rows:
                if len(row) >= 4:
                    feedback = row[3]
                    # Feedback that already carries math or color commands is
                    # left alone; plain feedback gets minimal escaping.
                    if not ('$' in feedback or '\\textcolor' in feedback):
                        feedback = feedback.replace('%', r'\%').replace('&', r'\&').replace('#', r'\#')

                    summary_parts.append(f"{row[0]} & {row[1]} & {row[2]} & {feedback} \\\\ \\hline\n")

            summary_parts.append("\\end{tabular}")
            summary_parts.append("\n\\end{center}\n\n")
            summary_parts.append(f"\\vspace{{0.5cm}}\\noindent\\textbf{{\\Large Overall Score: {summary_total}}}\n\n")
            summary_parts.append("\\hrulefill\n\\vspace{1cm}\n\n")
            summary_parts.append("\\newpage\n\n")
            summary_latex = "".join(summary_parts)

            tex = tex.replace(
                r"\begin{document}",
                r"\begin{document}" + "\n\n" + summary_latex,
                1
            )
            print(" ✅ SUCCESS: Summary table with zebra striping injected at document top")
        else:
            print("\n[STEP 5/6] Skipping summary table (not found)")

        with open(temp_tex_file, "w", encoding="utf-8") as f:
            f.write(tex)

        # Step 6: Compile PDF with pdflatex
        print("\n[STEP 6/6] Compiling PDF with pdflatex...")
        pdflatex_cmd = [
            "pdflatex",
            "-interaction=nonstopmode",
            f"-output-directory={os.path.dirname(os.path.abspath(temp_tex_file)) or '.'}",
            temp_tex_file
        ]

        # Two passes; only the last result is needed for error reporting.
        last_result = None
        for pass_number in (1, 2):
            print(f" 🔧 Running pdflatex (pass {pass_number}/2)...")
            last_result = subprocess.run(pdflatex_cmd, capture_output=True, check=False)

        if not os.path.exists(temp_pdf):
            print(f" ❌ FAILED: PDF not created at {temp_pdf}")

            stderr = last_result.stderr.decode('utf-8', errors='replace')

            if os.path.exists(log_file):
                print(f" 📋 Checking LaTeX log file: {log_file}")
                try:
                    with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
                        log_content = f.read()
                    error_lines = [line for line in log_content.split('\n') if '!' in line]
                    if error_lines:
                        print(f" ❌ LaTeX Errors found ({len(error_lines)} lines):")
                        for err_line in error_lines[:10]:
                            print(f" {err_line}")
                        stderr += "\n\nLaTeX Errors:\n" + "\n".join(error_lines[:10])
                except Exception as log_err:
                    print(f" ⚠️ Could not read log file: {log_err}")

            raise Exception(f"pdflatex failed to create PDF. Error: {stderr[:1000]}")

        print(f" ✅ SUCCESS: PDF compiled at {temp_pdf}")

        # Move output PDF to final filename (os.replace overwrites an
        # existing target on every platform).
        os.replace(temp_pdf, filename)
        print(f" 📦 Moved to final location: {filename}")

        # Clean up temporary files
        print("\n[CLEANUP] Removing temporary files...")
        cleaned_count = 0
        for ext in [".md", ".tex", ".aux", ".log", ".out"]:
            for stem in (base_name, base_name + "_input", base_name + "_temp"):
                temp_file = stem + ext
                if os.path.exists(temp_file):
                    os.remove(temp_file)
                    cleaned_count += 1
        print(f" 🧹 Cleaned up {cleaned_count} temporary files")

        print("\n" + rule)
        print("✅ PDF CONVERSION COMPLETED SUCCESSFULLY")
        print(f"📄 Output file: {filename}")
        print(rule + "\n")

        return filename

    except FileNotFoundError as e:
        # Raised by subprocess.run when the pandoc/pdflatex binary is missing.
        print(f"\n❌ FILE NOT FOUND ERROR: {e}")
        print(rule)
        print("⚠️ REQUIRED TOOLS MISSING")
        print("Please install the following:")
        print(" • pandoc")
        print(" • texlive (or MiKTeX on Windows)")
        print(" • texlive-latex-extra (for extarticle class)")
        print(rule + "\n")
        raise Exception(
            "Pandoc or pdflatex not found. Please install:\n"
            " - pandoc\n"
            " - texlive (or MiKTeX on Windows)\n"
            " - texlive-latex-extra (for extarticle class)"
        ) from e

    except Exception as e:
        print(f"\n❌ UNEXPECTED ERROR: {e}")
        import traceback
        traceback.print_exc()
        print(rule + "\n")
        raise
513
-
514
def compress_pdf(input_path, output_path=None, max_size=20*1024*1024):
    """Shrink a PDF with Ghostscript when it exceeds ``max_size`` bytes.

    Args:
        input_path (str): PDF to compress.
        output_path (str): Target path; defaults to
            ``<input stem>_compressed<ext>``.
        max_size (int): Size threshold in bytes.

    Returns:
        str: ``output_path`` when compression ran and the result fits within
        ``max_size``; otherwise ``input_path`` (file unreadable, already small
        enough, compression failed, or result still too large).
    """
    if output_path is None:
        stem, suffix = os.path.splitext(input_path)
        output_path = f"{stem}_compressed{suffix}"

    try:
        original_bytes = os.path.getsize(input_path)
    except Exception:
        # Size cannot be determined: hand the path back unchanged.
        return input_path

    if original_bytes <= max_size:
        print(f"ℹ️ Not compressing {input_path} ({original_bytes/1024/1024:.2f} MB <= {max_size/1024/1024} MB)")
        return input_path

    print(f"🔎 Compressing {input_path} ({original_bytes/1024/1024:.2f} MB) -> {output_path}")
    try:
        command = [
            "gs", "-sDEVICE=pdfwrite",
            "-dCompatibilityLevel=1.4",
            "-dPDFSETTINGS=/ebook",
            "-dNOPAUSE", "-dQUIET", "-dBATCH",
            f"-sOutputFile={output_path}", input_path
        ]
        subprocess.run(command, check=True)
        compressed_bytes = os.path.getsize(output_path)
        print(f"✅ Compression done. New size: {compressed_bytes/1024/1024:.2f} MB")
        if compressed_bytes > max_size:
            print("⚠️ Compressed file still larger than threshold; returning original")
            return input_path
        return output_path
    except Exception as e:
        print("❌ Compression error:", e)
        return input_path
548
-
549
def upload_to_gemini(path, display_name=None):
    """
    Upload a file to Gemini with the google-genai SDK and wait until the
    service has finished processing it.

    The client of the currently active API key (from ``client_manager``) is
    used. ``display_name`` is accepted but not used by this function.

    Args:
        path (str): Local path of the file to upload.
        display_name (str): Unused.

    Returns:
        The uploaded file object returned by the SDK.

    Raises:
        Exception: When processing ends in the FAILED state; any error raised
            by the SDK is logged and re-raised.
    """
    print(f"📤 Uploading {path} to Gemini...")
    try:
        active_client = client_manager.get_current_client()
        remote_file = active_client.files.upload(file=path)

        # Poll every two seconds while the service reports PROCESSING.
        print(f"⏳ Waiting for file processing: {remote_file.name}")
        while True:
            if remote_file.state.name != "PROCESSING":
                break
            time.sleep(2)
            remote_file = active_client.files.get(name=remote_file.name)

        if remote_file.state.name == "FAILED":
            raise Exception(f"File processing failed: {remote_file.name}")

        print(f"✅ Uploaded and processed: {remote_file.name}")
        return remote_file
    except Exception as upload_error:
        print(f"❌ Upload failed for {path}: {upload_error}")
        raise
573
-
574
def merge_pdfs(paths, output_path):
    """Concatenate the pages of several PDFs into one file.

    Args:
        paths: Iterable of input PDF paths, merged in the given order.
        output_path (str): Where the merged PDF is written.

    Returns:
        str: ``output_path``.
    """
    merged = PdfWriter()
    for pdf_path in paths:
        for page in PdfReader(pdf_path).pages:
            merged.add_page(page)

    with open(output_path, "wb") as out_file:
        merged.write(out_file)
    return output_path
583
-
584
def gemini_generate_content(prompt_text, file_upload_obj=None, image_obj=None, model_name="gemini-2.5-pro", fallback_model="gemini-2.5-flash"):
    """
    Send prompt_text and optionally an uploaded file (or an image object/list) to the model using NEW SDK.
    Automatically rotates through available API keys on RESOURCE_EXHAUSTED errors.
    Returns textual response and prints progress.

    Args:
        prompt_text (str): Prompt sent as the first content item.
        file_upload_obj: Optional uploaded-file object appended after the prompt.
        image_obj: Optional image, image path, or list of either. String
            entries are opened with PIL; other objects are passed through.
        model_name (str): Model tried first with each key.
        fallback_model (str): Model tried when the primary model fails (see
            the branches below for exactly when).

    Returns:
        str: ``response.text`` of the first successful call.

    Raises:
        Exception: When every key / fallback combination has failed.
    """
    contents = [prompt_text]

    if file_upload_obj:
        contents.append(file_upload_obj)

    if image_obj:
        if isinstance(image_obj, list):
            for img_path in image_obj:
                if isinstance(img_path, str):
                    # A string entry is treated as a path and loaded as a PIL image.
                    pil_img = Image.open(img_path)
                    contents.append(pil_img)
                else:
                    contents.append(img_path)
        else:
            if isinstance(image_obj, str):
                pil_img = Image.open(image_obj)
                contents.append(pil_img)
            else:
                contents.append(image_obj)

    print("📡 Sending request to Gemini (prompt length:", len(prompt_text), "chars )")

    # Try with all available API keys
    max_attempts = len(client_manager.api_keys)
    attempt = 0

    while attempt < max_attempts:
        # Re-read the active client on every pass: a previous pass may have
        # rotated to another key.
        current_client = client_manager.get_current_client()
        current_key_num = client_manager.current_key_index + 1

        try:
            print(f"🔑 Using API key #{current_key_num} with model {model_name}")
            response = current_client.models.generate_content(
                model=model_name,
                contents=contents
            )
            raw_text = response.text
            print(f"📥 Received response (chars): {len(raw_text)}")

            # Success! Reset to primary key for next request
            client_manager.reset_to_primary()
            return raw_text

        except Exception as e:
            error_str = str(e)
            print(f"❌ Generation failed with API key #{current_key_num}: {e}")

            # Check if it's a RESOURCE_EXHAUSTED error
            # (detected by substring match on the error message)
            if "429" in error_str or "RESOURCE_EXHAUSTED" in error_str:
                print(f"⚠️ Quota exhausted for API key #{current_key_num}")

                # Try to rotate to next key
                if client_manager.rotate_to_next_key():
                    attempt += 1
                    print(f"🔄 Retrying with next API key (attempt {attempt + 1}/{max_attempts})...")
                    continue
                else:
                    # Only one key available, try fallback model
                    print(f"⚡ Trying fallback model: {fallback_model}")
                    try:
                        response = current_client.models.generate_content(
                            model=fallback_model,
                            contents=contents
                        )
                        raw_text = response.text
                        print(f"📥 Received response (chars): {len(raw_text)}")
                        client_manager.reset_to_primary()
                        return raw_text
                    except Exception as e2:
                        print(f"❌ Fallback also failed: {e2}")
                        raise Exception(f"All API keys exhausted. Error: {e2}")
            else:
                # Not a quota error, try fallback model with same key
                print(f"⚡ Trying fallback model: {fallback_model}")
                try:
                    response = current_client.models.generate_content(
                        model=fallback_model,
                        contents=contents
                    )
                    raw_text = response.text
                    print(f"📥 Received response (chars): {len(raw_text)}")
                    client_manager.reset_to_primary()
                    return raw_text
                except Exception as e2:
                    print(f"❌ Fallback also failed: {e2}")
                    # If we have more keys, try them
                    if attempt < max_attempts - 1:
                        client_manager.rotate_to_next_key()
                        attempt += 1
                        print(f"🔄 Trying next API key (attempt {attempt + 1}/{max_attempts})...")
                        continue
                    else:
                        raise Exception(f"All attempts failed. Last error: {e2}")

    # If we exhausted all attempts
    # (reached when every pass ended in a quota error followed by a rotation)
    raise Exception(f"❌ All {max_attempts} API key(s) exhausted. Please check your quota or try again later.")
686
-
687
-
688
- # ---------------- PARSERS ----------------
689
def extract_question_ids_from_qpms(text: str):
    """Collect question IDs from a QP+MS transcript.

    Lines of the form ``Question <id>`` / ``Question: <id>`` win; when none
    exist, IDs at the start of numbered lines (``1.``, ``2)``, ``3(a)`` ...)
    are used instead.

    Args:
        text: Transcript text.

    Returns:
        list[str]: The IDs in order of appearance (possibly empty).
    """
    print("🔎 Extracting question IDs from QP+MS transcript using regex...")

    # Non-breaking spaces and tabs become plain spaces before matching.
    normalized = text.translate({0x00A0: " ", 0x0009: " "})

    explicit_ids = re.findall(r"^\s*Question\s*[:\s]\s*([\dA-Za-z.()]+)", normalized, re.MULTILINE)
    if explicit_ids:
        print(f"✅ Extracted {len(explicit_ids)} question IDs from explicit 'Question X' lines.")
        print("IDs:", explicit_ids)
        return explicit_ids

    numbered_ids = re.findall(r"^\s*(\d+(?:[.)]|\([a-zA-Z0-9]+\))?[a-zA-Z0-9]*)", normalized, re.MULTILINE)
    if not numbered_ids:
        print("⚠️ No question IDs extracted; will send NA placeholder.")
    else:
        print(f"✅ Extracted {len(numbered_ids)} question IDs (fallback numbered lists).")
        print("IDs:", numbered_ids)
    return numbered_ids
708
-
709
def build_as_cot_prompt_with_expected_ids(expected_ids, qpms_text=None):
    """
    Construct the AS transcription prompt injecting the expected IDs block and graph detection instructions,
    modifying it to include a Chain-of-Thought (CoT) section using a <think> tag, and
    requiring mathematical expressions to be enclosed in LaTeX dollar delimiters ($...$).
    Includes explicit rules for interpreting NA-like answers and no-response situations.

    Args:
        expected_ids: Sequence of question-ID strings; when empty or None the
            placeholder block ``{NA}`` is used.
        qpms_text: Optional QP+MS transcript; when given it is embedded
            (stripped) between BEGIN/END markers.

    Returns:
        str: The complete prompt text.
    """

    # Expected IDs are rendered one per line inside literal braces.
    if not expected_ids:
        ids_block = "{NA}"
    else:
        ids_block = "{\n" + "\n".join(expected_ids) + "\n}"

    # Optional reference section; stays empty when no transcript is supplied.
    qpms_section = ""
    if qpms_text is not None:
        qpms_section = (
            "\nYou are also provided with the full transcript of the Question Paper and Markscheme (QP+MS) below."
            "\nUse it primarily to resolve ambiguous handwriting and to confirm expected answers when needed."
            "\n--- BEGIN QP+MS TRANSCRIPT ---\n"
            f"{qpms_text.strip()}\n"
            "--- END QP+MS TRANSCRIPT ---\n"
        )

    prompt = f"""You are a high-quality handwritten transcription assistant, performing transcription with a Chain-of-Thought process.
INPUT: This PDF contains a student's handwritten answer sheet.
{qpms_section}
TASK:
1. **THINKING:** Before transcribing each answer, document your thought process inside a **<think>** tag.
- Identify the question ID. If inferred, note why.
- Detail any ambiguities (unclear numbers, symbols, or structures).
- Explain how ambiguities were resolved, including whether the QP+MS transcript was consulted.
- If QP+MS was consulted but you chose not to change the transcription, state this.
- If the initial question label was incorrect (e.g., 2.a vs 2.b), correct it and briefly explain the reasoning in <think>.
*Example Thinking:*
<think>
- Found Question 3(a).
- The term could be '$2x$' or '21x'.
- Markscheme uses '$21x$', but handwriting matches '$2x$'.
- Decision: transcribe '$2x$'.
</think>

2. **TRANSCRIPTION:** Transcribe the student's answers directly and faithfully.
- Assign each answer to a labelled question ID when present.
- For unlabeled answers, segment logically and mark inferred IDs as "**INFERRED: <id>**".
- **Mathematical expressions and standalone variables must appear inside LaTeX dollar delimiters ($...$).**
- If a diagram/graph is omitted, write **[Graph omitted]**.
- If handwriting is unreadable: **[illegible]**.

**ANSWER-INTERPRETATION RULES:**
- If the student writes “NA”, “N/A”, “Not Applicable”, or clear equivalents → record exactly as **NA**.
- If the student leaves the space blank, crosses it out, makes no meaningful attempt, or provides no answer → record **[No response]**.

Ensure deterministic formatting so subsequent models can grade directly from this aligned format.

Expected questions (if missing, write NA):
{ids_block}
-----------------------
OUTPUT FORMAT:
<think>...</think>
Question <id>
AS:<transcribed answer or placeholder>
<think>...</think>
Question <id>
AS:<transcribed answer or placeholder>
...
==== GRAPH FOUND ANSWERS ====
Graph found in:
- Answer <number> Page <number>
(one per line)
==== END GRAPH FOUND ===="""

    return prompt
781
 
 
782
 
 
 
783
 
784
def extract_graph_questions_from_ms(text: str):
    """Read the "GRAPH EXPECTED QUESTIONS" block of an MS transcript.

    Args:
        text: Transcript text.

    Returns:
        dict[str, int]: Question ID -> page number for every line of the form
        ``- Question <id> → Page <n>`` inside the block; empty when the block
        is missing.
    """
    normalized = text.replace("\u00A0", " ").replace("\t", " ")
    section = re.search(r"==== GRAPH EXPECTED QUESTIONS ====\s*(.*?)\s*==== END GRAPH EXPECTED ====",
                        normalized, re.S)
    if section is None:
        return {}

    pages_by_question = {}
    for raw_line in section.group(1).splitlines():
        entry = raw_line.strip()
        if not entry.startswith("- Question"):
            continue
        parsed = re.match(r"- Question\s+([\dA-Za-z.()]+)\s*→\s*Page\s*(\d+)", entry)
        if parsed:
            pages_by_question[parsed.group(1)] = int(parsed.group(2))
    return pages_by_question
800
 
801
def extract_graph_answers_from_as(text: str):
    """Extract graph answers and page numbers from AS transcript.

    Reads the block between ``==== GRAPH FOUND ANSWERS ====`` and
    ``==== END GRAPH FOUND ====``. The separator between the answer ID and
    ``Page`` is optional: the transcription prompt in this file asks for
    ``- Answer <number> Page <number>`` (no arrow), while arrow-separated
    lines (``→`` or ``->``) are accepted as well.

    Args:
        text: Transcript text.

    Returns:
        dict[str, int]: Answer ID -> page number; empty when the block is
        missing or holds no matching line.
    """
    clean_text = text.replace("\u00A0", " ").replace("\t", " ")
    block = re.search(r"==== GRAPH FOUND ANSWERS ====\s*(.*?)\s*==== END GRAPH FOUND ====",
                      clean_text, re.S)
    graph_dict = {}
    if block:
        line_pattern = re.compile(r"- Answer\s+([\dA-Za-z.()]+)\s*(?:→|->)?\s*Page\s*(\d+)")
        for line in block.group(1).splitlines():
            line = line.strip()
            if line.startswith("- Answer"):
                match = line_pattern.match(line)
                if match:
                    ans_id, page = match.groups()
                    graph_dict[ans_id] = int(page)
    return graph_dict
816
 
817
def extract_marks_from_grading(grading_text):
    """
    Collect the mark codes found in the "Awarded" table column of every
    ``### Question`` block of the grading markdown.

    Args:
        grading_text (str): Full grading markdown.

    Returns:
        dict: ``{"grading": [{"question": <id>, "marks_awarded": [codes]}]}``
        with one entry per question block, in document order.
    """
    print("🔎 Extracting awarded marks from grading output...")
    mark_code = re.compile(r"\b([MABCR]\d+|[MABCR]0)\b")
    entries = []

    # Everything before the first "### Question" heading is ignored.
    for block in re.split(r"###\s*Question\s+", grading_text)[1:]:
        block_lines = block.strip().splitlines()
        first_line = block_lines[0].strip() if block_lines else ""

        id_match = re.match(r"([0-9]+(?:[a-zA-Z]|\([^)]+\)|(?:\.[a-zA-Z0-9]+))*)", first_line)
        if id_match:
            q_id = id_match.group(1).strip()
        else:
            # Heading does not start with a number: use its first token.
            q_id = first_line.split()[0] if first_line else ""

        awarded = []
        for line in block.split('\n'):
            if '|' not in line:
                continue
            cells = [cell.strip() for cell in line.split('|')]
            # Need at least five fields; separator rows start with '-'.
            if len(cells) < 5 or cells[1].startswith('-'):
                continue
            # Index 4 is the 4th table column: the leading pipe yields an
            # empty first field.
            awarded.extend(mark_code.findall(cells[4]))

        entries.append({
            "question": q_id,
            "marks_awarded": awarded
        })

    grading_json = {"grading": entries}
    print("✅ Extracted grading marks for", len(grading_json["grading"]), "question blocks.")
    print(json.dumps(grading_json, indent=2))
    return grading_json
853
 
854
def check_and_correct_total_marks(grading_text):
    """
    Verifies the total marks in the Examiner's Summary Report against
    the sum of individual question marks. Corrects if discrepancy found.

    Each "### Question <id> ... **Total: X/Y**" block contributes X to the
    awarded sum and Y to the possible sum. The first "**Total: X/Y**" after the
    "### Examiner's Summary Report" header is treated as the reported overall
    total and is rewritten when it differs from the calculated sums.

    Args:
        grading_text (str): The full grading markdown text

    Returns:
        tuple: (corrected_text, calculated_awarded, calculated_possible, was_corrected)
    """
    print("\n" + "="*60)
    print("🔍 VERIFYING TOTAL MARKS IN SUMMARY REPORT")
    print("="*60)

    question_marks = {}
    calculated_total_awarded = 0
    calculated_total_possible = 0

    # Matches BOTH header formats:
    #   ### Question <1.a>   (with angle brackets)
    #   ### Question 1.a     (without angle brackets)
    # Sub-parts may be dotted (1.a.i) or parenthesised (1(a)(i)).
    # The body between header and total must not cross into the next question
    # header or into the summary report; otherwise a block that lacks its own
    # "**Total:**" line would claim a total that belongs to something else.
    question_block_pattern = re.compile(
        r"### Question\s*<?([0-9]+(?:\.[a-z0-9]+|\([a-z0-9]+\))*)>?\s*"
        r"(?:(?!### Question|### Examiner's Summary Report)[\s\S])*?"
        r"\*\*Total:\s*(\d+)/(\d+)\*\*",
        re.IGNORECASE
    )

    for match in question_block_pattern.finditer(grading_text):
        question_id = match.group(1).strip()
        awarded = int(match.group(2))
        possible = int(match.group(3))
        question_marks[question_id] = {'awarded': awarded, 'possible': possible}
        calculated_total_awarded += awarded
        calculated_total_possible += possible

    print(f"\n📊 Extracted marks from {len(question_marks)} questions:")
    for q_id, marks in question_marks.items():
        print(f" Question {q_id}: {marks['awarded']}/{marks['possible']}")

    print("\n📈 Calculated totals from individual questions:")
    print(f" Awarded: {calculated_total_awarded}")
    print(f" Possible: {calculated_total_possible}")

    # Find the summary report section
    summary_report_start = grading_text.find("### Examiner's Summary Report")
    if summary_report_start == -1:
        print("⚠️ Warning: Could not find '### Examiner's Summary Report' section.")
        return grading_text, calculated_total_awarded, calculated_total_possible, False

    summary_section = grading_text[summary_report_start:]
    summary_total_pattern = re.compile(r"(\*\*Total:\s*)(\d+)/(\d+)(\*\*)")
    summary_match = summary_total_pattern.search(summary_section)

    if not summary_match:
        print("⚠️ Warning: Could not find overall total in summary report.")
        return grading_text, calculated_total_awarded, calculated_total_possible, False

    original_summary_awarded = int(summary_match.group(2))
    original_summary_possible = int(summary_match.group(3))
    print(f"\n📋 Original summary report total: {original_summary_awarded}/{original_summary_possible}")

    # Check for discrepancies
    corrected_report_text = grading_text
    total_mismatch = False

    if calculated_total_awarded != original_summary_awarded:
        print("\n❌ DISCREPANCY FOUND in awarded marks!")
        print(f" Calculated: {calculated_total_awarded}")
        print(f" Reported: {original_summary_awarded}")
        total_mismatch = True

    if calculated_total_possible != original_summary_possible:
        print("\n❌ DISCREPANCY FOUND in possible marks!")
        print(f" Calculated: {calculated_total_possible}")
        print(f" Reported: {original_summary_possible}")
        total_mismatch = True

    if total_mismatch:
        print("\n🔧 CORRECTING summary total:")
        print(f" FROM: {original_summary_awarded}/{original_summary_possible}")
        print(f" TO: {calculated_total_awarded}/{calculated_total_possible}")

        # Correct only in the summary section (first total after the header);
        # per-question totals earlier in the document are left untouched.
        corrected_summary_section = re.sub(
            summary_total_pattern,
            rf"\g<1>{calculated_total_awarded}/{calculated_total_possible}\g<4>",
            summary_section,
            count=1
        )

        corrected_report_text = grading_text[:summary_report_start] + corrected_summary_section
        print("✅ Total marks corrected successfully!")
    else:
        print("\n✅ Total marks are CORRECT - no correction needed!")

    print("="*60 + "\n")

    return corrected_report_text, calculated_total_awarded, calculated_total_possible, total_mismatch
958
 
959
- # ---------------- MAPPING/IMPRINT HELPERS ----------------
960
def ask_gemini_for_mapping_batch(image_paths, grading_json, expected_ids=None, rows=GRID_ROWS, cols=GRID_COLS):
    """
    Send multiple page images together to Gemini for batch mapping processing.

    Args:
        image_paths: Paths of the grid-numbered page images to send, in page order.
        grading_json: Grading dict embedded into the prompt as JSON.
        expected_ids: Optional list of question IDs listed in the prompt as a hint.
        rows: Number of grid rows mentioned in the prompt.
        cols: Number of grid columns mentioned in the prompt.

    Returns:
        list: The JSON array parsed from the model response (entries shaped like
            {"page": ..., "question": ..., "cell_number": ...}), or [] when no
            JSON array can be found or parsed.

    Raises:
        Exception: Whatever the fallback model call raises, if both the primary
            and the fallback request fail.
    """
    ids_block = "{NA}"
    if expected_ids:
        ids_block = "{\n" + "\n".join(expected_ids) + "\n}"

    prompt = f"""You are an exam marker. Your role is to identify where each question begins on each page.
The pages are divided into a {rows} x {cols} grid. Each cell has a RUNNING NUMBER label.
For each question in the grading JSON, return the cell NUMBER where the FIRST STEP of that question begins.
⚠ IMPORTANT RULES:
- Do not place marks inside another question's answer area.
- Prefer placing the marks in a BLANK cell immediately to the RIGHT of the answer step. If no blank cell is available to the right, then place in a blank cell to the LEFT.
- Never place marks above or below the answer.
- Each question should have unique cell number
- If a question serial number is visible in the answer image, you must mandatorily identify the corresponding question using the grading JSON.
IMPORTANT: For your help i have provided u questions that u can expect in the images:
{ids_block}
Return JSON only, like:
[{{"page": 1, "question": "1(a)", "cell_number": 15}}, ...]
Grading JSON:
{json.dumps(grading_json, indent=2)}"""

    images = [Image.open(p) for p in image_paths]

    print(f"📡 Sending batch mapping request for {len(image_paths)} pages to Gemini...")

    try:
        contents = [prompt] + images
        try:
            response = client.models.generate_content(
                model="gemini-2.5-flash",
                contents=contents
            )
            raw_text = response.text
        except Exception as e:
            # Catch Exception (not a bare except) so Ctrl-C / SystemExit still
            # propagate, and report why the primary model was abandoned.
            print(f"⚠️ Primary mapping model failed: {e}")
            print("⚠️ Trying fallback model for mapping...")
            response = client.models.generate_content(
                model="gemini-2.5-flash-preview-09-2025",
                contents=contents
            )
            raw_text = response.text
    finally:
        # Release the file handles held by the opened page images.
        for img in images:
            img.close()

    print("📥 Batch mapping response (chars):", len(raw_text))
    print("🔎 Gemini raw batch output:")
    print(raw_text)

    try:
        # Greedy match from the first '[' to the last ']' so that any text the
        # model adds around the array (e.g. code fences) is ignored.
        match = re.search(r'(\[.*\])', raw_text, re.DOTALL)
        if match:
            mapping = json.loads(match.group(1))
            print(f"✅ Parsed Gemini batch mapping for {len(image_paths)} pages")
            return mapping
        else:
            print("❌ Failed to find JSON array in response")
            return []
    except Exception as e:
        print(f"❌ Failed to parse Gemini JSON mapping: {e}")
        return []
1020
 
1021
def normalize_question_id(qid):
    """
    Normalize question ID to a standard dotted format for matching.

    Converts formats like:
    - "1(a)" -> "1.a"
    - "2(c).i" -> "2.c.i"
    - "1(a)(ii)" -> "1.a.ii"
    - "1 (c)(i)" -> "1.c.i"
    - "3.d.ii" -> "3.d.ii" (already normalized)

    Args:
        qid: Question identifier; non-string values (e.g. an int parsed from
            JSON) are converted with str().

    Returns:
        The normalized ID string, or `qid` unchanged when it is falsy
        (None, "", 0).
    """
    if not qid:
        return qid

    qid = str(qid).strip()

    # Turn every parenthesised part that follows an ID character (or a previous
    # parenthesised part) into a dotted part: "(a)" -> ".a". The lookbehind
    # keeps a leading "(a)" with nothing in front of it untouched.
    return re.sub(r'(?<=[0-9a-zA-Z)])\s*\(\s*([a-zA-Z0-9]+)\s*\)', r'.\1', qid)
1039
 
1040
def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, expected_ids=None, rows=GRID_ROWS, cols=GRID_COLS):
    """
    Convert PDF to images, create grid-numbered images for batch sending to Gemini,
    then annotate and produce imprinted PDF.

    Args:
        pdf_path: Path of the answer-sheet PDF to annotate.
        grading_json: Dict with a "grading" list of
            {"question": ..., "marks_awarded": [...]} entries.
        output_pdf: Path the merged annotated PDF is written to.
        expected_ids: Optional question IDs forwarded to the mapping prompt.
        rows: Number of grid rows drawn on each page.
        cols: Number of grid columns drawn on each page.

    Returns:
        The value returned by compress_pdf(output_pdf).
    """
    print("📄 Converting answer PDF to images for imprinting...")
    pages = convert_from_path(pdf_path, dpi=100)
    annotated_page_paths = []
    temp_grid_images = []

    # The label font does not depend on the page, so load it once.
    try:
        num_font = ImageFont.truetype("arial.ttf", 20)
    except Exception:
        num_font = ImageFont.load_default()

    for p_index, page in enumerate(pages):
        img = page.convert("RGB")
        w, h = img.size
        cell_w, cell_h = w / cols, h / rows

        draw = ImageDraw.Draw(img)

        # Write a running number (1..rows*cols, row-major) centred in each cell.
        cell_num = 1
        for r in range(rows):
            for c in range(cols):
                x = int(c * cell_w + cell_w / 2)
                y = int(r * cell_h + cell_h / 2)
                text = str(cell_num)
                bbox = draw.textbbox((0, 0), text, font=num_font)
                tw = bbox[2] - bbox[0]
                th = bbox[3] - bbox[1]
                draw.text((x - tw/2, y - th/2), text, fill="black", font=num_font)
                cell_num += 1

        temp_path = f"page_{p_index+1}_grid.png"
        img.save(temp_path, "PNG")
        temp_grid_images.append(temp_path)
        print("🛰 Created grid image:", temp_path)

    print("📡 Sending page images to Gemini in batches for mapping...")
    batch_size = 10
    all_mappings = []

    for start in range(0, len(temp_grid_images), batch_size):
        batch_paths = temp_grid_images[start:start+batch_size]
        batch_mapping = ask_gemini_for_mapping_batch(batch_paths, grading_json, expected_ids, rows, cols)
        # Each request only contains this batch's images, so the page numbers in
        # the reply count from 1 within the batch. Shift them to absolute page
        # numbers; otherwise batch 2+ would be drawn onto pages 1-10.
        for entry in batch_mapping:
            if not isinstance(entry, dict):
                continue
            try:
                entry["page"] = int(entry.get("page")) + start
            except (TypeError, ValueError):
                print(f"⚠️ Skipping mapping entry with invalid page: {entry}")
                continue
            all_mappings.append(entry)
        print(f"✅ Processed batch {start//batch_size + 1}: pages {start+1}-{start+len(batch_paths)}")

    grading_entries = grading_json.get("grading", [])

    print("🖊 Annotating pages with marks...")
    for p_index, page in enumerate(pages):
        page_num = p_index + 1
        page_img = page.convert("RGB")
        img_cv = np.array(page_img)
        img_cv = cv2.cvtColor(img_cv, cv2.COLOR_RGB2BGR)
        h, w, _ = img_cv.shape
        cell_w_px, cell_h_px = w / cols, h / rows

        page_mappings = [m for m in all_mappings if m.get("page") == page_num]

        for item in page_mappings:
            qid = item.get("question")
            cell_number = item.get("cell_number")
            if qid is None or cell_number is None:
                continue

            # The model may return numbers as strings (or IDs as numbers).
            qid = str(qid)
            try:
                cell_number = int(cell_number)
            except (TypeError, ValueError):
                print(f"⚠️ Skipping question '{qid}' on page {page_num}: invalid cell number {cell_number!r}")
                continue
            if not 1 <= cell_number <= rows * cols:
                print(f"⚠️ Skipping question '{qid}' on page {page_num}: cell {cell_number} is outside the grid")
                continue

            # Normalize the question ID from Gemini mapping
            normalized_qid = normalize_question_id(qid)

            # Try exact match first with normalized ID
            marks_list = next((g["marks_awarded"] for g in grading_entries
                               if g["question"] == normalized_qid), [])

            # If no match, try case-insensitive match
            if not marks_list:
                marks_list = next((g["marks_awarded"] for g in grading_entries
                                   if g["question"].lower() == normalized_qid.lower()), [])

            # If still no match, try with original qid
            if not marks_list:
                marks_list = next((g["marks_awarded"] for g in grading_entries
                                   if g["question"] == qid), [])

            marks_text = ",".join(marks_list) if marks_list else "?"

            if marks_text == "?":
                print(f"⚠️ No marks found for question '{qid}' (normalized: '{normalized_qid}') on page {page_num}")

            # Convert the 1-based running cell number back to grid coordinates.
            row = (cell_number - 1) // cols
            col = (cell_number - 1) % cols

            # Anchor the text in the right-hand quarter of the cell, vertically centred.
            x_c = int((col + 1) * cell_w_px - cell_w_px / 4)
            y_c = int((row + 0.5) * cell_h_px)

            font_scale = max(1.0, min(2.0, cell_h_px / 40.0))
            thickness = max(2, int(font_scale * 2))
            # (0, 0, 255) is red in OpenCV's BGR channel order.
            cv2.putText(img_cv, marks_text, (x_c, y_c), cv2.FONT_HERSHEY_SIMPLEX,
                        font_scale, (0, 0, 255), thickness, cv2.LINE_AA)
            print(f"🖊 Marks annotated for page {page_num}, question {qid}: {marks_text}")

        annotated_path = f"annotated_page_{page_num}.png"
        cv2.imwrite(annotated_path, img_cv)
        annotated_page_paths.append(annotated_path)
        print("✅ Annotated page saved:", annotated_path)

    print("📑 Merging annotated pages into final PDF...")
    with open(output_pdf, "wb") as f:
        f.write(img2pdf.convert(annotated_page_paths))

    compressed = compress_pdf(output_pdf)
    print("📑 Imprinted PDF saved to:", compressed)
    return compressed
1151
 
1152
def extract_pdf_pages_as_images(pdf_path, page_numbers, prefix):
    """
    Extracts unique pages (1-based) from a PDF as images, saves as PNG, returns list of file paths.
    Handles cases where requested pages don't exist in the PDF.

    Args:
        pdf_path: Path of the PDF to read.
        page_numbers: Iterable of 1-based page numbers; duplicates are ignored.
        prefix: Filename prefix; each page is saved as "<prefix>_page_<n>.png".

    Returns:
        list: Paths of the PNG files written, in ascending page order. Empty when
            no page numbers were given or no page could be rendered.
    """
    if not page_numbers:
        print("⚠️ No page numbers provided for extraction")
        return []

    unique_pages = sorted(set(page_numbers))

    # First, get the total page count to validate requested pages
    try:
        from PyPDF2 import PdfReader
        reader = PdfReader(pdf_path)
        total_pages = len(reader.pages)
        print(f"📄 PDF has {total_pages} total pages")

        # Filter out invalid page numbers
        valid_pages = [p for p in unique_pages if 1 <= p <= total_pages]
        invalid_pages = [p for p in unique_pages if not 1 <= p <= total_pages]

        if invalid_pages:
            print(f"⚠️ Skipping invalid page numbers (out of range): {invalid_pages}")

        if not valid_pages:
            print(f"❌ No valid pages to extract from {pdf_path}")
            return []

        unique_pages = valid_pages
    except Exception as e:
        # Validation is best-effort: fall through and let the renderer decide.
        print(f"⚠️ Could not validate page numbers: {e}. Proceeding with extraction...")

    # Render only the requested pages, one at a time, instead of the whole
    # min..max span: scattered page numbers would otherwise force every page in
    # between to be rasterised at 200 dpi and held in memory.
    out_paths = []
    for page_num in unique_pages:
        try:
            rendered = convert_from_path(pdf_path, dpi=200, first_page=page_num, last_page=page_num)
        except Exception as e:
            print(f"❌ Failed to convert PDF pages to images: {e}")
            continue

        if not rendered:
            print(f"⚠️ Page {page_num} not found in extracted images. Skipping...")
            continue

        try:
            out_path = f"{prefix}_page_{page_num}.png"
            rendered[0].save(out_path, "PNG")
            print(f"📤 Extracted graph page {page_num} from {pdf_path} as {out_path}")
            out_paths.append(out_path)
        except Exception as e:
            print(f"❌ Failed to save page {page_num}: {e}")
            continue

    return out_paths
1212
 
1213
- # ---------------- PIPELINE ----------------
1214
def align_and_grade_pipeline(qp_path, ms_path, ans_path, subject="Maths", imprint=False, run_timestamp=None):
    """
    Final pipeline with graph-aware grading logic using NEW SDK.

    Steps, in order: compress the three PDFs, merge QP + MS, upload to Gemini,
    transcribe QP+MS, transcribe the answer sheet, grade, verify/correct the
    summary total, render the grading PDF, optionally imprint marks on the
    answer sheet, and upload the outputs to Supabase when a client is configured.
    Intermediate results are written to debug_* files in the working directory.

    Args:
        qp_path: Path to Question Paper PDF
        ms_path: Path to Markscheme PDF
        ans_path: Path to Answer Sheet PDF
        subject: Subject name; lower-cased and passed to get_grading_prompt
        imprint: Whether to generate imprinted PDF
        run_timestamp: Unix timestamp for organizing files in Supabase

    Returns:
        tuple: (qpms_text, as_text, grading_text, grading_pdf_path,
            imprinted_pdf_path, output_urls). On any exception the tuple is
            ("❌ Error: <message>", None, None, None, None, {}) instead.
    """
    try:
        print("🔁 Starting pipeline...")
        qp_path = compress_pdf(qp_path)
        ms_path = compress_pdf(ms_path)
        ans_path = compress_pdf(ans_path)

        # QP and MS are sent to the model as one document (questions first).
        merged_qpms_path = os.path.splitext(qp_path)[0] + "_merged_qp_ms.pdf"
        merge_pdfs([qp_path, ms_path], merged_qpms_path)
        print("📎 Merged QP + MS ->", merged_qpms_path)

        print("🔼 Uploading files to Gemini...")
        merged_uploaded = upload_to_gemini(merged_qpms_path)
        ans_uploaded = upload_to_gemini(ans_path)
        print("✅ Upload complete.")

        print("1.i) Transcribing QP+MS (questions first, then full markscheme, with graph detection)...")
        qpms_prompt = QP_MS_TRANSCRIPTION_PROMPT["content"] + "\nAt the end, also list all questions in the markscheme where a graph is expected, in the format:\nGraph expected in:\n- Question <number> → Page <number>\n(One per line, after ==== MARKSCHEME END ====)"
        qpms_text = gemini_generate_content(qpms_prompt, file_upload_obj=merged_uploaded, model_name="gemini-2.5-flash", fallback_model="gemini-2.5-flash-preview-09-2025")
        print("📄 QP+MS transcription received. Saving debug file: debug_qpms_transcript.txt")
        with open("debug_qpms_transcript.txt", "w", encoding="utf-8") as f:
            f.write(qpms_text)

        # Pages flagged as containing expected graphs are rendered to images so
        # they can be attached to the grading request.
        ms_graph_mapping = extract_graph_questions_from_ms(qpms_text)
        print("🖼️ Graph-expected questions in MS:", ms_graph_mapping)
        ms_graph_pages = list(ms_graph_mapping.values())
        ms_graph_images = []
        if ms_graph_pages:
            ms_graph_images = extract_pdf_pages_as_images(merged_qpms_path, ms_graph_pages, prefix="qpms_graph")

        extracted_ids = extract_question_ids_from_qpms(qpms_text)
        if not extracted_ids:
            # Placeholder so the prompts built from the ID list are never empty.
            extracted_ids = ["NA"]

        print("1.ii) Building AS transcription prompt with expected question IDs and graph detection, sending to Gemini...")
        as_prompt = build_as_cot_prompt_with_expected_ids(extracted_ids, qpms_text) + "\nAt the end, also list all answers where a graph is found, in the format:\nGraph found in:\n- Answer <number> → Page <number>\n(One per line, after all answers)"
        as_text = gemini_generate_content(as_prompt, file_upload_obj=ans_uploaded, model_name="gemini-2.5-flash", fallback_model="gemini-2.5-flash-preview-09-2025")
        print("📝 AS transcription received. Saving debug file: debug_as_transcript.txt")
        with open("debug_as_transcript.txt", "w", encoding="utf-8") as f:
            f.write(as_text)

        as_graph_mapping = extract_graph_answers_from_as(as_text)
        print("🖼️ Graph-attempted answers in AS:", as_graph_mapping)
        as_graph_pages = list(as_graph_mapping.values())
        as_graph_images = []
        if as_graph_pages:
            as_graph_images = extract_pdf_pages_as_images(ans_path, as_graph_pages, prefix="as_graph")

        print("2) Preparing grading input and sending to Gemini for grading...")
        # Both transcripts are wrapped in explicit BEGIN/END markers.
        grading_input = (
            "=== QP+MS TRANSCRIPT BEGIN ===\n"
            + qpms_text
            + "\n=== QP+MS TRANSCRIPT END ===\n\n"
            + "=== ANSWER SHEET TRANSCRIPT BEGIN ===\n"
            + as_text
            + "\n=== ANSWER SHEET TRANSCRIPT END ===\n"
        )
        if ms_graph_images or as_graph_images:
            graph_note = "\n\n---\nSome questions require graphs. I've attached the relevant graph pages from QP+MS and from the Answer Sheet. Use them as visual context when grading.\n---\n"
            grading_input += graph_note
        grading_prompt_obj = get_grading_prompt(subject.lower())
        grading_prompt_system = grading_prompt_obj["content"]
        grading_images = ms_graph_images + as_graph_images
        grading_text = gemini_generate_content(grading_prompt_system + "\n\nPlease grade the following transcripts:\n" + grading_input, image_obj=grading_images if grading_images else None, model_name="gemini-2.5-pro", fallback_model="gemini-2.5-flash")
        print("🧾 Grading output received. Saving debug file: debug_grading.md")
        with open("debug_grading.md", "w", encoding="utf-8") as f:
            f.write(grading_text)

        # Verify and correct total marks if needed
        grading_text, calc_awarded, calc_possible, was_corrected = check_and_correct_total_marks(grading_text)

        if was_corrected:
            print("📝 Saving corrected grading to debug file: debug_grading_corrected.md")
            with open("debug_grading_corrected.md", "w", encoding="utf-8") as f:
                f.write(grading_text)

        # Output filenames are derived from the (compressed) answer-sheet filename.
        base_name = os.path.splitext(os.path.basename(ans_path))[0]
        grading_pdf_path = save_as_pdf(grading_text, f"{base_name}_graded.pdf")
        print("📄 Grading PDF saved:", grading_pdf_path)

        grading_json = extract_marks_from_grading(grading_text)
        with open("debug_grading_json.json", "w", encoding="utf-8") as f:
            json.dump(grading_json, f, indent=2, ensure_ascii=False)
        print("🔧 Grading marks extraction complete.")

        imprinted_pdf_path = None
        if imprint:
            print("✍ Imprint option enabled. Starting imprinting process...")
            imprinted_pdf_path = f"{base_name}_imprinted.pdf"
            imprinted_pdf_path = imprint_marks_using_mapping(ans_path, grading_json, imprinted_pdf_path, extracted_ids)
            print("✅ Imprinting finished. Imprinted PDF at:", imprinted_pdf_path)

        # Upload output files to Supabase (using same timestamp as input files)
        output_urls = {
            "graded_pdf_url": None,
            "imprinted_pdf_url": None
        }

        if supabase_client:
            print("\n📤 Uploading output files to Supabase...")
            if grading_pdf_path:
                output_urls["graded_pdf_url"] = upload_file_to_supabase(grading_pdf_path, "graded", run_timestamp)
            if imprinted_pdf_path:
                output_urls["imprinted_pdf_url"] = upload_file_to_supabase(imprinted_pdf_path, "imprinted", run_timestamp)

        print("🏁 Pipeline finished successfully.")
        return qpms_text, as_text, grading_text, grading_pdf_path, imprinted_pdf_path, output_urls

    except Exception as e:
        # Top-level boundary: report the failure in the first slot of the result
        # tuple instead of raising, keeping the 6-element shape callers unpack.
        print("❌ Pipeline error:", e)
        import traceback
        traceback.print_exc()
        return f"❌ Error: {e}", None, None, None, None, {}
1338
-
1339
- # ---------------- GRADIO UI ----------------
1340
with gr.Blocks(title="AI Grading (Pandoc + pdflatex)") as demo:
    gr.Markdown("## 📘 AI Grading — Using Pandoc + pdflatex for PDF Generation")
    gr.Markdown("**✅ Now using Pandoc with pdflatex for professional-quality PDF outputs!**")

    # Tell the user up front whether files will also go to cloud storage.
    gr.Markdown(
        "**☁️ Supabase Storage: Enabled** - All files will be uploaded to cloud storage"
        if supabase_client
        else "**⚠️ Supabase Storage: Disabled** - Files will only be processed locally"
    )

    with gr.Row():
        qp_file = gr.File(label="📄 Upload Question Paper (PDF)")
        ms_file = gr.File(label="📄 Upload Markscheme (PDF)")
        ans_file = gr.File(label="📝 Upload Student Answer Sheet (PDF)")

    with gr.Row():
        subject_dropdown = gr.Dropdown(
            choices=["Maths", "Science", "Economics"],
            value="Maths",
            label="📚 Subject",
            info="Select the subject to apply appropriate grading guidelines"
        )
        imprint_toggle = gr.Checkbox(label="✍ Imprint Marks on Student Answer Sheet", value=False)

    run_button = gr.Button("🚀 Run Pipeline")

    # File URLs section (only shown if Supabase is enabled)
    if supabase_client:
        with gr.Accordion("☁️ Uploaded File URLs", open=False):
            file_urls_box = gr.Textbox(label="Cloud Storage URLs", lines=8, interactive=False)

    with gr.Row():
        qpms_box = gr.Textbox(label="📑 QP+MS Transcript", lines=12)
        as_box = gr.Textbox(label="📝 AS Transcript", lines=12)

    grading_output_box = gr.Textbox(label="🧾 Grading (Markdown)", lines=20)
    grading_pdf_file = gr.File(label="📥 Download Grading PDF")
    imprint_pdf_file = gr.File(label="📥 Download Imprinted PDF (Optional)")

    def run_pipeline(qp_file_obj, ms_file_obj, ans_file_obj, subject_choice, imprint_flag):
        """
        Click handler: upload the inputs, run the grading pipeline, build the UI outputs.

        Returns a 5-tuple (QP+MS transcript, AS transcript, grading markdown,
        grading PDF path, imprinted PDF path); when Supabase is enabled a sixth
        element with the URL summary text is appended.
        """
        if not (qp_file_obj and ms_file_obj and ans_file_obj):
            failure = ("❌ Please upload all three files", "", "", None, None)
            return failure + ("",) if supabase_client else failure

        # Process and upload input files (generates shared timestamp)
        qp_path, ms_path, ans_path, input_urls, run_timestamp = process_and_upload_input_files(
            qp_file_obj, ms_file_obj, ans_file_obj
        )

        # Run the grading pipeline (pass timestamp to keep all files together)
        (qpms_text, as_text, grading_text,
         grading_pdf_path, imprinted_pdf_path, output_urls) = align_and_grade_pipeline(
            qp_path, ms_path, ans_path,
            subject=subject_choice, imprint=imprint_flag, run_timestamp=run_timestamp
        )

        # Build URLs summary (stays empty when Supabase is disabled)
        summary_parts = []
        if supabase_client:
            summary_parts.append(f"📤 UPLOADED FILES (Timestamp: {run_timestamp}):\n\n")
            summary_parts.append("INPUT FILES:\n")
            if input_urls.get("qp_url"):
                summary_parts.append(f"• Question Paper: {input_urls['qp_url']}\n")
            if input_urls.get("ms_url"):
                summary_parts.append(f"• Markscheme: {input_urls['ms_url']}\n")
            if input_urls.get("ans_url"):
                summary_parts.append(f"• Answer Sheet: {input_urls['ans_url']}\n")

            summary_parts.append("\nOUTPUT FILES:\n")
            if output_urls.get("graded_pdf_url"):
                summary_parts.append(f"• Graded PDF: {output_urls['graded_pdf_url']}\n")
            if output_urls.get("imprinted_pdf_url"):
                summary_parts.append(f"• Imprinted PDF: {output_urls['imprinted_pdf_url']}\n")

            summary_parts.append(f"\n📁 All files stored in: examfiles/{run_timestamp}/\n")

            nothing_uploaded = not any(input_urls.values()) and not any(output_urls.values())
            if nothing_uploaded:
                summary_parts.append("\n⚠️ No files were uploaded to Supabase")
        urls_summary = "".join(summary_parts)

        results = (
            qpms_text or "",
            as_text or "",
            grading_text or "",
            grading_pdf_path,
            imprinted_pdf_path,
        )
        return results + (urls_summary,) if supabase_client else results

    # Set up the click handler; the URL box is only an output when Supabase is enabled
    output_components = [qpms_box, as_box, grading_output_box, grading_pdf_file, imprint_pdf_file]
    if supabase_client:
        output_components.append(file_urls_box)

    run_button.click(
        fn=run_pipeline,
        inputs=[qp_file, ms_file, ans_file, subject_dropdown, imprint_toggle],
        outputs=output_components
    )

if __name__ == "__main__":
    demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ Prompts for AI Grading System
3
+ Contains all system prompts for transcription and grading
4
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
+ # ---------------- TRANSCRIPTION PROMPTS ----------------
7
# Prompt for transcribing a single PDF that holds the Question Paper followed by
# the Markscheme. "content" is the instruction text; it fixes the section
# markers the model must emit (==== QUESTIONS BEGIN ====, --QUESTION-END--,
# ==== MARKSCHEME BEGIN/END ====, ==== GRAPH EXPECTED QUESTIONS ====).
# NOTE(review): code that parses the transcript presumably depends on these
# exact markers — confirm before rewording the prompt.
QP_MS_TRANSCRIPTION_PROMPT = {
    "role": "system",
    "content": """You are a high-quality OCR/Transcription assistant.
INPUT: This file is a PDF that first contains the Question Paper and immediately after it the Markscheme.
TASK:
1. Transcribe EXACTLY all the questions FIRST (with their total marks).
2. After ALL questions, transcribe the Markscheme exactly, preserving M/A/R notation in brackets.
3. Always number the questions sequentially (Question 1, Question 2, Question 3, …) **in the order they appear in the PDF**, even if the PDF shows a different number or leaves it blank. Do NOT skip or leave Question: blank. Never start a question other than question 1 (even if it is labelled in pdf as 8 name it 1).
4. If a question or sub-question is labelled with a letter (e.g., "Q1.a", "Q2(b)", "1 (c)(i)"), transcribe it as "Question 1.a", "Question 2.b", "Question 1.c.i" etc., exactly preserving the hierarchy of sub-question identifiers.
5. After the markscheme, DETECT and FLAG all questions in the markscheme where a graph/diagram is expected. For each, output the question number and the page number in the format below.
FORMAT:
==== PAPER TOTAL MARKS ====
<total marks>
==== QUESTIONS BEGIN ====
Question 1.a
Total Marks: <number>
QP: <question text>
--QUESTION-END--
Question 1.b
Total Marks: <number>
QP: <question text>
--QUESTION-END--
Question 2
Total Marks: <number>
QP: <question text>
--QUESTION-END--
(repeat for all questions in order of appearance)
==== QUESTIONS END ====
==== MARKSCHEME BEGIN ====
Answer 1.a:
<exact MS for Q1.a with notations M1, A1, R1 etc>
Answer 1.b:
<exact MS for Q1.b with notations>
Answer 2 :
<exact MS for Q2 with notations>
(repeat for all answers)
==== MARKSCHEME END ====
==== GRAPH EXPECTED QUESTIONS ====
Graph expected in:
- Question <number> → Page <number>
(one per line)
==== END GRAPH EXPECTED ====
"""
}
51
+
52
+ # ---------------- GRADING PROMPTS ----------------
53
+
54
+ # Common grading rules for all subjects
55
+ COMMON_GRADING_RULES = """You are an official examiner. Apply the following grading rules precisely and consistently.
56
+ ### Mark Abbreviations:
57
+ - **M**: Method marks – awarded for correct mathematical procedures, approaches, or techniques
58
+ - **A**: Accuracy/Answer marks – awarded for correct final or intermediate answers
59
+ - **R**: Reasoning marks – awarded for justifications, explanations, or logical deductions
60
+ - **AG**: Answer Given – the answer is provided in the question; award no marks for simply stating it
61
+ - **FT**: Follow Through – marks awarded when a student correctly applies a method using their own previous (incorrect) answer
62
+ - **MR**: Misread – penalty applied when student misreads a value from the question (deduct from first applicable A-mark only, once per question)
63
+ ---
64
+ ## Grading Rules
65
+ ### Core Principles:
66
+ 1. **Award marks using official annotations** (e.g., M1, A2, R1).
67
+ 2. **Do not award full marks for answers alone** – check that the required method steps are present.
68
+ 3. **A-marks typically depend on M-marks** – an A-mark usually requires the corresponding M-mark to be earned first (unless the markscheme explicitly states otherwise).
69
+ 4. **Accept equivalent forms** unless the markscheme specifies exact form (e.g., "simplified form only").
70
+ 5. **Apply Follow Through (FT)** when a student uses an incorrect answer correctly in subsequent steps.
71
+ 6. **Misread (MR) Penalty**: If a student misreads a numerical value from the question:
72
+ - Deduct from the **first applicable A-mark** in that question only
73
+ - Apply MR penalty **once per question** (not per sub-question)
74
+ - M-marks can still be awarded if the method is correct
75
+ - Annotate as: `\\textcolor{red}{A0 (MR applied)}`
76
+ ### Formatting & LaTeX Constraints (CRITICAL):
77
+ - **Red Text**: Use LaTeX syntax for lost marks or errors. Do NOT use HTML.
78
+ - Correct: `\\textcolor{red}{M0}`
79
+ - Incorrect: `<span style="color:red">M0</span>`
80
+ - **Math Delimiters**: Ensure ALL mathematical expressions, variables, and numbers are enclosed in single dollar signs.
81
+ - Correct: `$x^2 + y^2 = 4$`
82
+ - Incorrect: x^2 + y^2 = 4
83
+ - **Table Integrity**: Ensure table cells contain NO line breaks. Keep descriptions concise on a single line.
84
+ - **Highlighting**:
85
+ - In the "Awarded" column, if a mark is 0 or lost, format it as `\\textcolor{red}{M0}` or `\\textcolor{red}{A0}`.
86
+ - In the "Examiner Notes", if referring to a specific error, you may wrap it in `\\textcolor{red}{...}`.
87
+ ### Graph/Diagram Questions:
88
+ - When graph/diagram images are provided, describe visual evidence in the "Examiner Notes" column
89
+ - Examples: "Correct parabola shape, y-intercept matches", "Line has wrong gradient", "Asymptote missing"
90
+ ---
91
+ ## Output Format
92
+ Produce the following structure for each question/sub-question:
93
+ ### Question <1.a>
94
+ **Markscheme vs Student Answer**
95
+ | Mark ID | Markscheme Expectation | Student's Response | Awarded | Examiner Notes |
96
+ |---------|------------------------|-------------------|---------|----------------|
97
+ | M1 | Use product rule: $u'v + uv'$ | Student wrote: $u'v + uv'$ | M1 | Correct method applied |
98
+ | A1 | $2xe^x + e^x$ | Student answer: $x e^x$ | \\textcolor{red}{A0} | Missing the factor of 2 |
99
+ **Total: X/Y**
100
+ ---
101
+ *(Repeat for all questions)*
102
+ ---
103
+ ### Examiner's Summary Report
104
+ **IMPORTANT**: Group all sub-questions under their parent question. Sum the marks for all sub-parts (e.g., 1.a, 1.b, 1.c) and report as a single entry for Question 1.
105
+ **Format Rules for Summary Report**:
106
+ - If a question has sub-parts (1.a, 1.b, etc.), group them as "Question 1" with combined marks
107
+ - If a question has no sub-parts (just "Question 2"), report it directly
108
+ - Assign ONE overall remark per grouped question based on the predominant error type across all sub-parts
109
+ - **CRITICAL**: If a student writes "NA", "N/A", "Not Applicable", or similar for a question, assign remark **E** and award 0 marks. Only when remark **E** is used are the question's marks excluded from the adjusted total; questions with any other remark (including **D**) still count toward the total.
110
+ - **CRITICAL**: Calculate adjusted total by excluding marks from questions with remark **E** (NA questions)
111
+ - Example: If paper total is 63 marks, but Question 8 (6 marks) is marked NA by student:
112
+ - Adjusted total = 63 - 6 = 57 marks
113
+ - Report as: **Total: <obtained_marks>/<adjusted_max_marks>** (e.g., "Total: 45/57", not "Total: 45/63")
114
+ | Question Number | Marks | Remark | Feedback |
115
+ |-----------------|-------|--------|----------|
116
+ | 1 | 10/12 | A | Strong answer, only minor mistake |
117
+ | 2 | 0/8 | E | Student wrote "NA" - question not applicable |
118
+ | 3 | 7/10 | C | Adequate, but lacked depth/clarity |
119
+ | ... | ... | ... | ... |
120
+ **Total: <obtained_marks>/<adjusted_max_marks>**
121
+ ---
122
+ ## Remark Codes (assign ONE per grouped question):
123
+ - **A**: All Good – mostly full marks across sub-parts, no major errors
124
+ - **B**: Silly Mistake – minor arithmetic/algebraic slips (e.g., $2 + 3 = 6$, sign error in final step)
125
+ - **C**: Conceptual Error – wrong formula, incorrect method, fundamental misunderstanding in one or more sub-parts
126
+ - **D**: Hard Question – assigned when the student leaves the question blank, crosses it out, or makes no meaningful attempt
127
+ - **E**: Not Applicable – assigned only when the student explicitly marks the question as "Not Applicable" (NA)
128
+
129
+ 3. **Graph images** (if applicable) for questions involving diagrams
130
+
131
+ - Match student answers to question IDs from the QP+MS transcript.
132
+ - Grade according to the **verbatim markscheme**, but accept mathematically/conceptually equivalent answers (justify in "Examiner Notes").
133
+ - For graph questions, use provided images as visual context and describe what you observe.
134
+ - Ensure mark IDs in your grading table match those in the markscheme.
135
+ - Be consistent: if a student makes the same type of error multiple times, apply the same penalty logic each time.
136
+ """
137
 
138
+ # Science-specific grading guidelines (from Cambridge IGCSE Mark Scheme)
139
+ SCIENCE_SPECIFIC_GUIDELINES = """
 
140
 
141
+ ## Acronyms and Shorthand
142
 
143
+ | Acronym / shorthand | Explanation |
144
+ |--------------------|-------------|
145
+ | **A mark** | Final answer mark for a fully correct answer including the unit. |
146
+ | **C mark** | Compensatory mark that may be awarded for correct working when the final A mark is not scored. |
147
+ | **B mark** | Independent mark not dependent on other marks. |
148
+ | **M mark** | Method mark that must be scored before any linked A mark. |
149
+ | **( ) Brackets** | Words not required; contradicting bracketed content negates the mark. |
150
+ | **Underlining** | Underlined word or correct synonym must appear; exact word needed for technical terms. |
151
+ | **/** or **OR** | Any listed alternative gains credit. |
152
+ | **owtte** | Or words to that effect. |
153
+ | **ignore** | Incorrect/irrelevant point disregarded and not treated as contradictory. |
154
+ | **insufficient** | Not worthy of credit on its own. |
155
+ | **CON** | Contradicts a correct point; mark not awarded. |
156
+ | **ecf [part]** | Error carried forward if used correctly in later steps. |
157
+ | **cao** | Correct answer only. |
 
158
 
159
+ ---
160
 
161
+ # Science-Specific Marking Rules (Condensed)
162
 
163
+ 1. **Keyword Use**
164
+ Credit awarded only when keywords are used in correct scientific context.
165
 
166
+ 2. **Contradictions**
167
+ Contradicted points receive no credit.
168
+ Irrelevant wrong science is ignored.
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
+ 3. **Spelling**
171
+ Must clearly distinguish between similar syllabus terms (e.g. ethane/ethene, glucagon/glycogen).
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
+ 4. **Error Carried Forward (ECF)**
174
+ Incorrect earlier values may receive later credit if used logically and scientifically correctly.
 
 
 
 
175
 
176
+ 5. **List Rule**
177
+ - Treat responses as continuous prose.
178
+ - Incorrect responses count toward required number; “ignore” items do not.
179
+ - Contradictory responses cancel credit.
180
+ - Extra responses beyond the required number may be ignored if scientifically wrong.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
 
182
+ 6. **Calculation Guidance**
183
+ - Full credit for correct answers even without working unless “show working” is required.
184
+ - Accept values that round correctly to expected significant figures.
185
+ - Standard-form coefficient flexibility allowed if convertible.
186
+ - Missing/incorrect units usually invalidate the final calculation mark unless separately credited.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
 
188
+ 7. **Chemical-Equation Guidance**
189
+ - Accept multiples/fractions of coefficients unless stated otherwise.
190
+ - Ignore state symbols unless required.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
 
193
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
+ # Maths grading prompt
196
+ MATHS_GRADING_PROMPT = {
197
+ "role": "system",
198
+ "content": COMMON_GRADING_RULES
199
+ }
 
 
 
 
200
 
201
+ # Science grading prompt (includes science-specific guidelines)
202
+ SCIENCE_GRADING_PROMPT = {
203
+ "role": "system",
204
+ "content": COMMON_GRADING_RULES + SCIENCE_SPECIFIC_GUIDELINES
205
+ }
206
 
207
+ # Economics-specific grading guidelines
208
+ ECONOMICS_SPECIFIC_GUIDELINES = """
 
 
 
209
 
210
+ ## Economics Answering & Marking Guidelines
 
 
 
 
 
 
 
 
 
 
211
 
212
+ ### Core Principles:
213
+ 1. **Use correct economic concepts**: Credit answers only when terms (e.g., opportunity cost, demand, inflation) are used accurately and in context.
 
 
214
 
215
+ 2. **Reward developed reasoning, not lists**: A point must show cause → effect (e.g., "higher demand → higher price → higher output"). Lists without explanation earn limited credit.
 
 
 
 
 
 
 
 
216
 
217
+ 3. **Both sides needed for 'Discuss'**: Award high marks only when the answer presents advantages and disadvantages with economic reasoning.
 
 
 
 
 
 
 
218
 
219
+ 4. **Apply the list rule**: For "State two…", only the first two non-contradictory, relevant points count.
 
 
 
 
 
 
220
 
221
+ 5. **Diagram marks must match requirements**: Diagrams must include:
222
+ - Correctly labelled axes
223
+ - Labelled curves
224
+ - Correct shifts/movements
225
+ - Equilibrium points
 
 
 
 
 
 
 
 
 
 
 
226
 
227
+ 6. **Do not credit contradictory statements**: If an answer contradicts itself, remove credit for that point.
 
 
 
228
 
229
+ 7. **Allow valid alternative economics**: If the logic is correct and consistent with economic theory, accept it even if wording differs from the markscheme.
 
230
 
231
+ ### Example Marking Standards:
 
232
 
233
+ **Explain question example:**
234
+ Question: Explain why a fall in income may reduce the demand for new cars. (2 marks)
 
 
 
235
 
236
+ - **Good answer (full marks)**: A fall in income reduces consumers' purchasing power (1), making new cars less affordable, so quantity demanded decreases (1).
237
+ - **Weak answer**: "People will buy fewer cars." (No reasoning → 0–1 mark.)
 
 
238
 
239
+ ### Economics-Specific Mark Types:
240
+ - **Knowledge marks**: For correct identification of economic concepts
241
+ - **Application marks**: For applying economic theory to specific contexts
242
+ - **Analysis marks**: For explaining economic relationships and cause-effect chains
243
+ - **Evaluation marks**: For weighing up arguments, considering limitations, making judgments
244
 
245
+ """
 
 
246
 
247
+ # Economics grading prompt
248
+ ECONOMICS_GRADING_PROMPT = {
249
+ "role": "system",
250
+ "content": COMMON_GRADING_RULES + ECONOMICS_SPECIFIC_GUIDELINES
251
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
 
253
+ # Function to get the appropriate grading prompt based on subject
254
+ def get_grading_prompt(subject="maths"):
255
  """
256
+ Get the appropriate grading prompt based on the subject.
257
 
258
  Args:
259
+ subject (str): One of "maths", "science", or "economics" (case-insensitive); any other value falls back to the maths prompt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
 
261
+ Returns:
262
+ dict: The grading prompt dictionary
263
+ """
264
+ subject = subject.lower()
265
+ if subject == "science":
266
+ return SCIENCE_GRADING_PROMPT
267
+ elif subject == "economics":
268
+ return ECONOMICS_GRADING_PROMPT
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  else:
270
+ return MATHS_GRADING_PROMPT