File size: 39,074 Bytes
419e7c7
6941b48
 
 
 
 
 
 
419e7c7
6941b48
 
 
 
 
 
2b866da
6941b48
419e7c7
 
6941b48
b248fb0
6941b48
f999bc3
 
 
 
6941b48
 
 
 
618ee94
f999bc3
618ee94
 
6941b48
 
 
618ee94
6941b48
618ee94
 
 
 
 
 
6941b48
 
 
618ee94
 
6941b48
 
 
618ee94
6941b48
618ee94
6941b48
618ee94
6941b48
618ee94
 
 
 
 
 
6941b48
 
618ee94
6941b48
618ee94
6941b48
618ee94
 
 
 
 
 
f999bc3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f92014f
f999bc3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f92014f
f999bc3
 
 
 
d146b76
 
 
 
 
 
 
f92014f
f999bc3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
618ee94
f999bc3
b248fb0
6941b48
 
 
 
 
 
b248fb0
6941b48
 
 
 
b248fb0
6941b48
 
 
 
b248fb0
6941b48
 
 
b248fb0
6941b48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b248fb0
2b866da
 
419e7c7
2b866da
 
 
419e7c7
 
 
 
 
 
 
 
 
 
 
 
2b866da
 
 
 
 
6941b48
 
 
 
 
 
 
 
 
b248fb0
f1f0ca1
b248fb0
419e7c7
b248fb0
 
419e7c7
 
6941b48
419e7c7
 
6941b48
2dd3b2b
 
 
 
419e7c7
2dd3b2b
419e7c7
2dd3b2b
 
 
419e7c7
2dd3b2b
419e7c7
 
6941b48
419e7c7
 
 
 
 
 
 
 
 
 
 
 
f1f0ca1
419e7c7
 
f1f0ca1
419e7c7
 
 
 
 
 
 
 
b248fb0
6941b48
27ea33f
2b866da
27ea33f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b248fb0
bf851f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9b0a372
a5a195e
9b0a372
 
 
bf851f8
a5a195e
 
 
 
 
9b0a372
bf851f8
 
 
 
 
 
 
 
 
a5a195e
9b0a372
 
 
 
bf851f8
9b0a372
 
 
 
 
bf851f8
 
9b0a372
 
 
 
 
 
 
 
 
 
bf851f8
9b0a372
bf851f8
9b0a372
bf851f8
9b0a372
 
 
 
 
a5a195e
9b0a372
a5a195e
 
 
 
9b0a372
a5a195e
9b0a372
 
 
 
 
 
 
 
 
 
 
 
a5a195e
 
bf851f8
5268488
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a5a195e
5268488
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b248fb0
6941b48
b248fb0
2b866da
b248fb0
6941b48
 
b248fb0
6941b48
 
 
b248fb0
6941b48
 
 
 
 
 
 
 
 
 
 
 
b248fb0
6941b48
419e7c7
559b8c9
 
 
 
 
 
 
 
 
b248fb0
559b8c9
b248fb0
559b8c9
332de5f
559b8c9
 
 
 
b248fb0
559b8c9
b248fb0
559b8c9
 
 
 
 
 
419e7c7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
559b8c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6941b48
c962bfa
419e7c7
b248fb0
559b8c9
2b866da
b248fb0
 
 
 
 
 
 
 
 
 
 
 
 
559b8c9
b248fb0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
559b8c9
2b866da
559b8c9
 
 
 
419e7c7
559b8c9
 
b248fb0
 
 
559b8c9
b248fb0
 
 
 
 
 
559b8c9
 
 
b248fb0
 
 
 
de357f2
b248fb0
 
 
 
de357f2
b248fb0
 
 
 
 
559b8c9
 
 
 
 
b248fb0
 
559b8c9
b248fb0
559b8c9
b248fb0
 
 
 
559b8c9
b248fb0
 
 
 
 
 
 
a5a195e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b866da
a5a195e
 
419e7c7
a5a195e
 
 
 
 
 
 
 
 
 
 
 
419e7c7
 
332de5f
a5a195e
 
 
419e7c7
a5a195e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5084988
419e7c7
a5a195e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b866da
a5a195e
 
 
419e7c7
a5a195e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
419e7c7
a5a195e
419e7c7
a5a195e
 
 
 
 
 
 
2b866da
 
a5a195e
b248fb0
 
419e7c7
 
 
b248fb0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
419e7c7
 
 
b248fb0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
559b8c9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874

import os 
import re 
import json 
import subprocess 
import time 
import img2pdf 
import gradio as gr 
from google import genai  # NEW SDK
from markdown_pdf import MarkdownPdf, Section 
from pdf2image import convert_from_path 
from PIL import Image, ImageDraw, ImageFont 
import cv2 
import numpy as np 
from PyPDF2 import PdfReader, PdfWriter 

# ---------------- CONFIG ---------------- 
# Create client with new SDK
# Shared google-genai client used by the Gemini helpers in this module.
# The API key is read from the GEMINI_API_KEY environment variable;
# os.getenv yields None when that variable is not set.
client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
# Rows x columns of the page grid; these are the default `rows`/`cols`
# arguments of ask_gemini_for_mapping_batch.
GRID_ROWS, GRID_COLS = 20, 14 

# ---------------- PROMPTS ---------------- 
# Prompt templates keyed by purpose. Every entry is a dict holding a "role"
# and the prompt text under "content".
PROMPTS = {
    # Transcription prompt for a PDF containing the question paper followed by
    # the markscheme. The "==== GRAPH EXPECTED QUESTIONS ====" section it asks
    # for is the one extract_graph_questions_from_ms searches for.
    "QP_MS_TRANSCRIPTION": {
        "role": "system",
        "content": """You are a high-quality OCR/Transcription assistant. 
INPUT: This file is a PDF that first contains the Question Paper and immediately after it the Markscheme. 
TASK:  
1. Transcribe EXACTLY all the questions FIRST (with their total marks).   
2. After ALL questions, transcribe the Markscheme exactly, preserving M/A/R notation in brackets.   
3. Always number the questions sequentially (Question 1, Question 2, Question 3, …) **in the order they appear in the PDF**, even if the PDF shows a different number or leaves it blank. Do NOT skip or leave Question: blank. Never start a question other than question 1 (even if it is labelled in pdf as 8 name it 1).  
4. If a question or sub-question is labelled with a letter (e.g., "Q1.a", "Q2(b)", "1 (c)(i)"), transcribe it as "Question 1.a", "Question 2.b", "Question 1.c.i" etc., exactly preserving the hierarchy of sub-question identifiers.  
5. After the markscheme, DETECT and FLAG all questions in the markscheme where a graph/diagram is expected. For each, output the question number and the page number in the format below. 

FORMAT:   
==== PAPER TOTAL MARKS ====   
<total marks>   

==== QUESTIONS BEGIN ====   
Question 1.a  
Total Marks: <number>   
QP: <question text>   
--QUESTION-END--   

Question 1.b  
Total Marks: <number>   
QP: <question text>   
--QUESTION-END--   

Question 2
Total Marks: <number>   
QP: <question text>   
--QUESTION-END--   

(repeat for all questions in order of appearance)   

==== QUESTIONS END ====   

==== MARKSCHEME BEGIN ====   
Answer 1.a:   
<exact MS for Q1.a with notations M1, A1, R1 etc>   

Answer 1.b:   
<exact MS for Q1.b with notations>  

Answer 2 : 
<exact MS for Q2 with notations>  

(repeat for all answers)   

==== MARKSCHEME END ====   

==== GRAPH EXPECTED QUESTIONS ====  
Graph expected in:  
- Question <number> β†’ Page <number>  
(one per line)  
==== END GRAPH EXPECTED ====  
"""
    },
    
    # Grading prompt: mark abbreviations, grading rules, and the markdown
    # output layout (per-question tables followed by a summary report).
    "GRADING_PROMPT": {
        "role": "system",
        "content": """You are an official examiner. Apply the following grading rules precisely and consistently.

### Mark Abbreviations:
- **M**: Method marks – awarded for correct mathematical procedures, approaches, or techniques
- **A**: Accuracy/Answer marks – awarded for correct final or intermediate answers
- **R**: Reasoning marks – awarded for justifications, explanations, or logical deductions
- **AG**: Answer Given – the answer is provided in the question; award no marks for simply stating it
- **FT**: Follow Through – marks awarded when a student correctly applies a method using their own previous (incorrect) answer
- **MR**: Misread – penalty applied when student misreads a value from the question (deduct from first applicable A-mark only, once per question)

---

## Grading Rules

### Core Principles:
1. **Award marks using official annotations** (e.g., M1, A2, R1).
2. **Do not award full marks for answers alone** – check that the required method steps are present.
3. **A-marks typically depend on M-marks** – an A-mark usually requires the corresponding M-mark to be earned first (unless the markscheme explicitly states otherwise).
4. **Accept equivalent forms** unless the markscheme specifies exact form (e.g., "simplified form only").
5. **Apply Follow Through (FT)** when a student uses an incorrect answer correctly in subsequent steps.
6. **Misread (MR) Penalty**: If a student misreads a numerical value from the question:
   - Deduct from the **first applicable A-mark** in that question only
   - Apply MR penalty **once per question** (not per sub-question)
   - M-marks can still be awarded if the method is correct
   - Annotate as: `A0 (MR applied)`

### Formatting Lost Marks:
- **Lost marks must be highlighted in red**: `<span style="color:red">M0</span>`, `<span style="color:red">A0</span>`, etc.
- **In the table**: Use red styling for "Awarded" column when mark is lost
- **Do use red** for markscheme expectations or student responses themselves when mark is lost

### Graph/Diagram Questions:
- When graph/diagram images are provided, describe visual evidence in the "Examiner Notes" column
- Examples: "Correct parabola shape, y-intercept matches", "Line has wrong gradient", "Asymptote missing"

---

## Output Format

Produce the following structure for each question/sub-question:

### Question <1.a>

**Markscheme vs Student Answer**

| Mark ID | Markscheme Expectation | Student's Response | Awarded | Examiner Notes |
|---------|------------------------|-------------------|---------|----------------|
| M1      | Use product rule: $u'v + uv'$ | Student wrote: $u'v + uv'$ βœ“ | M1 | Correct method applied |
| A1      | Final answer: $2xe^x + e^x$ | Student answer: $2xe^x + e^x$ βœ“ | A1 | Correct, depends on M1 |

**Total: X/Y**

---

*(Repeat for all questions)*

---

### Examiner's Summary Report

**IMPORTANT**: Group all sub-questions under their parent question. Sum the marks for all sub-parts (e.g., 1.a, 1.b, 1.c) and report as a single entry for Question 1.

**Format Rules for Summary Report**:
- If a question has sub-parts (1.a, 1.b, etc.), group them as "Question 1" with combined marks
- If a question has no sub-parts (just "Question 2"), report it directly
- Assign ONE overall remark per grouped question based on the predominant error type across all sub-parts

| Question Number | Marks | Remark | Feedback |
|-----------------|-------|--------|----------|
| 1               | 10/12 | A      | Strong answer, only minor mistake |
| 2               | 5/8   | B      | Good attempt, missing some detail |
| 3               | 7/10  | C      | Adequate, but lacked depth/clarity |
| …               | …     | …      | … (continue for all answers)      |

...(repeat for all answers)   

**Example Explanation**:
- Question 1 has sub-parts 1.a (3/5), 1.b (5/7), 1.c (2/0) β†’ Total: (3+5+2)/(5+7+0) = 10/12
- Question 2 has sub-parts 2.a (2/3), 2.b (3/5) β†’ Total: (2+3)/(3+5) = 5/8
- Question 3 has no sub-parts β†’ Report as-is: 7/10

**Total: <obtained_marks>/<max_marks>**

---

## Remark Codes (assign ONE per grouped question):
- **A**: All Good – mostly full marks across sub-parts, no major errors
- **B**: Silly Mistake – minor arithmetic/algebraic slips (e.g., $2 + 3 = 6$, sign error in final step)
- **C**: Conceptual Error – wrong formula, incorrect method, fundamental misunderstanding in one or more sub-parts
- **D**: Hard Question – question is inherently difficult; partial credit reflects genuine attempt
- **E**: Not Applicable – question not attempted, or answer entirely illegible/missing

**Remark Selection for Grouped Questions**:
- If all sub-parts are correct β†’ **A**
- If majority are correct with 1-2 arithmetic errors β†’ **B**
- If one or more sub-parts show conceptual errors β†’ **C**
- If question is difficult and student made reasonable attempt β†’ **D**
- If all sub-parts are missing/illegible β†’ **E**

---

## Additional Instructions:
- You will receive:
  1. **QP+MS transcript** (authoritative source for question wording, total marks, and markscheme with M/A/R notation)
  2. **AS transcript** (student answers in LaTeX-formatted markdown)
  3. **Graph images** (if applicable) for questions involving diagrams
  
- Match student answers to question IDs from the QP+MS transcript.
- Grade according to the **verbatim markscheme**, but accept mathematically/conceptually equivalent answers (justify in "Examiner Notes").
- For graph questions, use provided images as visual context and describe what you observe.
- Ensure mark IDs in your grading table match those in the markscheme.
- Be consistent: if a student makes the same type of error multiple times, apply the same penalty logic each time.
"""
    }
}


# ---------------- HELPERS ---------------- 
def save_as_pdf(text, filename="output.pdf"):
    """Render markdown ``text`` into a one-section PDF and return its path.

    Args:
        text: Markdown source placed in a single ``Section`` (built with
            ``toc=False``).
        filename: Destination path handed to ``MarkdownPdf.save``.

    Returns:
        ``filename``, unchanged.
    """
    document = MarkdownPdf()
    body = Section(text, toc=False)
    document.add_section(body)
    document.save(filename)
    return filename

def compress_pdf(input_path, output_path=None, max_size=20*1024*1024):
    """Compress a PDF with Ghostscript when it is larger than ``max_size`` bytes.

    Args:
        input_path: Path of the PDF to inspect.
        output_path: Where the compressed copy is written. When ``None`` it
            is derived from ``input_path`` by inserting ``_compressed``
            before the extension.
        max_size: Size threshold in bytes.

    Returns:
        ``output_path`` when Ghostscript ran and its output is within
        ``max_size``; otherwise ``input_path`` (size unreadable, already small
        enough, compression failed, or the result is still too large).
    """
    if output_path is None:
        stem, suffix = os.path.splitext(input_path)
        output_path = f"{stem}_compressed{suffix}"

    try:
        size = os.path.getsize(input_path)
    except Exception:
        # The size cannot be read, so hand the path back untouched.
        return input_path

    if size <= max_size:
        print(f"ℹ️ Not compressing {input_path} ({size/1024/1024:.2f} MB <= {max_size/1024/1024} MB)")
        return input_path

    print(f"πŸ”Ž Compressing {input_path} ({size/1024/1024:.2f} MB) -> {output_path}")
    try:
        ghostscript_args = ["gs", "-sDEVICE=pdfwrite"]
        ghostscript_args.append("-dCompatibilityLevel=1.4")
        ghostscript_args.append("-dPDFSETTINGS=/ebook")
        ghostscript_args.extend(["-dNOPAUSE", "-dQUIET", "-dBATCH"])
        ghostscript_args.extend([f"-sOutputFile={output_path}", input_path])
        # check=True makes a non-zero Ghostscript exit land in the handler below.
        subprocess.run(ghostscript_args, check=True)

        new_size = os.path.getsize(output_path)
        print(f"βœ… Compression done. New size: {new_size/1024/1024:.2f} MB")
        if new_size > max_size:
            print("⚠️ Compressed file still larger than threshold; returning original")
            return input_path
        return output_path
    except Exception as e:
        print("❌ Compression error:", e)
        return input_path

def upload_to_gemini(path, display_name=None, timeout=None, poll_interval=2):
    """
    Upload a file to Gemini using the NEW google-genai SDK and wait for processing.

    Args:
        path: Local path of the file to upload.
        display_name: Optional display name for the uploaded file. It is
            forwarded through the upload ``config`` (it used to be accepted
            but never sent).
        timeout: Optional maximum number of seconds to wait while the file
            state is "PROCESSING". ``None`` (the default) waits indefinitely,
            which is the historical behaviour.
        poll_interval: Seconds to sleep between state polls.

    Returns:
        The file object returned by ``client.files`` once its state is no
        longer "PROCESSING".

    Raises:
        TimeoutError: ``timeout`` elapsed while the file was still processing.
        RuntimeError: The file ended in the "FAILED" state.
        Exception: Anything raised by the SDK; it is logged and re-raised.
    """
    print(f"πŸ“€ Uploading {path} to Gemini...")
    try:
        if display_name:
            uploaded_file = client.files.upload(
                file=path,
                config={"display_name": display_name},
            )
        else:
            uploaded_file = client.files.upload(file=path)

        # Wait for processing to complete
        print(f"⏳ Waiting for file processing: {uploaded_file.name}")
        deadline = None if timeout is None else time.monotonic() + timeout
        while uploaded_file.state.name == "PROCESSING":
            if deadline is not None and time.monotonic() >= deadline:
                raise TimeoutError(
                    f"File still processing after {timeout} seconds: {uploaded_file.name}"
                )
            time.sleep(poll_interval)
            uploaded_file = client.files.get(name=uploaded_file.name)

        if uploaded_file.state.name == "FAILED":
            raise RuntimeError(f"File processing failed: {uploaded_file.name}")

        print(f"βœ… Uploaded and processed: {uploaded_file.name}")
        return uploaded_file
    except Exception as e:
        print(f"❌ Upload failed for {path}: {e}")
        raise

def merge_pdfs(paths, output_path):
    """Concatenate the pages of several PDFs into one file.

    Args:
        paths: Iterable of PDF paths, read in the given order.
        output_path: Destination path of the merged PDF.

    Returns:
        ``output_path``.
    """
    merged = PdfWriter()
    for source_path in paths:
        source = PdfReader(source_path)
        # Pages are appended in document order, one source after another.
        for source_page in source.pages:
            merged.add_page(source_page)

    with open(output_path, "wb") as destination:
        merged.write(destination)
    return output_path

def gemini_generate_content(prompt_text, file_upload_obj=None, image_obj=None, model_name="gemini-2.5-pro"):
    """
    Send prompt_text and optionally an uploaded file and/or image(s) to the model using NEW SDK.

    Args:
        prompt_text: Prompt string; always the first element of the request.
        file_upload_obj: Optional uploaded-file object appended after the prompt.
        image_obj: Optional image, or list of images. String items are
            treated as paths and opened with ``Image.open``; any other item is
            passed through unchanged.
        model_name: Model tried first.

    Returns:
        The text of the model response.

    Raises:
        Exception: When both ``model_name`` and the "gemini-2.5-flash"
            fallback fail, the fallback's exception is re-raised. A response
            whose ``text`` is ``None`` counts as a failure.
    """
    fallback_model = "gemini-2.5-flash"
    contents = [prompt_text]

    if file_upload_obj:
        contents.append(file_upload_obj)

    def _generate(model):
        # One request/response round trip. A missing text payload is raised
        # as an explicit error instead of failing later inside len().
        response = client.models.generate_content(
            model=model,
            contents=contents
        )
        raw_text = response.text
        if raw_text is None:
            raise ValueError(f"Model {model} returned a response without text")
        print("πŸ“₯ Received response (chars):", len(raw_text))
        return raw_text

    # Only images opened here are closed here; caller-supplied objects are
    # left for the caller to manage.
    opened_images = []
    try:
        if image_obj:
            image_items = image_obj if isinstance(image_obj, list) else [image_obj]
            for item in image_items:
                if isinstance(item, str):
                    item = Image.open(item)
                    opened_images.append(item)
                contents.append(item)

        print("πŸ“‘ Sending request to Gemini (prompt length:", len(prompt_text), "chars )")

        try:
            return _generate(model_name)
        except Exception as e:
            print(f"❌ Generation failed: {e}")
            # Try fallback model
            print("⚑ Trying fallback model: gemini-2.5-flash")
            try:
                return _generate(fallback_model)
            except Exception as e2:
                print(f"❌ Fallback also failed: {e2}")
                raise
    finally:
        for opened in opened_images:
            opened.close()

# ---------------- PARSERS ---------------- 
def extract_question_ids_from_qpms(text: str):
    """Return the question IDs found in a QP+MS transcript, in order of appearance.

    Lines of the form "Question <id>" / "Question: <id>" are preferred. Only
    when none exist does the function fall back to lines that start with a
    number (e.g. "1.", "2)", "3(a)"). Non-breaking spaces and tabs are
    treated as plain spaces. Returns a list of ID strings, possibly empty.
    """
    print("πŸ”Ž Extracting question IDs from QP+MS transcript using regex...")

    # Same effect as replacing NBSP and TAB with a space, done in one pass.
    normalized = text.translate({0x00A0: " ", 0x0009: " "})

    explicit_ids = re.findall(
        r"^\s*Question\s*[:\s]\s*([\dA-Za-z.()]+)",
        normalized,
        flags=re.MULTILINE,
    )
    if explicit_ids:
        print(f"βœ… Extracted {len(explicit_ids)} question IDs from explicit 'Question X' lines.")
        print("IDs:", explicit_ids)
        return explicit_ids

    numbered_ids = re.findall(
        r"^\s*(\d+(?:[.)]|\([a-zA-Z0-9]+\))?[a-zA-Z0-9]*)",
        normalized,
        flags=re.MULTILINE,
    )
    if not numbered_ids:
        print("⚠️ No question IDs extracted; will send NA placeholder.")
    else:
        print(f"βœ… Extracted {len(numbered_ids)} question IDs (fallback numbered lists).")
        print("IDs:", numbered_ids)
    return numbered_ids

# def build_as_cot_prompt_with_expected_ids(expected_ids, qpms_text=None):
#     """
#     Construct the AS transcription prompt injecting the expected IDs block and graph detection instructions,
#     modifying it to include a Chain-of-Thought (CoT) section using a <think> tag, and
#     requiring mathematical expressions to be enclosed in LaTeX dollar delimiters ($...$).
#     """
#     if not expected_ids:
#         ids_block = "{NA}"
#     else:
#         ids_block = "{\n" + "\n".join(expected_ids) + "\n}"

#     qpms_guidance = ""
#     if qpms_text:
#         qpms_guidance = (
#             "\nYou are also provided with the full transcript of the Question Paper and Markscheme (QP+MS). "
#             "Use this transcript primarily to resolve **ambiguous handwriting** (e.g., if a number could be '$-1.6$' or '$1.6$'). "
#             "If you are confident in your transcription without referring to the QP+MS, use your judgment. "
#             "**Always prioritize accuracy and context from the QP+MS transcript when in doubt about a specific ambiguous character or expression.**\n"
#         )

#     prompt = f"""You are a high-quality handwritten transcription assistant, performing transcription with a Chain-of-Thought process.

# INPUT: This PDF contains a student's handwritten answer sheet.
# {qpms_guidance}
# TASK:
# 1. **THINKING:** Before transcribing each answer, you must document your thought process using the **<think>** tag.
#     - Identify the question ID. If inferred, note why.
#     - Detail any ambiguities encountered (e.g., unclear numbers, symbols, or structure).
#     - Explain how you resolved ambiguities, specifically if you referred to the QP+MS transcript.
#     - If you *did* refer to the QP+MS but decided to keep your original transcription, state this clearly.
#     - If you initially label an answer as 2.a but later realize it aligns better with 2.b based on the marking scheme, you should reassign it to 2.b and briefly explain your reasoning in the <think> tag to maintain clarity and consistency.

#     *Example Thinking:*
#     <think>
#     - Found Question 3(a).
#     - Noticed '2x' was written ambiguously; it could be '2x' or '21x'.
#     - Referred to QP+MS: The expected answer involves '$21x$'.
#     - Re-examined the handwriting carefully: The student's handwriting strongly appears to be '$2x$' and not '$21x$'.
#     - DECISION: Transcribe exactly what the student wrote: '$2x$'.
#     </think>
#     *Example Thinking 2 (Ambiguity Resolved by MS):*
#     <think>
#     - Found Question INFERRED: 1(b) based on proximity to 1(a).
#     - Noticed the final answer looked like '3.6', but the decimal point was very faint and could be '36'.
#     - Referred to QP+MS: Expected answer is '$3.8$'. Re-examined the student's writing: it appears to be a poorly written '$3.8$' which I initially misread as '$3.6$'.
#     - DECISION: Corrected my transcription to '$3.8$' based on re-evaluation and MS context.
#     </think>

# 2. **TRANSCRIPTION:** Transcribe the student's answers with accordance to the markcheme provided. Preserve step order and line breaks.
#     - Attempt to assign each answer to a question ID if the student has labelled it (e.g., "1", "1a", "2(b)", "3").
#     - If the student hasn't labelled answers, segment contiguous answer blocks and attempt to infer question IDs from context β€” but mark inferred IDs clearly as "**INFERRED: <id>**".
#     - **Enclose all mathematical expressions and single variables in LaTeX dollar delimiters ($...$).**
#         - *Example:* "The area is $A = \pi r^2$ so $3x+5 = 11$ thus $x=2$."
#     - If a diagram/graph is omitted, write **[Graph omitted]**.
#     - Unreadable parts: **[illegible]**.
#     - Unanswered: **[No response]**.
#     - Do NOT recreate diagrams.

# Ensure consistency and determinism in formatting so subsequent models can grade directly from this aligned format.

# Expected questions (if missing, write NA):
# {ids_block}
# -----------------------
# OUTPUT FORMAT:
# <think>...</think>
# Question <id>
# AS:<transcribed answer or placeholder>
# <think>...</think>
# Question <id>
# AS:<transcribed answer or placeholder>
# ...

# ==== GRAPH FOUND ANSWERS ====
# Graph found in:
# - Answer <number> β†’ Page <number>
# (one per line)
# ==== END GRAPH FOUND ===="""

#     return prompt

def build_as_cot_prompt_with_expected_ids(expected_ids, qpms_text=None):
    """
    Construct the AS (answer sheet) transcription prompt.

    The prompt asks for a Chain-of-Thought section in a <think> tag before
    each answer, for mathematical expressions in LaTeX dollar delimiters
    ($...$), and for a trailing "GRAPH FOUND ANSWERS" section.

    Args:
        expected_ids: Iterable of expected question IDs. Each is converted
            with ``str`` and listed one per line inside braces; an empty or
            ``None`` value produces the placeholder "{NA}".
        qpms_text: Optional QP+MS transcript. When not ``None`` it is embedded
            in full between BEGIN/END markers, with only leading/trailing
            whitespace stripped.

    Returns:
        The complete prompt string.
    """
    if not expected_ids:
        ids_block = "{NA}"
    else:
        # str() lets callers pass numeric IDs without breaking the join.
        ids_block = "{\n" + "\n".join(str(q_id) for q_id in expected_ids) + "\n}"

    qpms_section = ""
    if qpms_text is not None:
        # Include the full QP+MS transcript exactly (strip only leading/trailing whitespace)
        qpms_section = (
            "\nYou are also provided with the full transcript of the Question Paper and Markscheme (QP+MS) below."
            "\nUse it primarily to resolve ambiguous handwriting and to confirm expected answers when needed."
            "\n--- BEGIN QP+MS TRANSCRIPT ---\n"
            f"{qpms_text.strip()}\n"
            "--- END QP+MS TRANSCRIPT ---\n"
        )

    # The backslash before "pi" in the LaTeX example below is doubled: a lone
    # backslash-p is an invalid escape sequence in a non-raw string literal.
    # The emitted text still contains a single backslash.
    prompt = f"""You are a high-quality handwritten transcription assistant, performing transcription with a Chain-of-Thought process.

INPUT: This PDF contains a student's handwritten answer sheet.
{qpms_section}
TASK:
1. **THINKING:** Before transcribing each answer, you must document your thought process using the **<think>** tag.
    - Identify the question ID. If inferred, note why.
    - Detail any ambiguities encountered (e.g., unclear numbers, symbols, or structure).
    - Explain how you resolved ambiguities, specifically if you referred to the QP+MS transcript.
    - If you *did* refer to QP+MS but decided to keep your original transcription, state this clearly.
    - If you initially label an answer as 2.a but later realize it aligns better with 2.b based on the marking scheme, reassign it to 2.b and briefly explain your reasoning in the <think> tag.

    *Example Thinking:*
    <think>
    - Found Question 3(a).
    - Noticed '2x' was written ambiguously; it could be '2x' or '21x'.
    - Referred to QP+MS: The expected answer involves '$21x$'.
    - Re-examined the handwriting carefully: The student's handwriting strongly appears to be '$2x$' and not '$21x$'.
    - DECISION: Transcribe exactly what the student wrote: '$2x$'.
    </think>

2. **TRANSCRIPTION:** Transcribe the student's answers in accordance with the markscheme provided. Preserve step order and line breaks.
    - Attempt to assign each answer to a question ID if the student has labelled it (e.g., "1", "1a", "2(b)", "3").
    - If the student hasn't labelled answers, segment contiguous answer blocks and attempt to infer question IDs from context β€” mark inferred IDs clearly as "**INFERRED: <id>**".
    - **Enclose all mathematical expressions and single variables in LaTeX dollar delimiters ($...$).**
        - Example: "The area is $A = \\pi r^2$ so $3x+5 = 11$ thus $x=2$."
    - If a diagram/graph is omitted, write **[Graph omitted]**.
    - Unreadable parts: **[illegible]**.
    - Unanswered: **[No response]**.
    - Do NOT recreate diagrams.

Ensure consistency and determinism in formatting so subsequent models can grade directly from this aligned format.

Expected questions (if missing, write NA):
{ids_block}
-----------------------
OUTPUT FORMAT:
<think>...</think>
Question <id>
AS:<transcribed answer or placeholder>
<think>...</think>
Question <id>
AS:<transcribed answer or placeholder>
...

==== GRAPH FOUND ANSWERS ====
Graph found in:
- Answer <number> β†’ Page <number>
(one per line)
==== END GRAPH FOUND ===="""

    return prompt


def extract_graph_questions_from_ms(text: str):
    """Extract graph questions and page numbers from MS transcript.

    Looks for the section delimited by "==== GRAPH EXPECTED QUESTIONS ===="
    and "==== END GRAPH EXPECTED ====" and reads entries of the form
    "- Question <id> <arrow> Page <n>".

    The arrow may be the character sequence used in the existing entry
    format, a real right-arrow character, or the ASCII forms "->" / "=>".
    The leading bullet ("-" or "*") is optional. Previously only the first
    arrow spelling with a "- " bullet was recognised, so other spellings
    were silently dropped.

    Args:
        text: Transcript text; non-breaking spaces and tabs are treated as
            plain spaces.

    Returns:
        Dict mapping question ID (str) to page number (int). Empty when the
        section is absent or has no parsable entries. A repeated ID keeps the
        last page seen.
    """
    clean_text = text.replace("\u00A0", " ").replace("\t", " ")
    match = re.search(r"==== GRAPH EXPECTED QUESTIONS ====\s*(.*?)\s*==== END GRAPH EXPECTED ====",
                      clean_text, re.S)
    graph_dict = {}
    if not match:
        return graph_dict

    # \u2192 is resolved by the re module to the right-arrow character.
    entry_pattern = re.compile(
        r"(?:[-*]\s*)?Question\s+([\dA-Za-z.()]+)\s*(?:β†’|\u2192|->|=>)\s*Page\s*(\d+)"
    )
    for line in match.group(1).splitlines():
        entry = entry_pattern.match(line.strip())
        if entry:
            q_id, page = entry.groups()
            graph_dict[q_id] = int(page)
    return graph_dict

def extract_graph_answers_from_as(text: str):
    """Extract graph answers and page numbers from AS transcript.

    Looks for the section delimited by "==== GRAPH FOUND ANSWERS ====" and
    "==== END GRAPH FOUND ====" and reads entries of the form
    "- Answer <id> <arrow> Page <n>".

    The arrow may be the character sequence used in the existing entry
    format, a real right-arrow character, or the ASCII forms "->" / "=>".
    The leading bullet ("-" or "*") is optional. Previously only the first
    arrow spelling with a "- " bullet was recognised, so other spellings
    were silently dropped.

    Args:
        text: Transcript text; non-breaking spaces and tabs are treated as
            plain spaces.

    Returns:
        Dict mapping answer ID (str) to page number (int). Empty when the
        section is absent or has no parsable entries. A repeated ID keeps the
        last page seen.
    """
    clean_text = text.replace("\u00A0", " ").replace("\t", " ")
    block = re.search(r"==== GRAPH FOUND ANSWERS ====\s*(.*?)\s*==== END GRAPH FOUND ====",
                      clean_text, re.S)
    graph_dict = {}
    if not block:
        return graph_dict

    # \u2192 is resolved by the re module to the right-arrow character.
    entry_pattern = re.compile(
        r"(?:[-*]\s*)?Answer\s+([\dA-Za-z.()]+)\s*(?:β†’|\u2192|->|=>)\s*Page\s*(\d+)"
    )
    for line in block.group(1).splitlines():
        entry = entry_pattern.match(line.strip())
        if entry:
            ans_id, page = entry.groups()
            graph_dict[ans_id] = int(page)
    return graph_dict

def extract_marks_from_grading(grading_text):
    """
    Parse the grading markdown and extract marks per question.

    The text is split on "## Question " headings. For every resulting block
    the question ID is read from the block's first line, and every mark token
    (M/A/R followed by digits) anywhere in the block is collected in order.

    NOTE(review): the scan covers the whole block, so a mark that appears in
    several table columns or in the notes is collected once per occurrence.
    Confirm that this is what the consumers expect.

    Returns:
        ``{"grading": [{"question": <id>, "marks_awarded": [<token>, ...]}, ...]}``
    """
    print("πŸ”Ž Extracting awarded marks from grading output...")
    entries = []

    # Element 0 is whatever precedes the first heading, so it is skipped.
    sections = re.split(r"##\s*Question\s+", grading_text)
    for section in sections[1:]:
        section_lines = section.strip().splitlines()
        heading = section_lines[0].strip() if section_lines else ""

        id_match = re.match(r"([0-9]+(?:[a-zA-Z]|\([^)]+\)|(?:\.[a-zA-Z0-9]+))*)", heading)
        if id_match:
            question_id = id_match.group(1).strip()
        elif heading:
            # No leading number: use the first whitespace-separated token.
            question_id = heading.split()[0]
        else:
            question_id = ""

        marks = re.findall(r"\b(M\d+|A\d+|R\d+|M0|A0|R0)\b", section)
        entries.append({"question": question_id, "marks_awarded": marks})

    grading_json = {"grading": entries}
    print("βœ… Extracted grading marks for", len(grading_json["grading"]), "question blocks.")
    print(json.dumps(grading_json, indent=2))
    return grading_json

# ---------------- MAPPING/IMPRINT HELPERS ---------------- 
def ask_gemini_for_mapping_batch(image_paths, grading_json, expected_ids=None, rows=GRID_ROWS, cols=GRID_COLS):
    """
    Send multiple page images together to Gemini for batch mapping processing.

    Args:
        image_paths: paths of the grid-numbered page images to send.
        grading_json: grading structure embedded verbatim in the prompt.
        expected_ids: optional iterable of question IDs listed in the prompt as
            a hint; "{NA}" is sent when it is empty or None.
        rows, cols: grid dimensions quoted in the prompt.

    Returns:
        A list of dict entries parsed from the first-to-last "[" ... "]" span
        of the model output (the prompt asks for "page", "question" and
        "cell_number" keys). Returns [] when no JSON array can be found or
        parsed. Non-dict array elements are dropped.

    Raises:
        Whatever the fallback model call raises if both model calls fail, and
        whatever Image.open raises for an unreadable image path.
    """
    ids_block = "{NA}"
    if expected_ids:
        ids_block = "{\n" + "\n".join(str(i) for i in expected_ids) + "\n}"

    prompt = f"""You are an exam marker. Your role is to identify where each question begins on each page.
The pages are divided into a {rows} x {cols} grid. Each cell has a RUNNING NUMBER label.
For each question in the grading JSON, return the cell NUMBER where the FIRST STEP of that question begins.
⚠ IMPORTANT RULES:
- Do not place marks inside another question's answer area.
- Prefer placing the marks in a BLANK cell immediately to the RIGHT of the answer step. If no blank cell is available to the right, then place in a blank cell to the LEFT.
- Never place marks above or below the answer.
- Each question should have unique cell number
- If a question serial number is visible in the answer image, you must mandatorily identify the corresponding question using the grading JSON.
IMPORTANT: For your help i have provided u questions that u can expect in the images:
{ids_block}
Return JSON only, like:
[{{"page": 1, "question": "1(a)", "cell_number": 15}}, ...]
Grading JSON:
{json.dumps(grading_json, indent=2)}"""

    images = []
    try:
        # Open inside the try so that every handle opened so far is released
        # even if a later open or the request itself fails.
        for path in image_paths:
            images.append(Image.open(path))

        print(f"πŸ“‘ Sending batch mapping request for {len(image_paths)} pages to Gemini...")
        contents = [prompt] + images

        try:
            response = client.models.generate_content(
                model="gemini-2.0-flash-exp",
                contents=contents
            )
            raw_text = response.text
        except Exception as exc:
            # Only ordinary errors trigger the fallback; KeyboardInterrupt and
            # SystemExit propagate instead of being swallowed.
            print("⚠️ Trying fallback model for mapping...")
            print(f"   (primary mapping model failed: {exc})")
            response = client.models.generate_content(
                model="gemini-1.5-flash",
                contents=contents
            )
            raw_text = response.text
    finally:
        for image in images:
            image.close()

    # Guard against a response without text so the length/regex steps below
    # degrade to "no mapping" instead of raising.
    raw_text = raw_text or ""

    print("πŸ“₯ Batch mapping response (chars):", len(raw_text))
    print("πŸ”Ž Gemini raw batch output:")
    print(raw_text)

    # Greedy match: take everything from the first "[" to the last "]" so that
    # surrounding prose or code fences around the JSON are ignored.
    match = re.search(r'(\[.*\])', raw_text, re.DOTALL)
    if not match:
        print("❌ Failed to find JSON array in response")
        return []

    try:
        mapping = json.loads(match.group(1))
    except ValueError as e:
        print(f"❌ Failed to parse Gemini JSON mapping: {e}")
        return []

    if not isinstance(mapping, list):
        print("❌ Failed to find JSON array in response")
        return []

    # Callers use .get() on each entry, so keep only dict entries.
    entries = [entry for entry in mapping if isinstance(entry, dict)]
    print(f"βœ… Parsed Gemini batch mapping for {len(image_paths)} pages")
    return entries

def _load_grid_font(size=20):
    """Return the font used for grid labels, falling back to PIL's default font."""
    try:
        return ImageFont.truetype("arial.ttf", size)
    except Exception:
        return ImageFont.load_default()


def _make_grid_image(page, rows, cols, font):
    """Return an RGB copy of *page* with a running number drawn at each cell centre."""
    img = page.convert("RGB")
    w, h = img.size
    cell_w, cell_h = w / cols, h / rows
    draw = ImageDraw.Draw(img)

    cell_num = 1
    for r in range(rows):
        for c in range(cols):
            x = int(c * cell_w + cell_w / 2)
            y = int(r * cell_h + cell_h / 2)
            text = str(cell_num)
            bbox = draw.textbbox((0, 0), text, font=font)
            tw = bbox[2] - bbox[0]
            th = bbox[3] - bbox[1]
            draw.text((x - tw/2, y - th/2), text, fill="black", font=font)
            cell_num += 1
    return img


def _resolve_page_number(raw_page, batch_start, batch_len):
    """Turn a model-reported page value into an absolute 1-based page number, or None."""
    try:
        page = int(raw_page)
    except (TypeError, ValueError):
        return None
    if 1 <= page <= batch_len:
        # Index within the batch of images that was sent.
        return batch_start + page
    if batch_start < page <= batch_start + batch_len:
        # Already an absolute page number belonging to this batch.
        return page
    return None


def _coerce_cell_number(raw_cell, total_cells):
    """Return *raw_cell* as an int within 1..total_cells, or None if it is unusable."""
    try:
        cell = int(raw_cell)
    except (TypeError, ValueError):
        return None
    return cell if 1 <= cell <= total_cells else None


def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, expected_ids=None, rows=GRID_ROWS, cols=GRID_COLS):
    """
    Convert PDF to images, create grid-numbered images for batch sending to Gemini,
    then annotate and produce imprinted PDF.

    Steps:
      1. Render every page of ``pdf_path`` at 200 dpi and save a copy with a
         numbered ``rows`` x ``cols`` grid as ``page_<n>_grid.png``.
      2. Send the grid images to ``ask_gemini_for_mapping_batch`` in batches
         of 10 and collect the returned (page, question, cell_number) entries.
      3. Draw each question's awarded marks (comma-joined, or "?" when none
         are known) in red near the right edge of the mapped cell and save
         each page as ``annotated_page_<n>.png``.
      4. Merge the annotated pages into ``output_pdf`` and pass it through
         ``compress_pdf``.

    NOTE(review): each batch is sent without absolute page numbers, so a
    "page" value between 1 and the batch size is treated as an index within
    that batch and offset accordingly; a value already inside the batch's
    absolute range is used as-is. For the first batch both readings agree.
    Confirm against real model output for PDFs longer than one batch.

    Entries whose page or cell number cannot be interpreted, or whose cell is
    outside the grid, are skipped instead of raising.

    Returns:
        The value returned by ``compress_pdf(output_pdf)``.
    """
    print("πŸ“„ Converting answer PDF to images for imprinting...")
    pages = convert_from_path(pdf_path, dpi=200)
    annotated_page_paths = []
    temp_grid_images = []
    total_cells = rows * cols

    # The label font does not depend on the page, so load it once.
    num_font = _load_grid_font()

    for p_index, page in enumerate(pages):
        temp_path = f"page_{p_index+1}_grid.png"
        _make_grid_image(page, rows, cols, num_font).save(temp_path, "PNG")
        temp_grid_images.append(temp_path)
        print("πŸ›° Created grid image:", temp_path)

    print("πŸ“‘ Sending page images to Gemini in batches for mapping...")
    batch_size = 10
    mappings_by_page = {}

    for start in range(0, len(temp_grid_images), batch_size):
        batch_paths = temp_grid_images[start:start+batch_size]
        batch_mapping = ask_gemini_for_mapping_batch(batch_paths, grading_json, expected_ids, rows, cols)
        for item in batch_mapping or []:
            if not isinstance(item, dict):
                continue
            resolved = _resolve_page_number(item.get("page"), start, len(batch_paths))
            if resolved is None:
                continue
            mappings_by_page.setdefault(resolved, []).append(item)
        print(f"βœ… Processed batch {start//batch_size + 1}: pages {start+1}-{start+len(batch_paths)}")

    # Build the marks lookups once: exact question ID first, then a
    # case-insensitive fallback. The first entry for an ID wins in both.
    marks_exact = {}
    marks_folded = {}
    for entry in grading_json.get("grading", []):
        question = str(entry.get("question", ""))
        awarded = entry.get("marks_awarded", [])
        marks_exact.setdefault(question, awarded)
        marks_folded.setdefault(question.lower(), awarded)

    print("πŸ–Š Annotating pages with marks...")
    for p_index, page in enumerate(pages):
        page_num = p_index + 1
        img_cv = cv2.cvtColor(np.array(page.convert("RGB")), cv2.COLOR_RGB2BGR)
        h, w, _ = img_cv.shape
        cell_w_px, cell_h_px = w / cols, h / rows

        # Text size scales with the cell height, clamped to [1.0, 2.0].
        font_scale = max(1.0, min(2.0, cell_h_px / 40.0))
        thickness = max(2, int(font_scale * 2))

        for item in mappings_by_page.get(page_num, []):
            qid = item.get("question")
            raw_cell = item.get("cell_number")
            if qid is None or raw_cell is None:
                continue

            cell_number = _coerce_cell_number(raw_cell, total_cells)
            if cell_number is None:
                print(f"⚠️ Skipping question {qid} on page {page_num}: invalid cell number {raw_cell!r}")
                continue

            qid_text = str(qid)
            marks_list = marks_exact.get(qid_text) or marks_folded.get(qid_text.lower()) or []
            marks_text = ",".join(marks_list) if marks_list else "?"

            row = (cell_number - 1) // cols
            col = (cell_number - 1) % cols

            # Anchor the text in the right-hand quarter of the cell, vertically centred.
            x_c = int((col + 1) * cell_w_px - cell_w_px / 4)
            y_c = int((row + 0.5) * cell_h_px)

            cv2.putText(img_cv, marks_text, (x_c, y_c), cv2.FONT_HERSHEY_SIMPLEX,
                        font_scale, (0, 0, 255), thickness, cv2.LINE_AA)
            print(f"πŸ–Š Marks annotated for page {page_num}, question {qid}: {marks_text}")

        annotated_path = f"annotated_page_{page_num}.png"
        cv2.imwrite(annotated_path, img_cv)
        annotated_page_paths.append(annotated_path)
        print("βœ… Annotated page saved:", annotated_path)

    print("πŸ“‘ Merging annotated pages into final PDF...")
    with open(output_pdf, "wb") as f:
        f.write(img2pdf.convert(annotated_page_paths))

    compressed = compress_pdf(output_pdf)
    print("πŸ“‘ Imprinted PDF saved to:", compressed)
    return compressed

def extract_pdf_pages_as_images(pdf_path, page_numbers, prefix):
    """
    Extracts unique pages (1-based) from a PDF as images, saves as PNG, returns list of file paths.

    Args:
        pdf_path: path of the PDF to render.
        page_numbers: iterable of 1-based page numbers; duplicates are
            collapsed and values below 1 are ignored.
        prefix: file-name prefix; each page is saved as
            ``<prefix>_page_<n>.png`` in the working directory.

    Returns:
        Paths of the PNG files written, in ascending page order. Empty when
        no valid page number was supplied.

    Each page is rendered on its own at 200 dpi, so widely separated page
    numbers do not force every page in between to be rendered. A page for
    which the renderer returns nothing (presumably a number past the end of
    the document) is skipped with a warning instead of raising.
    """
    unique_pages = sorted({p for p in page_numbers if p >= 1})
    out_paths = []
    for page_num in unique_pages:
        rendered = convert_from_path(pdf_path, dpi=200, first_page=page_num, last_page=page_num)
        if not rendered:
            print(f"⚠️ Skipping graph page {page_num}: not found in {pdf_path}")
            continue
        out_path = f"{prefix}_page_{page_num}.png"
        rendered[0].save(out_path, "PNG")
        print(f"πŸ“€ Extracted graph page {page_num} from {pdf_path} as {out_path}")
        out_paths.append(out_path)
    return out_paths

# ---------------- PIPELINE ----------------
def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
    """
    Run the complete grading flow for one question paper, markscheme and answer sheet.

    Sequence: compress the three PDFs, merge QP+MS, upload the merged file and
    the answer sheet, transcribe both (each prompt also asks for a list of
    graph pages), render any reported graph pages to PNG, ask the model to
    grade the two transcripts (with the graph images attached when present),
    save the grading as a PDF, extract the awarded marks, and optionally
    imprint the marks onto the answer sheet.

    Debug artefacts are written to the working directory:
    debug_qpms_transcript.txt, debug_as_transcript.txt, debug_grading.md and
    debug_grading_json.json.

    Returns:
        (qpms_text, as_text, grading_text, grading_pdf_path, imprinted_pdf_path);
        imprinted_pdf_path is None unless ``imprint`` is true. If any step
        raises, the traceback is printed and the result is
        ("❌ Error: <message>", None, None, None, None).
    """
    def _dump_text(filename, content):
        # Persist an intermediate result as UTF-8 text for later inspection.
        with open(filename, "w", encoding="utf-8") as handle:
            handle.write(content)

    try:
        print("πŸ” Starting pipeline...")
        qp_path = compress_pdf(qp_path)
        ms_path = compress_pdf(ms_path)
        ans_path = compress_pdf(ans_path)

        qp_root = os.path.splitext(qp_path)[0]
        merged_qpms_path = qp_root + "_merged_qp_ms.pdf"
        merge_pdfs([qp_path, ms_path], merged_qpms_path)
        print("πŸ“Ž Merged QP + MS ->", merged_qpms_path)

        print("πŸ”Ό Uploading files to Gemini...")
        merged_uploaded = upload_to_gemini(merged_qpms_path)
        ans_uploaded = upload_to_gemini(ans_path)
        print("βœ… Upload complete.")

        # --- Step 1.i: question paper + markscheme transcription ---
        print("1.i) Transcribing QP+MS (questions first, then full markscheme, with graph detection)...")
        qpms_graph_suffix = (
            "\nAt the end, also list all questions in the markscheme where a graph is expected, in the format:"
            "\nGraph expected in:"
            "\n- Question <number> β†’ Page <number>"
            "\n(One per line, after ==== MARKSCHEME END ====)"
        )
        qpms_prompt = PROMPTS["QP_MS_TRANSCRIPTION"]["content"] + qpms_graph_suffix
        qpms_text = gemini_generate_content(qpms_prompt, file_upload_obj=merged_uploaded)
        print("πŸ“„ QP+MS transcription received. Saving debug file: debug_qpms_transcript.txt")
        _dump_text("debug_qpms_transcript.txt", qpms_text)

        ms_graph_mapping = extract_graph_questions_from_ms(qpms_text)
        print("πŸ–ΌοΈ Graph-expected questions in MS:", ms_graph_mapping)
        ms_graph_pages = list(ms_graph_mapping.values())
        if ms_graph_pages:
            ms_graph_images = extract_pdf_pages_as_images(merged_qpms_path, ms_graph_pages, prefix="qpms_graph")
        else:
            ms_graph_images = []

        extracted_ids = extract_question_ids_from_qpms(qpms_text) or ["NA"]

        # --- Step 1.ii: answer sheet transcription ---
        print("1.ii) Building AS transcription prompt with expected question IDs and graph detection, sending to Gemini...")
        as_graph_suffix = (
            "\nAt the end, also list all answers where a graph is found, in the format:"
            "\nGraph found in:"
            "\n- Answer <number> β†’ Page <number>"
            "\n(One per line, after all answers)"
        )
        as_prompt = build_as_cot_prompt_with_expected_ids(extracted_ids, qpms_text) + as_graph_suffix
        as_text = gemini_generate_content(as_prompt, file_upload_obj=ans_uploaded)
        print("πŸ“ AS transcription received. Saving debug file: debug_as_transcript.txt")
        _dump_text("debug_as_transcript.txt", as_text)

        as_graph_mapping = extract_graph_answers_from_as(as_text)
        print("πŸ–ΌοΈ Graph-attempted answers in AS:", as_graph_mapping)
        as_graph_pages = list(as_graph_mapping.values())
        if as_graph_pages:
            as_graph_images = extract_pdf_pages_as_images(ans_path, as_graph_pages, prefix="as_graph")
        else:
            as_graph_images = []

        # --- Step 2: grading ---
        print("2) Preparing grading input and sending to Gemini for grading...")
        grading_input = "".join([
            "=== QP+MS TRANSCRIPT BEGIN ===\n",
            qpms_text,
            "\n=== QP+MS TRANSCRIPT END ===\n\n",
            "=== ANSWER SHEET TRANSCRIPT BEGIN ===\n",
            as_text,
            "\n=== ANSWER SHEET TRANSCRIPT END ===\n",
        ])
        grading_images = ms_graph_images + as_graph_images
        if grading_images:
            # Tell the model that the attached images are the graph pages.
            grading_input += "\n\n---\nSome questions require graphs. I've attached the relevant graph pages from QP+MS and from the Answer Sheet. Use them as visual context when grading.\n---\n"
        grading_prompt_system = PROMPTS["GRADING_PROMPT"]["content"]
        grading_text = gemini_generate_content(
            grading_prompt_system + "\n\nPlease grade the following transcripts:\n" + grading_input,
            image_obj=grading_images or None,
        )
        print("🧾 Grading output received. Saving debug file: debug_grading.md")
        _dump_text("debug_grading.md", grading_text)

        base_name = os.path.splitext(os.path.basename(ans_path))[0]
        grading_pdf_path = save_as_pdf(grading_text, f"{base_name}_graded.pdf")
        print("πŸ“„ Grading PDF saved:", grading_pdf_path)

        grading_json = extract_marks_from_grading(grading_text)
        with open("debug_grading_json.json", "w", encoding="utf-8") as json_handle:
            json.dump(grading_json, json_handle, indent=2, ensure_ascii=False)
        print("πŸ”§ Grading marks extraction complete.")

        # --- Step 3 (optional): imprint marks on the answer sheet ---
        imprinted_pdf_path = None
        if imprint:
            print("✍ Imprint option enabled. Starting imprinting process...")
            imprinted_pdf_path = imprint_marks_using_mapping(
                ans_path, grading_json, f"{base_name}_imprinted.pdf", extracted_ids
            )
            print("βœ… Imprinting finished. Imprinted PDF at:", imprinted_pdf_path)

        print("🏁 Pipeline finished successfully.")
        return qpms_text, as_text, grading_text, grading_pdf_path, imprinted_pdf_path

    except Exception as e:
        print("❌ Pipeline error:", e)
        import traceback
        traceback.print_exc()
        return f"❌ Error: {e}", None, None, None, None

# ---------------- GRADIO UI ----------------
# Builds the Gradio Blocks app `demo`: three PDF uploads, an imprint checkbox,
# a run button, and output widgets for the two transcripts, the grading
# markdown and the two downloadable PDFs.
with gr.Blocks(title="AI Grading (Fixed - google-genai SDK)") as demo:
    gr.Markdown("## πŸ“˜ AI Grading β€” Fixed with google-genai SDK")
    gr.Markdown("**βœ… Now using the new official `google-genai` SDK (no more ragStoreName errors!)**")

    # Inputs: question paper, markscheme and student answer sheet.
    with gr.Row():
        qp_file = gr.File(label="πŸ“„ Upload Question Paper (PDF)")
        ms_file = gr.File(label="πŸ“„ Upload Markscheme (PDF)")
        ans_file = gr.File(label="πŸ“ Upload Student Answer Sheet (PDF)")

    imprint_toggle = gr.Checkbox(label="✍ Imprint Marks on Student Answer Sheet", value=False)
    run_button = gr.Button("πŸš€ Run Pipeline")

    # Outputs: transcripts side by side, then grading text and file downloads.
    with gr.Row():
        qpms_box = gr.Textbox(label="πŸ“‘ QP+MS Transcript", lines=12)
        as_box = gr.Textbox(label="πŸ“ AS Transcript", lines=12)

    grading_output_box = gr.Textbox(label="🧾 Grading (Markdown)", lines=20)
    grading_pdf_file = gr.File(label="πŸ“₯ Download Grading PDF")
    imprint_pdf_file = gr.File(label="πŸ“₯ Download Imprinted PDF (Optional)")

    def run_pipeline(qp_file_obj, ms_file_obj, ans_file_obj, imprint_flag):
        """Click handler: validate the uploads, run the pipeline, return the five output values.

        Returns a 5-tuple in the order of the `outputs` list wired below:
        QP+MS transcript, AS transcript, grading text, grading PDF path,
        imprinted PDF path. When an upload is missing, the message is placed
        in the first slot and the remaining slots are empty.
        """
        if not qp_file_obj or not ms_file_obj or not ans_file_obj:
            return "❌ Please upload all three files", "", "", None, None

        # The pipeline works on file paths, read from each upload's `.name`.
        qp_path = qp_file_obj.name
        ms_path = ms_file_obj.name
        ans_path = ans_file_obj.name

        qpms_text, as_text, grading_text, grading_pdf_path, imprinted_pdf_path = align_and_grade_pipeline(
            qp_path, ms_path, ans_path, imprint=imprint_flag
        )

        # align_and_grade_pipeline returns None for the later slots on failure;
        # the textboxes receive "" in that case.
        return qpms_text or "", as_text or "", grading_text or "", grading_pdf_path, imprinted_pdf_path

    run_button.click(
        fn=run_pipeline,
        inputs=[qp_file, ms_file, ans_file, imprint_toggle],
        outputs=[qpms_box, as_box, grading_output_box, grading_pdf_file, imprint_pdf_file]
    )

# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()