neurolearn

Sleeping

App Files Files Community

atz21 committed on Sep 27, 2025

Commit

6941b48

verified ·

1 Parent(s): ca54958

Update app.py

Browse files

Files changed (1) hide show

app.py +412 -378

app.py CHANGED Viewed

@@ -1,387 +1,421 @@
-import os
-import re
-import json
-import subprocess
-import img2pdf
-import gradio as gr
-import google.generativeai as genai
-from pdf2image import convert_from_path
-from PIL import Image, ImageDraw, ImageFont
-import cv2
-import numpy as np
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from PyPDF2 import PdfReader, PdfWriter
-from markdown_pdf import MarkdownPdf, Section
-# ---------------- CONFIG ----------------
-genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
-GRID_ROWS, GRID_COLS = 20, 14
-# ---------------- PROMPTS ----------------
-PROMPTS = {
-    "QP_MS_TRANSCRIPTION": {
-        "role": "system",
-        "content": """You are a high-quality OCR/Transcription assistant.
-INPUT: This file is a PDF that first contains the Question Paper and immediately after it the Markscheme.
-TASK:
-1. Transcribe EXACTLY all the questions FIRST (with their total marks).
-2. After ALL questions, transcribe the Markscheme exactly, preserving M/A/R notation in brackets.
-3. Always number the questions sequentially (Question 1, Question 2, Question 3, …) **in the order they appear in the PDF**, even if the PDF shows a different number or leaves it blank. Do NOT skip or leave Question: blank.
-FORMAT:
-==== PAPER TOTAL MARKS ====
-<total marks>
-==== QUESTIONS BEGIN ====
-Question 1.i
-Total Marks: <number>
-QP: <question text>
---QUESTION-END--
-(repeat for all questions)
-==== QUESTIONS END ====
-==== MARKSCHEME BEGIN ====
-Answer 1.i:
-<exact MS for Q1.i with notations M1, A1, R1 etc>
-(repeat for all answers)
-==== MARKSCHEME END ====
-"""
-    },
-    "GRADING_PROMPT": {
-        "role": "system",
-        "content": """Developer: You are an official examiner. Apply the following grading rules precisely.
-### Abbreviations:
-- **M**: Marks for Method
-- **A**: Marks for Accuracy/Answer
-- **R**: Marks for Reasoning
-- **AG**: Answer given in question—no marks
-- **FT**: Follow Through marks (if error carried forward correctly)
-- **MR**: Deduct for misread (once only)
----
-## Grading Instructions
-1. Award marks using official annotations (e.g., M1, A2).
-2. Do not award full marks for answers alone; check for method marks.
-3. A marks usually require a valid M mark first.
-4. Accept valid equivalent forms unless otherwise specified.
-5. Apply FT where appropriate.
-6. Use proper notation: M1A0, A1, etc.
-7. Any lost mark: use red `<span style="color:red">M0</span>` and make Reason red.
----
-## Output Format
-Produce two sections per question/sub-question, following this structure:
-## Question <id>
-### Markscheme vs Student Answer
-| Mark ID | Markscheme Expectation | Student’s Response | Awarded |
-|---------|------------------------|--------------------|---------|
-| M1_1    | Recognise GP           | "r=0.9"            | M1 |
-➡️ **Total: X/Y**
----
-### Examiner’s Report
-At the very end, provide a summary table:
-| Question Number | Marks | Remark |
-|-----------------|-------|--------|
-| 1               | X/Y   | <remark> |
-Then show total clearly as a final line:
-`Total: <obtained_marks>/<max_marks>`
-NOTES:
-- The assistant will receive two transcripts: (1) QP+MS transcription (questions then markscheme) and (2) AS transcription (student answers). Use the QP+MS transcript as the authoritative source.
-"""
-    }
-}
-# ---------------- HELPERS ----------------
-def save_as_pdf(text, filename="output.pdf"):
-    pdf = MarkdownPdf()
-    pdf.add_section(Section(text, toc=False))
-    pdf.save(filename)
-    return filename
-def compress_pdf(input_path, output_path=None, max_size=20*1024*1024):
-    if output_path is None:
-        base, ext = os.path.splitext(input_path)
-        output_path = f"{base}_compressed{ext}"
-    try:
-        size = os.path.getsize(input_path)
-    except Exception:
-        return input_path
-    if size <= max_size:
-        return input_path
-    try:
-        gs_cmd = [
-            "gs", "-sDEVICE=pdfwrite",
-            "-dCompatibilityLevel=1.4",
-            "-dPDFSETTINGS=/ebook",
-            "-dNOPAUSE", "-dQUIET", "-dBATCH",
-            f"-sOutputFile={output_path}", input_path
-        ]
-        subprocess.run(gs_cmd, check=True)
-        new_size = os.path.getsize(output_path)
-        if new_size <= max_size:
-            return output_path
-        return input_path
-    except Exception:
-        return input_path
-def create_model():
-    try:
-        return genai.GenerativeModel("gemini-2.5-pro", generation_config={"temperature": 0})
-    except Exception:
-        return genai.GenerativeModel("gemini-2.5-flash", generation_config={"temperature": 0})
-def merge_pdfs(paths, output_path):
-    writer = PdfWriter()
-    for p in paths:
-        reader = PdfReader(p)
-        for page in reader.pages:
-            writer.add_page(page)
-    with open(output_path, "wb") as f:
-        writer.write(f)
-    return output_path
-def gemini_generate_content(model, prompt_text, file_upload_obj=None, image_obj=None):
-    inputs = [prompt_text]
-    if file_upload_obj:
-        inputs.append(file_upload_obj)
-    if image_obj:
-        inputs.append(image_obj)
-    response = model.generate_content(inputs)
-    raw_text = getattr(response, "text", None)
-    if not raw_text and getattr(response, "candidates", None):
-        raw_text = response.candidates[0].content.parts[0].text
-    if raw_text is None:
-        raw_text = str(response)
-    return raw_text
-# ---------------- PARSERS ----------------
-def extract_question_ids_from_qpms(text):
-    ids = []
-    for m in re.finditer(r"(?im)^\s*Question\s*:\s*([0-9]+(?:[a-zA-Z0-9\.\(\)]+)*)\b", text):
-        ids.append(m.group(1).strip())
-    if not ids:
-        for m in re.finditer(r"(?m)^\s*([0-9]+(?:[a-zA-Z0-9\.\(\)]+)*)\s*[\.\):\-]\s", text):
-            ids.append(m.group(1).strip())
-    return ids if ids else ["NA"]
-def build_as_prompt_with_expected_ids(expected_ids):
-    ids_block = "{\n" + "\n".join(expected_ids) + "\n}" if expected_ids else "{NA}"
-    prompt = f"""You are a high-quality handwritten transcription assistant.
-INPUT: This PDF contains a student's handwritten answer sheet.
-TASK: Transcribe the student's answers exactly (as text), preserving step order and line breaks.
-Attempt to assign each answer to a question ID if student labelled it; else mark as INFERRED.
-Enclose math in ``` blocks, diagrams as [Graph omitted], unreadable as [illegible].
-Expected questions:
-{ids_block}
------------------------
-OUTPUT FORMAT:
-Question <id>
-AS:
-<transcribed answer or placeholder>
-"""
-    return prompt
-def extract_marks_from_grading_exact(grading_text):
-    grading_json = {"grading": []}
-    question_blocks = re.split(r"##\s*Question\s+", grading_text)
-    for block in question_blocks[1:]:
-        first_line = block.strip().splitlines()[0].strip() if block.strip().splitlines() else ""
-        q_id_match = re.match(r"([0-9]+(?:[a-zA-Z]|\([^\)]+\)|(?:\.[a-zA-Z0-9]+))*)", first_line)
-        q_id = q_id_match.group(1).strip() if q_id_match else first_line.split()[0] if first_line else ""
-        awarded = re.findall(r"\b(M\d+|A\d+|R\d+|M0|A0|R0)\b", block)
-        grading_json["grading"].append({"question": q_id, "marks_awarded": awarded})
-    return grading_json
-# ---------------- IMPRINT ----------------
-def ask_gemini_for_mapping_for_page_v2(model, image_path, grading_json, question_scheme, expected_ids, rows=GRID_ROWS, cols=GRID_COLS):
-    ids_block = "{\n" + "\n".join(expected_ids) + "\n}" if expected_ids else "{NA}"
-    prompt = f"""
-You are an exam marker. Identify where each question begins on this page.
-The page has {rows}x{cols} grid (cells 1..{rows*cols}).
-Authoritative question scheme:
-{question_scheme}
-Expected IDs (spot only these):
-{ids_block}
-Grading JSON:
-{json.dumps(grading_json, indent=2)}
-Instructions:
-- Return cell number where first step begins for each question.
-- Only include questions on this page.
-- Handle mislabelled steps: e.g., Q4.i above Q4 may belong to Q3.ii.
-- Avoid placing marks inside another question's answer area.
-- Prefer blank cell to the RIGHT, else LEFT.
-- Never above or below the answer.
-- Return JSON only, like:
-[{{"question":"1.a","cell_number":15}}, ...]
-"""
-    img = Image.open(image_path)
-    response = model.generate_content([prompt, img])
-    raw_text = getattr(response, "text", None)
-    if not raw_text and getattr(response, "candidates", None):
-        raw_text = response.candidates[0].content.parts[0].text
-    if not raw_text:
-        raw_text = str(response)
-    try:
-        start = raw_text.index('[')
-        end = raw_text.rindex(']') + 1
-        return json.loads(raw_text[start:end])
-    except Exception:
-        return []
-def imprint_marks_using_mapping_v2(pdf_path, grading_json, output_pdf, question_scheme, expected_ids, model, rows=GRID_ROWS, cols=GRID_COLS):
-    reader = PdfReader(pdf_path)
-    annotated_page_paths = []
-    pages = convert_from_path(pdf_path)  # keep original size
-    temp_grid_images = []
-    for p_index, page_img in enumerate(pages):
-        img = page_img.convert("RGB")
-        draw = ImageDraw.Draw(img)
         try:
-            font = ImageFont.truetype("arial.ttf", 16)
-        except:
-            font = ImageFont.load_default()
-        cell_w = img.width / cols
-        cell_h = img.height / rows
-        cell_num = 1
-        for r in range(rows):
-            for c in range(cols):
-                x = int(c * cell_w + cell_w / 2)
-                y = int(r * cell_h + cell_h / 2)
-                bbox = draw.textbbox((0,0), str(cell_num), font=font)
-                draw.text((x - (bbox[2]-bbox[0])/2, y - (bbox[3]-bbox[1])/2), str(cell_num), fill="black", font=font)
-                cell_num +=1
-        grid_path = f"page_{p_index+1}_grid.png"
-        img.save(grid_path, "PNG")
-        temp_grid_images.append(grid_path)
-    mappings_per_page = {}
-    with ThreadPoolExecutor(max_workers=min(8,len(temp_grid_images))) as ex:
-        futures = {
-            ex.submit(
-                ask_gemini_for_mapping_for_page_v2, model, img_path, grading_json, question_scheme, expected_ids, rows, cols
-            ): idx for idx,img_path in enumerate(temp_grid_images)
-        }
-        for fut in as_completed(futures):
-            idx = futures[fut]
-            try:
-                mapping_result = fut.result()
-                mappings_per_page[idx] = mapping_result
-                print(f"[IMPRINT] Mapping received for page {idx+1}: {repr(mapping_result)}")
-            except Exception as e:
-                mappings_per_page[idx] = []
-                print(f"[IMPRINT] Mapping failed for page {idx+1}: {repr(e)}")
-    for p_index, page_img in enumerate(pages):
-        img_cv = np.array(page_img.convert("RGB"))
-        img_cv = cv2.cvtColor(img_cv, cv2.COLOR_RGB2BGR)
-        h, w, _ = img_cv.shape
-        cell_w_px, cell_h_px = w/cols, h/rows
-        mapping = mappings_per_page.get(p_index, [])
-        occupied = set()
-        for item in mapping:
-            qid = item.get("question")
-            cell_number = item.get("cell_number")
-            if qid is None or cell_number is None: continue
-            marks_list = next((g["marks_awarded"] for g in grading_json.get("grading", []) if g["question"]==qid), [])
-            marks_text = ",".join(marks_list) if marks_list else "?"
-            row = (cell_number-1)//cols
-            col = (cell_number-1)%cols
-            candidates = []
-            if col+1<cols: candidates.append((row,col+1))
-            candidates.append((row,col))
-            if col-1>=0: candidates.append((row,col-1))
-            chosen = next(((r,c) for r,c in candidates if (r*cols+c+1) not in occupied), (row,col))
-            occupied.add(chosen[0]*cols+chosen[1]+1)
-            x_c = int((chosen[1]+0.5)*cell_w_px)
-            y_c = int((chosen[0]+0.5)*cell_h_px)
-            font_scale = max(0.6,min(1.6,cell_h_px/60))
-            thickness = max(1,int(font_scale*2))
-            cv2.putText(img_cv, marks_text, (x_c,y_c), cv2.FONT_HERSHEY_SIMPLEX, font_scale, (0,0,255), thickness)
-        annotated_path = f"annotated_page_{p_index+1}.png"
-        cv2.imwrite(annotated_path, img_cv)
-        annotated_page_paths.append(annotated_path)
-    with open(output_pdf,"wb") as f:
-        f.write(img2pdf.convert(annotated_page_paths))
-    return compress_pdf(output_pdf)
-# ---------------- PIPELINE ----------------
-def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
-    qp_path = compress_pdf(qp_path)
-    ms_path = compress_pdf(ms_path)
-    ans_path = compress_pdf(ans_path)
-    merged_qpms_path = os.path.splitext(qp_path)[0]+"_merged_qp_ms.pdf"
-    merge_pdfs([qp_path, ms_path], merged_qpms_path)
-    merged_uploaded = genai.upload_file(path=merged_qpms_path, display_name="QP+MS (merged)")
-    ans_uploaded = genai.upload_file(path=ans_path, display_name="Answer Sheet")
     model = create_model()
-    qpms_prompt = PROMPTS["QP_MS_TRANSCRIPTION"]["content"]
-    qpms_text = gemini_generate_content(model, qpms_prompt, file_upload_obj=merged_uploaded)
-    extracted_ids = extract_question_ids_from_qpms(qpms_text)
-    as_prompt = build_as_prompt_with_expected_ids(extracted_ids)
-    as_text = gemini_generate_content(model, as_prompt, file_upload_obj=ans_uploaded)
-    grading_input = (
-        "=== QP+MS TRANSCRIPT BEGIN ===\n"+qpms_text+
-        "\n=== QP+MS TRANSCRIPT END ===\n\n"+
-        "=== ANSWER SHEET TRANSCRIPT BEGIN ===\n"+as_text+
-        "\n=== ANSWER SHEET TRANSCRIPT END ===\n"
-    )
-    grading_prompt_system = PROMPTS["GRADING_PROMPT"]["content"]
-    grading_text = gemini_generate_content(model, grading_prompt_system+"\n\nPlease grade the following transcripts:\n"+grading_input)
-    grading_pdf_path = save_as_pdf(grading_text, os.path.splitext(os.path.basename(ans_path))[0]+"_graded.pdf")
-    grading_json = extract_marks_from_grading_exact(grading_text)
-    imprinted_pdf_path = None
-    if imprint:
-        question_scheme = qpms_text
-        imprinted_pdf_path = os.path.splitext(os.path.basename(ans_path))[0]+"_imprinted.pdf"
-        imprinted_pdf_path = imprint_marks_using_mapping_v2(ans_path, grading_json, imprinted_pdf_path, question_scheme, extracted_ids, model)
-    return qpms_text, as_text, grading_text, grading_pdf_path, imprinted_pdf_path
-# ---------------- GRADIO ----------------
-with gr.Blocks(title="LeadIB AI Grading (Updated Imprint)") as demo:
-    gr.Markdown("## 📘 LeadIB AI Grading — Updated Imprint Pipeline\nUpload QP, Markscheme, and Student Answer Sheet.")
-    with gr.Row():
-        qp_file = gr.File(label="📄 Question Paper (PDF)")
-        ms_file = gr.File(label="📄 Markscheme (PDF)")
-        ans_file = gr.File(label="📝 Student Answer Sheet (PDF)")
-    imprint_toggle = gr.Checkbox(label="✍ Imprint Marks", value=False)
-    run_button = gr.Button("🚀 Run Pipeline")
-    with gr.Row():
-        qpms_box = gr.Textbox(label="📑 QP+MS Transcript", lines=12)
-        as_box = gr.Textbox(label="📝 AS Transcript", lines=12)
-    grading_output_box = gr.Textbox(label="🧾 Grading Markdown", lines=20)
-    grading_pdf_file = gr.File(label="📥 Download Grading PDF")
-    imprint_pdf_file = gr.File(label="📥 Download Imprinted PDF (Optional)")
-    def run_pipeline(qp_file_obj, ms_file_obj, ans_file_obj, imprint_flag):
-        qp_path = qp_file_obj.name
-        ms_path = ms_file_obj.name
-        ans_path = ans_file_obj.name
-        qpms_text, as_text, grading_text, grading_pdf_path, imprinted_pdf_path = align_and_grade_pipeline(
-            qp_path, ms_path, ans_path, imprint=imprint_flag
-        )
-        return qpms_text or "", as_text or "", grading_text or "", grading_pdf_path, imprinted_pdf_path
-    run_button.click(
-        fn=run_pipeline,
-        inputs=[qp_file, ms_file, ans_file, imprint_toggle],
-        outputs=[qpms_box, as_box, grading_output_box, grading_pdf_file, imprint_pdf_file]
-    )
 if __name__ == "__main__":
-    demo.launch()

+import os
+import re
+import json
+import subprocess
+import tempfile
+import time
+import img2pdf
+import gradio as gr
+import google.generativeai as genai
+from markdown_pdf import MarkdownPdf, Section
+from pdf2image import convert_from_path
+from PIL import Image, ImageDraw, ImageFont
+import cv2
+import numpy as np
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from PyPDF2 import PdfReader, PdfWriter
+# ---------------- CONFIG ----------------
+# The Gemini API key is read from the GEMINI_API_KEY environment variable
+# (os.getenv returns None when the variable is unset).
+genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
+# Default grid size (rows, columns) used as the default arguments of the
+# mapping and imprinting functions; cells are numbered 1..GRID_ROWS*GRID_COLS.
+GRID_ROWS, GRID_COLS = 20, 14
+# ---------------- PROMPTS ----------------
+# Prompt templates keyed by pipeline stage. align_and_grade_pipeline reads only
+# the "content" string of each entry; "role" is not read by the code visible here.
+PROMPTS = {
+    # Transcription prompt for the combined question paper + markscheme PDF.
+    # NOTE(review): extract_question_ids_from_qpms parses the "Question <id>"
+    # headings this format asks for - keep the heading format and that parser in sync.
+    "QP_MS_TRANSCRIPTION" : {
+    "role": "system",
+    "content": """You are a high-quality OCR/Transcription assistant.
+INPUT: This file is a PDF that first contains the Question Paper and immediately after it the Markscheme.
+TASK:
+1. Transcribe EXACTLY all the questions FIRST (with their total marks).
+2. After ALL questions, transcribe the Markscheme exactly, preserving M/A/R notation in brackets.
+3. Always number the questions sequentially (Question 1, Question 2, Question 3, …) **in the order they appear in the PDF**, even if the PDF shows a different number or leaves it blank. Do NOT skip or leave Question: blank.
+FORMAT:
+==== PAPER TOTAL MARKS ====
+<total marks>
+==== QUESTIONS BEGIN ====
+Question 1.i
+Total Marks: <number>
+QP: <question text>
+--QUESTION-END--
+Question 1.ii
+Total Marks: <number>
+QP: <question text>
+--QUESTION-END--
+(repeat for all questions in order of appearance)
+==== QUESTIONS END ====
+==== MARKSCHEME BEGIN ====
+Answer 1.i:
+<exact MS for Q1.i with notations M1, A1, R1 etc>
+Answer 1.ii:
+<exact MS for Q1.ii with notations>
+Answer 2 :
+<exact MS for Q2 with notations>
+(repeat for all answers)
+==== MARKSCHEME END ====
+"""
+}
+,
+    # Grading prompt. Its "## Question <id>" headings and M/A/R mark tokens are
+    # what extract_marks_from_grading later parses out of the model's reply.
+    "GRADING_PROMPT": {
+        "role": "system",
+        "content": """Developer: You are an official examiner. Apply the following grading rules precisely.
+### Abbreviations:
+- **M**: Marks for Method
+- **A**: Marks for Accuracy/Answer
+- **R**: Marks for Reasoning
+- **AG**: Answer given in question—no marks
+- **FT**: Follow Through marks (if error carried forward correctly)
+- **MR**: Deduct for misread (once only)
+---
+## Grading Instructions
+1. Award marks using official annotations (e.g., M1, A2).
+2. Do not award full marks for answers alone; check for method marks.
+3. A marks usually require a valid M mark first.
+4. Accept valid equivalent forms unless otherwise specified.
+5. Apply FT where appropriate.
+6. Use proper notation: M1A0, A1, etc.
+7. Any lost mark: use red `<span style="color:red">M0</span>` and make Reason red.
+---
+## Output Format
+Produce two sections per question/sub-question, following this structure:
+## Question <id>
+### Markscheme vs Student Answer
+| Mark ID | Markscheme Expectation | Student’s Response | Awarded |
+|---------|------------------------|--------------------|---------|
+| M1_1    | Recognise GP           | "r=0.9"            | M1 |
+➡️ **Total: X/Y**
+---
+### Examiner’s Report
+At the very end, provide a summary table:
+| Question Number | Marks | Remark |
+|-----------------|-------|--------|
+| 1               | X/Y   | <remark> |
+Then show total clearly as a final line:
+`Total: <obtained_marks>/<max_marks>`
+NOTES:
+- The assistant will receive two transcripts: (1) QP+MS transcription (questions then markscheme) and (2) AS transcription (student answers). Use the QP+MS transcript as the authoritative source of question wording, total marks, and verbatim markscheme entries (M/A/R mark IDs).
+- Match student answers to question IDs and grade according to the provided verbatim markscheme.
+- Produce full markdown as above. Ensure mark IDs used in the grading are present and consistent with the markscheme.
+"""
+    }
+}
+# ---------------- HELPERS ----------------
+def save_as_pdf(text, filename="output.pdf"):
+    """Render Markdown ``text`` into a PDF at ``filename`` and return that path.
+
+    The whole text is added as a single ``Section`` created with ``toc=False``.
+    """
+    document = MarkdownPdf()
+    body = Section(text, toc=False)
+    document.add_section(body)
+    document.save(filename)
+    return filename
+def compress_pdf(input_path, output_path=None, max_size=20*1024*1024):
+    """Shrink a PDF with Ghostscript when it is larger than ``max_size`` bytes.
+
+    Returns ``output_path`` only when compression ran and the result fits within
+    ``max_size``. In every other case (file unreadable, already small enough,
+    Ghostscript failed, result still too large) ``input_path`` is returned.
+    When ``output_path`` is omitted it defaults to ``<base>_compressed<ext>``
+    next to the input.
+    """
+    if output_path is None:
+        stem, suffix = os.path.splitext(input_path)
+        output_path = f"{stem}_compressed{suffix}"
+    try:
+        size = os.path.getsize(input_path)
+    except Exception:
+        # Size unknown (e.g. missing file): hand the path back untouched.
+        return input_path
+    if size <= max_size:
+        print(f"ℹ️ Not compressing {input_path} ({size/1024/1024:.2f} MB <= {max_size/1024/1024} MB)")
+        return input_path
+    print(f"🔎 Compressing {input_path} ({size/1024/1024:.2f} MB) -> {output_path}")
+    ghostscript_args = [
+        "gs", "-sDEVICE=pdfwrite",
+        "-dCompatibilityLevel=1.4",
+        "-dPDFSETTINGS=/ebook",
+        "-dNOPAUSE", "-dQUIET", "-dBATCH",
+        f"-sOutputFile={output_path}", input_path
+    ]
+    try:
+        subprocess.run(ghostscript_args, check=True)
+        new_size = os.path.getsize(output_path)
+        print(f"✅ Compression done. New size: {new_size/1024/1024:.2f} MB")
+        if new_size > max_size:
+            print("⚠️ Compressed file still larger than threshold; returning original")
+            return input_path
+        return output_path
+    except Exception as e:
+        # Best effort: any failure falls back to the uncompressed input.
+        print("❌ Compression error:", e)
+        return input_path
+def create_model():
+    """Return a Gemini model handle, preferring gemini-2.5-pro over gemini-2.5-flash.
+
+    Both handles are created with ``temperature`` 0. If constructing the pro
+    model raises, the flash model is tried; if that raises too, the exception
+    is re-raised after logging.
+    NOTE(review): the fallback only triggers if the constructor itself raises -
+    confirm whether an unavailable model is reported there or only at request time.
+    """
+    def build(model_name):
+        return genai.GenerativeModel(model_name, generation_config={"temperature": 0})
+    try:
+        print("⚡ Attempting to use gemini-2.5-pro model")
+        preferred = build("gemini-2.5-pro")
+        print("✅ Selected model: gemini-2.5-pro")
+        return preferred
+    except Exception as pro_error:
+        print("⚠️ Could not use gemini-2.5-pro:", pro_error)
+    try:
+        print("⚡ Falling back to gemini-2.5-flash model")
+        fallback = build("gemini-2.5-flash")
+        print("✅ Selected model: gemini-2.5-flash")
+        return fallback
+    except Exception as flash_error:
+        print("❌ Failed to create any Gemini model:", flash_error)
+        raise
+def merge_pdfs(paths, output_path):
+    """Concatenate the pages of every PDF in ``paths`` (in order) into ``output_path``.
+
+    Returns ``output_path``.
+    """
+    combined = PdfWriter()
+    for source in paths:
+        for page in PdfReader(source).pages:
+            combined.add_page(page)
+    with open(output_path, "wb") as sink:
+        combined.write(sink)
+    return output_path
+def gemini_generate_content(model, prompt_text, file_upload_obj=None, image_obj=None):
+    """Send a prompt (plus optional attachments) to ``model`` and return the reply text.
+
+    Args:
+        model: object exposing ``generate_content(list)``.
+        prompt_text: prompt string; always the first request part.
+        file_upload_obj: optional uploaded-file handle appended when truthy.
+        image_obj: optional image appended when truthy.
+
+    Returns:
+        The reply's ``text``. If that accessor is empty or raises ``ValueError``
+        (replies without a valid text part), the text parts of the first
+        candidate that has any are joined instead. If nothing can be extracted,
+        ``str(response)`` is returned.
+    """
+    inputs = [prompt_text]
+    if file_upload_obj:
+        inputs.append(file_upload_obj)
+    if image_obj:
+        inputs.append(image_obj)
+    print("📡 Sending request to Gemini (prompt length:", len(prompt_text), "chars )")
+    response = model.generate_content(inputs)
+    try:
+        raw_text = getattr(response, "text", None)
+    except ValueError:
+        # getattr's default only covers AttributeError; the quick accessor
+        # raises ValueError when the reply carries no usable text part.
+        raw_text = None
+    if not raw_text:
+        for candidate in getattr(response, "candidates", None) or []:
+            content = getattr(candidate, "content", None)
+            pieces = [
+                piece
+                for piece in (getattr(part, "text", None) for part in (getattr(content, "parts", None) or []))
+                if piece
+            ]
+            if pieces:
+                raw_text = "".join(pieces)
+                break
+    if raw_text is None:
+        raw_text = str(response)
+    print("📥 Received response (chars):", len(raw_text))
+    return raw_text
+# ---------------- PARSERS ----------------
+def extract_question_ids_from_qpms(text):
+    """Extract question IDs (e.g. ``1``, ``1.i``, ``2(b)``, ``3a``) from a QP+MS transcript.
+
+    Headings of the form ``Question <id>`` are matched first, with or without a
+    colon after ``Question``, because the transcription prompt's format writes
+    ``Question 1.i``. If none are found, a fallback heuristic accepts lines that
+    start with an ID followed by one of ``. ) : -`` and whitespace.
+
+    Returns:
+        A list of IDs in order of first appearance, without duplicates; empty
+        when nothing matched.
+    """
+    print("🔎 Extracting question IDs from QP+MS transcript using regex...")
+    id_pattern = r"[0-9]+(?:(?:\.[a-zA-Z0-9]+)+|(?:\([a-zA-Z0-9]+\))+|[a-zA-Z])*"
+    # (?!\w) rather than \b: a trailing \b cannot match after ")" at end of line,
+    # which truncated IDs such as "2(b)" to "2".
+    heading_re = re.compile(r"(?im)^\s*Question\s*:?\s*(" + id_pattern + r")(?!\w)")
+    ids = list(dict.fromkeys(m.group(1).strip() for m in heading_re.finditer(text)))
+    if ids:
+        print(f"✅ Extracted {len(ids)} question IDs.")
+        print("IDs:", ids)
+        return ids
+    fallback_re = re.compile(r"(?m)^\s*(" + id_pattern + r")\s*[\.\):\-]\s")
+    ids = list(dict.fromkeys(m.group(1).strip() for m in fallback_re.finditer(text)))
+    if ids:
+        print(f"✅ Extracted {len(ids)} question IDs (fallback heuristic).")
+        print("IDs:", ids)
+    else:
+        print("⚠️ No question IDs extracted; will send NA placeholder.")
+    return ids
+def build_as_prompt_with_expected_ids(expected_ids):
+    """Build the answer-sheet transcription prompt listing the expected question IDs.
+
+    The IDs are embedded one per line between braces; an empty or missing list
+    is rendered as ``{NA}``.
+    """
+    ids_block = "{\n" + "\n".join(expected_ids) + "\n}" if expected_ids else "{NA}"
+    return f"""You are a high-quality handwritten transcription assistant.
+INPUT: This PDF contains a student's handwritten answer sheet.
+TASK: Transcribe the student's answers exactly (as text). Preserve step order and line breaks. Attempt to assign each answer to a question ID if the student has labelled it (e.g., "1", "1a", "2(b)", "3"). If the student hasn't labelled answers, segment contiguous answer blocks and attempt to infer question IDs from context — but mark inferred IDs clearly as "INFERRED: <id>"
+Enclose all mathematical expressions in Markdown fenced code blocks (``` triple backticks).
+If a diagram/graph is omitted, write [Graph omitted].
+Unreadable parts: [illegible].
+Unanswered: [No response].
+Do NOT recreate diagrams.
+Ensure consistency and determinism in formatting so subsequent models can grade directly from this aligned format.
+Expected questions (if missing, write NA):
+{ids_block}
+-----------------------
+OUTPUT FORMAT:
+Question <id>
+AS:
+<transcribed answer or placeholder>
+"""
+def extract_marks_from_grading(grading_text):
+    """Collect the M/A/R mark tokens found under each ``## Question <id>`` heading.
+
+    The text is split on ``## Question`` headings. For each block the question
+    ID is taken from its first line and every mark token in the block is
+    recorded in order, duplicates kept. Combined notation such as ``M1A0``,
+    which the grading prompt asks for, is split into ``M1`` and ``A0``. Tokens
+    glued to other word characters (e.g. the mark ID ``M1_1``) are ignored.
+
+    Returns:
+        ``{"grading": [{"question": <id>, "marks_awarded": [<token>, ...]}, ...]}``
+    """
+    print("🔎 Extracting awarded marks from grading output...")
+    # A run of one or more tokens, delimited exactly like \b...\b on both sides.
+    mark_run_re = re.compile(r"(?<!\w)((?:[MAR]\d+)+)(?!\w)")
+    single_mark_re = re.compile(r"[MAR]\d+")
+    question_id_re = re.compile(r"([0-9]+(?:[a-zA-Z]|\([^\)]+\)|(?:\.[a-zA-Z0-9]+))*)")
+    grading_json = {"grading": []}
+    for block in re.split(r"##\s*Question\s+", grading_text)[1:]:
+        block_lines = block.strip().splitlines()
+        first_line = block_lines[0].strip() if block_lines else ""
+        id_match = question_id_re.match(first_line)
+        if id_match:
+            q_id = id_match.group(1).strip()
+        else:
+            q_id = first_line.split()[0] if first_line else ""
+        awarded = [
+            mark
+            for run in mark_run_re.findall(block)
+            for mark in single_mark_re.findall(run)
+        ]
+        grading_json["grading"].append({
+            "question": q_id,
+            "marks_awarded": awarded
+        })
+    print("✅ Extracted grading marks for", len(grading_json["grading"]), "question blocks.")
+    print(json.dumps(grading_json, indent=2))
+    return grading_json
+# ---------------- MAPPING/IMPRINT HELPERS ----------------
+def ask_gemini_for_mapping_for_page(model, image_path, grading_json, rows=GRID_ROWS, cols=GRID_COLS, expected_ids=None):
+    """Ask the model which grid cell each question's answer starts in on one page image.
+
+    Args:
+        model: object exposing ``generate_content(list)``.
+        image_path: path of the page image sent with the prompt.
+        grading_json: grading structure embedded in the prompt as JSON.
+        rows, cols: grid dimensions quoted in the prompt.
+        expected_ids: question IDs the model is told to look for; ``{NA}`` is
+            sent when empty or missing.
+
+    Returns:
+        A list of dicts that each carry ``"question"`` and ``"cell_number"``.
+        An empty list is returned when the reply contains no parseable JSON
+        array; array items lacking either key are dropped.
+    """
+    if not expected_ids:
+        ids_block = "{NA}"
+    else:
+        ids_block = "{\n" + "\n".join(expected_ids) + "\n}"
+    prompt = f"""
+You are an exam marker. Your role is to identify where each question begins on the page.
+The page is divided into a {rows} x {cols} grid. Each cell has a RUNNING NUMBER label (1..{rows*cols}).
+The only questions you should spot are listed here:
+{ids_block}
+For each question in the grading JSON, return the cell NUMBER where the FIRST STEP of that question begins.
+IMPORTANT RULES:
+- Do not place marks inside another question's answer area.
+- Prefer placing the marks in a BLANK cell immediately to the RIGHT of the answer step. If no blank cell is available to the right, then place in a blank cell to the LEFT.
+- Never place marks above or below the answer.
+- If you find something like Q4.i but above it you see "ii)", interpret it as belonging to Q3.ii instead.
+Return JSON only, like:
+[{{"question": "1.a", "cell_number": 15}}, ...]
+Grading JSON:
+{json.dumps(grading_json, indent=2)}
+"""
+    print(f"📡 Sending mapping request for image {image_path} to Gemini...")
+    # Context manager releases the file handle once the request has been sent.
+    with Image.open(image_path) as img:
+        response = model.generate_content([prompt, img])
+    try:
+        raw_text = getattr(response, "text", None)
+    except ValueError:
+        # The quick accessor raises when the reply has no usable text part.
+        raw_text = None
+    if not raw_text and getattr(response, "candidates", None):
+        try:
+            raw_text = response.candidates[0].content.parts[0].text
+        except (AttributeError, IndexError):
+            raw_text = None
+    if not raw_text:
+        raw_text = str(response)
+    print("📥 Mapping response (chars):", len(raw_text))
+    # The JSON array is whatever lies between the first "[" and the last "]".
+    start = raw_text.find("[")
+    end = raw_text.rfind("]")
+    parsed = None
+    if start != -1 and end > start:
+        try:
+            parsed = json.loads(raw_text[start:end + 1])
+        except ValueError:
+            parsed = None
+    if not isinstance(parsed, list):
+        print("⚠️ Failed to parse mapping JSON for", image_path)
+        return []
+    mapping = [
+        entry for entry in parsed
+        if isinstance(entry, dict) and "question" in entry and "cell_number" in entry
+    ]
+    print("✅ Parsed mapping JSON for", image_path, "| entries:", len(mapping))
+    return mapping
+# ---------------- IMPRINTING ----------------
+def imprint_marks_using_mapping(image_path, mapping, output_path, rows=GRID_ROWS, cols=GRID_COLS):
+    """Draw each mapped question's awarded marks onto a page image and save it.
+
+    Args:
+        image_path: page image to annotate.
+        mapping: iterable of dicts with ``"question"``, ``"cell_number"``
+            (1-based, row-major over a ``rows`` x ``cols`` grid) and optionally
+            ``"marks_awarded"`` (list of mark tokens).
+        output_path: where the annotated image is written.
+        rows, cols: grid dimensions used to turn a cell number into pixels.
+
+    Returns:
+        ``output_path``.
+
+    Raises:
+        FileNotFoundError: the image could not be read.
+        OSError: the annotated image could not be written.
+
+    Malformed entries and cell numbers outside ``1..rows*cols`` are logged and
+    skipped rather than drawn off-page.
+    """
+    print(f"🖊️ Imprinting marks on {image_path} -> {output_path}")
+    img = cv2.imread(image_path)
+    if img is None:
+        # cv2.imread signals failure by returning None instead of raising.
+        raise FileNotFoundError(f"Could not read image for imprinting: {image_path}")
+    h, w = img.shape[:2]
+    cell_h, cell_w = h // rows, w // cols
+    total_cells = rows * cols
+    for entry in mapping:
+        try:
+            q = entry["question"]
+            cell_num = int(entry["cell_number"])
+        except (KeyError, TypeError, ValueError) as e:
+            print("⚠️ Imprint error for entry:", entry, "|", e)
+            continue
+        if not 1 <= cell_num <= total_cells:
+            print("⚠️ Imprint error for entry:", entry, "|", f"cell_number outside 1..{total_cells}")
+            continue
+        awarded = entry.get("marks_awarded") or []
+        row = (cell_num - 1) // cols
+        col = (cell_num - 1) % cols
+        # Small offsets keep the text baseline inside the chosen cell.
+        x = col * cell_w + 5
+        y = row * cell_h + 20
+        mark_text = f"{q}: {' '.join(str(mark) for mark in awarded)}"
+        cv2.putText(img, mark_text, (x, y),
+                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
+    if not cv2.imwrite(output_path, img):
+        # cv2.imwrite reports failure through its boolean return value.
+        raise OSError(f"Could not write imprinted image: {output_path}")
+    return output_path
+# ---------------- MAIN PIPELINE ----------------
+def _resolve_upload_path(upload):
+    """Return the filesystem path behind an upload value (a path string or an object with ``.name``)."""
+    if upload is None:
+        raise ValueError("Both the QP+MS PDF and the answer sheet PDF must be provided.")
+    return getattr(upload, "name", upload)
+def _save_numbered_grid(page_img, out_path, rows, cols):
+    """Save a copy of ``page_img`` with running cell numbers (1..rows*cols, row-major) at the cell centres."""
+    canvas = page_img.convert("RGB")
+    draw = ImageDraw.Draw(canvas)
+    try:
+        font = ImageFont.truetype("arial.ttf", 16)
+    except OSError:
+        # Font file not available on this host: use Pillow's built-in font.
+        font = ImageFont.load_default()
+    cell_w = canvas.width / cols
+    cell_h = canvas.height / rows
+    for index in range(rows * cols):
+        label = str(index + 1)
+        row, col = divmod(index, cols)
+        left, top, right, bottom = draw.textbbox((0, 0), label, font=font)
+        x = (col + 0.5) * cell_w - (right - left) / 2
+        y = (row + 0.5) * cell_h - (bottom - top) / 2
+        draw.text((x, y), label, fill="black", font=font)
+    canvas.save(out_path, "PNG")
+    return out_path
+def align_and_grade_pipeline(qp_ms_pdf, as_pdf):
+    """Transcribe, grade and imprint a student's answer sheet.
+
+    Args:
+        qp_ms_pdf: combined question paper + markscheme PDF, given as a path
+            string or an upload object exposing ``.name``.
+        as_pdf: student answer sheet PDF, in the same form.
+
+    Returns:
+        ``(grading_text, pdf_path)``: the grading report produced by the model
+        and the path of the imprinted (possibly compressed) answer-sheet PDF.
+
+    Raises:
+        ValueError: either input is ``None``.
+
+    Both PDFs are uploaded with ``genai.upload_file`` so the model receives the
+    documents themselves and not their paths. Each page is sent for mapping as
+    a copy carrying the numbered grid the mapping prompt refers to, while the
+    marks are imprinted on the clean page.
+    """
+    qp_ms_path = _resolve_upload_path(qp_ms_pdf)
+    as_path = _resolve_upload_path(as_pdf)
+    model = create_model()
+    qp_ms_uploaded = genai.upload_file(path=qp_ms_path, display_name="QP+MS")
+    as_uploaded = genai.upload_file(path=as_path, display_name="Answer Sheet")
+    # Step 1: Transcribe QP + MS
+    print("📄 Transcribing QP+MS PDF...")
+    qpms_text = gemini_generate_content(model, PROMPTS["QP_MS_TRANSCRIPTION"]["content"], file_upload_obj=qp_ms_uploaded)
+    # Step 2: Extract IDs
+    expected_ids = extract_question_ids_from_qpms(qpms_text)
+    # Step 3: Transcribe AS
+    print("📄 Transcribing Answer Sheet PDF...")
+    as_prompt = build_as_prompt_with_expected_ids(expected_ids)
+    as_text = gemini_generate_content(model, as_prompt, file_upload_obj=as_uploaded)
+    # Step 4: Grade
+    grading_prompt = PROMPTS["GRADING_PROMPT"]["content"] + "\n\n" + \
+                     "QP+MS Transcript:\n" + qpms_text + "\n\nAS Transcript:\n" + as_text
+    grading_text = gemini_generate_content(model, grading_prompt)
+    # Step 5: Extract marks JSON; later blocks win when a question ID repeats
+    grading_json = extract_marks_from_grading(grading_text)
+    marks_by_question = {g["question"]: g["marks_awarded"] for g in grading_json["grading"]}
+    # Steps 6-8: per page, render, map on the numbered copy, imprint on the clean page
+    pages = convert_from_path(as_path, dpi=200)
+    temp_dir = tempfile.mkdtemp()
+    imprinted_paths = []
+    scratch_paths = []
+    for number, page in enumerate(pages, start=1):
+        page_path = os.path.join(temp_dir, f"page_{number}.png")
+        grid_path = os.path.join(temp_dir, f"page_{number}_grid.png")
+        out_path = os.path.join(temp_dir, f"page_{number}_imprinted.png")
+        page.save(page_path, "PNG")
+        _save_numbered_grid(page, grid_path, GRID_ROWS, GRID_COLS)
+        scratch_paths.extend([page_path, grid_path])
+        mapping = ask_gemini_for_mapping_for_page(model, grid_path, grading_json,
+                                                 rows=GRID_ROWS, cols=GRID_COLS,
+                                                 expected_ids=expected_ids)
+        mapping = [entry for entry in mapping if isinstance(entry, dict)]
+        # Merge awarded marks into mapping
+        for entry in mapping:
+            question = entry.get("question")
+            if isinstance(question, str) and question in marks_by_question:
+                entry["marks_awarded"] = marks_by_question[question]
+        imprint_marks_using_mapping(page_path, mapping, out_path)
+        imprinted_paths.append(out_path)
+    # Step 9: Convert to PDF
+    output_pdf = os.path.join(temp_dir, "final_output.pdf")
+    with open(output_pdf, "wb") as f:
+        f.write(img2pdf.convert(imprinted_paths))
+    # Best-effort cleanup of intermediate images; the PDF stays for download.
+    for leftover in scratch_paths + imprinted_paths:
+        try:
+            os.remove(leftover)
+        except OSError:
+            pass
+    compressed_pdf = compress_pdf(output_pdf)
+    return grading_text, compressed_pdf
+# ---------------- GRADIO UI ----------------
+def run_gradio():
+    """Build the two-upload Gradio interface and launch it.
+
+    The button passes both uploaded files to ``align_and_grade_pipeline`` and
+    shows the grading report in the textbox and the imprinted PDF in the file
+    output.
+    """
+    def process(qpms_pdf, as_pdf):
+        report, imprinted_pdf = align_and_grade_pipeline(qpms_pdf, as_pdf)
+        return report, imprinted_pdf
+    with gr.Blocks() as demo:
+        gr.Markdown("# 📘 Automated Exam Grader (QP + MS + AS)")
+        with gr.Row():
+            qpms_file = gr.File(label="Upload Question Paper + Markscheme PDF", file_types=[".pdf"])
+            as_file = gr.File(label="Upload Student Answer Sheet PDF", file_types=[".pdf"])
+        run_btn = gr.Button("Run Alignment + Grading")
+        grading_output = gr.Textbox(label="Grading Report (Markdown)", lines=20)
+        final_pdf = gr.File(label="Download Final Imprinted PDF")
+        run_btn.click(
+            fn=process,
+            inputs=[qpms_file, as_file],
+            outputs=[grading_output, final_pdf],
+        )
+    demo.launch()
+# Build and launch the UI only when this file is executed as a script.
 if __name__ == "__main__":
+    run_gradio()