TRIAL

Sleeping

App Files Files Community

atz21 committed on Sep 27, 2025

Commit

c962bfa

verified ·

1 Parent(s): e598b6e

Update app.py

Browse files

Files changed (1) hide show

app.py +203 -352

app.py CHANGED Viewed

@@ -2,18 +2,16 @@ import os
 import re
 import json
 import subprocess
-import tempfile
-import time
 import img2pdf
 import gradio as gr
 import google.generativeai as genai
-from markdown_pdf import MarkdownPdf, Section
 from pdf2image import convert_from_path
 from PIL import Image, ImageDraw, ImageFont
 import cv2
 import numpy as np
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from PyPDF2 import PdfReader, PdfWriter
 # ---------------- CONFIG ----------------
 genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
@@ -21,53 +19,31 @@ GRID_ROWS, GRID_COLS = 20, 14
 # ---------------- PROMPTS ----------------
 PROMPTS = {
-    "QP_MS_TRANSCRIPTION" : {
-    "role": "system",
-    "content": """You are a high-quality OCR/Transcription assistant.
 INPUT: This file is a PDF that first contains the Question Paper and immediately after it the Markscheme.
 TASK:
 1. Transcribe EXACTLY all the questions FIRST (with their total marks).
 2. After ALL questions, transcribe the Markscheme exactly, preserving M/A/R notation in brackets.
 3. Always number the questions sequentially (Question 1, Question 2, Question 3, …) **in the order they appear in the PDF**, even if the PDF shows a different number or leaves it blank. Do NOT skip or leave Question: blank.
 FORMAT:
 ==== PAPER TOTAL MARKS ====
 <total marks>
 ==== QUESTIONS BEGIN ====
 Question 1.i
 Total Marks: <number>
 QP: <question text>
 --QUESTION-END--
-Question 1.ii
-Total Marks: <number>
-QP: <question text>
---QUESTION-END--
-(repeat for all questions in order of appearance)
 ==== QUESTIONS END ====
 ==== MARKSCHEME BEGIN ====
 Answer 1.i:
 <exact MS for Q1.i with notations M1, A1, R1 etc>
-Answer 1.ii:
-<exact MS for Q1.ii with notations>
-Answer 2 :
-<exact MS for Q2 with notations>
 (repeat for all answers)
 ==== MARKSCHEME END ====
 """
-}
-,
-    # GRADING_PROMPT unchanged except we will print steps around calling it
     "GRADING_PROMPT": {
         "role": "system",
         "content": """Developer: You are an official examiner. Apply the following grading rules precisely.
@@ -87,32 +63,25 @@ Answer 2 :
 5. Apply FT where appropriate.
 6. Use proper notation: M1A0, A1, etc.
 7. Any lost mark: use red `<span style="color:red">M0</span>` and make Reason red.
----
 ## Output Format
 Produce two sections per question/sub-question, following this structure:
 ## Question <id>
 ### Markscheme vs Student Answer
 | Mark ID | Markscheme Expectation | Student’s Response | Awarded |
 |---------|------------------------|--------------------|---------|
 | M1_1    | Recognise GP           | "r=0.9"            | M1 |
 ➡️ **Total: X/Y**
 ---
 ### Examiner’s Report
 At the very end, provide a summary table:
 | Question Number | Marks | Remark |
 |-----------------|-------|--------|
 | 1               | X/Y   | <remark> |
 Then show total clearly as a final line:
 `Total: <obtained_marks>/<max_marks>`
 NOTES:
-- The assistant will receive two transcripts: (1) QP+MS transcription (questions then markscheme) and (2) AS transcription (student answers). Use the QP+MS transcript as the authoritative source of question wording, total marks, and verbatim markscheme entries (M/A/R mark IDs).
-- Match student answers to question IDs and grade according to the provided verbatim markscheme.
-- Produce full markdown as above. Ensure mark IDs used in the grading are present and consistent with the markscheme.
 """
     }
 }
@@ -128,17 +97,12 @@ def compress_pdf(input_path, output_path=None, max_size=20*1024*1024):
     if output_path is None:
         base, ext = os.path.splitext(input_path)
         output_path = f"{base}_compressed{ext}"
     try:
         size = os.path.getsize(input_path)
     except Exception:
         return input_path
     if size <= max_size:
-        print(f"ℹ️ Not compressing {input_path} ({size/1024/1024:.2f} MB <= {max_size/1024/1024} MB)")
         return input_path
-    print(f"🔎 Compressing {input_path} ({size/1024/1024:.2f} MB) -> {output_path}")
     try:
         gs_cmd = [
             "gs", "-sDEVICE=pdfwrite",
@@ -149,35 +113,17 @@ def compress_pdf(input_path, output_path=None, max_size=20*1024*1024):
         ]
         subprocess.run(gs_cmd, check=True)
         new_size = os.path.getsize(output_path)
-        print(f"✅ Compression done. New size: {new_size/1024/1024:.2f} MB")
         if new_size <= max_size:
             return output_path
-        else:
-            print("⚠️ Compressed file still larger than threshold; returning original")
-            return input_path
-    except Exception as e:
-        print("❌ Compression error:", e)
         return input_path
 def create_model():
-    """
-    Create the Gemini model and print which model is selected.
-    """
     try:
-        print("⚡ Attempting to use gemini-2.5-pro model")
-        model = genai.GenerativeModel("gemini-2.5-pro", generation_config={"temperature": 0})
-        print("✅ Selected model: gemini-2.5-pro")
-        return model
-    except Exception as e:
-        print("⚠️ Could not use gemini-2.5-pro:", e)
-    try:
-        print("⚡ Falling back to gemini-2.5-flash model")
-        model = genai.GenerativeModel("gemini-2.5-flash", generation_config={"temperature": 0})
-        print("✅ Selected model: gemini-2.5-flash")
-        return model
-    except Exception as e:
-        print("❌ Failed to create any Gemini model:", e)
-        raise
 def merge_pdfs(paths, output_path):
     writer = PdfWriter()
@@ -190,79 +136,49 @@ def merge_pdfs(paths, output_path):
     return output_path
 def gemini_generate_content(model, prompt_text, file_upload_obj=None, image_obj=None):
-    """
-    Send prompt_text and optionally an uploaded file (or an image object) to the model.
-    Returns textual response and prints progress.
-    """
     inputs = [prompt_text]
     if file_upload_obj:
         inputs.append(file_upload_obj)
     if image_obj:
         inputs.append(image_obj)
-    print("📡 Sending request to Gemini (prompt length:", len(prompt_text), "chars )")
     response = model.generate_content(inputs)
     raw_text = getattr(response, "text", None)
     if not raw_text and getattr(response, "candidates", None):
         raw_text = response.candidates[0].content.parts[0].text
     if raw_text is None:
         raw_text = str(response)
-    print("📥 Received response (chars):", len(raw_text))
     return raw_text
 # ---------------- PARSERS ----------------
 def extract_question_ids_from_qpms(text):
     """
-    Extract question IDs from QP+MS transcript output.
-    We expect the QP+MS prompt to produce lines like 'Question: <id>'
-    Return a list of unique IDs in order of appearance.
     """
-    print("🔎 Extracting question IDs from QP+MS transcript using regex...")
     ids = []
-    for m in re.finditer(r"(?im)^\s*Question\s*:\s*([0-9]+(?:(?:\.[a-zA-Z0-9]+)+|(?:\([a-zA-Z0-9]+\))+|[a-zA-Z])*)\b", text):
-        qid = m.group(1).strip()
-        if qid not in ids:
-            ids.append(qid)
-    if ids:
-        print(f"✅ Extracted {len(ids)} question IDs.")
-        print("IDs:", ids)
-        return ids
-    # fallback scans
-    for m in re.finditer(r"(?m)^\s*([0-9]+(?:(?:\.[a-zA-Z0-9]+)+|(?:\([a-zA-Z0-9]+\))+|[a-zA-Z])*)\s*[\.\):\-]\s", text):
         qid = m.group(1).strip()
-        if qid not in ids:
             ids.append(qid)
-    if ids:
-        print(f"✅ Extracted {len(ids)} question IDs (fallback heuristic).")
-        print("IDs:", ids)
-    else:
-        print("⚠️ No question IDs extracted; will send NA placeholder.")
-    return ids
 def build_as_prompt_with_expected_ids(expected_ids):
     """
-    Construct the AS transcription prompt injecting the expected IDs block.
     """
-    if not expected_ids:
-        ids_block = "{NA}"
-    else:
-        ids_block = "{\n" + "\n".join(expected_ids) + "\n}"
     prompt = f"""You are a high-quality handwritten transcription assistant.
 INPUT: This PDF contains a student's handwritten answer sheet.
-TASK: Transcribe the student's answers exactly (as text). Preserve step order and line breaks. Attempt to assign each answer to a question ID if the student has labelled it (e.g., "1", "1a", "2(b)", "3"). If the student hasn't labelled answers, segment contiguous answer blocks and attempt to infer question IDs from context — but mark inferred IDs clearly as "INFERRED: <id>"
-Enclose all mathematical expressions in Markdown fenced code blocks (``` triple backticks).
-If a diagram/graph is omitted, write [Graph omitted].
-Unreadable parts: [illegible].
-Unanswered: [No response].
-Do NOT recreate diagrams.
-Ensure consistency and determinism in formatting so subsequent models can grade directly from this aligned format.
-Expected questions (if missing, write NA):
 {ids_block}
 -----------------------
 OUTPUT FORMAT:
@@ -270,61 +186,55 @@ Question <id>
 AS:
 <transcribed answer or placeholder>
 """
-    return prompt
-def extract_marks_from_grading(grading_text):
-    """
-    Parse the grading markdown produced by the GRADING_PROMPT and extract marks per question.
-    Returns dict: {"grading": [{"question": "1.a", "marks_awarded": ["M1","A1"]}, ...]}
-    """
-    print("🔎 Extracting awarded marks from grading output...")
     grading_json = {"grading": []}
     question_blocks = re.split(r"##\s*Question\s+", grading_text)
     for block in question_blocks[1:]:
         first_line = block.strip().splitlines()[0].strip() if block.strip().splitlines() else ""
         q_id_match = re.match(r"([0-9]+(?:[a-zA-Z]|\([^\)]+\)|(?:\.[a-zA-Z0-9]+))*)", first_line)
-        if not q_id_match:
-            q_id = first_line.split()[0] if first_line else ""
-        else:
-            q_id = q_id_match.group(1).strip()
         awarded = re.findall(r"\b(M\d+|A\d+|R\d+|M0|A0|R0)\b", block)
-        seen = set()
-        awarded_unique = []
-        for m in awarded:
-            if m not in seen:
-                awarded_unique.append(m)
-                seen.add(m)
-        grading_json["grading"].append({
-            "question": q_id,
-            "marks_awarded": awarded_unique
-        })
-    print("✅ Extracted grading marks for", len(grading_json["grading"]), "question blocks.")
-    print(json.dumps(grading_json, indent=2))
     return grading_json
-# ---------------- MAPPING/IMPRINT HELPERS ----------------
-def ask_gemini_for_mapping_for_page(model, image_path, grading_json, rows=GRID_ROWS, cols=GRID_COLS):
     """
-    Send a single page image along with the grading_json; LLM should return JSON mapping.
     """
     prompt = f"""
-You are an exam marker. Your role is to identify where each question begins on the page.
-The page is divided into a {rows} x {cols} grid. Each cell has a RUNNING NUMBER label (1..{rows*cols}).
-For each question in the grading JSON, return the cell NUMBER where the FIRST STEP of that question begins.
-IMPORTANT RULES:
-- Do not place marks inside another question's answer area.
-- Prefer placing the marks in a BLANK cell immediately to the RIGHT of the answer step. If no blank cell is available to the right, then place in a blank cell to the LEFT.
-- Never place marks above or below the answer.
-- If a question starts on a previous page, you may omit it for this page.
-Return JSON only, like:
-[{{"question": "1.a", "cell_number": 15}}, ...]
-Grading JSON:
 {json.dumps(grading_json, indent=2)}
 """
-    print(f"📡 Sending mapping request for image {image_path} to Gemini...")
     img = Image.open(image_path)
     response = model.generate_content([prompt, img])
     raw_text = getattr(response, "text", None)
@@ -332,250 +242,193 @@ Grading JSON:
         raw_text = response.candidates[0].content.parts[0].text
     if not raw_text:
         raw_text = str(response)
-    print("📥 Mapping response (chars):", len(raw_text))
     try:
         start = raw_text.index('[')
         end = raw_text.rindex(']') + 1
-        json_part = raw_text[start:end]
-        mapping = json.loads(json_part)
-        print("✅ Parsed mapping JSON for", image_path, "| entries:", len(mapping))
-        return mapping
     except Exception:
-        match = re.search(r'(\[.*\])', raw_text, re.DOTALL)
-        if match:
-            try:
-                mapping = json.loads(match.group(1))
-                print("✅ Parsed mapping JSON (alt) for", image_path, "| entries:", len(mapping))
-                return mapping
-            except Exception:
-                pass
-        print("⚠️ Failed to parse mapping JSON for", image_path)
         return []
-def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, model, rows=GRID_ROWS, cols=GRID_COLS):
     """
-    Convert PDF to images, create grid-numbered images for sending to Gemini,
-    send all page images in parallel to Gemini for mapping, then annotate and produce imprinted PDF.
     """
-    print("📄 Converting answer PDF to images for imprinting...")
-    pages = convert_from_path(pdf_path, dpi=200)
     annotated_page_paths = []
     temp_grid_images = []
-    for p_index, page in enumerate(pages):
-        img = page.convert("RGB")
-        w, h = img.size
-        cell_w, cell_h = w / cols, h / rows
         draw = ImageDraw.Draw(img)
         try:
-            num_font = ImageFont.truetype("arial.ttf", 16)
-        except Exception:
-            num_font = ImageFont.load_default()
         cell_num = 1
         for r in range(rows):
             for c in range(cols):
                 x = int(c * cell_w + cell_w / 2)
                 y = int(r * cell_h + cell_h / 2)
-                text = str(cell_num)
-                bbox = draw.textbbox((0, 0), text, font=num_font)
-                tw = bbox[2] - bbox[0]
-                th = bbox[3] - bbox[1]
-                draw.text((x - tw/2, y - th/2), text, fill="black", font=num_font)
-                cell_num += 1
-        temp_path = f"page_{p_index+1}_grid.png"
-        img.save(temp_path, "PNG")
-        temp_grid_images.append(temp_path)
-        print("🛰 Created grid image:", temp_path)
-    # Send all grid images in parallel to Gemini to get mappings
-    print("📡 Sending all page images to Gemini in parallel for mapping...")
     mappings_per_page = {}
-    model_local = model
-    with ThreadPoolExecutor(max_workers=min(8, len(temp_grid_images))) as ex:
-        futures = {ex.submit(ask_gemini_for_mapping_for_page, model_local, img_path, grading_json, rows, cols): idx
-                   for idx, img_path in enumerate(temp_grid_images)}
         for fut in as_completed(futures):
             idx = futures[fut]
             try:
-                mapping = fut.result()
-            except Exception as e:
-                print("⚠️ Mapping request failed for page", idx, e)
-                mapping = []
-            mappings_per_page[idx] = mapping
-    # Annotate original pages according to returned mappings
-    print("🖊 Annotating pages with marks...")
-    for p_index, page in enumerate(pages):
-        page_img = page.convert("RGB")
-        img_cv = np.array(page_img)
         img_cv = cv2.cvtColor(img_cv, cv2.COLOR_RGB2BGR)
         h, w, _ = img_cv.shape
-        cell_w_px, cell_h_px = w / cols, h / rows
         mapping = mappings_per_page.get(p_index, [])
         occupied = set()
         for item in mapping:
             qid = item.get("question")
             cell_number = item.get("cell_number")
-            if qid is None or cell_number is None:
-                continue
-            marks_list = next((g["marks_awarded"] for g in grading_json.get("grading", []) if g["question"] == qid), [])
-            if not marks_list:
-                marks_list = next((g["marks_awarded"] for g in grading_json.get("grading", [])
-                                   if g["question"].lower() == (qid or "").lower()), [])
             marks_text = ",".join(marks_list) if marks_list else "?"
-            row = (cell_number - 1) // cols
-            col = (cell_number - 1) % cols
             candidates = []
-            if col + 1 < cols:
-                candidates.append((row, col + 1))
-            candidates.append((row, col))
-            if col - 1 >= 0:
-                candidates.append((row, col - 1))
-            chosen = None
-            for (r_c, c_c) in candidates:
-                cell_id = r_c * cols + c_c + 1
-                if cell_id not in occupied:
-                    chosen = (r_c, c_c)
-                    occupied.add(cell_id)
-                    break
-            if chosen is None:
-                chosen = (row, col)
-            r_c, c_c = chosen
-            x_c = int((c_c + 1) * cell_w_px - cell_w_px * 0.1)
-            y_c = int((r_c + 0.5) * cell_h_px)
-            font_scale = max(0.6, min(1.6, cell_h_px / 60.0))
-            thickness = max(1, int(font_scale * 2))
-            cv2.putText(img_cv, marks_text, (x_c, y_c), cv2.FONT_HERSHEY_SIMPLEX,
-                        font_scale, (0, 0, 255), thickness, cv2.LINE_AA)
         annotated_path = f"annotated_page_{p_index+1}.png"
         cv2.imwrite(annotated_path, img_cv)
         annotated_page_paths.append(annotated_path)
-        print("✅ Annotated page saved:", annotated_path)
-    with open(output_pdf, "wb") as f:
-        f.write(img2pdf.convert(annotated_page_paths))
-    compressed = compress_pdf(output_pdf)
-    print("📑 Imprinted PDF saved to:", compressed)
-    return compressed
-# ---------------- MAIN PIPELINE ----------------
 def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
-    """
-    Final pipeline implementing requested flow and verbose console logging.
-    """
-    try:
-        print("🔁 Starting pipeline...")
-        # Step 0: compress as needed
-        qp_path = compress_pdf(qp_path)
-        ms_path = compress_pdf(ms_path)
-        ans_path = compress_pdf(ans_path)
-        # Merge QP + MS
-        merged_qpms_path = os.path.splitext(qp_path)[0] + "_merged_qp_ms.pdf"
-        merge_pdfs([qp_path, ms_path], merged_qpms_path)
-        print("📎 Merged QP + MS ->", merged_qpms_path)
-        # Upload files to Gemini
-        print("🔼 Uploading files to Gemini...")
-        merged_uploaded = genai.upload_file(path=merged_qpms_path, display_name="QP+MS (merged)")
-        ans_uploaded = genai.upload_file(path=ans_path, display_name="Answer Sheet")
-        print("✅ Upload complete.")
-        # Create model and print which selected
-        model = create_model()
-        # Step 1.i: QP+MS transcription (first)
-        print("1.i) Transcribing QP+MS (questions first, then full markscheme)...")
-        qpms_prompt = PROMPTS["QP_MS_TRANSCRIPTION"]["content"]
-        qpms_text = gemini_generate_content(model, qpms_prompt, file_upload_obj=merged_uploaded)
-        print("📄 QP+MS transcription received. Saving debug file: debug_qpms_transcript.txt")
-        with open("debug_qpms_transcript.txt", "w", encoding="utf-8") as f:
-            f.write(qpms_text)
-        # Step 2: extract serial numbers (question IDs) using regex from qpms_text
-        extracted_ids = extract_question_ids_from_qpms(qpms_text)
-        if not extracted_ids:
-            extracted_ids = ["NA"]
-        # Step 1.ii: Build AS prompt injecting extracted IDs and transcribe AS
-        print("1.ii) Building AS transcription prompt with expected question IDs and sending to Gemini...")
-        as_prompt = build_as_prompt_with_expected_ids(extracted_ids)
-        as_text = gemini_generate_content(model, as_prompt, file_upload_obj=ans_uploaded)
-        print("📝 AS transcription received. Saving debug file: debug_as_transcript.txt")
-        with open("debug_as_transcript.txt", "w", encoding="utf-8") as f:
-            f.write(as_text)
-        # Step 3: Grading - send both transcripts to grading model
-        print("2) Preparing grading input and sending to Gemini for grading...")
-        grading_input = (
-            "=== QP+MS TRANSCRIPT BEGIN ===\n"
-            + qpms_text
-            + "\n=== QP+MS TRANSCRIPT END ===\n\n"
-            + "=== ANSWER SHEET TRANSCRIPT BEGIN ===\n"
-            + as_text
-            + "\n=== ANSWER SHEET TRANSCRIPT END ===\n"
-        )
-        grading_prompt_system = PROMPTS["GRADING_PROMPT"]["content"]
-        grading_text = gemini_generate_content(model, grading_prompt_system + "\n\nPlease grade the following transcripts:\n" + grading_input)
-        print("🧾 Grading output received. Saving debug file: debug_grading.md")
-        with open("debug_grading.md", "w", encoding="utf-8") as f:
-            f.write(grading_text)
-        # Save grading PDF
-        base_name = os.path.splitext(os.path.basename(ans_path))[0]
-        grading_pdf_path = save_as_pdf(grading_text, f"{base_name}_graded.pdf")
-        print("📄 Grading PDF saved:", grading_pdf_path)
-        # Step 4: Extract marks for imprinting
-        grading_json = extract_marks_from_grading(grading_text)
-        with open("debug_grading_json.json", "w", encoding="utf-8") as f:
-            json.dump(grading_json, f, indent=2, ensure_ascii=False)
-        print("🔧 Grading marks extraction complete.")
-        imprinted_pdf_path = None
-        if imprint:
-            print("✍ Imprint option enabled. Starting imprinting process (parallel mapping requests)...")
-            imprinted_pdf_path = f"{base_name}_imprinted.pdf"
-            imprinted_pdf_path = imprint_marks_using_mapping(ans_path, grading_json, imprinted_pdf_path, model)
-            print("✅ Imprinting finished. Imprinted PDF at:", imprinted_pdf_path)
-        print("🏁 Pipeline finished successfully.")
-        return qpms_text, as_text, grading_text, grading_pdf_path, imprinted_pdf_path
-    except Exception as e:
-        print("❌ Pipeline error:", e)
-        return f"❌ Error: {e}", None, None, None, None
-# ---------------- GRADIO UI ----------------
-with gr.Blocks(title="LeadIB AI Grading (Final Flow — Verbose)") as demo:
-    gr.Markdown("## 📘 LeadIB AI Grading — Final Flow\nUpload **Question Paper**, **Markscheme**, and **Student Answer Sheet**.\nFlow: merge QP+MS -> transcribe QP+MS (questions first, full markscheme) -> extract IDs -> transcribe AS with expected IDs -> grade -> (optional) imprint. Console prints show progress.")
     with gr.Row():
-        qp_file = gr.File(label="📄 Upload Question Paper (PDF)")
-        ms_file = gr.File(label="📄 Upload Markscheme (PDF)")
-        ans_file = gr.File(label="📝 Upload Student Answer Sheet (PDF)")
-    imprint_toggle = gr.Checkbox(label="✍ Imprint Marks on Student Answer Sheet", value=False)
     run_button = gr.Button("🚀 Run Pipeline")
     with gr.Row():
         qpms_box = gr.Textbox(label="📑 QP+MS Transcript", lines=12)
         as_box = gr.Textbox(label="📝 AS Transcript", lines=12)
-    grading_output_box = gr.Textbox(label="🧾 Grading (Markdown)", lines=20)
     grading_pdf_file = gr.File(label="📥 Download Grading PDF")
     imprint_pdf_file = gr.File(label="📥 Download Imprinted PDF (Optional)")
@@ -583,11 +436,9 @@ with gr.Blocks(title="LeadIB AI Grading (Final Flow — Verbose)") as demo:
         qp_path = qp_file_obj.name
         ms_path = ms_file_obj.name
         ans_path = ans_file_obj.name
         qpms_text, as_text, grading_text, grading_pdf_path, imprinted_pdf_path = align_and_grade_pipeline(
             qp_path, ms_path, ans_path, imprint=imprint_flag
         )
         return qpms_text or "", as_text or "", grading_text or "", grading_pdf_path, imprinted_pdf_path
     run_button.click(

 import re
 import json
 import subprocess
 import img2pdf
 import gradio as gr
 import google.generativeai as genai
 from pdf2image import convert_from_path
 from PIL import Image, ImageDraw, ImageFont
 import cv2
 import numpy as np
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from PyPDF2 import PdfReader, PdfWriter
+from markdown_pdf import MarkdownPdf, Section
 # ---------------- CONFIG ----------------
 genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
 # ---------------- PROMPTS ----------------
 PROMPTS = {
+    "QP_MS_TRANSCRIPTION": {
+        "role": "system",
+        "content": """You are a high-quality OCR/Transcription assistant.
 INPUT: This file is a PDF that first contains the Question Paper and immediately after it the Markscheme.
 TASK:
 1. Transcribe EXACTLY all the questions FIRST (with their total marks).
 2. After ALL questions, transcribe the Markscheme exactly, preserving M/A/R notation in brackets.
 3. Always number the questions sequentially (Question 1, Question 2, Question 3, …) **in the order they appear in the PDF**, even if the PDF shows a different number or leaves it blank. Do NOT skip or leave Question: blank.
 FORMAT:
 ==== PAPER TOTAL MARKS ====
 <total marks>
 ==== QUESTIONS BEGIN ====
 Question 1.i
 Total Marks: <number>
 QP: <question text>
 --QUESTION-END--
+(repeat for all questions)
 ==== QUESTIONS END ====
 ==== MARKSCHEME BEGIN ====
 Answer 1.i:
 <exact MS for Q1.i with notations M1, A1, R1 etc>
 (repeat for all answers)
 ==== MARKSCHEME END ====
 """
+    },
     "GRADING_PROMPT": {
         "role": "system",
         "content": """Developer: You are an official examiner. Apply the following grading rules precisely.
 5. Apply FT where appropriate.
 6. Use proper notation: M1A0, A1, etc.
 7. Any lost mark: use red `<span style="color:red">M0</span>` and make Reason red.
+---
 ## Output Format
 Produce two sections per question/sub-question, following this structure:
 ## Question <id>
 ### Markscheme vs Student Answer
 | Mark ID | Markscheme Expectation | Student’s Response | Awarded |
 |---------|------------------------|--------------------|---------|
 | M1_1    | Recognise GP           | "r=0.9"            | M1 |
 ➡️ **Total: X/Y**
 ---
 ### Examiner’s Report
 At the very end, provide a summary table:
 | Question Number | Marks | Remark |
 |-----------------|-------|--------|
 | 1               | X/Y   | <remark> |
 Then show total clearly as a final line:
 `Total: <obtained_marks>/<max_marks>`
 NOTES:
+- The assistant will receive two transcripts: (1) QP+MS transcription (questions then markscheme) and (2) AS transcription (student answers). Use the QP+MS transcript as the authoritative source.
 """
     }
 }
     if output_path is None:
         base, ext = os.path.splitext(input_path)
         output_path = f"{base}_compressed{ext}"
     try:
         size = os.path.getsize(input_path)
     except Exception:
         return input_path
     if size <= max_size:
         return input_path
     try:
         gs_cmd = [
             "gs", "-sDEVICE=pdfwrite",
         ]
         subprocess.run(gs_cmd, check=True)
         new_size = os.path.getsize(output_path)
         if new_size <= max_size:
             return output_path
+        return input_path
+    except Exception:
         return input_path
 def create_model():
     # Build a Gemini model handle configured with temperature 0.
     # Tries "gemini-2.5-pro" first; if constructing it raises, returns a
     # "gemini-2.5-flash" model built with the same generation config instead.
     # NOTE(review): the fallback only triggers when the GenerativeModel(...)
     # call itself raises. Confirm that the SDK validates the model name at
     # construction time; if it does not, an unavailable "pro" model would only
     # fail later (at generate_content) and this fallback would never run.
     try:
+        return genai.GenerativeModel("gemini-2.5-pro", generation_config={"temperature": 0})
+    except Exception:
+        return genai.GenerativeModel("gemini-2.5-flash", generation_config={"temperature": 0})
 def merge_pdfs(paths, output_path):
     writer = PdfWriter()
     return output_path
 def gemini_generate_content(model, prompt_text, file_upload_obj=None, image_obj=None):
     """
     Send a prompt (plus an optional uploaded file and/or image) to the model
     and return the response as text.

     Args:
         model: object exposing ``generate_content(inputs)``.
         prompt_text: the prompt string; always the first input.
         file_upload_obj: optional uploaded-file handle, appended when truthy.
         image_obj: optional image object, appended when truthy.

     Returns:
         The response text. Falls back to the text parts of the first
         candidate, and finally to ``str(response)``, so the caller always
         receives a string.
     """
     inputs = [prompt_text]
     if file_upload_obj:
         inputs.append(file_upload_obj)
     if image_obj:
         inputs.append(image_obj)
     response = model.generate_content(inputs)
     # The SDK's ``.text`` quick accessor raises ValueError when the response
     # holds no valid part (e.g. a blocked prompt); getattr's default only
     # covers AttributeError, so that case has to be caught explicitly.
     try:
         raw_text = getattr(response, "text", None)
     except ValueError:
         raw_text = None
     if not raw_text:
         candidate_text = _first_candidate_text(response)
         if candidate_text is not None:
             raw_text = candidate_text
     if raw_text is None:
         raw_text = str(response)
     return raw_text
 def _first_candidate_text(response):
     """Return the joined text parts of the first candidate, or None if it has none."""
     candidates = getattr(response, "candidates", None)
     if not candidates:
         return None
     content = getattr(candidates[0], "content", None)
     parts = getattr(content, "parts", None) or []
     texts = [t for t in (getattr(p, "text", None) for p in parts) if t is not None]
     return "".join(texts) if texts else None
 # ---------------- PARSERS ----------------
 def extract_question_ids_from_qpms(text):
     """
     Return every question id found in the QP+MS transcript, in order of
     appearance and *without* deduplication.

     Lines of the form ``Question <id>`` or ``Question: <id>`` are tried
     first (the colon is optional because the transcription format writes
     ``Question 1.i``). If none are found, a looser heuristic picks up
     leading numbering such as ``1.`` or ``2)`` at the start of a line.

     An id is a digit-led token optionally followed by ``.x`` or ``(x)``
     segments, e.g. ``1``, ``1.i``, ``3a``, ``2(b)``. Trailing punctuation
     is never part of the id.

     Returns:
         A list of id strings, or ``["NA"]`` when nothing is found.
     """
     if not text:
         return ["NA"]
     # Structured id pattern: avoids the greedy "any of . ( )" class that,
     # combined with a trailing \b, cut "2(b)" down to "2(b".
     id_pattern = r"\d[A-Za-z0-9]*(?:\.[A-Za-z0-9]+|\([A-Za-z0-9]+\))*"
     explicit = re.compile(r"(?im)^\s*Question\s*:?\s*(" + id_pattern + r")")
     ids = [m.group(1) for m in explicit.finditer(text)]
     if not ids:
         # Fallback: a leading number followed by a delimiter and whitespace.
         numbered = re.compile(r"(?m)^\s*(" + id_pattern + r")\s*[\.\):\-]\s")
         ids = [m.group(1) for m in numbered.finditer(text)]
     return ids if ids else ["NA"]
 def build_as_prompt_with_expected_ids(expected_ids):
     """
+    Build the AS transcription prompt; also useful to produce an ids_block string
+    that can be passed to the imprint mapping prompt.
     """
+    ids_block = "{\n" + "\n".join(expected_ids) + "\n}" if expected_ids else "{NA}"
     prompt = f"""You are a high-quality handwritten transcription assistant.
 INPUT: This PDF contains a student's handwritten answer sheet.
+TASK: Transcribe the student's answers exactly (as text), preserving step order and line breaks.
+Attempt to assign each answer to a question ID if student labelled it; else mark as INFERRED.
+Enclose math in ``` blocks, diagrams as [Graph omitted], unreadable as [illegible].
+Expected questions:
 {ids_block}
 -----------------------
 OUTPUT FORMAT:
 AS:
 <transcribed answer or placeholder>
 """
+    return prompt, ids_block
def extract_marks_from_grading_exact(grading_text):
    """
    Parse the grading markdown and collect the mark codes found in each
    ``## Question <id>`` block, in order and without deduplication.

    Mark codes are ``M``, ``A`` or ``R`` followed by digits. Concatenated
    notation such as ``M1A0`` is split into its individual codes
    (``M1``, ``A0``). Tokens like ``M1_1`` (mark ids with a suffix) are not
    treated as mark codes.

    Returns:
        ``{"grading": [{"question": <id>, "marks_awarded": [<code>, ...]}, ...]}``
    """
    grading_json = {"grading": []}
    # Everything before the first "## Question" heading is preamble.
    for block in re.split(r"##\s*Question\s+", grading_text)[1:]:
        lines = block.strip().splitlines()
        first_line = lines[0].strip() if lines else ""
        q_id_match = re.match(r"([0-9]+(?:[a-zA-Z]|\([^\)]+\)|(?:\.[a-zA-Z0-9]+))*)", first_line)
        if q_id_match:
            q_id = q_id_match.group(1).strip()
        else:
            # No numeric id: fall back to the first word of the heading line.
            q_id = first_line.split()[0] if first_line else ""
        awarded = []
        # Match whole runs of codes first so "M1A0" is not lost to the word
        # boundary between "M1" and "A0", then split each run into codes.
        for run in re.findall(r"\b((?:[MAR]\d+)+)\b", block):
            awarded.extend(re.findall(r"[MAR]\d+", run))
        grading_json["grading"].append({"question": q_id, "marks_awarded": awarded})
    return grading_json
+# ---------------- IMPRINT ----------------
def ask_gemini_for_mapping_for_page_v2(model, image_path, grading_json, question_scheme, ids_block, rows=GRID_ROWS, cols=GRID_COLS):
    """
    Ask Gemini to map question IDs (from ids_block) to grid cell numbers on
    one page image.

    The ids_block is passed explicitly (the same block used when transcribing
    the student answers) and the model is instructed to return JSON only: a
    list of {"question":"<id>","cell_number":N} entries, optionally with a
    "note". The prompt also tells the model how to treat mislabelled
    subparts, e.g. a bare 'ii)' above 'Q4.i' may belong to Q3.ii.

    Args:
        model: object exposing ``generate_content(inputs)``.
        image_path: path of the page image to send.
        grading_json: grading summary, embedded in the prompt as JSON.
        question_scheme: transcription excerpt embedded in the prompt.
        ids_block: text block listing the expected question ids.
        rows, cols: grid dimensions described to the model.

    Returns:
        The list of mapping dicts parsed from the reply, or ``[]`` when no
        JSON array can be extracted.
    """
    # Literal braces in the JSON example are doubled: inside an f-string a
    # single "{" starts a replacement field and the example would fail to format.
    prompt = f"""
You are an exam marker. Identify where each question listed in the ids block begins on this page.
The page has {rows}x{cols} grid (cells 1..{rows*cols}).
QUESTION IDS (expected) you must look for:
{ids_block}
Question scheme (authoritative transcription excerpt):
{question_scheme}
Grading JSON (marks awarded summary):
{json.dumps(grading_json, indent=2)}
Instructions (IMPORTANT):
- Only return questions from the provided IDs block above.
- For each question you find on this page, return the single grid cell number where the first step of that question begins.
- If you see a subpart like `ii)` with no leading question number directly above or below another labelled subpart, try to infer which question it belongs to (example: if you find `Q4.i` and above it you see `ii)` alone, it may be `Q3.ii` — if you infer like this explain your reasoning briefly in the JSON entry's optional "note" field).
- Avoid placing marks inside another question's area; prefer an adjacent blank cell to the RIGHT if possible, else LEFT.
- Return JSON only, exactly like:
[
  {{"question":"1.a","cell_number":15}},
  {{"question":"3.ii","cell_number":23, "note":"inferred from unlabeled ii above Q4.i"}},
  ...
]
- If no instances of an expected question appear on this page, return an empty list: [].
"""
    # Open the page image only for the duration of the request so the file
    # handle is released afterwards.
    with Image.open(image_path) as img:
        response = model.generate_content([prompt, img])
    # The SDK's ``.text`` accessor can raise ValueError on a response with no
    # valid part; treat that the same as "no text".
    try:
        raw_text = getattr(response, "text", None)
    except ValueError:
        raw_text = None
    if not raw_text and getattr(response, "candidates", None):
        try:
            raw_text = response.candidates[0].content.parts[0].text
        except (AttributeError, IndexError):
            raw_text = None
    if not raw_text:
        raw_text = str(response)
    # Extract the outermost [...] span; the model may wrap it in prose or fences.
    try:
        start = raw_text.index('[')
        end = raw_text.rindex(']') + 1
        mapping = json.loads(raw_text[start:end])
    except ValueError:
        # No brackets found or invalid JSON: return an empty mapping for safety.
        return []
    if not isinstance(mapping, list):
        return []
    # Keep only object entries; the example "..." placeholder or stray scalars are dropped.
    return [entry for entry in mapping if isinstance(entry, dict)]
def _draw_grid_numbers(page_img, font, rows, cols):
    """Return an RGB copy of page_img with each grid cell's number drawn at its centre."""
    img = page_img.convert("RGB")
    draw = ImageDraw.Draw(img)
    cell_w = img.width / cols
    cell_h = img.height / rows
    for cell_num in range(1, rows * cols + 1):
        r, c = divmod(cell_num - 1, cols)
        # centre of the cell
        x = int(c * cell_w + cell_w / 2)
        y = int(r * cell_h + cell_h / 2)
        label = str(cell_num)
        bbox = draw.textbbox((0, 0), label, font=font)
        draw.text(
            (x - (bbox[2] - bbox[0]) / 2, y - (bbox[3] - bbox[1]) / 2),
            label,
            fill="black",
            font=font,
        )
    return img


def _marks_text_for(grading_json, qid):
    """Return the comma-joined marks of the first grading entry matching qid, or "?"."""
    for entry in grading_json.get("grading", []):
        if entry.get("question") == qid:
            marks = entry.get("marks_awarded") or []
            # str() each mark so a non-string entry cannot break the join
            return ",".join(str(m) for m in marks) if marks else "?"
    return "?"


def _annotate_page(page_img, mapping, grading_json, rows, cols):
    """Return a BGR array of page_img with the awarded marks drawn per the cell mapping."""
    img_cv = cv2.cvtColor(np.array(page_img.convert("RGB")), cv2.COLOR_RGB2BGR)
    h, w, _ = img_cv.shape
    cell_w_px, cell_h_px = w / cols, h / rows
    font_scale = max(0.6, min(1.6, cell_h_px / 60))
    thickness = max(1, int(font_scale * 2))
    note_scale = font_scale * 0.8
    note_thickness = max(1, int(thickness / 2))
    total_cells = rows * cols
    occupied = set()

    for item in mapping if isinstance(mapping, list) else []:
        if not isinstance(item, dict):
            continue  # model output is untrusted; ignore malformed entries
        qid = item.get("question")
        raw_cell = item.get("cell_number")
        if qid is None or raw_cell is None:
            continue
        try:
            cell_number = int(raw_cell)  # tolerate "23" or 23.0 from the model
        except (TypeError, ValueError):
            continue
        # keep the mark on the page even if the model names a cell off the grid
        cell_number = min(max(cell_number, 1), total_cells)

        marks_text = _marks_text_for(grading_json, qid)

        # map cell_number -> row/col, then prefer: right neighbour, same cell, left neighbour
        row, col = divmod(cell_number - 1, cols)
        candidates = []
        if col + 1 < cols:
            candidates.append((row, col + 1))
        candidates.append((row, col))
        if col - 1 >= 0:
            candidates.append((row, col - 1))
        chosen = next(
            ((r, c) for r, c in candidates if (r * cols + c + 1) not in occupied),
            (row, col),
        )
        occupied.add(chosen[0] * cols + chosen[1] + 1)

        x_c = int((chosen[1] + 0.5) * cell_w_px)
        y_c = int((chosen[0] + 0.5) * cell_h_px)
        # clamp so the text stays inside the image (OpenCV colours are BGR)
        text_size = cv2.getTextSize(marks_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness)[0]
        x_draw = max(0, min(w - text_size[0], x_c - text_size[0] // 2))
        y_draw = max(text_size[1], min(h - 1, y_c + text_size[1] // 2))
        cv2.putText(img_cv, marks_text, (x_draw, y_draw), cv2.FONT_HERSHEY_SIMPLEX, font_scale, (0, 0, 255), thickness)

        # optional: if the mapping entry carries a "note", draw a small 'i' marker beside the marks
        if item.get("note"):
            note_text = "i"
            ns = cv2.getTextSize(note_text, cv2.FONT_HERSHEY_SIMPLEX, note_scale, note_thickness)[0]
            nx = max(0, x_draw + text_size[0] + 4)
            ny = max(ns[1], y_draw)
            cv2.putText(img_cv, note_text, (nx, ny), cv2.FONT_HERSHEY_SIMPLEX, note_scale, (0, 0, 0), note_thickness)
    return img_cv


def imprint_marks_using_mapping_v2(pdf_path, grading_json, output_pdf, question_scheme, model, expected_ids, rows=GRID_ROWS, cols=GRID_COLS):
    """
    Imprint marks onto the student answer PDF while preserving original page size.

    Steps:
    - Read the first page's media box (points) and use it for every page.
    - Render pages with convert_from_path(..., size=(width_pt, height_pt)),
      so one image pixel is roughly one PDF point.
    - Save a grid-numbered copy of each page and ask the model (in parallel)
      which cell each expected question starts in.
    - Draw the awarded marks on the un-gridded pages without rescaling.
    - Write the result with img2pdf using the original page dimensions.

    Intermediate PNGs live in a private temporary directory so that concurrent
    runs cannot overwrite each other's files and nothing is left behind.

    Args:
        pdf_path: path of the student answer PDF.
        grading_json: dict with a "grading" list of
            {"question": ..., "marks_awarded": [...]} entries.
        output_pdf: path the imprinted PDF is written to.
        question_scheme: transcription text forwarded to the mapping prompt.
        model: model object forwarded to ask_gemini_for_mapping_for_page_v2.
        expected_ids: iterable of question-id strings used to build the ids block.
        rows, cols: grid dimensions.

    Returns:
        Whatever compress_pdf(output_pdf) returns.
    """
    import tempfile  # local import: only this function needs it

    reader = PdfReader(pdf_path)
    # use the first page size as canonical for all pages (could be extended to per-page)
    page0 = reader.pages[0]
    width_pt = float(page0.mediabox.width)   # points
    height_pt = float(page0.mediabox.height)
    # NOTE: pdf2image's size expects a tuple of ints
    pages = convert_from_path(pdf_path, size=(int(width_pt), int(height_pt)))

    # Load the label font once instead of once per page.
    try:
        font = ImageFont.truetype("arial.ttf", 16)
    except (OSError, ImportError):
        font = ImageFont.load_default()

    # Build ids_block from expected_ids
    ids_block = "{\n" + "\n".join(expected_ids) + "\n}" if expected_ids else "{NA}"

    with tempfile.TemporaryDirectory(prefix="imprint_") as work_dir:
        # Grid-numbered copies are what the model sees when asked for the mapping.
        grid_paths = []
        for p_index, page_img in enumerate(pages):
            grid_path = os.path.join(work_dir, f"page_{p_index+1}_grid.png")
            _draw_grid_numbers(page_img, font, rows, cols).save(grid_path, "PNG")
            grid_paths.append(grid_path)

        # Ask the model to map each page (parallel). Skip the pool entirely when
        # there are no pages: ThreadPoolExecutor rejects max_workers=0.
        mappings_per_page = {}
        if grid_paths:
            with ThreadPoolExecutor(max_workers=min(8, len(grid_paths))) as ex:
                futures = {
                    ex.submit(
                        ask_gemini_for_mapping_for_page_v2,
                        model, img_path, grading_json, question_scheme, ids_block, rows, cols,
                    ): idx
                    for idx, img_path in enumerate(grid_paths)
                }
                for fut in as_completed(futures):
                    idx = futures[fut]
                    try:
                        mappings_per_page[idx] = fut.result()
                    except Exception:
                        # Deliberate best-effort: a failed page is left unannotated
                        # rather than aborting the whole document.
                        mappings_per_page[idx] = []

        # Annotate original pages (no rescaling)
        annotated_page_paths = []
        for p_index, page_img in enumerate(pages):
            annotated = _annotate_page(page_img, mappings_per_page.get(p_index, []), grading_json, rows, cols)
            annotated_path = os.path.join(work_dir, f"annotated_page_{p_index+1}.png")
            cv2.imwrite(annotated_path, annotated)
            annotated_page_paths.append(annotated_path)

        # Recreate the PDF with the original page dimensions (width_pt, height_pt).
        # This must happen before the temporary directory is removed.
        with open(output_pdf, "wb") as f:
            f.write(img2pdf.convert(annotated_page_paths, layout_fun=img2pdf.get_layout_fun((width_pt, height_pt))))

    return compress_pdf(output_pdf)
+# ---------------- PIPELINE ----------------
 def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
+    qp_path = compress_pdf(qp_path)
+    ms_path = compress_pdf(ms_path)
+    ans_path = compress_pdf(ans_path)
+    merged_qpms_path = os.path.splitext(qp_path)[0]+"_merged_qp_ms.pdf"
+    merge_pdfs([qp_path, ms_path], merged_qpms_path)
+    merged_uploaded = genai.upload_file(path=merged_qpms_path, display_name="QP+MS (merged)")
+    ans_uploaded = genai.upload_file(path=ans_path, display_name="Answer Sheet")
+    model = create_model()
+    qpms_prompt = PROMPTS["QP_MS_TRANSCRIPTION"]["content"]
+    qpms_text = gemini_generate_content(model, qpms_prompt, file_upload_obj=merged_uploaded)
+    # extract question ids (no deduplication)
+    extracted_ids = extract_question_ids_from_qpms(qpms_text)
+    # build AS prompt (and get ids_block)
+    as_prompt, ids_block = build_as_prompt_with_expected_ids(extracted_ids)
+    as_text = gemini_generate_content(model, as_prompt, file_upload_obj=ans_uploaded)
+    grading_input = (
+        "=== QP+MS TRANSCRIPT BEGIN ===\n"+qpms_text+
+        "\n=== QP+MS TRANSCRIPT END ===\n\n"+
+        "=== ANSWER SHEET TRANSCRIPT BEGIN ===\n"+as_text+
+        "\n=== ANSWER SHEET TRANSCRIPT END ===\n"
+    )
+    grading_prompt_system = PROMPTS["GRADING_PROMPT"]["content"]
+    grading_text = gemini_generate_content(model, grading_prompt_system+"\n\nPlease grade the following transcripts:\n"+grading_input)
+    grading_pdf_path = save_as_pdf(grading_text, os.path.splitext(os.path.basename(ans_path))[0]+"_graded.pdf")
+    grading_json = extract_marks_from_grading_exact(grading_text)
+    imprinted_pdf_path = None
+    if imprint:
+        question_scheme = qpms_text
+        imprinted_pdf_path = os.path.splitext(os.path.basename(ans_path))[0]+"_imprinted.pdf"
+        # Pass expected_ids (extracted_ids) to imprint function so it can build the ids_block and ask the model
+        imprinted_pdf_path = imprint_marks_using_mapping_v2(ans_path, grading_json, imprinted_pdf_path, question_scheme, model, expected_ids=extracted_ids)
+    return qpms_text, as_text, grading_text, grading_pdf_path, imprinted_pdf_path
+# ---------------- GRADIO ----------------
+with gr.Blocks(title="LeadIB AI Grading (Updated Imprint)") as demo:
+    gr.Markdown("## 📘 LeadIB AI Grading — Updated Imprint Pipeline\nUpload QP, Markscheme, and Student Answer Sheet.")
     with gr.Row():
+        qp_file = gr.File(label="📄 Question Paper (PDF)")
+        ms_file = gr.File(label="📄 Markscheme (PDF)")
+        ans_file = gr.File(label="📝 Student Answer Sheet (PDF)")
+    imprint_toggle = gr.Checkbox(label="✍ Imprint Marks", value=False)
     run_button = gr.Button("🚀 Run Pipeline")
     with gr.Row():
         qpms_box = gr.Textbox(label="📑 QP+MS Transcript", lines=12)
         as_box = gr.Textbox(label="📝 AS Transcript", lines=12)
+    grading_output_box = gr.Textbox(label="🧾 Grading Markdown", lines=20)
     grading_pdf_file = gr.File(label="📥 Download Grading PDF")
     imprint_pdf_file = gr.File(label="📥 Download Imprinted PDF (Optional)")
         qp_path = qp_file_obj.name
         ms_path = ms_file_obj.name
         ans_path = ans_file_obj.name
         qpms_text, as_text, grading_text, grading_pdf_path, imprinted_pdf_path = align_and_grade_pipeline(
             qp_path, ms_path, ans_path, imprint=imprint_flag
         )
         return qpms_text or "", as_text or "", grading_text or "", grading_pdf_path, imprinted_pdf_path
     run_button.click(