neurolearn

Sleeping

App Files Files Community

atz21 committed on Sep 24, 2025

Commit

e4d3932

verified ·

1 Parent(s): 9cdcbb2

Update app.py

Browse files

Files changed (1) hide show

app.py +231 -371

app.py CHANGED Viewed

@@ -15,60 +15,50 @@ import numpy as np
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from PyPDF2 import PdfReader, PdfWriter
-# ---------- CONFIG ----------
 genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
 GRID_ROWS, GRID_COLS = 20, 14
-# ---------- PROMPTS (updated) ----------
 PROMPTS = {
-    "QP_MS_TRANSCRIBE": {
         "role": "system",
         "content": """You are a high-quality OCR/Transcription assistant.
-INPUT: This file is a PDF that **first contains the Question Paper** and immediately after it **the Markscheme**.
-TASK: Produce an exact transcription in plain text with clear separators. For every question in the Question Paper extract and output:
-- Question ID (exact as printed, e.g., "1", "2(a)", "3.b", "4(ii)")
-- Question text (exact wording; do not change punctuation)
-- Total marks for the question (if printed; otherwise try to infer/leave blank)
-FOR THE MARKSCHEME: Transcribe the markscheme **verbatim** exactly as it appears. Do NOT alter mark IDs, abbreviations, indentation, or descriptions. The markscheme transcription must be faithful — errors in transcription should be kept as-is rather than "corrected".
-OUTPUT FORMAT:
-- Plain text with clearly delimited blocks. Use a pattern like:
-----
-QUESTION BEGIN
-ID: <id>
-QTEXT:
-<question text (multiline)>
-TOTAL_MARKS: <integer or empty>
-MARKSCHEME:
-<verbatim markscheme lines for this question (multiline)>
-QUESTION END
-----
-Repeat for every question in order. If some part is not available, leave the field empty but keep the block structure.
-"""
-    },
-    "AS_TRANSCRIBE": {
-        "role": "system",
-        "content": """You are a high-quality handwritten transcription assistant.
-INPUT: This PDF contains a student's handwritten answer sheet.
-TASK: Transcribe the student's answers exactly (as text). Preserve step order and line breaks. Attempt to assign each answer to a question ID if the student has labelled it (e.g., "1", "1a", "2(b)", "3"). If the student hasn't labelled answers, segment contiguous answer blocks and attempt to infer question IDs from context — but mark inferred IDs clearly as "INFERRED: <id>".
-OUTPUT FORMAT:
-Produce plain text with clearly delimited answer blocks using the pattern:
-----
-ANSWER BEGIN
-ID: <id or INFERRED:... or EMPTY>
-ANSWER:
-<transcribed student answer text (multiline)>
-ANSWER END
-----
-Repeat for each student answer block found.
 """
     },
     "GRADING_PROMPT": {
         "role": "system",
         "content": """Developer: You are an official examiner. Apply the following grading rules precisely.
@@ -90,29 +80,35 @@ Repeat for each student answer block found.
 7. Any lost mark: use red `<span style="color:red">M0</span>` and make Reason red.
 ---
 ## Output Format
-Produce two sections per question/sub-question:
----
-## Question X
 ### Markscheme vs Student Answer
 | Mark ID | Markscheme Expectation | Student’s Response | Awarded |
 |---------|------------------------|--------------------|---------|
 | M1_1    | Recognise GP           | "r=0.9"            | M1 |
-➡️ **Total: 6/7**
 ---
 ### Examiner’s Report
 At the very end, provide a summary table:
 | Question Number | Marks | Remark |
 |-----------------|-------|--------|
-| 1               | 6/7   | C      |
-Then show total clearly:
-`Total: 6/7`
-NOTES: The assistant will receive a structured alignment JSON (questions list with qp, total_marks, ms verbatim, and as transcribed). Grade each question independently, using the markscheme provided in the `ms` field (verbatim) and the student's `as`. Provide full markdown output as described above.
 """
     }
 }
-# -------------------- HELPERS (unchanged unless needed) --------------------
 def save_as_pdf(text, filename="output.pdf"):
     pdf = MarkdownPdf()
     pdf.add_section(Section(text, toc=False))
@@ -130,10 +126,8 @@ def compress_pdf(input_path, output_path=None, max_size=20*1024*1024):
         return input_path
     if size <= max_size:
-        print(f"ℹ️ Not compressing {input_path} ({size/1024/1024:.2f} MB <= {max_size/1024/1024} MB)")
         return input_path
-    print(f"🔎 Compressing {input_path} ({size/1024/1024:.2f} MB) -> {output_path}")
     try:
         gs_cmd = [
             "gs", "-sDEVICE=pdfwrite",
@@ -144,27 +138,118 @@ def compress_pdf(input_path, output_path=None, max_size=20*1024*1024):
         ]
         subprocess.run(gs_cmd, check=True)
         new_size = os.path.getsize(output_path)
-        print(f"✅ Compression done. New size: {new_size/1024/1024:.2f} MB")
         if new_size <= max_size:
             return output_path
         else:
-            print("⚠️ Compressed file still larger than threshold; returning original")
             return input_path
-    except Exception as e:
-        print("❌ Compression error:", e)
         return input_path
 def create_model():
     try:
-        print("⚡ Using gemini-2.5-pro model")
         return genai.GenerativeModel("gemini-2.5-pro", generation_config={"temperature": 0})
     except Exception:
-        print("⚡ Falling back to gemini-2.5-flash model")
         return genai.GenerativeModel("gemini-2.5-flash", generation_config={"temperature": 0})
 def extract_marks_from_grading(grading_text):
     grading_json = {"grading": []}
-    # Split by question header
     question_blocks = re.split(r"##\s*Question\s+", grading_text)
     for block in question_blocks[1:]:
         first_line = block.strip().splitlines()[0].strip()
@@ -186,159 +271,11 @@ def extract_marks_from_grading(grading_text):
         })
     return grading_json
-# ---------- PDF merging helper ----------
-def merge_pdfs(paths, output_path):
-    writer = PdfWriter()
-    for p in paths:
-        reader = PdfReader(p)
-        for page in reader.pages:
-            writer.add_page(page)
-    with open(output_path, "wb") as f:
-        writer.write(f)
-    return output_path
-# ---------- Transcript parsing helpers ----------
-def parse_qp_ms_transcript(text):
-    """
-    Parse QP+MS transcript produced according to the QP_MS_TRANSCRIBE prompt blocks.
-    Expected block markers: QUESTION BEGIN ... QUESTION END with fields ID, QTEXT, TOTAL_MARKS, MARKSCHEME.
-    Return list of questions: {id, qp, total_marks, ms}
-    """
-    questions = []
-    # Try to find blocks using the explicit markers we requested
-    blocks = re.findall(r"QUESTION BEGIN(.*?)QUESTION END", text, flags=re.DOTALL | re.IGNORECASE)
-    if blocks:
-        for block in blocks:
-            id_match = re.search(r"ID:\s*(.+)", block)
-            qtext_match = re.search(r"QTEXT:\s*(.*?)\n(?:TOTAL_MARKS:|MARKSCHEME:)", block, flags=re.DOTALL)
-            tm_match = re.search(r"TOTAL_MARKS:\s*(.*)", block)
-            ms_match = re.search(r"MARKSCHEME:\s*(.*)", block, flags=re.DOTALL)
-            qid = id_match.group(1).strip() if id_match else ""
-            qtext = qtext_match.group(1).strip() if qtext_match else ""
-            total_marks = tm_match.group(1).strip() if tm_match else ""
-            # try to normalize total_marks to int if possible
-            try:
-                total_marks = int(re.search(r"\d+", total_marks).group(0)) if total_marks else None
-            except Exception:
-                total_marks = None
-            ms = ms_match.group(1).strip() if ms_match else ""
-            questions.append({
-                "id": qid,
-                "qp": qtext,
-                "total_marks": total_marks,
-                "ms": ms
-            })
-        return questions
-    # Fallback: If model didn't follow markers, try splitting by lines that look like question headers
-    # This is conservative: find headings like "1", "1.", "1(a)" at line starts
-    parts = re.split(r"(?m)^\s*(\d+(?:\([a-zA-Z0-9]+\)|[a-zA-Z]|\.[a-zA-Z0-9]+)?)\s*[\.\):\-]\s*", text)
-    # parts list pattern: [pretext, id1, body1, id2, body2, ...]
-    if len(parts) >= 3:
-        it = iter(parts)
-        pre = next(it)
-        while True:
-            try:
-                qid = next(it).strip()
-                body = next(it)
-            except StopIteration:
-                break
-            # try to separate question text and markscheme inside body using "Markscheme" keyword
-            ms_split = re.split(r"(?i)\bmarkscheme\b|(?i)\bmark scheme\b", body, maxsplit=1)
-            if len(ms_split) == 2:
-                qtext = ms_split[0].strip(":-\n ")
-                ms = ms_split[1].strip()
-            else:
-                # try to look for "Marks" summary then rest
-                m_search = re.search(r"(?i)\bmarks[:\s]*\d+", body)
-                if m_search:
-                    # take text before marks as qtext
-                    qtext = body[:m_search.start()].strip()
-                    ms = body[m_search.start():].strip()
-                else:
-                    # fallback: put entire body into qp and ms empty
-                    qtext = body.strip()
-                    ms = ""
-            # try to find total marks integer
-            tm = None
-            tm_found = re.search(r"(?i)(?:total\s*marks|marks|[\/]\s*\d+|out of)\s*[:\s]*?(\d+)", body)
-            if tm_found:
-                try:
-                    tm = int(tm_found.group(1))
-                except:
-                    tm = None
-            questions.append({
-                "id": qid,
-                "qp": qtext,
-                "total_marks": tm,
-                "ms": ms
-            })
-        return questions
-    # If nothing found, return one block with raw text as fallback
-    return [{"id": "1", "qp": text.strip(), "total_marks": None, "ms": ""}]
-def parse_as_transcript(text):
-    """
-    Parse AS transcript into answer blocks. Expected markers ANSWER BEGIN ... ANSWER END.
-    Return list: {id, ans}
-    """
-    answers = []
-    blocks = re.findall(r"ANSWER BEGIN(.*?)ANSWER END", text, flags=re.DOTALL | re.IGNORECASE)
-    if blocks:
-        for block in blocks:
-            id_match = re.search(r"ID:\s*(.+)", block)
-            ans_match = re.search(r"ANSWER:\s*(.*)", block, flags=re.DOTALL)
-            qid = id_match.group(1).strip() if id_match else ""
-            ans = ans_match.group(1).strip() if ans_match else block.strip()
-            answers.append({
-                "id": qid,
-                "as": ans
-            })
-        return answers
-    # Fallback: split by likely question labels in the student's transcription, e.g., "1.", "1)", "1a."
-    parts = re.split(r"(?m)^\s*(\d+(?:[a-zA-Z]|\([^\)]+\))?)\s*[\.\):\-]\s*", text)
-    if len(parts) >= 3:
-        it = iter(parts)
-        pre = next(it)
-        while True:
-            try:
-                qid = next(it).strip()
-                body = next(it)
-            except StopIteration:
-                break
-            answers.append({"id": qid, "as": body.strip()})
-        return answers
-    # If no structure at all, try to chunk by double newlines
-    chunks = [c.strip() for c in text.split("\n\n") if c.strip()]
-    for i, c in enumerate(chunks, start=1):
-        answers.append({"id": f"INFERRED:{i}", "as": c})
-    return answers
-# ---------- Gemini call wrapper ----------
-def gemini_generate_content(model, prompt_text, file_upload_obj=None):
     """
-    Helper: send prompt_text and optionally a single uploaded file to model.generate_content.
-    Returns the textual response (str).
     """
-    inputs = [prompt_text]
-    if file_upload_obj:
-        inputs.append(file_upload_obj)
-    response = model.generate_content(inputs)
-    # Response handling as in original script
-    raw_text = getattr(response, "text", None)
-    if not raw_text and getattr(response, "candidates", None):
-        # new-style candidate chain
-        raw_text = response.candidates[0].content.parts[0].text
-    if not raw_text:
-        # attempt to stringify response
-        raw_text = str(response)
-    return raw_text
-# ---------- Imprinting and mapping helpers remain unchanged ----------
-def ask_gemini_for_mapping_for_page(model, image_path, grading_json, rows=GRID_ROWS, cols=GRID_COLS):
     prompt = f"""
 You are an exam marker. Your role is to identify where each question begins on the page.
 The page is divided into a {rows} x {cols} grid. Each cell has a RUNNING NUMBER label (1..{rows*cols}).
@@ -360,16 +297,13 @@ Grading JSON:
     raw_text = getattr(response, "text", None)
     if not raw_text and getattr(response, "candidates", None):
         raw_text = response.candidates[0].content.parts[0].text
-    print("🔎 Gemini mapping raw output (page):")
-    print(raw_text[:1000] + ("..." if len(raw_text) > 1000 else ""))
     try:
         start = raw_text.index('[')
         end = raw_text.rindex(']') + 1
         json_part = raw_text[start:end]
         mapping = json.loads(json_part)
         return mapping
-    except Exception as e:
         match = re.search(r'(\[.*\])', raw_text, re.DOTALL)
         if match:
             try:
@@ -380,11 +314,15 @@ Grading JSON:
         return []
 def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, model, rows=GRID_ROWS, cols=GRID_COLS):
     pages = convert_from_path(pdf_path, dpi=200)
     annotated_page_paths = []
-    print(f"📄 Converted answer PDF to {len(pages)} page image(s) for imprinting.")
     temp_grid_images = []
     for p_index, page in enumerate(pages):
         img = page.convert("RGB")
         w, h = img.size
@@ -412,19 +350,30 @@ def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, model, rows=
         img.save(temp_path, "PNG")
         temp_grid_images.append(temp_path)
-    for p_index, grid_img_path in enumerate(temp_grid_images):
-        print(f"\n🛰 Sending page {p_index+1} to Gemini for mapping...")
-        mapping = ask_gemini_for_mapping_for_page(model, grid_img_path, grading_json, rows, cols)
-        print(f"🔁 Parsed mapping for page {p_index+1}: {mapping}")
-        page_img = pages[p_index].convert("RGB")
         img_cv = np.array(page_img)
         img_cv = cv2.cvtColor(img_cv, cv2.COLOR_RGB2BGR)
         h, w, _ = img_cv.shape
         cell_w_px, cell_h_px = w / cols, h / rows
         occupied = set()
         for item in mapping:
             qid = item.get("question")
             cell_number = item.get("cell_number")
@@ -434,14 +383,13 @@ def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, model, rows=
             marks_list = next((g["marks_awarded"] for g in grading_json.get("grading", []) if g["question"] == qid), [])
             if not marks_list:
                 marks_list = next((g["marks_awarded"] for g in grading_json.get("grading", [])
-                                   if g["question"].lower() == qid.lower()), [])
             marks_text = ",".join(marks_list) if marks_list else "?"
             row = (cell_number - 1) // cols
             col = (cell_number - 1) % cols
-            placed = False
             candidates = []
             if col + 1 < cols:
                 candidates.append((row, col + 1))
@@ -456,7 +404,6 @@ def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, model, rows=
                     chosen = (r_c, c_c)
                     occupied.add(cell_id)
                     break
             if chosen is None:
                 chosen = (row, col)
@@ -464,9 +411,6 @@ def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, model, rows=
             x_c = int((c_c + 1) * cell_w_px - cell_w_px * 0.1)
             y_c = int((r_c + 0.5) * cell_h_px)
-            print(f"Page {p_index+1} | Question {qid} -> mapped cell {cell_number} -> chosen cell ({r_c},{c_c})"
-                  f" -> pixel coords ({x_c},{y_c}) | marks: {marks_text}")
             font_scale = max(0.6, min(1.6, cell_h_px / 60.0))
             thickness = max(1, int(font_scale * 2))
             cv2.putText(img_cv, marks_text, (x_c, y_c), cv2.FONT_HERSHEY_SIMPLEX,
@@ -475,192 +419,108 @@ def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, model, rows=
         annotated_path = f"annotated_page_{p_index+1}.png"
         cv2.imwrite(annotated_path, img_cv)
         annotated_page_paths.append(annotated_path)
-        print(f"🖊 Annotated page saved: {annotated_path}")
     with open(output_pdf, "wb") as f:
         f.write(img2pdf.convert(annotated_page_paths))
-    print(f"📑 Imprinted PDF saved to: {output_pdf}")
     compressed = compress_pdf(output_pdf)
-    if compressed != output_pdf:
-        print(f"📦 Imprinted PDF compressed: {compressed}")
     return compressed
-# ---------- Main pipeline (rewritten) ----------
 def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
     """
-    New flow:
-    1) compress as needed
     2) merge QP + MS -> merged_qpms.pdf
-    3) upload merged_qpms and ans separately
-    4) send two parallel transcription requests:
-       - merged_qpms with QP_MS_TRANSCRIBE prompt
-       - ans with AS_TRANSCRIBE prompt
-    5) parse transcripts to get per-question qp, ms, and per-answer as
-    6) align locally by question ID
-    7) send aligned structure to grading prompt
-    8) extract marks and optionally imprint
     """
     try:
         qp_path = compress_pdf(qp_path)
         ms_path = compress_pdf(ms_path)
         ans_path = compress_pdf(ans_path)
-        # Merge QP + MS into single PDF
         merged_qpms_path = os.path.splitext(qp_path)[0] + "_merged_qp_ms.pdf"
         merge_pdfs([qp_path, ms_path], merged_qpms_path)
-        print(f"📎 Merged QP + MS -> {merged_qpms_path}")
-        # Upload files to Gemini
-        print("🔼 Uploading files to Gemini...")
         merged_uploaded = genai.upload_file(path=merged_qpms_path, display_name="QP+MS (merged)")
         ans_uploaded = genai.upload_file(path=ans_path, display_name="Answer Sheet")
         model = create_model()
-        # Prepare prompts
-        qpms_prompt = PROMPTS["QP_MS_TRANSCRIBE"]["content"]
-        as_prompt = PROMPTS["AS_TRANSCRIBE"]["content"]
-        # Send both requests in parallel
-        print("📡 Sending transcription requests (QP+MS & AS) in parallel...")
-        transcripts = {}
-        with ThreadPoolExecutor(max_workers=2) as ex:
-            futures = {
-                ex.submit(gemini_generate_content, model, qpms_prompt, merged_uploaded): "qpms",
-                ex.submit(gemini_generate_content, model, as_prompt, ans_uploaded): "as"
-            }
-            for fut in as_completed(futures):
-                key = futures[fut]
-                try:
-                    res_text = fut.result()
-                except Exception as e:
-                    res_text = f"❌ Error during transcription: {e}"
-                transcripts[key] = res_text
-                print(f"✅ Transcription complete for: {key} (chars: {len(res_text)})")
-        qpms_text = transcripts.get("qpms", "")
-        as_text = transcripts.get("as", "")
-        # Debug: save transcripts for review
         with open("debug_qpms_transcript.txt", "w", encoding="utf-8") as f:
             f.write(qpms_text)
         with open("debug_as_transcript.txt", "w", encoding="utf-8") as f:
             f.write(as_text)
-        # Parse transcripts
-        print("🔧 Parsing QP+MS transcript...")
-        qpms_questions = parse_qp_ms_transcript(qpms_text)
-        print(f"Found {len(qpms_questions)} questions in QP+MS transcript.")
-        print("🔧 Parsing Answer Sheet transcript...")
-        as_answers = parse_as_transcript(as_text)
-        print(f"Found {len(as_answers)} answer blocks in AS transcript.")
-        # Build alignment: map by normalized IDs
-        def normalize_id(qid):
-            if not qid:
-                return ""
-            s = qid.strip().lower()
-            s = re.sub(r"[\.\)\(:\s]+", "", s)
-            return s
-        answers_map = {}
-        for a in as_answers:
-            nid = normalize_id(a.get("id", ""))
-            if nid == "":
-                # if empty id, try to infer using INFERRED: or use a running fallback index
-                nid = a.get("id", "")
-            # store first matching block (if multiple blocks for same id, append)
-            if nid in answers_map:
-                answers_map[nid] += "\n\n" + a.get("as", "")
-            else:
-                answers_map[nid] = a.get("as", "")
-        aligned_questions = []
-        for q in qpms_questions:
-            qid = q.get("id", "")
-            nid = normalize_id(qid)
-            # try direct id match
-            student_ans = answers_map.get(nid)
-            # try alternative matches (e.g., '1a' vs '1(a)')
-            if student_ans is None:
-                for k in answers_map:
-                    if k.startswith(nid) or nid.startswith(k) or (nid and nid.replace(" ", "") in k):
-                        student_ans = answers_map[k]
-                        break
-            # fallback: look for first answer that contains the question id as text (loose)
-            if student_ans is None:
-                for k, v in answers_map.items():
-                    if qid and qid.lower() in k:
-                        student_ans = v
-                        break
-            aligned_questions.append({
-                "id": qid,
-                "qp": q.get("qp", ""),
-                "total_marks": q.get("total_marks"),
-                "ms": q.get("ms", ""),           # verbatim markscheme block
-                "as": student_ans if student_ans is not None else ""
-            })
-        # If any answer blocks left unmatched, optionally append them as INFERRED entries
-        matched_ids = set([normalize_id(q["id"]) for q in aligned_questions])
-        for k, v in answers_map.items():
-            if k not in matched_ids:
-                aligned_questions.append({
-                    "id": k,
-                    "qp": "",
-                    "total_marks": None,
-                    "ms": "",
-                    "as": v
-                })
-        # Build alignment JSON text to send to grading model
-        alignment_payload = {"questions": aligned_questions}
-        alignment_json_text = json.dumps(alignment_payload, indent=2, ensure_ascii=False)
-        print("📦 Built alignment JSON (truncated):")
-        print(alignment_json_text[:1000] + ("..." if len(alignment_json_text) > 1000 else ""))
-        # Step: grading
-        print("2️⃣ Sending grading prompt to Gemini...")
-        # We send both the system grading prompt and the alignment JSON as content
-        response = model.generate_content([PROMPTS["GRADING_PROMPT"]["content"], alignment_json_text])
         grading_text = getattr(response, "text", None)
         if not grading_text and getattr(response, "candidates", None):
             grading_text = response.candidates[0].content.parts[0].text
         if not grading_text:
             raise RuntimeError("No grading output returned from Gemini.")
-        print("✅ Grading Markdown received (truncated preview):")
-        print((grading_text[:1000] + '...') if len(grading_text) > 1000 else grading_text)
         # Save grading PDF
         base_name = os.path.splitext(os.path.basename(ans_path))[0]
         grading_pdf_path = save_as_pdf(grading_text, f"{base_name}_graded.pdf")
-        print(f"📄 Grading PDF saved: {grading_pdf_path}")
-        # Extract marks for imprinting
         grading_json = extract_marks_from_grading(grading_text)
-        print("🔧 Extracted grading JSON (per-question marks):")
-        print(json.dumps(grading_json, indent=2))
         imprinted_pdf_path = None
         if imprint:
-            print("✍ Imprint option enabled. Starting imprinting process...")
             imprinted_pdf_path = f"{base_name}_imprinted.pdf"
             imprinted_pdf_path = imprint_marks_using_mapping(ans_path, grading_json, imprinted_pdf_path, model)
-            print(f"✅ Imprinting finished. Imprinted PDF at: {imprinted_pdf_path}")
-        return alignment_json_text, grading_text, grading_pdf_path, imprinted_pdf_path
     except Exception as e:
-        print("❌ Pipeline error:", e)
-        return f"❌ Error: {e}", None, None, None
-# ---------------- GRADIO UI (adapted) ----------------
-with gr.Blocks(title="LeadIB AI Grading (New Flow: Parallel Transcription + Align + Grade)") as demo:
-    gr.Markdown("## 📘 LeadIB AI Grading — Updated Flow\nUpload **Question Paper**, **Markscheme**, and **Student Answer Sheet**.\nSystem: merge QP+MS -> transcribe QP+MS and AS in parallel -> align locally -> grade -> (optional) imprint marks.")
     with gr.Row():
         qp_file = gr.File(label="📄 Upload Question Paper (PDF)")
@@ -668,12 +528,13 @@ with gr.Blocks(title="LeadIB AI Grading (New Flow: Parallel Transcription + Alig
         ans_file = gr.File(label="📝 Upload Student Answer Sheet (PDF)")
     imprint_toggle = gr.Checkbox(label="✍ Imprint Marks on Student Answer Sheet", value=False)
-    run_button = gr.Button("🚀 Run Alignment + Grading")
     with gr.Row():
-        json_output_box = gr.Textbox(label="📑 Step: Alignment (JSON)", lines=20)
-        grading_output_box = gr.Textbox(label="📝 Step: Grading (Markdown)", lines=20)
     grading_pdf_file = gr.File(label="📥 Download Grading PDF")
     imprint_pdf_file = gr.File(label="📥 Download Imprinted PDF (Optional)")
@@ -682,17 +543,16 @@ with gr.Blocks(title="LeadIB AI Grading (New Flow: Parallel Transcription + Alig
         ms_path = ms_file_obj.name
         ans_path = ans_file_obj.name
-        alignment_text, grading_text, grading_pdf_path, imprinted_pdf_path = align_and_grade_pipeline(
             qp_path, ms_path, ans_path, imprint=imprint_flag
         )
-        # For Gradio file outputs: return text/paths
-        return alignment_text, grading_text, grading_pdf_path, imprinted_pdf_path
     run_button.click(
         fn=run_pipeline,
         inputs=[qp_file, ms_file, ans_file, imprint_toggle],
-        outputs=[json_output_box, grading_output_box, grading_pdf_file, imprint_pdf_file]
     )
 if __name__ == "__main__":

 from concurrent.futures import ThreadPoolExecutor, as_completed
 from PyPDF2 import PdfReader, PdfWriter
+# ---------------- CONFIG ----------------
 genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
 GRID_ROWS, GRID_COLS = 20, 14
+# ---------------- PROMPTS ----------------
 PROMPTS = {
+    "QP_MS_TRANSCRIPTION": {
         "role": "system",
         "content": """You are a high-quality OCR/Transcription assistant.
+INPUT: This file is a PDF that first contains the Question Paper and immediately after it the Markscheme.
+TASK: Produce an exact transcription in plain text with clear separators.
+total marks of paper
+question
+total marks of that question
+.
+(continue this for all question )
+mark scheme ( exact for each question)
+M :Marks awarded for attempting to use a correct Method.
+A : Marks awarded for an Answer or for Accuracy; often dependent on preceding M marks.
+R :Marks awarded for clear Reasoning.
+REPRESENT THESE ABOVE NOTATION IN MS CLEARLY , EG : M1 , A1 ,M2 ...
+( ignore THESE N1 , N2 , N3 )
+-----------------------
+OUTPUT FORMAT (use this exact block-style for each question; preserve formatting exactly):
+Paper Total Marks: <number>
+Question <id>
+Total Marks: <number>
+QP:
+<transcribed question text>
+MS:
+<exact verbatim markscheme lines for this question>
+--QUESTION-END--
+(repeat for all questions in order)
 """
     },
+    # AS_TRANSCRIPTION will be dynamically constructed (in code) after extracting IDs from QP+MS result
     "GRADING_PROMPT": {
         "role": "system",
         "content": """Developer: You are an official examiner. Apply the following grading rules precisely.
 7. Any lost mark: use red `<span style="color:red">M0</span>` and make Reason red.
 ---
 ## Output Format
+Produce two sections per question/sub-question, following this structure:
+## Question <id>
 ### Markscheme vs Student Answer
 | Mark ID | Markscheme Expectation | Student’s Response | Awarded |
 |---------|------------------------|--------------------|---------|
 | M1_1    | Recognise GP           | "r=0.9"            | M1 |
+➡️ **Total: X/Y**
 ---
 ### Examiner’s Report
 At the very end, provide a summary table:
 | Question Number | Marks | Remark |
 |-----------------|-------|--------|
+| 1               | X/Y   | <remark> |
+Then show total clearly as a final line:
+`Total: <obtained_marks>/<max_marks>`
+NOTES:
+- The assistant will receive two transcripts (QP+MS transcription & AS transcription) in plain text. Use the QP+MS transcript as the authoritative source of question wording, total marks, and verbatim markscheme entries (M/A/R mark IDs).
+- Match student answers to question IDs and grade according to the provided verbatim markscheme.
+- Produce full markdown as above. Ensure mark IDs used in the grading are present and consistent with the markscheme.
 """
     }
 }
+# ---------------- HELPERS ----------------
 def save_as_pdf(text, filename="output.pdf"):
     pdf = MarkdownPdf()
     pdf.add_section(Section(text, toc=False))
         return input_path
     if size <= max_size:
         return input_path
     try:
         gs_cmd = [
             "gs", "-sDEVICE=pdfwrite",
         ]
         subprocess.run(gs_cmd, check=True)
         new_size = os.path.getsize(output_path)
         if new_size <= max_size:
             return output_path
         else:
             return input_path
+    except Exception:
         return input_path
 def create_model():
     """
     Build the Gemini model handle used by the pipeline.

     Tries "gemini-2.5-pro" first; if constructing it raises any Exception,
     constructs "gemini-2.5-flash" instead. Both are created with
     temperature 0.

     Returns:
         The ``genai.GenerativeModel`` instance that was constructed.
     """
     def _build(model_name):
         # A fresh config dict per attempt, exactly as two separate literals would be.
         return genai.GenerativeModel(model_name, generation_config={"temperature": 0})

     try:
         chosen = _build("gemini-2.5-pro")
     except Exception:
         chosen = _build("gemini-2.5-flash")
     return chosen
def merge_pdfs(paths, output_path):
    """
    Concatenate several PDF files into a single PDF.

    Pages are appended in the order the input paths are given and, within
    each input, in the order its reader yields them.

    Args:
        paths: Iterable of PDF paths to read.
        output_path: Destination path, opened in binary write mode.

    Returns:
        ``output_path``, unchanged.
    """
    merged = PdfWriter()
    # Lazily walk every page of every source, one source at a time.
    every_page = (sheet for source in paths for sheet in PdfReader(source).pages)
    for sheet in every_page:
        merged.add_page(sheet)
    with open(output_path, "wb") as sink:
        merged.write(sink)
    return output_path
def _first_candidate_text(response):
    """Return the joined text of the first candidate's parts, or None if there is none."""
    candidates = getattr(response, "candidates", None)
    if not candidates:
        return None
    content = getattr(candidates[0], "content", None)
    parts = getattr(content, "parts", None) or []
    # Skip parts without text instead of indexing parts[0] blindly (it may not exist).
    texts = [t for t in (getattr(part, "text", None) for part in parts) if t]
    return "".join(texts) or None


def gemini_generate_content(model, prompt_text, file_upload_obj=None, image_obj=None):
    """
    Send a prompt, plus optional attachments, to the model and return its text.

    Args:
        model: Object exposing ``generate_content(inputs)``.
        prompt_text: Prompt string; always the first element of ``inputs``.
        file_upload_obj: Optional uploaded-file handle, appended after the prompt.
        image_obj: Optional image object, appended after the file handle.

    Returns:
        str: ``response.text`` when it is available and non-empty; otherwise the
        text of the first candidate's parts; otherwise ``str(response)``.

    Raises:
        Whatever ``model.generate_content`` raises; it is not caught here.
    """
    inputs = [prompt_text]
    # Compare against None rather than truthiness: array-like image objects
    # (e.g. numpy arrays) raise on bool(), and a present-but-falsy attachment
    # should still be sent.
    inputs.extend(extra for extra in (file_upload_obj, image_obj) if extra is not None)

    response = model.generate_content(inputs)

    # getattr's default only covers AttributeError. The google-generativeai
    # ``text`` quick accessor raises ValueError when the response holds no valid
    # text part (e.g. blocked output), so guard that explicitly to let the
    # fallbacks below run.
    try:
        raw_text = getattr(response, "text", None)
    except ValueError:
        raw_text = None

    if not raw_text:
        raw_text = _first_candidate_text(response)
    if not raw_text:
        # Last resort: keep something inspectable rather than returning None.
        raw_text = str(response)
    return raw_text
+# ---------------- PARSERS ----------------
def _tidy_question_id(raw):
    """Strip trailing separators and unbalanced brackets from a captured question ID."""
    qid = raw.strip()
    while qid:
        last = qid[-1]
        if last in ".-(":
            qid = qid[:-1]
        elif last == ")" and qid.count(")") > qid.count("("):
            # A closing bracket with no opener is punctuation ("2) Find x"), not part of the ID.
            qid = qid[:-1]
        else:
            break
    return qid


def extract_question_ids_from_qpms(text):
    """
    Extract question IDs from a QP+MS transcript.

    Three strategies are tried in order; the first one that yields any ID wins:
      1. Lines starting with ``Question <id>``.
      2. ``Question`` anywhere, optionally followed by ``:`` or ``-``, then an ID.
      3. Bare serials at line starts followed by ``.``, ``)``, ``:`` or ``-``
         (e.g. ``1.``, ``2(b):``, ``3.a.i)``).

    Args:
        text: Transcript text. ``None`` or an empty string yields ``[]``.

    Returns:
        list[str]: Unique IDs in order of first appearance.
    """
    if not text:
        return []

    def _scan(pattern, tidy):
        # A dict preserves first-seen order and gives O(1) duplicate checks.
        found = {}
        for match in re.finditer(pattern, text):
            qid = match.group(1).strip()
            if tidy:
                qid = _tidy_question_id(qid)
            if qid:
                found.setdefault(qid, None)
        return list(found)

    # The ID must not end in the middle of a word. A plain ``\b`` here would
    # reject an ID whose last character is ")" when followed by whitespace,
    # truncating "2(b)" to "2(b"; this assertion only forbids word|word splits.
    primary = (
        r"(?im)^\s*Question\s+"
        r"([0-9]+(?:[.\-a-zA-Z()]+(?:\.[a-zA-Z0-9()]+)*)?)"
        r"(?:(?<!\w)|(?!\w))"
    )
    ids = _scan(primary, tidy=True)

    if not ids:
        secondary = r"(?im)Question\s*[:\-]?\s*([0-9]+(?:[.\-a-zA-Z()]+)*)"
        ids = _scan(secondary, tidy=True)

    if not ids:
        # This pattern already requires well-formed sub-parts, so no tidying is needed.
        tertiary = (
            r"(?m)^\s*([0-9]+(?:(?:\.[a-zA-Z0-9]+)+|(?:\([a-zA-Z0-9]+\))+|[a-zA-Z])*)"
            r"\s*[\.\):\-]"
        )
        ids = _scan(tertiary, tidy=False)

    return ids
def build_as_prompt_with_expected_ids(expected_ids):
    """
    Construct the answer-sheet (AS) transcription prompt with the expected
    question IDs injected.

    Args:
        expected_ids: any iterable of question IDs (or None). Items are
            converted with str() and stripped; blank items are dropped.

    Returns:
        The prompt string. The IDs appear as a curly-brace block with one ID
        per line; when no usable ID remains the block is the literal "{NA}".
    """
    # Normalise first so non-string IDs do not break join() and blank entries
    # do not leave empty lines inside the block.
    cleaned = [str(qid).strip() for qid in (expected_ids or ())]
    cleaned = [qid for qid in cleaned if qid]
    if cleaned:
        # Curly-brace block with each ID on its own line.
        ids_block = "{\n" + "\n".join(cleaned) + "\n}"
    else:
        ids_block = "{NA}"
    prompt = f"""You are a high-quality handwritten transcription assistant.
INPUT: This PDF contains a student's handwritten answer sheet.
TASK: Transcribe the student's answers exactly (as text). Preserve step order and line breaks. Attempt to assign each answer to a question ID if the student has labelled it (e.g., "1", "1a", "2(b)", "3"). If the student hasn't labelled answers, segment contiguous answer blocks and attempt to infer question IDs from context — but mark inferred IDs clearly as "INFERRED: <id>"
Enclose all mathematical expressions in Markdown fenced code blocks (``` triple backticks).
If a diagram/graph is omitted, write [Graph omitted].
Unreadable parts: [illegible].
Unanswered: [No response].
Do NOT recreate diagrams.
Ensure consistency and determinism in formatting so subsequent models can grade directly from this aligned format.
Expected questions (if missing, write NA):
{ids_block}
-----------------------
OUTPUT FORMAT:
Question <id>
AS:
<transcribed answer or placeholder>
"""
    return prompt
 def extract_marks_from_grading(grading_text):
+    """
+    Parse the grading markdown produced by the GRADING_PROMPT and extract marks per question.
+    Returns dict: {"grading": [{"question": "1.a", "marks_awarded": ["M1","A1"]}, ...]}
+    """
     grading_json = {"grading": []}
+    # Split by question sections using "## Question" header
     question_blocks = re.split(r"##\s*Question\s+", grading_text)
     for block in question_blocks[1:]:
         first_line = block.strip().splitlines()[0].strip()
         })
     return grading_json
+# ---------------- MAPPING/IMPRINT HELPERS ----------------
+def ask_gemini_for_mapping_for_page(model, image_path, grading_json, rows=GRID_ROWS, cols=GRID_COLS):
     """
+    Send a single page image along with the grading_json; LLM should return JSON mapping.
     """
     prompt = f"""
 You are an exam marker. Your role is to identify where each question begins on the page.
 The page is divided into a {rows} x {cols} grid. Each cell has a RUNNING NUMBER label (1..{rows*cols}).
     raw_text = getattr(response, "text", None)
     if not raw_text and getattr(response, "candidates", None):
         raw_text = response.candidates[0].content.parts[0].text
     try:
         start = raw_text.index('[')
         end = raw_text.rindex(']') + 1
         json_part = raw_text[start:end]
         mapping = json.loads(json_part)
         return mapping
+    except Exception:
         match = re.search(r'(\[.*\])', raw_text, re.DOTALL)
         if match:
             try:
         return []
 def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, model, rows=GRID_ROWS, cols=GRID_COLS):
+    """
+    Convert PDF to images, create grid-numbered images for sending to Gemini,
+    send all page images in parallel to Gemini for mapping, then annotate and produce imprinted PDF.
+    """
     pages = convert_from_path(pdf_path, dpi=200)
     annotated_page_paths = []
     temp_grid_images = []
+    # Create grid-numbered images for mapping prompt
     for p_index, page in enumerate(pages):
         img = page.convert("RGB")
         w, h = img.size
         img.save(temp_path, "PNG")
         temp_grid_images.append(temp_path)
+    # Send all grid images in parallel to Gemini to get mappings
+    mappings_per_page = {}
+    model_local = model  # for thread scope
+    with ThreadPoolExecutor(max_workers=min(8, len(temp_grid_images))) as ex:
+        futures = {ex.submit(ask_gemini_for_mapping_for_page, model_local, img_path, grading_json, rows, cols): idx
+                   for idx, img_path in enumerate(temp_grid_images)}
+        for fut in as_completed(futures):
+            idx = futures[fut]
+            try:
+                mapping = fut.result()
+            except Exception:
+                mapping = []
+            mappings_per_page[idx] = mapping
+    # Annotate original pages according to returned mappings
+    for p_index, page in enumerate(pages):
+        page_img = page.convert("RGB")
         img_cv = np.array(page_img)
         img_cv = cv2.cvtColor(img_cv, cv2.COLOR_RGB2BGR)
         h, w, _ = img_cv.shape
         cell_w_px, cell_h_px = w / cols, h / rows
+        mapping = mappings_per_page.get(p_index, [])
         occupied = set()
         for item in mapping:
             qid = item.get("question")
             cell_number = item.get("cell_number")
             marks_list = next((g["marks_awarded"] for g in grading_json.get("grading", []) if g["question"] == qid), [])
             if not marks_list:
                 marks_list = next((g["marks_awarded"] for g in grading_json.get("grading", [])
+                                   if g["question"].lower() == (qid or "").lower()), [])
             marks_text = ",".join(marks_list) if marks_list else "?"
             row = (cell_number - 1) // cols
             col = (cell_number - 1) % cols
             candidates = []
             if col + 1 < cols:
                 candidates.append((row, col + 1))
                     chosen = (r_c, c_c)
                     occupied.add(cell_id)
                     break
             if chosen is None:
                 chosen = (row, col)
             x_c = int((c_c + 1) * cell_w_px - cell_w_px * 0.1)
             y_c = int((r_c + 0.5) * cell_h_px)
             font_scale = max(0.6, min(1.6, cell_h_px / 60.0))
             thickness = max(1, int(font_scale * 2))
             cv2.putText(img_cv, marks_text, (x_c, y_c), cv2.FONT_HERSHEY_SIMPLEX,
         annotated_path = f"annotated_page_{p_index+1}.png"
         cv2.imwrite(annotated_path, img_cv)
         annotated_page_paths.append(annotated_path)
     with open(output_pdf, "wb") as f:
         f.write(img2pdf.convert(annotated_page_paths))
     compressed = compress_pdf(output_pdf)
     return compressed
+# ---------------- MAIN PIPELINE ----------------
 def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
     """
+    Flow:
+    1) compress files if needed
     2) merge QP + MS -> merged_qpms.pdf
+    3) upload merged_qpms to Gemini, request transcription (QP+MS)
+    4) extract question IDs via regex from QP+MS result
+    5) build AS transcription prompt injecting expected IDs block
+    6) send AS transcription request (using injected expected IDs)
+    7) send both transcripts to grading prompt -> get grading markdown
+    8) extract marks for imprinting
+    9) optional imprint: convert pages, send page images in parallel to LLM for mapping, annotate and produce imprinted PDF
     """
     try:
+        # Step 0: compress
         qp_path = compress_pdf(qp_path)
         ms_path = compress_pdf(ms_path)
         ans_path = compress_pdf(ans_path)
+        # Merge QP + MS
         merged_qpms_path = os.path.splitext(qp_path)[0] + "_merged_qp_ms.pdf"
         merge_pdfs([qp_path, ms_path], merged_qpms_path)
+        # Upload files
         merged_uploaded = genai.upload_file(path=merged_qpms_path, display_name="QP+MS (merged)")
         ans_uploaded = genai.upload_file(path=ans_path, display_name="Answer Sheet")
         model = create_model()
+        # Step 1: QP+MS transcription (first)
+        qpms_prompt = PROMPTS["QP_MS_TRANSCRIPTION"]["content"]
+        qpms_text = gemini_generate_content(model, qpms_prompt, file_upload_obj=merged_uploaded)
+        # save debug
         with open("debug_qpms_transcript.txt", "w", encoding="utf-8") as f:
             f.write(qpms_text)
+        # Step 2: extract serial numbers (question IDs) using regex from qpms_text
+        extracted_ids = extract_question_ids_from_qpms(qpms_text)
+        # if empty, we still provide a default list placeholder so AS model writes NA for missing ones
+        if not extracted_ids:
+            extracted_ids = ["NA"]
+        # Step 3: Build AS prompt injecting extracted IDs
+        as_prompt = build_as_prompt_with_expected_ids(extracted_ids)
+        # Step 4: AS transcription (after injecting IDs)
+        as_text = gemini_generate_content(model, as_prompt, file_upload_obj=ans_uploaded)
         with open("debug_as_transcript.txt", "w", encoding="utf-8") as f:
             f.write(as_text)
+        # Step 5: Grading - send both transcripts to grading model
+        # Build payload by concatenating transcripts with clear separators
+        grading_input = (
+            "=== QP+MS TRANSCRIPT BEGIN ===\n"
+            + qpms_text
+            + "\n=== QP+MS TRANSCRIPT END ===\n\n"
+            + "=== ANSWER SHEET TRANSCRIPT BEGIN ===\n"
+            + as_text
+            + "\n=== ANSWER SHEET TRANSCRIPT END ===\n"
+        )
+        grading_prompt_system = PROMPTS["GRADING_PROMPT"]["content"]
+        grading_text = gemini_generate_content(model, grading_prompt_system, file_upload_obj=None, image_obj=None)
+        # The above call returns the system-only content if used incorrectly; instead we must pass both system prompt and content to generate_content
+        # Re-call properly:
+        response = model.generate_content([grading_prompt_system, grading_input])
         grading_text = getattr(response, "text", None)
         if not grading_text and getattr(response, "candidates", None):
             grading_text = response.candidates[0].content.parts[0].text
         if not grading_text:
             raise RuntimeError("No grading output returned from Gemini.")
         # Save grading PDF
         base_name = os.path.splitext(os.path.basename(ans_path))[0]
         grading_pdf_path = save_as_pdf(grading_text, f"{base_name}_graded.pdf")
+        # Step 6: Extract marks for imprinting
         grading_json = extract_marks_from_grading(grading_text)
+        with open("debug_grading_json.json", "w", encoding="utf-8") as f:
+            json.dump(grading_json, f, indent=2, ensure_ascii=False)
         imprinted_pdf_path = None
         if imprint:
+            # Step 7: Imprinting - send all page images in parallel to LLM for mapping and annotate
             imprinted_pdf_path = f"{base_name}_imprinted.pdf"
             imprinted_pdf_path = imprint_marks_using_mapping(ans_path, grading_json, imprinted_pdf_path, model)
+        return qpms_text, as_text, grading_text, grading_pdf_path, imprinted_pdf_path
     except Exception as e:
+        return f"❌ Error: {e}", None, None, None, None
+# ---------------- GRADIO UI ----------------
+with gr.Blocks(title="LeadIB AI Grading (Updated Flow: QP+MS -> IDs -> AS -> Grade -> Imprint)") as demo:
+    gr.Markdown("## 📘 LeadIB AI Grading — Final Flow\nUpload **Question Paper**, **Markscheme**, and **Student Answer Sheet**.\nFlow: merge QP+MS -> transcribe (QP+MS) -> extract IDs -> transcribe AS with expected IDs -> grade -> (optional) imprint.")
     with gr.Row():
         qp_file = gr.File(label="📄 Upload Question Paper (PDF)")
         ans_file = gr.File(label="📝 Upload Student Answer Sheet (PDF)")
     imprint_toggle = gr.Checkbox(label="✍ Imprint Marks on Student Answer Sheet", value=False)
+    run_button = gr.Button("🚀 Run Pipeline")
     with gr.Row():
+        qpms_box = gr.Textbox(label="📑 QP+MS Transcript", lines=12)
+        as_box = gr.Textbox(label="📝 AS Transcript", lines=12)
+    grading_output_box = gr.Textbox(label="🧾 Grading (Markdown)", lines=20)
     grading_pdf_file = gr.File(label="📥 Download Grading PDF")
     imprint_pdf_file = gr.File(label="📥 Download Imprinted PDF (Optional)")
         ms_path = ms_file_obj.name
         ans_path = ans_file_obj.name
+        qpms_text, as_text, grading_text, grading_pdf_path, imprinted_pdf_path = align_and_grade_pipeline(
             qp_path, ms_path, ans_path, imprint=imprint_flag
         )
+        return qpms_text or "", as_text or "", grading_text or "", grading_pdf_path, imprinted_pdf_path
     run_button.click(
         fn=run_pipeline,
         inputs=[qp_file, ms_file, ans_file, imprint_toggle],
+        outputs=[qpms_box, as_box, grading_output_box, grading_pdf_file, imprint_pdf_file]
     )
 if __name__ == "__main__":