neurolearn

Sleeping

App Files Files Community

atz21 committed on Sep 24, 2025

Commit

930defb

verified ·

1 Parent(s): e4d3932

Update app.py

Browse files

Files changed (1) hide show

app.py +131 -111

app.py CHANGED Viewed

@@ -21,44 +21,54 @@ GRID_ROWS, GRID_COLS = 20, 14
 # ---------------- PROMPTS ----------------
 PROMPTS = {
     "QP_MS_TRANSCRIPTION": {
         "role": "system",
         "content": """You are a high-quality OCR/Transcription assistant.
-INPUT: This file is a PDF that first contains the Question Paper and immediately after it the Markscheme.
 TASK: Produce an exact transcription in plain text with clear separators.
-total marks of paper
-question
-total marks of that question
-.
-(continue this for all question )
-mark scheme ( exact for each question)
-M :Marks awarded for attempting to use a correct Method.
-A : Marks awarded for an Answer or for Accuracy; often dependent on preceding M marks.
-R :Marks awarded for clear Reasoning.
-REPRESENT THESE ABOVE NOTATION IN MS CLEARLY , EG : M1 , A1 ,M2 ...
-( ignore THESE N1 , N2 , N3 )
------------------------
-OUTPUT FORMAT (use this exact block-style for each question; preserve formatting exactly):
-Paper Total Marks: <number>
-Question <id>
-Total Marks: <number>
 QP:
-<transcribed question text>
-MS:
-<exact verbatim markscheme lines for this question>
---QUESTION-END--
-(repeat for all questions in order)
 """
     },
-    # AS_TRANSCRIPTION will be dynamically constructed (in code) after extracting IDs from QP+MS result
     "GRADING_PROMPT": {
         "role": "system",
         "content": """Developer: You are an official examiner. Apply the following grading rules precisely.
@@ -101,7 +111,7 @@ Then show total clearly as a final line:
 `Total: <obtained_marks>/<max_marks>`
 NOTES:
-- The assistant will receive two transcripts (QP+MS transcription & AS transcription) in plain text. Use the QP+MS transcript as the authoritative source of question wording, total marks, and verbatim markscheme entries (M/A/R mark IDs).
 - Match student answers to question IDs and grade according to the provided verbatim markscheme.
 - Produce full markdown as above. Ensure mark IDs used in the grading are present and consistent with the markscheme.
 """
@@ -126,8 +136,10 @@ def compress_pdf(input_path, output_path=None, max_size=20*1024*1024):
         return input_path
     if size <= max_size:
         return input_path
     try:
         gs_cmd = [
             "gs", "-sDEVICE=pdfwrite",
@@ -138,18 +150,35 @@ def compress_pdf(input_path, output_path=None, max_size=20*1024*1024):
         ]
         subprocess.run(gs_cmd, check=True)
         new_size = os.path.getsize(output_path)
         if new_size <= max_size:
             return output_path
         else:
             return input_path
-    except Exception:
         return input_path
 def create_model():
     try:
-        return genai.GenerativeModel("gemini-2.5-pro", generation_config={"temperature": 0})
-    except Exception:
-        return genai.GenerativeModel("gemini-2.5-flash", generation_config={"temperature": 0})
 def merge_pdfs(paths, output_path):
     writer = PdfWriter()
@@ -164,58 +193,60 @@ def merge_pdfs(paths, output_path):
 def gemini_generate_content(model, prompt_text, file_upload_obj=None, image_obj=None):
     """
     Send prompt_text and optionally an uploaded file (or an image object) to the model.
-    Returns textual response.
     """
     inputs = [prompt_text]
     if file_upload_obj:
         inputs.append(file_upload_obj)
     if image_obj:
         inputs.append(image_obj)
     response = model.generate_content(inputs)
     raw_text = getattr(response, "text", None)
     if not raw_text and getattr(response, "candidates", None):
         raw_text = response.candidates[0].content.parts[0].text
-    if not raw_text:
         raw_text = str(response)
     return raw_text
 # ---------------- PARSERS ----------------
 def extract_question_ids_from_qpms(text):
     """
     Extract question IDs from QP+MS transcript output.
-    We expect QP_MS_TRANSCRIPTION to contain lines like: "Question <id>"
     Return a list of unique IDs in order of appearance.
     """
     ids = []
-    # Primary: lines starting with 'Question <id>'
-    for m in re.finditer(r"(?im)^\s*Question\s+([0-9]+(?:[.\-a-zA-Z()]+(?:\.[a-zA-Z0-9()]+)*)?)\b", text):
         qid = m.group(1).strip()
         if qid not in ids:
             ids.append(qid)
-    # Secondary: if none found, look for explicit markers like "Question <id>" with colon/line
-    if not ids:
-        for m in re.finditer(r"(?im)Question\s*[:\-]?\s*([0-9]+(?:[.\-a-zA-Z()]+)*)", text):
-            qid = m.group(1).strip()
-            if qid not in ids:
-                ids.append(qid)
-    # Tertiary fallback: scan for typical serial patterns in the document
-    if not ids:
-        # match patterns like 1, 1.a, 3.a.i, 2(b), etc., where they appear at line starts
-        for m in re.finditer(r"(?m)^\s*([0-9]+(?:(?:\.[a-zA-Z0-9]+)+|(?:\([a-zA-Z0-9]+\))+|[a-zA-Z])*)\s*[\.\):\-]", text):
-            qid = m.group(1).strip()
-            if qid not in ids:
-                ids.append(qid)
     return ids
 def build_as_prompt_with_expected_ids(expected_ids):
     """
-    Construct the AS transcription prompt injecting the expected IDs block (as {regex} slot).
-    The expected_ids is a list; we format them per user instruction inside braces.
     """
     if not expected_ids:
         ids_block = "{NA}"
     else:
-        # Format exactly as user provided: curly brace block with each ID on its own line
         ids_block = "{\n" + "\n".join(expected_ids) + "\n}"
     prompt = f"""You are a high-quality handwritten transcription assistant.
@@ -247,15 +278,15 @@ def extract_marks_from_grading(grading_text):
     Parse the grading markdown produced by the GRADING_PROMPT and extract marks per question.
     Returns dict: {"grading": [{"question": "1.a", "marks_awarded": ["M1","A1"]}, ...]}
     """
     grading_json = {"grading": []}
-    # Split by question sections using "## Question" header
     question_blocks = re.split(r"##\s*Question\s+", grading_text)
     for block in question_blocks[1:]:
-        first_line = block.strip().splitlines()[0].strip()
         q_id_match = re.match(r"([0-9]+(?:[a-zA-Z]|\([^\)]+\)|(?:\.[a-zA-Z0-9]+))*)", first_line)
         if not q_id_match:
-            q_id = first_line.split()[0]
         else:
             q_id = q_id_match.group(1).strip()
         awarded = re.findall(r"\b(M\d+|A\d+|R\d+|M0|A0|R0)\b", block)
@@ -269,6 +300,8 @@ def extract_marks_from_grading(grading_text):
             "question": q_id,
             "marks_awarded": awarded_unique
         })
     return grading_json
 # ---------------- MAPPING/IMPRINT HELPERS ----------------
@@ -292,25 +325,32 @@ Return JSON only, like:
 Grading JSON:
 {json.dumps(grading_json, indent=2)}
 """
     img = Image.open(image_path)
     response = model.generate_content([prompt, img])
     raw_text = getattr(response, "text", None)
     if not raw_text and getattr(response, "candidates", None):
         raw_text = response.candidates[0].content.parts[0].text
     try:
         start = raw_text.index('[')
         end = raw_text.rindex(']') + 1
         json_part = raw_text[start:end]
         mapping = json.loads(json_part)
         return mapping
     except Exception:
         match = re.search(r'(\[.*\])', raw_text, re.DOTALL)
         if match:
             try:
                 mapping = json.loads(match.group(1))
                 return mapping
             except Exception:
                 pass
         return []
 def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, model, rows=GRID_ROWS, cols=GRID_COLS):
@@ -318,11 +358,11 @@ def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, model, rows=
     Convert PDF to images, create grid-numbered images for sending to Gemini,
     send all page images in parallel to Gemini for mapping, then annotate and produce imprinted PDF.
     """
     pages = convert_from_path(pdf_path, dpi=200)
     annotated_page_paths = []
     temp_grid_images = []
-    # Create grid-numbered images for mapping prompt
     for p_index, page in enumerate(pages):
         img = page.convert("RGB")
         w, h = img.size
@@ -349,10 +389,12 @@ def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, model, rows=
         temp_path = f"page_{p_index+1}_grid.png"
         img.save(temp_path, "PNG")
         temp_grid_images.append(temp_path)
     # Send all grid images in parallel to Gemini to get mappings
     mappings_per_page = {}
-    model_local = model  # for thread scope
     with ThreadPoolExecutor(max_workers=min(8, len(temp_grid_images))) as ex:
         futures = {ex.submit(ask_gemini_for_mapping_for_page, model_local, img_path, grading_json, rows, cols): idx
                    for idx, img_path in enumerate(temp_grid_images)}
@@ -360,11 +402,13 @@ def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, model, rows=
             idx = futures[fut]
             try:
                 mapping = fut.result()
-            except Exception:
                 mapping = []
             mappings_per_page[idx] = mapping
     # Annotate original pages according to returned mappings
     for p_index, page in enumerate(pages):
         page_img = page.convert("RGB")
         img_cv = np.array(page_img)
@@ -419,29 +463,23 @@ def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, model, rows=
         annotated_path = f"annotated_page_{p_index+1}.png"
         cv2.imwrite(annotated_path, img_cv)
         annotated_page_paths.append(annotated_path)
     with open(output_pdf, "wb") as f:
         f.write(img2pdf.convert(annotated_page_paths))
     compressed = compress_pdf(output_pdf)
     return compressed
 # ---------------- MAIN PIPELINE ----------------
 def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
     """
-    Flow:
-    1) compress files if needed
-    2) merge QP + MS -> merged_qpms.pdf
-    3) upload merged_qpms to Gemini, request transcription (QP+MS)
-    4) extract question IDs via regex from QP+MS result
-    5) build AS transcription prompt injecting expected IDs block
-    6) send AS transcription request (using injected expected IDs)
-    7) send both transcripts to grading prompt -> get grading markdown
-    8) extract marks for imprinting
-    9) optional imprint: convert pages, send page images in parallel to LLM for mapping, annotate and produce imprinted PDF
     """
     try:
-        # Step 0: compress
         qp_path = compress_pdf(qp_path)
         ms_path = compress_pdf(ms_path)
         ans_path = compress_pdf(ans_path)
@@ -449,36 +487,40 @@ def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
         # Merge QP + MS
         merged_qpms_path = os.path.splitext(qp_path)[0] + "_merged_qp_ms.pdf"
         merge_pdfs([qp_path, ms_path], merged_qpms_path)
-        # Upload files
         merged_uploaded = genai.upload_file(path=merged_qpms_path, display_name="QP+MS (merged)")
         ans_uploaded = genai.upload_file(path=ans_path, display_name="Answer Sheet")
         model = create_model()
-        # Step 1: QP+MS transcription (first)
         qpms_prompt = PROMPTS["QP_MS_TRANSCRIPTION"]["content"]
         qpms_text = gemini_generate_content(model, qpms_prompt, file_upload_obj=merged_uploaded)
-        # save debug
         with open("debug_qpms_transcript.txt", "w", encoding="utf-8") as f:
             f.write(qpms_text)
         # Step 2: extract serial numbers (question IDs) using regex from qpms_text
         extracted_ids = extract_question_ids_from_qpms(qpms_text)
-        # if empty, we still provide a default list placeholder so AS model writes NA for missing ones
         if not extracted_ids:
             extracted_ids = ["NA"]
-        # Step 3: Build AS prompt injecting extracted IDs
         as_prompt = build_as_prompt_with_expected_ids(extracted_ids)
-        # Step 4: AS transcription (after injecting IDs)
         as_text = gemini_generate_content(model, as_prompt, file_upload_obj=ans_uploaded)
         with open("debug_as_transcript.txt", "w", encoding="utf-8") as f:
             f.write(as_text)
-        # Step 5: Grading - send both transcripts to grading model
-        # Build payload by concatenating transcripts with clear separators
         grading_input = (
             "=== QP+MS TRANSCRIPT BEGIN ===\n"
             + qpms_text
@@ -488,39 +530,39 @@ def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
             + "\n=== ANSWER SHEET TRANSCRIPT END ===\n"
         )
         grading_prompt_system = PROMPTS["GRADING_PROMPT"]["content"]
-        grading_text = gemini_generate_content(model, grading_prompt_system, file_upload_obj=None, image_obj=None)
-        # The above call returns the system-only content if used incorrectly; instead we must pass both system prompt and content to generate_content
-        # Re-call properly:
-        response = model.generate_content([grading_prompt_system, grading_input])
-        grading_text = getattr(response, "text", None)
-        if not grading_text and getattr(response, "candidates", None):
-            grading_text = response.candidates[0].content.parts[0].text
-        if not grading_text:
-            raise RuntimeError("No grading output returned from Gemini.")
         # Save grading PDF
         base_name = os.path.splitext(os.path.basename(ans_path))[0]
         grading_pdf_path = save_as_pdf(grading_text, f"{base_name}_graded.pdf")
-        # Step 6: Extract marks for imprinting
         grading_json = extract_marks_from_grading(grading_text)
         with open("debug_grading_json.json", "w", encoding="utf-8") as f:
             json.dump(grading_json, f, indent=2, ensure_ascii=False)
         imprinted_pdf_path = None
         if imprint:
-            # Step 7: Imprinting - send all page images in parallel to LLM for mapping and annotate
             imprinted_pdf_path = f"{base_name}_imprinted.pdf"
             imprinted_pdf_path = imprint_marks_using_mapping(ans_path, grading_json, imprinted_pdf_path, model)
         return qpms_text, as_text, grading_text, grading_pdf_path, imprinted_pdf_path
     except Exception as e:
         return f"❌ Error: {e}", None, None, None, None
 # ---------------- GRADIO UI ----------------
-with gr.Blocks(title="LeadIB AI Grading (Updated Flow: QP+MS -> IDs -> AS -> Grade -> Imprint)") as demo:
-    gr.Markdown("## 📘 LeadIB AI Grading — Final Flow\nUpload **Question Paper**, **Markscheme**, and **Student Answer Sheet**.\nFlow: merge QP+MS -> transcribe (QP+MS) -> extract IDs -> transcribe AS with expected IDs -> grade -> (optional) imprint.")
     with gr.Row():
         qp_file = gr.File(label="📄 Upload Question Paper (PDF)")
@@ -534,26 +576,4 @@ with gr.Blocks(title="LeadIB AI Grading (Updated Flow: QP+MS -> IDs -> AS -> Gra
         qpms_box = gr.Textbox(label="📑 QP+MS Transcript", lines=12)
         as_box = gr.Textbox(label="📝 AS Transcript", lines=12)
-    grading_output_box = gr.Textbox(label="🧾 Grading (Markdown)", lines=20)
-    grading_pdf_file = gr.File(label="📥 Download Grading PDF")
-    imprint_pdf_file = gr.File(label="📥 Download Imprinted PDF (Optional)")
-    def run_pipeline(qp_file_obj, ms_file_obj, ans_file_obj, imprint_flag):
-        qp_path = qp_file_obj.name
-        ms_path = ms_file_obj.name
-        ans_path = ans_file_obj.name
-        qpms_text, as_text, grading_text, grading_pdf_path, imprinted_pdf_path = align_and_grade_pipeline(
-            qp_path, ms_path, ans_path, imprint=imprint_flag
-        )
-        return qpms_text or "", as_text or "", grading_text or "", grading_pdf_path, imprinted_pdf_path
-    run_button.click(
-        fn=run_pipeline,
-        inputs=[qp_file, ms_file, ans_file, imprint_toggle],
-        outputs=[qpms_box, as_box, grading_output_box, grading_pdf_file, imprint_pdf_file]
-    )
-if __name__ == "__main__":
-    demo.launch()

 # ---------------- PROMPTS ----------------
 PROMPTS = {
+    # Updated QP+MS transcription prompt:
     "QP_MS_TRANSCRIPTION": {
         "role": "system",
         "content": """You are a high-quality OCR/Transcription assistant.
+INPUT: This file is a scanned/printed PDF that first contains the Question Paper and then, after all questions, the Markscheme.
 TASK: Produce an exact transcription in plain text with clear separators.
+IMPORTANT: Output **ALL QUESTIONS FIRST** (in the same order they appear in the PDF).
+For each question, output:
+- Question ID (exact as printed, e.g., "1", "2(a)", "3.b", "4(ii)")
+- Question text (exact wording; do not change punctuation)
+- Total marks for that question (exact number if printed; if not printed leave blank)
+After you have outputted **all questions** (and their total marks), output the **entire markscheme block** exactly as it appears in the PDF. In the markscheme section, ensure notation is explicit and clear: represent M, A, R notation **in brackets** after each mark item where applicable. For example:
+[M1] Description...
+[A1] Description...
+[R1] Description...
+Also include at the top a single line stating the total marks of the paper (if present in the paper).
+KEY REQUIREMENTS:
+- Do NOT interleave question and markscheme. First: questions + totals. Second: markscheme (verbatim, preserving mark IDs/formatting).
+- Transcribe the markscheme verbatim; do NOT correct or reformat content (only ensure M/A/R are shown in brackets if present).
+- Represent M, A, R marks explicitly and consistently (e.g., M1, A2, R1). If mark IDs are missing, transcribe as-is.
+- Ignore any N1, N2, N3 notations (do not use them).
+OUTPUT FORMAT (use these exact markers to make parsing straightforward):
+==== PAPER TOTAL MARKS ====
+<integer or blank>
+==== QUESTIONS BEGIN ====
+Question: <id>
+Total Marks: <integer or blank>
 QP:
+<question text (multiline)>
+--QUESTION-END--
+(repeat the Question block for all questions, in order)
+==== QUESTIONS END ====
+==== MARKSCHEME BEGIN ====
+<verbatim markscheme text exactly as in PDF; include mark IDs and use brackets for M/A/R notations where they appear>
+==== MARKSCHEME END ====
 """
     },
+    # GRADING_PROMPT unchanged except we will print steps around calling it
     "GRADING_PROMPT": {
         "role": "system",
         "content": """Developer: You are an official examiner. Apply the following grading rules precisely.
 `Total: <obtained_marks>/<max_marks>`
 NOTES:
+- The assistant will receive two transcripts: (1) QP+MS transcription (questions then markscheme) and (2) AS transcription (student answers). Use the QP+MS transcript as the authoritative source of question wording, total marks, and verbatim markscheme entries (M/A/R mark IDs).
 - Match student answers to question IDs and grade according to the provided verbatim markscheme.
 - Produce full markdown as above. Ensure mark IDs used in the grading are present and consistent with the markscheme.
 """
         return input_path
     if size <= max_size:
+        print(f"ℹ️ Not compressing {input_path} ({size/1024/1024:.2f} MB <= {max_size/1024/1024} MB)")
         return input_path
+    print(f"🔎 Compressing {input_path} ({size/1024/1024:.2f} MB) -> {output_path}")
     try:
         gs_cmd = [
             "gs", "-sDEVICE=pdfwrite",
         ]
         subprocess.run(gs_cmd, check=True)
         new_size = os.path.getsize(output_path)
+        print(f"✅ Compression done. New size: {new_size/1024/1024:.2f} MB")
         if new_size <= max_size:
             return output_path
         else:
+            print("⚠️ Compressed file still larger than threshold; returning original")
             return input_path
+    except Exception as e:
+        print("❌ Compression error:", e)
         return input_path
 def create_model(model_names=("gemini-2.5-pro", "gemini-2.5-flash")):
     """
     Create the Gemini model and print which model is selected.
     Each name in ``model_names`` is tried in order with temperature 0; the
     first one that can be constructed is returned. The default order matches
     the previous hard-coded behaviour (pro first, flash as the fallback).
     Args:
         model_names: Ordered, non-empty sequence of model names to try.
     Returns:
         The first ``genai.GenerativeModel`` that could be constructed.
     Raises:
         ValueError: If ``model_names`` is empty.
         Exception: Whatever the last candidate raised when every candidate fails.
     """
     candidates = list(model_names)
     if not candidates:
         raise ValueError("model_names must contain at least one model name")
     last_index = len(candidates) - 1
     for index, name in enumerate(candidates):
         # The first candidate is an "attempt"; every later one is a fallback.
         verb = "Attempting to use" if index == 0 else "Falling back to"
         try:
             print(f"⚡ {verb} {name} model")
             model = genai.GenerativeModel(name, generation_config={"temperature": 0})
             print(f"✅ Selected model: {name}")
             return model
         except Exception as e:
             if index == last_index:
                 # Nothing left to try: report and propagate the final failure.
                 print("❌ Failed to create any Gemini model:", e)
                 raise
             print(f"⚠️ Could not use {name}:", e)
 def merge_pdfs(paths, output_path):
     writer = PdfWriter()
 def gemini_generate_content(model, prompt_text, file_upload_obj=None, image_obj=None):
     """
     Send prompt_text and optionally an uploaded file (or an image object) to the model.
     Returns textual response and prints progress.
     Text is resolved in this order:
       1. ``response.text`` (the quick accessor),
       2. the concatenated text parts of the first candidate,
       3. ``str(response)`` as a last resort so callers always get a string.
     Args:
         model: Object exposing ``generate_content(inputs)``.
         prompt_text: Prompt string; always the first element sent.
         file_upload_obj: Optional uploaded-file handle appended after the prompt.
         image_obj: Optional image object appended last.
     Returns:
         The response text as a ``str``.
     """
     inputs = [prompt_text]
     if file_upload_obj:
         inputs.append(file_upload_obj)
     if image_obj:
         inputs.append(image_obj)
     print("📡 Sending request to Gemini (prompt length:", len(prompt_text), "chars )")
     response = model.generate_content(inputs)
     # The ``text`` accessor can raise ValueError (e.g. blocked or empty
     # responses); getattr's default only covers AttributeError, so guard both.
     try:
         raw_text = response.text
     except (AttributeError, ValueError):
         raw_text = None
     if not raw_text:
         candidate_text = _first_candidate_text(response)
         if candidate_text is not None:
             raw_text = candidate_text
     if raw_text is None:
         raw_text = str(response)
     print("📥 Received response (chars):", len(raw_text))
     return raw_text
 def _first_candidate_text(response):
     """Return the joined text parts of the first candidate, or None if there are none."""
     candidates = getattr(response, "candidates", None)
     if not candidates:
         return None
     try:
         parts = candidates[0].content.parts
     except (AttributeError, IndexError, TypeError):
         return None
     pieces = [t for t in (getattr(p, "text", None) for p in (parts or [])) if t]
     return "".join(pieces) if pieces else None
 # ---------------- PARSERS ----------------
 def extract_question_ids_from_qpms(text):
     """
     Extract question IDs from QP+MS transcript output.
     We expect the QP+MS prompt to produce lines like 'Question: <id>'
     Return a list of unique IDs in order of appearance.
     Supported ID shapes include "1", "2a", "3.b", "2(a)" and "4(ii)". If no
     'Question: <id>' line is found, a fallback heuristic scans for line-leading
     serials followed by one of ``. ) : -`` and whitespace.
     Args:
         text: The QP+MS transcript; ``None`` or empty yields an empty list.
     Returns:
         list[str]: Unique question IDs, first occurrence order preserved.
     """
     print("🔎 Extracting question IDs from QP+MS transcript using regex...")
     text = text or ""
     id_pattern = r"[0-9]+(?:(?:\.[a-zA-Z0-9]+)+|(?:\([a-zA-Z0-9]+\))+|[a-zA-Z])*"
     # Terminate the ID with a negative lookahead instead of \b: \b never
     # matches after a closing ")" at end of line, which made "2(a)" backtrack
     # to "2" and collapsed every sub-part of a question into one ID.
     primary = re.compile(r"(?im)^\s*Question\s*:\s*(" + id_pattern + r")(?!\w)")
     # dict.fromkeys de-duplicates while preserving first-appearance order.
     ids = list(dict.fromkeys(m.group(1).strip() for m in primary.finditer(text)))
     if ids:
         print(f"✅ Extracted {len(ids)} question IDs.")
         print("IDs:", ids)
         return ids
     # fallback scans
     fallback = re.compile(r"(?m)^\s*(" + id_pattern + r")\s*[\.\):\-]\s")
     ids = list(dict.fromkeys(m.group(1).strip() for m in fallback.finditer(text)))
     if ids:
         print(f"✅ Extracted {len(ids)} question IDs (fallback heuristic).")
         print("IDs:", ids)
     else:
         print("⚠️ No question IDs extracted; will send NA placeholder.")
     return ids
 def build_as_prompt_with_expected_ids(expected_ids):
     """
+    Construct the AS transcription prompt injecting the expected IDs block.
     """
     if not expected_ids:
         ids_block = "{NA}"
     else:
         ids_block = "{\n" + "\n".join(expected_ids) + "\n}"
     prompt = f"""You are a high-quality handwritten transcription assistant.
     Parse the grading markdown produced by the GRADING_PROMPT and extract marks per question.
     Returns dict: {"grading": [{"question": "1.a", "marks_awarded": ["M1","A1"]}, ...]}
     """
+    print("🔎 Extracting awarded marks from grading output...")
     grading_json = {"grading": []}
     question_blocks = re.split(r"##\s*Question\s+", grading_text)
     for block in question_blocks[1:]:
+        first_line = block.strip().splitlines()[0].strip() if block.strip().splitlines() else ""
         q_id_match = re.match(r"([0-9]+(?:[a-zA-Z]|\([^\)]+\)|(?:\.[a-zA-Z0-9]+))*)", first_line)
         if not q_id_match:
+            q_id = first_line.split()[0] if first_line else ""
         else:
             q_id = q_id_match.group(1).strip()
         awarded = re.findall(r"\b(M\d+|A\d+|R\d+|M0|A0|R0)\b", block)
             "question": q_id,
             "marks_awarded": awarded_unique
         })
+    print("✅ Extracted grading marks for", len(grading_json["grading"]), "question blocks.")
+    print(json.dumps(grading_json, indent=2))
     return grading_json
 # ---------------- MAPPING/IMPRINT HELPERS ----------------
 Grading JSON:
 {json.dumps(grading_json, indent=2)}
 """
+    print(f"📡 Sending mapping request for image {image_path} to Gemini...")
     img = Image.open(image_path)
     response = model.generate_content([prompt, img])
     raw_text = getattr(response, "text", None)
     if not raw_text and getattr(response, "candidates", None):
         raw_text = response.candidates[0].content.parts[0].text
+    if not raw_text:
+        raw_text = str(response)
+    print("📥 Mapping response (chars):", len(raw_text))
     try:
         start = raw_text.index('[')
         end = raw_text.rindex(']') + 1
         json_part = raw_text[start:end]
         mapping = json.loads(json_part)
+        print("✅ Parsed mapping JSON for", image_path, "| entries:", len(mapping))
         return mapping
     except Exception:
         match = re.search(r'(\[.*\])', raw_text, re.DOTALL)
         if match:
             try:
                 mapping = json.loads(match.group(1))
+                print("✅ Parsed mapping JSON (alt) for", image_path, "| entries:", len(mapping))
                 return mapping
             except Exception:
                 pass
+        print("⚠️ Failed to parse mapping JSON for", image_path)
         return []
 def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, model, rows=GRID_ROWS, cols=GRID_COLS):
     Convert PDF to images, create grid-numbered images for sending to Gemini,
     send all page images in parallel to Gemini for mapping, then annotate and produce imprinted PDF.
     """
+    print("📄 Converting answer PDF to images for imprinting...")
     pages = convert_from_path(pdf_path, dpi=200)
     annotated_page_paths = []
     temp_grid_images = []
     for p_index, page in enumerate(pages):
         img = page.convert("RGB")
         w, h = img.size
         temp_path = f"page_{p_index+1}_grid.png"
         img.save(temp_path, "PNG")
         temp_grid_images.append(temp_path)
+        print("🛰 Created grid image:", temp_path)
     # Send all grid images in parallel to Gemini to get mappings
+    print("📡 Sending all page images to Gemini in parallel for mapping...")
     mappings_per_page = {}
+    model_local = model
     with ThreadPoolExecutor(max_workers=min(8, len(temp_grid_images))) as ex:
         futures = {ex.submit(ask_gemini_for_mapping_for_page, model_local, img_path, grading_json, rows, cols): idx
                    for idx, img_path in enumerate(temp_grid_images)}
             idx = futures[fut]
             try:
                 mapping = fut.result()
+            except Exception as e:
+                print("⚠️ Mapping request failed for page", idx, e)
                 mapping = []
             mappings_per_page[idx] = mapping
     # Annotate original pages according to returned mappings
+    print("🖊 Annotating pages with marks...")
     for p_index, page in enumerate(pages):
         page_img = page.convert("RGB")
         img_cv = np.array(page_img)
         annotated_path = f"annotated_page_{p_index+1}.png"
         cv2.imwrite(annotated_path, img_cv)
         annotated_page_paths.append(annotated_path)
+        print("✅ Annotated page saved:", annotated_path)
     with open(output_pdf, "wb") as f:
         f.write(img2pdf.convert(annotated_page_paths))
     compressed = compress_pdf(output_pdf)
+    print("📑 Imprinted PDF saved to:", compressed)
     return compressed
 # ---------------- MAIN PIPELINE ----------------
 def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
     """
+    Final pipeline implementing requested flow and verbose console logging.
     """
     try:
+        print("🔁 Starting pipeline...")
+        # Step 0: compress as needed
         qp_path = compress_pdf(qp_path)
         ms_path = compress_pdf(ms_path)
         ans_path = compress_pdf(ans_path)
         # Merge QP + MS
         merged_qpms_path = os.path.splitext(qp_path)[0] + "_merged_qp_ms.pdf"
         merge_pdfs([qp_path, ms_path], merged_qpms_path)
+        print("📎 Merged QP + MS ->", merged_qpms_path)
+        # Upload files to Gemini
+        print("🔼 Uploading files to Gemini...")
         merged_uploaded = genai.upload_file(path=merged_qpms_path, display_name="QP+MS (merged)")
         ans_uploaded = genai.upload_file(path=ans_path, display_name="Answer Sheet")
+        print("✅ Upload complete.")
+        # Create model and print which selected
         model = create_model()
+        # Step 1.i: QP+MS transcription (first)
+        print("1.i) Transcribing QP+MS (questions first, then full markscheme)...")
         qpms_prompt = PROMPTS["QP_MS_TRANSCRIPTION"]["content"]
         qpms_text = gemini_generate_content(model, qpms_prompt, file_upload_obj=merged_uploaded)
+        print("📄 QP+MS transcription received. Saving debug file: debug_qpms_transcript.txt")
         with open("debug_qpms_transcript.txt", "w", encoding="utf-8") as f:
             f.write(qpms_text)
         # Step 2: extract serial numbers (question IDs) using regex from qpms_text
         extracted_ids = extract_question_ids_from_qpms(qpms_text)
         if not extracted_ids:
             extracted_ids = ["NA"]
+        # Step 1.ii: Build AS prompt injecting extracted IDs and transcribe AS
+        print("1.ii) Building AS transcription prompt with expected question IDs and sending to Gemini...")
         as_prompt = build_as_prompt_with_expected_ids(extracted_ids)
         as_text = gemini_generate_content(model, as_prompt, file_upload_obj=ans_uploaded)
+        print("📝 AS transcription received. Saving debug file: debug_as_transcript.txt")
         with open("debug_as_transcript.txt", "w", encoding="utf-8") as f:
             f.write(as_text)
+        # Step 3: Grading - send both transcripts to grading model
+        print("2) Preparing grading input and sending to Gemini for grading...")
         grading_input = (
             "=== QP+MS TRANSCRIPT BEGIN ===\n"
             + qpms_text
             + "\n=== ANSWER SHEET TRANSCRIPT END ===\n"
         )
         grading_prompt_system = PROMPTS["GRADING_PROMPT"]["content"]
+        grading_text = gemini_generate_content(model, grading_prompt_system + "\n\nPlease grade the following transcripts:\n" + grading_input)
+        print("🧾 Grading output received. Saving debug file: debug_grading.md")
+        with open("debug_grading.md", "w", encoding="utf-8") as f:
+            f.write(grading_text)
         # Save grading PDF
         base_name = os.path.splitext(os.path.basename(ans_path))[0]
         grading_pdf_path = save_as_pdf(grading_text, f"{base_name}_graded.pdf")
+        print("📄 Grading PDF saved:", grading_pdf_path)
+        # Step 4: Extract marks for imprinting
         grading_json = extract_marks_from_grading(grading_text)
         with open("debug_grading_json.json", "w", encoding="utf-8") as f:
             json.dump(grading_json, f, indent=2, ensure_ascii=False)
+        print("🔧 Grading marks extraction complete.")
         imprinted_pdf_path = None
         if imprint:
+            print("✍ Imprint option enabled. Starting imprinting process (parallel mapping requests)...")
             imprinted_pdf_path = f"{base_name}_imprinted.pdf"
             imprinted_pdf_path = imprint_marks_using_mapping(ans_path, grading_json, imprinted_pdf_path, model)
+            print("✅ Imprinting finished. Imprinted PDF at:", imprinted_pdf_path)
+        print("🏁 Pipeline finished successfully.")
         return qpms_text, as_text, grading_text, grading_pdf_path, imprinted_pdf_path
     except Exception as e:
+        print("❌ Pipeline error:", e)
         return f"❌ Error: {e}", None, None, None, None
 # ---------------- GRADIO UI ----------------
+with gr.Blocks(title="LeadIB AI Grading (Final Flow — Verbose)") as demo:
+    gr.Markdown("## 📘 LeadIB AI Grading — Final Flow\nUpload **Question Paper**, **Markscheme**, and **Student Answer Sheet**.\nFlow: merge QP+MS -> transcribe QP+MS (questions first, full markscheme) -> extract IDs -> transcribe AS with expected IDs -> grade -> (optional) imprint. Console prints show progress.")
     with gr.Row():
         qp_file = gr.File(label="📄 Upload Question Paper (PDF)")
         qpms_box = gr.Textbox(label="📑 QP+MS Transcript", lines=12)
         as_box = gr.Textbox(label="📝 AS Transcript", lines=12)
+    grading_output_box = gr.Textbox(label="🧾 Grading (Ma