TRIAL

Sleeping

App Files Files Community

atz21 committed on Oct 24, 2025

Commit

4e28843

verified ·

1 Parent(s): 45f3df3

Update app.py

Browse files

Files changed (1) hide show

app.py +224 -145

app.py CHANGED Viewed

@@ -2,21 +2,20 @@ import os
 import re
 import json
 import subprocess
-import tempfile
 import time
 import img2pdf
 import gradio as gr
-import google.generativeai as genai
-from markdown_pdf import MarkdownPdf, Section
 from pdf2image import convert_from_path
 from PIL import Image, ImageDraw, ImageFont
 import cv2
 import numpy as np
-from concurrent.futures import ThreadPoolExecutor, as_completed
 from PyPDF2 import PdfReader, PdfWriter
 # ---------------- CONFIG ----------------
-genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
 GRID_ROWS, GRID_COLS = 20, 14
 # ---------------- PROMPTS ----------------
@@ -28,36 +27,57 @@ INPUT: This file is a PDF that first contains the Question Paper and immediately
 TASK:
 1. Transcribe EXACTLY all the questions FIRST (with their total marks).
 2. After ALL questions, transcribe the Markscheme exactly, preserving M/A/R notation in brackets.
-3. Always number the questions sequentially (Question 1, Question 2, Question 3, …) **in the order they appear in the PDF**, even if the PDF shows a different number or leaves it blank. Do NOT skip or leave Question: blank. Never start a question other than question 1 ( even if it is labelled in pdf as 8 name it 1)
-4. After the markscheme, DETECT and FLAG all questions in the markscheme where a graph/diagram is expected. For each, output the question number and the page number in the format below.
 FORMAT:
 ==== PAPER TOTAL MARKS ====
 <total marks>
 ==== QUESTIONS BEGIN ====
-Question 1.i
 Total Marks: <number>
 QP: <question text>
 --QUESTION-END--
-Question 1.ii
 Total Marks: <number>
 QP: <question text>
 --QUESTION-END--
 (repeat for all questions in order of appearance)
 ==== QUESTIONS END ====
 ==== MARKSCHEME BEGIN ====
-Answer 1.i:
-<exact MS for Q1.i with notations M1, A1, R1 etc>
-Answer 1.ii:
-<exact MS for Q1.ii with notations>
 Answer 2 :
 <exact MS for Q2 with notations>
 (repeat for all answers)
 ==== MARKSCHEME END ====
-==== GRAPH EXPECTED QUESTIONS ====\nGraph expected in:\n- Question <number> → Page <number>\n(one per line)\n==== END GRAPH EXPECTED ====\n"""
-}
 ,
-    # GRADING_PROMPT unchanged except we will print steps around calling it
     "GRADING_PROMPT": {
         "role": "system",
         "content": """Developer: You are an official examiner. Apply the following grading rules precisely.
@@ -76,39 +96,101 @@ Answer 2 :
 4. Accept valid equivalent forms unless otherwise specified.
 5. Apply FT where appropriate.
 6. Use proper notation: M1A0, A1, etc.
-7. Any lost mark: use red `<span style=\"color:red\">M0</span>` and make Reason red.
 ---
 ## Output Format
 Produce two sections per question/sub-question, following this structure:
 ## Question <id>
 ### Markscheme vs Student Answer
-| Mark ID | Markscheme Expectation | Student’s Response | Awarded |
 |---------|------------------------|--------------------|---------|
 | M1_1    | Recognise GP           | "r=0.9"            | M1 |
-➡️ **Total: X/Y**
 ---
-### Examiner’s Report
 At the very end, provide a summary table:
 | Question Number | Marks | Remark |
 |-----------------|-------|--------|
-| 1               | X/Y   | <remark> |
 Then show total clearly as a final line:
 `Total: <obtained_marks>/<max_marks>`
 NOTES:
 - The assistant will receive two transcripts: (1) QP+MS transcript (questions then markscheme) and (2) AS transcript (student answers). Use the QP+MS transcript as the authoritative source of question wording, total marks, and verbatim markscheme entries (M/A/R mark IDs).
 - Match student answers to question IDs and grade according to the provided verbatim markscheme.
 - For questions where a graph is expected and the student attempted a graph, you will be provided with the relevant markscheme and answer sheet graph images/pages. Use these for grading those questions with visual context. For all other questions, proceed as usual.
-- Produce full markdown as above. Ensure mark IDs used in the grading are present and consistent with the markscheme.
 """
     }
 }
 # ---------------- HELPERS ----------------
-def save_as_pdf(text, filename="output.pdf"):
-    pdf = MarkdownPdf()
-    pdf.add_section(Section(text, toc=False))
-    pdf.save(filename)
-    return filename
 def compress_pdf(input_path, output_path=None, max_size=20*1024*1024):
     if output_path is None:
@@ -145,25 +227,28 @@ def compress_pdf(input_path, output_path=None, max_size=20*1024*1024):
         print("❌ Compression error:", e)
         return input_path
-def create_model():
-    """
-    Create the Gemini model and print which model is selected.
-    """
-    try:
-        print("⚡ Attempting to use gemini-2.5-pro model")
-        model = genai.GenerativeModel("gemini-2.5-pro", generation_config={"temperature": 0})
-        print("✅ Selected model: gemini-2.5-pro")
-        return model
-    except Exception as e:
-        print("⚠️ Could not use gemini-2.5-pro:", e)
-    try:
-        print("⚡ Falling back to gemini-2.5-flash model")
-        model = genai.GenerativeModel("gemini-2.5-flash", generation_config={"temperature": 0})
-        print("✅ Selected model: gemini-2.5-flash")
-        return model
-    except Exception as e:
-        print("❌ Failed to create any Gemini model:", e)
-        raise
 def merge_pdfs(paths, output_path):
     writer = PdfWriter()
@@ -175,61 +260,70 @@ def merge_pdfs(paths, output_path):
         writer.write(f)
     return output_path
-def gemini_generate_content(model, prompt_text, file_upload_obj=None, image_obj=None):
     """
-    Send prompt_text and optionally an uploaded file (or an image object/list) to the model.
     Returns textual response and prints progress.
     """
-    inputs = [prompt_text]
     if file_upload_obj:
-        inputs.append(file_upload_obj)
     if image_obj:
-        # Handle both single images and lists of images
         if isinstance(image_obj, list):
-            # Convert image paths to PIL Image objects
             for img_path in image_obj:
                 if isinstance(img_path, str):
-                    # It's a file path, load as PIL Image
                     pil_img = Image.open(img_path)
-                    inputs.append(pil_img)
                 else:
-                    # It's already an image object
-                    inputs.append(img_path)
         else:
-            # Single image
             if isinstance(image_obj, str):
                 pil_img = Image.open(image_obj)
-                inputs.append(pil_img)
             else:
-                inputs.append(image_obj)
     print("📡 Sending request to Gemini (prompt length:", len(prompt_text), "chars )")
-    response = model.generate_content(inputs)
-    raw_text = getattr(response, "text", None)
-    if not raw_text and getattr(response, "candidates", None):
-        raw_text = response.candidates[0].content.parts[0].text
-    if raw_text is None:
-        raw_text = str(response)
-    print("📥 Received response (chars):", len(raw_text))
-    return raw_text
 # ---------------- PARSERS ----------------
 def extract_question_ids_from_qpms(text: str):
-    """Extract question IDs from QP+MS transcript.
-    Two-step approach: explicit 'Question X' lines, then fallback numbered lists.
-    Robust to hidden whitespace and simple unicode spaces."""
     print("🔎 Extracting question IDs from QP+MS transcript using regex...")
-    # Normalize spaces/tabs/non-breaking spaces
     clean_text = text.replace("\u00A0", " ").replace("\t", " ")
-    # Step 1: Look for explicit "Question X" lines
     primary_matches = re.findall(r"^\s*Question\s*[:\s]\s*([\dA-Za-z.()]+)", clean_text, re.MULTILINE)
     if primary_matches:
         print(f"✅ Extracted {len(primary_matches)} question IDs from explicit 'Question X' lines.")
         print("IDs:", primary_matches)
         return primary_matches
-    # Step 2: Fallback — numbered/sub-question lists
     fallback_matches = re.findall(r"^\s*(\d+(?:[.)]|\([a-zA-Z0-9]+\))?[a-zA-Z0-9]*)", clean_text, re.MULTILINE)
     if fallback_matches:
         print(f"✅ Extracted {len(fallback_matches)} question IDs (fallback numbered lists).")
@@ -238,12 +332,9 @@ def extract_question_ids_from_qpms(text: str):
         print("⚠️ No question IDs extracted; will send NA placeholder.")
     return fallback_matches
-# Update AS prompt builder to include graph detection
 def build_as_prompt_with_expected_ids(expected_ids, qpms_text=None):
     """
     Construct the AS transcription prompt injecting the expected IDs block and graph detection instructions.
-    If qpms_text is provided, instruct the LLM to refer to it for ambiguous handwriting.
     """
     if not expected_ids:
         ids_block = "{NA}"
@@ -276,8 +367,6 @@ AS:
 ==== GRAPH FOUND ANSWERS ====\nGraph found in:\n- Answer <number> → Page <number>\n(one per line)\n==== END GRAPH FOUND ===="""
     return prompt
-# Robust parsing functions for graph detection
 def extract_graph_questions_from_ms(text: str):
     """Extract graph questions and page numbers from MS transcript."""
     clean_text = text.replace("\u00A0", " ").replace("\t", " ")
@@ -313,9 +402,7 @@ def extract_graph_answers_from_as(text: str):
 def extract_marks_from_grading(grading_text):
     """
-    Parse the grading markdown produced by the GRADING_PROMPT and extract marks per question.
-    Returns dict: {"grading": [{"question": "1.a", "marks_awarded": ["M1","A1"]}, ...]}
-    Preserves all marks in order, including duplicates.
     """
     print("🔎 Extracting awarded marks from grading output...")
     grading_json = {"grading": []}
@@ -338,10 +425,9 @@ def extract_marks_from_grading(grading_text):
     return grading_json
 # ---------------- MAPPING/IMPRINT HELPERS ----------------
-def ask_gemini_for_mapping_batch(model, image_paths, grading_json, expected_ids=None, rows=GRID_ROWS, cols=GRID_COLS):
     """
     Send multiple page images together to Gemini for batch mapping processing.
-    More efficient than sending one by one.
     """
     ids_block = "{NA}"
     if expected_ids:
@@ -363,23 +449,30 @@ Return JSON only, like:
 Grading JSON:
 {json.dumps(grading_json, indent=2)}"""
-    # Load all images
     images = [Image.open(p) for p in image_paths]
     print(f"📡 Sending batch mapping request for {len(image_paths)} pages to Gemini...")
-    response = model.generate_content([prompt, *images])
-    raw_text = getattr(response, "text", None)
-    if not raw_text and getattr(response, "candidates", None):
-        raw_text = response.candidates[0].content.parts[0].text
-    if not raw_text:
-        raw_text = str(response)
     print("📥 Batch mapping response (chars):", len(raw_text))
     print("🔎 Gemini raw batch output:")
     print(raw_text)
-    # Try to extract JSON from response
     try:
         match = re.search(r'(\[.*\])', raw_text, re.DOTALL)
         if match:
@@ -393,17 +486,16 @@ Grading JSON:
         print(f"❌ Failed to parse Gemini JSON mapping: {e}")
         return []
-def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, model, expected_ids=None, rows=GRID_ROWS, cols=GRID_COLS):
     """
     Convert PDF to images, create grid-numbered images for batch sending to Gemini,
-    then annotate and produce imprinted PDF using batch processing for better efficiency.
     """
     print("📄 Converting answer PDF to images for imprinting...")
     pages = convert_from_path(pdf_path, dpi=200)
     annotated_page_paths = []
     temp_grid_images = []
-    # Create grid images for Gemini
     for p_index, page in enumerate(pages):
         img = page.convert("RGB")
         w, h = img.size
@@ -432,18 +524,16 @@ def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, model, expec
         temp_grid_images.append(temp_path)
         print("🛰 Created grid image:", temp_path)
-    # Send pages in batches to Gemini for mapping
     print("📡 Sending page images to Gemini in batches for mapping...")
-    batch_size = 10  # Process 10 pages at a time
     all_mappings = []
     for start in range(0, len(temp_grid_images), batch_size):
         batch_paths = temp_grid_images[start:start+batch_size]
-        batch_mapping = ask_gemini_for_mapping_batch(model, batch_paths, grading_json, expected_ids, rows, cols)
         all_mappings.extend(batch_mapping)
         print(f"✅ Processed batch {start//batch_size + 1}: pages {start+1}-{start+len(batch_paths)}")
-    # Annotate original pages according to returned mappings
     print("🖊 Annotating pages with marks...")
     for p_index, page in enumerate(pages):
         page_num = p_index + 1
@@ -453,7 +543,6 @@ def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, model, expec
         h, w, _ = img_cv.shape
         cell_w_px, cell_h_px = w / cols, h / rows
-        # Filter mappings for this page
         page_mappings = [m for m in all_mappings if m.get("page") == page_num]
         for item in page_mappings:
@@ -472,11 +561,9 @@ def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, model, expec
             row = (cell_number - 1) // cols
             col = (cell_number - 1) % cols
-            # Position marks to the right of the answer, with fallback to left
             x_c = int((col + 1) * cell_w_px - cell_w_px / 4)
             y_c = int((row + 0.5) * cell_h_px)
-            # Use larger, more visible font
             font_scale = max(1.0, min(2.0, cell_h_px / 40.0))
             thickness = max(2, int(font_scale * 2))
             cv2.putText(img_cv, marks_text, (x_c, y_c), cv2.FONT_HERSHEY_SIMPLEX,
@@ -488,7 +575,6 @@ def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, model, expec
         annotated_page_paths.append(annotated_path)
         print("✅ Annotated page saved:", annotated_path)
-    # Merge annotated pages into final PDF
     print("📑 Merging annotated pages into final PDF...")
     with open(output_pdf, "wb") as f:
         f.write(img2pdf.convert(annotated_page_paths))
@@ -497,21 +583,14 @@ def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, model, expec
     print("📑 Imprinted PDF saved to:", compressed)
     return compressed
-# ---------------- GRAPH DETECTION HELPERS ----------------
-# These functions are now robustly handled by the new_code, so they are no longer needed here.
-# ---------------- GRAPH PAGE EXTRACTION HELPER ----------------
 def extract_pdf_pages_as_images(pdf_path, page_numbers, prefix):
     """
     Extracts unique pages (1-based) from a PDF as images, saves as PNG, returns list of file paths.
-    Prints to console when extracting each page.
     """
     unique_pages = sorted(set(page_numbers))
     images = convert_from_path(pdf_path, dpi=200, first_page=min(unique_pages), last_page=max(unique_pages))
     out_paths = []
     for idx, page_num in enumerate(unique_pages):
-        # pdf2image returns images in order, but if not contiguous, we need to map
-        # So, get the image for this page (1-based)
         img_idx = page_num - min(unique_pages)
         img = images[img_idx]
         out_path = f"{prefix}_page_{page_num}.png"
@@ -520,42 +599,33 @@ def extract_pdf_pages_as_images(pdf_path, page_numbers, prefix):
         out_paths.append(out_path)
     return out_paths
-# ---------------- PIPELINE UPDATE FOR GRAPH-AWARE GRADING ----------------
 def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
     """
-    Final pipeline implementing requested flow and verbose console logging.
-    Now includes Graph-Aware Grading logic.
     """
     try:
         print("🔁 Starting pipeline...")
-        # Step 0: compress as needed
         qp_path = compress_pdf(qp_path)
         ms_path = compress_pdf(ms_path)
         ans_path = compress_pdf(ans_path)
-        # Merge QP + MS
         merged_qpms_path = os.path.splitext(qp_path)[0] + "_merged_qp_ms.pdf"
         merge_pdfs([qp_path, ms_path], merged_qpms_path)
         print("📎 Merged QP + MS ->", merged_qpms_path)
-        # Upload files to Gemini
         print("🔼 Uploading files to Gemini...")
-        merged_uploaded = genai.upload_file(path=merged_qpms_path, display_name="QP+MS (merged)")
-        ans_uploaded = genai.upload_file(path=ans_path, display_name="Answer Sheet")
         print("✅ Upload complete.")
-        # Create model and print which selected
-        model = create_model()
-        # Step 1.i: QP+MS transcription (first)
         print("1.i) Transcribing QP+MS (questions first, then full markscheme, with graph detection)...")
         qpms_prompt = PROMPTS["QP_MS_TRANSCRIPTION"]["content"] + "\nAt the end, also list all questions in the markscheme where a graph is expected, in the format:\nGraph expected in:\n- Question <number> → Page <number>\n(One per line, after ==== MARKSCHEME END ====)"
-        qpms_text = gemini_generate_content(model, qpms_prompt, file_upload_obj=merged_uploaded)
         print("📄 QP+MS transcription received. Saving debug file: debug_qpms_transcript.txt")
         with open("debug_qpms_transcript.txt", "w", encoding="utf-8") as f:
             f.write(qpms_text)
-        # Step 1.i.a: Extract graph-expected questions from MS
         ms_graph_mapping = extract_graph_questions_from_ms(qpms_text)
         print("🖼️ Graph-expected questions in MS:", ms_graph_mapping)
         ms_graph_pages = list(ms_graph_mapping.values())
@@ -563,20 +633,17 @@ def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
         if ms_graph_pages:
             ms_graph_images = extract_pdf_pages_as_images(merged_qpms_path, ms_graph_pages, prefix="qpms_graph")
-        # Step 2: extract serial numbers (question IDs) using regex from qpms_text
         extracted_ids = extract_question_ids_from_qpms(qpms_text)
         if not extracted_ids:
             extracted_ids = ["NA"]
-        # Step 1.ii: Build AS prompt injecting extracted IDs and transcribe AS
         print("1.ii) Building AS transcription prompt with expected question IDs and graph detection, sending to Gemini...")
         as_prompt = build_as_prompt_with_expected_ids(extracted_ids, qpms_text) + "\nAt the end, also list all answers where a graph is found, in the format:\nGraph found in:\n- Answer <number> → Page <number>\n(One per line, after all answers)"
-        as_text = gemini_generate_content(model, as_prompt, file_upload_obj=ans_uploaded)
         print("📝 AS transcription received. Saving debug file: debug_as_transcript.txt")
         with open("debug_as_transcript.txt", "w", encoding="utf-8") as f:
             f.write(as_text)
-        # Step 2.a: Extract graph-attempted answers from AS
         as_graph_mapping = extract_graph_answers_from_as(as_text)
         print("🖼️ Graph-attempted answers in AS:", as_graph_mapping)
         as_graph_pages = list(as_graph_mapping.values())
@@ -584,9 +651,6 @@ def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
         if as_graph_pages:
             as_graph_images = extract_pdf_pages_as_images(ans_path, as_graph_pages, prefix="as_graph")
-        # Step 3: (No graph bundle matching, just collect images)
-        # Step 4: Grading - send both transcripts to grading model, inject graph image info
         print("2) Preparing grading input and sending to Gemini for grading...")
         grading_input = (
             "=== QP+MS TRANSCRIPT BEGIN ===\n"
@@ -596,24 +660,20 @@ def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
             + as_text
             + "\n=== ANSWER SHEET TRANSCRIPT END ===\n"
         )
-        # Inject graph image note
         if ms_graph_images or as_graph_images:
-            graph_note = "\n\n---\nSome questions require graphs. I’ve attached the relevant graph pages from QP+MS and from the Answer Sheet. Use them as visual context when grading.\n---\n"
             grading_input += graph_note
         grading_prompt_system = PROMPTS["GRADING_PROMPT"]["content"]
-        # Pass images as additional input to gemini_generate_content
         grading_images = ms_graph_images + as_graph_images
-        grading_text = gemini_generate_content(model, grading_prompt_system + "\n\nPlease grade the following transcripts:\n" + grading_input, image_obj=grading_images if grading_images else None)
         print("🧾 Grading output received. Saving debug file: debug_grading.md")
         with open("debug_grading.md", "w", encoding="utf-8") as f:
             f.write(grading_text)
-        # Save grading PDF
         base_name = os.path.splitext(os.path.basename(ans_path))[0]
         grading_pdf_path = save_as_pdf(grading_text, f"{base_name}_graded.pdf")
         print("📄 Grading PDF saved:", grading_pdf_path)
-        # Step 4: Extract marks for imprinting
         grading_json = extract_marks_from_grading(grading_text)
         with open("debug_grading_json.json", "w", encoding="utf-8") as f:
             json.dump(grading_json, f, indent=2, ensure_ascii=False)
@@ -621,9 +681,9 @@ def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
         imprinted_pdf_path = None
         if imprint:
-            print("✍ Imprint option enabled. Starting imprinting process (parallel mapping requests)...")
             imprinted_pdf_path = f"{base_name}_imprinted.pdf"
-            imprinted_pdf_path = imprint_marks_using_mapping(ans_path, grading_json, imprinted_pdf_path, model, extracted_ids)
             print("✅ Imprinting finished. Imprinted PDF at:", imprinted_pdf_path)
         print("🏁 Pipeline finished successfully.")
@@ -631,11 +691,23 @@ def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
     except Exception as e:
         print("❌ Pipeline error:", e)
         return f"❌ Error: {e}", None, None, None, None
 # ---------------- GRADIO UI ----------------
-with gr.Blocks(title=" AI Grading (Final Flow )") as demo:
-    gr.Markdown("## 📘  AI Grading — Final Flow")
     with gr.Row():
         qp_file = gr.File(label="📄 Upload Question Paper (PDF)")
@@ -646,10 +718,17 @@ with gr.Blocks(title=" AI Grading (Final Flow )") as demo:
     run_button = gr.Button("🚀 Run Pipeline")
     with gr.Row():
-        grading_pdf_file = gr.File(label="📥 Download Grading PDF")
-        imprint_pdf_file = gr.File(label="📥 Download Imprinted PDF (Optional)")
     def run_pipeline(qp_file_obj, ms_file_obj, ans_file_obj, imprint_flag):
         qp_path = qp_file_obj.name
         ms_path = ms_file_obj.name
         ans_path = ans_file_obj.name
@@ -658,13 +737,13 @@ with gr.Blocks(title=" AI Grading (Final Flow )") as demo:
             qp_path, ms_path, ans_path, imprint=imprint_flag
         )
-        return grading_pdf_path, imprinted_pdf_path
     run_button.click(
         fn=run_pipeline,
         inputs=[qp_file, ms_file, ans_file, imprint_toggle],
-        outputs=[grading_pdf_file, imprint_pdf_file]
     )
 if __name__ == "__main__":
-    demo.launch()

 import re
 import json
 import subprocess
 import time
 import img2pdf
 import gradio as gr
+from google import genai  # NEW SDK
+import pypandoc
 from pdf2image import convert_from_path
 from PIL import Image, ImageDraw, ImageFont
 import cv2
 import numpy as np
 from PyPDF2 import PdfReader, PdfWriter
 # ---------------- CONFIG ----------------
+# Create client with new SDK
+client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
 GRID_ROWS, GRID_COLS = 20, 14
 # ---------------- PROMPTS ----------------
 TASK:
 1. Transcribe EXACTLY all the questions FIRST (with their total marks).
 2. After ALL questions, transcribe the Markscheme exactly, preserving M/A/R notation in brackets.
+3. Always number the questions sequentially (Question 1, Question 2, Question 3, …) **in the order they appear in the PDF**, even if the PDF shows a different number or leaves it blank. Do NOT skip or leave Question: blank. Never start a question other than question 1 (even if it is labelled in pdf as 8 name it 1).
+4. If a question or sub-question is labelled with a letter (e.g., "Q1.a", "Q2(b)", "1 (c)(i)"), transcribe it as "Question 1.a", "Question 2.b", "Question 1.c.i" etc., exactly preserving the hierarchy of sub-question identifiers.
+5. After the markscheme, DETECT and FLAG all questions in the markscheme where a graph/diagram is expected. For each, output the question number and the page number in the format below.
 FORMAT:
 ==== PAPER TOTAL MARKS ====
 <total marks>
 ==== QUESTIONS BEGIN ====
+Question 1.a
 Total Marks: <number>
 QP: <question text>
 --QUESTION-END--
+Question 1.b
+Total Marks: <number>
+QP: <question text>
+--QUESTION-END--
+Question 2
 Total Marks: <number>
 QP: <question text>
 --QUESTION-END--
 (repeat for all questions in order of appearance)
 ==== QUESTIONS END ====
 ==== MARKSCHEME BEGIN ====
+Answer 1.a:
+<exact MS for Q1.a with notations M1, A1, R1 etc>
+Answer 1.b:
+<exact MS for Q1.b with notations>
 Answer 2 :
 <exact MS for Q2 with notations>
 (repeat for all answers)
 ==== MARKSCHEME END ====
+==== GRAPH EXPECTED QUESTIONS ====
+Graph expected in:
+- Question <number> → Page <number>
+(one per line)
+==== END GRAPH EXPECTED ====
+"""
+}
 ,
     "GRADING_PROMPT": {
         "role": "system",
         "content": """Developer: You are an official examiner. Apply the following grading rules precisely.
 4. Accept valid equivalent forms unless otherwise specified.
 5. Apply FT where appropriate.
 6. Use proper notation: M1A0, A1, etc.
+7. Any lost mark: use red `<span style=\"color:red\">M0</span>` , similarly make markscheme expected , student response  and awarded marks in red include it in <span> tage
 ---
 ## Output Format
 Produce two sections per question/sub-question, following this structure:
 ## Question <id>
 ### Markscheme vs Student Answer
+| Mark ID | Markscheme Expectation | Student's Response | Awarded |
 |---------|------------------------|--------------------|---------|
 | M1_1    | Recognise GP           | "r=0.9"            | M1 |
+ **Total: X/Y**
 ---
+### Examiner's Report
 At the very end, provide a summary table:
 | Question Number | Marks | Remark |
 |-----------------|-------|--------|
+| 1               | X/Y   | A      |
+| 2               | X/Y   | B      |
 Then show total clearly as a final line:
 `Total: <obtained_marks>/<max_marks>`
 NOTES:
 - The assistant will receive two transcripts: (1) QP+MS transcript (questions then markscheme) and (2) AS transcript (student answers). Use the QP+MS transcript as the authoritative source of question wording, total marks, and verbatim markscheme entries (M/A/R mark IDs).
 - Match student answers to question IDs and grade according to the provided verbatim markscheme.
 - For questions where a graph is expected and the student attempted a graph, you will be provided with the relevant markscheme and answer sheet graph images/pages. Use these for grading those questions with visual context. For all other questions, proceed as usual.
+- Produce full markdown as above. Ensure mark IDs used in the grading are present and consistent with the markscheme.
+- give grade in remark one of the following  A : All Good   B : Silly Mistake   C : Conceptual Error   D : Hard question       E : Not Applicable
 """
     }
 }
 # ---------------- HELPERS ----------------
+def save_as_pdf(text, filename="output.pdf"):
+    """Convert markdown ``text`` to a PDF at ``filename`` using pandoc.
+
+    Red ``<span style="color:red">...</span>`` markers are rewritten to bold
+    ``**[...]**`` text before conversion. xelatex is tried first and pdflatex
+    second. If conversion fails for any reason, the unmodified ``text`` is
+    written to a ``.txt`` file that shares ``filename``'s stem.
+
+    Returns the path that was actually written: ``filename`` on success,
+    otherwise the ``.txt`` fallback path.
+    """
+    # Raw HTML is not rendered on the LaTeX/PDF route, so lost-mark spans are
+    # turned into bold bracketed text to keep them visible.
+    clean_text = re.sub(r'<span style="color:red">(.*?)</span>', r'**[\1]**', text)
+    temp_md = f"{filename}_temp.md"
+    shared_args = ['-V', 'geometry:margin=0.75in', '-V', 'fontsize=10pt']
+    # (engine, pandoc arguments) in order of preference.
+    engine_attempts = [
+        ('xelatex', shared_args + ['-V', 'linestretch=1.2', '--standalone']),
+        ('pdflatex', shared_args),
+    ]
+    try:
+        try:
+            with open(temp_md, 'w', encoding='utf-8') as f:
+                f.write(clean_text)
+            print("📝 Converting markdown to PDF using pandoc...")
+            for position, (engine, engine_args) in enumerate(engine_attempts, start=1):
+                try:
+                    pypandoc.convert_file(
+                        temp_md, 'pdf',
+                        outputfile=filename,
+                        extra_args=[f'--pdf-engine={engine}'] + engine_args
+                    )
+                    break
+                except RuntimeError:
+                    # The last engine's failure propagates to the text fallback below.
+                    if position == len(engine_attempts):
+                        raise
+                    next_engine = engine_attempts[position][0]
+                    print(f"⚠️ {engine} failed, trying {next_engine}...")
+        finally:
+            # Remove the temporary markdown even when every engine failed.
+            if os.path.exists(temp_md):
+                os.remove(temp_md)
+        # Verify the file was created
+        if not os.path.exists(filename):
+            raise RuntimeError("PDF file was not created")
+        size = os.path.getsize(filename)
+        print(f"✅ PDF saved successfully: {filename} ({size/1024:.1f} KB)")
+        return filename
+    except Exception as e:
+        print(f"❌ PDF conversion error: {e}")
+        print("💡 Make sure pandoc is installed: https://pandoc.org/installing.html")
+        print("   Ubuntu/Debian: sudo apt-get install pandoc texlive-xetex")
+        print("   macOS: brew install pandoc basictex")
+        print("   Windows: Download from https://pandoc.org/installing.html")
+        # Fallback to text file. splitext only swaps the final extension, so a
+        # ".pdf" elsewhere in the path is left alone and an extension-less
+        # filename still gets a distinct ".txt" path.
+        txt_file = os.path.splitext(filename)[0] + '.txt'
+        with open(txt_file, 'w', encoding='utf-8') as f:
+            f.write(text)
+        print(f"⚠️ Saved as text file instead: {txt_file}")
+        return txt_file
 def compress_pdf(input_path, output_path=None, max_size=20*1024*1024):
     if output_path is None:
         print("❌ Compression error:", e)
         return input_path
+def upload_to_gemini(path, display_name=None, timeout=300, poll_interval=2):
+    """
+    Upload a file to Gemini using the NEW google-genai SDK and wait until it is processed.
+
+    Args:
+        path: Local path of the file to upload.
+        display_name: Optional human-readable name attached to the upload.
+        timeout: Maximum number of seconds to wait while the file is PROCESSING.
+        poll_interval: Seconds to sleep between state checks.
+
+    Returns:
+        The uploaded file object returned by the SDK.
+
+    Raises:
+        TimeoutError: The file was still PROCESSING after ``timeout`` seconds.
+        RuntimeError: The service reported the file state as FAILED.
+        Exception: Any SDK error is logged and re-raised unchanged.
+    """
+    print(f"📤 Uploading {path} to Gemini...")
+    try:
+        # Only send a config when there is something to set, so SDK defaults apply otherwise.
+        upload_config = {"display_name": display_name} if display_name else None
+        uploaded_file = client.files.upload(file=path, config=upload_config)
+        # Wait for processing to complete, but never spin forever on a stuck file.
+        print(f"⏳ Waiting for file processing: {uploaded_file.name}")
+        deadline = time.monotonic() + timeout
+        while uploaded_file.state.name == "PROCESSING":
+            if time.monotonic() >= deadline:
+                raise TimeoutError(
+                    f"File still processing after {timeout}s: {uploaded_file.name}"
+                )
+            time.sleep(poll_interval)
+            uploaded_file = client.files.get(name=uploaded_file.name)
+        if uploaded_file.state.name == "FAILED":
+            raise RuntimeError(f"File processing failed: {uploaded_file.name}")
+        print(f"✅ Uploaded and processed: {uploaded_file.name}")
+        return uploaded_file
+    except Exception as e:
+        print(f"❌ Upload failed for {path}: {e}")
+        raise
 def merge_pdfs(paths, output_path):
     writer = PdfWriter()
         writer.write(f)
     return output_path
+def gemini_generate_content(prompt_text, file_upload_obj=None, image_obj=None, model_name="gemini-2.0-flash-exp"):
     """
+    Send prompt_text and optionally an uploaded file (or an image object/list) to the model using NEW SDK.
     Returns textual response and prints progress.
+
+    image_obj may be a single item or a list; string items are treated as
+    file paths and opened with PIL, anything else is forwarded unchanged.
+    model_name is tried first; on any failure (including a response that
+    carries no text) the request is retried once with "gemini-1.5-flash".
+    The last error is raised when both attempts fail.
     """
+    contents = [prompt_text]
     if file_upload_obj:
+        contents.append(file_upload_obj)
     if image_obj:
         if isinstance(image_obj, list):
             for img_path in image_obj:
                 if isinstance(img_path, str):
                     pil_img = Image.open(img_path)
+                    contents.append(pil_img)
                 else:
+                    contents.append(img_path)
         else:
             if isinstance(image_obj, str):
                 pil_img = Image.open(image_obj)
+                contents.append(pil_img)
             else:
+                contents.append(image_obj)
     print("📡 Sending request to Gemini (prompt length:", len(prompt_text), "chars )")
+    # Requested model first, then the fallback model.
+    last_error = None
+    for attempt, candidate in enumerate([model_name, "gemini-1.5-flash"]):
+        if attempt:
+            print(f"⚡ Trying fallback model: {candidate}")
+        try:
+            response = client.models.generate_content(
+                model=candidate,
+                contents=contents
+            )
+            raw_text = response.text
+            if raw_text is None:
+                # A reply without a text part would otherwise surface as an
+                # opaque TypeError from len(None); report it explicitly.
+                raise RuntimeError(f"Model {candidate} returned no text")
+            print("📥 Received response (chars):", len(raw_text))
+            return raw_text
+        except Exception as e:
+            last_error = e
+            if attempt:
+                print(f"❌ Fallback also failed: {e}")
+            else:
+                print(f"❌ Generation failed: {e}")
+    raise last_error
 # ---------------- PARSERS ----------------
 def extract_question_ids_from_qpms(text: str):
+    """Extract question IDs from QP+MS transcript."""
     print("🔎 Extracting question IDs from QP+MS transcript using regex...")
     clean_text = text.replace("\u00A0", " ").replace("\t", " ")
     primary_matches = re.findall(r"^\s*Question\s*[:\s]\s*([\dA-Za-z.()]+)", clean_text, re.MULTILINE)
     if primary_matches:
         print(f"✅ Extracted {len(primary_matches)} question IDs from explicit 'Question X' lines.")
         print("IDs:", primary_matches)
         return primary_matches
     fallback_matches = re.findall(r"^\s*(\d+(?:[.)]|\([a-zA-Z0-9]+\))?[a-zA-Z0-9]*)", clean_text, re.MULTILINE)
     if fallback_matches:
         print(f"✅ Extracted {len(fallback_matches)} question IDs (fallback numbered lists).")
         print("⚠️ No question IDs extracted; will send NA placeholder.")
     return fallback_matches
 def build_as_prompt_with_expected_ids(expected_ids, qpms_text=None):
     """
     Construct the AS transcription prompt injecting the expected IDs block and graph detection instructions.
     """
     if not expected_ids:
         ids_block = "{NA}"
 ==== GRAPH FOUND ANSWERS ====\nGraph found in:\n- Answer <number> → Page <number>\n(one per line)\n==== END GRAPH FOUND ===="""
     return prompt
 def extract_graph_questions_from_ms(text: str):
     """Extract graph questions and page numbers from MS transcript."""
     clean_text = text.replace("\u00A0", " ").replace("\t", " ")
 def extract_marks_from_grading(grading_text):
     """
+    Parse the grading markdown and extract marks per question.
     """
     print("🔎 Extracting awarded marks from grading output...")
     grading_json = {"grading": []}
     return grading_json
 # ---------------- MAPPING/IMPRINT HELPERS ----------------
+def ask_gemini_for_mapping_batch(image_paths, grading_json, expected_ids=None, rows=GRID_ROWS, cols=GRID_COLS):
     """
     Send multiple page images together to Gemini for batch mapping processing.
     """
     ids_block = "{NA}"
     if expected_ids:
 Grading JSON:
 {json.dumps(grading_json, indent=2)}"""
     images = [Image.open(p) for p in image_paths]
     print(f"📡 Sending batch mapping request for {len(image_paths)} pages to Gemini...")
+    try:
+        contents = [prompt] + images
+        response = client.models.generate_content(
+            model="gemini-2.0-flash-exp",
+            contents=contents
+        )
+        raw_text = response.text
+    except:
+        print("⚠️ Trying fallback model for mapping...")
+        contents = [prompt] + images
+        response = client.models.generate_content(
+            model="gemini-1.5-flash",
+            contents=contents
+        )
+        raw_text = response.text
     print("📥 Batch mapping response (chars):", len(raw_text))
     print("🔎 Gemini raw batch output:")
     print(raw_text)
     try:
         match = re.search(r'(\[.*\])', raw_text, re.DOTALL)
         if match:
         print(f"❌ Failed to parse Gemini JSON mapping: {e}")
         return []
+def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, expected_ids=None, rows=GRID_ROWS, cols=GRID_COLS):
     """
     Convert PDF to images, create grid-numbered images for batch sending to Gemini,
+    then annotate and produce imprinted PDF.
     """
     print("📄 Converting answer PDF to images for imprinting...")
     pages = convert_from_path(pdf_path, dpi=200)
     annotated_page_paths = []
     temp_grid_images = []
     for p_index, page in enumerate(pages):
         img = page.convert("RGB")
         w, h = img.size
         temp_grid_images.append(temp_path)
         print("🛰 Created grid image:", temp_path)
     print("📡 Sending page images to Gemini in batches for mapping...")
+    batch_size = 10
     all_mappings = []
     for start in range(0, len(temp_grid_images), batch_size):
         batch_paths = temp_grid_images[start:start+batch_size]
+        batch_mapping = ask_gemini_for_mapping_batch(batch_paths, grading_json, expected_ids, rows, cols)
         all_mappings.extend(batch_mapping)
         print(f"✅ Processed batch {start//batch_size + 1}: pages {start+1}-{start+len(batch_paths)}")
     print("🖊 Annotating pages with marks...")
     for p_index, page in enumerate(pages):
         page_num = p_index + 1
         h, w, _ = img_cv.shape
         cell_w_px, cell_h_px = w / cols, h / rows
         page_mappings = [m for m in all_mappings if m.get("page") == page_num]
         for item in page_mappings:
             row = (cell_number - 1) // cols
             col = (cell_number - 1) % cols
             x_c = int((col + 1) * cell_w_px - cell_w_px / 4)
             y_c = int((row + 0.5) * cell_h_px)
             font_scale = max(1.0, min(2.0, cell_h_px / 40.0))
             thickness = max(2, int(font_scale * 2))
             cv2.putText(img_cv, marks_text, (x_c, y_c), cv2.FONT_HERSHEY_SIMPLEX,
         annotated_page_paths.append(annotated_path)
         print("✅ Annotated page saved:", annotated_path)
     print("📑 Merging annotated pages into final PDF...")
     with open(output_pdf, "wb") as f:
         f.write(img2pdf.convert(annotated_page_paths))
     print("📑 Imprinted PDF saved to:", compressed)
     return compressed
 def extract_pdf_pages_as_images(pdf_path, page_numbers, prefix):
     """
     Extracts unique pages (1-based) from a PDF as images, saves as PNG, returns list of file paths.
     """
     unique_pages = sorted(set(page_numbers))
     images = convert_from_path(pdf_path, dpi=200, first_page=min(unique_pages), last_page=max(unique_pages))
     out_paths = []
     for idx, page_num in enumerate(unique_pages):
         img_idx = page_num - min(unique_pages)
         img = images[img_idx]
         out_path = f"{prefix}_page_{page_num}.png"
         out_paths.append(out_path)
     return out_paths
+# ---------------- PIPELINE ----------------
 def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
     """
+    Final pipeline with graph-aware grading logic using NEW SDK.
     """
     try:
         print("🔁 Starting pipeline...")
         qp_path = compress_pdf(qp_path)
         ms_path = compress_pdf(ms_path)
         ans_path = compress_pdf(ans_path)
         merged_qpms_path = os.path.splitext(qp_path)[0] + "_merged_qp_ms.pdf"
         merge_pdfs([qp_path, ms_path], merged_qpms_path)
         print("📎 Merged QP + MS ->", merged_qpms_path)
         print("🔼 Uploading files to Gemini...")
+        merged_uploaded = upload_to_gemini(merged_qpms_path)
+        ans_uploaded = upload_to_gemini(ans_path)
         print("✅ Upload complete.")
         print("1.i) Transcribing QP+MS (questions first, then full markscheme, with graph detection)...")
         qpms_prompt = PROMPTS["QP_MS_TRANSCRIPTION"]["content"] + "\nAt the end, also list all questions in the markscheme where a graph is expected, in the format:\nGraph expected in:\n- Question <number> → Page <number>\n(One per line, after ==== MARKSCHEME END ====)"
+        qpms_text = gemini_generate_content(qpms_prompt, file_upload_obj=merged_uploaded)
         print("📄 QP+MS transcription received. Saving debug file: debug_qpms_transcript.txt")
         with open("debug_qpms_transcript.txt", "w", encoding="utf-8") as f:
             f.write(qpms_text)
         ms_graph_mapping = extract_graph_questions_from_ms(qpms_text)
         print("🖼️ Graph-expected questions in MS:", ms_graph_mapping)
         ms_graph_pages = list(ms_graph_mapping.values())
         if ms_graph_pages:
             ms_graph_images = extract_pdf_pages_as_images(merged_qpms_path, ms_graph_pages, prefix="qpms_graph")
         extracted_ids = extract_question_ids_from_qpms(qpms_text)
         if not extracted_ids:
             extracted_ids = ["NA"]
         print("1.ii) Building AS transcription prompt with expected question IDs and graph detection, sending to Gemini...")
         as_prompt = build_as_prompt_with_expected_ids(extracted_ids, qpms_text) + "\nAt the end, also list all answers where a graph is found, in the format:\nGraph found in:\n- Answer <number> → Page <number>\n(One per line, after all answers)"
+        as_text = gemini_generate_content(as_prompt, file_upload_obj=ans_uploaded)
         print("📝 AS transcription received. Saving debug file: debug_as_transcript.txt")
         with open("debug_as_transcript.txt", "w", encoding="utf-8") as f:
             f.write(as_text)
         as_graph_mapping = extract_graph_answers_from_as(as_text)
         print("🖼️ Graph-attempted answers in AS:", as_graph_mapping)
         as_graph_pages = list(as_graph_mapping.values())
         if as_graph_pages:
             as_graph_images = extract_pdf_pages_as_images(ans_path, as_graph_pages, prefix="as_graph")
         print("2) Preparing grading input and sending to Gemini for grading...")
         grading_input = (
             "=== QP+MS TRANSCRIPT BEGIN ===\n"
             + as_text
             + "\n=== ANSWER SHEET TRANSCRIPT END ===\n"
         )
         if ms_graph_images or as_graph_images:
+            graph_note = "\n\n---\nSome questions require graphs. I've attached the relevant graph pages from QP+MS and from the Answer Sheet. Use them as visual context when grading.\n---\n"
             grading_input += graph_note
         grading_prompt_system = PROMPTS["GRADING_PROMPT"]["content"]
         grading_images = ms_graph_images + as_graph_images
+        grading_text = gemini_generate_content(grading_prompt_system + "\n\nPlease grade the following transcripts:\n" + grading_input, image_obj=grading_images if grading_images else None)
         print("🧾 Grading output received. Saving debug file: debug_grading.md")
         with open("debug_grading.md", "w", encoding="utf-8") as f:
             f.write(grading_text)
         base_name = os.path.splitext(os.path.basename(ans_path))[0]
         grading_pdf_path = save_as_pdf(grading_text, f"{base_name}_graded.pdf")
         print("📄 Grading PDF saved:", grading_pdf_path)
         grading_json = extract_marks_from_grading(grading_text)
         with open("debug_grading_json.json", "w", encoding="utf-8") as f:
             json.dump(grading_json, f, indent=2, ensure_ascii=False)
         imprinted_pdf_path = None
         if imprint:
+            print("✍ Imprint option enabled. Starting imprinting process...")
             imprinted_pdf_path = f"{base_name}_imprinted.pdf"
+            imprinted_pdf_path = imprint_marks_using_mapping(ans_path, grading_json, imprinted_pdf_path, extracted_ids)
             print("✅ Imprinting finished. Imprinted PDF at:", imprinted_pdf_path)
         print("🏁 Pipeline finished successfully.")
     except Exception as e:
         print("❌ Pipeline error:", e)
+        import traceback
+        traceback.print_exc()
         return f"❌ Error: {e}", None, None, None, None
 # ---------------- GRADIO UI ----------------
+with gr.Blocks(title="AI Grading (Fixed - Pandoc PDF)") as demo:
+    gr.Markdown("## 📘 AI Grading — Fixed with Pandoc PDF Conversion")
+    gr.Markdown("""
+    **✅ Now using pypandoc for PDF conversion (no truncation issues!)**
+    ### Requirements:
+    - Install: `pip install pypandoc`
+    - Install pandoc system-wide:
+      - **Ubuntu/Debian**: `sudo apt-get install pandoc texlive-xetex`
+      - **macOS**: `brew install pandoc basictex`
+      - **Windows**: Download from https://pandoc.org/installing.html
+    """)
     with gr.Row():
         qp_file = gr.File(label="📄 Upload Question Paper (PDF)")
     run_button = gr.Button("🚀 Run Pipeline")
     with gr.Row():
+        qpms_box = gr.Textbox(label="📑 QP+MS Transcript", lines=12)
+        as_box = gr.Textbox(label="📝 AS Transcript", lines=12)
+    grading_output_box = gr.Textbox(label="🧾 Grading (Markdown)", lines=20)
+    grading_pdf_file = gr.File(label="📥 Download Grading PDF")
+    imprint_pdf_file = gr.File(label="📥 Download Imprinted PDF (Optional)")
     def run_pipeline(qp_file_obj, ms_file_obj, ans_file_obj, imprint_flag):
+        if not qp_file_obj or not ms_file_obj or not ans_file_obj:
+            return "❌ Please upload all three files", "", "", None, None
         qp_path = qp_file_obj.name
         ms_path = ms_file_obj.name
         ans_path = ans_file_obj.name
             qp_path, ms_path, ans_path, imprint=imprint_flag
         )
+        return qpms_text or "", as_text or "", grading_text or "", grading_pdf_path, imprinted_pdf_path
     run_button.click(
         fn=run_pipeline,
         inputs=[qp_file, ms_file, ans_file, imprint_toggle],
+        outputs=[qpms_box, as_box, grading_output_box, grading_pdf_file, imprint_pdf_file]
     )
 if __name__ == "__main__":
+    demo.launch()