TRIAL

Sleeping

App Files Files Community

atz21 committed on Dec 3, 2025

Commit

d7fa86e

verified ·

1 Parent(s): 8463bc7

Update app.py

Browse files

Files changed (1) hide show

app.py +390 -403

app.py CHANGED Viewed

@@ -6,12 +6,12 @@ import time
 import img2pdf
 import gradio as gr
 from google import genai  # NEW SDK
-from markdown_pdf import MarkdownPdf, Section
 from pdf2image import convert_from_path
 from PIL import Image, ImageDraw, ImageFont
 import cv2
 import numpy as np
 from PyPDF2 import PdfReader, PdfWriter
 # ---------------- CONFIG ----------------
 # Create client with new SDK
@@ -19,279 +19,311 @@ client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
 GRID_ROWS, GRID_COLS = 20, 14
 # ---------------- PROMPTS ----------------
-PROMPTS = {
-    "QP_MS_TRANSCRIPTION" : {
-    "role": "system",
-    "content": """You are a high-quality OCR/Transcription assistant.
-INPUT: This file is a PDF that first contains the Question Paper and immediately after it the Markscheme.
-TASK:
-1. Transcribe EXACTLY all the questions FIRST (with their total marks).
-2. After ALL questions, transcribe the Markscheme exactly, preserving M/A/R notation in brackets.
-3. Always number the questions sequentially (Question 1, Question 2, Question 3, …) **in the order they appear in the PDF**, even if the PDF shows a different number or leaves it blank. Do NOT skip or leave Question: blank. Never start a question other than question 1 (even if it is labelled in pdf as 8 name it 1).
-4. If a question or sub-question is labelled with a letter (e.g., "Q1.a", "Q2(b)", "1 (c)(i)"), transcribe it as "Question 1.a", "Question 2.b", "Question 1.c.i" etc., exactly preserving the hierarchy of sub-question identifiers.
-5. After the markscheme, DETECT and FLAG all questions in the markscheme where a graph/diagram is expected. For each, output the question number and the page number in the format below.
-FORMAT:
-==== PAPER TOTAL MARKS ====
-<total marks>
-==== QUESTIONS BEGIN ====
-Question 1.a
-Total Marks: <number>
-QP: <question text>
---QUESTION-END--
-Question 1.b
-Total Marks: <number>
-QP: <question text>
---QUESTION-END--
-Question 2
-Total Marks: <number>
-QP: <question text>
---QUESTION-END--
-(repeat for all questions in order of appearance)
-==== QUESTIONS END ====
-==== MARKSCHEME BEGIN ====
-Answer 1.a:
-<exact MS for Q1.a with notations M1, A1, R1 etc>
-Answer 1.b:
-<exact MS for Q1.b with notations>
-Answer 2 :
-<exact MS for Q2 with notations>
-(repeat for all answers)
-==== MARKSCHEME END ====
-==== GRAPH EXPECTED QUESTIONS ====
-Graph expected in:
-- Question <number> → Page <number>
-(one per line)
-==== END GRAPH EXPECTED ====
-"""
-}
-,
-    "GRADING_PROMPT": {
-        "role": "system",
-        "content": """Developer: You are an official examiner. Apply the following grading rules precisely.
-### Abbreviations:
-- **M**: Marks for Method
-- **A**: Marks for Accuracy/Answer
-- **R**: Marks for Reasoning
-- **AG**: Answer given in question—no marks
-- **FT**: Follow Through marks (if error carried forward correctly)
-- **MR**: Deduct for misread (once only)
----
-## Grading Instructions
-1. Award marks using official annotations (e.g., M1, A2).
-2. Do not award full marks for answers alone; check for method marks.
-3. A marks usually require a valid M mark first.
-4. Accept valid equivalent forms unless otherwise specified.
-5. Apply FT where appropriate.
-6. Use proper notation: M1A0, A1, etc.
-7. Any lost mark: use red `<span style=\"color:red\">M0</span>` , similarly make markscheme expected , student response  and awarded marks in red include it in <span> tage
----
-## Output Format
-Produce two sections per question/sub-question, following this structure:
-## Question <id>
-### Markscheme vs Student Answer
-| Mark ID | Markscheme Expectation | Student's Response | Awarded |
-|---------|------------------------|--------------------|---------|
-| M1_1    | Recognise GP           | "r=0.9"            | M1 |
- **Total: X/Y**
----
-### Examiner's Report
-At the very end, provide a summary table:
-| Question Number | Marks | Remark |
-|-----------------|-------|--------|
-| 1               | X/Y   | A      |
-| 2               | X/Y   | B      |
-Then show total clearly as a final line:
-`Total: <obtained_marks>/<max_marks>`
-NOTES:
-- The assistant will receive two transcripts: (1) QP+MS transcript (questions then markscheme) and (2) AS transcript (student answers). Use the QP+MS transcript as the authoritative source of question wording, total marks, and verbatim markscheme entries (M/A/R mark IDs).
-- Match student answers to question IDs and grade according to the provided verbatim markscheme.
-- For questions where a graph is expected and the student attempted a graph, you will be provided with the relevant markscheme and answer sheet graph images/pages. Use these for grading those questions with visual context. For all other questions, proceed as usual.
-- Produce full markdown as above. Ensure mark IDs used in the grading are present and consistent with the markscheme.
-- give grade in remark one of the following  A : All Good   B : Silly Mistake   C : Conceptual Error   D : Hard question       E : Not Applicable
-"""
-    }
-}
 # ---------------- HELPERS ----------------
-def save_as_pdf(text, filename="output.pdf"):
     """
-    Convert markdown to PDF using markdown_pdf library.
-    Enhanced with error handling, content cleaning, and fallback mechanisms.
     """
     try:
-        print(f"📄 Starting PDF conversion for {filename}...")
-        # ============ STEP 1: Clean and prepare the text ============
-        print("🧹 Cleaning markdown content...")
-        # Remove or replace HTML tags that markdown_pdf can't handle
-        clean_text = text
-        # Replace red color spans with bold markdown (** for bold)
-        clean_text = re.sub(r'<span\s+style\s*=\s*["\']color\s*:\s*red["\']>(.*?)</span>',
-                           r'**\1**', clean_text, flags=re.IGNORECASE)
-        # Remove any other HTML tags
-        clean_text = re.sub(r'<[^>]+>', '', clean_text)
-        # Fix unicode issues
-        clean_text = clean_text.replace('\u00A0', ' ')  # Non-breaking space
-        clean_text = clean_text.replace('\u2013', '-')  # En dash
-        clean_text = clean_text.replace('\u2014', '--') # Em dash
-        clean_text = clean_text.replace('\u2019', "'")  # Right single quote
-        clean_text = clean_text.replace('\u201C', '"')  # Left double quote
-        clean_text = clean_text.replace('\u201D', '"')  # Right double quote
-        # Ensure proper line spacing for tables
-        clean_text = re.sub(r'\n\|', r'\n\n|', clean_text)
-        clean_text = re.sub(r'\|\n', r'|\n\n', clean_text)
-        # Remove excessive blank lines (more than 2)
-        clean_text = re.sub(r'\n{3,}', '\n\n', clean_text)
-        print(f"✅ Text cleaned. Length: {len(clean_text)} characters")
-        # ============ STEP 2: Save cleaned text to debug file ============
-        debug_file = filename.replace('.pdf', '_cleaned.md')
-        try:
-            with open(debug_file, 'w', encoding='utf-8') as f:
-                f.write(clean_text)
-            print(f"📝 Saved cleaned markdown to: {debug_file}")
-        except Exception as e:
-            print(f"⚠️ Warning: Could not save debug file: {e}")
-        # ============ STEP 3: Create PDF with optimal settings ============
-        print("🔧 Configuring PDF generator...")
-        # Initialize MarkdownPdf with minimal TOC
-        pdf = MarkdownPdf(toc_level=0)  # Disable table of contents
-        # Set metadata
-        pdf.meta = {
-            "title": "Grading Report",
-            "author": "AI Grading System",
-            "subject": "Student Assessment"
-        }
-        # Add the content as a section
-        print("📑 Adding content to PDF...")
-        pdf.add_section(Section(clean_text, toc=False))
-        # ============ STEP 4: Save the PDF ============
-        print(f"💾 Saving PDF to {filename}...")
-        pdf.save(filename)
-        # ============ STEP 5: Verify the PDF was created ============
-        if os.path.exists(filename):
-            file_size = os.path.getsize(filename)
-            print(f"✅ PDF created successfully!")
-            print(f"📊 File size: {file_size / 1024:.2f} KB")
-            # Check if file is suspiciously small (might indicate truncation)
-            if file_size < 10000:  # Less than 10KB
-                print(f"⚠️ Warning: PDF file is very small ({file_size} bytes)")
-                print("   This might indicate content was truncated.")
-                print("   Check the PDF file manually.")
-            return filename
         else:
-            raise FileNotFoundError(f"PDF file was not created: {filename}")
-    except Exception as e:
-        print(f"❌ PDF generation failed: {e}")
-        print(f"   Error type: {type(e).__name__}")
-        import traceback
-        traceback.print_exc()
-        # ============ FALLBACK: Save as Markdown ============
-        print("🔄 Attempting fallback: Saving as Markdown file...")
-        try:
-            md_filename = filename.replace('.pdf', '.md')
-            with open(md_filename, 'w', encoding='utf-8') as f:
-                f.write(clean_text if 'clean_text' in locals() else text)
-            print(f"✅ Saved as Markdown file: {md_filename}")
-            print("   You can manually convert this to PDF using an online tool.")
-            return md_filename
-        except Exception as fallback_error:
-            print(f"❌ Fallback also failed: {fallback_error}")
-            # ============ LAST RESORT: Save as plain text ============
             try:
-                txt_filename = filename.replace('.pdf', '.txt')
-                with open(txt_filename, 'w', encoding='utf-8') as f:
-                    f.write(text)  # Use original text
-                print(f"✅ Saved as text file: {txt_filename}")
-                return txt_filename
-            except Exception as final_error:
-                print(f"❌ All save attempts failed: {final_error}")
-                raise Exception("Could not save output in any format") from e
-def save_as_pdf_with_split(text, filename="output.pdf", max_questions=20):
-    """
-    Save as PDF, splitting into multiple files if content is too large.
-    """
-    try:
-        # First, try to save normally
-        return save_as_pdf(text, filename)
-    except Exception as e:
-        print(f"⚠️ Normal save failed, attempting to split document...")
-        # Split by questions
-        question_blocks = re.split(r'(## Question \d+(?:\.[a-z]+)?)', text)
-        if len(question_blocks) <= 3:  # Not enough to split
-            raise e
-        # Reconstruct questions with headers
-        questions = []
-        for i in range(1, len(question_blocks), 2):
-            if i+1 < len(question_blocks):
-                questions.append(question_blocks[i] + question_blocks[i+1])
-        print(f"📊 Found {len(questions)} questions to split")
-        # Split into chunks
-        chunk_size = max_questions
-        pdf_files = []
-        for chunk_idx in range(0, len(questions), chunk_size):
-            chunk = questions[chunk_idx:chunk_idx + chunk_size]
-            chunk_text = "\n\n".join(chunk)
-            # Add header and footer
-            chunk_header = f"# Grading Report - Part {chunk_idx//chunk_size + 1}\n\n"
-            chunk_text = chunk_header + chunk_text
-            # Save chunk
-            base_name = filename.replace('.pdf', '')
-            chunk_filename = f"{base_name}_part{chunk_idx//chunk_size + 1}.pdf"
-            print(f"💾 Saving part {chunk_idx//chunk_size + 1}...")
-            save_as_pdf(chunk_text, chunk_filename)
-            pdf_files.append(chunk_filename)
-        print(f"✅ Document split into {len(pdf_files)} parts:")
-        for pdf_file in pdf_files:
-            print(f"   📄 {pdf_file}")
-        return pdf_files[0]  # Return first part
 def compress_pdf(input_path, output_path=None, max_size=20*1024*1024):
     if output_path is None:
@@ -433,41 +465,81 @@ def extract_question_ids_from_qpms(text: str):
         print("⚠️ No question IDs extracted; will send NA placeholder.")
     return fallback_matches
-def build_as_prompt_with_expected_ids(expected_ids, qpms_text=None):
     """
-    Construct the AS transcription prompt injecting the expected IDs block and graph detection instructions.
     """
     if not expected_ids:
         ids_block = "{NA}"
     else:
         ids_block = "{\n" + "\n".join(expected_ids) + "\n}"
-    refer_text = ""
-    if qpms_text:
-        refer_text = (
-            "\nYou are also provided with the full transcript of the Question Paper and Markscheme (QP+MS). "
-            "If you encounter ambiguous handwriting (for example, if a number could be '-1.6' or '1.6'), refer to the QP+MS transcript to infer the student's intended answer. "
-            "However, if you are confident in your transcription, you may use your own judgment. "
-            "Always prioritize accuracy and context from the QP+MS transcript when in doubt.\n"
         )
-    prompt = f"""You are a high-quality handwritten transcription assistant.
-INPUT: This PDF contains a student's handwritten answer sheet.{refer_text}
-TASK: Transcribe the student's answers exactly (as text). Preserve step order and line breaks. Attempt to assign each answer to a question ID if the student has labelled it (e.g., "1", "1a", "2(b)", "3"). If the student hasn't labelled answers, segment contiguous answer blocks and attempt to infer question IDs from context — but mark inferred IDs clearly as "INFERRED: <id>"
-Enclose all mathematical expressions in Markdown fenced code blocks (``` triple backticks).
-If a diagram/graph is omitted, write [Graph omitted].
-Unreadable parts: [illegible].
-Unanswered: [No response].
-Do NOT recreate diagrams.
-Ensure consistency and determinism in formatting so subsequent models can grade directly from this aligned format.
 Expected questions (if missing, write NA):
 {ids_block}
 -----------------------
 OUTPUT FORMAT:
 Question <id>
-AS:
-<transcribed answer or placeholder>
-==== GRAPH FOUND ANSWERS ====\nGraph found in:\n- Answer <number> → Page <number>\n(one per line)\n==== END GRAPH FOUND ===="""
     return prompt
 def extract_graph_questions_from_ms(text: str):
     """Extract graph questions and page numbers from MS transcript."""
     clean_text = text.replace("\u00A0", " ").replace("\t", " ")
@@ -641,7 +713,7 @@ def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, expected_ids
         page_img = page.convert("RGB")
         img_cv = np.array(page_img)
         img_cv = cv2.cvtColor(img_cv, cv2.COLOR_RGB2BGR)
-        h, w, _h, w, _ = img_cv.shape
         cell_w_px, cell_h_px = w / cols, h / rows
         page_mappings = [m for m in all_mappings if m.get("page") == page_num]
@@ -701,10 +773,9 @@ def extract_pdf_pages_as_images(pdf_path, page_numbers, prefix):
     return out_paths
 # ---------------- PIPELINE ----------------
-def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
     """
     Final pipeline with graph-aware grading logic using NEW SDK.
-    Enhanced with improved PDF saving.
     """
     try:
         print("🔁 Starting pipeline...")
@@ -722,7 +793,7 @@ def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
         print("✅ Upload complete.")
         print("1.i) Transcribing QP+MS (questions first, then full markscheme, with graph detection)...")
-        qpms_prompt = PROMPTS["QP_MS_TRANSCRIPTION"]["content"] + "\nAt the end, also list all questions in the markscheme where a graph is expected, in the format:\nGraph expected in:\n- Question <number> → Page <number>\n(One per line, after ==== MARKSCHEME END ====)"
         qpms_text = gemini_generate_content(qpms_prompt, file_upload_obj=merged_uploaded)
         print("📄 QP+MS transcription received. Saving debug file: debug_qpms_transcript.txt")
         with open("debug_qpms_transcript.txt", "w", encoding="utf-8") as f:
@@ -740,7 +811,7 @@ def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
             extracted_ids = ["NA"]
         print("1.ii) Building AS transcription prompt with expected question IDs and graph detection, sending to Gemini...")
-        as_prompt = build_as_prompt_with_expected_ids(extracted_ids, qpms_text) + "\nAt the end, also list all answers where a graph is found, in the format:\nGraph found in:\n- Answer <number> → Page <number>\n(One per line, after all answers)"
         as_text = gemini_generate_content(as_prompt, file_upload_obj=ans_uploaded)
         print("📝 AS transcription received. Saving debug file: debug_as_transcript.txt")
         with open("debug_as_transcript.txt", "w", encoding="utf-8") as f:
@@ -765,7 +836,8 @@ def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
         if ms_graph_images or as_graph_images:
             graph_note = "\n\n---\nSome questions require graphs. I've attached the relevant graph pages from QP+MS and from the Answer Sheet. Use them as visual context when grading.\n---\n"
             grading_input += graph_note
-        grading_prompt_system = PROMPTS["GRADING_PROMPT"]["content"]
         grading_images = ms_graph_images + as_graph_images
         grading_text = gemini_generate_content(grading_prompt_system + "\n\nPlease grade the following transcripts:\n" + grading_input, image_obj=grading_images if grading_images else None)
         print("🧾 Grading output received. Saving debug file: debug_grading.md")
@@ -773,35 +845,8 @@ def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
             f.write(grading_text)
         base_name = os.path.splitext(os.path.basename(ans_path))[0]
-        # ============ ENHANCED PDF SAVING WITH ERROR HANDLING ============
-        grading_pdf_path = f"{base_name}_graded.pdf"
-        print("📄 Attempting to save grading report as PDF...")
-        try:
-            # Try normal save first
-            grading_pdf_path = save_as_pdf(grading_text, grading_pdf_path)
-            print("✅ Grading PDF saved successfully:", grading_pdf_path)
-        except Exception as pdf_error:
-            print(f"⚠️ Standard PDF save failed: {pdf_error}")
-            print("🔄 Trying split document method...")
-            try:
-                # Try split method
-                grading_pdf_path = save_as_pdf_with_split(grading_text, grading_pdf_path)
-                print("✅ Grading PDF saved (split method):", grading_pdf_path)
-            except Exception as split_error:
-                print(f"⚠️ Split method also failed: {split_error}")
-                print("💾 Saving as Markdown fallback...")
-                # Fallback to markdown
-                grading_pdf_path = grading_pdf_path.replace('.pdf', '.md')
-                with open(grading_pdf_path, 'w', encoding='utf-8') as f:
-                    f.write(grading_text)
-                print(f"✅ Saved as Markdown file: {grading_pdf_path}")
-                print("ℹ️ You can convert this .md file to PDF using online tools or pandoc")
         grading_json = extract_marks_from_grading(grading_text)
         with open("debug_grading_json.json", "w", encoding="utf-8") as f:
@@ -812,14 +857,8 @@ def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
         if imprint:
             print("✍ Imprint option enabled. Starting imprinting process...")
             imprinted_pdf_path = f"{base_name}_imprinted.pdf"
-            try:
-                imprinted_pdf_path = imprint_marks_using_mapping(ans_path, grading_json, imprinted_pdf_path, extracted_ids)
-                print("✅ Imprinting finished. Imprinted PDF at:", imprinted_pdf_path)
-            except Exception as imprint_error:
-                print(f"❌ Imprinting failed: {imprint_error}")
-                import traceback
-                traceback.print_exc()
-                imprinted_pdf_path = None
         print("🏁 Pipeline finished successfully.")
         return qpms_text, as_text, grading_text, grading_pdf_path, imprinted_pdf_path
@@ -831,105 +870,53 @@ def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
         return f"❌ Error: {e}", None, None, None, None
 # ---------------- GRADIO UI ----------------
-with gr.Blocks(title="AI Grading System - Enhanced", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 📘 AI Grading System - Enhanced Version")
-    gr.Markdown("**✅ Using official `google-genai` SDK with improved PDF generation**")
-    gr.Markdown("---")
     with gr.Row():
-        with gr.Column():
-            qp_file = gr.File(label="📄 Upload Question Paper (PDF)", file_types=[".pdf"])
-        with gr.Column():
-            ms_file = gr.File(label="📄 Upload Markscheme (PDF)", file_types=[".pdf"])
-        with gr.Column():
-            ans_file = gr.File(label="📝 Upload Student Answer Sheet (PDF)", file_types=[".pdf"])
     with gr.Row():
-        imprint_toggle = gr.Checkbox(label="✍ Imprint Marks on Student Answer Sheet", value=False)
-        run_button = gr.Button("🚀 Run Grading Pipeline", variant="primary", size="lg")
-    gr.Markdown("---")
-    gr.Markdown("### 📊 Transcription Outputs")
     with gr.Row():
-        qpms_box = gr.Textbox(label="📑 QP+MS Transcript", lines=12, max_lines=20)
-        as_box = gr.Textbox(label="📝 AS Transcript", lines=12, max_lines=20)
-    gr.Markdown("---")
-    gr.Markdown("### 🎯 Grading Results")
-    grading_output_box = gr.Textbox(label="🧾 Grading Report (Markdown)", lines=20, max_lines=30)
-    with gr.Row():
-        grading_pdf_file = gr.File(label="📥 Download Grading Report (PDF/MD)")
-        imprint_pdf_file = gr.File(label="📥 Download Imprinted Answer Sheet (Optional)")
-    gr.Markdown("---")
-    gr.Markdown("""
-    ### 📝 Instructions:
-    1. Upload all three PDF files (Question Paper, Markscheme, Answer Sheet)
-    2. Optionally enable mark imprinting on the answer sheet
-    3. Click "Run Grading Pipeline" and wait for processing
-    4. Review transcripts and download the grading report
-    ### ⚠️ Notes:
-    - Large documents may take several minutes to process
-    - If PDF generation fails, a Markdown (.md) file will be provided instead
-    - Check the console/logs for detailed progress information
-    - Debug files are saved automatically for troubleshooting
-    """)
-    def run_pipeline(qp_file_obj, ms_file_obj, ans_file_obj, imprint_flag):
-        """
-        Wrapper function for Gradio interface
-        """
         if not qp_file_obj or not ms_file_obj or not ans_file_obj:
-            error_msg = "❌ Please upload all three files (QP, MS, and Answer Sheet)"
-            return error_msg, "", "", None, None
         qp_path = qp_file_obj.name
         ms_path = ms_file_obj.name
         ans_path = ans_file_obj.name
-        print("\n" + "="*80)
-        print("🎬 STARTING NEW GRADING SESSION")
-        print("="*80 + "\n")
         qpms_text, as_text, grading_text, grading_pdf_path, imprinted_pdf_path = align_and_grade_pipeline(
-            qp_path, ms_path, ans_path, imprint=imprint_flag
         )
-        print("\n" + "="*80)
-        print("🎬 GRADING SESSION COMPLETE")
-        print("="*80 + "\n")
         return qpms_text or "", as_text or "", grading_text or "", grading_pdf_path, imprinted_pdf_path
     run_button.click(
         fn=run_pipeline,
-        inputs=[qp_file, ms_file, ans_file, imprint_toggle],
         outputs=[qpms_box, as_box, grading_output_box, grading_pdf_file, imprint_pdf_file]
     )
 if __name__ == "__main__":
-    print("="*80)
-    print("🚀 AI GRADING SYSTEM - STARTING")
-    print("="*80)
-    print("📌 Make sure GEMINI_API_KEY environment variable is set")
-    print("📌 Required dependencies: google-genai, markdown_pdf, gradio, pdf2image, etc.")
-    print("="*80 + "\n")
-    # Check if API key is set
-    if not os.getenv("GEMINI_API_KEY"):
-        print("⚠️  WARNING: GEMINI_API_KEY not found in environment variables!")
-        print("   Set it with: export GEMINI_API_KEY='your-api-key-here'")
-    else:
-        print("✅ GEMINI_API_KEY found")
-    print("\n🌐 Launching Gradio interface...\n")
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=False,
-        show_error=True
-    )

 import img2pdf
 import gradio as gr
 from google import genai  # NEW SDK
 from pdf2image import convert_from_path
 from PIL import Image, ImageDraw, ImageFont
 import cv2
 import numpy as np
 from PyPDF2 import PdfReader, PdfWriter
+from prompts import QP_MS_TRANSCRIPTION_PROMPT, get_grading_prompt
 # ---------------- CONFIG ----------------
 # Create client with new SDK
 GRID_ROWS, GRID_COLS = 20, 14
 # ---------------- PROMPTS ----------------
+# Prompts are now imported from prompts.py
 # ---------------- HELPERS ----------------
def parse_md_table(md):
    """Parse a Markdown pipe table into a list of data rows.

    The first two non-blank lines are taken to be the header row and the
    ``|---|---|`` separator row and are discarded; every following non-blank
    line is split on ``|`` into stripped cell strings.

    Empty cells are preserved (as ``""``) so that column positions stay
    aligned with the header. Rows whose cells are all empty are skipped.

    Args:
        md (str): Markdown table text.

    Returns:
        list: One list of cell strings per data row; ``[]`` when the input
        has fewer than three non-blank lines (i.e. no data rows).
    """
    lines = [line.strip() for line in md.split("\n") if line.strip()]
    if len(lines) < 3:
        return []

    rows = []
    for line in lines[2:]:  # skip header + separator
        # Remove only the single outer pipe on each side. Dropping empty
        # cells instead would shift later columns left whenever an interior
        # cell is blank, e.g. "| 1 | 5/5 |  | ok |".
        if line.startswith("|"):
            line = line[1:]
        if line.endswith("|"):
            line = line[:-1]
        cells = [cell.strip() for cell in line.split("|")]
        if any(cells):
            rows.append(cells)
    return rows
def convert_html_color_spans(md_text):
    """Convert ``<span style="color:...">text</span>`` to ``\\textcolor``.

    Each matching span is rewritten as ``\\textcolor{<color>}{<text>}``.
    Matching is case-insensitive, accepts single- or double-quoted ``style``
    attributes, tolerates whitespace around ``=`` and ``:``, and uses only the
    ``color`` value: a trailing ``;`` or further CSS declarations are dropped
    so they cannot end up inside the LaTeX colour name.

    The span content must sit on one line (``.`` does not match newlines).
    The colour value is passed through verbatim.
    # NOTE(review): CSS hex colours such as "#ff0000" are presumably not valid
    # \\textcolor names without extra xcolor syntax -- confirm if they can occur.

    Args:
        md_text (str): Markdown text possibly containing HTML colour spans.

    Returns:
        str: The text with every matching span replaced.
    """
    span_re = re.compile(
        r'<span\s+style\s*=\s*(["\'])\s*color\s*:\s*'
        r'([^;"\']+)'            # colour value, up to ';' or the closing quote
        r'(?:;[^"\']*)?\1\s*>'   # optional remaining declarations, same quote
        r'\s*(.*?)\s*</span>',
        re.IGNORECASE,
    )

    def to_textcolor(match):
        color = match.group(2).strip()
        body = match.group(3)
        return fr'\textcolor{{{color}}}{{{body}}}'

    return span_re.sub(to_textcolor, md_text)
def cleanup_markdown_for_latex(md_text):
    """Clean up Markdown text so it converts to LaTeX more reliably.

    Two things happen:

    1. A blank line is inserted between a bold
       ``**Markscheme vs Student Answer**`` header and a table that follows
       it directly.
    2. Common Unicode math symbols are replaced by LaTeX commands. Outside
       math the command is wrapped as ``$...$``; inside an existing
       ``$...$`` / ``$$...$$`` span the bare command is inserted instead, so
       no nested dollar signs are produced.

    Args:
        md_text (str): Markdown text to clean.

    Returns:
        str: The cleaned text.
    """
    # Ensure spacing between bold headers and tables
    md_text = re.sub(
        r'(\*\*Markscheme vs Student Answer\*\*)\s*(\|)', r'\1\n\n\2', md_text
    )

    # re.split with one capturing group alternates: even index = plain text,
    # odd index = a math span (delimiters included).
    pieces = _MATH_SPAN_RE.split(md_text)
    for index, piece in enumerate(pieces):
        if index % 2:
            pieces[index] = _replace_symbols_in_math(piece)
        else:
            pieces[index] = _UNICODE_SYMBOL_RE.sub(
                lambda m: f'${_UNICODE_TO_LATEX[m.group(0)]}$', piece
            )
    return ''.join(pieces)


# Unicode math symbol -> LaTeX command (safety net for raw symbols in text).
# No value ends in a space: Pandoc only treats "$...$" as math when the
# closing "$" is not preceded by whitespace. The radical maps to \surd because
# \sqrt requires an argument and "$\sqrt$" does not compile.
_UNICODE_TO_LATEX = {
    '∫': r'\int',
    '²': '^2',
    '³': '^3',
    '½': r'\frac{1}{2}',
    '¼': r'\frac{1}{4}',
    '∞': r'\infty',
    '≤': r'\leq',
    '≥': r'\geq',
    '≠': r'\neq',
    '±': r'\pm',
    '×': r'\times',
    '÷': r'\div',
    '√': r'\surd',
    '∑': r'\sum',
    '∏': r'\prod',
    '∂': r'\partial',
    'π': r'\pi',
    'θ': r'\theta',
    'α': r'\alpha',
    'β': r'\beta',
    'γ': r'\gamma',
    'δ': r'\delta',
    'ε': r'\epsilon',
    'λ': r'\lambda',
    'μ': r'\mu',
    'σ': r'\sigma',
    'Δ': r'\Delta',
    'Σ': r'\Sigma',
    'Ω': r'\Omega',
}

# Matches any single symbol from the table above.
_UNICODE_SYMBOL_RE = re.compile(
    '[' + ''.join(re.escape(ch) for ch in _UNICODE_TO_LATEX) + ']'
)

# Display math ($$...$$) or inline math ($...$). The inline form mirrors
# Pandoc's rule: no space right after the opening "$", no space right before
# the closing "$", and no digit right after the closing "$".
_MATH_SPAN_RE = re.compile(
    r'(\$\$.+?\$\$|\$(?!\s)[^$\n]+?(?<!\s)\$(?!\d))', re.DOTALL
)


def _replace_symbols_in_math(segment):
    """Replace Unicode symbols inside an existing math span with bare LaTeX."""

    def to_latex(match):
        latex = _UNICODE_TO_LATEX[match.group(0)]
        following = segment[match.end():match.end() + 1]
        # A control word directly followed by a letter would merge with it
        # ("\pi" + "r" -> "\pir"), so separate the two with a space.
        if latex[-1].isalpha() and following.isalpha():
            return latex + ' '
        return latex

    return _UNICODE_SYMBOL_RE.sub(to_latex, segment)
def escape_latex_special_chars(text):
    """Escape LaTeX special characters in plain text.

    Text that contains a ``$`` or a backslash is returned unchanged: it is
    assumed to already be math or LaTeX markup, and escaping it would break
    that markup. Otherwise ``% & # _ { } ~ ^`` are replaced by their escaped
    LaTeX forms.

    Args:
        text (str): Text to escape.

    Returns:
        str: The escaped text, or ``text`` itself when it contains ``$`` or
        a backslash.
    """
    # Don't escape if already in math mode or LaTeX command
    if '$' in text or '\\' in text:
        return text

    # One pass over the input: the braces inside "\textasciitilde{}" and
    # "\textasciicircum{}" are never re-escaped, regardless of table order.
    escape_table = str.maketrans({
        '%': r'\%',
        '&': r'\&',
        '#': r'\#',
        '_': r'\_',
        '{': r'\{',
        '}': r'\}',
        '~': r'\textasciitilde{}',
        '^': r'\textasciicircum{}',
    })
    return text.translate(escape_table)
+def save_as_pdf(text, filename="output.pdf"):
     """
+    Convert Markdown text to PDF using Pandoc with pdflatex.
+    Extracts the Examiner's Summary Report and places it at the top with enhanced formatting.
+    Converts HTML color spans to LaTeX textcolor commands.
+    Args:
+        text (str): Markdown content to convert
+        filename (str): Output PDF filename
+    Returns:
+        str: Path to the generated PDF file
+    Raises:
+        Exception: If Pandoc or pdflatex is not available, or conversion fails
     """
+    base_name = os.path.splitext(filename)[0]
+    temp_md_file = f"{base_name}_input.md"
+    temp_tex_file = f"{base_name}_temp.tex"
     try:
+        print(f"📝 Processing markdown for PDF generation...")
+        # Step 1: Extract Summary Report Table
+        summary_pattern = re.compile(
+            r"### Examiner's Summary Report\s*\n\n(\|.*?\|)\s*\n\n\*\*Total:\s*(.*?)\*\*",
+            re.DOTALL
+        )
+        summary_match = summary_pattern.search(text)
+        if summary_match:
+            summary_table_md = summary_match.group(1)
+            summary_total = summary_match.group(2)
+            # Remove summary section from markdown
+            text = summary_pattern.sub("", text)
+            print("✅ Extracted Examiner's Summary Report")
         else:
+            summary_table_md = ""
+            summary_total = ""
+            print("⚠️ No Examiner's Summary Report found")
+        # Step 2: Clean up markdown and convert HTML color spans to LaTeX
+        text = cleanup_markdown_for_latex(text)
+        text = convert_html_color_spans(text)
+        print("✅ Cleaned markdown and converted HTML color spans to LaTeX")
+        # Save cleaned markdown
+        with open(temp_md_file, 'w', encoding='utf-8') as f:
+            f.write(text)
+        # Step 3: Convert MD to LaTeX via Pandoc
+        print(f"📝 Converting markdown to LaTeX using Pandoc...")
+        pandoc_cmd = [
+            "pandoc",
+            "--from=markdown",
+            "--to=latex",
+            "--standalone",
+            temp_md_file,
+            "-o", temp_tex_file
+        ]
+        result = subprocess.run(pandoc_cmd, capture_output=True, check=False)
+        if result.returncode != 0 or not os.path.exists(temp_tex_file):
+            try:
+                stderr = result.stderr.decode('utf-8', errors='replace')
+            except:
+                stderr = str(result.stderr)
+            raise Exception(f"Pandoc conversion failed: {stderr}")
+        print("✅ Pandoc conversion complete")
+        # Step 4: Modify the generated LaTeX
+        with open(temp_tex_file, "r", encoding="utf-8") as f:
+            tex = f.read()
+        # Change document class to larger font
+        tex = tex.replace(
+            r"\documentclass{article}",
+            r"\documentclass[12pt]{extarticle}"
+        )
+        # Inject enhanced packages with better table formatting
+        insert_packages = r"""\usepackage[a4paper, margin=1in]{geometry}
+\usepackage{xcolor}
+\usepackage{colortbl}
+\usepackage{booktabs}
+\usepackage{array}
+\usepackage{longtable}
+\renewcommand{\arraystretch}{1.4}
+\newcolumntype{L}[1]{>{\raggedright\arraybackslash}p{#1}}"""
+        tex = tex.replace(r"\begin{document}", insert_packages + "\n\\begin{document}")
+        # Step 5: Build enhanced LaTeX table for summary with zebra striping (if exists)
+        if summary_table_md:
+            summary_rows = parse_md_table(summary_table_md)
+            summary_latex = r"""\section*{Examiner's Summary Report}
+\begin{center}
+\rowcolors{2}{gray!10}{white}
+\begin{tabular}{|c|c|c|L{8cm}|}
+\hline
+\rowcolor{gray!30}
+\textbf{Question} & \textbf{Marks} & \textbf{Remark} & \textbf{Feedback} \\ \hline
+"""
+            for row in summary_rows:
+                if len(row) >= 4:
+                    # Escape special LaTeX characters in feedback
+                    feedback = row[3]
+                    # Only escape if not already LaTeX code
+                    if not ('$' in feedback or '\\textcolor' in feedback):
+                        feedback = feedback.replace('%', r'\%').replace('&', r'\&').replace('#', r'\#')
+                    summary_latex += f"{row[0]} & {row[1]} & {row[2]} & {feedback} \\\\ \\hline\n"
+            summary_latex += r"\end{tabular}"
+            summary_latex += "\n\\end{center}\n\n"
+            summary_latex += f"\\vspace{{0.5cm}}\\noindent\\textbf{{\\Large Overall Score: {summary_total}}}\n\n"
+            summary_latex += "\\hrulefill\n\\vspace{1cm}\n\n"
+            summary_latex += "\\newpage\n\n"
+            # Insert summary right after \begin{document}
+            tex = tex.replace(
+                r"\begin{document}",
+                r"\begin{document}" + "\n\n" + summary_latex
+            )
+            print("✅ Injected enhanced summary table with zebra striping at top of document")
+        # Save modified LaTeX
+        with open(temp_tex_file, "w", encoding="utf-8") as f:
+            f.write(tex)
+        # Step 6: Compile PDF with pdflatex
+        print(f"📝 Compiling PDF with pdflatex...")
+        pdflatex_cmd = [
+            "pdflatex",
+            "-interaction=nonstopmode",
+            f"-output-directory={os.path.dirname(os.path.abspath(temp_tex_file)) or '.'}",
+            temp_tex_file
+        ]
+        # Run twice to resolve references
+        # Don't use text=True to avoid encoding issues with pdflatex output
+        result1 = subprocess.run(pdflatex_cmd, capture_output=True, check=False)
+        result2 = subprocess.run(pdflatex_cmd, capture_output=True, check=False)
+        # Check if PDF was actually created (better than checking return code)
+        temp_pdf = temp_tex_file.replace(".tex", ".pdf")
+        if not os.path.exists(temp_pdf):
+            # Try to decode error output for debugging
             try:
+                stderr = result2.stderr.decode('utf-8', errors='replace')
+            except:
+                stderr = str(result2.stderr)
+            # Also check log file for more details
+            log_file = temp_tex_file.replace(".tex", ".log")
+            if os.path.exists(log_file):
+                try:
+                    with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
+                        log_content = f.read()
+                        # Extract error lines
+                        error_lines = [line for line in log_content.split('\n') if '!' in line]
+                        if error_lines:
+                            stderr += "\n\nLaTeX Errors:\n" + "\n".join(error_lines[:10])
+                except:
+                    pass
+            raise Exception(f"pdflatex failed to create PDF. Check LaTeX syntax. Error: {stderr[:1000]}")
+        # Move output PDF to final filename
+        if os.path.exists(temp_pdf):
+            if os.path.exists(filename):
+                os.remove(filename)
+            os.rename(temp_pdf, filename)
+        print(f"✅ PDF generated successfully: {filename}")
+        # Clean up temporary files
+        for ext in [".md", ".tex", ".aux", ".log", ".out"]:
+            temp_file = base_name + ext
+            if os.path.exists(temp_file):
+                os.remove(temp_file)
+            # Also clean input/temp variants
+            for prefix in ["_input", "_temp"]:
+                temp_file = base_name + prefix + ext
+                if os.path.exists(temp_file):
+                    os.remove(temp_file)
+        return filename
+    except subprocess.CalledProcessError as e:
+        print(f"❌ Conversion failed: {e}")
+        print(f"   STDOUT: {e.stdout}")
+        print(f"   STDERR: {e.stderr}")
+        raise Exception(f"PDF conversion failed: {e.stderr}")
+    except FileNotFoundError as e:
+        print(f"❌ Required tool not found: {e}")
+        raise Exception(
+            "Pandoc or pdflatex not found. Please install:\n"
+            "  - pandoc\n"
+            "  - texlive (or MiKTeX on Windows)\n"
+            "  - texlive-latex-extra (for extarticle class)"
+        )
+    except Exception as e:
+        print(f"❌ Unexpected error during PDF conversion: {e}")
+        import traceback
+        traceback.print_exc()
+        raise
 def compress_pdf(input_path, output_path=None, max_size=20*1024*1024):
     if output_path is None:
         print("⚠️ No question IDs extracted; will send NA placeholder.")
     return fallback_matches
def build_as_cot_prompt_with_expected_ids(expected_ids, qpms_text=None):
    """Build the chain-of-thought answer-sheet (AS) transcription prompt.

    Args:
        expected_ids: Iterable of question-ID strings to list in the prompt.
            A falsy value (``None`` or an empty list) yields the ``{NA}``
            placeholder block instead.
        qpms_text: Optional Question Paper + Markscheme transcript. When it is
            not ``None`` it is stripped of surrounding whitespace and embedded
            between BEGIN/END marker lines; when ``None`` no transcript
            section is added.

    Returns:
        The prompt string. It asks for a ``<think>`` block before each
        answer, ``$...$`` delimiters around maths, the NA / [No response]
        placeholders, and a trailing "GRAPH FOUND ANSWERS" section.
    """
    # Expected-ID block: one ID per line wrapped in braces, or the NA marker.
    if expected_ids:
        joined_ids = "\n".join(expected_ids)
        ids_block = f"{{\n{joined_ids}\n}}"
    else:
        ids_block = "{NA}"

    # Optional QP+MS context section; stays empty when no transcript is given.
    if qpms_text is None:
        qpms_section = ""
    else:
        transcript = qpms_text.strip()
        qpms_section = "".join(
            [
                "\nYou are also provided with the full transcript of the Question Paper and Markscheme (QP+MS) below.",
                "\nUse it primarily to resolve ambiguous handwriting and to confirm expected answers when needed.",
                "\n--- BEGIN QP+MS TRANSCRIPT ---\n",
                transcript + "\n",
                "--- END QP+MS TRANSCRIPT ---\n",
            ]
        )

    # The prompt body is runtime text: only the two placeholders are interpolated.
    return f"""You are a high-quality handwritten transcription assistant, performing transcription with a Chain-of-Thought process.
INPUT: This PDF contains a student's handwritten answer sheet.
{qpms_section}
TASK:
1. **THINKING:** Before transcribing each answer, document your thought process inside a **<think>** tag.
    - Identify the question ID. If inferred, note why.
    - Detail any ambiguities (unclear numbers, symbols, or structures).
    - Explain how ambiguities were resolved, including whether the QP+MS transcript was consulted.
    - If QP+MS was consulted but you chose not to change the transcription, state this.
    - If the initial question label was incorrect (e.g., 2.a vs 2.b), correct it and briefly explain the reasoning in <think>.
    *Example Thinking:*
    <think>
    - Found Question 3(a).
    - The term could be '$2x$' or '21x'.
    - Markscheme uses '$21x$', but handwriting matches '$2x$'.
    - Decision: transcribe '$2x$'.
    </think>
2. **TRANSCRIPTION:** Transcribe the student's answers directly and faithfully.
    - Assign each answer to a labelled question ID when present.
    - For unlabeled answers, segment logically and mark inferred IDs as "**INFERRED: <id>**".
    - **Mathematical expressions and standalone variables must appear inside LaTeX dollar delimiters ($...$).**
    - If a diagram/graph is omitted, write **[Graph omitted]**.
    - If handwriting is unreadable: **[illegible]**.
    **ANSWER-INTERPRETATION RULES:**
    - If the student writes “NA”, “N/A”, “Not Applicable”, or clear equivalents → record exactly as **NA**.
    - If the student leaves the space blank, crosses it out, makes no meaningful attempt, or provides no answer → record **[No response]**.
Ensure deterministic formatting so subsequent models can grade directly from this aligned format.
Expected questions (if missing, write NA):
{ids_block}
-----------------------
OUTPUT FORMAT:
<think>...</think>
Question <id>
AS:<transcribed answer or placeholder>
<think>...</think>
Question <id>
AS:<transcribed answer or placeholder>
...
==== GRAPH FOUND ANSWERS ====
Graph found in:
- Answer <number> → Page <number>
(one per line)
==== END GRAPH FOUND ===="""
 def extract_graph_questions_from_ms(text: str):
     """Extract graph questions and page numbers from MS transcript."""
     clean_text = text.replace("\u00A0", " ").replace("\t", " ")
         page_img = page.convert("RGB")
         img_cv = np.array(page_img)
         img_cv = cv2.cvtColor(img_cv, cv2.COLOR_RGB2BGR)
+        h, w, _ = img_cv.shape
         cell_w_px, cell_h_px = w / cols, h / rows
         page_mappings = [m for m in all_mappings if m.get("page") == page_num]
     return out_paths
 # ---------------- PIPELINE ----------------
+def align_and_grade_pipeline(qp_path, ms_path, ans_path, subject="Maths", imprint=False):
     """
     Final pipeline with graph-aware grading logic using NEW SDK.
     """
     try:
         print("🔁 Starting pipeline...")
         print("✅ Upload complete.")
         print("1.i) Transcribing QP+MS (questions first, then full markscheme, with graph detection)...")
+        qpms_prompt = QP_MS_TRANSCRIPTION_PROMPT["content"] + "\nAt the end, also list all questions in the markscheme where a graph is expected, in the format:\nGraph expected in:\n- Question <number> → Page <number>\n(One per line, after ==== MARKSCHEME END ====)"
         qpms_text = gemini_generate_content(qpms_prompt, file_upload_obj=merged_uploaded)
         print("📄 QP+MS transcription received. Saving debug file: debug_qpms_transcript.txt")
         with open("debug_qpms_transcript.txt", "w", encoding="utf-8") as f:
             extracted_ids = ["NA"]
         print("1.ii) Building AS transcription prompt with expected question IDs and graph detection, sending to Gemini...")
+        as_prompt = build_as_cot_prompt_with_expected_ids(extracted_ids, qpms_text) + "\nAt the end, also list all answers where a graph is found, in the format:\nGraph found in:\n- Answer <number> → Page <number>\n(One per line, after all answers)"
         as_text = gemini_generate_content(as_prompt, file_upload_obj=ans_uploaded)
         print("📝 AS transcription received. Saving debug file: debug_as_transcript.txt")
         with open("debug_as_transcript.txt", "w", encoding="utf-8") as f:
         if ms_graph_images or as_graph_images:
             graph_note = "\n\n---\nSome questions require graphs. I've attached the relevant graph pages from QP+MS and from the Answer Sheet. Use them as visual context when grading.\n---\n"
             grading_input += graph_note
+        grading_prompt_obj = get_grading_prompt(subject.lower())
+        grading_prompt_system = grading_prompt_obj["content"]
         grading_images = ms_graph_images + as_graph_images
         grading_text = gemini_generate_content(grading_prompt_system + "\n\nPlease grade the following transcripts:\n" + grading_input, image_obj=grading_images if grading_images else None)
         print("🧾 Grading output received. Saving debug file: debug_grading.md")
             f.write(grading_text)
         base_name = os.path.splitext(os.path.basename(ans_path))[0]
+        grading_pdf_path = save_as_pdf(grading_text, f"{base_name}_graded.pdf")
+        print("📄 Grading PDF saved:", grading_pdf_path)
         grading_json = extract_marks_from_grading(grading_text)
         with open("debug_grading_json.json", "w", encoding="utf-8") as f:
         if imprint:
             print("✍ Imprint option enabled. Starting imprinting process...")
             imprinted_pdf_path = f"{base_name}_imprinted.pdf"
+            imprinted_pdf_path = imprint_marks_using_mapping(ans_path, grading_json, imprinted_pdf_path, extracted_ids)
+            print("✅ Imprinting finished. Imprinted PDF at:", imprinted_pdf_path)
         print("🏁 Pipeline finished successfully.")
         return qpms_text, as_text, grading_text, grading_pdf_path, imprinted_pdf_path
         return f"❌ Error: {e}", None, None, None, None
 # ---------------- GRADIO UI ----------------
# Gradio front end: three PDF uploads plus grading options go in; the two
# transcripts, the grading markdown and the generated PDFs come out.
with gr.Blocks(title="AI Grading (Pandoc + pdflatex)") as demo:
    gr.Markdown("## 📘 AI Grading — Using Pandoc + pdflatex for PDF Generation")
    gr.Markdown("**✅ Now using Pandoc with pdflatex for professional-quality PDF outputs!**")

    # --- Inputs: the three source documents ---
    with gr.Row():
        qp_file = gr.File(label="📄 Upload Question Paper (PDF)")
        ms_file = gr.File(label="📄 Upload Markscheme (PDF)")
        ans_file = gr.File(label="📝 Upload Student Answer Sheet (PDF)")

    # --- Inputs: grading options ---
    with gr.Row():
        subject_dropdown = gr.Dropdown(
            choices=["Maths", "Science"],
            value="Maths",
            label="📚 Subject",
            info="Select the subject to apply appropriate grading guidelines",
        )
        imprint_toggle = gr.Checkbox(label="✍ Imprint Marks on Student Answer Sheet", value=False)

    run_button = gr.Button("🚀 Run Pipeline")

    # --- Outputs: transcripts, grading text and downloadable PDFs ---
    with gr.Row():
        qpms_box = gr.Textbox(label="📑 QP+MS Transcript", lines=12)
        as_box = gr.Textbox(label="📝 AS Transcript", lines=12)
    grading_output_box = gr.Textbox(label="🧾 Grading (Markdown)", lines=20)
    grading_pdf_file = gr.File(label="📥 Download Grading PDF")
    imprint_pdf_file = gr.File(label="📥 Download Imprinted PDF (Optional)")

    def run_pipeline(qp_file_obj, ms_file_obj, ans_file_obj, subject_choice, imprint_flag):
        # Click handler: checks that all three uploads are present, forwards
        # their `.name` paths to align_and_grade_pipeline, and returns a
        # 5-tuple matching the `outputs` list wired to the button below.
        uploads = (qp_file_obj, ms_file_obj, ans_file_obj)
        if not all(uploads):
            return "❌ Please upload all three files", "", "", None, None

        qp_path, ms_path, ans_path = (upload.name for upload in uploads)
        results = align_and_grade_pipeline(
            qp_path, ms_path, ans_path, subject=subject_choice, imprint=imprint_flag
        )
        qpms_text, as_text, grading_text, grading_pdf_path, imprinted_pdf_path = results

        # Falsy text results are replaced with "" before being returned.
        text_outputs = tuple(text or "" for text in (qpms_text, as_text, grading_text))
        return (*text_outputs, grading_pdf_path, imprinted_pdf_path)

    run_button.click(
        fn=run_pipeline,
        inputs=[qp_file, ms_file, ans_file, subject_dropdown, imprint_toggle],
        outputs=[qpms_box, as_box, grading_output_box, grading_pdf_file, imprint_pdf_file],
    )

if __name__ == "__main__":
    demo.launch()