Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -6,12 +6,12 @@ import time
|
|
| 6 |
import img2pdf
|
| 7 |
import gradio as gr
|
| 8 |
from google import genai # NEW SDK
|
| 9 |
-
from markdown_pdf import MarkdownPdf, Section
|
| 10 |
from pdf2image import convert_from_path
|
| 11 |
from PIL import Image, ImageDraw, ImageFont
|
| 12 |
import cv2
|
| 13 |
import numpy as np
|
| 14 |
from PyPDF2 import PdfReader, PdfWriter
|
|
|
|
| 15 |
|
| 16 |
# ---------------- CONFIG ----------------
|
| 17 |
# Create client with new SDK
|
|
@@ -19,279 +19,311 @@ client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
|
|
| 19 |
GRID_ROWS, GRID_COLS = 20, 14
|
| 20 |
|
| 21 |
# ---------------- PROMPTS ----------------
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
"content": """You are a high-quality OCR/Transcription assistant.
|
| 26 |
-
INPUT: This file is a PDF that first contains the Question Paper and immediately after it the Markscheme.
|
| 27 |
-
TASK:
|
| 28 |
-
1. Transcribe EXACTLY all the questions FIRST (with their total marks).
|
| 29 |
-
2. After ALL questions, transcribe the Markscheme exactly, preserving M/A/R notation in brackets.
|
| 30 |
-
3. Always number the questions sequentially (Question 1, Question 2, Question 3, β¦) **in the order they appear in the PDF**, even if the PDF shows a different number or leaves it blank. Do NOT skip or leave Question: blank. Never start a question other than question 1 (even if it is labelled in pdf as 8 name it 1).
|
| 31 |
-
4. If a question or sub-question is labelled with a letter (e.g., "Q1.a", "Q2(b)", "1 (c)(i)"), transcribe it as "Question 1.a", "Question 2.b", "Question 1.c.i" etc., exactly preserving the hierarchy of sub-question identifiers.
|
| 32 |
-
5. After the markscheme, DETECT and FLAG all questions in the markscheme where a graph/diagram is expected. For each, output the question number and the page number in the format below.
|
| 33 |
-
|
| 34 |
-
FORMAT:
|
| 35 |
-
==== PAPER TOTAL MARKS ====
|
| 36 |
-
<total marks>
|
| 37 |
-
|
| 38 |
-
==== QUESTIONS BEGIN ====
|
| 39 |
-
Question 1.a
|
| 40 |
-
Total Marks: <number>
|
| 41 |
-
QP: <question text>
|
| 42 |
-
--QUESTION-END--
|
| 43 |
-
|
| 44 |
-
Question 1.b
|
| 45 |
-
Total Marks: <number>
|
| 46 |
-
QP: <question text>
|
| 47 |
-
--QUESTION-END--
|
| 48 |
-
|
| 49 |
-
Question 2
|
| 50 |
-
Total Marks: <number>
|
| 51 |
-
QP: <question text>
|
| 52 |
-
--QUESTION-END--
|
| 53 |
-
|
| 54 |
-
(repeat for all questions in order of appearance)
|
| 55 |
-
|
| 56 |
-
==== QUESTIONS END ====
|
| 57 |
-
|
| 58 |
-
==== MARKSCHEME BEGIN ====
|
| 59 |
-
Answer 1.a:
|
| 60 |
-
<exact MS for Q1.a with notations M1, A1, R1 etc>
|
| 61 |
-
|
| 62 |
-
Answer 1.b:
|
| 63 |
-
<exact MS for Q1.b with notations>
|
| 64 |
-
|
| 65 |
-
Answer 2 :
|
| 66 |
-
<exact MS for Q2 with notations>
|
| 67 |
-
|
| 68 |
-
(repeat for all answers)
|
| 69 |
-
|
| 70 |
-
==== MARKSCHEME END ====
|
| 71 |
-
|
| 72 |
-
==== GRAPH EXPECTED QUESTIONS ====
|
| 73 |
-
Graph expected in:
|
| 74 |
-
- Question <number> β Page <number>
|
| 75 |
-
(one per line)
|
| 76 |
-
==== END GRAPH EXPECTED ====
|
| 77 |
-
"""
|
| 78 |
-
}
|
| 79 |
-
,
|
| 80 |
-
|
| 81 |
-
"GRADING_PROMPT": {
|
| 82 |
-
"role": "system",
|
| 83 |
-
"content": """Developer: You are an official examiner. Apply the following grading rules precisely.
|
| 84 |
-
### Abbreviations:
|
| 85 |
-
- **M**: Marks for Method
|
| 86 |
-
- **A**: Marks for Accuracy/Answer
|
| 87 |
-
- **R**: Marks for Reasoning
|
| 88 |
-
- **AG**: Answer given in questionβno marks
|
| 89 |
-
- **FT**: Follow Through marks (if error carried forward correctly)
|
| 90 |
-
- **MR**: Deduct for misread (once only)
|
| 91 |
-
---
|
| 92 |
-
## Grading Instructions
|
| 93 |
-
1. Award marks using official annotations (e.g., M1, A2).
|
| 94 |
-
2. Do not award full marks for answers alone; check for method marks.
|
| 95 |
-
3. A marks usually require a valid M mark first.
|
| 96 |
-
4. Accept valid equivalent forms unless otherwise specified.
|
| 97 |
-
5. Apply FT where appropriate.
|
| 98 |
-
6. Use proper notation: M1A0, A1, etc.
|
| 99 |
-
7. Any lost mark: use red `<span style=\"color:red\">M0</span>` , similarly make markscheme expected , student response and awarded marks in red include it in <span> tage
|
| 100 |
-
---
|
| 101 |
-
## Output Format
|
| 102 |
-
Produce two sections per question/sub-question, following this structure:
|
| 103 |
-
## Question <id>
|
| 104 |
-
### Markscheme vs Student Answer
|
| 105 |
-
| Mark ID | Markscheme Expectation | Student's Response | Awarded |
|
| 106 |
-
|---------|------------------------|--------------------|---------|
|
| 107 |
-
| M1_1 | Recognise GP | "r=0.9" | M1 |
|
| 108 |
-
**Total: X/Y**
|
| 109 |
-
---
|
| 110 |
-
### Examiner's Report
|
| 111 |
-
At the very end, provide a summary table:
|
| 112 |
-
| Question Number | Marks | Remark |
|
| 113 |
-
|-----------------|-------|--------|
|
| 114 |
-
| 1 | X/Y | A |
|
| 115 |
-
| 2 | X/Y | B |
|
| 116 |
-
Then show total clearly as a final line:
|
| 117 |
-
`Total: <obtained_marks>/<max_marks>`
|
| 118 |
-
NOTES:
|
| 119 |
-
- The assistant will receive two transcripts: (1) QP+MS transcript (questions then markscheme) and (2) AS transcript (student answers). Use the QP+MS transcript as the authoritative source of question wording, total marks, and verbatim markscheme entries (M/A/R mark IDs).
|
| 120 |
-
- Match student answers to question IDs and grade according to the provided verbatim markscheme.
|
| 121 |
-
- For questions where a graph is expected and the student attempted a graph, you will be provided with the relevant markscheme and answer sheet graph images/pages. Use these for grading those questions with visual context. For all other questions, proceed as usual.
|
| 122 |
-
- Produce full markdown as above. Ensure mark IDs used in the grading are present and consistent with the markscheme.
|
| 123 |
-
- give grade in remark one of the following A : All Good B : Silly Mistake C : Conceptual Error D : Hard question E : Not Applicable
|
| 124 |
-
"""
|
| 125 |
-
}
|
| 126 |
-
}
|
| 127 |
|
| 128 |
# ---------------- HELPERS ----------------
|
| 129 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
"""
|
| 131 |
-
Convert
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
try:
|
| 135 |
-
print(f"
|
| 136 |
-
|
| 137 |
-
# ============ STEP 1: Clean and prepare the text ============
|
| 138 |
-
print("π§Ή Cleaning markdown content...")
|
| 139 |
-
|
| 140 |
-
# Remove or replace HTML tags that markdown_pdf can't handle
|
| 141 |
-
clean_text = text
|
| 142 |
-
|
| 143 |
-
# Replace red color spans with bold markdown (** for bold)
|
| 144 |
-
clean_text = re.sub(r'<span\s+style\s*=\s*["\']color\s*:\s*red["\']>(.*?)</span>',
|
| 145 |
-
r'**\1**', clean_text, flags=re.IGNORECASE)
|
| 146 |
-
|
| 147 |
-
# Remove any other HTML tags
|
| 148 |
-
clean_text = re.sub(r'<[^>]+>', '', clean_text)
|
| 149 |
-
|
| 150 |
-
# Fix unicode issues
|
| 151 |
-
clean_text = clean_text.replace('\u00A0', ' ') # Non-breaking space
|
| 152 |
-
clean_text = clean_text.replace('\u2013', '-') # En dash
|
| 153 |
-
clean_text = clean_text.replace('\u2014', '--') # Em dash
|
| 154 |
-
clean_text = clean_text.replace('\u2019', "'") # Right single quote
|
| 155 |
-
clean_text = clean_text.replace('\u201C', '"') # Left double quote
|
| 156 |
-
clean_text = clean_text.replace('\u201D', '"') # Right double quote
|
| 157 |
-
|
| 158 |
-
# Ensure proper line spacing for tables
|
| 159 |
-
clean_text = re.sub(r'\n\|', r'\n\n|', clean_text)
|
| 160 |
-
clean_text = re.sub(r'\|\n', r'|\n\n', clean_text)
|
| 161 |
-
|
| 162 |
-
# Remove excessive blank lines (more than 2)
|
| 163 |
-
clean_text = re.sub(r'\n{3,}', '\n\n', clean_text)
|
| 164 |
-
|
| 165 |
-
print(f"β
Text cleaned. Length: {len(clean_text)} characters")
|
| 166 |
-
|
| 167 |
-
# ============ STEP 2: Save cleaned text to debug file ============
|
| 168 |
-
debug_file = filename.replace('.pdf', '_cleaned.md')
|
| 169 |
-
try:
|
| 170 |
-
with open(debug_file, 'w', encoding='utf-8') as f:
|
| 171 |
-
f.write(clean_text)
|
| 172 |
-
print(f"π Saved cleaned markdown to: {debug_file}")
|
| 173 |
-
except Exception as e:
|
| 174 |
-
print(f"β οΈ Warning: Could not save debug file: {e}")
|
| 175 |
|
| 176 |
-
#
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
# Add the content as a section
|
| 190 |
-
print("π Adding content to PDF...")
|
| 191 |
-
pdf.add_section(Section(clean_text, toc=False))
|
| 192 |
-
|
| 193 |
-
# ============ STEP 4: Save the PDF ============
|
| 194 |
-
print(f"πΎ Saving PDF to {filename}...")
|
| 195 |
-
pdf.save(filename)
|
| 196 |
-
|
| 197 |
-
# ============ STEP 5: Verify the PDF was created ============
|
| 198 |
-
if os.path.exists(filename):
|
| 199 |
-
file_size = os.path.getsize(filename)
|
| 200 |
-
print(f"β
PDF created successfully!")
|
| 201 |
-
print(f"π File size: {file_size / 1024:.2f} KB")
|
| 202 |
-
|
| 203 |
-
# Check if file is suspiciously small (might indicate truncation)
|
| 204 |
-
if file_size < 10000: # Less than 10KB
|
| 205 |
-
print(f"β οΈ Warning: PDF file is very small ({file_size} bytes)")
|
| 206 |
-
print(" This might indicate content was truncated.")
|
| 207 |
-
print(" Check the PDF file manually.")
|
| 208 |
-
|
| 209 |
-
return filename
|
| 210 |
else:
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
|
| 220 |
-
#
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
|
| 230 |
-
|
| 231 |
-
|
|
|
|
|
|
|
|
|
|
| 232 |
|
| 233 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
try:
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
print(f"β
Saved as text file: {txt_filename}")
|
| 239 |
-
return txt_filename
|
| 240 |
-
|
| 241 |
-
except Exception as final_error:
|
| 242 |
-
print(f"β All save attempts failed: {final_error}")
|
| 243 |
-
raise Exception("Could not save output in any format") from e
|
| 244 |
-
|
| 245 |
-
def save_as_pdf_with_split(text, filename="output.pdf", max_questions=20):
|
| 246 |
-
"""
|
| 247 |
-
Save as PDF, splitting into multiple files if content is too large.
|
| 248 |
-
"""
|
| 249 |
-
try:
|
| 250 |
-
# First, try to save normally
|
| 251 |
-
return save_as_pdf(text, filename)
|
| 252 |
-
|
| 253 |
-
except Exception as e:
|
| 254 |
-
print(f"β οΈ Normal save failed, attempting to split document...")
|
| 255 |
-
|
| 256 |
-
# Split by questions
|
| 257 |
-
question_blocks = re.split(r'(## Question \d+(?:\.[a-z]+)?)', text)
|
| 258 |
-
|
| 259 |
-
if len(question_blocks) <= 3: # Not enough to split
|
| 260 |
-
raise e
|
| 261 |
-
|
| 262 |
-
# Reconstruct questions with headers
|
| 263 |
-
questions = []
|
| 264 |
-
for i in range(1, len(question_blocks), 2):
|
| 265 |
-
if i+1 < len(question_blocks):
|
| 266 |
-
questions.append(question_blocks[i] + question_blocks[i+1])
|
| 267 |
-
|
| 268 |
-
print(f"π Found {len(questions)} questions to split")
|
| 269 |
-
|
| 270 |
-
# Split into chunks
|
| 271 |
-
chunk_size = max_questions
|
| 272 |
-
pdf_files = []
|
| 273 |
-
|
| 274 |
-
for chunk_idx in range(0, len(questions), chunk_size):
|
| 275 |
-
chunk = questions[chunk_idx:chunk_idx + chunk_size]
|
| 276 |
-
chunk_text = "\n\n".join(chunk)
|
| 277 |
-
|
| 278 |
-
# Add header and footer
|
| 279 |
-
chunk_header = f"# Grading Report - Part {chunk_idx//chunk_size + 1}\n\n"
|
| 280 |
-
chunk_text = chunk_header + chunk_text
|
| 281 |
|
| 282 |
-
#
|
| 283 |
-
|
| 284 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
|
| 296 |
def compress_pdf(input_path, output_path=None, max_size=20*1024*1024):
|
| 297 |
if output_path is None:
|
|
@@ -433,41 +465,81 @@ def extract_question_ids_from_qpms(text: str):
|
|
| 433 |
print("β οΈ No question IDs extracted; will send NA placeholder.")
|
| 434 |
return fallback_matches
|
| 435 |
|
| 436 |
-
def
|
| 437 |
"""
|
| 438 |
-
Construct the AS transcription prompt injecting the expected IDs block and graph detection instructions
|
|
|
|
|
|
|
|
|
|
| 439 |
"""
|
|
|
|
| 440 |
if not expected_ids:
|
| 441 |
ids_block = "{NA}"
|
| 442 |
else:
|
| 443 |
ids_block = "{\n" + "\n".join(expected_ids) + "\n}"
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
"
|
| 449 |
-
"
|
| 450 |
-
"
|
|
|
|
|
|
|
| 451 |
)
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 461 |
Expected questions (if missing, write NA):
|
| 462 |
{ids_block}
|
| 463 |
-----------------------
|
| 464 |
OUTPUT FORMAT:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
Question <id>
|
| 466 |
-
AS:
|
| 467 |
-
|
| 468 |
-
==== GRAPH FOUND ANSWERS ====
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 469 |
return prompt
|
| 470 |
|
|
|
|
|
|
|
| 471 |
def extract_graph_questions_from_ms(text: str):
|
| 472 |
"""Extract graph questions and page numbers from MS transcript."""
|
| 473 |
clean_text = text.replace("\u00A0", " ").replace("\t", " ")
|
|
@@ -641,7 +713,7 @@ def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, expected_ids
|
|
| 641 |
page_img = page.convert("RGB")
|
| 642 |
img_cv = np.array(page_img)
|
| 643 |
img_cv = cv2.cvtColor(img_cv, cv2.COLOR_RGB2BGR)
|
| 644 |
-
h, w,
|
| 645 |
cell_w_px, cell_h_px = w / cols, h / rows
|
| 646 |
|
| 647 |
page_mappings = [m for m in all_mappings if m.get("page") == page_num]
|
|
@@ -701,10 +773,9 @@ def extract_pdf_pages_as_images(pdf_path, page_numbers, prefix):
|
|
| 701 |
return out_paths
|
| 702 |
|
| 703 |
# ---------------- PIPELINE ----------------
|
| 704 |
-
def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
|
| 705 |
"""
|
| 706 |
Final pipeline with graph-aware grading logic using NEW SDK.
|
| 707 |
-
Enhanced with improved PDF saving.
|
| 708 |
"""
|
| 709 |
try:
|
| 710 |
print("π Starting pipeline...")
|
|
@@ -722,7 +793,7 @@ def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
|
|
| 722 |
print("β
Upload complete.")
|
| 723 |
|
| 724 |
print("1.i) Transcribing QP+MS (questions first, then full markscheme, with graph detection)...")
|
| 725 |
-
qpms_prompt =
|
| 726 |
qpms_text = gemini_generate_content(qpms_prompt, file_upload_obj=merged_uploaded)
|
| 727 |
print("π QP+MS transcription received. Saving debug file: debug_qpms_transcript.txt")
|
| 728 |
with open("debug_qpms_transcript.txt", "w", encoding="utf-8") as f:
|
|
@@ -740,7 +811,7 @@ def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
|
|
| 740 |
extracted_ids = ["NA"]
|
| 741 |
|
| 742 |
print("1.ii) Building AS transcription prompt with expected question IDs and graph detection, sending to Gemini...")
|
| 743 |
-
as_prompt =
|
| 744 |
as_text = gemini_generate_content(as_prompt, file_upload_obj=ans_uploaded)
|
| 745 |
print("π AS transcription received. Saving debug file: debug_as_transcript.txt")
|
| 746 |
with open("debug_as_transcript.txt", "w", encoding="utf-8") as f:
|
|
@@ -765,7 +836,8 @@ def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
|
|
| 765 |
if ms_graph_images or as_graph_images:
|
| 766 |
graph_note = "\n\n---\nSome questions require graphs. I've attached the relevant graph pages from QP+MS and from the Answer Sheet. Use them as visual context when grading.\n---\n"
|
| 767 |
grading_input += graph_note
|
| 768 |
-
|
|
|
|
| 769 |
grading_images = ms_graph_images + as_graph_images
|
| 770 |
grading_text = gemini_generate_content(grading_prompt_system + "\n\nPlease grade the following transcripts:\n" + grading_input, image_obj=grading_images if grading_images else None)
|
| 771 |
print("π§Ύ Grading output received. Saving debug file: debug_grading.md")
|
|
@@ -773,35 +845,8 @@ def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
|
|
| 773 |
f.write(grading_text)
|
| 774 |
|
| 775 |
base_name = os.path.splitext(os.path.basename(ans_path))[0]
|
| 776 |
-
|
| 777 |
-
|
| 778 |
-
grading_pdf_path = f"{base_name}_graded.pdf"
|
| 779 |
-
|
| 780 |
-
print("π Attempting to save grading report as PDF...")
|
| 781 |
-
try:
|
| 782 |
-
# Try normal save first
|
| 783 |
-
grading_pdf_path = save_as_pdf(grading_text, grading_pdf_path)
|
| 784 |
-
print("β
Grading PDF saved successfully:", grading_pdf_path)
|
| 785 |
-
|
| 786 |
-
except Exception as pdf_error:
|
| 787 |
-
print(f"β οΈ Standard PDF save failed: {pdf_error}")
|
| 788 |
-
print("π Trying split document method...")
|
| 789 |
-
|
| 790 |
-
try:
|
| 791 |
-
# Try split method
|
| 792 |
-
grading_pdf_path = save_as_pdf_with_split(grading_text, grading_pdf_path)
|
| 793 |
-
print("β
Grading PDF saved (split method):", grading_pdf_path)
|
| 794 |
-
|
| 795 |
-
except Exception as split_error:
|
| 796 |
-
print(f"β οΈ Split method also failed: {split_error}")
|
| 797 |
-
print("πΎ Saving as Markdown fallback...")
|
| 798 |
-
|
| 799 |
-
# Fallback to markdown
|
| 800 |
-
grading_pdf_path = grading_pdf_path.replace('.pdf', '.md')
|
| 801 |
-
with open(grading_pdf_path, 'w', encoding='utf-8') as f:
|
| 802 |
-
f.write(grading_text)
|
| 803 |
-
print(f"β
Saved as Markdown file: {grading_pdf_path}")
|
| 804 |
-
print("βΉοΈ You can convert this .md file to PDF using online tools or pandoc")
|
| 805 |
|
| 806 |
grading_json = extract_marks_from_grading(grading_text)
|
| 807 |
with open("debug_grading_json.json", "w", encoding="utf-8") as f:
|
|
@@ -812,14 +857,8 @@ def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
|
|
| 812 |
if imprint:
|
| 813 |
print("β Imprint option enabled. Starting imprinting process...")
|
| 814 |
imprinted_pdf_path = f"{base_name}_imprinted.pdf"
|
| 815 |
-
|
| 816 |
-
|
| 817 |
-
print("β
Imprinting finished. Imprinted PDF at:", imprinted_pdf_path)
|
| 818 |
-
except Exception as imprint_error:
|
| 819 |
-
print(f"β Imprinting failed: {imprint_error}")
|
| 820 |
-
import traceback
|
| 821 |
-
traceback.print_exc()
|
| 822 |
-
imprinted_pdf_path = None
|
| 823 |
|
| 824 |
print("π Pipeline finished successfully.")
|
| 825 |
return qpms_text, as_text, grading_text, grading_pdf_path, imprinted_pdf_path
|
|
@@ -831,105 +870,53 @@ def align_and_grade_pipeline(qp_path, ms_path, ans_path, imprint=False):
|
|
| 831 |
return f"β Error: {e}", None, None, None, None
|
| 832 |
|
| 833 |
# ---------------- GRADIO UI ----------------
|
| 834 |
-
with gr.Blocks(title="AI Grading
|
| 835 |
-
gr.Markdown("# π AI Grading
|
| 836 |
-
gr.Markdown("**β
|
| 837 |
-
gr.Markdown("---")
|
| 838 |
|
| 839 |
with gr.Row():
|
| 840 |
-
|
| 841 |
-
|
| 842 |
-
|
| 843 |
-
ms_file = gr.File(label="π Upload Markscheme (PDF)", file_types=[".pdf"])
|
| 844 |
-
with gr.Column():
|
| 845 |
-
ans_file = gr.File(label="π Upload Student Answer Sheet (PDF)", file_types=[".pdf"])
|
| 846 |
|
| 847 |
with gr.Row():
|
| 848 |
-
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
|
| 852 |
-
|
|
|
|
|
|
|
| 853 |
|
|
|
|
|
|
|
| 854 |
with gr.Row():
|
| 855 |
-
qpms_box = gr.Textbox(label="π QP+MS Transcript", lines=12
|
| 856 |
-
as_box = gr.Textbox(label="π AS Transcript", lines=12
|
| 857 |
|
| 858 |
-
gr.
|
| 859 |
-
gr.
|
| 860 |
-
|
| 861 |
-
|
| 862 |
-
|
| 863 |
-
with gr.Row():
|
| 864 |
-
grading_pdf_file = gr.File(label="π₯ Download Grading Report (PDF/MD)")
|
| 865 |
-
imprint_pdf_file = gr.File(label="π₯ Download Imprinted Answer Sheet (Optional)")
|
| 866 |
-
|
| 867 |
-
gr.Markdown("---")
|
| 868 |
-
gr.Markdown("""
|
| 869 |
-
### π Instructions:
|
| 870 |
-
1. Upload all three PDF files (Question Paper, Markscheme, Answer Sheet)
|
| 871 |
-
2. Optionally enable mark imprinting on the answer sheet
|
| 872 |
-
3. Click "Run Grading Pipeline" and wait for processing
|
| 873 |
-
4. Review transcripts and download the grading report
|
| 874 |
-
|
| 875 |
-
### β οΈ Notes:
|
| 876 |
-
- Large documents may take several minutes to process
|
| 877 |
-
- If PDF generation fails, a Markdown (.md) file will be provided instead
|
| 878 |
-
- Check the console/logs for detailed progress information
|
| 879 |
-
- Debug files are saved automatically for troubleshooting
|
| 880 |
-
""")
|
| 881 |
-
|
| 882 |
-
def run_pipeline(qp_file_obj, ms_file_obj, ans_file_obj, imprint_flag):
|
| 883 |
-
"""
|
| 884 |
-
Wrapper function for Gradio interface
|
| 885 |
-
"""
|
| 886 |
if not qp_file_obj or not ms_file_obj or not ans_file_obj:
|
| 887 |
-
|
| 888 |
-
return error_msg, "", "", None, None
|
| 889 |
|
| 890 |
qp_path = qp_file_obj.name
|
| 891 |
ms_path = ms_file_obj.name
|
| 892 |
ans_path = ans_file_obj.name
|
| 893 |
|
| 894 |
-
print("\n" + "="*80)
|
| 895 |
-
print("π¬ STARTING NEW GRADING SESSION")
|
| 896 |
-
print("="*80 + "\n")
|
| 897 |
-
|
| 898 |
qpms_text, as_text, grading_text, grading_pdf_path, imprinted_pdf_path = align_and_grade_pipeline(
|
| 899 |
-
qp_path, ms_path, ans_path, imprint=imprint_flag
|
| 900 |
)
|
| 901 |
|
| 902 |
-
print("\n" + "="*80)
|
| 903 |
-
print("π¬ GRADING SESSION COMPLETE")
|
| 904 |
-
print("="*80 + "\n")
|
| 905 |
-
|
| 906 |
return qpms_text or "", as_text or "", grading_text or "", grading_pdf_path, imprinted_pdf_path
|
| 907 |
|
| 908 |
run_button.click(
|
| 909 |
fn=run_pipeline,
|
| 910 |
-
inputs=[qp_file, ms_file, ans_file, imprint_toggle],
|
| 911 |
outputs=[qpms_box, as_box, grading_output_box, grading_pdf_file, imprint_pdf_file]
|
| 912 |
)
|
| 913 |
|
| 914 |
if __name__ == "__main__":
|
| 915 |
-
|
| 916 |
-
print("π AI GRADING SYSTEM - STARTING")
|
| 917 |
-
print("="*80)
|
| 918 |
-
print("π Make sure GEMINI_API_KEY environment variable is set")
|
| 919 |
-
print("π Required dependencies: google-genai, markdown_pdf, gradio, pdf2image, etc.")
|
| 920 |
-
print("="*80 + "\n")
|
| 921 |
-
|
| 922 |
-
# Check if API key is set
|
| 923 |
-
if not os.getenv("GEMINI_API_KEY"):
|
| 924 |
-
print("β οΈ WARNING: GEMINI_API_KEY not found in environment variables!")
|
| 925 |
-
print(" Set it with: export GEMINI_API_KEY='your-api-key-here'")
|
| 926 |
-
else:
|
| 927 |
-
print("β
GEMINI_API_KEY found")
|
| 928 |
-
|
| 929 |
-
print("\nπ Launching Gradio interface...\n")
|
| 930 |
-
demo.launch(
|
| 931 |
-
server_name="0.0.0.0",
|
| 932 |
-
server_port=7860,
|
| 933 |
-
share=False,
|
| 934 |
-
show_error=True
|
| 935 |
-
)
|
|
|
|
| 6 |
import img2pdf
|
| 7 |
import gradio as gr
|
| 8 |
from google import genai # NEW SDK
|
|
|
|
| 9 |
from pdf2image import convert_from_path
|
| 10 |
from PIL import Image, ImageDraw, ImageFont
|
| 11 |
import cv2
|
| 12 |
import numpy as np
|
| 13 |
from PyPDF2 import PdfReader, PdfWriter
|
| 14 |
+
from prompts import QP_MS_TRANSCRIPTION_PROMPT, get_grading_prompt
|
| 15 |
|
| 16 |
# ---------------- CONFIG ----------------
|
| 17 |
# Create client with new SDK
|
|
|
|
| 19 |
GRID_ROWS, GRID_COLS = 20, 14
|
| 20 |
|
| 21 |
# ---------------- PROMPTS ----------------
|
| 22 |
+
# Prompts are now imported from prompts.py
|
| 23 |
+
|
| 24 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
# ---------------- HELPERS ----------------
|
| 27 |
+
def parse_md_table(md):
    """Parse a Markdown pipe table into a list of body rows.

    The first two non-blank lines are treated as the header row and the
    separator row and are discarded. Every remaining non-blank line is
    split on ``|`` into whitespace-stripped cell strings.

    Args:
        md (str): Markdown text containing a single pipe table.

    Returns:
        list[list[str]]: One list of cell strings per body row. Empty
        interior cells are kept as ``""`` so columns stay aligned with
        the header. Rows whose cells are all empty are skipped. Returns
        ``[]`` when there are fewer than three non-blank lines.
    """
    non_blank = [line for line in md.split("\n") if line.strip()]
    if len(non_blank) < 3:
        return []

    rows = []
    for raw in non_blank[2:]:  # skip header + separator
        row = raw.strip()
        # Remove only the single framing pipe on each side. Filtering out
        # every empty string would also drop real empty cells and shift
        # the remaining columns to the left.
        if row.startswith("|"):
            row = row[1:]
        if row.endswith("|"):
            row = row[:-1]
        cells = [cell.strip() for cell in row.split("|")]
        if any(cells):
            rows.append(cells)
    return rows
|
| 41 |
+
|
| 42 |
+
def convert_html_color_spans(md_text):
    """Convert HTML colour spans to LaTeX ``\\textcolor`` commands.

    ``<span style="color:NAME">TEXT</span>`` becomes
    ``\\textcolor{NAME}{TEXT}``. Matching is case-insensitive and accepts
    single- or double-quoted ``style`` attributes, optional whitespace
    around ``=`` and ``:``, and a trailing ``;`` (plus any further
    declarations after it, which are ignored).

    Args:
        md_text (str): Markdown text that may contain colour spans.

    Returns:
        str: The text with every matched span rewritten. Spans that do
        not match, and all other text, are returned unchanged.
    """
    # Group 1: the quote character, so the closing quote must match it.
    # Group 2: the colour value up to (not including) the first ';'.
    # Group 3: the span content with surrounding whitespace trimmed.
    pattern = (
        r'<span\s+style\s*=\s*(["\'])\s*color\s*:\s*([^"\';]+?)\s*'
        r'(?:;[^"\']*)?\1\s*>\s*(.*?)\s*</span>'
    )

    def repl(m):
        color = m.group(2).strip()
        text = m.group(3)
        # A callable replacement keeps the backslash literal; a template
        # string would need it escaped for re.sub.
        return fr'\textcolor{{{color}}}{{{text}}}'

    return re.sub(pattern, repl, md_text, flags=re.IGNORECASE)
|
| 50 |
+
|
| 51 |
+
def cleanup_markdown_for_latex(md_text):
    """Clean up Markdown text for better LaTeX conversion.

    Two passes are applied:

    1. A blank line is inserted between the bold heading
       ``**Markscheme vs Student Answer**`` and a table that starts
       immediately after it.
    2. Common Unicode math symbols are replaced by the equivalent LaTeX
       command wrapped in inline math delimiters (``$...$``).

    Args:
        md_text (str): Markdown text to clean.

    Returns:
        str: The cleaned text.
    """
    # Ensure spacing between bold headers and tables
    md_text = re.sub(
        r'(\*\*Markscheme vs Student Answer\*\*)\s*(\|)', r'\1\n\n\2', md_text
    )

    # Convert common unicode math symbols to LaTeX (safety net).
    # Keys are written as \u escapes so the table survives a mis-decoded
    # source file: literal characters had collapsed into duplicate keys,
    # which silently dropped most of the entries.
    # NOTE(review): adjacent symbols produce "$..$$..$", and "\sqrt" is
    # emitted without an argument -- confirm both render as intended.
    replacements = {
        '\u222b': r'\int ',        # INTEGRAL
        '\u00b2': '^2',            # SUPERSCRIPT TWO
        '\u00b3': '^3',            # SUPERSCRIPT THREE
        '\u00bd': r'\frac{1}{2}',  # VULGAR FRACTION ONE HALF
        '\u00bc': r'\frac{1}{4}',  # VULGAR FRACTION ONE QUARTER
        '\u221e': r'\infty',       # INFINITY
        '\u2264': r'\leq',         # LESS-THAN OR EQUAL TO
        '\u2265': r'\geq',         # GREATER-THAN OR EQUAL TO
        '\u2260': r'\neq',         # NOT EQUAL TO
        '\u00b1': r'\pm',          # PLUS-MINUS SIGN
        '\u00d7': r'\times',       # MULTIPLICATION SIGN
        '\u00f7': r'\div',         # DIVISION SIGN
        '\u221a': r'\sqrt',        # SQUARE ROOT
        '\u2211': r'\sum',         # N-ARY SUMMATION
        '\u220f': r'\prod',        # N-ARY PRODUCT
        '\u2202': r'\partial',     # PARTIAL DIFFERENTIAL
        '\u03c0': r'\pi',          # GREEK SMALL LETTER PI
        '\u03b8': r'\theta',       # GREEK SMALL LETTER THETA
        '\u03b1': r'\alpha',       # GREEK SMALL LETTER ALPHA
        '\u03b2': r'\beta',        # GREEK SMALL LETTER BETA
        '\u03b3': r'\gamma',       # GREEK SMALL LETTER GAMMA
        '\u03b4': r'\delta',       # GREEK SMALL LETTER DELTA
        '\u03b5': r'\epsilon',     # GREEK SMALL LETTER EPSILON
        '\u03bb': r'\lambda',      # GREEK SMALL LETTER LAMDA
        '\u03bc': r'\mu',          # GREEK SMALL LETTER MU
        '\u03c3': r'\sigma',       # GREEK SMALL LETTER SIGMA
        '\u0394': r'\Delta',       # GREEK CAPITAL LETTER DELTA
        '\u03a3': r'\Sigma',       # GREEK CAPITAL LETTER SIGMA
        '\u03a9': r'\Omega',       # GREEK CAPITAL LETTER OMEGA
    }

    for char, latex in replacements.items():
        md_text = md_text.replace(char, f'${latex}$')

    return md_text
|
| 93 |
+
|
| 94 |
+
def escape_latex_special_chars(text):
    """Return *text* with LaTeX-reserved characters escaped.

    Text that already contains a ``$`` or a backslash is assumed to be
    math or LaTeX markup and is returned untouched. Otherwise each of
    ``% & # _ { } ~ ^`` is replaced by its escaped LaTeX form.

    Args:
        text (str): Plain text to escape.

    Returns:
        str: The escaped text, or the input unchanged when it contains
        ``$`` or ``\\``.
    """
    # Don't escape if already in math mode or LaTeX command
    if any(marker in text for marker in ('$', '\\')):
        return text

    # One translation pass: the braces inside the "~" and "^"
    # replacements are never themselves re-escaped.
    escape_table = str.maketrans({
        '%': r'\%',
        '&': r'\&',
        '#': r'\#',
        '_': r'\_',
        '{': r'\{',
        '}': r'\}',
        '~': r'\textasciitilde{}',
        '^': r'\textasciicircum{}',
    })
    return text.translate(escape_table)
|
| 115 |
+
|
| 116 |
+
# Packages injected into the Pandoc-generated preamble.
_EXTRA_LATEX_PACKAGES = r"""\usepackage[a4paper, margin=1in]{geometry}
\usepackage{xcolor}
\usepackage{colortbl}
\usepackage{booktabs}
\usepackage{array}
\usepackage{longtable}
\renewcommand{\arraystretch}{1.4}
\newcolumntype{L}[1]{>{\raggedright\arraybackslash}p{#1}}"""

# Opening of the summary table (zebra striping plus a shaded header row).
_SUMMARY_TABLE_HEAD = r"""\section*{Examiner's Summary Report}
\begin{center}
\rowcolors{2}{gray!10}{white}
\begin{tabular}{|c|c|c|L{8cm}|}
\hline
\rowcolor{gray!30}
\textbf{Question} & \textbf{Marks} & \textbf{Remark} & \textbf{Feedback} \\ \hline
"""


def _build_summary_latex(summary_rows, summary_total):
    """Render the parsed summary rows and total as a LaTeX block ending in a page break."""
    parts = [_SUMMARY_TABLE_HEAD]
    for row in summary_rows:
        if len(row) >= 4:
            # Every cell is escaped (unless it already looks like LaTeX):
            # a stray "&", "%", "#" or "_" in any column breaks the tabular.
            cells = [escape_latex_special_chars(cell) for cell in row[:4]]
            parts.append(" & ".join(cells) + " \\\\ \\hline\n")

    parts.append(r"\end{tabular}")
    parts.append("\n\\end{center}\n\n")
    # The total is escaped too: an unescaped "%" (e.g. "15/20 (75%)") would
    # comment out the closing brace of \textbf.
    total = escape_latex_special_chars(summary_total)
    parts.append(f"\\vspace{{0.5cm}}\\noindent\\textbf{{\\Large Overall Score: {total}}}\n\n")
    parts.append("\\hrulefill\n\\vspace{1cm}\n\n")
    parts.append("\\newpage\n\n")
    return "".join(parts)


def _use_large_document_class(tex):
    """Switch the document class from article to 12pt extarticle.

    Accepts both ``\\documentclass{article}`` and the bracketed form
    ``\\documentclass[...]{article}``; options already present are kept.
    """
    def _swap(match):
        existing = [opt.strip() for opt in (match.group(1) or "").split(",") if opt.strip()]
        return "\\documentclass[" + ",".join(["12pt"] + existing) + "]{extarticle}"

    return re.sub(r"\\documentclass(?:\[([^\]]*)\])?\{article\}", _swap, tex, count=1)


def _pdflatex_failure_details(result, log_file):
    """Collect stderr plus the first LaTeX error lines from *log_file* for an error message."""
    details = result.stderr.decode('utf-8', errors='replace')
    if os.path.exists(log_file):
        try:
            with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
                # LaTeX error messages start with "!".
                error_lines = [line for line in f.read().split('\n') if line.startswith('!')]
        except OSError:
            error_lines = []
        if error_lines:
            details += "\n\nLaTeX Errors:\n" + "\n".join(error_lines[:10])
    return details


def save_as_pdf(text, filename="output.pdf"):
    """
    Convert Markdown text to PDF using Pandoc with pdflatex.
    Extracts the Examiner's Summary Report and places it at the top with enhanced formatting.
    Converts HTML color spans to LaTeX textcolor commands.

    Intermediate files (``<base>_input.md``, ``<base>_temp.tex`` and the
    pdflatex by-products) are written next to *filename* and removed again
    whether or not the conversion succeeds.

    Args:
        text (str): Markdown content to convert
        filename (str): Output PDF filename

    Returns:
        str: Path to the generated PDF file

    Raises:
        Exception: If Pandoc or pdflatex is not available, or conversion fails
    """
    base_name = os.path.splitext(filename)[0]
    temp_md_file = f"{base_name}_input.md"
    temp_tex_file = f"{base_name}_temp.tex"
    temp_pdf = f"{base_name}_temp.pdf"
    log_file = f"{base_name}_temp.log"
    # Only files this function (or the tools it runs) creates are listed here.
    temp_artifacts = [
        temp_md_file,
        temp_tex_file,
        temp_pdf,
        log_file,
        f"{base_name}_temp.aux",
        f"{base_name}_temp.out",
    ]

    try:
        print("Processing markdown for PDF generation...")

        # Step 1: Extract Summary Report Table
        summary_pattern = re.compile(
            r"### Examiner's Summary Report\s*\n\n(\|.*?\|)\s*\n\n\*\*Total:\s*(.*?)\*\*",
            re.DOTALL
        )
        summary_match = summary_pattern.search(text)

        if summary_match:
            summary_table_md = summary_match.group(1)
            summary_total = summary_match.group(2)
            # Remove summary section from markdown
            text = summary_pattern.sub("", text)
            print("✅ Extracted Examiner's Summary Report")
        else:
            summary_table_md = ""
            summary_total = ""
            print("⚠️ No Examiner's Summary Report found")

        # Step 2: Clean up markdown and convert HTML color spans to LaTeX
        text = cleanup_markdown_for_latex(text)
        text = convert_html_color_spans(text)
        print("✅ Cleaned markdown and converted HTML color spans to LaTeX")

        # Save cleaned markdown
        with open(temp_md_file, 'w', encoding='utf-8') as f:
            f.write(text)

        # Step 3: Convert MD to LaTeX via Pandoc
        print("Converting markdown to LaTeX using Pandoc...")
        pandoc_cmd = [
            "pandoc",
            "--from=markdown",
            "--to=latex",
            "--standalone",
            temp_md_file,
            "-o", temp_tex_file
        ]

        result = subprocess.run(pandoc_cmd, capture_output=True, check=False)
        if result.returncode != 0 or not os.path.exists(temp_tex_file):
            stderr = result.stderr.decode('utf-8', errors='replace')
            raise Exception(f"Pandoc conversion failed: {stderr}")
        print("✅ Pandoc conversion complete")

        # Step 4: Modify the generated LaTeX
        with open(temp_tex_file, "r", encoding="utf-8") as f:
            tex = f.read()

        # Change document class to larger font
        tex = _use_large_document_class(tex)

        # \rowcolors is only defined when xcolor is loaded with its "table"
        # option.  Passing the option up front works no matter which
        # \usepackage{xcolor} (the template's or ours) is reached first.
        tex = "\\PassOptionsToPackage{table}{xcolor}\n" + tex

        # Step 5: Build enhanced LaTeX table for summary with zebra striping (if exists)
        summary_latex = ""
        if summary_table_md:
            summary_rows = parse_md_table(summary_table_md)
            summary_latex = "\n\n" + _build_summary_latex(summary_rows, summary_total)

        # Inject the extra packages just before \begin{document} and the
        # summary right after it; only the first occurrence is the real one.
        tex = tex.replace(
            r"\begin{document}",
            _EXTRA_LATEX_PACKAGES + "\n\\begin{document}" + summary_latex,
            1,
        )
        if summary_table_md:
            print("✅ Injected enhanced summary table with zebra striping at top of document")

        # Save modified LaTeX
        with open(temp_tex_file, "w", encoding="utf-8") as f:
            f.write(tex)

        # Step 6: Compile PDF with pdflatex
        print("Compiling PDF with pdflatex...")
        pdflatex_cmd = [
            "pdflatex",
            "-interaction=nonstopmode",
            f"-output-directory={os.path.dirname(os.path.abspath(temp_tex_file))}",
            temp_tex_file
        ]

        # Run twice to resolve references
        # Don't use text=True to avoid encoding issues with pdflatex output
        subprocess.run(pdflatex_cmd, capture_output=True, check=False)
        second_pass = subprocess.run(pdflatex_cmd, capture_output=True, check=False)

        # Check if PDF was actually created (better than checking return code:
        # in nonstopmode pdflatex often exits non-zero yet still writes a PDF)
        if not os.path.exists(temp_pdf):
            details = _pdflatex_failure_details(second_pass, log_file)
            raise Exception(f"pdflatex failed to create PDF. Check LaTeX syntax. Error: {details[:1000]}")

        # Move output PDF to final filename, overwriting any previous one
        os.replace(temp_pdf, filename)

        print(f"✅ PDF generated successfully: {filename}")
        return filename

    except FileNotFoundError as e:
        print(f"❌ Required tool not found: {e}")

        raise Exception(
            "Pandoc or pdflatex not found. Please install:\n"
            "  - pandoc\n"
            "  - texlive (or MiKTeX on Windows)\n"
            "  - texlive-latex-extra (for extarticle class)"
        ) from e

    except Exception as e:
        print(f"❌ Unexpected error during PDF conversion: {e}")
        import traceback
        traceback.print_exc()
        raise

    finally:
        # Clean up temporary files on success and on failure alike
        for temp_file in temp_artifacts:
            try:
                if os.path.exists(temp_file):
                    os.remove(temp_file)
            except OSError:
                # Best effort: a leftover temp file must not mask the real outcome
                pass
|
| 327 |
|
| 328 |
def compress_pdf(input_path, output_path=None, max_size=20*1024*1024):
|
| 329 |
if output_path is None:
|
|
|
|
| 465 |
print("β οΈ No question IDs extracted; will send NA placeholder.")
|
| 466 |
return fallback_matches
|
| 467 |
|
| 468 |
+
def build_as_cot_prompt_with_expected_ids(expected_ids, qpms_text=None):
    """
    Construct the AS transcription prompt injecting the expected IDs block and graph detection instructions,
    modifying it to include a Chain-of-Thought (CoT) section using a <think> tag, and
    requiring mathematical expressions to be enclosed in LaTeX dollar delimiters ($...$).
    Includes explicit rules for interpreting NA-like answers and no-response situations.

    Args:
        expected_ids: Iterable of expected question IDs; each is rendered on
            its own line inside a ``{ ... }`` block. Empty or ``None``
            yields the ``{NA}`` placeholder.
        qpms_text: Optional QP+MS transcript to embed as reference context.
            ``None`` or a blank string leaves the section out.

    Returns:
        str: The complete prompt text.
    """
    if not expected_ids:
        ids_block = "{NA}"
    else:
        # str() so that numeric IDs do not make join() raise TypeError
        ids_block = "{\n" + "\n".join(str(qid) for qid in expected_ids) + "\n}"

    qpms_section = ""
    # A blank transcript would otherwise announce context that is not there
    if qpms_text is not None and qpms_text.strip():
        qpms_section = (
            "\nYou are also provided with the full transcript of the Question Paper and Markscheme (QP+MS) below."
            "\nUse it primarily to resolve ambiguous handwriting and to confirm expected answers when needed."
            "\n--- BEGIN QP+MS TRANSCRIPT ---\n"
            f"{qpms_text.strip()}\n"
            "--- END QP+MS TRANSCRIPT ---\n"
        )

    prompt = f"""You are a high-quality handwritten transcription assistant, performing transcription with a Chain-of-Thought process.
INPUT: This PDF contains a student's handwritten answer sheet.
{qpms_section}
TASK:
1. **THINKING:** Before transcribing each answer, document your thought process inside a **<think>** tag.
   - Identify the question ID. If inferred, note why.
   - Detail any ambiguities (unclear numbers, symbols, or structures).
   - Explain how ambiguities were resolved, including whether the QP+MS transcript was consulted.
   - If QP+MS was consulted but you chose not to change the transcription, state this.
   - If the initial question label was incorrect (e.g., 2.a vs 2.b), correct it and briefly explain the reasoning in <think>.
   *Example Thinking:*
   <think>
   - Found Question 3(a).
   - The term could be '$2x$' or '21x'.
   - Markscheme uses '$21x$', but handwriting matches '$2x$'.
   - Decision: transcribe '$2x$'.
   </think>

2. **TRANSCRIPTION:** Transcribe the student's answers directly and faithfully.
   - Assign each answer to a labelled question ID when present.
   - For unlabeled answers, segment logically and mark inferred IDs as "**INFERRED: <id>**".
   - **Mathematical expressions and standalone variables must appear inside LaTeX dollar delimiters ($...$).**
   - If a diagram/graph is omitted, write **[Graph omitted]**.
   - If handwriting is unreadable: **[illegible]**.

**ANSWER-INTERPRETATION RULES:**
- If the student writes “NA”, “N/A”, “Not Applicable”, or clear equivalents → record exactly as **NA**.
- If the student leaves the space blank, crosses it out, makes no meaningful attempt, or provides no answer → record **[No response]**.

Ensure deterministic formatting so subsequent models can grade directly from this aligned format.

Expected questions (if missing, write NA):
{ids_block}
-----------------------
OUTPUT FORMAT:
<think>...</think>
Question <id>
AS:<transcribed answer or placeholder>
<think>...</think>
Question <id>
AS:<transcribed answer or placeholder>
...
==== GRAPH FOUND ANSWERS ====
Graph found in:
- Answer <number> → Page <number>
(one per line)
==== END GRAPH FOUND ===="""

    return prompt
|
| 540 |
|
| 541 |
+
|
| 542 |
+
|
| 543 |
def extract_graph_questions_from_ms(text: str):
|
| 544 |
"""Extract graph questions and page numbers from MS transcript."""
|
| 545 |
clean_text = text.replace("\u00A0", " ").replace("\t", " ")
|
|
|
|
| 713 |
page_img = page.convert("RGB")
|
| 714 |
img_cv = np.array(page_img)
|
| 715 |
img_cv = cv2.cvtColor(img_cv, cv2.COLOR_RGB2BGR)
|
| 716 |
+
h, w, _ = img_cv.shape
|
| 717 |
cell_w_px, cell_h_px = w / cols, h / rows
|
| 718 |
|
| 719 |
page_mappings = [m for m in all_mappings if m.get("page") == page_num]
|
|
|
|
| 773 |
return out_paths
|
| 774 |
|
| 775 |
# ---------------- PIPELINE ----------------
|
| 776 |
+
def align_and_grade_pipeline(qp_path, ms_path, ans_path, subject="Maths", imprint=False):
|
| 777 |
"""
|
| 778 |
Final pipeline with graph-aware grading logic using NEW SDK.
|
|
|
|
| 779 |
"""
|
| 780 |
try:
|
| 781 |
print("π Starting pipeline...")
|
|
|
|
| 793 |
print("β
Upload complete.")
|
| 794 |
|
| 795 |
print("1.i) Transcribing QP+MS (questions first, then full markscheme, with graph detection)...")
|
| 796 |
+
qpms_prompt = QP_MS_TRANSCRIPTION_PROMPT["content"] + "\nAt the end, also list all questions in the markscheme where a graph is expected, in the format:\nGraph expected in:\n- Question <number> β Page <number>\n(One per line, after ==== MARKSCHEME END ====)"
|
| 797 |
qpms_text = gemini_generate_content(qpms_prompt, file_upload_obj=merged_uploaded)
|
| 798 |
print("π QP+MS transcription received. Saving debug file: debug_qpms_transcript.txt")
|
| 799 |
with open("debug_qpms_transcript.txt", "w", encoding="utf-8") as f:
|
|
|
|
| 811 |
extracted_ids = ["NA"]
|
| 812 |
|
| 813 |
print("1.ii) Building AS transcription prompt with expected question IDs and graph detection, sending to Gemini...")
|
| 814 |
+
as_prompt = build_as_cot_prompt_with_expected_ids(extracted_ids, qpms_text) + "\nAt the end, also list all answers where a graph is found, in the format:\nGraph found in:\n- Answer <number> β Page <number>\n(One per line, after all answers)"
|
| 815 |
as_text = gemini_generate_content(as_prompt, file_upload_obj=ans_uploaded)
|
| 816 |
print("π AS transcription received. Saving debug file: debug_as_transcript.txt")
|
| 817 |
with open("debug_as_transcript.txt", "w", encoding="utf-8") as f:
|
|
|
|
| 836 |
if ms_graph_images or as_graph_images:
|
| 837 |
graph_note = "\n\n---\nSome questions require graphs. I've attached the relevant graph pages from QP+MS and from the Answer Sheet. Use them as visual context when grading.\n---\n"
|
| 838 |
grading_input += graph_note
|
| 839 |
+
grading_prompt_obj = get_grading_prompt(subject.lower())
|
| 840 |
+
grading_prompt_system = grading_prompt_obj["content"]
|
| 841 |
grading_images = ms_graph_images + as_graph_images
|
| 842 |
grading_text = gemini_generate_content(grading_prompt_system + "\n\nPlease grade the following transcripts:\n" + grading_input, image_obj=grading_images if grading_images else None)
|
| 843 |
print("π§Ύ Grading output received. Saving debug file: debug_grading.md")
|
|
|
|
| 845 |
f.write(grading_text)
|
| 846 |
|
| 847 |
base_name = os.path.splitext(os.path.basename(ans_path))[0]
|
| 848 |
+
grading_pdf_path = save_as_pdf(grading_text, f"{base_name}_graded.pdf")
|
| 849 |
+
print("π Grading PDF saved:", grading_pdf_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 850 |
|
| 851 |
grading_json = extract_marks_from_grading(grading_text)
|
| 852 |
with open("debug_grading_json.json", "w", encoding="utf-8") as f:
|
|
|
|
| 857 |
if imprint:
|
| 858 |
print("β Imprint option enabled. Starting imprinting process...")
|
| 859 |
imprinted_pdf_path = f"{base_name}_imprinted.pdf"
|
| 860 |
+
imprinted_pdf_path = imprint_marks_using_mapping(ans_path, grading_json, imprinted_pdf_path, extracted_ids)
|
| 861 |
+
print("β
Imprinting finished. Imprinted PDF at:", imprinted_pdf_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 862 |
|
| 863 |
print("π Pipeline finished successfully.")
|
| 864 |
return qpms_text, as_text, grading_text, grading_pdf_path, imprinted_pdf_path
|
|
|
|
| 870 |
return f"β Error: {e}", None, None, None, None
|
| 871 |
|
| 872 |
# ---------------- GRADIO UI ----------------
|
| 873 |
+
with gr.Blocks(title="AI Grading (Pandoc + pdflatex)") as demo:
|
| 874 |
+
gr.Markdown("## π AI Grading β Using Pandoc + pdflatex for PDF Generation")
|
| 875 |
+
gr.Markdown("**β
Now using Pandoc with pdflatex for professional-quality PDF outputs!**")
|
|
|
|
| 876 |
|
| 877 |
with gr.Row():
|
| 878 |
+
qp_file = gr.File(label="π Upload Question Paper (PDF)")
|
| 879 |
+
ms_file = gr.File(label="π Upload Markscheme (PDF)")
|
| 880 |
+
ans_file = gr.File(label="π Upload Student Answer Sheet (PDF)")
|
|
|
|
|
|
|
|
|
|
| 881 |
|
| 882 |
with gr.Row():
|
| 883 |
+
subject_dropdown = gr.Dropdown(
|
| 884 |
+
choices=["Maths", "Science"],
|
| 885 |
+
value="Maths",
|
| 886 |
+
label="π Subject",
|
| 887 |
+
info="Select the subject to apply appropriate grading guidelines"
|
| 888 |
+
)
|
| 889 |
+
imprint_toggle = gr.Checkbox(label="β Imprint Marks on Student Answer Sheet", value=False)
|
| 890 |
|
| 891 |
+
run_button = gr.Button("π Run Pipeline")
|
| 892 |
+
|
| 893 |
with gr.Row():
|
| 894 |
+
qpms_box = gr.Textbox(label="π QP+MS Transcript", lines=12)
|
| 895 |
+
as_box = gr.Textbox(label="π AS Transcript", lines=12)
|
| 896 |
|
| 897 |
+
grading_output_box = gr.Textbox(label="π§Ύ Grading (Markdown)", lines=20)
|
| 898 |
+
grading_pdf_file = gr.File(label="π₯ Download Grading PDF")
|
| 899 |
+
imprint_pdf_file = gr.File(label="π₯ Download Imprinted PDF (Optional)")
|
| 900 |
+
|
| 901 |
+
def run_pipeline(qp_file_obj, ms_file_obj, ans_file_obj, subject_choice, imprint_flag):
    """Click handler: validate the three uploads and run the grading pipeline.

    Args:
        qp_file_obj: Uploaded question paper.
        ms_file_obj: Uploaded markscheme.
        ans_file_obj: Uploaded student answer sheet.
        subject_choice: Subject passed on as ``subject``.
        imprint_flag: Whether marks should be imprinted, passed on as ``imprint``.

    Returns:
        tuple: ``(qpms_text, as_text, grading_text, grading_pdf_path,
        imprinted_pdf_path)``. The three text values are never ``None``;
        when an upload is missing the first one carries the error message.
    """
    if not qp_file_obj or not ms_file_obj or not ans_file_obj:
        return "❌ Please upload all three files", "", "", None, None

    def _upload_path(upload):
        # An upload may be an object exposing the temp path as ``.name`` or a
        # plain path string; accept both.
        return getattr(upload, "name", upload)

    qp_path = _upload_path(qp_file_obj)
    ms_path = _upload_path(ms_file_obj)
    ans_path = _upload_path(ans_file_obj)

    qpms_text, as_text, grading_text, grading_pdf_path, imprinted_pdf_path = align_and_grade_pipeline(
        qp_path, ms_path, ans_path, subject=subject_choice, imprint=imprint_flag
    )

    # Textboxes need strings, so missing text results are mapped to ""
    return qpms_text or "", as_text or "", grading_text or "", grading_pdf_path, imprinted_pdf_path
|
| 914 |
|
| 915 |
run_button.click(
|
| 916 |
fn=run_pipeline,
|
| 917 |
+
inputs=[qp_file, ms_file, ans_file, subject_dropdown, imprint_toggle],
|
| 918 |
outputs=[qpms_box, as_box, grading_output_box, grading_pdf_file, imprint_pdf_file]
|
| 919 |
)
|
| 920 |
|
| 921 |
if __name__ == "__main__":
|
| 922 |
+
demo.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|