atz21 committed on
Commit
5268488
·
verified ·
1 Parent(s): a5a195e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -28
app.py CHANGED
@@ -28,7 +28,7 @@ INPUT: This file is a PDF that first contains the Question Paper and immediately
28
  TASK:
29
  1. Transcribe EXACTLY all the questions FIRST (with their total marks).
30
  2. After ALL questions, transcribe the Markscheme exactly, preserving M/A/R notation in brackets.
31
- 3. Always number the questions sequentially (Question 1, Question 2, Question 3, …) **in the order they appear in the PDF**, even if the PDF shows a different number or leaves it blank. Do NOT skip or leave Question: blank. Never start a question other than question 1 ( even if it is labelled in pdf as 8 name it 1)
32
  4. After the markscheme, DETECT and FLAG all questions in the markscheme where a graph/diagram is expected. For each, output the question number and the page number in the format below.
33
  FORMAT:
34
  ==== PAPER TOTAL MARKS ====
@@ -263,33 +263,38 @@ AS:
263
 
264
  # Robust parsing functions for graph detection
265
 
266
def extract_graph_questions_from_ms(ms_text):
    """
    Parse LLM output for Markscheme to extract questions/pages where a graph is expected.

    The text is searched for a block delimited by
    ``==== GRAPH EXPECTED QUESTIONS ====`` and ``==== END GRAPH EXPECTED ====``.
    An optional ``Graph expected in:`` label may follow the opening marker.
    Inside the block, every line shaped like ``- Question <n> <sep> Page <p>``
    is collected, where ``<sep>`` is one of ``→``, ``->``, ``-`` or ``:``.

    Args:
        ms_text: Markscheme transcript text. ``None`` or an empty string
            yields an empty result.

    Returns:
        dict: ``{question_number: ms_page_number}`` with integer keys and
        values. Empty when the block is absent or contains no matching lines.
    """
    if not ms_text:
        return {}

    # NOTE: the header pattern must use a single-backslash ``\s`` inside the
    # raw string; ``\\s`` would demand a literal backslash in the text and
    # the block would never be found.
    block = re.search(
        r"==== GRAPH EXPECTED QUESTIONS ====\s*(?:Graph expected in:)?"
        r"(.*?)==== END GRAPH EXPECTED ====",
        ms_text,
        re.DOTALL,
    )
    if block is None:
        return {}

    # ``->`` is tried before the single-character separators so that the
    # leading ``-`` of an ASCII arrow is not consumed on its own.
    line_pattern = re.compile(
        r"-\s*Question\s*(\d+)\s*(?:->|[\u2192\-:])\s*Page\s*(\d+)"
    )
    mapping = {}
    for line in block.group(1).splitlines():
        m = line_pattern.match(line.strip())
        if m:
            mapping[int(m.group(1))] = int(m.group(2))
    return mapping
279
-
280
def extract_graph_answers_from_as(as_text):
    """
    Parse LLM output for Answer Sheet to extract answers/pages where a graph was found.

    The text is searched for a block delimited by
    ``==== GRAPH FOUND ANSWERS ====`` and ``==== END GRAPH FOUND ====``.
    An optional ``Graph found in:`` label may follow the opening marker.
    Inside the block, every line shaped like ``- Answer <n> <sep> Page <p>``
    is collected, where ``<sep>`` is one of ``→``, ``->``, ``-`` or ``:``.

    Args:
        as_text: Answer-sheet transcript text. ``None`` or an empty string
            yields an empty result.

    Returns:
        dict: ``{answer_number: as_page_number}`` with integer keys and
        values. Empty when the block is absent or contains no matching lines.
    """
    if not as_text:
        return {}

    # NOTE: the header pattern must use a single-backslash ``\s`` inside the
    # raw string; ``\\s`` would demand a literal backslash in the text and
    # the block would never be found.
    block = re.search(
        r"==== GRAPH FOUND ANSWERS ====\s*(?:Graph found in:)?"
        r"(.*?)==== END GRAPH FOUND ====",
        as_text,
        re.DOTALL,
    )
    if block is None:
        return {}

    # ``->`` is tried before the single-character separators so that the
    # leading ``-`` of an ASCII arrow is not consumed on its own.
    line_pattern = re.compile(
        r"-\s*Answer\s*(\d+)\s*(?:->|[\u2192\-:])\s*Page\s*(\d+)"
    )
    mapping = {}
    for line in block.group(1).splitlines():
        m = line_pattern.match(line.strip())
        if m:
            mapping[int(m.group(1))] = int(m.group(2))
    return mapping
 
 
 
 
 
293
 
294
  def extract_marks_from_grading(grading_text):
295
  """
 
28
  TASK:
29
  1. Transcribe EXACTLY all the questions FIRST (with their total marks).
30
  2. After ALL questions, transcribe the Markscheme exactly, preserving M/A/R notation in brackets.
31
+ 3. Always number the questions sequentially (Question 1, Question 2, Question 3a,Question 3b …) **in the order they appear in the PDF**, even if the PDF shows a different number or leaves it blank. Do NOT skip or leave Question: blank. Never start a question other than question 1 ( even if it is labelled in pdf as 8 name it 1)
32
  4. After the markscheme, DETECT and FLAG all questions in the markscheme where a graph/diagram is expected. For each, output the question number and the page number in the format below.
33
  FORMAT:
34
  ==== PAPER TOTAL MARKS ====
 
263
 
264
  # Robust parsing functions for graph detection
265
 
266
def extract_graph_questions_from_ms(text: str) -> dict:
    """Extract graph questions and page numbers from MS transcript.

    The transcript is searched for a block delimited by
    ``==== GRAPH EXPECTED QUESTIONS ====`` and ``==== END GRAPH EXPECTED ====``.
    Each line in that block shaped like ``- Question <id> <sep> Page <n>`` is
    collected. ``<id>`` may contain digits, letters, dots and parentheses
    (e.g. ``3a`` or ``4(b)``). ``<sep>`` may be ``→``, ``->``, ``=>``, a
    hyphen/en dash/em dash, or ``:``. Matching is case-insensitive.

    Args:
        text: Markscheme transcript text. ``None`` or an empty string yields
            an empty result.

    Returns:
        Mapping of question id (string, as written in the transcript) to page
        number (int). Empty when the block is absent or has no matching lines.
    """
    if not text:
        return {}

    # Replace non-breaking spaces and tabs with plain spaces before matching.
    clean_text = text.replace("\u00A0", " ").replace("\t", " ")
    block = re.search(
        r"==== GRAPH EXPECTED QUESTIONS ====\s*(.*?)\s*==== END GRAPH EXPECTED ====",
        clean_text,
        re.S,
    )
    if block is None:
        return {}

    # Compiled once, outside the loop. Multi-character arrows are listed
    # before the single-character class so "->" is not split into "-" + ">".
    line_pattern = re.compile(
        r"-\s*Question\s+([\dA-Za-z.()]+)\s*"
        r"(?:\u2192|->|=>|[-\u2013\u2014:])\s*Page\s*(\d+)",
        re.IGNORECASE,
    )
    graph_dict = {}
    for line in block.group(1).splitlines():
        entry = line_pattern.match(line.strip())
        if entry:
            q_id, page = entry.groups()
            graph_dict[q_id] = int(page)
    return graph_dict
282
+
283
def extract_graph_answers_from_as(text: str) -> dict:
    """Extract graph answers and page numbers from AS transcript.

    The transcript is searched for a block delimited by
    ``==== GRAPH FOUND ANSWERS ====`` and ``==== END GRAPH FOUND ====``.
    Each line in that block shaped like ``- Answer <id> <sep> Page <n>`` is
    collected. ``<id>`` may contain digits, letters, dots and parentheses
    (e.g. ``3a`` or ``4(b)``). ``<sep>`` may be ``→``, ``->``, ``=>``, a
    hyphen/en dash/em dash, or ``:``. Matching is case-insensitive.

    Args:
        text: Answer-sheet transcript text. ``None`` or an empty string yields
            an empty result.

    Returns:
        Mapping of answer id (string, as written in the transcript) to page
        number (int). Empty when the block is absent or has no matching lines.
    """
    if not text:
        return {}

    # Replace non-breaking spaces and tabs with plain spaces before matching.
    clean_text = text.replace("\u00A0", " ").replace("\t", " ")
    block = re.search(
        r"==== GRAPH FOUND ANSWERS ====\s*(.*?)\s*==== END GRAPH FOUND ====",
        clean_text,
        re.S,
    )
    if block is None:
        return {}

    # Compiled once, outside the loop. Multi-character arrows are listed
    # before the single-character class so "->" is not split into "-" + ">".
    line_pattern = re.compile(
        r"-\s*Answer\s+([\dA-Za-z.()]+)\s*"
        r"(?:\u2192|->|=>|[-\u2013\u2014:])\s*Page\s*(\d+)",
        re.IGNORECASE,
    )
    graph_dict = {}
    for line in block.group(1).splitlines():
        entry = line_pattern.match(line.strip())
        if entry:
            ans_id, page = entry.groups()
            graph_dict[ans_id] = int(page)
    return graph_dict
298
 
299
  def extract_marks_from_grading(grading_text):
300
  """