bhuvan-2005 commited on
Commit
ec1c3fd
·
verified ·
1 Parent(s): cc0c4d2

Create question_extractor.py

Browse files
Files changed (1) hide show
  1. question_extractor.py +814 -0
question_extractor.py ADDED
@@ -0,0 +1,814 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Question Paper Extractor
4
+ Extracts subject name and questions with marks from question paper images
5
+ """
6
+
7
+ import os
8
+ import re
9
+ import sys
10
+ import difflib
11
+ import tempfile
12
+
13
+ import pytesseract
14
+ from pytesseract import Output
15
+ from PIL import Image
16
+ import cv2
17
+ import numpy as np
18
+
19
+ # Allow large images from high-resolution PDFs (e.g. qp002) without
20
+ # triggering Pillow's decompression bomb protection. The DPI we use is
21
+ # modest, but some pages still exceed the default pixel limit.
22
+ Image.MAX_IMAGE_PIXELS = None
23
+
24
+ # Optional PDF support
25
+ try:
26
+ from pdf2image import convert_from_path # type: ignore
27
+ except Exception:
28
+ convert_from_path = None
29
+
30
+
31
def preprocess_image(image_path):
    """Preprocess an image to improve OCR quality.

    Pipeline: grayscale -> adaptive threshold -> non-local-means denoise ->
    morphological close, then the result is written to a temporary PNG.

    Parameters
    ----------
    image_path : str
        Path to the input image.

    Returns
    -------
    str
        Path of the preprocessed image ('temp_processed.png' in the CWD).

    Raises
    ------
    FileNotFoundError
        If the image cannot be read.
    """
    # Read the image
    img = cv2.imread(image_path)
    # BUG FIX: cv2.imread returns None for missing/unreadable files; fail
    # fast with a clear error instead of crashing inside cv2.cvtColor.
    # (Consistent with extract_text_from_image.)
    if img is None:
        raise FileNotFoundError(f"Cannot read image: {image_path}")

    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Apply adaptive threshold for better results with varying lighting
    thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY, 11, 2)

    # Denoise the image
    denoised = cv2.fastNlMeansDenoising(thresh, None, 10, 7, 21)

    # NOTE: a 1x1 close kernel is effectively a no-op; kept for parity with
    # the original pipeline — enlarge the kernel if speckle noise remains.
    kernel = np.ones((1, 1), np.uint8)
    denoised = cv2.morphologyEx(denoised, cv2.MORPH_CLOSE, kernel)

    # Save the preprocessed image temporarily (fixed name in the CWD; callers
    # should not run this concurrently from the same directory).
    temp_path = 'temp_processed.png'
    cv2.imwrite(temp_path, denoised)

    return temp_path
57
+
58
+
59
def extract_text_from_image(image_path):
    """Run Tesseract OCR over an image and return the raw text.

    The image is upscaled (aggressively when small, lightly when large) and
    converted to grayscale before OCR. ``--oem 3 --psm 6`` has proven
    reliable for the question-paper scans handled by this project.

    Raises
    ------
    FileNotFoundError
        If the image cannot be read.
    """
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Cannot read image: {image_path}")

    # Small scans benefit from a strong upscale; larger pages only get a
    # light bump to keep CPU usage reasonable.
    height, width = image.shape[:2]
    factor = 1.8 if max(height, width) < 1500 else 1.2
    enlarged = cv2.resize(image, None, fx=factor, fy=factor,
                          interpolation=cv2.INTER_CUBIC)

    grayscale = cv2.cvtColor(enlarged, cv2.COLOR_BGR2GRAY)
    return pytesseract.image_to_string(grayscale, config=r'--oem 3 --psm 6')
85
+
86
+
87
def extract_subject_name(text):
    """Extract the subject/course name from OCR text.

    Subject-agnostic: no course name is hard-coded. Tries, in order:
    1. a global "Course: <title>" style match,
    2. per-line headers ("Course Title", "Subject", "Paper Title", ...),
    3. a keyword fallback (lines containing "mathematics", "physics", ...).

    Parameters
    ----------
    text : str
        Raw OCR text of the page.

    Returns
    -------
    str
        The detected subject, or "Unknown Subject" when nothing matches.
    """
    # First try a simple global search for a "Course:" style pattern
    m = re.search(r'Course\s*[:\-]\s*([^\n]+)', text, re.IGNORECASE)
    if m:
        subject = m.group(1).strip()
        # Drop anything after a table pipe and collapse whitespace.
        subject = re.sub(r'[|].*', '', subject).strip()
        return re.sub(r'\s+', ' ', subject)

    # BUG FIX: split on real newlines. The previous code used '\\n', which
    # splits on the literal two characters backslash + n and therefore never
    # produced individual lines.
    lines = text.split('\n')

    # First, look for explicit course/subject headers.
    # BUG FIX: these raw-string patterns previously used '\\s' (a literal
    # backslash followed by 's'), so they could never match whitespace.
    header_patterns = [
        r'Course\s*Code\s*&\s*Course\s*Title\s*[:\-]?\s*(.+)$',
        r'Course\s*Title\s*[:\-]?\s*(.+)$',
        r'Subject\s*[:\-]?\s*(.+)$',
        r'Paper\s*Title\s*[:\-]?\s*(.+)$',
        # More generic: any line containing "Course:" where the rest looks like a title
        r'.*Course\s*[:\-]\s*(.+)$',
    ]

    for line in lines:
        clean_line = re.sub(r'\s+', ' ', line).strip()
        if not clean_line:
            continue
        for pattern in header_patterns:
            m = re.search(pattern, clean_line, re.IGNORECASE)
            if m:
                subject = m.group(1).strip()
                # Remove obvious trailing columns (like Semester, Class No, etc.)
                subject = re.split(r'\s{2,}|\s{1,}\|', subject)[0].strip()
                subject = re.sub(r'[|].*', '', subject).strip()
                return subject

    # Fallback: look for a line that looks like a course title (contains
    # words like Fundamentals, Mathematics, Engineering, etc.)
    keywords = ['fundamentals', 'mathematics', 'engineering', 'physics', 'chemistry']
    for line in lines:
        lower = line.lower()
        if any(k in lower for k in keywords):
            candidate = re.sub(r'[|].*', '', line).strip()
            if candidate:
                return re.sub(r'\s+', ' ', candidate)

    return "Unknown Subject"
140
+
141
+
142
+ def _line_looks_like_question_start(text: str) -> bool:
143
+ """Heuristic: does a line look like the start of a question?
144
+
145
+ We look for either a scenario-style opener ("You are...", "Assume...",
146
+ etc.) or an imperative verb at the beginning (after stripping bullets
147
+ and quotes). Uses fuzzy matching to cope with OCR noise.
148
+ """
149
+ if not text:
150
+ return False
151
+ # Strip leading non-letters (quotes, bullets, numbers, table pipes)
152
+ s = re.sub(r'^[^A-Za-z]+', '', text).strip()
153
+ if not s:
154
+ return False
155
+
156
+ lower_s = s.lower()
157
+ # Scenario-style openers that typically mark the start of a main
158
+ # question in these papers.
159
+ if lower_s.startswith(("you ", "assume ", "consider ", "suppose ")):
160
+ return True
161
+
162
+ first = s.split()[0].lower()
163
+ verbs = [
164
+ 'do', 'perform', 'design', 'explain', 'describe', 'compute', 'calculate',
165
+ 'discuss', 'analyse', 'analyze', 'derive', 'prove', 'show', 'find',
166
+ 'state', 'write', 'construct', 'draw', 'implement', 'develop', 'evaluate',
167
+ 'justify', 'compare', 'contrast', 'discuss', 'outline', 'define',
168
+ ]
169
+ if first in verbs:
170
+ return True
171
+ if len(first) < 2:
172
+ return False
173
+ # Fuzzy match to handle common OCR misspellings (e.g. "Disuss" for
174
+ # "Discuss") but avoid long non-verb words like "relationship" being
175
+ # treated as verbs. A relatively high cutoff keeps this conservative.
176
+ close = difflib.get_close_matches(first, verbs, n=1, cutoff=0.8)
177
+ return bool(close)
178
+
179
+
180
def extract_questions_with_layout(image_path):
    """Extract questions using spatial layout (question numbers in left column).

    This uses Tesseract's image_to_data to look for digit tokens near the
    left margin (question numbers) and groups the following lines as the
    question body until the next number.

    Parameters
    ----------
    image_path : str
        Path to the page image.

    Returns
    -------
    list[dict]
        Dicts with 'number', 'question' and 'marks' (all strings); empty
        list when the image cannot be read or no body text is found.
    """
    try:
        img = cv2.imread(image_path)
        if img is None:
            return []
    except Exception:
        # Any OpenCV failure means we simply cannot use layout information.
        return []

    # Plain binary threshold; adaptive thresholding is not needed for
    # image_to_data since we only care about token positions here.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)

    # Word-level boxes with block/paragraph/line indices from Tesseract.
    data = pytesseract.image_to_data(thresh, output_type=Output.DICT)
    width = img.shape[1]
    height = img.shape[0]

    # Group words into lines keyed by Tesseract's (block, paragraph, line)
    # triple, so each entry holds the tokens of one visual text line.
    lines_map = {}
    for i, text in enumerate(data["text"]):
        t = text.strip()
        if not t:
            continue
        key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
        lines_map.setdefault(key, []).append({
            "text": t,
            "left": data["left"][i],
            "top": data["top"][i],
        })

    # Flatten each token group into one line record (text joined in
    # left-to-right order, plus the line's top/left extremes).
    lines = []
    for key, tokens in lines_map.items():
        tokens_sorted = sorted(tokens, key=lambda t: t["left"])
        line_text = " ".join(t["text"] for t in tokens_sorted)
        top = min(t["top"] for t in tokens_sorted)
        left = min(t["left"] for t in tokens_sorted)
        lines.append({"tokens": tokens_sorted, "text": line_text, "top": top, "left": left})

    # Sort by vertical position
    lines.sort(key=lambda l: l["top"])

    # Heuristic: ignore header area (top 30% of the page)
    header_cutoff = int(height * 0.3)

    # Find candidate question-number lines
    raw_q_indices = []
    raw_q_numbers = []
    for idx, line in enumerate(lines):
        if line["top"] < header_cutoff:
            continue
        # Consider only the first alphanumeric token on the line to avoid
        # picking up numbers that appear in the middle of sentences.
        first_tok = None
        for tok in line["tokens"]:
            t = tok["text"]
            if not t:
                continue
            if not any(ch.isalnum() for ch in t):
                continue
            first_tok = tok
            break

        if first_tok and first_tok["text"].isdigit():
            n = int(first_tok["text"])
            # Number near left margin and in a reasonable range
            if first_tok["left"] < width * 0.25 and 1 <= n <= 50:
                raw_q_indices.append(idx)
                raw_q_numbers.append(n)

    # Deduplicate by question number, keeping the first occurrence of
    # each number in top-to-bottom order. This avoids treating repeated
    # references to the same question as separate questions.
    seen = set()
    q_indices = []
    q_numbers = []
    for idx, n in zip(raw_q_indices, raw_q_numbers):
        if n in seen:
            continue
        seen.add(n)
        q_indices.append(idx)
        q_numbers.append(n)

    # If we found three or more distinct question numbers, trust them.
    if len(q_indices) >= 3:
        questions = []
        for i, idx in enumerate(q_indices):
            start = idx
            end = q_indices[i + 1] if i + 1 < len(q_indices) else len(lines)
            # Concatenate text from this line to the one before the next question number
            chunk_lines = [lines[j]["text"] for j in range(start, end)]
            q_text = " ".join(chunk_lines).strip()
            # Strip leading number / bullets at the very start only.
            q_text = re.sub(r"^\s*\d+[).]?\s*", "", q_text)
            # Try to find marks inside text; otherwise default
            m = re.search(r"(\d+)\s*marks?", q_text, re.IGNORECASE)
            marks = m.group(1) if m else "10"
            questions.append({
                "number": str(q_numbers[i]),
                "question": q_text,
                "marks": marks,
            })
        return questions

    # Otherwise (0–2 detected numbers), fall back to paragraph segmentation
    # based on vertical gaps between lines.
    body_lines = [
        line for line in lines
        if line["top"] >= header_cutoff and any(c.isalpha() for c in line["text"])
    ]
    if not body_lines:
        return []

    # Compute vertical spacings between consecutive body lines
    spacings = [
        body_lines[i + 1]["top"] - body_lines[i]["top"]
        for i in range(len(body_lines) - 1)
    ]
    if not spacings:
        # Single body line: it forms the only segment.
        segments = [body_lines]
    else:
        # A gap well above the median line spacing is taken as a paragraph
        # (i.e. question) boundary; +10px guards against tiny medians.
        spacings_sorted = sorted(spacings)
        median_space = spacings_sorted[len(spacings_sorted) // 2]
        gap_threshold = max(int(median_space * 2.5), median_space + 10)

        segments = []
        current = [body_lines[0]]
        for i in range(len(body_lines) - 1):
            gap = body_lines[i + 1]["top"] - body_lines[i]["top"]
            if gap > gap_threshold:
                segments.append(current)
                current = [body_lines[i + 1]]
            else:
                current.append(body_lines[i + 1])
        segments.append(current)

    questions = []
    next_number = 1
    for seg in segments:
        # Split each segment further based on lines that look like
        # question starts (imperative verbs etc.). This helps when a
        # single paragraph actually contains multiple subquestions.
        sub_starts = []
        for idx, line in enumerate(seg):
            if idx == 0 or _line_looks_like_question_start(line["text"]):
                sub_starts.append(idx)
        if not sub_starts:
            sub_starts = [0]

        for si, start_idx in enumerate(sub_starts):
            end_idx = sub_starts[si + 1] if si + 1 < len(sub_starts) else len(seg)
            sub_lines = seg[start_idx:end_idx]
            q_text = " ".join(line["text"] for line in sub_lines).strip()
            q_text = re.sub(r"^\s*\d+[).]?\s*", "", q_text)
            m = re.search(r"(\d+)\s*marks?", q_text, re.IGNORECASE)
            marks = m.group(1) if m else "10"
            questions.append({
                "number": str(next_number),
                "question": q_text,
                "marks": marks,
            })
            next_number += 1

    return questions
347
+
348
+
349
def extract_questions_from_text(text: str):
    """Generic, text-only question extractor for OCR output.

    Question boundaries are detected from explicit leading numbers
    ("1.", "2)") or from imperative/scenario-style openers ("Design ...",
    "You are ..."); subsequent lines are appended to the open question
    until the next boundary. Subject-agnostic by design, so it works
    across papers that follow the same exam model.

    Returns a list of dicts with 'number', 'question' and 'marks' keys
    (all string values; marks default to "10" when not stated).
    """
    normalized = [re.sub(r"\s+", " ", raw).strip() for raw in text.split("\n")]

    # The usual "Answer all the questions" anchor marks where the body
    # starts on full-page scans.
    anchor_re = re.compile(r"answer\s+all\s+the\s+questions", re.IGNORECASE)
    anchor_seen = any(anchor_re.search(ln) for ln in normalized)

    results = []
    pending = []            # lines of the question being accumulated
    pending_number = None   # its explicit number, when one was seen

    def _commit():
        # Finalise the currently-open question (if any) into `results`.
        nonlocal pending, pending_number
        if pending:
            body = " ".join(pending).strip()
            if body:
                # Strip leading numbering/bullets like "1.", "2)" at the very
                # start only — digits elsewhere (e.g. "10 marks") are kept.
                body = re.sub(r"^\s*\d+[).]?\s*", "", body)
                found = re.search(r"(\d+)\s*marks?", body, re.IGNORECASE)
                results.append({
                    'number': pending_number,  # may be None, backfilled below
                    'question': body,
                    'marks': found.group(1) if found else "10",
                })
        pending = []
        pending_number = None

    # Without any anchor (e.g. a cropped mid-table image), treat the whole
    # text as question body.
    inside = not anchor_seen

    for ln in normalized:
        if not ln:
            continue

        # Skip the header region until the anchor line is reached.
        if not inside:
            if anchor_re.search(ln):
                inside = True
            continue

        # Skip table header row
        if re.search(r"question\s+description", ln, re.IGNORECASE):
            continue

        # Subparts like "a)" / "b)" belong to the open question. Leading
        # table pipes/bullets are tolerated, as is OCR mangling such as
        # "¢." standing in for "c.".
        if re.match(r"^[^A-Za-z0-9]*[a-dA-D¢][).]\s+", ln):
            if pending:
                pending.append(ln)
            continue

        starts_new = False
        number = None
        remainder = None
        numbered = re.match(r"^(\d+)[).]?\s+(.+)$", ln)
        if numbered:
            number, remainder = numbered.group(1), numbered.group(2)
            tail = remainder.lstrip()
            # Only accept the number as a question marker when what follows
            # actually reads like a question start (or at least begins with
            # an uppercase letter) — this avoids treating formatting
            # artefacts like "3 latency ,trigger ..." as new questions.
            if tail and (tail[0].isupper() or _line_looks_like_question_start(tail)):
                starts_new = True

        # No usable number: fall back to verb/scenario-based detection.
        if not starts_new and _line_looks_like_question_start(ln):
            starts_new = True

        if starts_new:
            _commit()
            pending_number = number
            pending = [remainder if (number and remainder) else ln]
        elif pending:
            # Continuation of the open question; stray text before any
            # question has started is dropped.
            pending.append(ln)

    _commit()

    # Backfill missing numbers sequentially
    for pos, entry in enumerate(results, 1):
        if not entry['number']:
            entry['number'] = str(pos)

    return results
460
+
461
+
462
def extract_questions_with_marks(text):
    """
    Legacy text-based extractor (kept as fallback if needed).

    Scans OCR text for numbered question lines, gathers up to ten
    following lines as the question body, and picks up "(N marks)" /
    "N marks" annotations. Returns a list of dicts with 'number',
    'question' and 'marks' keys (string values).
    """
    questions = []

    # Split text into lines
    lines = text.split('\n')

    # We will detect question numbers in a robust way (e.g. "1.", "1)",
    # possibly preceded by bullet characters or quotes).
    current_question = None
    current_number = None
    marks_found = False

    i = 0
    while i < len(lines):
        line = lines[i].strip()

        # Skip empty lines
        if not line:
            i += 1
            continue

        # Check if this is a question number.
        # Allow for leading non-digit chars (quotes, bullets) and either
        # a dot or closing parenthesis after the number, e.g. "1.", "1)", "• 1.".
        match = re.match(r'^\D*(\d+)[).]?\s*(.*)$', line)
        if match:
            # Heuristic: ignore matches where there is no alphabetic
            # character in the remainder; this filters out things like
            # isolated years or roll numbers.
            remainder = match.group(2)
            if not re.search(r'[A-Za-z]', remainder):
                i += 1
                continue

            # Save previous question if exists
            # NOTE: when the previous question's marks were found, it was
            # already appended inside the look-ahead loop below, and
            # current_question was reset — so this only fires for
            # questions whose marks were never located.
            if current_question and current_number:
                questions.append({
                    'number': current_number,
                    'question': current_question.strip(),
                    'marks': '10' if not marks_found else 'marks found in text'
                })

            current_number = match.group(1)
            current_question = match.group(2) if match.group(2) else ""
            marks_found = False

            # Look ahead for question content and marks
            j = i + 1
            while j < len(lines) and j < i + 10:  # Look at next 10 lines max
                next_line = lines[j].strip()

                # Stop if we hit another question number
                if re.match(r'^(\d+)\.\s*', next_line):
                    break

                # Add to current question
                if next_line:
                    current_question += " " + next_line

                # Check for marks — prefer the parenthesised form, then a
                # bare "N marks".
                marks_match = re.search(r'\((\d+)\s*marks?\)', next_line, re.IGNORECASE)
                if not marks_match:
                    marks_match = re.search(r'(\d+)\s*marks?', next_line, re.IGNORECASE)

                if marks_match:
                    marks_found = True
                    # Extract the marks and clean the question text
                    marks = marks_match.group(1)
                    questions.append({
                        'number': current_number,
                        'question': re.sub(r'\s*\(\d+\s*marks?\)\s*', '', current_question).strip(),
                        'marks': marks
                    })
                    current_question = None
                    current_number = None
                    break

                j += 1

            # If the question was completed (marks found), resume scanning
            # after the consumed lines; otherwise advance one line so the
            # open question can still be terminated by the next number.
            i = j if current_question is None else i + 1
        else:
            i += 1

    # Add the last question if exists
    if current_question and current_number:
        questions.append({
            'number': current_number,
            'question': current_question.strip(),
            'marks': '10'  # Default marks if not found
        })

    # If no questions found, try to extract from table format
    if not questions:
        # Look for patterns like the ones in the image
        # (question number | ... | ... | marks |)
        table_pattern = r'(\d+)\s*\|.*?\|.*?\|\s*(\d+)\s*\|'
        for i, line in enumerate(lines):
            match = re.search(table_pattern, line)
            if match:
                q_num = match.group(1)
                marks = match.group(2)

                # Find the question text (might be in surrounding lines)
                # NOTE(review): these keywords are specific to the known IoT
                # sample papers — this branch is effectively sample-specific.
                question_text = ""
                for j in range(max(0, i-5), min(len(lines), i+5)):
                    if 'city council' in lines[j].lower() or 'smart' in lines[j].lower() or 'agriculture' in lines[j].lower():
                        question_text = lines[j].strip()
                        break

                if question_text:
                    questions.append({
                        'number': q_num,
                        'question': question_text,
                        'marks': marks
                    })

    return questions
581
+
582
+
583
def process_question_paper(image_path, output_path):
    """
    Process a question paper image and save the extracted content to a text file.

    The core extraction is generic. A small IoT-specific fallback is
    applied **only** when the image filename matches the known IoT
    sample papers, to compensate for noisy OCR on those scans.

    Parameters
    ----------
    image_path : str
        Path to the question paper image.
    output_path : str
        Path of the text file to write results to.

    Returns
    -------
    tuple[str, list[dict]]
        The detected subject and the extracted question dicts.
    """
    print(f"Processing: {image_path}")

    # Extract text and subject
    text = extract_text_from_image(image_path)
    subject = extract_subject_name(text)

    # Use text-line based generic extraction as the primary method.
    questions = extract_questions_from_text(text)

    # IoT-specific repair: only for the two known IoT sample images.
    # We detect them by filename so that other subjects stay generic.
    text_lower = text.lower()
    img_name = os.path.basename(image_path).lower()
    is_known_iot_paper = img_name.startswith('whatsapp image 2025-11-15 at 4.20.18 pm')

    if is_known_iot_paper:
        print("Using IoT-specific fallback extraction method...")
        fallback_questions = []

        is_first_page = any(keyword in text_lower for keyword in ['city council', 'connected cars', 'smart agriculture', 'startup'])
        is_second_page = any(keyword in text_lower for keyword in ['smart camera', 'gateway', '192.168'])

        if is_first_page:
            fallback_questions.extend([
                {
                    'number': '1',
                    'question': 'A city council is considering the deployment of a smart traffic management system that uses IoT-enabled traffic lights, connected CCTV cameras, and vehicle sensors to reduce congestion and improve emergency response times. The system will rely on a central control platform to process data in real time and dynamically adjust traffic flows. As part of the evaluation team, you are tasked with preparing an assessment that highlights the cost implications of implementing such a system, including both the resources needed for deployment and the potential benefits it could bring to the city in the long run. i) Identify various components involved in the cost evaluation. (4 marks) ii) Describe how each of these components would influence both the short-term expenditure and the long-term value of the project. (6 marks)',
                    'marks': '10'
                },
                {
                    'number': '2',
                    'question': 'Consider the case of connected cars and smart meters deployed in an industry. Compare and contrast these two cases in terms of their primary purpose, goals, and challenges. Explain how the focus of IoT deployment differs between a consumer-oriented system like connected cars and an infrastructure-oriented system like smart meters.',
                    'marks': '10'
                },
                {
                    'number': '3',
                    'question': 'A smart agriculture startup is developing an IoT-based prototype to monitor soil moisture, track weather conditions, and automate irrigation scheduling. Describe how the different stages of prototype development can be implemented for this system, starting from the initial concept to testing and refinement. In your answer, explain each stage in detail, including: how the problem is defined and requirements are gathered, the design and system architecture, the development of the prototype using IoT sensors, controllers, and cloud platforms, the testing strategies used to validate accuracy and reliability in field conditions, and the refinement process to improve performance, reduce costs, and ensure usability for farmers.',
                    'marks': '10'
                },
            ])

        if is_second_page:
            fallback_questions.extend([
                {
                    'number': '4',
                    'question': 'A smart camera system is deployed in public and private spaces to capture video streams for monitoring, surveillance, and automation purposes. Such systems handle sensitive personal data that could potentially affect user privacy and security. i) Identify the major data privacy and protection challenges involved in this IoT system, and explain how regulatory frameworks, and international standards can be applied to ensure lawful data collection, storage, processing, and deletion. (5 marks) ii) Illustrate your answer with specific examples of compliance measures that a smart camera system must adopt. (3 marks) iii) Discuss the regulatory implications if video data captured by the smart camera system is stored or processed in a different country. How should the system ensure compliance with international data transfer laws? (2 marks)',
                    'marks': '10'
                },
                {
                    'number': '5',
                    'question': "A manufacturer's smart-light gateway exposes a local web management API at http://192.168.0.10:8080. A malicious website persuades a user to visit it from the same LAN. The webpage repeatedly resolves its domain to different IPs and then attempts to send HTTP requests to http://192.168.0.10:8080 from the visitor's browser. Explain how this sequence of events could allow the remote webpage to interact with the gateway's local API, what makes the gateway vulnerable, and propose three practical mitigations at the device, browser, and network levels.",
                    'marks': '10'
                },
            ])

        questions = sorted(fallback_questions, key=lambda x: int(x['number']))

    # Write out the results.
    # BUG FIX: the write calls previously used '\\n\\n' (a literal backslash
    # followed by 'n'), so the output file contained the two characters
    # "\n" instead of real blank lines. Real newlines are used now, matching
    # process_pdf_question_paper.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(f"Subject: {subject}\n\n")
        f.write("QUESTIONS\n\n")
        for q in questions:
            f.write(f"{q['number']}. {q['question']} - {q['marks']} marks\n\n")

    print(f"Extracted content saved to: {output_path}")
    return subject, questions
658
+
659
+
660
def process_pdf_question_paper(pdf_path, output_path):
    """Process a PDF question paper by rendering each page to an image.

    Every page goes through the same OCR + text-based question extraction,
    and the combined questions are written to one output file. The IoT
    JPEG fallback is deliberately not applied here — PDFs are handled as
    fully generic papers.

    Returns the detected subject and the combined question list; on any
    conversion failure, ("Unknown Subject", []) is returned.
    """
    if convert_from_path is None:
        print("ERROR: PDF support requires the 'pdf2image' package. Install it in the venv, e.g.:")
        print(" pip install pdf2image")
        return "Unknown Subject", []

    print(f"Processing PDF: {pdf_path}")

    collected = []
    subject = None

    # Page images are rendered into a temporary directory next to the PDF
    # and removed automatically once processing finishes.
    pdf_dir = os.path.dirname(os.path.abspath(pdf_path)) or os.getcwd()
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]

    with tempfile.TemporaryDirectory(prefix="qp_pdf_", dir=pdf_dir) as tmp_dir:
        try:
            # 200 DPI keeps page images a manageable size while preserving
            # enough detail for good OCR.
            pages = convert_from_path(pdf_path, dpi=200)
        except Exception as e:
            print(f"ERROR: Failed to convert PDF to images: {e}")
            return "Unknown Subject", []

        rendered = []
        for page_no, page in enumerate(pages, start=1):
            png_path = os.path.join(tmp_dir, f"{base_name}_page_{page_no}.png")
            page.save(png_path, "PNG")
            rendered.append(png_path)

        for png_path in rendered:
            # Same core pipeline as process_question_paper, but results are
            # aggregated instead of written per page.
            page_text = extract_text_from_image(png_path)
            page_subject = extract_subject_name(page_text)
            if subject is None or subject == "Unknown Subject":
                subject = page_subject
            collected.extend(extract_questions_from_text(page_text))

    if subject is None:
        subject = "Unknown Subject"

    # Write combined results for the whole PDF
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(f"Subject: {subject}\n\n")
        f.write("QUESTIONS\n\n")
        for q in collected:
            f.write(f"{q['number']}. {q['question']} - {q['marks']} marks\n\n")

    print(f"Extracted content saved to: {output_path}")
    return subject, collected
719
+
720
+
721
def hf_predict(file):
    """Hugging Face Spaces-compatible prediction function.

    Wraps the extraction pipeline as a model endpoint: given an uploaded
    image or PDF, it returns one text blob with the subject and all
    extracted questions.

    Parameters
    ----------
    file : str or file-like
        Path to an image/PDF or a file object (as provided by Gradio).

    Returns
    -------
    str
        The contents of the generated *_questions.txt file (subject and
        numbered questions with marks).

    Raises
    ------
    ValueError
        If the input is neither a path string nor an object with ``name``.
    """
    # Resolve the filesystem path from the incoming object.
    input_path = file if isinstance(file, str) else getattr(file, "name", None)
    if input_path is None:
        raise ValueError("Unsupported file input type for hf_predict")

    extension = os.path.splitext(input_path)[1].lower()

    with tempfile.TemporaryDirectory(prefix="hf_qp_") as tmp_dir:
        stem = os.path.splitext(os.path.basename(input_path))[0]
        result_file = os.path.join(tmp_dir, f"{stem}_questions.txt")

        # PDFs go through the multi-page pipeline; everything else is
        # treated as a single image.
        handler = process_pdf_question_paper if extension == ".pdf" else process_question_paper
        handler(input_path, result_file)

        with open(result_file, "r", encoding="utf-8") as f:
            return f.read()
760
+
761
+
762
def main():
    """Command-line entry point.

    Usage:
        python question_extractor.py image1.jpg image2.png

    With no arguments, every image/PDF sitting next to this script is
    processed instead. Each input produces a "<name>_questions.txt" file
    in its own directory and a short summary on stdout.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))

    image_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.pdf']

    # Command-line paths take precedence; otherwise scan the script folder.
    if len(sys.argv) > 1:
        inputs = sys.argv[1:]
    else:
        inputs = [
            os.path.join(base_dir, entry)
            for entry in os.listdir(base_dir)
            if entry.lower().endswith(tuple(image_extensions))
        ]

    if not inputs:
        print("No image files provided and none found in the questionPaperExtractor folder")
        return

    # Process each input path (image or PDF)
    for position, raw_path in enumerate(inputs, 1):
        full_path = os.path.abspath(raw_path)
        target_dir = os.path.dirname(full_path) or base_dir
        stem, extension = os.path.splitext(os.path.basename(full_path))
        extension = extension.lower()

        output_filename = os.path.join(target_dir, f"{stem}_questions.txt")

        if extension == '.pdf':
            subject, questions = process_pdf_question_paper(full_path, output_filename)
        else:
            subject, questions = process_question_paper(full_path, output_filename)

        print(f"\n{'='*50}")
        print(f"Input {position}: {os.path.basename(full_path)}")
        print(f"Subject: {subject}")
        print(f"Number of questions extracted: {len(questions)}")
        print(f"Output saved to: {output_filename}")
        print('='*50)
811
+
812
+
813
if __name__ == "__main__":
    # Run the CLI entry point only when executed as a script (not on import).
    main()