bhuvan-2005 committed on
Commit
5190f7f
·
verified ·
1 Parent(s): 19d31b9

Update question_extractor.py

Browse files
Files changed (1) hide show
  1. question_extractor.py +9 -2
question_extractor.py CHANGED
@@ -300,6 +300,12 @@ def extract_questions_with_layout(image_path):
300
  for idx, line in enumerate(lines):
301
  if line["top"] < header_cutoff:
302
  continue
 
 
 
 
 
 
303
  # Consider only the first alphanumeric token on the line to avoid
304
  # picking up numbers that appear in the middle of sentences.
305
  first_tok = None
@@ -312,10 +318,11 @@ def extract_questions_with_layout(image_path):
312
  first_tok = tok
313
  break
314
 
 
 
315
  if first_tok and first_tok["text"].isdigit():
316
  n = int(first_tok["text"])
317
- # Number near left margin and in a reasonable range
318
- if first_tok["left"] < width * 0.25 and 1 <= n <= 50:
319
  raw_q_indices.append(idx)
320
  raw_q_numbers.append(n)
321
 
 
300
  for idx, line in enumerate(lines):
301
  if line["top"] < header_cutoff:
302
  continue
303
+ text_lower = line["text"].lower()
304
+ # Skip obvious table header rows for VIT-style papers
305
+ if "q. no" in text_lower or "q no" in text_lower:
306
+ continue
307
+ if "description" in text_lower and "marks" in text_lower:
308
+ continue
309
  # Consider only the first alphanumeric token on the line to avoid
310
  # picking up numbers that appear in the middle of sentences.
311
  first_tok = None
 
318
  first_tok = tok
319
  break
320
 
321
+ # Require a pure integer token in a reasonable range, in the
322
+ # left-most part of the page (Q.No column in VIT tables).
323
  if first_tok and first_tok["text"].isdigit():
324
  n = int(first_tok["text"])
325
+ if 1 <= n <= 50 and first_tok["left"] < width * 0.2:
 
326
  raw_q_indices.append(idx)
327
  raw_q_numbers.append(n)
328