Spaces:
Sleeping
Sleeping
Update question_extractor.py
Browse files - question_extractor.py +9 -2
question_extractor.py
CHANGED
|
@@ -300,6 +300,12 @@ def extract_questions_with_layout(image_path):
|
|
| 300 |
for idx, line in enumerate(lines):
|
| 301 |
if line["top"] < header_cutoff:
|
| 302 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 303 |
# Consider only the first alphanumeric token on the line to avoid
|
| 304 |
# picking up numbers that appear in the middle of sentences.
|
| 305 |
first_tok = None
|
|
@@ -312,10 +318,11 @@ def extract_questions_with_layout(image_path):
|
|
| 312 |
first_tok = tok
|
| 313 |
break
|
| 314 |
|
|
|
|
|
|
|
| 315 |
if first_tok and first_tok["text"].isdigit():
|
| 316 |
n = int(first_tok["text"])
|
| 317 |
-
|
| 318 |
-
if first_tok["left"] < width * 0.25 and 1 <= n <= 50:
|
| 319 |
raw_q_indices.append(idx)
|
| 320 |
raw_q_numbers.append(n)
|
| 321 |
|
|
|
|
| 300 |
for idx, line in enumerate(lines):
|
| 301 |
if line["top"] < header_cutoff:
|
| 302 |
continue
|
| 303 |
+
text_lower = line["text"].lower()
|
| 304 |
+
# Skip obvious table header rows for VIT-style papers
|
| 305 |
+
if "q. no" in text_lower or "q no" in text_lower:
|
| 306 |
+
continue
|
| 307 |
+
if "description" in text_lower and "marks" in text_lower:
|
| 308 |
+
continue
|
| 309 |
# Consider only the first alphanumeric token on the line to avoid
|
| 310 |
# picking up numbers that appear in the middle of sentences.
|
| 311 |
first_tok = None
|
|
|
|
| 318 |
first_tok = tok
|
| 319 |
break
|
| 320 |
|
| 321 |
+
# Require a pure integer token in a reasonable range, in the
|
| 322 |
+
# left-most part of the page (Q.No column in VIT tables).
|
| 323 |
if first_tok and first_tok["text"].isdigit():
|
| 324 |
n = int(first_tok["text"])
|
| 325 |
+
if 1 <= n <= 50 and first_tok["left"] < width * 0.2:
|
|
|
|
| 326 |
raw_q_indices.append(idx)
|
| 327 |
raw_q_numbers.append(n)
|
| 328 |
|