Spaces:
Sleeping
Sleeping
Update question_extractor.py
Browse files - question_extractor.py +9 -2
question_extractor.py
CHANGED
|
@@ -663,8 +663,15 @@ def process_question_paper(image_path, output_path):
|
|
| 663 |
text = extract_text_from_image(image_path)
|
| 664 |
subject = extract_subject_name(text)
|
| 665 |
|
| 666 |
-
#
|
| 667 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 668 |
|
| 669 |
# Write out the results in a structured layout
|
| 670 |
with open(output_path, 'w', encoding='utf-8') as f:
|
|
|
|
| 663 |
text = extract_text_from_image(image_path)
|
| 664 |
subject = extract_subject_name(text)
|
| 665 |
|
| 666 |
+
# 1) Try layout-based extraction first (uses Tesseract's positional
|
| 667 |
+
# data to find question numbers in the left column). This is
|
| 668 |
+
# particularly robust for table-style papers like VIT's CAT format.
|
| 669 |
+
questions = extract_questions_with_layout(image_path)
|
| 670 |
+
|
| 671 |
+
# 2) If that fails or finds too few questions, fall back to the
|
| 672 |
+
# generic text-line based extractor which uses only OCR'd text.
|
| 673 |
+
if not questions or len(questions) < 3:
|
| 674 |
+
questions = extract_questions_from_text(text)
|
| 675 |
|
| 676 |
# Write out the results in a structured layout
|
| 677 |
with open(output_path, 'w', encoding='utf-8') as f:
|