bhuvan-2005 committed on
Commit
19d31b9
·
verified ·
1 Parent(s): 1b0427e

Update question_extractor.py

Browse files
Files changed (1) hide show
  1. question_extractor.py +9 -2
question_extractor.py CHANGED
@@ -663,8 +663,15 @@ def process_question_paper(image_path, output_path):
663
  text = extract_text_from_image(image_path)
664
  subject = extract_subject_name(text)
665
 
666
- # Use text-line based generic extraction as the primary method.
667
- questions = extract_questions_from_text(text)
 
 
 
 
 
 
 
668
 
669
  # Write out the results in a structured layout
670
  with open(output_path, 'w', encoding='utf-8') as f:
 
663
  text = extract_text_from_image(image_path)
664
  subject = extract_subject_name(text)
665
 
666
+ # 1) Try layout-based extraction first (uses Tesseract's positional
667
+ # data to find question numbers in the left column). This is
668
+ # particularly robust for table-style papers like VIT's CAT format.
669
+ questions = extract_questions_with_layout(image_path)
670
+
671
+ # 2) If that fails or finds too few questions, fall back to the
672
+ # generic text-line based extractor which uses only OCR'd text.
673
+ if not questions or len(questions) < 3:
674
+ questions = extract_questions_from_text(text)
675
 
676
  # Write out the results in a structured layout
677
  with open(output_path, 'w', encoding='utf-8') as f: