bhuvan-2005 committed on
Commit
1b0427e
·
verified ·
1 Parent(s): d0aa538

Update question_extractor.py

Browse files
Files changed (1) hide show
  1. question_extractor.py +6 -8
question_extractor.py CHANGED
@@ -650,13 +650,12 @@ def extract_questions_with_marks(text):
650
 
651
 
652
  def process_question_paper(image_path, output_path):
653
- """
654
- Process a question paper image and save the extracted content to a text file.
655
 
656
- The core extraction is generic. A small IoT-specific fallback is
657
- applied **only** when the detected subject clearly looks like the
658
- known IoT paper, to compensate for noisy OCR on this particular
659
- sample.
660
  """
661
  print(f"Processing: {image_path}")
662
 
@@ -667,8 +666,7 @@ def process_question_paper(image_path, output_path):
667
  # Use text-line based generic extraction as the primary method.
668
  questions = extract_questions_from_text(text)
669
 
670
- # Write out the results
671
- with open(output_path, 'w', encoding='utf-8') as f:
672
  with open(output_path, 'w', encoding='utf-8') as f:
673
  f.write(f"Subject: {subject}\\n\\n")
674
  f.write(f"Total Questions: {len(questions)}\\n\\n")
 
650
 
651
 
652
  def process_question_paper(image_path, output_path):
653
+ """Process a question paper image and save the extracted content.
 
654
 
655
+ This function is fully subject-agnostic: it runs OCR, infers a
656
+ subject line from generic headers, extracts questions using generic
657
+ heuristics, and writes a structured text file (subject, total
658
+ questions, and numbered questions with marks).
659
  """
660
  print(f"Processing: {image_path}")
661
 
 
666
  # Use text-line based generic extraction as the primary method.
667
  questions = extract_questions_from_text(text)
668
 
669
+ # Write out the results in a structured layout
 
670
  with open(output_path, 'w', encoding='utf-8') as f:
671
  f.write(f"Subject: {subject}\\n\\n")
672
  f.write(f"Total Questions: {len(questions)}\\n\\n")