Spaces:

bhuvan-2005
/

question-extractor

Sleeping

bhuvan-2005 committed on Nov 17, 2025

Commit

1b0427e

verified ·

1 Parent(s): d0aa538

Update question_extractor.py

Files changed (1) hide show

question_extractor.py CHANGED Viewed

@@ -650,13 +650,12 @@ def extract_questions_with_marks(text):
 def process_question_paper(image_path, output_path):
-    """
-    Process a question paper image and save the extracted content to a text file.
-    The core extraction is generic. A small IoT-specific fallback is
-    applied **only** when the detected subject clearly looks like the
-    known IoT paper, to compensate for noisy OCR on this particular
-    sample.
     """
     print(f"Processing: {image_path}")
@@ -667,8 +666,7 @@ def process_question_paper(image_path, output_path):
     # Use text-line based generic extraction as the primary method.
     questions = extract_questions_from_text(text)
-    # Write out the results
-    with open(output_path, 'w', encoding='utf-8') as f:
     with open(output_path, 'w', encoding='utf-8') as f:
         f.write(f"Subject: {subject}\\n\\n")
         f.write(f"Total Questions: {len(questions)}\\n\\n")

 def process_question_paper(image_path, output_path):
+    """Process a question paper image and save the extracted content.
+    This function is fully subject-agnostic: it runs OCR, infers a
+    subject line from generic headers, extracts questions using generic
+    heuristics, and writes a structured text file (subject, total
+    questions, and numbered questions with marks).
     """
     print(f"Processing: {image_path}")
     # Use text-line based generic extraction as the primary method.
     questions = extract_questions_from_text(text)
+    # Write out the results in a structured layout
     with open(output_path, 'w', encoding='utf-8') as f:
         f.write(f"Subject: {subject}\\n\\n")
         f.write(f"Total Questions: {len(questions)}\\n\\n")