Spaces:

bhuvan-2005
/

question-extractor

Sleeping

App Files Files Community

bhuvan-2005 committed on Nov 17, 2025

Commit

ebcc2b6

verified ·

1 Parent(s): 185368c

Update question_extractor.py

Browse files

Files changed (1) hide show

question_extractor.py +42 -8

question_extractor.py CHANGED Viewed

@@ -709,13 +709,23 @@ def process_pdf_question_paper(pdf_path, output_path):
     """Process a PDF question paper by converting each page to an image.
     Each page is run through the same OCR + text-based question extractor,
-    and all questions are combined into a single output text file. IoT-
-    specific JPEG fallbacks are not applied here (PDFs are treated as
-    generic papers).
     """
     if convert_from_path is None:
-        print("ERROR: PDF support requires the 'pdf2image' package. Install it in the venv, e.g.:")
-        print("  pip install pdf2image")
         return "Unknown Subject", []
     print(f"Processing PDF: {pdf_path}")
@@ -733,7 +743,13 @@ def process_pdf_question_paper(pdf_path, output_path):
             # still giving good OCR quality.
             pages = convert_from_path(pdf_path, dpi=200)
         except Exception as e:
-            print(f"ERROR: Failed to convert PDF to images: {e}")
             return "Unknown Subject", []
         image_paths = []
@@ -801,9 +817,27 @@ def hf_predict(file):
         output_path = os.path.join(tmp_dir, f"{base_name}_questions.txt")
         if ext == ".pdf":
-            process_pdf_question_paper(input_path, output_path)
         else:
-            process_question_paper(input_path, output_path)
         with open(output_path, "r", encoding="utf-8") as f:
             return f.read()

     """Process a PDF question paper by converting each page to an image.
     Each page is run through the same OCR + text-based question extractor,
+    and all questions are combined into a single output text file.
+    This function is defensive: if PDF support or poppler is missing, it
+    writes a small diagnostic file instead of raising, so hf_predict can
+    always read *something* from ``output_path``.
     """
     if convert_from_path is None:
+        msg_lines = [
+            "ERROR: PDF support requires the 'pdf2image' package.",
+            "Install it in the environment, e.g.: pip install pdf2image",
+        ]
+        with open(output_path, "w", encoding="utf-8") as f:
+            f.write("Subject: Unknown Subject\n\n")
+            f.write("Total Questions: 0\n\n")
+            f.write("QUESTIONS\n\n")
+            f.write("\n".join(msg_lines))
+        print("\n".join(msg_lines))
         return "Unknown Subject", []
     print(f"Processing PDF: {pdf_path}")
             # still giving good OCR quality.
             pages = convert_from_path(pdf_path, dpi=200)
         except Exception as e:
+            err = f"ERROR: Failed to convert PDF to images: {e}"
+            print(err)
+            with open(output_path, "w", encoding="utf-8") as f:
+                f.write("Subject: Unknown Subject\n\n")
+                f.write("Total Questions: 0\n\n")
+                f.write("QUESTIONS\n\n")
+                f.write(err)
             return "Unknown Subject", []
         image_paths = []
         output_path = os.path.join(tmp_dir, f"{base_name}_questions.txt")
         if ext == ".pdf":
+            subject, questions = process_pdf_question_paper(input_path, output_path)
         else:
+            subject, questions = process_question_paper(input_path, output_path)
+        # In normal cases process_* will have written output_path. If it
+        # did not (for some unexpected error), fall back to an in-memory
+        # text construction instead of raising FileNotFoundError.
+        if not os.path.exists(output_path):
+            lines = [
+                f"Subject: {subject}",
+                "",
+                f"Total Questions: {len(questions)}",
+                "",
+                "QUESTIONS",
+                "",
+            ]
+            for q in questions:
+                lines.append(f"Q{q['number']} ({q['marks']} marks):")
+                lines.append(q['question'])
+                lines.append("")
+            return "\n".join(lines)
         with open(output_path, "r", encoding="utf-8") as f:
             return f.read()