bhuvan-2005 committed on
Commit
ebcc2b6
·
verified ·
1 Parent(s): 185368c

Update question_extractor.py

Browse files
Files changed (1) hide show
  1. question_extractor.py +42 -8
question_extractor.py CHANGED
@@ -709,13 +709,23 @@ def process_pdf_question_paper(pdf_path, output_path):
709
  """Process a PDF question paper by converting each page to an image.
710
 
711
  Each page is run through the same OCR + text-based question extractor,
712
- and all questions are combined into a single output text file. IoT-
713
- specific JPEG fallbacks are not applied here (PDFs are treated as
714
- generic papers).
 
 
715
  """
716
  if convert_from_path is None:
717
- print("ERROR: PDF support requires the 'pdf2image' package. Install it in the venv, e.g.:")
718
- print(" pip install pdf2image")
 
 
 
 
 
 
 
 
719
  return "Unknown Subject", []
720
 
721
  print(f"Processing PDF: {pdf_path}")
@@ -733,7 +743,13 @@ def process_pdf_question_paper(pdf_path, output_path):
733
  # still giving good OCR quality.
734
  pages = convert_from_path(pdf_path, dpi=200)
735
  except Exception as e:
736
- print(f"ERROR: Failed to convert PDF to images: {e}")
 
 
 
 
 
 
737
  return "Unknown Subject", []
738
 
739
  image_paths = []
@@ -801,9 +817,27 @@ def hf_predict(file):
801
  output_path = os.path.join(tmp_dir, f"{base_name}_questions.txt")
802
 
803
  if ext == ".pdf":
804
- process_pdf_question_paper(input_path, output_path)
805
  else:
806
- process_question_paper(input_path, output_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
807
 
808
  with open(output_path, "r", encoding="utf-8") as f:
809
  return f.read()
 
709
  """Process a PDF question paper by converting each page to an image.
710
 
711
  Each page is run through the same OCR + text-based question extractor,
712
+ and all questions are combined into a single output text file.
713
+
714
+ This function is defensive: if PDF support or poppler is missing, it
715
+ writes a small diagnostic file instead of raising, so hf_predict can
716
+ always read *something* from ``output_path``.
717
  """
718
  if convert_from_path is None:
719
+ msg_lines = [
720
+ "ERROR: PDF support requires the 'pdf2image' package.",
721
+ "Install it in the environment, e.g.: pip install pdf2image",
722
+ ]
723
+ with open(output_path, "w", encoding="utf-8") as f:
724
+ f.write("Subject: Unknown Subject\n\n")
725
+ f.write("Total Questions: 0\n\n")
726
+ f.write("QUESTIONS\n\n")
727
+ f.write("\n".join(msg_lines))
728
+ print("\n".join(msg_lines))
729
  return "Unknown Subject", []
730
 
731
  print(f"Processing PDF: {pdf_path}")
 
743
  # still giving good OCR quality.
744
  pages = convert_from_path(pdf_path, dpi=200)
745
  except Exception as e:
746
+ err = f"ERROR: Failed to convert PDF to images: {e}"
747
+ print(err)
748
+ with open(output_path, "w", encoding="utf-8") as f:
749
+ f.write("Subject: Unknown Subject\n\n")
750
+ f.write("Total Questions: 0\n\n")
751
+ f.write("QUESTIONS\n\n")
752
+ f.write(err)
753
  return "Unknown Subject", []
754
 
755
  image_paths = []
 
817
  output_path = os.path.join(tmp_dir, f"{base_name}_questions.txt")
818
 
819
  if ext == ".pdf":
820
+ subject, questions = process_pdf_question_paper(input_path, output_path)
821
  else:
822
+ subject, questions = process_question_paper(input_path, output_path)
823
+
824
+ # In normal cases process_* will have written output_path. If it
825
+ # did not (for some unexpected error), fall back to an in-memory
826
+ # text construction instead of raising FileNotFoundError.
827
+ if not os.path.exists(output_path):
828
+ lines = [
829
+ f"Subject: {subject}",
830
+ "",
831
+ f"Total Questions: {len(questions)}",
832
+ "",
833
+ "QUESTIONS",
834
+ "",
835
+ ]
836
+ for q in questions:
837
+ lines.append(f"Q{q['number']} ({q['marks']} marks):")
838
+ lines.append(q['question'])
839
+ lines.append("")
840
+ return "\n".join(lines)
841
 
842
  with open(output_path, "r", encoding="utf-8") as f:
843
  return f.read()