Spaces: Sleeping
Update question_extractor.py
Browse files
- question_extractor.py +42 -8
question_extractor.py
CHANGED
|
@@ -709,13 +709,23 @@ def process_pdf_question_paper(pdf_path, output_path):
|
|
| 709 |
"""Process a PDF question paper by converting each page to an image.
|
| 710 |
|
| 711 |
Each page is run through the same OCR + text-based question extractor,
|
| 712 |
-
and all questions are combined into a single output text file.
|
| 713 |
-
|
| 714 |
-
|
|
|
|
|
|
|
| 715 |
"""
|
| 716 |
if convert_from_path is None:
|
| 717 |
-
|
| 718 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 719 |
return "Unknown Subject", []
|
| 720 |
|
| 721 |
print(f"Processing PDF: {pdf_path}")
|
|
@@ -733,7 +743,13 @@ def process_pdf_question_paper(pdf_path, output_path):
|
|
| 733 |
# still giving good OCR quality.
|
| 734 |
pages = convert_from_path(pdf_path, dpi=200)
|
| 735 |
except Exception as e:
|
| 736 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 737 |
return "Unknown Subject", []
|
| 738 |
|
| 739 |
image_paths = []
|
|
@@ -801,9 +817,27 @@ def hf_predict(file):
|
|
| 801 |
output_path = os.path.join(tmp_dir, f"{base_name}_questions.txt")
|
| 802 |
|
| 803 |
if ext == ".pdf":
|
| 804 |
-
process_pdf_question_paper(input_path, output_path)
|
| 805 |
else:
|
| 806 |
-
process_question_paper(input_path, output_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 807 |
|
| 808 |
with open(output_path, "r", encoding="utf-8") as f:
|
| 809 |
return f.read()
|
|
|
|
| 709 |
"""Process a PDF question paper by converting each page to an image.
|
| 710 |
|
| 711 |
Each page is run through the same OCR + text-based question extractor,
|
| 712 |
+
and all questions are combined into a single output text file.
|
| 713 |
+
|
| 714 |
+
This function is defensive: if PDF support or poppler is missing, it
|
| 715 |
+
writes a small diagnostic file instead of raising, so hf_predict can
|
| 716 |
+
always read *something* from ``output_path``.
|
| 717 |
"""
|
| 718 |
if convert_from_path is None:
|
| 719 |
+
msg_lines = [
|
| 720 |
+
"ERROR: PDF support requires the 'pdf2image' package.",
|
| 721 |
+
"Install it in the environment, e.g.: pip install pdf2image",
|
| 722 |
+
]
|
| 723 |
+
with open(output_path, "w", encoding="utf-8") as f:
|
| 724 |
+
f.write("Subject: Unknown Subject\n\n")
|
| 725 |
+
f.write("Total Questions: 0\n\n")
|
| 726 |
+
f.write("QUESTIONS\n\n")
|
| 727 |
+
f.write("\n".join(msg_lines))
|
| 728 |
+
print("\n".join(msg_lines))
|
| 729 |
return "Unknown Subject", []
|
| 730 |
|
| 731 |
print(f"Processing PDF: {pdf_path}")
|
|
|
|
| 743 |
# still giving good OCR quality.
|
| 744 |
pages = convert_from_path(pdf_path, dpi=200)
|
| 745 |
except Exception as e:
|
| 746 |
+
err = f"ERROR: Failed to convert PDF to images: {e}"
|
| 747 |
+
print(err)
|
| 748 |
+
with open(output_path, "w", encoding="utf-8") as f:
|
| 749 |
+
f.write("Subject: Unknown Subject\n\n")
|
| 750 |
+
f.write("Total Questions: 0\n\n")
|
| 751 |
+
f.write("QUESTIONS\n\n")
|
| 752 |
+
f.write(err)
|
| 753 |
return "Unknown Subject", []
|
| 754 |
|
| 755 |
image_paths = []
|
|
|
|
| 817 |
output_path = os.path.join(tmp_dir, f"{base_name}_questions.txt")
|
| 818 |
|
| 819 |
if ext == ".pdf":
|
| 820 |
+
subject, questions = process_pdf_question_paper(input_path, output_path)
|
| 821 |
else:
|
| 822 |
+
subject, questions = process_question_paper(input_path, output_path)
|
| 823 |
+
|
| 824 |
+
# In normal cases process_* will have written output_path. If it
|
| 825 |
+
# did not (for some unexpected error), fall back to an in-memory
|
| 826 |
+
# text construction instead of raising FileNotFoundError.
|
| 827 |
+
if not os.path.exists(output_path):
|
| 828 |
+
lines = [
|
| 829 |
+
f"Subject: {subject}",
|
| 830 |
+
"",
|
| 831 |
+
f"Total Questions: {len(questions)}",
|
| 832 |
+
"",
|
| 833 |
+
"QUESTIONS",
|
| 834 |
+
"",
|
| 835 |
+
]
|
| 836 |
+
for q in questions:
|
| 837 |
+
lines.append(f"Q{q['number']} ({q['marks']} marks):")
|
| 838 |
+
lines.append(q['question'])
|
| 839 |
+
lines.append("")
|
| 840 |
+
return "\n".join(lines)
|
| 841 |
|
| 842 |
with open(output_path, "r", encoding="utf-8") as f:
|
| 843 |
return f.read()
|