Spaces:

prat1003
/

project2

Sleeping

App Files Community

prat1003 committed on Oct 13, 2025

Commit

9153df0

verified ·

1 Parent(s): bc99bb8

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -15

app.py CHANGED Viewed

@@ -4,12 +4,17 @@ import pytesseract
 from transformers import pipeline
 import json
 import tempfile
 import os
-# Load question generation model
-qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-small-qg-prepend", tokenizer="t5-small")
-# OCR function
 def extract_text_from_scanned_pdf(file_path):
     pages = convert_from_path(file_path)
     text = ""
@@ -17,24 +22,25 @@ def extract_text_from_scanned_pdf(file_path):
         text += pytesseract.image_to_string(page)
     return text.strip()
-# Main function
 def process_pdf(pdf_file):
-    # Step 1: Save uploaded PDF
     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
-        temp_pdf.write(pdf_file.read())
         temp_pdf_path = temp_pdf.name
-    # Step 2: OCR extraction
     extracted_text = extract_text_from_scanned_pdf(temp_pdf_path)
     os.remove(temp_pdf_path)
     if not extracted_text.strip():
-        return "❌ Could not extract text. Make sure the PDF has readable content."
-    # Step 3: Generate questions
-    questions_output = qg_pipeline("generate questions: " + extracted_text[:1000], max_length=128, num_return_sequences=3)
-    # Step 4: Convert to <questiondata> XML
     question_list = []
     for q in questions_output:
         question_list.append({
@@ -47,6 +53,7 @@ def process_pdf(pdf_file):
             ]
         })
     data = {
         "title": "Certification Title",
         "totalmarks": "50",
@@ -59,16 +66,17 @@ def process_pdf(pdf_file):
         "maxattempts": 3
     }
     xml_output = "<questiondata><![CDATA[" + json.dumps(data) + "]]></questiondata>"
     return xml_output
-# Gradio interface
 iface = gr.Interface(
     fn=process_pdf,
-    inputs=gr.File(label="Upload your scanned PDF"),
     outputs="text",
-    title="📄 PDF to Question Generator (with OCR)",
-    description="Uploads a scanned PDF, runs OCR, and generates <questiondata> XML output for your quiz system."
 )
 iface.launch()

 from transformers import pipeline
 import json
 import tempfile
+import shutil
 import os
+# 🧠 Load lightweight question generation model
+qg_pipeline = pipeline(
+    "text2text-generation",
+    model="valhalla/t5-small-qg-prepend",
+    tokenizer="t5-small"
+)
+# 🧩 OCR function: extract text from scanned PDFs
 def extract_text_from_scanned_pdf(file_path):
     pages = convert_from_path(file_path)
     text = ""
         text += pytesseract.image_to_string(page)
     return text.strip()
+# ⚙️ Main processing function
 def process_pdf(pdf_file):
+    # Step 1️⃣: Copy uploaded file to a temporary location
     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
+        shutil.copy(pdf_file.name, temp_pdf.name)
         temp_pdf_path = temp_pdf.name
+    # Step 2️⃣: Extract text using OCR
     extracted_text = extract_text_from_scanned_pdf(temp_pdf_path)
     os.remove(temp_pdf_path)
     if not extracted_text.strip():
+        return "❌ Could not extract text. Make sure the PDF has readable text."
+    # Step 3️⃣: Generate questions from extracted text
+    prompt = "generate questions: " + extracted_text[:1000]  # limit to 1000 chars
+    questions_output = qg_pipeline(prompt, max_length=128, num_return_sequences=3)
+    # Step 4️⃣: Convert model output into question list
     question_list = []
     for q in questions_output:
         question_list.append({
             ]
         })
+    # Step 5️⃣: Build the <questiondata> structure
     data = {
         "title": "Certification Title",
         "totalmarks": "50",
         "maxattempts": 3
     }
+    # Step 6️⃣: Wrap JSON inside XML CDATA
     xml_output = "<questiondata><![CDATA[" + json.dumps(data) + "]]></questiondata>"
     return xml_output
+# 🚀 Gradio Web UI
 iface = gr.Interface(
     fn=process_pdf,
+    inputs=gr.File(label="📄 Upload your scanned PDF"),
     outputs="text",
+    title="PDF to Question Generator (with OCR)",
+    description="Uploads a scanned PDF, extracts text via OCR, and generates <questiondata> XML for quiz integration."
 )
 iface.launch()