Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,12 +4,17 @@ import pytesseract
|
|
| 4 |
from transformers import pipeline
|
| 5 |
import json
|
| 6 |
import tempfile
|
|
|
|
| 7 |
import os
|
| 8 |
|
| 9 |
-
# Load question generation model
|
| 10 |
-
qg_pipeline = pipeline(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
-
# OCR function
|
| 13 |
def extract_text_from_scanned_pdf(file_path):
|
| 14 |
pages = convert_from_path(file_path)
|
| 15 |
text = ""
|
|
@@ -17,24 +22,25 @@ def extract_text_from_scanned_pdf(file_path):
|
|
| 17 |
text += pytesseract.image_to_string(page)
|
| 18 |
return text.strip()
|
| 19 |
|
| 20 |
-
# Main function
|
| 21 |
def process_pdf(pdf_file):
|
| 22 |
-
# Step 1
|
| 23 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
|
| 24 |
-
|
| 25 |
temp_pdf_path = temp_pdf.name
|
| 26 |
|
| 27 |
-
# Step 2
|
| 28 |
extracted_text = extract_text_from_scanned_pdf(temp_pdf_path)
|
| 29 |
os.remove(temp_pdf_path)
|
| 30 |
|
| 31 |
if not extracted_text.strip():
|
| 32 |
-
return "❌ Could not extract text. Make sure the PDF has readable
|
| 33 |
|
| 34 |
-
# Step 3
|
| 35 |
-
|
|
|
|
| 36 |
|
| 37 |
-
# Step 4
|
| 38 |
question_list = []
|
| 39 |
for q in questions_output:
|
| 40 |
question_list.append({
|
|
@@ -47,6 +53,7 @@ def process_pdf(pdf_file):
|
|
| 47 |
]
|
| 48 |
})
|
| 49 |
|
|
|
|
| 50 |
data = {
|
| 51 |
"title": "Certification Title",
|
| 52 |
"totalmarks": "50",
|
|
@@ -59,16 +66,17 @@ def process_pdf(pdf_file):
|
|
| 59 |
"maxattempts": 3
|
| 60 |
}
|
| 61 |
|
|
|
|
| 62 |
xml_output = "<questiondata><![CDATA[" + json.dumps(data) + "]]></questiondata>"
|
| 63 |
return xml_output
|
| 64 |
|
| 65 |
-
# Gradio
|
| 66 |
iface = gr.Interface(
|
| 67 |
fn=process_pdf,
|
| 68 |
-
inputs=gr.File(label="Upload your scanned PDF"),
|
| 69 |
outputs="text",
|
| 70 |
-
title="
|
| 71 |
-
description="Uploads a scanned PDF,
|
| 72 |
)
|
| 73 |
|
| 74 |
iface.launch()
|
|
|
|
| 4 |
from transformers import pipeline
|
| 5 |
import json
|
| 6 |
import tempfile
|
| 7 |
+
import shutil
|
| 8 |
import os
|
| 9 |
|
| 10 |
+
# 🧠 Load lightweight question generation model
|
| 11 |
+
qg_pipeline = pipeline(
|
| 12 |
+
"text2text-generation",
|
| 13 |
+
model="valhalla/t5-small-qg-prepend",
|
| 14 |
+
tokenizer="t5-small"
|
| 15 |
+
)
|
| 16 |
|
| 17 |
+
# 🧩 OCR function: extract text from scanned PDFs
|
| 18 |
def extract_text_from_scanned_pdf(file_path):
|
| 19 |
pages = convert_from_path(file_path)
|
| 20 |
text = ""
|
|
|
|
| 22 |
text += pytesseract.image_to_string(page)
|
| 23 |
return text.strip()
|
| 24 |
|
| 25 |
+
# ⚙️ Main processing function
|
| 26 |
def process_pdf(pdf_file):
|
| 27 |
+
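# Step 1️⃣: Copy the uploaded file to a temporary .pdf path
# NOTE(review): the temp file is created with delete=False and is removed only by the
# os.remove() call that follows OCR, so an exception raised during extraction leaves it
# on disk — consider wrapping the OCR step in try/finally.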
|
| 28 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
|
| 29 |
+
shutil.copy(pdf_file.name, temp_pdf.name)
|
| 30 |
temp_pdf_path = temp_pdf.name
|
| 31 |
|
| 32 |
+
# Step 2️⃣: Extract text using OCR
|
| 33 |
extracted_text = extract_text_from_scanned_pdf(temp_pdf_path)
|
| 34 |
os.remove(temp_pdf_path)
|
| 35 |
|
| 36 |
if not extracted_text.strip():
|
| 37 |
+
return "❌ Could not extract text. Make sure the PDF has readable text."
|
| 38 |
|
| 39 |
+
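# Step 3️⃣: Generate questions from the first 1000 characters of the extracted text
# NOTE(review): num_return_sequences=3 is passed without num_beams or do_sample; under
# plain greedy decoding transformers rejects num_return_sequences > 1 — confirm the
# effective generation config for this pipeline and library version.
# NOTE(review): the "generate questions: " prefix looks like the end-to-end QG input
# format, while the loaded checkpoint is the "prepend" variant — verify the expected
# input format against the model card.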
|
| 40 |
+
prompt = "generate questions: " + extracted_text[:1000] # limit to 1000 chars
|
| 41 |
+
questions_output = qg_pipeline(prompt, max_length=128, num_return_sequences=3)
|
| 42 |
|
| 43 |
+
# Step 4️⃣: Convert model output into question list
|
| 44 |
question_list = []
|
| 45 |
for q in questions_output:
|
| 46 |
question_list.append({
|
|
|
|
| 53 |
]
|
| 54 |
})
|
| 55 |
|
| 56 |
+
# Step 5️⃣: Build the <questiondata> structure
|
| 57 |
data = {
|
| 58 |
"title": "Certification Title",
|
| 59 |
"totalmarks": "50",
|
|
|
|
| 66 |
"maxattempts": 3
|
| 67 |
}
|
| 68 |
|
| 69 |
+
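# Step 6️⃣: Wrap the JSON payload in an XML CDATA section
# NOTE(review): a "]]>" sequence inside the serialized JSON would end the CDATA section
# early; nothing visible here escapes or splits it.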
|
| 70 |
xml_output = "<questiondata><![CDATA[" + json.dumps(data) + "]]></questiondata>"
|
| 71 |
return xml_output
|
| 72 |
|
| 73 |
+
# 🚀 Gradio Web UI
|
| 74 |
iface = gr.Interface(
|
| 75 |
fn=process_pdf,
|
| 76 |
+
inputs=gr.File(label="📄 Upload your scanned PDF"),
|
| 77 |
outputs="text",
|
| 78 |
+
title="PDF to Question Generator (with OCR)",
|
| 79 |
+
description="Uploads a scanned PDF, extracts text via OCR, and generates <questiondata> XML for quiz integration."
|
| 80 |
)
|
| 81 |
|
| 82 |
iface.launch()
|