import gradio as gr
from pdf2image import convert_from_path
import pytesseract
from transformers import pipeline
import json
import tempfile
import shutil
import os
# 🧠 Lightweight question-generation model, loaded once when the module is imported
# NOTE(review): the tokenizer is loaded from "t5-small" rather than from the
# model's own repo — confirm the two vocabularies are compatible.
qg_pipeline = pipeline(
    task="text2text-generation",
    model="valhalla/t5-small-qg-prepend",
    tokenizer="t5-small",
)
# 🧩 OCR function: extract text from scanned PDFs
def extract_text_from_scanned_pdf(file_path):
    """Run OCR over every page of a PDF and return the combined text.

    Args:
        file_path: Path of the PDF; passed unchanged to ``convert_from_path``.

    Returns:
        The OCR output of all pages concatenated in page order (no separator
        is inserted between pages), with leading and trailing whitespace
        stripped.
    """
    page_images = convert_from_path(file_path)
    page_texts = [pytesseract.image_to_string(image) for image in page_images]
    return "".join(page_texts).strip()
# ⚙️ Main processing function
def process_pdf(pdf_file):
    """Convert an uploaded scanned PDF into a ``<questiondata>`` XML string.

    The upload is copied to a temporary file and OCR'd, and the first 1000
    characters of the text are fed to the question-generation pipeline. Each
    generated question becomes a single-select entry with two placeholder
    options, and the quiz definition is serialized as JSON inside a CDATA
    section.

    Args:
        pdf_file: The value supplied for the file input: an object exposing
            the upload's path as ``.name``, a path (``str`` / ``os.PathLike``),
            or ``None`` when nothing was uploaded.

    Returns:
        The ``<questiondata>`` XML string on success, or a human-readable
        error message (prefixed with ❌) when no file was supplied or OCR
        produced no text.
    """
    # Guard: submitting the form without a file passes None.
    if pdf_file is None:
        return "❌ No file uploaded. Please upload a PDF."

    # Accept both a plain path and a file wrapper that carries the path in .name.
    if isinstance(pdf_file, (str, os.PathLike)):
        upload_path = pdf_file
    else:
        upload_path = pdf_file.name

    # Step 1️⃣: Copy uploaded file to a temporary location.
    # The handle is closed first so the copy never writes to an open file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
        temp_pdf_path = temp_pdf.name
    try:
        shutil.copy(upload_path, temp_pdf_path)
        # Step 2️⃣: Extract text using OCR
        extracted_text = extract_text_from_scanned_pdf(temp_pdf_path)
    finally:
        # Always remove the temp copy, even when the copy or OCR step raises.
        os.remove(temp_pdf_path)

    if not extracted_text.strip():
        return "❌ Could not extract text. Make sure the PDF has readable text."

    # Step 3️⃣: Generate questions from extracted text
    num_questions = 3
    prompt = "generate questions: " + extracted_text[:1000]  # limit to 1000 chars
    # Greedy decoding can only return one sequence, so ask for as many beams
    # as sequences to get `num_questions` candidates back.
    questions_output = qg_pipeline(
        prompt,
        max_length=128,
        num_beams=num_questions,
        num_return_sequences=num_questions,
    )

    # Step 4️⃣: Convert model output into question list
    question_list = [
        {
            "questiontext": generated["generated_text"],
            "questiontype": "single_select",
            "marks": 10,
            "options": [
                {"optiontext": "Option 1", "score": "10"},
                {"optiontext": "Option 2", "score": "0"},
            ],
        }
        for generated in questions_output
    ]

    # Step 5️⃣: Build the <questiondata> structure
    # "questions" is itself a JSON-encoded string nested inside the outer JSON.
    data = {
        "title": "Certification Title",
        "totalmarks": "50",
        "time": "20",
        "cutoff": "35",
        "failurl": "",
        "passurl": "",
        "sendpassemail": True,
        "questions": json.dumps({"questions": question_list}),
        "maxattempts": 3,
    }

    # Step 6️⃣: Wrap JSON inside XML CDATA
    # "]]>" would end the CDATA section early. In JSON it can only occur inside
    # a string value, so escaping ">" as \u003e keeps the decoded data identical.
    payload = json.dumps(data).replace("]]>", "]]\\u003e")
    return "<questiondata><![CDATA[" + payload + "]]></questiondata>"
# 🚀 Gradio Web UI: one PDF upload as input, the result of process_pdf shown as text
iface = gr.Interface(
    process_pdf,
    title="PDF to Question Generator (with OCR)",
    description="Uploads a scanned PDF, extracts text via OCR, and generates <questiondata> XML for quiz integration.",
    inputs=gr.File(label="📄 Upload your scanned PDF"),
    outputs="text",
)
# Launch the interface as soon as this module is executed.
iface.launch()
|