project2 / app.py
prat1003's picture
Update app.py
9153df0 verified
raw
history blame
2.63 kB
import gradio as gr
from pdf2image import convert_from_path
import pytesseract
from transformers import pipeline
import json
import tempfile
import shutil
import os
# 🧠 Load lightweight question generation model
# The model weights come from the fine-tuned question-generation checkpoint,
# while the tokenizer is loaded from the base "t5-small" checkpoint.
qg_pipeline = pipeline(
    task="text2text-generation",
    tokenizer="t5-small",
    model="valhalla/t5-small-qg-prepend",
)
# 🧩 OCR function: extract text from scanned PDFs
def extract_text_from_scanned_pdf(file_path):
    """Return the OCR text of every page in the PDF at *file_path*.

    The PDF is turned into page images with ``convert_from_path``; each image
    is passed to ``pytesseract.image_to_string`` and the per-page results are
    concatenated in page order. Leading and trailing whitespace is removed
    from the combined text.
    """
    page_images = convert_from_path(file_path)
    ocr_chunks = [pytesseract.image_to_string(image) for image in page_images]
    return "".join(ocr_chunks).strip()
# ⚙️ Main processing function
def process_pdf(pdf_file):
    """Turn an uploaded scanned PDF into a ``<questiondata>`` XML string.

    Args:
        pdf_file: The upload handed over by Gradio. Either an object exposing
            the file path as ``.name`` or a plain path string is accepted;
            ``None`` means nothing was uploaded.

    Returns:
        The ``<questiondata>`` XML (JSON wrapped in CDATA) on success, or a
        human-readable error message starting with "❌" when no file was
        supplied or no text could be extracted.
    """
    if pdf_file is None:
        return "❌ No file uploaded. Please upload a PDF."

    # Depending on the Gradio version / ``type`` setting the upload arrives as
    # a file wrapper (path in ``.name``) or directly as a path string.
    source_path = getattr(pdf_file, "name", pdf_file)

    # Step 1: copy the upload to a private temporary file. The descriptor is
    # closed right away so the copy also works on platforms that forbid
    # writing to a file that is still open elsewhere.
    fd, temp_pdf_path = tempfile.mkstemp(suffix=".pdf")
    os.close(fd)
    try:
        shutil.copy(source_path, temp_pdf_path)
        # Step 2: extract text using OCR.
        extracted_text = extract_text_from_scanned_pdf(temp_pdf_path)
    finally:
        # Always clean up, even when copying or OCR raises.
        os.remove(temp_pdf_path)

    if not extracted_text.strip():
        return "❌ Could not extract text. Make sure the PDF has readable text."

    # Step 3: generate questions from the first 1000 characters of the text.
    # NOTE(review): the prefix is passed as-is; confirm it matches the input
    # format the configured model was trained on.
    prompt = "generate questions: " + extracted_text[:1000]
    # Several return sequences require beam search (or sampling); plain
    # greedy decoding can only yield a single sequence.
    questions_output = qg_pipeline(
        prompt,
        max_length=128,
        num_beams=3,
        num_return_sequences=3,
    )

    # Step 4: convert the model output into the question list. The options
    # are fixed placeholders; only the question text comes from the model.
    question_list = [
        {
            "questiontext": q["generated_text"],
            "questiontype": "single_select",
            "marks": 10,
            "options": [
                {"optiontext": "Option 1", "score": "10"},
                {"optiontext": "Option 2", "score": "0"},
            ],
        }
        for q in questions_output
    ]

    # Step 5: build the <questiondata> structure. The question list is
    # embedded as a JSON string inside the outer JSON document.
    # NOTE(review): "totalmarks" and "cutoff" are fixed values and are not
    # derived from the generated questions (3 questions x 10 marks) --
    # confirm the intended values with the consumer of this payload.
    data = {
        "title": "Certification Title",
        "totalmarks": "50",
        "time": "20",
        "cutoff": "35",
        "failurl": "",
        "passurl": "",
        "sendpassemail": True,
        "questions": json.dumps({"questions": question_list}),
        "maxattempts": 3,
    }

    # Step 6: wrap the JSON inside XML CDATA.
    return _wrap_in_cdata(json.dumps(data))


def _wrap_in_cdata(payload):
    """Return *payload* enclosed in a ``<questiondata>`` CDATA section.

    A literal ``]]>`` inside the payload would terminate the CDATA section
    early, so each occurrence is split across two adjacent sections.
    """
    safe_payload = payload.replace("]]>", "]]]]><![CDATA[>")
    return "<questiondata><![CDATA[" + safe_payload + "]]></questiondata>"
# 🚀 Gradio Web UI
# Single-input, single-output interface: one PDF upload in, the generated
# XML shown as plain text. The server is started as soon as this runs.
iface = gr.Interface(
    title="PDF to Question Generator (with OCR)",
    description="Uploads a scanned PDF, extracts text via OCR, and generates <questiondata> XML for quiz integration.",
    fn=process_pdf,
    inputs=gr.File(label="📄 Upload your scanned PDF"),
    outputs="text",
)
iface.launch()