import json
import os
import shutil
import tempfile

import gradio as gr
import pytesseract
from pdf2image import convert_from_path
from transformers import pipeline

# Number of candidate questions requested from the model per upload.
NUM_QUESTIONS = 3

# 🧠 Load lightweight question generation model
qg_pipeline = pipeline(
    "text2text-generation",
    model="valhalla/t5-small-qg-prepend",
    tokenizer="t5-small",
)


# 🧩 OCR function: extract text from scanned PDFs
def extract_text_from_scanned_pdf(file_path):
    """Convert each page of a PDF to an image and OCR it.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The OCR output of all pages concatenated in page order, with
        leading and trailing whitespace stripped.
    """
    pages = convert_from_path(file_path)
    return "".join(pytesseract.image_to_string(page) for page in pages).strip()


def _wrap_in_cdata(payload, tag="data"):
    """Return *payload* inside a CDATA section of a single XML element."""
    # A literal "]]>" would terminate the CDATA section early, so it is split
    # across two adjacent sections.
    safe_payload = payload.replace("]]>", "]]]]><![CDATA[>")
    return f"<{tag}><![CDATA[{safe_payload}]]></{tag}>"


# ⚙️ Main processing function
def process_pdf(pdf_file):
    """Turn an uploaded scanned PDF into an XML-wrapped quiz definition.

    Args:
        pdf_file: The value delivered by the ``gr.File`` input: a path string,
            an object exposing the path as ``.name``, or ``None`` when nothing
            was uploaded.

    Returns:
        An XML string whose CDATA section holds the quiz JSON, or a
        human-readable error message starting with "❌" when there is no
        upload or OCR produced no text.
    """
    if pdf_file is None:
        return "❌ No file uploaded. Please upload a PDF."

    # Depending on the Gradio version the upload arrives as a plain path or as
    # a tempfile-like wrapper; accept both.
    source_path = getattr(pdf_file, "name", pdf_file)

    # Step 1: Copy uploaded file to a temporary location. The handle is closed
    # before copying so the destination can be overwritten on every platform.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
        temp_pdf_path = temp_pdf.name

    # Step 2: Extract text using OCR; always remove the copy, even on failure.
    try:
        shutil.copy(source_path, temp_pdf_path)
        extracted_text = extract_text_from_scanned_pdf(temp_pdf_path)
    finally:
        os.remove(temp_pdf_path)

    if not extracted_text:
        return "❌ Could not extract text. Make sure the PDF has readable text."

    # Step 3: Generate questions from extracted text (limit to 1000 chars).
    prompt = "generate questions: " + extracted_text[:1000]
    # Greedy decoding can only return one sequence, so beam search with at
    # least as many beams as requested sequences is enabled.
    questions_output = qg_pipeline(
        prompt,
        max_length=128,
        num_beams=NUM_QUESTIONS,
        num_return_sequences=NUM_QUESTIONS,
    )

    # Step 4: Convert model output into question list. The two options are
    # placeholders; each question gets its own fresh option dicts.
    question_list = [
        {
            "questiontext": generated["generated_text"],
            "questiontype": "single_select",
            "marks": 10,
            "options": [
                {"optiontext": "Option 1", "score": "10"},
                {"optiontext": "Option 2", "score": "0"},
            ],
        }
        for generated in questions_output
    ]

    # Step 5: Build the structure. "questions" is itself a JSON-encoded string
    # nested inside the outer document.
    data = {
        "title": "Certification Title",
        "totalmarks": "50",
        "time": "20",
        "cutoff": "35",
        "failurl": "",
        "passurl": "",
        "sendpassemail": True,
        "questions": json.dumps({"questions": question_list}),
        "maxattempts": 3,
    }

    # Step 6: Wrap JSON inside XML CDATA.
    # NOTE(review): the wrapper markup was missing from this file (the function
    # returned an empty string); the element name "data" is an assumption --
    # confirm it against whatever consumes this XML.
    return _wrap_in_cdata(json.dumps(data))


# 🚀 Gradio Web UI
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="📄 Upload your scanned PDF"),
    outputs="text",
    title="PDF to Question Generator (with OCR)",
    description="Uploads a scanned PDF, extracts text via OCR, and generates XML for quiz integration.",
)

iface.launch()