Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from pdf2image import convert_from_path | |
| import pytesseract | |
| from transformers import pipeline | |
| import json | |
| import tempfile | |
| import shutil | |
| import os | |
| # 🧠 Load lightweight question generation model | |
# Build the question-generation pipeline once at import time; process_pdf
# reuses this single module-level instance for every request.
qg_pipeline = pipeline(
    task="text2text-generation",
    tokenizer="t5-small",
    model="valhalla/t5-small-qg-prepend",
)
| # 🧩 OCR function: extract text from scanned PDFs | |
def extract_text_from_scanned_pdf(file_path):
    """Run OCR over every page of the PDF at *file_path* and return the text.

    The PDF is rendered to page images with ``convert_from_path``; each image
    is passed to ``pytesseract.image_to_string`` and the per-page results are
    concatenated in page order with no extra separator. Leading and trailing
    whitespace is stripped from the combined text.
    """
    page_images = convert_from_path(file_path)
    ocr_chunks = (pytesseract.image_to_string(image) for image in page_images)
    return "".join(ocr_chunks).strip()
| # ⚙️ Main processing function | |
def process_pdf(pdf_file):
    """Turn an uploaded scanned PDF into a ``<questiondata>`` XML string.

    Args:
        pdf_file: The upload handed over by the Gradio ``File`` input. May be
            ``None`` (nothing uploaded), a filesystem path, or an object
            exposing the path as ``.name``.

    Returns:
        A ``<questiondata>`` element whose CDATA section holds the JSON quiz
        definition, or a human-readable error message starting with "❌" when
        no file was supplied or OCR produced no text.
    """
    # Submitting the form without a file passes None; answer with a message
    # instead of crashing on attribute access.
    if pdf_file is None:
        return "❌ No file received. Please upload a PDF."

    # Accept both a plain path and a tempfile-like wrapper carrying .name.
    if isinstance(pdf_file, (str, os.PathLike)):
        source_path = pdf_file
    else:
        source_path = pdf_file.name

    # Step 1: reserve a temporary path. The handle is closed before copying so
    # the copy also works on platforms that forbid reopening an open temp file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
        temp_pdf_path = temp_pdf.name

    # Step 2: copy the upload and OCR it; always remove the temporary copy,
    # even when the copy or the OCR step raises.
    try:
        shutil.copy(source_path, temp_pdf_path)
        extracted_text = extract_text_from_scanned_pdf(temp_pdf_path)
    finally:
        os.remove(temp_pdf_path)

    if not extracted_text.strip():
        return "❌ Could not extract text. Make sure the PDF has readable text."

    # Step 3: generate questions from the first 1000 characters of the text.
    # Returning several sequences requires beam search (or sampling); greedy
    # decoding rejects num_return_sequences > 1, so request one beam per
    # returned question.
    num_questions = 3
    prompt = "generate questions: " + extracted_text[:1000]
    questions_output = qg_pipeline(
        prompt,
        max_length=128,
        num_beams=num_questions,
        num_return_sequences=num_questions,
    )

    # Step 4: wrap every generated text in the fixed single-select template.
    question_list = [
        {
            "questiontext": generated["generated_text"],
            "questiontype": "single_select",
            "marks": 10,
            "options": [
                {"optiontext": "Option 1", "score": "10"},
                {"optiontext": "Option 2", "score": "0"},
            ],
        }
        for generated in questions_output
    ]

    # Step 5: build the questiondata payload. The question list is embedded as
    # a JSON *string* inside the outer JSON document.
    # NOTE(review): 3 questions x 10 marks = 30, which is below both the
    # hard-coded "totalmarks" (50) and "cutoff" (35) - confirm these values
    # are placeholders filled in downstream.
    data = {
        "title": "Certification Title",
        "totalmarks": "50",
        "time": "20",
        "cutoff": "35",
        "failurl": "",
        "passurl": "",
        "sendpassemail": True,
        "questions": json.dumps({"questions": question_list}),
        "maxattempts": 3,
    }

    # Step 6: wrap the JSON in a CDATA section. A literal "]]>" inside the
    # payload would end the section early, so split it across two adjacent
    # CDATA sections; an XML parser joins them back into the original text.
    payload = json.dumps(data).replace("]]>", "]]]]><![CDATA[>")
    return "<questiondata><![CDATA[" + payload + "]]></questiondata>"
| # 🚀 Gradio Web UI | |
# Single-function web UI: one file upload goes into process_pdf and the
# returned string is shown as plain text.
iface = gr.Interface(
    title="PDF to Question Generator (with OCR)",
    description=(
        "Uploads a scanned PDF, extracts text via OCR, "
        "and generates <questiondata> XML for quiz integration."
    ),
    fn=process_pdf,
    inputs=gr.File(label="📄 Upload your scanned PDF"),
    outputs="text",
)

# Launch the interface when this module is executed.
iface.launch()