Spaces:

prat1003
/

project2

Sleeping

App Files Files Community

project2 / app.py

prat1003

Update app.py

9153df0 verified 5 months ago

raw

history blame

2.63 kB

	import gradio as gr
	from pdf2image import convert_from_path
	import pytesseract
	from transformers import pipeline
	import json
	import tempfile
	import shutil
	import os

	# 🧠 Load lightweight question generation model
	# Created once at module level; process_pdf reuses this same pipeline object
	# on every call instead of building a new one per request.
	qg_pipeline = pipeline(
	    task="text2text-generation",
	    tokenizer="t5-small",
	    model="valhalla/t5-small-qg-prepend",
	)

	# 🧩 OCR function: extract text from scanned PDFs
	def extract_text_from_scanned_pdf(file_path):
	    """Return the OCR text of every page of the PDF at *file_path*.

	    The PDF is turned into a sequence of pages by ``convert_from_path`` and
	    each page is passed to ``pytesseract.image_to_string``. The per-page
	    results are concatenated in page order with no separator, and leading
	    and trailing whitespace is stripped from the combined string.
	    """
	    page_images = convert_from_path(file_path)
	    page_texts = [pytesseract.image_to_string(image) for image in page_images]
	    return "".join(page_texts).strip()

	# ⚙️ Main processing function
	def _split_cdata_terminators(payload):
	    """Make *payload* safe to embed inside an XML CDATA section.

	    A literal ``]]>`` would close the section early, so each occurrence is
	    split across two adjacent CDATA sections; an XML parser concatenates
	    them back into the original text.
	    """
	    return payload.replace("]]>", "]]]]><![CDATA[>")


	def _build_question(question_text):
	    """Return the quiz entry for one generated question.

	    Only the question text varies; the type, marks and the two options are
	    fixed placeholder values.
	    """
	    return {
	        "questiontext": question_text,
	        "questiontype": "single_select",
	        "marks": 10,
	        "options": [
	            {"optiontext": "Option 1", "score": "10"},
	            {"optiontext": "Option 2", "score": "0"},
	        ],
	    }


	def process_pdf(pdf_file):
	    """Turn an uploaded scanned PDF into a ``<questiondata>`` XML string.

	    Args:
	        pdf_file: The uploaded file, given either as an object whose ``name``
	            attribute is the file path or as the path itself. ``None`` means
	            nothing was uploaded.

	    Returns:
	        The ``<questiondata>`` element wrapping a JSON document in CDATA, or
	        a human-readable message starting with "❌" when there is no upload
	        or OCR produced no text.
	    """
	    if pdf_file is None:
	        return "❌ No file uploaded. Please upload a PDF."

	    # Accept an object exposing the path as `.name` (what this function has
	    # always expected) as well as a plain path string.
	    source_path = getattr(pdf_file, "name", pdf_file)

	    # Reserve a temp path and close the handle before copying onto it, then
	    # always delete the copy, even when the copy or OCR step raises.
	    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
	        temp_pdf_path = temp_pdf.name
	    try:
	        shutil.copy(source_path, temp_pdf_path)
	        extracted_text = extract_text_from_scanned_pdf(temp_pdf_path)
	    finally:
	        os.remove(temp_pdf_path)

	    if not extracted_text.strip():
	        return "❌ Could not extract text. Make sure the PDF has readable text."

	    # Only the first 1000 characters of the OCR text are sent to the model.
	    # NOTE(review): the "generate questions:" prefix may not be the input
	    # format a "qg-prepend" checkpoint expects; confirm against its model card.
	    prompt = "generate questions: " + extracted_text[:1000]

	    # Returning several sequences needs beam search (or sampling): greedy
	    # decoding rejects num_return_sequences > 1, and num_beams must be at
	    # least num_return_sequences.
	    questions_output = qg_pipeline(
	        prompt,
	        max_length=128,
	        num_beams=4,
	        num_return_sequences=3,
	    )

	    question_list = [_build_question(q["generated_text"]) for q in questions_output]

	    # NOTE(review): three questions at 10 marks each total 30, while
	    # "totalmarks" is "50" and "cutoff" is "35"; confirm these are intended.
	    data = {
	        "title": "Certification Title",
	        "totalmarks": "50",
	        "time": "20",
	        "cutoff": "35",
	        "failurl": "",
	        "passurl": "",
	        "sendpassemail": True,
	        # The question list is itself JSON-encoded, so it travels as a string
	        # inside the outer JSON document.
	        "questions": json.dumps({"questions": question_list}),
	        "maxattempts": 3,
	    }

	    payload = _split_cdata_terminators(json.dumps(data))
	    return "<questiondata><![CDATA[" + payload + "]]></questiondata>"

	# 🚀 Gradio Web UI
	# The description accepts Markdown/HTML, so the tag name is written as a code
	# span; a bare <questiondata> would be parsed as an HTML element and would not
	# show up in the rendered text.
	iface = gr.Interface(
	    fn=process_pdf,
	    inputs=gr.File(label="📄 Upload your scanned PDF"),
	    outputs="text",
	    title="PDF to Question Generator (with OCR)",
	    description="Uploads a scanned PDF, extracts text via OCR, and generates `<questiondata>` XML for quiz integration.",
	)

	# Runs unconditionally whenever this file is executed.
	iface.launch()