File size: 2,632 Bytes
2179b89
719feee
 
2179b89
 
719feee
9153df0
719feee
2179b89
9153df0
 
 
 
 
 
719feee
9153df0
719feee
 
2179b89
719feee
 
 
 
9153df0
719feee
9153df0
719feee
9153df0
719feee
 
9153df0
719feee
 
 
 
9153df0
719feee
9153df0
 
 
719feee
9153df0
719feee
 
 
 
2179b89
 
 
719feee
 
2179b89
 
 
9153df0
719feee
2179b89
 
 
 
 
 
 
719feee
2179b89
719feee
2179b89
9153df0
719feee
2179b89
 
9153df0
2179b89
719feee
9153df0
719feee
9153df0
 
2179b89
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import gradio as gr
from pdf2image import convert_from_path
import pytesseract
from transformers import pipeline
import json
import tempfile
import shutil
import os

# 🧠 Load lightweight question generation model
# Built once at import time and reused by every request; constructing the
# pipeline is what loads the model weights and the tokenizer.
# NOTE(review): the tokenizer is the stock "t5-small" one rather than the one
# shipped with the "valhalla/t5-small-qg-prepend" checkpoint — confirm the two
# vocabularies match, otherwise drop the explicit tokenizer argument.
qg_pipeline = pipeline(
    "text2text-generation",
    model="valhalla/t5-small-qg-prepend",
    tokenizer="t5-small"
)

# 🧩 OCR helper: pull the text out of a scanned PDF
def extract_text_from_scanned_pdf(file_path):
    """Return the OCR'd text of the PDF located at *file_path*.

    The PDF is converted to page images with ``convert_from_path`` and each
    image is passed through ``pytesseract.image_to_string``. The per-page
    results are concatenated in page order without any extra separator, and
    leading/trailing whitespace is stripped from the combined string.
    """
    page_images = convert_from_path(file_path)
    ocr_chunks = [pytesseract.image_to_string(image) for image in page_images]
    return "".join(ocr_chunks).strip()

# ⚙️ Main processing function
def process_pdf(pdf_file):
    """Turn an uploaded scanned PDF into a ``<questiondata>`` XML string.

    Args:
        pdf_file: The upload handed over by the Gradio ``File`` input. Either
            a filesystem path (``str`` / ``os.PathLike``) or an object whose
            ``.name`` attribute holds the path. ``None`` means nothing was
            uploaded.

    Returns:
        The quiz definition serialised as JSON and wrapped in a
        ``<questiondata>`` CDATA section, or a message starting with "❌"
        when no file was supplied or OCR produced no text.
    """
    if pdf_file is None:
        return "❌ No file uploaded. Please upload a scanned PDF."

    # Depending on the Gradio version / File settings the upload arrives as a
    # plain path or as a temp-file wrapper exposing the path via `.name`.
    if isinstance(pdf_file, (str, os.PathLike)):
        source_path = os.fspath(pdf_file)
    else:
        source_path = pdf_file.name

    # Step 1️⃣: Reserve a temporary ".pdf" path. The handle is closed before
    # anything is copied so we never write to a file that is still open.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
        temp_pdf_path = temp_pdf.name

    # Step 2️⃣: Copy the upload and extract its text using OCR. The finally
    # clause guarantees the temporary copy is removed even if OCR fails.
    try:
        shutil.copy(source_path, temp_pdf_path)
        extracted_text = extract_text_from_scanned_pdf(temp_pdf_path)
    finally:
        os.remove(temp_pdf_path)

    if not extracted_text.strip():
        return "❌ Could not extract text. Make sure the PDF has readable text."

    # Step 3️⃣: Generate questions from extracted text
    prompt = "generate questions: " + extracted_text[:1000]  # limit to 1000 chars
    # Several return sequences require beam search (or sampling); plain greedy
    # decoding can only produce one, and transformers rejects the request.
    questions_output = qg_pipeline(
        prompt,
        max_length=128,
        num_beams=4,
        num_return_sequences=3,
    )

    # Step 4️⃣: Convert model output into question list. The options are
    # fixed placeholders; only the question text comes from the model.
    question_list = [
        {
            "questiontext": q["generated_text"],
            "questiontype": "single_select",
            "marks": 10,
            "options": [
                {"optiontext": "Option 1", "score": "10"},
                {"optiontext": "Option 2", "score": "0"},
            ],
        }
        for q in questions_output
    ]

    # Step 5️⃣: Build the <questiondata> structure
    # NOTE(review): totalmarks ("50") and cutoff ("35") are hard-coded and do
    # not match 3 questions x 10 marks — confirm the intended values.
    data = {
        "title": "Certification Title",
        "totalmarks": "50",
        "time": "20",
        "cutoff": "35",
        "failurl": "",
        "passurl": "",
        "sendpassemail": True,
        # The question list is embedded as a JSON string, not a nested object.
        "questions": json.dumps({"questions": question_list}),
        "maxattempts": 3,
    }

    # Step 6️⃣: Wrap JSON inside XML CDATA. A literal "]]>" in the payload
    # would terminate the section early, so it is split across two sections;
    # an XML parser joins them back into the original text.
    payload = json.dumps(data).replace("]]>", "]]]]><![CDATA[>")
    return "<questiondata><![CDATA[" + payload + "]]></questiondata>"

# 🚀 Gradio Web UI
# Single-function interface: the uploaded file is passed to process_pdf and
# the string it returns is shown in a plain text output.
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="📄 Upload your scanned PDF"),
    outputs="text",
    title="PDF to Question Generator (with OCR)",
    description="Uploads a scanned PDF, extracts text via OCR, and generates <questiondata> XML for quiz integration."
)

# Launches the app whenever this module is executed (there is no __main__ guard).
iface.launch()