File size: 4,612 Bytes
2179b89
719feee
9153df0
719feee
2085bbf
e7b5f58
2085bbf
 
 
 
ca5f6c8
e7b5f58
2085bbf
 
 
e7b5f58
 
ca5f6c8
2085bbf
 
 
 
 
 
ca5f6c8
 
 
 
 
 
 
2085bbf
 
 
 
 
 
 
4b1b7b9
2085bbf
 
 
 
 
 
 
e7b5f58
2085bbf
e7b5f58
 
4b1b7b9
 
 
 
 
 
e7b5f58
 
ca5f6c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2085bbf
 
 
719feee
2085bbf
719feee
9153df0
719feee
 
2085bbf
 
719feee
2085bbf
719feee
2085bbf
719feee
2085bbf
719feee
2085bbf
 
 
ca5f6c8
 
 
2085bbf
ca5f6c8
 
 
 
 
719feee
ca5f6c8
 
 
 
 
719feee
ca5f6c8
2179b89
 
 
ca5f6c8
 
2179b89
 
 
ca5f6c8
719feee
2179b89
 
 
 
 
 
 
719feee
2179b89
719feee
2179b89
ca5f6c8
 
2179b89
 
2085bbf
 
 
2179b89
719feee
2085bbf
719feee
ca5f6c8
 
2179b89
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import gradio as gr
import tempfile
import shutil
import os
import json
import numpy as np
from pdf2image import convert_from_path
import easyocr
from PyPDF2 import PdfReader
from transformers import pipeline
import random

# -----------------------------
# Initialize OCR and Transformers
# -----------------------------
# EasyOCR reader configured for English only; used for OCR on scanned pages.
reader = easyocr.Reader(["en"])

# Text2text pipeline that produces the question candidates.
# NOTE(review): both pipelines pair a fine-tuned checkpoint with the stock
# "t5-small" tokenizer, and process_pdf feeds them free-form prompt prefixes.
# Confirm against the model cards that this tokenizer and those prefixes match
# the input format the checkpoints were trained on.
qg_pipeline = pipeline(
    task="text2text-generation",
    model="valhalla/t5-small-qg-prepend",
    tokenizer="t5-small",
)

# Text2text pipeline that produces the answer candidates.
qa_pipeline = pipeline(
    task="text2text-generation",
    model="valhalla/t5-small-qa-qg-hl",
    tokenizer="t5-small",
)

# -----------------------------
# Extract text from selectable PDFs
# -----------------------------
def extract_text_from_pdf(file_path):
    """Return the embedded (selectable) text of a PDF.

    Each page that yields non-empty text contributes that text followed by a
    newline; pages with no text are skipped. The combined result is stripped
    of leading and trailing whitespace, so a PDF without a text layer gives "".

    Args:
        file_path: Location of the PDF, passed straight to ``PdfReader``.

    Returns:
        The concatenated page text as a single string.
    """
    document = PdfReader(file_path)
    page_chunks = []
    for pdf_page in document.pages:
        # Tolerate page objects that lack ``extract_text`` by treating them
        # as pages without text.
        extract = getattr(pdf_page, 'extract_text', lambda: None)
        page_text = extract()
        if not page_text:
            continue
        page_chunks.append(page_text + "\n")
    return "".join(page_chunks).strip()

# -----------------------------
# Extract text from scanned PDFs using EasyOCR
# -----------------------------
def extract_text_from_scanned_pdf(file_path):
    """Return the text of a scanned PDF by running OCR on every page.

    The PDF is rasterised at 150 dpi, each page image is handed to the
    module-level EasyOCR ``reader``, and the recognised fragments of a page
    are joined with spaces, one line per page. OCR is best-effort: a page
    that fails is reported on stdout and skipped.

    Args:
        file_path: Location of the PDF, passed to ``convert_from_path``.

    Returns:
        The recognised text with surrounding whitespace stripped.
    """
    page_images = convert_from_path(file_path, dpi=150)
    page_lines = []
    for image in page_images:
        try:
            pixels = np.array(image)
            # detail=0 asks EasyOCR for the recognised strings only.
            fragments = reader.readtext(pixels, detail=0)
            page_lines.append(" ".join(fragments) + "\n")
        except Exception as e:
            print("OCR error on page:", e)
    return "".join(page_lines).strip()

# -----------------------------
# Generate dummy options
# -----------------------------
def generate_options(correct_answer, num_options=4):
    """Return a shuffled option list made of the correct answer plus distractors.

    Distractors are drawn without replacement from a fixed set of generic
    phrases. A distractor that equals the correct answer, ignoring case and
    surrounding whitespace, is never used, so the list cannot contain two
    options that read the same.

    Args:
        correct_answer: The option that must be present in the result. It is
            returned unchanged.
        num_options: Desired total number of options (default 4). The result
            is shorter when fewer distractors are available, and always
            contains at least the correct answer.

    Returns:
        A new list with ``correct_answer`` and the chosen distractors in
        random order.
    """
    distractors = (
        "None of the above",
        "All of the above",
        "Not mentioned",
        "Cannot be determined",
        "Irrelevant information",
    )

    # Compare case-insensitively so e.g. "not mentioned" does not end up next
    # to the canned "Not mentioned".
    if isinstance(correct_answer, str):
        answer_key = correct_answer.strip().casefold()
    else:
        answer_key = None
    pool = [d for d in distractors if d.casefold() != answer_key]

    # Sample without replacement instead of retrying random picks.
    wanted = max(0, min(num_options - 1, len(pool)))
    options = [correct_answer, *random.sample(pool, wanted)]
    random.shuffle(options)
    return options

# -----------------------------
# Main processing function
# -----------------------------
def _build_question(question, correct_answer, marks):
    """Return one single-select question entry with scored options."""
    return {
        "questiontext": question,
        "questiontype": "single_select",
        "marks": marks,
        "options": [
            {"optiontext": opt, "score": str(marks) if opt == correct_answer else "0"}
            for opt in generate_options(correct_answer)
        ],
    }


def _wrap_in_cdata(tag, payload):
    """Return ``payload`` as a CDATA section enclosed in ``<tag>...</tag>``."""
    # A literal "]]>" inside the payload would terminate the CDATA section
    # early; split it across two sections so the markup stays well-formed.
    safe_payload = payload.replace("]]>", "]]]]><![CDATA[>")
    return "<" + tag + "><![CDATA[" + safe_payload + "]]></" + tag + ">"


def process_pdf(pdf_file):
    """Generate a ``<questiondata>`` quiz document from an uploaded PDF.

    The PDF is copied to a private temporary file, its text is extracted
    directly (falling back to OCR when there is no text layer), and the first
    1000 characters are fed to the question and answer pipelines. The
    resulting questions are serialised as JSON inside an XML CDATA section.

    Args:
        pdf_file: The upload from Gradio. Either a filesystem path
            (``str`` / ``os.PathLike``) or an object exposing the path as
            ``.name``. ``None`` means nothing was uploaded.

    Returns:
        The XML string on success, or a human-readable error message when no
        file was supplied or no text could be extracted.
    """
    marks_per_question = 10
    context_chars = 1000

    if pdf_file is None:
        return "❌ No PDF uploaded. Please choose a file and try again."

    # Accept both a plain path and a tempfile-like wrapper with `.name`.
    if isinstance(pdf_file, (str, os.PathLike)):
        source_path = pdf_file
    else:
        source_path = pdf_file.name

    # Reserve a temp path, then close the handle before copying onto it so
    # the copy also works on platforms that forbid reopening an open file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
        temp_pdf_path = temp_pdf.name

    try:
        shutil.copy(source_path, temp_pdf_path)

        # Step 1: Try extracting text from PDF directly
        extracted_text = extract_text_from_pdf(temp_pdf_path)

        # Step 2: If empty, use OCR
        if not extracted_text.strip():
            extracted_text = extract_text_from_scanned_pdf(temp_pdf_path)
    finally:
        # Always clean up, even when extraction raises.
        os.remove(temp_pdf_path)

    if not extracted_text.strip():
        return "❌ Could not extract text. Make sure the PDF has readable content."

    context = extracted_text[:context_chars]

    # Step 3: Generate questions
    # NOTE(review): the prompt prefixes are free-form; confirm they match the
    # input format expected by the checkpoints loaded at module level.
    prompt_q = "generate questions: " + context
    questions_output = qg_pipeline(prompt_q, max_length=128, num_beams=3, num_return_sequences=3)

    # Step 4: Generate answers
    # The answer model only sees the context, not the generated questions, so
    # answers are paired with questions purely by position below.
    prompt_a = "answer questions: " + context
    answers_output = qa_pipeline(prompt_a, max_length=64, num_beams=3, num_return_sequences=3)

    # Step 5: Build question list
    question_list = []
    for i, q in enumerate(questions_output):
        correct_answer = answers_output[i]["generated_text"] if i < len(answers_output) else "N/A"
        question_list.append(
            _build_question(q["generated_text"], correct_answer, marks_per_question)
        )

    # Derive the totals from the questions actually produced; a fixed
    # total/cutoff can exceed the attainable score and make passing impossible.
    total_marks = marks_per_question * len(question_list)
    cutoff_marks = -(-total_marks * 7 // 10)  # 70% of the total, rounded up

    # Step 6: Build <questiondata> structure
    data = {
        "title": "Certification Title",
        "totalmarks": str(total_marks),
        "time": "20",
        "cutoff": str(cutoff_marks),
        "failurl": "",
        "passurl": "",
        "sendpassemail": True,
        # The question list is embedded as a JSON string, not a nested object.
        "questions": json.dumps({"questions": question_list}),
        "maxattempts": 3
    }

    # Step 7: Wrap JSON in XML CDATA
    return _wrap_in_cdata("questiondata", json.dumps(data, indent=2))

# -----------------------------
# Gradio Interface
# -----------------------------
# One file input wired to process_pdf, with the generated XML shown as plain
# text. The emoji and arrow in the label/title are written as real Unicode
# characters (they were previously stored as mis-decoded byte sequences).
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="📄 Upload your PDF"),
    outputs="text",
    title="PDF → Question & Answer Generator (with OCR)",
    description="Uploads a PDF, extracts text (or OCR for scanned PDFs), and generates XML with questions + answers."
)

# Start the web server as soon as the module is executed.
iface.launch()