# Spaces: Sleeping. File size: 4,612 bytes.
import gradio as gr
import tempfile
import shutil
import os
import json
import numpy as np
from pdf2image import convert_from_path
import easyocr
from PyPDF2 import PdfReader
from transformers import pipeline
import random
# -----------------------------
# Initialize OCR and Transformers
# -----------------------------
# Settings shared by both text2text pipelines created below.
_T5_TASK = "text2text-generation"
_T5_TOKENIZER = "t5-small"

# EasyOCR reader configured for English; used by the scanned-PDF path.
reader = easyocr.Reader(['en'])

# Question generation model
qg_pipeline = pipeline(
    _T5_TASK,
    model="valhalla/t5-small-qg-prepend",
    tokenizer=_T5_TOKENIZER,
)

# Question-answer generation model
qa_pipeline = pipeline(
    _T5_TASK,
    model="valhalla/t5-small-qa-qg-hl",
    tokenizer=_T5_TOKENIZER,
)
# -----------------------------
# Extract text from selectable PDFs
# -----------------------------
def extract_text_from_pdf(file_path):
    """Return the text layer of the PDF at *file_path*.

    Each page's ``extract_text()`` result is appended with a trailing
    newline; pages that yield nothing (or that expose no ``extract_text``
    attribute) are skipped. The combined string is stripped before it is
    returned, so a PDF without any extractable text gives ``""``.
    """
    def _no_text():
        # Stand-in used when a page object has no ``extract_text`` attribute.
        return None

    chunks = []
    for pdf_page in PdfReader(file_path).pages:
        page_text = getattr(pdf_page, 'extract_text', _no_text)()
        if page_text:
            chunks.append(page_text + "\n")
    return "".join(chunks).strip()
# -----------------------------
# Extract text from scanned PDFs using EasyOCR
# -----------------------------
def extract_text_from_scanned_pdf(file_path):
    """OCR every page of the PDF at *file_path* and return the combined text.

    Pages are rendered with ``convert_from_path`` at 150 dpi and passed as
    NumPy arrays to the module-level EasyOCR ``reader``. The strings found
    on a page are joined with single spaces and followed by a newline. A
    page whose OCR raises is reported on stdout and skipped. The final
    string is stripped.
    """
    page_chunks = []
    for page_image in convert_from_path(file_path, dpi=150):
        try:
            pixels = np.array(page_image)
            # detail=0 is passed so the result can be joined as plain strings.
            fragments = reader.readtext(pixels, detail=0)
            page_chunks.append(" ".join(fragments) + "\n")
        except Exception as err:
            print("OCR error on page:", err)
    return "".join(page_chunks).strip()
# -----------------------------
# Generate dummy options
# -----------------------------
def generate_options(correct_answer):
    """Return four shuffled answer options for a single-select question.

    The list holds *correct_answer* exactly once plus three distinct
    generic filler options, in random order.

    Args:
        correct_answer: Text of the correct option.

    Returns:
        list: Four unique option strings.
    """
    filler_pool = [
        "None of the above",
        "All of the above",
        "Not mentioned",
        "Cannot be determined",
        "Irrelevant information",
    ]
    # Drop any filler equal to the correct answer so it cannot appear twice.
    # At least four fillers always remain, so sampling three cannot fail.
    candidates = [opt for opt in filler_pool if opt != correct_answer]
    options = [correct_answer, *random.sample(candidates, 3)]
    random.shuffle(options)
    return options
# -----------------------------
# Main processing function
# -----------------------------
def _build_question(question_text, correct_answer, marks=10):
    """Return one single-select question dict with shuffled options."""
    return {
        "questiontext": question_text,
        "questiontype": "single_select",
        "marks": marks,
        "options": [
            {"optiontext": opt, "score": str(marks) if opt == correct_answer else "0"}
            for opt in generate_options(correct_answer)
        ],
    }


def _wrap_questiondata(payload):
    """Wrap *payload* in a ``<questiondata>`` CDATA section.

    A literal ``]]>`` inside the payload would terminate the section early,
    so it is split across two adjacent CDATA sections.
    """
    safe_payload = payload.replace("]]>", "]]]]><![CDATA[>")
    return "<questiondata><![CDATA[" + safe_payload + "]]></questiondata>"


def process_pdf(pdf_file):
    """Turn an uploaded PDF into a ``<questiondata>`` XML snippet.

    The upload is copied to a temporary file, its text is read with
    ``extract_text_from_pdf`` and, if that yields nothing, with
    ``extract_text_from_scanned_pdf``. The first 1000 characters of the
    text are sent to ``qg_pipeline`` and ``qa_pipeline``; their outputs are
    paired by position into single-select questions worth 10 marks each.

    Args:
        pdf_file: The uploaded file, either an object exposing the path as
            ``.name`` or a plain path string. ``None`` means nothing was
            uploaded.

    Returns:
        str: The XML snippet, or a human-readable message when no file was
        given or no text could be extracted.
    """
    if pdf_file is None:
        return "No file uploaded. Please upload a PDF."

    # Accept both a file-like wrapper (path in ``.name``) and a bare path.
    source_path = getattr(pdf_file, "name", pdf_file)

    # Reserve a temp path and close the handle before copying onto it, so
    # the copy does not target a file that is still open.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
        temp_pdf_path = temp_pdf.name
    try:
        shutil.copy(source_path, temp_pdf_path)
        # Step 1: try the embedded text layer.
        extracted_text = extract_text_from_pdf(temp_pdf_path)
        # Step 2: fall back to OCR when the text layer is empty.
        if not extracted_text.strip():
            extracted_text = extract_text_from_scanned_pdf(temp_pdf_path)
    finally:
        # Always remove the temp copy, even when extraction raises.
        os.remove(temp_pdf_path)

    if not extracted_text.strip():
        # NOTE(review): the leading character looks like a mis-decoded
        # symbol; confirm the intended glyph against the original file.
        return "โ Could not extract text. Make sure the PDF has readable content."

    context = extracted_text[:1000]

    # Step 3: generate questions.
    questions_output = qg_pipeline(
        "generate questions: " + context,
        max_length=128,
        num_beams=3,
        num_return_sequences=3,
    )
    # Step 4: generate answers.
    # NOTE(review): answers are produced independently of the questions and
    # matched by index only; confirm this pairing is intended.
    answers_output = qa_pipeline(
        "answer questions: " + context,
        max_length=64,
        num_beams=3,
        num_return_sequences=3,
    )

    # Step 5: build the question list; "N/A" stands in for a missing answer.
    question_list = []
    for index, generated in enumerate(questions_output):
        if index < len(answers_output):
            correct_answer = answers_output[index]["generated_text"]
        else:
            correct_answer = "N/A"
        question_list.append(
            _build_question(generated["generated_text"], correct_answer)
        )

    # Step 6: build the <questiondata> structure.
    # NOTE(review): "totalmarks" and "cutoff" are fixed strings and do not
    # follow the generated questions (3 requested sequences x 10 marks = 30,
    # which is below the cutoff of 35) -- confirm the intended values.
    data = {
        "title": "Certification Title",
        "totalmarks": "50",
        "time": "20",
        "cutoff": "35",
        "failurl": "",
        "passurl": "",
        "sendpassemail": True,
        # The question list is embedded as a JSON string, not a nested object.
        "questions": json.dumps({"questions": question_list}),
        "maxattempts": 3,
    }

    # Step 7: wrap the JSON in XML CDATA.
    return _wrap_questiondata(json.dumps(data, indent=2))
# -----------------------------
# Gradio Interface
# -----------------------------
# Single file-upload input; its value is passed to process_pdf.
# NOTE(review): the first character of the label and the character between
# "PDF" and "Question" in the title look like mis-decoded symbols; confirm
# the intended glyphs against the original file.
pdf_input = gr.File(label="๐ Upload your PDF")

iface = gr.Interface(
    fn=process_pdf,
    inputs=pdf_input,
    outputs="text",
    title="PDF โ Question & Answer Generator (with OCR)",
    description="Uploads a PDF, extracts text (or OCR for scanned PDFs), and generates XML with questions + answers.",
)

# Start the app as soon as the module is executed.
iface.launch()
# End of script.