Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,33 +1,53 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
from
|
|
|
|
| 3 |
from transformers import pipeline
|
| 4 |
import json
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
| 9 |
text = ""
|
| 10 |
-
for page in
|
| 11 |
-
text +=
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
"questiontype": "single_select",
|
| 23 |
"marks": 10,
|
| 24 |
"options": [
|
| 25 |
-
{"optiontext": "
|
| 26 |
-
{"optiontext": "
|
| 27 |
]
|
| 28 |
})
|
| 29 |
|
| 30 |
-
|
| 31 |
"title": "Certification Title",
|
| 32 |
"totalmarks": "50",
|
| 33 |
"time": "20",
|
|
@@ -35,18 +55,20 @@ def generate_questions(pdf_file):
|
|
| 35 |
"failurl": "",
|
| 36 |
"passurl": "",
|
| 37 |
"sendpassemail": True,
|
| 38 |
-
"questions": json.dumps({"questions":
|
| 39 |
"maxattempts": 3
|
| 40 |
-
}
|
| 41 |
|
| 42 |
-
xml_output =
|
| 43 |
return xml_output
|
| 44 |
|
| 45 |
-
# Gradio
|
| 46 |
iface = gr.Interface(
|
| 47 |
-
fn=
|
| 48 |
-
inputs=gr.File(
|
| 49 |
-
outputs=
|
|
|
|
|
|
|
| 50 |
)
|
| 51 |
|
| 52 |
iface.launch()
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
+
from pdf2image import convert_from_path
|
| 3 |
+
import pytesseract
|
| 4 |
from transformers import pipeline
|
| 5 |
import json
|
| 6 |
+
import tempfile
|
| 7 |
+
import os
|
| 8 |
|
| 9 |
+
# Load question generation model
|
| 10 |
+
# Module-level, so the text2text-generation pipeline is constructed once when
# the app is imported rather than on every request.
# NOTE(review): the valhalla "*-qg-prepend" checkpoints appear to be
# answer-aware models that expect "answer: ... context: ..." style input,
# whereas process_pdf sends a "generate questions: " prefix (the format of the
# "*-e2e-qg" checkpoints) — confirm against the model card before relying on
# the generated questions.
qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-small-qg-prepend")
|
| 11 |
+
|
| 12 |
+
# OCR function
|
| 13 |
+
def extract_text_from_scanned_pdf(file_path):
    """Run OCR over every page of a PDF and return the combined text.

    The PDF is converted to one image per page with
    ``pdf2image.convert_from_path`` and each image is passed to
    ``pytesseract.image_to_string``. The per-page results are concatenated
    in page order with no extra separator.

    Args:
        file_path: Path of the PDF file on disk.

    Returns:
        The recognised text with leading and trailing whitespace stripped;
        an empty string when no text is recognised.
    """
    pages = convert_from_path(file_path)
    # Join once instead of growing a string with ``+=`` page by page.
    return "".join(pytesseract.image_to_string(page) for page in pages).strip()
|
| 19 |
+
|
| 20 |
+
# Main function
|
| 21 |
+
def process_pdf(pdf_file):
|
| 22 |
+
# Step 1: Save uploaded PDF
|
| 23 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
|
| 24 |
+
temp_pdf.write(pdf_file.read())
|
| 25 |
+
temp_pdf_path = temp_pdf.name
|
| 26 |
+
|
| 27 |
+
# Step 2: OCR extraction
|
| 28 |
+
extracted_text = extract_text_from_scanned_pdf(temp_pdf_path)
|
| 29 |
+
os.remove(temp_pdf_path)
|
| 30 |
+
|
| 31 |
+
if not extracted_text.strip():
|
| 32 |
+
return "❌ Could not extract text. Make sure the PDF has readable content."
|
| 33 |
+
|
| 34 |
+
# Step 3: Generate questions
|
| 35 |
+
questions_output = qg_pipeline("generate questions: " + extracted_text[:1000], max_length=128, num_return_sequences=3)
|
| 36 |
+
|
| 37 |
+
# Step 4: Convert to <questiondata> XML
|
| 38 |
+
question_list = []
|
| 39 |
+
for q in questions_output:
|
| 40 |
+
question_list.append({
|
| 41 |
+
"questiontext": q["generated_text"],
|
| 42 |
"questiontype": "single_select",
|
| 43 |
"marks": 10,
|
| 44 |
"options": [
|
| 45 |
+
{"optiontext": "Option 1", "score": "10"},
|
| 46 |
+
{"optiontext": "Option 2", "score": "0"}
|
| 47 |
]
|
| 48 |
})
|
| 49 |
|
| 50 |
+
data = {
|
| 51 |
"title": "Certification Title",
|
| 52 |
"totalmarks": "50",
|
| 53 |
"time": "20",
|
|
|
|
| 55 |
"failurl": "",
|
| 56 |
"passurl": "",
|
| 57 |
"sendpassemail": True,
|
| 58 |
+
"questions": json.dumps({"questions": question_list}),
|
| 59 |
"maxattempts": 3
|
| 60 |
+
}
|
| 61 |
|
| 62 |
+
xml_output = "<questiondata><![CDATA[" + json.dumps(data) + "]]></questiondata>"
|
| 63 |
return xml_output
|
| 64 |
|
| 65 |
+
# Gradio interface
|
| 66 |
# Web UI: a single file-upload input handled by process_pdf, whose returned
# string is displayed as plain text.
iface = gr.Interface(
    title="📄 PDF to Question Generator (with OCR)",
    description="Uploads a scanned PDF, runs OCR, and generates <questiondata> XML output for your quiz system.",
    fn=process_pdf,
    inputs=gr.File(label="Upload your scanned PDF"),
    outputs="text",
)

# Start serving the interface.
iface.launch()
|