import gradio as gr
from pdf2image import convert_from_path
import pytesseract
from transformers import pipeline
import json
import tempfile
import shutil
import os

# 🧠 Question-generation model (lightweight T5 checkpoint), built once at import time
# NOTE(review): the tokenizer is loaded from the base "t5-small" repo rather than
# from the model's own repo — confirm that this pairing is intended.
QG_MODEL_NAME = "valhalla/t5-small-qg-prepend"
QG_TOKENIZER_NAME = "t5-small"

qg_pipeline = pipeline(
    task="text2text-generation",
    model=QG_MODEL_NAME,
    tokenizer=QG_TOKENIZER_NAME,
)

# 🧩 OCR function: extract text from scanned PDFs
def extract_text_from_scanned_pdf(file_path):
    """Return the OCR text of every page of the PDF at *file_path*.

    Each page object produced by ``convert_from_path`` is passed to
    ``pytesseract.image_to_string``; the per-page results are concatenated
    in page order with no separator, and leading/trailing whitespace of the
    combined string is stripped.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The combined, stripped OCR text (an empty string when nothing was
        recognised).
    """
    page_images = convert_from_path(file_path)
    ocr_chunks = [pytesseract.image_to_string(image) for image in page_images]
    return "".join(ocr_chunks).strip()

# ⚙️ Main processing function
def process_pdf(pdf_file):
    """Convert an uploaded scanned PDF into a ``<questiondata>`` XML string.

    Steps: copy the upload to a private temporary file, OCR it with
    ``extract_text_from_scanned_pdf``, feed the first 1000 characters of the
    text to ``qg_pipeline`` to obtain three candidate questions, and wrap the
    resulting JSON payload in a CDATA section.

    Args:
        pdf_file: The value delivered by the file-upload component. Accepted
            forms are a path string, an object exposing the path as ``.name``,
            or ``None`` when nothing was uploaded.

    Returns:
        The ``<questiondata>`` XML string on success, or a human-readable
        message starting with "❌" when no file was supplied or no text could
        be extracted.
    """
    if pdf_file is None:
        return "❌ No file uploaded. Please upload a PDF."

    # Depending on the Gradio version / `type` setting, the upload arrives
    # either as a path string or as a tempfile-like object with `.name`.
    source_path = getattr(pdf_file, "name", pdf_file)

    # Step 1️⃣: Copy uploaded file to a temporary location.
    # mkstemp + close (instead of copying into a still-open NamedTemporaryFile)
    # so the destination is not held open while it is overwritten.
    fd, temp_pdf_path = tempfile.mkstemp(suffix=".pdf")
    os.close(fd)
    try:
        shutil.copy(source_path, temp_pdf_path)

        # Step 2️⃣: Extract text using OCR
        extracted_text = extract_text_from_scanned_pdf(temp_pdf_path)
    finally:
        # Always clean up, even when the copy or the OCR step raises.
        try:
            os.remove(temp_pdf_path)
        except FileNotFoundError:
            # Already gone; nothing left to clean up.
            pass

    if not extracted_text.strip():
        return "❌ Could not extract text. Make sure the PDF has readable text."

    # Step 3️⃣: Generate questions from extracted text
    prompt = "generate questions: " + extracted_text[:1000]  # limit to 1000 chars
    # Returning several sequences requires beam search (or sampling); plain
    # greedy decoding rejects num_return_sequences > 1.
    questions_output = qg_pipeline(
        prompt,
        max_length=128,
        num_beams=3,
        num_return_sequences=3,
    )

    # Steps 4️⃣-6️⃣: Build the question list, the payload and the XML wrapper
    return _build_questiondata_xml(q["generated_text"] for q in questions_output)


def _build_questiondata_xml(question_texts):
    """Build the ``<questiondata>`` XML string for the given question texts."""
    question_list = [
        {
            "questiontext": text,
            "questiontype": "single_select",
            "marks": 10,
            "options": [
                {"optiontext": "Option 1", "score": "10"},
                {"optiontext": "Option 2", "score": "0"},
            ],
        }
        for text in question_texts
    ]

    data = {
        "title": "Certification Title",
        "totalmarks": "50",
        "time": "20",
        "cutoff": "35",
        "failurl": "",
        "passurl": "",
        "sendpassemail": True,
        # The question list is embedded as a JSON *string* inside the payload.
        "questions": json.dumps({"questions": question_list}),
        "maxattempts": 3,
    }

    # A literal "]]>" inside the payload would terminate the CDATA section
    # early; split it across two adjacent CDATA sections so the XML stays
    # well-formed and the parsed text is unchanged.
    payload = json.dumps(data).replace("]]>", "]]]]><![CDATA[>")
    return "<questiondata><![CDATA[" + payload + "]]></questiondata>"

# 🚀 Gradio Web UI: a single file upload as input, plain text (the XML string) as output
pdf_input = gr.File(label="📄 Upload your scanned PDF")

iface = gr.Interface(
    fn=process_pdf,
    inputs=pdf_input,
    outputs="text",
    title="PDF to Question Generator (with OCR)",
    description="Uploads a scanned PDF, extracts text via OCR, and generates <questiondata> XML for quiz integration.",
)

# Launch the interface as soon as this module is executed.
iface.launch()