"""Gradio demo for StructFix: JSON/tool-call repair against a schema.

The user provides:
- A DSL schema (typed fields with enums and required flags), AND
- A broken or loosely formatted output (free-form text is accepted; the model will try to repair it anyway)

The model returns a valid JSON output that matches the schema.
"""
import os
os.environ.setdefault("DISABLE_ONNXRUNTIME_GPU", "1")

import json
import gradio as gr
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


# Hugging Face Hub identifier of the checkpoint that get_model() loads.
MODEL_ID: str = "ottema/structfix-codet5p-220m"

# Module-level cache filled in by get_model() on its first call; both names
# stay None until then.
_tokenizer = None
_model = None


def get_model():
    """Return the cached ``(model, tokenizer)`` pair, loading it on first use.

    The first call loads the tokenizer and the seq2seq model identified by
    ``MODEL_ID`` and stores them in the module-level ``_tokenizer`` and
    ``_model`` globals. Later calls return those stored objects unchanged.

    Returns:
        A ``(model, tokenizer)`` tuple -- note the order: model first.
    """
    global _model, _tokenizer

    # Fast path: the model is already cached, so the tokenizer is as well.
    if _model is not None:
        return _model, _tokenizer

    # The tokenizer is loaded before the model; "_model is None" is the only
    # flag consulted, so a failed model load is retried on the next call.
    _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    _model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)
    return _model, _tokenizer


# Schema presets offered in the "Schema preset" dropdown, keyed by display
# name. Each value is DSL text with one FIELD declaration per line (lines are
# joined with "\n", no trailing newline). The empty "Custom" entry clears the
# text box so the user can type a schema of their own.
PRESET_SCHEMAS = {
    "Customer Support Ticket": (
        "FIELD priority TYPE string VALUES low|medium|high|urgent REQUIRED yes\n"
        "FIELD category TYPE string VALUES billing|technical|account|other REQUIRED yes\n"
        "FIELD description TYPE string REQUIRED yes\n"
        "FIELD needs_callback TYPE boolean REQUIRED no"
    ),
    "Task Management": (
        "FIELD task TYPE string REQUIRED yes\n"
        "FIELD assignee TYPE string REQUIRED no\n"
        "FIELD deadline TYPE string REQUIRED no\n"
        "FIELD priority TYPE string VALUES low|medium|high REQUIRED no\n"
        "FIELD status TYPE string VALUES todo|in_progress|done REQUIRED no"
    ),
    "Code Review Comment": (
        "FIELD file_path TYPE string REQUIRED yes\n"
        "FIELD line_number TYPE integer REQUIRED no\n"
        "FIELD severity TYPE string VALUES nit|warning|error REQUIRED yes\n"
        "FIELD message TYPE string REQUIRED yes"
    ),
    "Restaurant Review": (
        "FIELD rating TYPE integer REQUIRED yes\n"
        "FIELD cuisine TYPE string REQUIRED no\n"
        "FIELD price_range TYPE string VALUES $|$$|$$$|$$$$ REQUIRED no\n"
        "FIELD comments TYPE string REQUIRED no"
    ),
    "Custom (write your own)": "",
}

# Sample "broken" model outputs offered in the "Broken-output preset"
# dropdown, keyed by display name. Long samples are split across adjacent
# string literals purely for readability; the resulting values contain no
# extra characters. The empty "Custom" entry clears the text box.
PRESET_BROKEN = {
    "Missing required field": '{"priority":"high"}',
    "Invalid enum value": (
        '{"priority":"super-mega-urgent",'
        '"description":"Server crashed"}'
    ),
    "Wrong type": (
        '{"priority":123,'
        '"description":"Database is down"}'
    ),
    "Markdown-wrapped JSON": (
        '```json\n'
        '{"status":"done","result":"Successfully deployed"}\n'
        '```'
    ),
    "Partial / truncated": '{"status":"succe',
    "Extra surrounding text": (
        'The system returned this response: '
        '{"task":"Review PR","assignee":"alice"}'
        ' - end of response.'
    ),
    "Code review (análise real)": (
        'Aqui está minha análise:\n'
        '\n'
        '```json\n'
        '{"file": "auth.py", "line": 42, "severity": "critical", '
        '"msg": "SQL injection vulnerability on user input"}\n'
        '```\n'
        '\n'
        'Por favor revisar.'
    ),
    "API de suporte (vazio)": "A chamada retornou: {}",
    "Resposta multilíngue (PT-BR)": (
        'O sistema retornou isto: '
        '{"tarefa":"Revisar PR","responsavel":"alice","status":"em_andamento"}'
    ),
    "Resposta em PT-BR (schema inglês)": (
        '{"status":"concluido",'
        '"descricao":"Deploy foi realizado",'
        '"prioridade":5}'
    ),
    "Custom (write your own)": "",
}

# Rows for the Examples block. Each group names a schema preset followed by
# the broken-output presets to pair with it; groups are listed in display
# order, so a schema may appear in more than one group.
_EXAMPLE_GROUPS = (
    (
        "Customer Support Ticket",
        (
            "Missing required field",
            "Invalid enum value",
            "Wrong type",
            "Markdown-wrapped JSON",
            "API de suporte (vazio)",
        ),
    ),
    ("Task Management", ("Partial / truncated", "Extra surrounding text")),
    (
        "Code Review Comment",
        ("Code review (análise real)", "Markdown-wrapped JSON"),
    ),
    ("Restaurant Review", ("Wrong type", "Extra surrounding text")),
    # PT-BR tests
    ("Task Management", ("Resposta multilíngue (PT-BR)",)),
    ("Customer Support Ticket", ("Resposta em PT-BR (schema inglês)",)),
)

# Flattened to one [schema preset name, broken-output preset name] list per row.
EXAMPLES = [
    [schema_name, broken_name]
    for schema_name, broken_names in _EXAMPLE_GROUPS
    for broken_name in broken_names
]


def _reject_json_constant(name):
    """Refuse the non-standard literals ``NaN``, ``Infinity`` and ``-Infinity``."""
    raise ValueError(f"{name} is not a valid JSON value")


def _pretty_json(text):
    """Return ``text`` re-serialized as indented JSON, or ``None`` if it is not JSON."""
    try:
        # json.loads accepts NaN/Infinity by default although they are not
        # part of JSON; parse_constant lets us reject them explicitly.
        parsed = json.loads(text, parse_constant=_reject_json_constant)
    except (ValueError, RecursionError):
        # ValueError covers json.JSONDecodeError and the rejected constants;
        # RecursionError covers pathologically nested output. Anything else
        # is a real bug and is allowed to propagate.
        return None
    return json.dumps(parsed, indent=2, ensure_ascii=False)


def repair(schema, broken_output):
    """Ask the model to repair ``broken_output`` so that it matches ``schema``.

    Args:
        schema: DSL schema text that is placed under ``SPEC`` in the prompt.
        broken_output: Text to repair, placed under ``BROKEN_OUTPUT``.

    Returns:
        A ``(status, raw_text, pretty_text)`` tuple of strings:

        * ``status`` -- a human-readable message: an "Empty ..." notice when an
          input is missing or whitespace-only, otherwise whether the model
          output parsed as JSON.
        * ``raw_text`` -- the decoded model output, or ``""`` when an input
          was empty.
        * ``pretty_text`` -- the output re-indented with two spaces when it
          parsed as JSON, otherwise the raw text unchanged (``""`` when an
          input was empty).
    """
    # Guard clauses: return before the model is loaded when there is nothing
    # to repair.
    if not schema or not schema.strip():
        return "Empty schema.", "", ""
    if not broken_output or not broken_output.strip():
        return "Empty broken output.", "", ""

    model, tokenizer = get_model()
    prompt = f"""TASK repair_structured_output

SPEC
{schema}

BROKEN_OUTPUT
{broken_output}"""

    # Prompts longer than 512 tokens are truncated by the tokenizer.
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
    # Deterministic decoding: a single beam and no sampling.
    outputs = model.generate(
        **inputs,
        max_length=256,
        num_beams=1,
        do_sample=False,
    )
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    pretty = _pretty_json(text)
    if pretty is None:
        # Show the raw text in the JSON pane as well, so the user can inspect
        # what the model produced.
        return "⚠️ Could not parse output as JSON", text, text
    return "✅ Valid JSON", text, pretty


# Markdown rendered above the controls.
_INTRO_MARKDOWN = """# Ottema StructFix Demo

Repair broken JSON / tool-call output against a typed schema. Model: [`ottema/structfix-codet5p-220m`](https://huggingface.co/ottema/structfix-codet5p-220m) (220M params, Apache-2.0).

**Use cases:**
- LLM emits invalid JSON → fix it
- Missing required fields → fill or flag
- Wrong enum values → coerce to valid
- Markdown-wrapped output → strip the wrapper
- Partial / truncated JSON → complete it
- Schema-constrained generation recovery

**Pipeline:** user provides a DSL schema + a broken output. Model returns a valid JSON object that matches the schema.
"""

# Markdown rendered below the examples.
_FOOTER_MARKDOWN = """---
**How it works:** the DSL schema is a compact typed-declaration language. Each line declares a field with name, type, optional enum, and required flag. The model was trained on 250k synthetic examples of broken outputs paired with their valid counterparts (see [`ottema/structfix-bench`](https://huggingface.co/datasets/ottema/structfix-bench)).

**Credits:** [CodeT5+](https://github.com/salesforce/CodeT5) backbone (Apache-2.0), fine-tuning and dataset by [Ottema](https://huggingface.co/ottema).

**Try the on-demand API:** see [ottema/structfix-codet5p-220m](https://huggingface.co/ottema/structfix-codet5p-220m) for the inference snippet."""

# Presets selected in the dropdowns (and mirrored in the text boxes) at start-up.
_INITIAL_SCHEMA_PRESET = "Customer Support Ticket"
_INITIAL_BROKEN_PRESET = "Invalid enum value"


with gr.Blocks(title="Ottema StructFix Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    # Inputs: one column for the schema, one for the broken output. Each
    # column pairs a preset dropdown with an editable text box.
    with gr.Row():
        with gr.Column():
            schema_preset = gr.Dropdown(
                label="Schema preset",
                choices=list(PRESET_SCHEMAS),
                value=_INITIAL_SCHEMA_PRESET,
            )
            schema_input = gr.Textbox(
                label="Schema (DSL)",
                lines=6,
                value=PRESET_SCHEMAS[_INITIAL_SCHEMA_PRESET],
            )
        with gr.Column():
            broken_preset = gr.Dropdown(
                label="Broken-output preset",
                choices=list(PRESET_BROKEN),
                value=_INITIAL_BROKEN_PRESET,
            )
            broken_input = gr.Textbox(
                label="Broken output (anything goes)",
                lines=4,
                value=PRESET_BROKEN[_INITIAL_BROKEN_PRESET],
            )

    run_btn = gr.Button("Repair", variant="primary")

    # Outputs: the three values returned by repair().
    with gr.Row():
        status = gr.Textbox(label="Status", interactive=False)
        raw_output = gr.Textbox(label="Raw model output", interactive=False)

    pretty_output = gr.Code(label="Repaired JSON", language="json")

    # Choosing a preset copies its stored text into the matching text box.
    schema_preset.change(
        fn=lambda p: PRESET_SCHEMAS[p],
        inputs=[schema_preset],
        outputs=[schema_input],
    )
    broken_preset.change(
        fn=lambda p: PRESET_BROKEN[p],
        inputs=[broken_preset],
        outputs=[broken_input],
    )
    # The button runs the repair on whatever the two text boxes contain.
    run_btn.click(
        fn=repair,
        inputs=[schema_input, broken_input],
        outputs=[status, raw_output, pretty_output],
    )

    # Examples target the two dropdowns, not the text boxes.
    gr.Examples(
        label="Click any example to load it. Then click Repair.",
        examples=EXAMPLES,
        inputs=[schema_preset, broken_preset],
    )

    gr.Markdown(_FOOTER_MARKDOWN)


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()