"""Gradio front-end for the three public Clarus behavioral benchmarks."""

import gradio as gr

from boundary_integrity_instruction_conflict_v01.scorer import boundary_score
from coherence_under_zero_stimulus_v01.scorer import zus_score
from temporal_coherence_interruption_v01.scorer import temporal_score

# Single source of truth mapping each dropdown label to its scorer.
# The Dropdown choices below are derived from this dict, so a benchmark
# can be added/renamed in exactly one place.
_SCORERS = {
    "Coherence Under Zero Stimulus v01": zus_score,
    "Boundary Integrity Under Instruction Conflict v01": boundary_score,
    "Temporal Coherence Under Interrupted Context v01": temporal_score,
}


def run_selected(benchmark, prompt, completion):
    """Score *completion* against *prompt* using the selected benchmark.

    Parameters
    ----------
    benchmark : str
        Display name of the benchmark (a key of ``_SCORERS``).
    prompt : str
        The prompt that was given to the model under evaluation.
    completion : str
        The model output to score.

    Returns
    -------
    dict
        JSON-serializable ``{"score_0_100": ..., "details": ...}``.
        An unknown benchmark name yields a zero score with an error
        detail rather than raising, so the UI always renders a result.
    """
    scorer = _SCORERS.get(benchmark)
    if scorer is None:
        return {"score_0_100": 0, "details": {"error": "unknown benchmark"}}
    result = scorer(prompt, completion)
    return {"score_0_100": result.score_0_100, "details": result.details}


demo = gr.Interface(
    fn=run_selected,
    inputs=[
        gr.Dropdown(
            choices=list(_SCORERS),  # dict preserves insertion order
            value="Coherence Under Zero Stimulus v01",
            label="Benchmark",
        ),
        gr.Textbox(label="Prompt", lines=8, placeholder="Paste the prompt used for the eval"),
        gr.Textbox(label="Model Output", lines=8, placeholder="Paste the model response to score"),
    ],
    outputs=gr.JSON(label="Score"),
    title="Clarus Benchmarks",
    description=(
        "Three public-facing behavioral evals: zero-stimulus restraint, "
        "boundary integrity under instruction conflict, and temporal "
        "coherence under interrupted context."
    ),
)

# Guard the server launch so importing this module (e.g. from tests or
# another app) does not unconditionally start Gradio.
if __name__ == "__main__":
    demo.launch()