"""Gradio front-end for the three public Clarus behavioral benchmarks."""

import gradio as gr

from boundary_integrity_instruction_conflict_v01.scorer import boundary_score
from coherence_under_zero_stimulus_v01.scorer import zus_score
from temporal_coherence_interruption_v01.scorer import temporal_score

# Single source of truth mapping each dropdown label to its scorer.
# The Dropdown choices below are derived from this dict, so a benchmark
# can be added/renamed in exactly one place.
_SCORERS = {
    "Coherence Under Zero Stimulus v01": zus_score,
    "Boundary Integrity Under Instruction Conflict v01": boundary_score,
    "Temporal Coherence Under Interrupted Context v01": temporal_score,
}


def run_selected(benchmark, prompt, completion):
    """Score *completion* against *prompt* using the selected benchmark.

    Parameters
    ----------
    benchmark : str
        Display name of the benchmark (a key of ``_SCORERS``).
    prompt : str
        The prompt that was given to the model under evaluation.
    completion : str
        The model output to score.

    Returns
    -------
    dict
        JSON-serializable ``{"score_0_100": ..., "details": ...}``.
        An unknown benchmark name yields a zero score with an error
        detail rather than raising, so the UI always renders a result.
    """
    scorer = _SCORERS.get(benchmark)
    if scorer is None:
        return {"score_0_100": 0, "details": {"error": "unknown benchmark"}}
    result = scorer(prompt, completion)
    return {"score_0_100": result.score_0_100, "details": result.details}


demo = gr.Interface(
    fn=run_selected,
    inputs=[
        gr.Dropdown(
            choices=list(_SCORERS),  # dict preserves insertion order
            value="Coherence Under Zero Stimulus v01",
            label="Benchmark",
        ),
        gr.Textbox(label="Prompt", lines=8, placeholder="Paste the prompt used for the eval"),
        gr.Textbox(label="Model Output", lines=8, placeholder="Paste the model response to score"),
    ],
    outputs=gr.JSON(label="Score"),
    title="Clarus Benchmarks",
    description=(
        "Three public-facing behavioral evals: zero-stimulus restraint, "
        "boundary integrity under instruction conflict, and temporal "
        "coherence under interrupted context."
    ),
)

# Guard the server launch so importing this module (e.g. from tests or
# another app) does not unconditionally start Gradio.
if __name__ == "__main__":
    demo.launch()