# clarus-evals / app.py
# Source: Hugging Face Space by ClarusC64, commit e8ae754 (verified)
import gradio as gr
from coherence_under_zero_stimulus_v01.scorer import zus_score
from boundary_integrity_instruction_conflict_v01.scorer import boundary_score
from temporal_coherence_interruption_v01.scorer import temporal_score
def run_selected(benchmark: str, prompt: str, completion: str) -> dict:
    """Score a (prompt, completion) pair with the selected benchmark.

    Args:
        benchmark: Display name of the benchmark chosen in the dropdown.
        prompt: The prompt that was given to the model under evaluation.
        completion: The model's response to score.

    Returns:
        A dict with keys ``score_0_100`` (int/float) and ``details`` (dict).
        If *benchmark* is not recognized, returns a zero score with an
        ``error`` entry in ``details`` instead of raising.
    """
    # Select the scorer in the branch (not via a dict literal) so the scorer
    # names are only resolved when their benchmark is actually requested.
    if benchmark == "Coherence Under Zero Stimulus v01":
        scorer = zus_score
    elif benchmark == "Boundary Integrity Under Instruction Conflict v01":
        scorer = boundary_score
    elif benchmark == "Temporal Coherence Under Interrupted Context v01":
        scorer = temporal_score
    else:
        # Unknown dropdown value: fail soft so the UI still renders a result.
        return {"score_0_100": 0, "details": {"error": "unknown benchmark"}}

    # All scorers share the same result shape; format it once here instead of
    # duplicating the dict in every branch.
    result = scorer(prompt, completion)
    return {"score_0_100": result.score_0_100, "details": result.details}
# Benchmark names shown in the dropdown; must match the strings that
# run_selected dispatches on.
_BENCHMARK_CHOICES = [
    "Coherence Under Zero Stimulus v01",
    "Boundary Integrity Under Instruction Conflict v01",
    "Temporal Coherence Under Interrupted Context v01",
]

_benchmark_dropdown = gr.Dropdown(
    choices=_BENCHMARK_CHOICES,
    value=_BENCHMARK_CHOICES[0],
    label="Benchmark",
)
_prompt_box = gr.Textbox(label="Prompt", lines=8, placeholder="Paste the prompt used for the eval")
_output_box = gr.Textbox(label="Model Output", lines=8, placeholder="Paste the model response to score")

# Single-function UI: pick a benchmark, paste prompt + model output, get JSON.
demo = gr.Interface(
    fn=run_selected,
    inputs=[_benchmark_dropdown, _prompt_box, _output_box],
    outputs=gr.JSON(label="Score"),
    title="Clarus Benchmarks",
    description="Three public-facing behavioral evals: zero-stimulus restraint, boundary integrity under instruction conflict, and temporal coherence under interrupted context.",
)
demo.launch()