File size: 1,768 Bytes
1933cc8
4fc1620
bb1304d
4fc1620
e8ae754
1933cc8
e8ae754
 
 
4fc1620
 
e8ae754
 
4fc1620
 
e8ae754
 
 
 
 
 
 
1933cc8
 
4fc1620
1933cc8
4fc1620
e8ae754
 
 
 
 
 
 
4fc1620
e8ae754
 
1933cc8
e8ae754
4fc1620
e8ae754
1933cc8
 
bb1304d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import gradio as gr

from coherence_under_zero_stimulus_v01.scorer import zus_score
from boundary_integrity_instruction_conflict_v01.scorer import boundary_score
from temporal_coherence_interruption_v01.scorer import temporal_score


def run_selected(benchmark, prompt, completion):
    """Score a prompt/completion pair with the benchmark picked by title.

    Args:
        benchmark: Benchmark title. It is compared by exact string equality
            against the three known titles.
        prompt: Forwarded as the first positional argument to the scorer.
        completion: Forwarded as the second positional argument to the scorer.

    Returns:
        A dict with the keys ``"score_0_100"`` and ``"details"``. For a known
        benchmark both values are read from the same-named attributes of the
        scorer's result. For any other title the score is ``0`` and the
        details are ``{"error": "unknown benchmark"}``.
    """
    # Choose the scorer first so the result is repackaged in a single place.
    if benchmark == "Coherence Under Zero Stimulus v01":
        scorer = zus_score
    elif benchmark == "Boundary Integrity Under Instruction Conflict v01":
        scorer = boundary_score
    elif benchmark == "Temporal Coherence Under Interrupted Context v01":
        scorer = temporal_score
    else:
        return {"score_0_100": 0, "details": {"error": "unknown benchmark"}}

    outcome = scorer(prompt, completion)
    return {"score_0_100": outcome.score_0_100, "details": outcome.details}


# Benchmark titles shown in the dropdown. run_selected matches on these exact
# strings, so the two must stay in sync.
_benchmark_choices = [
    "Coherence Under Zero Stimulus v01",
    "Boundary Integrity Under Instruction Conflict v01",
    "Temporal Coherence Under Interrupted Context v01",
]

# Input widgets, in the positional order run_selected expects:
# (benchmark, prompt, completion).
_benchmark_picker = gr.Dropdown(
    choices=_benchmark_choices,
    value=_benchmark_choices[0],
    label="Benchmark",
)
_prompt_box = gr.Textbox(
    label="Prompt",
    lines=8,
    placeholder="Paste the prompt used for the eval",
)
_completion_box = gr.Textbox(
    label="Model Output",
    lines=8,
    placeholder="Paste the model response to score",
)

# The dict returned by run_selected is rendered as JSON.
_score_view = gr.JSON(label="Score")

demo = gr.Interface(
    fn=run_selected,
    inputs=[_benchmark_picker, _prompt_box, _completion_box],
    outputs=_score_view,
    title="Clarus Benchmarks",
    description="Three public-facing behavioral evals: zero-stimulus restraint, boundary integrity under instruction conflict, and temporal coherence under interrupted context.",
)

# NOTE(review): launch() runs whenever this module is executed or imported;
# consider an `if __name__ == "__main__":` guard if the module is ever imported.
demo.launch()