ClarusC64 committed on
Commit
e8ae754
·
verified ·
1 Parent(s): 454d34b

Update app.py

Browse files

Add temporal coherence benchmark to app

Files changed (1) hide show
  1. app.py +24 -11
app.py CHANGED
@@ -2,30 +2,43 @@ import gradio as gr
2
 
3
  from coherence_under_zero_stimulus_v01.scorer import zus_score
4
  from boundary_integrity_instruction_conflict_v01.scorer import boundary_score
 
5
 
6
- def run_selected(eval_name, prompt, completion):
7
- if eval_name == "Zero Stimulus v01":
 
8
  r = zus_score(prompt, completion)
9
  return {"score_0_100": r.score_0_100, "details": r.details}
10
- if eval_name == "Boundary Integrity v01":
 
11
  r = boundary_score(prompt, completion)
12
  return {"score_0_100": r.score_0_100, "details": r.details}
13
- return {"score_0_100": 0, "details": {"error": "unknown eval"}}
 
 
 
 
 
 
14
 
15
  demo = gr.Interface(
16
  fn=run_selected,
17
  inputs=[
18
  gr.Dropdown(
19
- choices=["Zero Stimulus v01", "Boundary Integrity v01"],
20
- value="Boundary Integrity v01",
21
- label="Benchmark"
 
 
 
 
22
  ),
23
- gr.Textbox(label="Prompt", lines=8),
24
- gr.Textbox(label="Model Output", lines=8),
25
  ],
26
- outputs=gr.JSON(label="Clarus Score"),
27
  title="Clarus Benchmarks",
28
- description="Score model behavior for restraint and boundary integrity.",
29
  )
30
 
31
  demo.launch()
 
2
 
3
  from coherence_under_zero_stimulus_v01.scorer import zus_score
4
  from boundary_integrity_instruction_conflict_v01.scorer import boundary_score
5
+ from temporal_coherence_interruption_v01.scorer import temporal_score
6
 
7
+
8
+ def run_selected(benchmark, prompt, completion):
9
+ if benchmark == "Coherence Under Zero Stimulus v01":
10
  r = zus_score(prompt, completion)
11
  return {"score_0_100": r.score_0_100, "details": r.details}
12
+
13
+ if benchmark == "Boundary Integrity Under Instruction Conflict v01":
14
  r = boundary_score(prompt, completion)
15
  return {"score_0_100": r.score_0_100, "details": r.details}
16
+
17
+ if benchmark == "Temporal Coherence Under Interrupted Context v01":
18
+ r = temporal_score(prompt, completion)
19
+ return {"score_0_100": r.score_0_100, "details": r.details}
20
+
21
+ return {"score_0_100": 0, "details": {"error": "unknown benchmark"}}
22
+
23
 
24
# Build the UI pieces first, then wire them into a single Interface.
# The dropdown choices must match the benchmark names run_selected checks.
benchmark_dropdown = gr.Dropdown(
    choices=[
        "Coherence Under Zero Stimulus v01",
        "Boundary Integrity Under Instruction Conflict v01",
        "Temporal Coherence Under Interrupted Context v01",
    ],
    value="Coherence Under Zero Stimulus v01",
    label="Benchmark",
)
prompt_box = gr.Textbox(label="Prompt", lines=8, placeholder="Paste the prompt used for the eval")
completion_box = gr.Textbox(label="Model Output", lines=8, placeholder="Paste the model response to score")

demo = gr.Interface(
    fn=run_selected,
    inputs=[benchmark_dropdown, prompt_box, completion_box],
    outputs=gr.JSON(label="Score"),
    title="Clarus Benchmarks",
    description="Three public-facing behavioral evals: zero-stimulus restraint, boundary integrity under instruction conflict, and temporal coherence under interrupted context.",
)

demo.launch()