Update app.py: Add temporal coherence benchmark to app
app.py CHANGED

@@ -2,30 +2,43 @@ import gradio as gr
 
 from coherence_under_zero_stimulus_v01.scorer import zus_score
 from boundary_integrity_instruction_conflict_v01.scorer import boundary_score
+from temporal_coherence_interruption_v01.scorer import temporal_score
 
-
-
+
+def run_selected(benchmark, prompt, completion):
+    if benchmark == "Coherence Under Zero Stimulus v01":
         r = zus_score(prompt, completion)
         return {"score_0_100": r.score_0_100, "details": r.details}
-
+
+    if benchmark == "Boundary Integrity Under Instruction Conflict v01":
         r = boundary_score(prompt, completion)
         return {"score_0_100": r.score_0_100, "details": r.details}
-
+
+    if benchmark == "Temporal Coherence Under Interrupted Context v01":
+        r = temporal_score(prompt, completion)
+        return {"score_0_100": r.score_0_100, "details": r.details}
+
+    return {"score_0_100": 0, "details": {"error": "unknown benchmark"}}
+
 
 demo = gr.Interface(
     fn=run_selected,
     inputs=[
         gr.Dropdown(
-            choices=[
-
-
+            choices=[
+                "Coherence Under Zero Stimulus v01",
+                "Boundary Integrity Under Instruction Conflict v01",
+                "Temporal Coherence Under Interrupted Context v01",
+            ],
+            value="Coherence Under Zero Stimulus v01",
+            label="Benchmark",
         ),
-        gr.Textbox(label="Prompt", lines=8),
-        gr.Textbox(label="Model Output", lines=8),
+        gr.Textbox(label="Prompt", lines=8, placeholder="Paste the prompt used for the eval"),
+        gr.Textbox(label="Model Output", lines=8, placeholder="Paste the model response to score"),
     ],
-    outputs=gr.JSON(label="
+    outputs=gr.JSON(label="Score"),
     title="Clarus Benchmarks",
-    description="
+    description="Three public-facing behavioral evals: zero-stimulus restraint, boundary integrity under instruction conflict, and temporal coherence under interrupted context.",
 )
 
 demo.launch()
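For a quick smoke test outside the Gradio UI, the new scorer can be called directly, mirroring the dispatch in app.py. This is a minimal sketch: the sample prompt and completion strings are hypothetical, and the only assumptions made about temporal_score are the ones visible in the diff, namely that it takes (prompt, completion) and returns an object exposing score_0_100 and details.

# Smoke-test the new benchmark the same way app.py calls it.
# Assumption (taken from the diff only): temporal_score(prompt, completion)
# returns an object with .score_0_100 and .details.
from temporal_coherence_interruption_v01.scorer import temporal_score

# Hypothetical inputs; substitute a real eval prompt and model output.
prompt = "Summarize the notes. [interruption] Now continue where you left off."
completion = "Resuming the summary: the remaining action items are..."

r = temporal_score(prompt, completion)
print({"score_0_100": r.score_0_100, "details": r.details})

Running python app.py then serves all three benchmarks through the same gr.Interface: the dropdown's value argument preselects the zero-stimulus eval, and any unrecognized selection falls through to the {"error": "unknown benchmark"} payload.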