# clarus-evals / app.py
# Source: Hugging Face Space by ClarusC64, commit e8ae754 (verified)
import gradio as gr
from coherence_under_zero_stimulus_v01.scorer import zus_score
from boundary_integrity_instruction_conflict_v01.scorer import boundary_score
from temporal_coherence_interruption_v01.scorer import temporal_score
def run_selected(benchmark: str, prompt: str, completion: str) -> dict:
    """Score a (prompt, completion) pair with the selected benchmark.

    Args:
        benchmark: Display name of the benchmark chosen in the dropdown.
        prompt: The prompt that was given to the model under evaluation.
        completion: The model's response to score.

    Returns:
        A dict with keys ``score_0_100`` (int/float) and ``details`` (dict).
        If *benchmark* is not recognized, returns a zero score with an
        ``error`` entry in ``details`` instead of raising.
    """
    # Select the scorer in the branch (not via a dict literal) so the scorer
    # names are only resolved when their benchmark is actually requested.
    if benchmark == "Coherence Under Zero Stimulus v01":
        scorer = zus_score
    elif benchmark == "Boundary Integrity Under Instruction Conflict v01":
        scorer = boundary_score
    elif benchmark == "Temporal Coherence Under Interrupted Context v01":
        scorer = temporal_score
    else:
        # Unknown dropdown value: fail soft so the UI still renders a result.
        return {"score_0_100": 0, "details": {"error": "unknown benchmark"}}

    # All scorers share the same result shape; format it once here instead of
    # duplicating the dict in every branch.
    result = scorer(prompt, completion)
    return {"score_0_100": result.score_0_100, "details": result.details}
# Benchmark names shown in the dropdown; must match the strings that
# run_selected dispatches on.
_BENCHMARK_CHOICES = [
    "Coherence Under Zero Stimulus v01",
    "Boundary Integrity Under Instruction Conflict v01",
    "Temporal Coherence Under Interrupted Context v01",
]

_benchmark_dropdown = gr.Dropdown(
    choices=_BENCHMARK_CHOICES,
    value=_BENCHMARK_CHOICES[0],
    label="Benchmark",
)
_prompt_box = gr.Textbox(label="Prompt", lines=8, placeholder="Paste the prompt used for the eval")
_output_box = gr.Textbox(label="Model Output", lines=8, placeholder="Paste the model response to score")

# Single-function UI: pick a benchmark, paste prompt + model output, get JSON.
demo = gr.Interface(
    fn=run_selected,
    inputs=[_benchmark_dropdown, _prompt_box, _output_box],
    outputs=gr.JSON(label="Score"),
    title="Clarus Benchmarks",
    description="Three public-facing behavioral evals: zero-stimulus restraint, boundary integrity under instruction conflict, and temporal coherence under interrupted context.",
)
demo.launch()