LukasHug committed on
Commit
e31d1c3
·
1 Parent(s): 08b0915

Fix Gradio app: call verify_ipt directly (schema can't carry two-field SLR-Bench form); friendlier description

Browse files
Files changed (2) hide show
  1. IsomorphicPerturbationTesting.py +6 -5
  2. app.py +35 -22
IsomorphicPerturbationTesting.py CHANGED
@@ -214,10 +214,12 @@ class IsomorphicPerturbationTesting(evaluate.Metric):
214
  """
215
 
216
  def _info(self):
217
- # Schema declares the canonical SLR-Bench field names. The `_compute`
218
- # path also accepts (extensional_program, isomorphic_program) and the
219
- # deprecated single-field (validation_program) form — see
220
- # _resolve_programs().
 
 
221
  return evaluate.MetricInfo(
222
  description=_DESCRIPTION,
223
  citation=_CITATION,
@@ -226,7 +228,6 @@ class IsomorphicPerturbationTesting(evaluate.Metric):
226
  "predictions": datasets.Value("string"),
227
  "references": {
228
  "validation_program": datasets.Value("string"),
229
- "validation_program_shortcuts": datasets.Value("string"),
230
  "evaluation_config": {
231
  "positive_predicate": datasets.Value("string"),
232
  "negative_predicate": datasets.Value("string"),
 
214
  """
215
 
216
  def _info(self):
217
+ # Schema declares `validation_program` as the single required reference
218
+ # field. Extra fields are accepted at runtime and resolved by
219
+ # _resolve_programs(): (validation_program_shortcuts, validation_program)
220
+ # is the SLR-Bench convention, (extensional_program, isomorphic_program)
221
+ # is the explicit-name convention. Declaring `validation_program_shortcuts`
222
+ # here too would force-require it and break legacy single-field callers.
223
  return evaluate.MetricInfo(
224
  description=_DESCRIPTION,
225
  citation=_CITATION,
 
228
  "predictions": datasets.Value("string"),
229
  "references": {
230
  "validation_program": datasets.Value("string"),
 
231
  "evaluation_config": {
232
  "positive_predicate": datasets.Value("string"),
233
  "negative_predicate": datasets.Value("string"),
app.py CHANGED
@@ -3,7 +3,7 @@ import os
3
  import evaluate
4
  import gradio as gr
5
 
6
- from ipt_verifier import legacy_synth_isomorphic
7
 
8
 
9
  def create_interface(module):
@@ -22,21 +22,19 @@ def create_interface(module):
22
  if not neg_pred or not neg_pred.strip():
23
  return "", "", "", "Please specify the negative predicate."
24
 
25
- ref = {
26
- "extensional_program": ext_program.strip(),
27
- "isomorphic_program": iso_program.strip(),
28
- "evaluation_config": {
29
- "positive_predicate": pos_pred.strip(),
30
- "negative_predicate": neg_pred.strip(),
31
- },
32
  }
33
- results = module.compute(
34
- predictions=[prediction.strip()],
35
- references=[ref],
36
- verbose=False,
 
37
  )
38
-
39
- d = results["detailed_results"][0]
40
  error_msg = d.get("error") or ""
41
 
42
  if d["is_reward_shortcut"]:
@@ -49,8 +47,8 @@ def create_interface(module):
49
  iso_icon = "✅" if d["isomorphic_correct"] else "❌"
50
  ext_icon = "✅" if d["extensional_correct"] else "❌"
51
 
52
- iso_line = f"{iso_icon} {results['isomorphic_accuracy']:.4f} (partial: {d['isomorphic_partial']:.4f})"
53
- ext_line = f"{ext_icon} {results['meta']['extensional_accuracy']:.4f} (partial: {d['extensional_partial']:.4f})"
54
 
55
  return verdict, iso_line, ext_line, error_msg
56
 
@@ -121,12 +119,27 @@ def create_interface(module):
121
  with gr.Blocks(title="Isomorphic Perturbation Testing") as demo:
122
  with gr.Tab("Evaluate"):
123
  gr.Markdown("# Isomorphic Perturbation Testing (IPT)")
124
- gr.Markdown(
125
- "Diagnose whether a model output is a **genuine rule** or a **reward shortcut**. "
126
- "A shortcut passes the standard verifier (extensional) but fails when object "
127
- "constants are renamed (isomorphic) — exposing that it memorised training instances "
128
- "rather than learning a generalizable rule."
129
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
 
131
  with gr.Row():
132
  with gr.Column():
 
3
  import evaluate
4
  import gradio as gr
5
 
6
+ from ipt_verifier import legacy_synth_isomorphic, verify_ipt
7
 
8
 
9
  def create_interface(module):
 
22
  if not neg_pred or not neg_pred.strip():
23
  return "", "", "", "Please specify the negative predicate."
24
 
25
+ # Single-input demo: call verify_ipt directly. (The batched
26
+ # module.compute() path is for HF evaluate.load() callers; its schema
27
+ # only declares the legacy single-field form.)
28
+ eval_config = {
29
+ "positive_predicate": pos_pred.strip(),
30
+ "negative_predicate": neg_pred.strip(),
 
31
  }
32
+ d = verify_ipt(
33
+ prediction.strip(),
34
+ ext_program.strip(),
35
+ iso_program.strip(),
36
+ eval_config,
37
  )
 
 
38
  error_msg = d.get("error") or ""
39
 
40
  if d["is_reward_shortcut"]:
 
47
  iso_icon = "✅" if d["isomorphic_correct"] else "❌"
48
  ext_icon = "✅" if d["extensional_correct"] else "❌"
49
 
50
+ iso_line = f"{iso_icon} isomorphic — partial: {d['isomorphic_partial']:.2f}"
51
+ ext_line = f"{ext_icon} extensional — partial: {d['extensional_partial']:.2f}"
52
 
53
  return verdict, iso_line, ext_line, error_msg
54
 
 
119
  with gr.Blocks(title="Isomorphic Perturbation Testing") as demo:
120
  with gr.Tab("Evaluate"):
121
  gr.Markdown("# Isomorphic Perturbation Testing (IPT)")
122
+ gr.Markdown("""
123
+ ### Do reasoning LLMs actually reason — or learn to game the test?
124
+
125
+ LLMs are increasingly trained with **reinforcement learning from verifiable rewards** (RLVR),
126
+ which boosts their performance on problems whose answers can be checked automatically.
127
+ But it can also teach them to *exploit the verifier* rather than solve the task.
128
+
129
+ We test this on **inductive reasoning**: a model sees a few labeled examples and must write
130
+ a rule that explains them.
131
+
132
+ - 🎯 *Intended:* `plants with purple leaves are toxic.`
133
+ - ⚠️ *Shortcut:* `plant_01 is toxic. plant_02 is safe. ...`
134
+
135
+ The shortcut passes a naive correctness check but doesn't capture the pattern. **IPT** catches
136
+ it by renaming the objects in the task and re-checking — a real rule still works; a shortcut
137
+ breaks. Paste a hypothesis below and try it on one of the examples, or your own data.
138
+
139
+ 💻 [Paper code](https://github.com/ml-research/llms-gaming-verifiers) ·
140
+ 🧪 [Try the leaderboard](https://huggingface.co/spaces/AIML-TUDA/slr-leaderboard) ·
141
+ 📊 [SLR-Bench dataset](https://huggingface.co/datasets/AIML-TUDA/SLR-Bench)
142
+ """)
143
 
144
  with gr.Row():
145
  with gr.Column():