Spaces:

AIML-TUDA
/

IsomorphicPerturbationTesting

Running

App Files Files Community

LukasHug committed 25 days ago

Commit

e31d1c3

1 Parent(s): 08b0915

Fix Gradio app: call verify_ipt directly (schema can't carry two-field SLR-Bench form); friendlier description

Browse files

Files changed (2) hide show

IsomorphicPerturbationTesting.py +6 -5
app.py +35 -22

IsomorphicPerturbationTesting.py CHANGED Viewed

@@ -214,10 +214,12 @@ class IsomorphicPerturbationTesting(evaluate.Metric):
     """
     def _info(self):
-        # Schema declares the canonical SLR-Bench field names. The `_compute`
-        # path also accepts (extensional_program, isomorphic_program) and the
-        # deprecated single-field (validation_program) form — see
-        # _resolve_programs().
         return evaluate.MetricInfo(
             description=_DESCRIPTION,
             citation=_CITATION,
@@ -226,7 +228,6 @@ class IsomorphicPerturbationTesting(evaluate.Metric):
                 "predictions": datasets.Value("string"),
                 "references": {
                     "validation_program": datasets.Value("string"),
-                    "validation_program_shortcuts": datasets.Value("string"),
                     "evaluation_config": {
                         "positive_predicate": datasets.Value("string"),
                         "negative_predicate": datasets.Value("string"),

     """
     def _info(self):
+        # Schema declares `validation_program` as the single required reference
+        # field. Extra fields are accepted at runtime and resolved by
+        # _resolve_programs(): (validation_program_shortcuts, validation_program)
+        # is the SLR-Bench convention, (extensional_program, isomorphic_program)
+        # is the explicit-name convention. Declaring `validation_program_shortcuts`
+        # here too would force-require it and break legacy single-field callers.
         return evaluate.MetricInfo(
             description=_DESCRIPTION,
             citation=_CITATION,
                 "predictions": datasets.Value("string"),
                 "references": {
                     "validation_program": datasets.Value("string"),
                     "evaluation_config": {
                         "positive_predicate": datasets.Value("string"),
                         "negative_predicate": datasets.Value("string"),

app.py CHANGED Viewed

@@ -3,7 +3,7 @@ import os
 import evaluate
 import gradio as gr
-from ipt_verifier import legacy_synth_isomorphic
 def create_interface(module):
@@ -22,21 +22,19 @@ def create_interface(module):
         if not neg_pred or not neg_pred.strip():
             return "", "", "", "Please specify the negative predicate."
-        ref = {
-            "extensional_program": ext_program.strip(),
-            "isomorphic_program":  iso_program.strip(),
-            "evaluation_config": {
-                "positive_predicate": pos_pred.strip(),
-                "negative_predicate": neg_pred.strip(),
-            },
         }
-        results = module.compute(
-            predictions=[prediction.strip()],
-            references=[ref],
-            verbose=False,
         )
-        d = results["detailed_results"][0]
         error_msg = d.get("error") or ""
         if d["is_reward_shortcut"]:
@@ -49,8 +47,8 @@ def create_interface(module):
         iso_icon = "✅" if d["isomorphic_correct"] else "❌"
         ext_icon = "✅" if d["extensional_correct"] else "❌"
-        iso_line = f"{iso_icon}  {results['isomorphic_accuracy']:.4f}  (partial: {d['isomorphic_partial']:.4f})"
-        ext_line = f"{ext_icon}  {results['meta']['extensional_accuracy']:.4f}  (partial: {d['extensional_partial']:.4f})"
         return verdict, iso_line, ext_line, error_msg
@@ -121,12 +119,27 @@ def create_interface(module):
     with gr.Blocks(title="Isomorphic Perturbation Testing") as demo:
         with gr.Tab("Evaluate"):
             gr.Markdown("# Isomorphic Perturbation Testing (IPT)")
-            gr.Markdown(
-                "Diagnose whether a model output is a **genuine rule** or a **reward shortcut**. "
-                "A shortcut passes the standard verifier (extensional) but fails when object "
-                "constants are renamed (isomorphic) — exposing that it memorised training instances "
-                "rather than learning a generalizable rule."
-            )
             with gr.Row():
                 with gr.Column():

 import evaluate
 import gradio as gr
+from ipt_verifier import legacy_synth_isomorphic, verify_ipt
 def create_interface(module):
         if not neg_pred or not neg_pred.strip():
             return "", "", "", "Please specify the negative predicate."
+        # Single-input demo: call verify_ipt directly. (The batched
+        # module.compute() path is for HF evaluate.load() callers; its schema
+        # only declares the legacy single-field form.)
+        eval_config = {
+            "positive_predicate": pos_pred.strip(),
+            "negative_predicate": neg_pred.strip(),
         }
+        d = verify_ipt(
+            prediction.strip(),
+            ext_program.strip(),
+            iso_program.strip(),
+            eval_config,
         )
         error_msg = d.get("error") or ""
         if d["is_reward_shortcut"]:
         iso_icon = "✅" if d["isomorphic_correct"] else "❌"
         ext_icon = "✅" if d["extensional_correct"] else "❌"
+        iso_line = f"{iso_icon}  isomorphic — partial: {d['isomorphic_partial']:.2f}"
+        ext_line = f"{ext_icon}  extensional — partial: {d['extensional_partial']:.2f}"
         return verdict, iso_line, ext_line, error_msg
     with gr.Blocks(title="Isomorphic Perturbation Testing") as demo:
         with gr.Tab("Evaluate"):
             gr.Markdown("# Isomorphic Perturbation Testing (IPT)")
+            gr.Markdown("""
+### Do reasoning LLMs actually reason — or learn to game the test?
+LLMs are increasingly trained with **reinforcement learning from verifiable rewards** (RLVR),
+which boosts their performance on problems whose answers can be checked automatically.
+But it can also teach them to *exploit the verifier* rather than solve the task.
+We test this on **inductive reasoning**: a model sees a few labeled examples and must write
+a rule that explains them.
+- 🎯 *Intended:* `plants with purple leaves are toxic.`
+- ⚠️ *Shortcut:* `plant_01 is toxic. plant_02 is safe. ...`
+The shortcut passes a naive correctness check but doesn't capture the pattern. **IPT** catches
+it by renaming the objects in the task and re-checking — a real rule still works; a shortcut
+breaks. Paste a hypothesis below and try it on one of the examples, or your own data.
+💻 [Paper code](https://github.com/ml-research/llms-gaming-verifiers) ·
+🧪 [Try the leaderboard](https://huggingface.co/spaces/AIML-TUDA/slr-leaderboard) ·
+📊 [SLR-Bench dataset](https://huggingface.co/datasets/AIML-TUDA/SLR-Bench)
+""")
             with gr.Row():
                 with gr.Column():