Fix Gradio app: call verify_ipt directly (schema can't carry two-field SLR-Bench form); friendlier description
Browse files- IsomorphicPerturbationTesting.py +6 -5
- app.py +35 -22
IsomorphicPerturbationTesting.py
CHANGED
|
@@ -214,10 +214,12 @@ class IsomorphicPerturbationTesting(evaluate.Metric):
|
|
| 214 |
"""
|
| 215 |
|
| 216 |
def _info(self):
|
| 217 |
-
# Schema declares
|
| 218 |
-
#
|
| 219 |
-
#
|
| 220 |
-
#
|
|
|
|
|
|
|
| 221 |
return evaluate.MetricInfo(
|
| 222 |
description=_DESCRIPTION,
|
| 223 |
citation=_CITATION,
|
|
@@ -226,7 +228,6 @@ class IsomorphicPerturbationTesting(evaluate.Metric):
|
|
| 226 |
"predictions": datasets.Value("string"),
|
| 227 |
"references": {
|
| 228 |
"validation_program": datasets.Value("string"),
|
| 229 |
-
"validation_program_shortcuts": datasets.Value("string"),
|
| 230 |
"evaluation_config": {
|
| 231 |
"positive_predicate": datasets.Value("string"),
|
| 232 |
"negative_predicate": datasets.Value("string"),
|
|
|
|
| 214 |
"""
|
| 215 |
|
| 216 |
def _info(self):
|
| 217 |
+
# Schema declares `validation_program` as the single required reference
|
| 218 |
+
# field. Extra fields are accepted at runtime and resolved by
|
| 219 |
+
# _resolve_programs(): (validation_program_shortcuts, validation_program)
|
| 220 |
+
# is the SLR-Bench convention, (extensional_program, isomorphic_program)
|
| 221 |
+
# is the explicit-name convention. Declaring `validation_program_shortcuts`
|
| 222 |
+
# here too would force-require it and break legacy single-field callers.
|
| 223 |
return evaluate.MetricInfo(
|
| 224 |
description=_DESCRIPTION,
|
| 225 |
citation=_CITATION,
|
|
|
|
| 228 |
"predictions": datasets.Value("string"),
|
| 229 |
"references": {
|
| 230 |
"validation_program": datasets.Value("string"),
|
|
|
|
| 231 |
"evaluation_config": {
|
| 232 |
"positive_predicate": datasets.Value("string"),
|
| 233 |
"negative_predicate": datasets.Value("string"),
|
app.py
CHANGED
|
@@ -3,7 +3,7 @@ import os
|
|
| 3 |
import evaluate
|
| 4 |
import gradio as gr
|
| 5 |
|
| 6 |
-
from ipt_verifier import legacy_synth_isomorphic
|
| 7 |
|
| 8 |
|
| 9 |
def create_interface(module):
|
|
@@ -22,21 +22,19 @@ def create_interface(module):
|
|
| 22 |
if not neg_pred or not neg_pred.strip():
|
| 23 |
return "", "", "", "Please specify the negative predicate."
|
| 24 |
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
},
|
| 32 |
}
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
|
|
|
| 37 |
)
|
| 38 |
-
|
| 39 |
-
d = results["detailed_results"][0]
|
| 40 |
error_msg = d.get("error") or ""
|
| 41 |
|
| 42 |
if d["is_reward_shortcut"]:
|
|
@@ -49,8 +47,8 @@ def create_interface(module):
|
|
| 49 |
iso_icon = "✅" if d["isomorphic_correct"] else "❌"
|
| 50 |
ext_icon = "✅" if d["extensional_correct"] else "❌"
|
| 51 |
|
| 52 |
-
iso_line = f"{iso_icon}
|
| 53 |
-
ext_line = f"{ext_icon}
|
| 54 |
|
| 55 |
return verdict, iso_line, ext_line, error_msg
|
| 56 |
|
|
@@ -121,12 +119,27 @@ def create_interface(module):
|
|
| 121 |
with gr.Blocks(title="Isomorphic Perturbation Testing") as demo:
|
| 122 |
with gr.Tab("Evaluate"):
|
| 123 |
gr.Markdown("# Isomorphic Perturbation Testing (IPT)")
|
| 124 |
-
gr.Markdown(
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
|
| 131 |
with gr.Row():
|
| 132 |
with gr.Column():
|
|
|
|
| 3 |
import evaluate
|
| 4 |
import gradio as gr
|
| 5 |
|
| 6 |
+
from ipt_verifier import legacy_synth_isomorphic, verify_ipt
|
| 7 |
|
| 8 |
|
| 9 |
def create_interface(module):
|
|
|
|
| 22 |
if not neg_pred or not neg_pred.strip():
|
| 23 |
return "", "", "", "Please specify the negative predicate."
|
| 24 |
|
| 25 |
+
# Single-input demo: call verify_ipt directly. (The batched
|
| 26 |
+
# module.compute() path is for HF evaluate.load() callers; its schema
|
| 27 |
+
# only declares the legacy single-field form.)
|
| 28 |
+
eval_config = {
|
| 29 |
+
"positive_predicate": pos_pred.strip(),
|
| 30 |
+
"negative_predicate": neg_pred.strip(),
|
|
|
|
| 31 |
}
|
| 32 |
+
d = verify_ipt(
|
| 33 |
+
prediction.strip(),
|
| 34 |
+
ext_program.strip(),
|
| 35 |
+
iso_program.strip(),
|
| 36 |
+
eval_config,
|
| 37 |
)
|
|
|
|
|
|
|
| 38 |
error_msg = d.get("error") or ""
|
| 39 |
|
| 40 |
if d["is_reward_shortcut"]:
|
|
|
|
| 47 |
iso_icon = "✅" if d["isomorphic_correct"] else "❌"
|
| 48 |
ext_icon = "✅" if d["extensional_correct"] else "❌"
|
| 49 |
|
| 50 |
+
iso_line = f"{iso_icon} isomorphic — partial: {d['isomorphic_partial']:.2f}"
|
| 51 |
+
ext_line = f"{ext_icon} extensional — partial: {d['extensional_partial']:.2f}"
|
| 52 |
|
| 53 |
return verdict, iso_line, ext_line, error_msg
|
| 54 |
|
|
|
|
| 119 |
with gr.Blocks(title="Isomorphic Perturbation Testing") as demo:
|
| 120 |
with gr.Tab("Evaluate"):
|
| 121 |
gr.Markdown("# Isomorphic Perturbation Testing (IPT)")
|
| 122 |
+
gr.Markdown("""
|
| 123 |
+
### Do reasoning LLMs actually reason — or learn to game the test?
|
| 124 |
+
|
| 125 |
+
LLMs are increasingly trained with **reinforcement learning from verifiable rewards** (RLVR),
|
| 126 |
+
which boosts their performance on problems whose answers can be checked automatically.
|
| 127 |
+
But it can also teach them to *exploit the verifier* rather than solve the task.
|
| 128 |
+
|
| 129 |
+
We test this on **inductive reasoning**: a model sees a few labeled examples and must write
|
| 130 |
+
a rule that explains them.
|
| 131 |
+
|
| 132 |
+
- 🎯 *Intended:* `plants with purple leaves are toxic.`
|
| 133 |
+
- ⚠️ *Shortcut:* `plant_01 is toxic. plant_02 is safe. ...`
|
| 134 |
+
|
| 135 |
+
The shortcut passes a naive correctness check but doesn't capture the pattern. **IPT** catches
|
| 136 |
+
it by renaming the objects in the task and re-checking — a real rule still works; a shortcut
|
| 137 |
+
breaks. Paste a hypothesis below and try it on one of the examples, or your own data.
|
| 138 |
+
|
| 139 |
+
💻 [Paper code](https://github.com/ml-research/llms-gaming-verifiers) ·
|
| 140 |
+
🧪 [Try the leaderboard](https://huggingface.co/spaces/AIML-TUDA/slr-leaderboard) ·
|
| 141 |
+
📊 [SLR-Bench dataset](https://huggingface.co/datasets/AIML-TUDA/SLR-Bench)
|
| 142 |
+
""")
|
| 143 |
|
| 144 |
with gr.Row():
|
| 145 |
with gr.Column():
|