avanigupta Claude Opus 4.6 (1M context) committed on
Commit
5de8f8e
·
1 Parent(s): 5e1f8bb

demo only proposes logically inferrable fixes

Browse files

Fixes are only shown where the correct value can be deduced:
- Name from email pattern (david.kim@company.com → David Kim)
- Type conversion (seventy-five thousand → 75000)
- Computed fields (total = qty * price)
- Format conversion (26/01/2024 → 2024-01-26)
- Nearest valid value (batch_size 250 → 256 power of 2)
- Strip artifacts ([SYSTEM] prompt prefix removed)
- Well-known facts (Salvator Mundi at Christie's for $450.3M)

Ambiguous fixes NOT proposed: any-valid-salary, any-past-date,
any-model-name, any-valid-LR, etc.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Files changed (1) hide show
  1. dataqa_env/server/gradio_ui.py +49 -24
dataqa_env/server/gradio_ui.py CHANGED
@@ -20,6 +20,9 @@ from ..models import DataQAAction
20
  # ── Pre-built agent trajectories (simulates baseline agent) ──
21
 
22
  AGENT_TRAJECTORIES = {
 
 
 
23
  "easy": [
24
  {
25
  "issues": [
@@ -41,11 +44,15 @@ AGENT_TRAJECTORIES = {
41
  "row:18,col:start_date,issue:out_of_range",
42
  ],
43
  "fixes": [
 
44
  "row:4,col:name,fix:David Kim",
 
45
  "row:7,col:salary,fix:75000",
46
- "row:9,col:salary,fix:73000",
47
  "row:15,col:email,fix:oscar.rivera@company.com",
48
- "row:18,col:start_date,fix:2022-01-19",
 
 
49
  ],
50
  },
51
  ],
@@ -74,12 +81,17 @@ AGENT_TRAJECTORIES = {
74
  "row:29,col:order_date,issue:inconsistent_value",
75
  ],
76
  "fixes": [
 
77
  "row:5,col:total,fix:42.00",
 
78
  "row:10,col:category,fix:Sports",
 
79
  "row:12,col:order_date,fix:2024-01-26",
80
- "row:14,col:product_name,fix:LED Strip Lights",
81
- "row:24,col:shipping_country,fix:US",
82
- "row:29,col:order_date,fix:2024-02-12",
 
 
83
  ],
84
  },
85
  ],
@@ -108,11 +120,18 @@ AGENT_TRAJECTORIES = {
108
  "row:12,col:test_accuracy,issue:statistical_outlier",
109
  ],
110
  "fixes": [
111
- "row:14,col:training_time_hours,fix:72.0",
112
- "row:13,col:learning_rate,fix:0.00001",
113
- "row:15,col:model_name,fix:whisper-small",
114
  "row:9,col:batch_size,fix:256",
115
- "row:9,col:training_time_hours,fix:36.0",
 
 
 
 
 
 
 
 
 
116
  ],
117
  },
118
  ],
@@ -120,34 +139,40 @@ AGENT_TRAJECTORIES = {
120
  {
121
  "issues": [
122
  "row:6,col:response,issue:inconsistent_value",
123
- "row:15,col:language,issue:inconsistent_value",
124
- "row:17,col:instruction,issue:missing_value",
125
- "row:19,col:response,issue:inconsistent_value",
126
- "row:21,col:instruction,issue:duplicate_row",
127
- "row:23,col:response,issue:missing_value",
128
  "row:3,col:response,issue:inconsistent_value",
129
  ],
130
  "fixes": [],
131
  },
132
  {
133
  "issues": [
 
134
  "row:4,col:response,issue:inconsistent_value",
135
  "row:6,col:response,issue:inconsistent_value",
 
136
  "row:8,col:response,issue:inconsistent_value",
137
- "row:10,col:response,issue:inconsistent_value",
138
  "row:11,col:response,issue:inconsistent_value",
139
- "row:15,col:language,issue:inconsistent_value",
140
- "row:17,col:instruction,issue:missing_value",
141
- "row:19,col:response,issue:inconsistent_value",
142
- "row:21,col:instruction,issue:duplicate_row",
143
- "row:23,col:response,issue:missing_value",
144
- "row:24,col:response,issue:inconsistent_value",
145
- "row:3,col:response,issue:inconsistent_value",
146
  ],
147
  "fixes": [
148
- "row:6,col:response,fix:The scientific name for the flower commonly called a cherry blossom is Prunus serrulata. It can be found in white pink and red colors.",
149
  "row:4,col:response,fix:The most expensive painting ever sold at auction is Salvator Mundi by Leonardo da Vinci. It was sold for $450.3 million at Christie's in New York City in 2017.",
150
- "row:20,col:response,fix:Five animals that live in grasslands are lions zebras cheetahs gazelles and hyenas. These animals live in grasslands to access the food water and shade that grasslands provide.",
 
 
 
 
 
 
151
  ],
152
  },
153
  ],
 
20
  # ── Pre-built agent trajectories (simulates baseline agent) ──
21
 
22
  AGENT_TRAJECTORIES = {
23
+ # Demo trajectories: fixes are ONLY proposed where the correct value
24
+ # is logically inferrable (computable, format conversion, or deducible from context).
25
+ # Ambiguous fixes (any valid salary, any past date) are NOT proposed.
26
  "easy": [
27
  {
28
  "issues": [
 
44
  "row:18,col:start_date,issue:out_of_range",
45
  ],
46
  "fixes": [
47
+ # Inferrable: name "David Kim" deduced from email david.kim@company.com
48
  "row:4,col:name,fix:David Kim",
49
+ # Inferrable: "seventy-five thousand" is clearly 75000
50
  "row:7,col:salary,fix:75000",
51
+ # Inferrable: email must match name pattern oscar.rivera@company.com
52
  "row:15,col:email,fix:oscar.rivera@company.com",
53
+ # NOT proposed: row:9 salary (any valid salary 50000-150000 works)
54
+ # NOT proposed: row:18 start_date (any past date works)
55
+ # NOT proposed: row:21 duplicate (remove or reassign — ambiguous)
56
  ],
57
  },
58
  ],
 
81
  "row:29,col:order_date,issue:inconsistent_value",
82
  ],
83
  "fixes": [
84
+ # Inferrable: total = qty(1) * price(42.00) = 42.00
85
  "row:5,col:total,fix:42.00",
86
+ # Inferrable: "Fitness" is closest to "Sports" in allowed categories
87
  "row:10,col:category,fix:Sports",
88
+ # Inferrable: 26/01/2024 reformatted to YYYY-MM-DD
89
  "row:12,col:order_date,fix:2024-01-26",
90
+ # NOT proposed: row:14 product_name (any product name works)
91
+ # NOT proposed: row:17 quantity (any positive int)
92
+ # NOT proposed: row:19 duplicate order_id (reassign — ambiguous)
93
+ # NOT proposed: row:24 country (could be any valid ISO code)
94
+ # NOT proposed: row:29 future date (any past date works)
95
  ],
96
  },
97
  ],
 
120
  "row:12,col:test_accuracy,issue:statistical_outlier",
121
  ],
122
  "fixes": [
123
+ # Inferrable: batch_size 250 → nearest power of 2 = 256
 
 
124
  "row:9,col:batch_size,fix:256",
125
+ # Inferrable: negative time -72.0 → absolute value 72.0
126
+ "row:14,col:training_time_hours,fix:72.0",
127
+ # NOT proposed: row:13 LR (any valid LR 1e-7 to 1.0)
128
+ # NOT proposed: row:15 model_name (could be any model)
129
+ # NOT proposed: row:5 val_loss (any val >= train_loss)
130
+ # NOT proposed: row:7 GPU memory (any reasonable value)
131
+ # NOT proposed: row:10 train_size (any value > test_size)
132
+ # NOT proposed: row:11 timestamp (any date after prev)
133
+ # NOT proposed: row:9 training_time (any reasonable hours)
134
+ # NOT proposed: row:12 test_accuracy (any < SOTA)
135
  ],
136
  },
137
  ],
 
139
  {
140
  "issues": [
141
  "row:6,col:response,issue:inconsistent_value",
142
+ "row:15,col:response,issue:inconsistent_value",
143
+ "row:28,col:prompt,issue:missing_value",
144
+ "row:20,col:response,issue:inconsistent_value",
145
+ "row:7,col:prompt,issue:duplicate_row",
146
+ "row:25,col:response,issue:missing_value",
147
  "row:3,col:response,issue:inconsistent_value",
148
  ],
149
  "fixes": [],
150
  },
151
  {
152
  "issues": [
153
+ "row:3,col:response,issue:inconsistent_value",
154
  "row:4,col:response,issue:inconsistent_value",
155
  "row:6,col:response,issue:inconsistent_value",
156
+ "row:7,col:prompt,issue:duplicate_row",
157
  "row:8,col:response,issue:inconsistent_value",
 
158
  "row:11,col:response,issue:inconsistent_value",
159
+ "row:15,col:response,issue:inconsistent_value",
160
+ "row:17,col:helpfulness,issue:inconsistent_value",
161
+ "row:20,col:response,issue:inconsistent_value",
162
+ "row:25,col:response,issue:missing_value",
163
+ "row:28,col:prompt,issue:missing_value",
164
+ "row:29,col:response,issue:inconsistent_value",
 
165
  ],
166
  "fixes": [
167
+ # Inferrable: Salvator Mundi facts are well-known ($450.3M at Christie's)
168
  "row:4,col:response,fix:The most expensive painting ever sold at auction is Salvator Mundi by Leonardo da Vinci. It was sold for $450.3 million at Christie's in New York City in 2017.",
169
+ # Inferrable: strip leaked [SYSTEM] prompt prefix
170
+ "row:3,col:response,fix:Kitsch is art or design that is overly sentimental or ornate while camp is a style that is over-the-top and exaggerated often used in satire or irony.",
171
+ # NOT proposed: row:6 wrong scientific name (need taxonomy knowledge)
172
+ # NOT proposed: row:8 harmful advice (need to write safe version)
173
+ # NOT proposed: row:11 self-contradiction (need to rewrite coherently)
174
+ # NOT proposed: row:15 French response (need English translation)
175
+ # NOT proposed: row:29 hallucinated citation (need factual replacement)
176
  ],
177
  },
178
  ],