# File size: 21,152 Bytes
# bb1f3f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
import re
from collections import Counter

import gradio as gr

# ─────────────────────────────────────────────
# TAB 1 LOGIC: RLHF Pairwise Rater
# ─────────────────────────────────────────────

# Gap between the two average scores (in rating points) at or above which a
# 'Tie' verdict is reported as a possible inconsistency.
_TIE_TOLERANCE = 1.0

_CONFIDENCE_LABELS = {1: "Low", 2: "Medium", 3: "High"}
_PREFERENCES = ("A is better", "B is better", "Tie")


def _higher_marker(score_a, score_b):
    """Return the 'Higher' column cell for one row of the score table."""
    if score_a > score_b:
        return "A ✓"
    if score_b > score_a:
        return "B ✓"
    return "Tie"


def _directional_message(chosen, avg_a, avg_b, other_wins, n_axes):
    """Build the consistency message for an 'A is better' / 'B is better' verdict.

    ``chosen`` is "A" or "B"; ``other_wins`` lists the ``(axis, a, b)`` tuples
    on which the *non*-chosen response scored strictly higher.
    """
    other = "B" if chosen == "A" else "A"
    avg_chosen, avg_other = (avg_a, avg_b) if chosen == "A" else (avg_b, avg_a)

    if avg_chosen > avg_other:
        return (
            f"✅ **Consistent:** '{chosen} is better' aligns with higher per-axis averages "
            f"({chosen}: {avg_chosen:.2f} vs {other}: {avg_other:.2f})."
        )

    detail = ", ".join(f"{k}: A={va} vs B={vb}" for k, va, vb in other_wins) or "none"

    if avg_chosen == avg_other:
        # Equal averages are not evidence that the other response "scored
        # higher", so they get their own wording instead of an "n/4 axes" claim.
        return (
            f"⚠️ **Possible inconsistency:** You selected '{chosen} is better' overall, but both "
            f"responses have the same average score ({avg_a:.2f}). "
            f"Axes favouring {other}: {detail}. Consider whether 'Tie' is the more accurate verdict."
        )

    return (
        f"⚠️ **Inconsistency detected:** You selected '{chosen} is better' overall, but Response {other} "
        f"scored higher on {len(other_wins)}/{n_axes} axes. "
        f"Axes favouring {other}: {detail}. "
        f"Overall averages — A: {avg_a:.2f}, B: {avg_b:.2f}. Consider reviewing your overall verdict."
    )


def check_consistency(
    prompt, resp_a, resp_b,
    help_a, harm_a, acc_a, inst_a,
    help_b, harm_b, acc_b, inst_b,
    preference, confidence
):
    """Check an overall A/B preference against the per-axis ratings.

    Args:
        prompt: The task text. Accepted so the signature lines up with the UI
            inputs; it does not take part in the calculation.
        resp_a, resp_b: The two response texts. Both must be non-blank.
        help_a, harm_a, acc_a, inst_a: Numeric ratings for Response A on
            Helpfulness, Harmlessness, Accuracy and Instruction-Following.
        help_b, harm_b, acc_b, inst_b: The same four ratings for Response B.
        preference: One of "A is better", "B is better" or "Tie".
        confidence: 1, 2 or 3 (Low / Medium / High). An integral float such as
            ``2.0`` is accepted; anything else omits the confidence line.

    Returns:
        A ``(message, score_table)`` tuple of Markdown strings. When either
        response is blank the message is a warning and the table is ``""``.
    """
    if not (resp_a or "").strip() or not (resp_b or "").strip():
        return "⚠️ Please enter both responses before checking.", ""

    axes = {
        "Helpfulness":           (help_a, help_b),
        "Harmlessness":          (harm_a, harm_b),
        "Accuracy":              (acc_a,  acc_b),
        "Instruction-Following": (inst_a, inst_b),
    }
    n_axes = len(axes)

    avg_a = sum(va for va, _ in axes.values()) / n_axes
    avg_b = sum(vb for _, vb in axes.values()) / n_axes

    # Axes on which each response scored strictly higher than the other.
    a_wins = [(k, va, vb) for k, (va, vb) in axes.items() if va > vb]
    b_wins = [(k, va, vb) for k, (va, vb) in axes.items() if vb > va]

    # Score table: one row per axis plus a final averages row.
    rows = [
        f"| {ax} | {va} | {vb} | {_higher_marker(va, vb)} |"
        for ax, (va, vb) in axes.items()
    ]
    rows.append(
        f"| **Average** | **{avg_a:.2f}** | **{avg_b:.2f}** | {_higher_marker(avg_a, avg_b)} |"
    )
    score_table = (
        "### Score Summary\n\n"
        "| Axis | Response A | Response B | Higher |\n"
        "|------|-----------|-----------|--------|\n"
        + "\n".join(rows)
        + "\n"
    )

    if preference not in _PREFERENCES:
        # Without this guard a missing selection would be judged as 'B is better'.
        return "⚠️ Please select an overall preference.", score_table

    if preference == "Tie":
        gap = abs(avg_a - avg_b)
        if gap >= _TIE_TOLERANCE:
            msg = (
                f"⚠️ **Possible inconsistency:** You selected 'Tie', but the average scores differ by "
                f"{gap:.2f} points (A avg: {avg_a:.2f}, B avg: {avg_b:.2f}). "
                f"A tie is most appropriate when averages are less than "
                f"{_TIE_TOLERANCE:.1f} points apart."
            )
        else:
            msg = (
                f"✅ **Consistent:** A 'Tie' verdict aligns with close average scores "
                f"(A: {avg_a:.2f}, B: {avg_b:.2f})."
            )
    elif preference == "A is better":
        msg = _directional_message("A", avg_a, avg_b, b_wins, n_axes)
    else:  # B is better
        msg = _directional_message("B", avg_a, avg_b, a_wins, n_axes)

    # Sliders may deliver 2 or 2.0; normalise before the label lookup.
    try:
        level = int(confidence)
    except (TypeError, ValueError):
        level = None
    conf_label = _CONFIDENCE_LABELS.get(level)
    if conf_label is not None:
        msg += f"\n\n**Confidence:** {conf_label} ({level}/3)"

    return msg, score_table


# ─────────────────────────────────────────────
# TAB 2 LOGIC: Content Policy Rater
# ─────────────────────────────────────────────

# Verdicts ordered from least to most severe; VERDICT_RANK is the index into VERDICTS.
VERDICTS = ["PASS", "FLAG", "BLOCK"]
VERDICT_RANK = {"PASS": 0, "FLAG": 1, "BLOCK": 2}
VERDICT_EMOJI = {"PASS": "🟢", "FLAG": "🟡", "BLOCK": "🔴"}

CRITERIA = [
    "Factual Accuracy",
    "Safety",
    "Bias",
    "PII Exposure",
    "On-Policy",
    "Clarity",
]

# Per-criterion weights used by the "weighted" aggregation mode.
_CRITERION_WEIGHTS = {
    "Factual Accuracy": 2,
    "Safety": 3,
    "Bias": 2,
    "PII Exposure": 3,
    "On-Policy": 2,
    "Clarity": 1,
}


def _table_cell(text):
    """Return *text* made safe for a single Markdown table cell.

    Whitespace runs (including newlines) collapse to one space and ``|`` is
    escaped so free-form reasoning cannot break the table; blank text becomes
    an em dash placeholder.
    """
    flattened = " ".join((text or "").split())
    return flattened.replace("|", "\\|") or "—"


def get_overall_verdict(*args):
    """Aggregate per-criterion verdicts into one PASS / FLAG / BLOCK verdict.

    Args:
        *args: Positional values in UI order: the content string, then one
            ``(verdict, reasoning)`` pair per entry of ``CRITERIA``, then the
            aggregation mode. ``"worst_wins"`` takes the most severe verdict,
            ``"majority"`` the most frequent one (ties go to the more severe
            verdict), and any other value uses the weighted average of ranks.

    Returns:
        A ``(header, table)`` tuple of Markdown strings. When the content is
        blank or a verdict is not one of ``VERDICTS``, ``header`` is a warning
        and ``table`` is ``""``.

    Raises:
        IndexError: If the content is non-blank but fewer positional values
            than described above are supplied.
    """
    content = args[0] if args else ""
    if not (content or "").strip():
        return "⚠️ Please enter content to evaluate.", ""

    # args layout: content, then pairs of (verdict, reasoning) × len(CRITERIA), then mode
    n = len(CRITERIA)
    mode = args[1 + 2 * n]
    verdicts = list(args[1:1 + 2 * n:2])
    reasonings = list(args[2:2 + 2 * n:2])

    unrated = [crit for crit, v in zip(CRITERIA, verdicts) if v not in VERDICT_RANK]
    if unrated:
        return (
            "⚠️ Please select a verdict (PASS / FLAG / BLOCK) for: "
            + ", ".join(unrated) + ".",
            "",
        )

    ranks = [VERDICT_RANK[v] for v in verdicts]

    if mode == "worst_wins":
        final_rank = max(ranks)
        explanation = "**worst_wins:** The overall verdict equals the most severe individual criterion rating."
    elif mode == "majority":
        counts = Counter(verdicts)
        top = max(counts.values())
        leaders = [v for v, c in counts.items() if c == top]
        # On a tied count prefer the more severe verdict, so the outcome does
        # not depend on the order in which criteria happen to be listed.
        winner = max(leaders, key=VERDICT_RANK.__getitem__)
        final_rank = VERDICT_RANK[winner]
        explanation = f"**majority:** The most frequent verdict ({winner}, {top}/{n} criteria) wins."
        if len(leaders) > 1:
            explanation += " Ties are broken toward the more severe verdict."
    else:  # weighted
        total_w = sum(_CRITERION_WEIGHTS.values())
        weighted_score = sum(
            _CRITERION_WEIGHTS[crit] * rank for crit, rank in zip(CRITERIA, ranks)
        ) / total_w
        # Map the weighted mean rank back onto the three verdict bands.
        final_rank = 0 if weighted_score < 0.5 else (1 if weighted_score < 1.5 else 2)
        explanation = (
            f"**weighted:** Safety and PII carry 3× weight. "
            f"Weighted score: {weighted_score:.2f} → {VERDICTS[final_rank]}."
        )

    final_verdict = VERDICTS[final_rank]
    emoji = VERDICT_EMOJI[final_verdict]

    header = f"## {emoji} Overall Verdict: **{final_verdict}**\n\n{explanation}\n\n"

    rows = "".join(
        f"| {crit} | {VERDICT_EMOJI[v]} {v} | {_table_cell(r)} |\n"
        for crit, v, r in zip(CRITERIA, verdicts, reasonings)
    )
    table = (
        "### Per-Criterion Breakdown\n\n"
        "| Criterion | Verdict | Reasoning |\n"
        "|-----------|---------|----------|\n"
        + rows
    )

    return header, table


# ─────────────────────────────────────────────
# TAB 3 LOGIC: Observation vs Inference
# ─────────────────────────────────────────────

# Phrases that mark judgment, speculation or causal reasoning inside what is
# supposed to be a plain observation. Phrases are matched as whole words, so no
# trailing-space padding is needed to keep short words from matching inside
# longer ones.
INFERENCE_SIGNALS = [
    # (signal phrase, tip shown when the phrase appears in an observation)
    ("seems",          "Describe exactly what you see, not what it suggests."),
    ("appears",        "Describe exactly what you see, not what it suggests."),
    ("looks like",     "Describe the specific visual or measurable property instead."),
    ("looks",          "Describe the specific visual or measurable property instead."),
    ("probably",       "Remove speculation — state only what is directly observable."),
    ("likely",         "Remove speculation — state only what is directly observable."),
    ("unlikely",       "Remove speculation — state only what is directly observable."),
    ("might",          "Remove hedging — state only what is directly observable."),
    ("may",            "Remove hedging — state only what is directly observable."),
    ("should",         "Avoid prescriptive language in an observation."),
    ("is bad",         "Describe the specific measurable problem, not a judgment."),
    ("is good",        "Describe the specific measurable quality, not a judgment."),
    ("is wrong",       "Describe the exact discrepancy observed."),
    ("is broken",      "Describe what specifically does not work as expected."),
    ("is inconsistent","Specify the exact values or positions that differ."),
    ("unclear",        "Describe what specific information is missing or ambiguous."),
    ("confusing",      "Describe the specific element that causes confusion."),
    ("feels",          "Feelings are inferences. Describe the observable trigger instead."),
    ("indicates",      "'Indicates' draws a conclusion. State the raw signal only."),
    ("suggests",       "'Suggests' draws a conclusion. State the raw signal only."),
    ("implies",        "'Implies' draws a conclusion. State the raw signal only."),
    ("because",        "Causal claims belong in the inference, not the observation."),
]


def _whole_phrase_pattern(phrase):
    """Compile a pattern that matches *phrase* only between word boundaries."""
    return re.compile(r"\b" + re.escape(phrase.strip()) + r"\b")


# Longest phrases first, so 'looks like' is consumed before the bare 'looks'
# and one occurrence is never reported under two signals.
_SIGNAL_PATTERNS = sorted(
    ((sig, _whole_phrase_pattern(sig)) for sig, _tip in INFERENCE_SIGNALS),
    key=lambda pair: len(pair[0]),
    reverse=True,
)

# Words an inference is expected to contain; whole-word matching keeps
# 'also' / 'enthusiasm' from counting as 'so' / 'thus'.
_REASONING_PATTERN = re.compile(
    r"\b(?:because|therefore|so|thus|indicates|suggests|means|implies|"
    r"(?:un)?likely|probably|conclude[sd]?)\b"
)

_REFERENCE_EXAMPLES = """
---

### Reference: Good vs Bad Examples

| # | ❌ Contaminated Observation | ✅ Clean Observation |
|---|----------------------------|----------------------|
| 1 | "The button looks inconsistent with the rest of the UI." | "The Save button is 8 px lower than the Cancel button; Save uses Inter 14px, Cancel uses Inter 16px." |
| 2 | "The error message is confusing." | "The error message reads 'Error 403' with no additional context and no retry option." |
| 3 | "The response seems off-topic." | "The response does not mention the word 'Python' despite the prompt asking for a Python code example." |

**Rule of thumb:** If you can't photograph or measure it, it's probably an inference.
"""


def _mask(match):
    """Replace a matched span with NULs so shorter signals cannot re-match it."""
    return "\0" * len(match.group())


def _find_signals(text):
    """Return the ``(signal, tip)`` pairs found in *text*, in INFERENCE_SIGNALS order."""
    remaining = text.lower()
    hits = set()
    for sig, pattern in _SIGNAL_PATTERNS:
        remaining, count = pattern.subn(_mask, remaining)
        if count:
            hits.add(sig)
    return [(sig, tip) for sig, tip in INFERENCE_SIGNALS if sig in hits]


def analyze_obs_inf(observation, inference):
    """Check an observation for inference language and an inference for reasoning.

    Args:
        observation: Text describing what was seen; must be non-blank.
        inference: Optional conclusion drawn from the observation. Blank or
            ``None`` skips the inference feedback.

    Returns:
        An ``(analysis, examples)`` tuple of Markdown strings. ``analysis``
        reports the signals found (the first three are listed, the total is
        counted) followed by the inference feedback; ``examples`` is a fixed
        reference table. A blank observation yields a warning and ``""``.
    """
    if not (observation or "").strip():
        return "⚠️ Please enter an observation.", ""

    found = _find_signals(observation)

    if not found:
        obs_result = (
            "✅ **Clean observation** — specific and factual. "
            "No inference language detected."
        )
    else:
        issues = "\n".join(
            f"- **'{sig.strip()}'** — {tip}" for sig, tip in found[:3]
        )
        obs_result = (
            f"⚠️ **Observation contains inference language** ({len(found)} signal(s) found):\n\n"
            + issues
            + "\n\n**Tip:** An observation should answer 'What did you literally see/measure?' — "
            "no judgments, no causes, no speculation."
        )

    inf_result = ""
    if (inference or "").strip():
        # Inferences *should* contain reasoning words — flag if completely bare
        if _REASONING_PATTERN.search(inference.lower()):
            inf_result = "\n\n✅ **Inference** — contains reasoning language, which is appropriate here."
        else:
            inf_result = (
                "\n\n💡 **Inference tip:** Your inference reads like a bare statement. "
                "Strong inferences explain *why* — try adding 'because', 'therefore', or 'this suggests'."
            )

    return obs_result + inf_result, _REFERENCE_EXAMPLES


# ─────────────────────────────────────────────
# BUILD THE GRADIO APP
# ─────────────────────────────────────────────

# Static HTML banner rendered at the top of the app: title, tagline and
# author links separated by middle dots.
HEADER = """
<div style="text-align:center; padding: 16px 0 8px 0;">
  <h1 style="font-size:2rem; margin-bottom:4px;">🎯 AI Evaluation Toolkit</h1>
  <p style="color:#666; font-size:0.95rem;">
    Interactive demos of AI training data quality-control workflows.<br>
    Built by <a href="https://github.com/LaelaZorana" target="_blank">Laela Zorana</a> ·
    <a href="https://huggingface.co/LaelaZ" target="_blank">HuggingFace</a> ·
    <a href="https://kaggle.com/laelazorana" target="_blank">Kaggle</a>
  </p>
</div>
"""

# Three-tab UI. Each tab collects its inputs, wires one button to the matching
# logic function defined above, and offers pre-filled examples.
with gr.Blocks(title="AI Evaluation Toolkit", theme=gr.themes.Soft()) as demo:
    gr.HTML(HEADER)

    with gr.Tabs():

        # ── TAB 1 ──────────────────────────────────────────────────────────
        with gr.Tab("⚖️ RLHF Pairwise Rater"):
            gr.Markdown(
                "Rate two AI responses on four quality axes, then check whether your overall "
                "preference is consistent with your per-axis scores."
            )
            prompt_box = gr.Textbox(
                label="Prompt / Task",
                placeholder="e.g. Explain gradient descent in simple terms.",
                lines=2,
            )
            with gr.Row():
                resp_a = gr.Textbox(label="Response A", lines=6,
                                    placeholder="Paste Response A here…")
                resp_b = gr.Textbox(label="Response B", lines=6,
                                    placeholder="Paste Response B here…")

            gr.Markdown("#### Axis Ratings (1 = Poor · 5 = Excellent)")
            with gr.Row():
                with gr.Column():
                    gr.Markdown("**Response A**")
                    help_a = gr.Slider(1, 5, value=3, step=1, label="Helpfulness A")
                    harm_a = gr.Slider(1, 5, value=3, step=1, label="Harmlessness A")
                    acc_a  = gr.Slider(1, 5, value=3, step=1, label="Accuracy A")
                    inst_a = gr.Slider(1, 5, value=3, step=1, label="Instruction-Following A")
                with gr.Column():
                    gr.Markdown("**Response B**")
                    help_b = gr.Slider(1, 5, value=3, step=1, label="Helpfulness B")
                    harm_b = gr.Slider(1, 5, value=3, step=1, label="Harmlessness B")
                    acc_b  = gr.Slider(1, 5, value=3, step=1, label="Accuracy B")
                    inst_b = gr.Slider(1, 5, value=3, step=1, label="Instruction-Following B")

            gr.Markdown("#### Overall Judgment")
            with gr.Row():
                preference = gr.Radio(
                    ["A is better", "B is better", "Tie"],
                    label="Overall Preference",
                    value="Tie",
                )
                confidence = gr.Slider(1, 3, value=2, step=1,
                                       label="Confidence (1=Low, 2=Medium, 3=High)")

            check_btn = gr.Button("Check Consistency", variant="primary")

            consistency_out = gr.Markdown(label="Consistency Check")
            table_out       = gr.Markdown(label="Score Summary")

            # Input order must match the positional parameters of check_consistency.
            check_btn.click(
                check_consistency,
                inputs=[prompt_box, resp_a, resp_b,
                        help_a, harm_a, acc_a, inst_a,
                        help_b, harm_b, acc_b, inst_b,
                        preference, confidence],
                outputs=[consistency_out, table_out],
            )

            gr.Examples(
                examples=[
                    [
                        "Explain gradient descent simply.",
                        "Gradient descent is an optimization algorithm that minimizes a loss function by iteratively moving in the direction of steepest descent as defined by the negative of the gradient.",
                        "Imagine you're lost on a foggy mountain and want to reach the valley. Each step you take downhill is gradient descent — you keep moving in whichever direction is steepest until you can't go lower.",
                        4, 5, 4, 4,
                        5, 5, 3, 5,
                        "A is better", 2,
                    ]
                ],
                inputs=[prompt_box, resp_a, resp_b,
                        help_a, harm_a, acc_a, inst_a,
                        help_b, harm_b, acc_b, inst_b,
                        preference, confidence],
                label="Example: spot the inconsistency",
            )

        # ── TAB 2 ──────────────────────────────────────────────────────────
        with gr.Tab("📋 Content Policy Rater"):
            gr.Markdown(
                "Score a piece of content against six policy criteria, then aggregate to an "
                "overall **PASS / FLAG / BLOCK** verdict using one of three aggregation rules."
            )
            content_box = gr.Textbox(
                label="Content to Evaluate",
                lines=5,
                placeholder="Paste the AI-generated content here…",
            )

            gr.Markdown("#### Per-Criterion Ratings")

            # Flat list [verdict, reasoning, verdict, reasoning, ...] in CRITERIA
            # order; get_overall_verdict reads its positional args in this layout.
            criterion_inputs = []
            for crit in CRITERIA:
                with gr.Row():
                    v = gr.Dropdown(
                        choices=VERDICTS, value="PASS",
                        label=f"{crit} — Verdict", scale=1,
                    )
                    r = gr.Textbox(
                        label=f"{crit} — Reasoning",
                        placeholder="Brief justification…",
                        scale=3,
                    )
                    criterion_inputs.extend([v, r])

            agg_mode = gr.Radio(
                ["worst_wins", "majority", "weighted"],
                label="Aggregation Mode",
                value="worst_wins",
                info=(
                    "worst_wins = most severe criterion wins | "
                    "majority = most common verdict | "
                    "weighted = Safety & PII get 3× weight"
                ),
            )

            verdict_btn = gr.Button("Get Overall Verdict", variant="primary")
            verdict_out = gr.Markdown(label="Overall Verdict")
            criterion_table = gr.Markdown(label="Per-Criterion Breakdown")

            verdict_btn.click(
                get_overall_verdict,
                inputs=[content_box] + criterion_inputs + [agg_mode],
                outputs=[verdict_out, criterion_table],
            )

            gr.Examples(
                examples=[[
                    "To make a profit, you should invest in index funds, which historically return ~7% annually after inflation. Past performance doesn't guarantee future results — John Smith at 123 Main St made $200k last year.",
                    "PASS", "Historically accurate.",
                    "FLAG", "No direct harm, but financial advice disclaimer missing.",
                    "PASS", "No biased framing detected.",
                    "BLOCK", "Contains full name and street address of a real-sounding individual.",
                    "FLAG", "Financial advice without credentials disclaimer violates policy.",
                    "PASS", "Clear and readable.",
                    "worst_wins",
                ]],
                inputs=[content_box] + criterion_inputs + [agg_mode],
                label="Example: PII forces a BLOCK",
            )

        # ── TAB 3 ──────────────────────────────────────────────────────────
        with gr.Tab("🔬 Observation vs Inference"):
            gr.Markdown(
                "Practice keeping observations **clean** (factual, specific, no conclusions embedded) "
                "and inferences **grounded** (explicitly tied to what was observed). "
                "This discipline is core to high-quality AI evaluation and bug reporting."
            )
            with gr.Row():
                obs_box = gr.Textbox(
                    label="What did you observe?",
                    lines=4,
                    placeholder="e.g. The modal dialog closes immediately after opening without any user interaction.",
                    scale=1,
                )
                inf_box = gr.Textbox(
                    label="What do you conclude from it? (optional)",
                    lines=4,
                    placeholder="e.g. This suggests the dismiss event fires on mount rather than on user action.",
                    scale=1,
                )

            analyze_btn = gr.Button("Analyze", variant="primary")
            analysis_out = gr.Markdown(label="Analysis")
            examples_out = gr.Markdown(label="Reference Examples")

            analyze_btn.click(
                analyze_obs_inf,
                inputs=[obs_box, inf_box],
                outputs=[analysis_out, examples_out],
            )

            gr.Examples(
                examples=[
                    [
                        "The button looks inconsistent with the rest of the UI.",
                        "It probably wasn't designed by the same person.",
                    ],
                    [
                        "The Save button is 8 px lower than the Cancel button; Save uses Inter 14px Bold, Cancel uses Inter 16px Regular.",
                        "The vertical misalignment and font inconsistency suggest the two buttons were added in separate PRs without a shared spacing token.",
                    ],
                    [
                        "The model's response seems off-topic and confusing.",
                        "",
                    ],
                ],
                inputs=[obs_box, inf_box],
                label="Try these examples",
            )

# NOTE(review): the app is launched whenever this module is executed or
# imported; consider an `if __name__ == "__main__":` guard if it is ever
# imported from other code.
demo.launch()