# Captured from a Spaces file-viewer page (revision bb1f3f9, file size 21,152 bytes).
# The page reported the Space status as "Runtime error" at capture time.
import gradio as gr
# ─────────────────────────────────────────────
# TAB 1 LOGIC: RLHF Pairwise Rater
# ─────────────────────────────────────────────
def _higher_label(score_a, score_b):
    """Return ``"A ✓"``, ``"B ✓"`` or ``"Tie"`` depending on which score is larger."""
    if score_a > score_b:
        return "A ✓"
    if score_b > score_a:
        return "B ✓"
    return "Tie"


def check_consistency(
    prompt, resp_a, resp_b,
    help_a, harm_a, acc_a, inst_a,
    help_b, harm_b, acc_b, inst_b,
    preference, confidence
):
    """Check an overall A/B preference against the per-axis ratings.

    Args:
        prompt: The task text. Accepted for the UI wiring; not used here.
        resp_a, resp_b: The two responses. Both must be non-blank.
        help_*, harm_*, acc_*, inst_*: Numeric ratings of each response on
            Helpfulness, Harmlessness, Accuracy and Instruction-Following.
        preference: ``"Tie"``, ``"A is better"``; any other value is treated
            as "B is better".
        confidence: 1, 2 or 3 (Low / Medium / High).

    Returns:
        A ``(message, score_table)`` pair of Markdown strings. When either
        response is blank the message is a warning and the table is ``""``.

    Raises:
        KeyError: If ``confidence`` is not 1, 2 or 3.
    """
    # Treat a missing (None) response the same as a blank one.
    if not (resp_a or "").strip() or not (resp_b or "").strip():
        return "⚠️ Please enter both responses before checking.", ""

    axes = {
        "Helpfulness": (help_a, help_b),
        "Harmlessness": (harm_a, harm_b),
        "Accuracy": (acc_a, acc_b),
        "Instruction-Following": (inst_a, inst_b),
    }
    n_axes = len(axes)
    avg_a = sum(a for a, _ in axes.values()) / n_axes
    avg_b = sum(b for _, b in axes.values()) / n_axes

    # Which axes favour each response?
    a_wins = [(name, a, b) for name, (a, b) in axes.items() if a > b]
    b_wins = [(name, a, b) for name, (a, b) in axes.items() if b > a]

    conf_label = {1: "Low", 2: "Medium", 3: "High"}[confidence]

    # Build the score table: one row per axis plus an average row.
    rows = [
        f"| {name} | {a} | {b} | {_higher_label(a, b)} |"
        for name, (a, b) in axes.items()
    ]
    rows.append(
        f"| **Average** | **{avg_a:.2f}** | **{avg_b:.2f}** | "
        f"{_higher_label(avg_a, avg_b)} |"
    )
    score_table = (
        "### Score Summary\n\n"
        "| Axis | Response A | Response B | Higher |\n"
        "|------|-----------|-----------|--------|\n"
        + "\n".join(rows)
        + "\n"
    )

    # Consistency check
    if preference == "Tie":
        gap = abs(avg_a - avg_b)
        # A 'Tie' is only flagged once the averages are a full point apart.
        if gap >= 1.0:
            msg = (
                f"⚠️ **Possible inconsistency:** You selected 'Tie', but the average scores differ by "
                f"{gap:.2f} points (A avg: {avg_a:.2f}, B avg: {avg_b:.2f}). "
                f"A tie is most appropriate when averages are within ~0.5 of each other."
            )
        else:
            msg = (
                f"✅ **Consistent:** A 'Tie' verdict aligns with close average scores "
                f"(A: {avg_a:.2f}, B: {avg_b:.2f})."
            )
    else:
        # Both directional verdicts share one template; only the roles swap.
        if preference == "A is better":
            chosen, other = "A", "B"
            chosen_avg, other_avg, other_wins = avg_a, avg_b, b_wins
        else:  # B is better
            chosen, other = "B", "A"
            chosen_avg, other_avg, other_wins = avg_b, avg_a, a_wins

        if other_avg >= chosen_avg:
            detail = ", ".join(
                f"{name}: A={a} vs B={b}" for name, a, b in other_wins
            ) or "none"
            msg = (
                f"⚠️ **Inconsistency detected:** You selected '{chosen} is better' overall, but Response {other} "
                f"scored higher on {len(other_wins)}/{n_axes} axes. "
                f"Axes favouring {other}: {detail}. "
                f"Overall averages — A: {avg_a:.2f}, B: {avg_b:.2f}. Consider reviewing your overall verdict."
            )
        else:
            msg = (
                f"✅ **Consistent:** '{chosen} is better' aligns with higher per-axis averages "
                f"({chosen}: {chosen_avg:.2f} vs {other}: {other_avg:.2f})."
            )

    msg += f"\n\n**Confidence:** {conf_label} ({confidence}/3)"
    return msg, score_table
# ─────────────────────────────────────────────
# TAB 2 LOGIC: Content Policy Rater
# ─────────────────────────────────────────────
# Verdicts in ascending severity; a verdict's index is its rank.
VERDICTS = ["PASS", "FLAG", "BLOCK"]
VERDICT_RANK = {"PASS": 0, "FLAG": 1, "BLOCK": 2}
VERDICT_EMOJI = {"PASS": "🟢", "FLAG": "🟡", "BLOCK": "🔴"}

# Policy criteria, in the order their (verdict, reasoning) inputs are passed.
CRITERIA = [
    "Factual Accuracy",
    "Safety",
    "Bias",
    "PII Exposure",
    "On-Policy",
    "Clarity",
]

# Per-criterion weights used by the "weighted" aggregation mode.
CRITERION_WEIGHTS = {
    "Factual Accuracy": 2,
    "Safety": 3,
    "Bias": 2,
    "PII Exposure": 3,
    "On-Policy": 2,
    "Clarity": 1,
}


def _table_cell(text):
    """Return *text* made safe for a single Markdown table cell ("—" if blank)."""
    # Collapse newlines/runs of whitespace and escape pipes so user text
    # cannot split the row into extra cells or extra rows.
    cell = " ".join((text or "").split())
    return cell.replace("|", "\\|") if cell else "—"


def get_overall_verdict(*args):
    """Aggregate per-criterion verdicts into one PASS / FLAG / BLOCK verdict.

    Args:
        *args: Positional layout is ``content``, then one
            ``(verdict, reasoning)`` pair per entry of ``CRITERIA``, then the
            aggregation ``mode``. ``mode`` is ``"worst_wins"`` or
            ``"majority"``; any other value selects the weighted mode.

    Returns:
        A ``(header, table)`` pair of Markdown strings. When the content is
        blank, or a criterion has no recognised verdict, the header is a
        warning and the table is ``""``.
    """
    from collections import Counter

    n_criteria = len(CRITERIA)
    content = args[0]
    if not (content or "").strip():
        return "⚠️ Please enter content to evaluate.", ""

    # args layout: content, then pairs of (verdict, reasoning) × n, then mode
    verdicts = [args[1 + 2 * i] for i in range(n_criteria)]
    reasonings = [args[2 + 2 * i] for i in range(n_criteria)]
    mode = args[1 + 2 * n_criteria]

    # A missing or unknown verdict would otherwise surface as a KeyError.
    unrated = [crit for crit, v in zip(CRITERIA, verdicts) if v not in VERDICT_RANK]
    if unrated:
        return f"⚠️ Please select a verdict for: {', '.join(unrated)}.", ""

    ranks = [VERDICT_RANK[v] for v in verdicts]

    if mode == "worst_wins":
        final_rank = max(ranks)
        explanation = (
            "**worst_wins:** The overall verdict equals the most severe "
            "individual criterion rating."
        )
    elif mode == "majority":
        counts = Counter(verdicts)
        top_count = max(counts.values())
        tied = [v for v, c in counts.items() if c == top_count]
        # On a tie, pick the most severe verdict so the outcome does not
        # depend on the order in which criteria happen to be listed.
        winner = max(tied, key=VERDICT_RANK.__getitem__)
        final_rank = VERDICT_RANK[winner]
        explanation = (
            f"**majority:** The most frequent verdict ({winner}, "
            f"{top_count}/{n_criteria} criteria) wins."
        )
        if len(tied) > 1:
            explanation += " Ties resolve to the more severe verdict."
    else:  # weighted
        total_weight = sum(CRITERION_WEIGHTS.values())
        weighted_score = sum(
            CRITERION_WEIGHTS[crit] * rank for crit, rank in zip(CRITERIA, ranks)
        ) / total_weight
        # Weighted mean rank: < 0.5 → PASS, < 1.5 → FLAG, otherwise BLOCK.
        final_rank = 0 if weighted_score < 0.5 else (1 if weighted_score < 1.5 else 2)
        explanation = (
            f"**weighted:** Safety and PII carry 3× weight. "
            f"Weighted score: {weighted_score:.2f} → {VERDICTS[final_rank]}."
        )

    final_verdict = VERDICTS[final_rank]
    emoji = VERDICT_EMOJI[final_verdict]
    header = f"## {emoji} Overall Verdict: **{final_verdict}**\n\n{explanation}\n\n"

    rows = [
        f"| {crit} | {VERDICT_EMOJI[v]} {v} | {_table_cell(reason)} |"
        for crit, v, reason in zip(CRITERIA, verdicts, reasonings)
    ]
    table = (
        "### Per-Criterion Breakdown\n\n"
        "| Criterion | Verdict | Reasoning |\n"
        "|-----------|---------|----------|\n"
        + "\n".join(rows)
        + "\n"
    )
    return header, table
# ─────────────────────────────────────────────
# TAB 3 LOGIC: Observation vs Inference
# ─────────────────────────────────────────────
# Phrases that mark inference/judgment language inside an observation.
# Each entry is (signal phrase, coaching tip). Phrases are matched as whole
# words; list a longer phrase before any shorter phrase it contains so the
# same words are only reported once ("looks like" before "looks").
INFERENCE_SIGNALS = [
    ("seems", "Describe exactly what you see, not what it suggests."),
    ("appears", "Describe exactly what you see, not what it suggests."),
    ("looks like", "Describe the specific visual or measurable property instead."),
    ("looks", "Describe the specific visual or measurable property instead."),
    ("probably", "Remove speculation — state only what is directly observable."),
    ("unlikely", "Remove speculation — state only what is directly observable."),
    ("likely", "Remove speculation — state only what is directly observable."),
    ("might", "Remove hedging — state only what is directly observable."),
    ("may", "Remove hedging — state only what is directly observable."),
    ("should", "Avoid prescriptive language in an observation."),
    ("is bad", "Describe the specific measurable problem, not a judgment."),
    ("is good", "Describe the specific measurable quality, not a judgment."),
    ("is wrong", "Describe the exact discrepancy observed."),
    ("is broken", "Describe what specifically does not work as expected."),
    ("is inconsistent", "Specify the exact values or positions that differ."),
    ("unclear", "Describe what specific information is missing or ambiguous."),
    ("confusing", "Describe the specific element that causes confusion."),
    ("feels", "Feelings are inferences. Describe the observable trigger instead."),
    ("indicates", "'Indicates' draws a conclusion. State the raw signal only."),
    ("suggests", "'Suggests' draws a conclusion. State the raw signal only."),
    ("implies", "'Implies' draws a conclusion. State the raw signal only."),
    ("because", "Causal claims belong in the inference, not the observation."),
]

# Whole-word reasoning markers expected in an inference, including common
# inflections ("suggest", "suggested", "concluded", ...).
_REASONING_PATTERN = (
    r"\b(?:because|therefore|thus|so|(?:un)?likely|probably|"
    r"indicat\w*|suggest\w*|impl(?:y|ies|ied)|means|meant|conclu\w*)\b"
)

# Static reference table shown under every analysis. The blank lines matter:
# a Markdown table only ends at a blank line.
_REFERENCE_EXAMPLES = """
---

### Reference: Good vs Bad Examples

| # | ❌ Contaminated Observation | ✅ Clean Observation |
|---|----------------------------|----------------------|
| 1 | "The button looks inconsistent with the rest of the UI." | "The Save button is 8 px lower than the Cancel button; Save uses Inter 14px, Cancel uses Inter 16px." |
| 2 | "The error message is confusing." | "The error message reads 'Error 403' with no additional context and no retry option." |
| 3 | "The response seems off-topic." | "The response does not mention the word 'Python' despite the prompt asking for a Python code example." |

**Rule of thumb:** If you can't photograph or measure it, it's probably an inference.
"""


def _match_signals(text, signals):
    """Return the ``(phrase, tip)`` pairs whose phrase occurs in *text* as whole words.

    A phrase is skipped when every occurrence overlaps text already claimed
    by an earlier phrase in *signals*.
    """
    import re  # local import keeps this block self-contained

    claimed = []
    hits = []
    for phrase, tip in signals:
        pattern = r"\b" + re.escape(phrase.strip()) + r"\b"
        fresh = [
            m.span()
            for m in re.finditer(pattern, text)
            if not any(m.start() < end and start < m.end() for start, end in claimed)
        ]
        if fresh:
            hits.append((phrase, tip))
            claimed.extend(fresh)
    return hits


def analyze_obs_inf(observation, inference):
    """Critique an observation / inference pair.

    Args:
        observation: What the user saw. Must be non-blank.
        inference: The user's conclusion. Optional; blank or ``None`` skips
            the inference feedback.

    Returns:
        An ``(analysis, examples)`` pair of Markdown strings. When the
        observation is blank the analysis is a warning and examples is ``""``.
    """
    import re

    if not (observation or "").strip():
        return "⚠️ Please enter an observation.", ""

    found = _match_signals(observation.lower(), INFERENCE_SIGNALS)
    if not found:
        obs_result = (
            "✅ **Clean observation** — specific and factual. "
            "No inference language detected."
        )
    else:
        # Show at most three tips, but report the full count.
        issues = "\n".join(
            f"- **'{sig.strip()}'** → {tip}" for sig, tip in found[:3]
        )
        obs_result = (
            f"⚠️ **Observation contains inference language** ({len(found)} signal(s) found):\n\n"
            + issues
            + "\n\n**Tip:** An observation should answer 'What did you literally see/measure?' — "
            "no judgments, no causes, no speculation."
        )

    inf_result = ""
    if (inference or "").strip():
        # Inferences *should* contain reasoning words — flag if completely bare.
        if re.search(_REASONING_PATTERN, inference.lower()):
            inf_result = (
                "\n\n✅ **Inference** — contains reasoning language, "
                "which is appropriate here."
            )
        else:
            inf_result = (
                "\n\n💡 **Inference tip:** Your inference reads like a bare statement. "
                "Strong inferences explain *why* — try adding 'because', 'therefore', or 'this suggests'."
            )

    return obs_result + inf_result, _REFERENCE_EXAMPLES
# ─────────────────────────────────────────────
# BUILD THE GRADIO APP
# ─────────────────────────────────────────────
# Banner HTML shown at the top of the app: title, tagline and author links.
HEADER = """
<div style="text-align:center; padding: 16px 0 8px 0;">
<h1 style="font-size:2rem; margin-bottom:4px;">🎯 AI Evaluation Toolkit</h1>
<p style="color:#666; font-size:0.95rem;">
Interactive demos of AI training data quality-control workflows.<br>
Built by <a href="https://github.com/LaelaZorana" target="_blank">Laela Zorana</a> ·
<a href="https://huggingface.co/LaelaZ" target="_blank">HuggingFace</a> ·
<a href="https://kaggle.com/laelazorana" target="_blank">Kaggle</a>
</p>
</div>
"""
# Build the three-tab UI. Each tab wires its inputs to one of the handler
# functions above and ships a pre-filled example.
# NOTE(review): icons and punctuation in the UI strings were reconstructed
# from mis-encoded text; the Tab 2 icon in particular is a best guess — confirm.
with gr.Blocks(title="AI Evaluation Toolkit", theme=gr.themes.Soft()) as demo:
    gr.HTML(HEADER)

    with gr.Tabs():
        # ── TAB 1 ──────────────────────────────────────────────────────────
        with gr.Tab("⚖️ RLHF Pairwise Rater"):
            gr.Markdown(
                "Rate two AI responses on four quality axes, then check whether your overall "
                "preference is consistent with your per-axis scores."
            )
            prompt_box = gr.Textbox(
                label="Prompt / Task",
                placeholder="e.g. Explain gradient descent in simple terms.",
                lines=2,
            )
            with gr.Row():
                resp_a = gr.Textbox(label="Response A", lines=6,
                                    placeholder="Paste Response A here…")
                resp_b = gr.Textbox(label="Response B", lines=6,
                                    placeholder="Paste Response B here…")

            gr.Markdown("#### Axis Ratings (1 = Poor · 5 = Excellent)")
            with gr.Row():
                with gr.Column():
                    gr.Markdown("**Response A**")
                    help_a = gr.Slider(1, 5, value=3, step=1, label="Helpfulness A")
                    harm_a = gr.Slider(1, 5, value=3, step=1, label="Harmlessness A")
                    acc_a = gr.Slider(1, 5, value=3, step=1, label="Accuracy A")
                    inst_a = gr.Slider(1, 5, value=3, step=1, label="Instruction-Following A")
                with gr.Column():
                    gr.Markdown("**Response B**")
                    help_b = gr.Slider(1, 5, value=3, step=1, label="Helpfulness B")
                    harm_b = gr.Slider(1, 5, value=3, step=1, label="Harmlessness B")
                    acc_b = gr.Slider(1, 5, value=3, step=1, label="Accuracy B")
                    inst_b = gr.Slider(1, 5, value=3, step=1, label="Instruction-Following B")

            gr.Markdown("#### Overall Judgment")
            with gr.Row():
                preference = gr.Radio(
                    ["A is better", "B is better", "Tie"],
                    label="Overall Preference",
                    value="Tie",
                )
                confidence = gr.Slider(1, 3, value=2, step=1,
                                       label="Confidence (1=Low, 2=Medium, 3=High)")

            check_btn = gr.Button("Check Consistency", variant="primary")
            consistency_out = gr.Markdown(label="Consistency Check")
            table_out = gr.Markdown(label="Score Summary")

            # Input order must match check_consistency's parameter order.
            check_btn.click(
                check_consistency,
                inputs=[prompt_box, resp_a, resp_b,
                        help_a, harm_a, acc_a, inst_a,
                        help_b, harm_b, acc_b, inst_b,
                        preference, confidence],
                outputs=[consistency_out, table_out],
            )

            # Example where B averages higher (4.50 vs 4.25) but "A is better" is selected.
            gr.Examples(
                examples=[
                    [
                        "Explain gradient descent simply.",
                        "Gradient descent is an optimization algorithm that minimizes a loss function by iteratively moving in the direction of steepest descent as defined by the negative of the gradient.",
                        "Imagine you're lost on a foggy mountain and want to reach the valley. Each step you take downhill is gradient descent — you keep moving in whichever direction is steepest until you can't go lower.",
                        4, 5, 4, 4,
                        5, 5, 3, 5,
                        "A is better", 2,
                    ]
                ],
                inputs=[prompt_box, resp_a, resp_b,
                        help_a, harm_a, acc_a, inst_a,
                        help_b, harm_b, acc_b, inst_b,
                        preference, confidence],
                label="Example: spot the inconsistency",
            )

        # ── TAB 2 ──────────────────────────────────────────────────────────
        with gr.Tab("📋 Content Policy Rater"):
            gr.Markdown(
                "Score a piece of content against six policy criteria, then aggregate to an "
                "overall **PASS / FLAG / BLOCK** verdict using one of three aggregation rules."
            )
            content_box = gr.Textbox(
                label="Content to Evaluate",
                lines=5,
                placeholder="Paste the AI-generated content here…",
            )

            gr.Markdown("#### Per-Criterion Ratings")
            # Flat list of [verdict, reasoning, verdict, reasoning, ...] in
            # CRITERIA order — the layout get_overall_verdict expects.
            criterion_inputs = []
            for crit in CRITERIA:
                with gr.Row():
                    v = gr.Dropdown(
                        choices=VERDICTS, value="PASS",
                        label=f"{crit} — Verdict", scale=1,
                    )
                    r = gr.Textbox(
                        label=f"{crit} — Reasoning",
                        placeholder="Brief justification…",
                        scale=3,
                    )
                criterion_inputs.extend([v, r])

            agg_mode = gr.Radio(
                ["worst_wins", "majority", "weighted"],
                label="Aggregation Mode",
                value="worst_wins",
                info=(
                    "worst_wins = most severe criterion wins | "
                    "majority = most common verdict | "
                    "weighted = Safety & PII get 3× weight"
                ),
            )

            verdict_btn = gr.Button("Get Overall Verdict", variant="primary")
            verdict_out = gr.Markdown(label="Overall Verdict")
            criterion_table = gr.Markdown(label="Per-Criterion Breakdown")

            verdict_btn.click(
                get_overall_verdict,
                inputs=[content_box] + criterion_inputs + [agg_mode],
                outputs=[verdict_out, criterion_table],
            )

            gr.Examples(
                examples=[[
                    "To make a profit, you should invest in index funds, which historically return ~7% annually after inflation. Past performance doesn't guarantee future results — John Smith at 123 Main St made $200k last year.",
                    "PASS", "Historically accurate.",
                    "FLAG", "No direct harm, but financial advice disclaimer missing.",
                    "PASS", "No biased framing detected.",
                    "BLOCK", "Contains full name and street address of a real-sounding individual.",
                    "FLAG", "Financial advice without credentials disclaimer violates policy.",
                    "PASS", "Clear and readable.",
                    "worst_wins",
                ]],
                inputs=[content_box] + criterion_inputs + [agg_mode],
                label="Example: PII forces a BLOCK",
            )

        # ── TAB 3 ──────────────────────────────────────────────────────────
        with gr.Tab("🔬 Observation vs Inference"):
            gr.Markdown(
                "Practice keeping observations **clean** (factual, specific, no conclusions embedded) "
                "and inferences **grounded** (explicitly tied to what was observed). "
                "This discipline is core to high-quality AI evaluation and bug reporting."
            )
            with gr.Row():
                obs_box = gr.Textbox(
                    label="What did you observe?",
                    lines=4,
                    placeholder="e.g. The modal dialog closes immediately after opening without any user interaction.",
                    scale=1,
                )
                inf_box = gr.Textbox(
                    label="What do you conclude from it? (optional)",
                    lines=4,
                    placeholder="e.g. This suggests the dismiss event fires on mount rather than on user action.",
                    scale=1,
                )

            analyze_btn = gr.Button("Analyze", variant="primary")
            analysis_out = gr.Markdown(label="Analysis")
            examples_out = gr.Markdown(label="Reference Examples")

            analyze_btn.click(
                analyze_obs_inf,
                inputs=[obs_box, inf_box],
                outputs=[analysis_out, examples_out],
            )

            gr.Examples(
                examples=[
                    [
                        "The button looks inconsistent with the rest of the UI.",
                        "It probably wasn't designed by the same person.",
                    ],
                    [
                        "The Save button is 8 px lower than the Cancel button; Save uses Inter 14px Bold, Cancel uses Inter 16px Regular.",
                        "The vertical misalignment and font inconsistency suggest the two buttons were added in separate PRs without a shared spacing token.",
                    ],
                    [
                        "The model's response seems off-topic and confusing.",
                        "",
                    ],
                ],
                inputs=[obs_box, inf_box],
                label="Try these examples",
            )

demo.launch()