Spaces:
Runtime error
Runtime error
import re
from collections import Counter

import gradio as gr
# ─────────────────────────────────────────────
# TAB 1 LOGIC: RLHF Pairwise Rater
# ─────────────────────────────────────────────
def _higher_label(score_a, score_b):
    """Return the text for the "Higher" column of one score-table row."""
    if score_a > score_b:
        return "A β"
    if score_b > score_a:
        return "B β"
    return "Tie"


def check_consistency(
    prompt, resp_a, resp_b,
    help_a, harm_a, acc_a, inst_a,
    help_b, harm_b, acc_b, inst_b,
    preference, confidence
):
    """Check an overall A/B preference against the per-axis ratings.

    Args:
        prompt: The task text. Accepted so the signature lines up with the
            UI inputs; it is not used in the computation.
        resp_a, resp_b: The two response texts. Both must be non-blank.
        help_*, harm_*, acc_*, inst_*: Numeric ratings for Helpfulness,
            Harmlessness, Accuracy and Instruction-Following of each response.
        preference: "A is better", "Tie", or anything else (treated as
            "B is better").
        confidence: 1, 2 or 3; mapped to Low / Medium / High.

    Returns:
        A ``(message, score_table)`` pair of Markdown strings. When either
        response is blank (or ``None``) the message is a prompt to fill both
        in and the table is the empty string.

    Raises:
        KeyError: If ``confidence`` is not one of 1, 2, 3.
    """
    # `or ""` keeps a cleared/None textbox from raising AttributeError.
    if not (resp_a or "").strip() or not (resp_b or "").strip():
        return "β οΈ Please enter both responses before checking.", ""

    axes = {
        "Helpfulness": (help_a, help_b),
        "Harmlessness": (harm_a, harm_b),
        "Accuracy": (acc_a, acc_b),
        "Instruction-Following": (inst_a, inst_b),
    }
    # Derive the divisor from the table so adding an axis cannot skew averages.
    n_axes = len(axes)
    avg_a = sum(va for va, _ in axes.values()) / n_axes
    avg_b = sum(vb for _, vb in axes.values()) / n_axes

    # Axes on which each response strictly out-scores the other.
    a_wins = [(k, va, vb) for k, (va, vb) in axes.items() if va > vb]
    b_wins = [(k, va, vb) for k, (va, vb) in axes.items() if vb > va]

    conf_label = {1: "Low", 2: "Medium", 3: "High"}[confidence]

    # Score table: one row per axis followed by the averages row.
    rows = [
        f"| {ax} | {va} | {vb} | {_higher_label(va, vb)} |\n"
        for ax, (va, vb) in axes.items()
    ]
    rows.append(
        f"| **Average** | **{avg_a:.2f}** | **{avg_b:.2f}** | "
        f"{_higher_label(avg_a, avg_b)} |\n"
    )
    score_table = (
        "### Score Summary\n\n"
        "| Axis | Response A | Response B | Higher |\n"
        "|------|-----------|-----------|--------|\n"
        + "".join(rows)
    )

    if preference == "Tie":
        gap = abs(avg_a - avg_b)
        # NOTE(review): the warning only fires at a gap of 1.0 or more, while
        # its text recommends ~0.5; gaps in between are reported as consistent.
        if gap >= 1.0:
            msg = (
                f"β οΈ **Possible inconsistency:** You selected 'Tie', but the average scores differ by "
                f"{gap:.2f} points (A avg: {avg_a:.2f}, B avg: {avg_b:.2f}). "
                f"A tie is most appropriate when averages are within ~0.5 of each other."
            )
        else:
            msg = f"β **Consistent:** A 'Tie' verdict aligns with close average scores (A: {avg_a:.2f}, B: {avg_b:.2f})."
    else:
        # Any non-"A is better" value falls through to the B branch.
        if preference == "A is better":
            chosen, chosen_avg = "A", avg_a
            other, other_avg, other_wins = "B", avg_b, b_wins
        else:
            chosen, chosen_avg = "B", avg_b
            other, other_avg, other_wins = "A", avg_a, a_wins

        # Equal averages also count as inconsistent with a strict preference.
        if other_avg >= chosen_avg:
            detail = ", ".join(
                f"{k}: A={va} vs B={vb}" for k, va, vb in other_wins
            ) or "none"
            msg = (
                f"β οΈ **Inconsistency detected:** You selected '{chosen} is better' overall, but Response {other} "
                f"scored higher on {len(other_wins)}/{n_axes} axes. "
                f"Axes favouring {other}: {detail}. "
                f"Overall averages β A: {avg_a:.2f}, B: {avg_b:.2f}. Consider reviewing your overall verdict."
            )
        else:
            msg = (
                f"β **Consistent:** '{chosen} is better' aligns with higher per-axis averages "
                f"({chosen}: {chosen_avg:.2f} vs {other}: {other_avg:.2f})."
            )

    msg += f"\n\n**Confidence:** {conf_label} ({confidence}/3)"
    return msg, score_table
# ─────────────────────────────────────────────
# TAB 2 LOGIC: Content Policy Rater
# ─────────────────────────────────────────────
# Verdicts ordered from least to most severe; a verdict's index is its rank.
VERDICTS = ["PASS", "FLAG", "BLOCK"]
VERDICT_RANK = {"PASS": 0, "FLAG": 1, "BLOCK": 2}
VERDICT_EMOJI = {"PASS": "π’", "FLAG": "π‘", "BLOCK": "π΄"}

# Policy criteria, in the order the UI supplies their (verdict, reasoning) pairs.
CRITERIA = [
    "Factual Accuracy",
    "Safety",
    "Bias",
    "PII Exposure",
    "On-Policy",
    "Clarity",
]

# Per-criterion weights for the "weighted" aggregation mode. Kept beside
# CRITERIA so the two stay in sync when a criterion is added or renamed.
_CRITERION_WEIGHTS = {
    "Factual Accuracy": 2,
    "Safety": 3,
    "Bias": 2,
    "PII Exposure": 3,
    "On-Policy": 2,
    "Clarity": 1,
}


def _table_cell(text):
    """Make free text safe for a single Markdown table cell.

    Whitespace runs (including newlines) collapse to one space and pipes are
    escaped, so user-entered reasoning cannot break the table layout. Blank
    or ``None`` input yields the placeholder used for empty cells.
    """
    flattened = " ".join((text or "").split())
    if not flattened:
        return "β"
    return flattened.replace("|", "\\|")


def get_overall_verdict(*args):
    """Aggregate per-criterion verdicts into one PASS / FLAG / BLOCK verdict.

    Args:
        *args: Positional values in UI order: the content text, then one
            ``(verdict, reasoning)`` pair per entry of ``CRITERIA``, then the
            aggregation mode ("worst_wins", "majority", or anything else,
            which is treated as "weighted").

    Returns:
        A ``(header, table)`` pair of Markdown strings: the overall verdict
        with an explanation of the rule applied, and a per-criterion
        breakdown. For blank content, a prompt message and the empty string.

    Raises:
        KeyError: If a verdict is not one of ``VERDICTS``.
        IndexError: If fewer positional values are supplied than the layout
            above requires.
    """
    content = args[0]
    if not (content or "").strip():
        return "β οΈ Please enter content to evaluate.", ""

    # args layout: content, (verdict, reasoning) per criterion, then mode.
    n_criteria = len(CRITERIA)
    mode_index = 1 + 2 * n_criteria
    pairs = args[1:mode_index]
    verdicts = list(pairs[0::2])
    reasonings = list(pairs[1::2])
    mode = args[mode_index]

    ranks = [VERDICT_RANK[v] for v in verdicts]

    if mode == "worst_wins":
        final_rank = max(ranks)
        explanation = "**worst_wins:** The overall verdict equals the most severe individual criterion rating."
    elif mode == "majority":
        tally = Counter(verdicts)
        # Break count ties toward the more severe verdict; otherwise the
        # winner would depend on which criterion happens to be listed first.
        top_verdict, top_count = max(
            tally.items(), key=lambda item: (item[1], VERDICT_RANK[item[0]])
        )
        final_rank = VERDICT_RANK[top_verdict]
        explanation = f"**majority:** The most frequent verdict ({top_verdict}, {top_count}/{n_criteria} criteria) wins."
    else:  # weighted
        total_w = sum(_CRITERION_WEIGHTS.values())
        weighted_score = sum(
            _CRITERION_WEIGHTS[crit] * rank for crit, rank in zip(CRITERIA, ranks)
        ) / total_w
        # Round the weighted mean rank to the nearest verdict (0.5 rounds up).
        final_rank = 0 if weighted_score < 0.5 else (1 if weighted_score < 1.5 else 2)
        explanation = f"**weighted:** Safety and PII carry 3Γ weight. Weighted score: {weighted_score:.2f} β {VERDICTS[final_rank]}."

    final_verdict = VERDICTS[final_rank]
    emoji = VERDICT_EMOJI[final_verdict]
    header = f"## {emoji} Overall Verdict: **{final_verdict}**\n\n{explanation}\n\n"

    table = "### Per-Criterion Breakdown\n\n| Criterion | Verdict | Reasoning |\n|-----------|---------|----------|\n"
    table += "".join(
        f"| {crit} | {VERDICT_EMOJI[verdict]} {verdict} | {_table_cell(reason)} |\n"
        for crit, verdict, reason in zip(CRITERIA, verdicts, reasonings)
    )
    return header, table
# ─────────────────────────────────────────────
# TAB 3 LOGIC: Observation vs Inference
# ─────────────────────────────────────────────
| INFERENCE_SIGNALS = [ | |
| # (signal phrase, example clean alternative) | |
| ("seems", "Describe exactly what you see, not what it suggests."), | |
| ("appears", "Describe exactly what you see, not what it suggests."), | |
| ("looks like", "Describe the specific visual or measurable property instead."), | |
| ("looks ", "Describe the specific visual or measurable property instead."), | |
| ("probably", "Remove speculation β state only what is directly observable."), | |
| ("likely", "Remove speculation β state only what is directly observable."), | |
| ("might", "Remove hedging β state only what is directly observable."), | |
| ("may ", "Remove hedging β state only what is directly observable."), | |
| ("should ", "Avoid prescriptive language in an observation."), | |
| ("is bad", "Describe the specific measurable problem, not a judgment."), | |
| ("is good", "Describe the specific measurable quality, not a judgment."), | |
| ("is wrong", "Describe the exact discrepancy observed."), | |
| ("is broken", "Describe what specifically does not work as expected."), | |
| ("is inconsistent","Specify the exact values or positions that differ."), | |
| ("unclear", "Describe what specific information is missing or ambiguous."), | |
| ("confusing", "Describe the specific element that causes confusion."), | |
| ("feels ", "Feelings are inferences. Describe the observable trigger instead."), | |
| ("indicates", "'Indicates' draws a conclusion. State the raw signal only."), | |
| ("suggests", "'Suggests' draws a conclusion. State the raw signal only."), | |
| ("implies", "'Implies' draws a conclusion. State the raw signal only."), | |
| ("because", "Causal claims belong in the inference, not the observation."), | |
| ] | |
| def analyze_obs_inf(observation, inference): | |
| if not observation.strip(): | |
| return "β οΈ Please enter an observation.", "" | |
| obs_lower = observation.lower() | |
| found = [(sig, tip) for sig, tip in INFERENCE_SIGNALS if sig in obs_lower] | |
| if not found: | |
| obs_result = ( | |
| "β **Clean observation** β specific and factual. " | |
| "No inference language detected." | |
| ) | |
| else: | |
| issues = "\n".join( | |
| f"- **'{sig.strip()}'** β {tip}" for sig, tip in found[:3] | |
| ) | |
| obs_result = ( | |
| f"β οΈ **Observation contains inference language** ({len(found)} signal(s) found):\n\n" | |
| + issues | |
| + "\n\n**Tip:** An observation should answer 'What did you literally see/measure?' β " | |
| "no judgments, no causes, no speculation." | |
| ) | |
| inf_result = "" | |
| if inference.strip(): | |
| inf_lower = inference.lower() | |
| # Inferences *should* contain reasoning words β flag if completely bare | |
| reasoning_words = ["because", "therefore", "so ", "thus", "indicates", "suggests", | |
| "means", "implies", "likely", "probably", "conclude"] | |
| has_reasoning = any(w in inf_lower for w in reasoning_words) | |
| if has_reasoning: | |
| inf_result = "\n\nβ **Inference** β contains reasoning language, which is appropriate here." | |
| else: | |
| inf_result = ( | |
| "\n\nπ‘ **Inference tip:** Your inference reads like a bare statement. " | |
| "Strong inferences explain *why* β try adding 'because', 'therefore', or 'this suggests'." | |
| ) | |
| examples = """ | |
| --- | |
| ### Reference: Good vs Bad Examples | |
| | # | β Contaminated Observation | β Clean Observation | | |
| |---|----------------------------|----------------------| | |
| | 1 | "The button looks inconsistent with the rest of the UI." | "The Save button is 8 px lower than the Cancel button; Save uses Inter 14px, Cancel uses Inter 16px." | | |
| | 2 | "The error message is confusing." | "The error message reads 'Error 403' with no additional context and no retry option." | | |
| | 3 | "The response seems off-topic." | "The response does not mention the word 'Python' despite the prompt asking for a Python code example." | | |
| **Rule of thumb:** If you can't photograph or measure it, it's probably an inference. | |
| """ | |
| return obs_result + inf_result, examples | |
# ─────────────────────────────────────────────
# BUILD THE GRADIO APP
# ─────────────────────────────────────────────
# Static HTML banner passed to gr.HTML at the top of the app: the title, a
# one-line description, and the author's GitHub / HuggingFace / Kaggle links.
HEADER = """
<div style="text-align:center; padding: 16px 0 8px 0;">
<h1 style="font-size:2rem; margin-bottom:4px;">π― AI Evaluation Toolkit</h1>
<p style="color:#666; font-size:0.95rem;">
Interactive demos of AI training data quality-control workflows.<br>
Built by <a href="https://github.com/LaelaZorana" target="_blank">Laela Zorana</a> Β·
<a href="https://huggingface.co/LaelaZ" target="_blank">HuggingFace</a> Β·
<a href="https://kaggle.com/laelazorana" target="_blank">Kaggle</a>
</p>
</div>
"""
# Three-tab UI. Each tab declares its inputs, wires one button to the matching
# logic function above, and offers pre-filled examples. Components are created
# in display order, and every `inputs=[...]` list is passed positionally, so
# its order must match the callback's parameter order.
with gr.Blocks(title="AI Evaluation Toolkit", theme=gr.themes.Soft()) as demo:
    gr.HTML(HEADER)
    with gr.Tabs():
        # ── TAB 1 ──────────────────────────────────────────────────────────
        with gr.Tab("βοΈ RLHF Pairwise Rater"):
            gr.Markdown(
                "Rate two AI responses on four quality axes, then check whether your overall "
                "preference is consistent with your per-axis scores."
            )
            prompt_box = gr.Textbox(
                label="Prompt / Task",
                placeholder="e.g. Explain gradient descent in simple terms.",
                lines=2,
            )
            with gr.Row():
                resp_a = gr.Textbox(label="Response A", lines=6,
                                    placeholder="Paste Response A hereβ¦")
                resp_b = gr.Textbox(label="Response B", lines=6,
                                    placeholder="Paste Response B hereβ¦")
            gr.Markdown("#### Axis Ratings (1 = Poor Β· 5 = Excellent)")
            # One column of four sliders per response, side by side.
            with gr.Row():
                with gr.Column():
                    gr.Markdown("**Response A**")
                    help_a = gr.Slider(1, 5, value=3, step=1, label="Helpfulness A")
                    harm_a = gr.Slider(1, 5, value=3, step=1, label="Harmlessness A")
                    acc_a = gr.Slider(1, 5, value=3, step=1, label="Accuracy A")
                    inst_a = gr.Slider(1, 5, value=3, step=1, label="Instruction-Following A")
                with gr.Column():
                    gr.Markdown("**Response B**")
                    help_b = gr.Slider(1, 5, value=3, step=1, label="Helpfulness B")
                    harm_b = gr.Slider(1, 5, value=3, step=1, label="Harmlessness B")
                    acc_b = gr.Slider(1, 5, value=3, step=1, label="Accuracy B")
                    inst_b = gr.Slider(1, 5, value=3, step=1, label="Instruction-Following B")
            gr.Markdown("#### Overall Judgment")
            with gr.Row():
                # These choice strings are compared literally in check_consistency.
                preference = gr.Radio(
                    ["A is better", "B is better", "Tie"],
                    label="Overall Preference",
                    value="Tie",
                )
                # step=1 keeps the value on 1/2/3, the only keys
                # check_consistency's confidence-label lookup accepts.
                confidence = gr.Slider(1, 3, value=2, step=1,
                                       label="Confidence (1=Low, 2=Medium, 3=High)")
            check_btn = gr.Button("Check Consistency", variant="primary")
            consistency_out = gr.Markdown(label="Consistency Check")
            table_out = gr.Markdown(label="Score Summary")
            # Input order mirrors check_consistency's positional parameters.
            check_btn.click(
                check_consistency,
                inputs=[prompt_box, resp_a, resp_b,
                        help_a, harm_a, acc_a, inst_a,
                        help_b, harm_b, acc_b, inst_b,
                        preference, confidence],
                outputs=[consistency_out, table_out],
            )
            # Example row: A averages 4.25 and B averages 4.50 while the
            # preference says "A is better", so the check reports an
            # inconsistency.
            gr.Examples(
                examples=[
                    [
                        "Explain gradient descent simply.",
                        "Gradient descent is an optimization algorithm that minimizes a loss function by iteratively moving in the direction of steepest descent as defined by the negative of the gradient.",
                        "Imagine you're lost on a foggy mountain and want to reach the valley. Each step you take downhill is gradient descent β you keep moving in whichever direction is steepest until you can't go lower.",
                        4, 5, 4, 4,
                        5, 5, 3, 5,
                        "A is better", 2,
                    ]
                ],
                inputs=[prompt_box, resp_a, resp_b,
                        help_a, harm_a, acc_a, inst_a,
                        help_b, harm_b, acc_b, inst_b,
                        preference, confidence],
                label="Example: spot the inconsistency",
            )
        # ── TAB 2 ──────────────────────────────────────────────────────────
        with gr.Tab("π Content Policy Rater"):
            gr.Markdown(
                "Score a piece of content against six policy criteria, then aggregate to an "
                "overall **PASS / FLAG / BLOCK** verdict using one of three aggregation rules."
            )
            content_box = gr.Textbox(
                label="Content to Evaluate",
                lines=5,
                placeholder="Paste the AI-generated content hereβ¦",
            )
            gr.Markdown("#### Per-Criterion Ratings")
            # Flat list [verdict, reasoning, verdict, reasoning, ...] in
            # CRITERIA order; get_overall_verdict unpacks it by position.
            criterion_inputs = []
            for crit in CRITERIA:
                with gr.Row():
                    v = gr.Dropdown(
                        choices=VERDICTS, value="PASS",
                        label=f"{crit} β Verdict", scale=1,
                    )
                    r = gr.Textbox(
                        label=f"{crit} β Reasoning",
                        placeholder="Brief justificationβ¦",
                        scale=3,
                    )
                criterion_inputs.extend([v, r])
            # These mode strings are compared literally in get_overall_verdict.
            agg_mode = gr.Radio(
                ["worst_wins", "majority", "weighted"],
                label="Aggregation Mode",
                value="worst_wins",
                info=(
                    "worst_wins = most severe criterion wins | "
                    "majority = most common verdict | "
                    "weighted = Safety & PII get 3Γ weight"
                ),
            )
            verdict_btn = gr.Button("Get Overall Verdict", variant="primary")
            verdict_out = gr.Markdown(label="Overall Verdict")
            criterion_table = gr.Markdown(label="Per-Criterion Breakdown")
            # Layout expected by get_overall_verdict: content, pairs, mode.
            verdict_btn.click(
                get_overall_verdict,
                inputs=[content_box] + criterion_inputs + [agg_mode],
                outputs=[verdict_out, criterion_table],
            )
            # Example row: the single BLOCK on "PII Exposure" decides the
            # result under worst_wins.
            gr.Examples(
                examples=[[
                    "To make a profit, you should invest in index funds, which historically return ~7% annually after inflation. Past performance doesn't guarantee future results β John Smith at 123 Main St made $200k last year.",
                    "PASS", "Historically accurate.",
                    "FLAG", "No direct harm, but financial advice disclaimer missing.",
                    "PASS", "No biased framing detected.",
                    "BLOCK", "Contains full name and street address of a real-sounding individual.",
                    "FLAG", "Financial advice without credentials disclaimer violates policy.",
                    "PASS", "Clear and readable.",
                    "worst_wins",
                ]],
                inputs=[content_box] + criterion_inputs + [agg_mode],
                label="Example: PII forces a BLOCK",
            )
        # ── TAB 3 ──────────────────────────────────────────────────────────
        with gr.Tab("π¬ Observation vs Inference"):
            gr.Markdown(
                "Practice keeping observations **clean** (factual, specific, no conclusions embedded) "
                "and inferences **grounded** (explicitly tied to what was observed). "
                "This discipline is core to high-quality AI evaluation and bug reporting."
            )
            with gr.Row():
                obs_box = gr.Textbox(
                    label="What did you observe?",
                    lines=4,
                    placeholder="e.g. The modal dialog closes immediately after opening without any user interaction.",
                    scale=1,
                )
                inf_box = gr.Textbox(
                    label="What do you conclude from it? (optional)",
                    lines=4,
                    placeholder="e.g. This suggests the dismiss event fires on mount rather than on user action.",
                    scale=1,
                )
            analyze_btn = gr.Button("Analyze", variant="primary")
            analysis_out = gr.Markdown(label="Analysis")
            examples_out = gr.Markdown(label="Reference Examples")
            analyze_btn.click(
                analyze_obs_inf,
                inputs=[obs_box, inf_box],
                outputs=[analysis_out, examples_out],
            )
            # Three (observation, inference) pairs; the third leaves the
            # inference blank.
            gr.Examples(
                examples=[
                    [
                        "The button looks inconsistent with the rest of the UI.",
                        "It probably wasn't designed by the same person.",
                    ],
                    [
                        "The Save button is 8 px lower than the Cancel button; Save uses Inter 14px Bold, Cancel uses Inter 16px Regular.",
                        "The vertical misalignment and font inconsistency suggest the two buttons were added in separate PRs without a shared spacing token.",
                    ],
                    [
                        "The model's response seems off-topic and confusing.",
                        "",
                    ],
                ],
                inputs=[obs_box, inf_box],
                label="Try these examples",
            )
# Start the server only when this file is run as a script, so importing the
# module (e.g. to call the rating functions) does not launch the app.
if __name__ == "__main__":
    demo.launch()