# Uploaded by: Adithya765
# Commit message: final fix: clamp all scores strictly
# Commit: 8fd29dc
"""
task3_hard.py
=============
Task 3 — Bias & Quality Audit (Hard, 5-Turn)
OpenEnv Project | Meta × Hugging Face Hackathon
What it does:
Runs a 5-turn deep audit on a HuggingFace dataset
covering the hardest quality and bias checks.
Checks (9 across 5 turns):
Turn 1: Column profiling
Turn 2: Near-duplicate detection + Annotation inconsistency
Turn 3: Label noise + Toxicity + Hate speech
Turn 4: Bias detection (gender/racial/cultural) + Linguistic diversity
Turn 5: Data leakage + Domain drift + Final audit report
Usage:
python task3_hard.py
→ Enter dataset name (e.g. dair-ai/emotion)
→ Press Enter to progress through each turn
→ Output saved to task3_output.json
→ Copy JSON output into grader3.py
Requirements:
pip install datasets pandas numpy scikit-learn langdetect
"""
"""
Task 3 — Bias & Quality Audit (HARD)
=====================================
A 5-turn multi-step agent task with 8 hard checks.
Turn 1 → Dataset Overview + Column Profiling
Turn 2 → Near-Duplicate Detection + Annotation Inconsistency
Turn 3 → Label Noise + Toxicity / Hate Speech Detection
Turn 4 → Bias Detection + Linguistic Diversity
Turn 5 → Data Leakage + Domain Drift + Final Audit Report
Compatible with: Google Colab, Python 3.8+
Install: pip install datasets pandas numpy scikit-learn langdetect
(difflib is part of the Python standard library and needs no install.)
"""
import json
import re
import difflib
import warnings
import subprocess
import sys
from collections import Counter, defaultdict
warnings.filterwarnings("ignore")
def clamp_score(x):
    """Bound a score to the range [0.01, 0.99] and round it to 3 decimals.

    Args:
        x: Raw numeric score.

    Returns:
        The score limited to at most 0.99 and at least 0.01, rounded to
        three decimal places.
    """
    floor, ceiling = 0.01, 0.99
    capped = min(ceiling, x)
    bounded = max(floor, capped)
    return round(bounded, 3)
# ─────────────────────────────────────────────
# INSTALL DEPS
# ─────────────────────────────────────────────
def install(pkg):
    """Install *pkg* quietly with pip, using the running interpreter.

    Raises:
        subprocess.CalledProcessError: If the pip command exits non-zero.
    """
    pip_command = [sys.executable, "-m", "pip", "install", pkg, "-q"]
    subprocess.check_call(pip_command)
# pip distribution name -> importable module name. The two differ for
# scikit-learn (module ``sklearn``); deriving the module name from the pip
# name made the import check fail on every run and re-invoked pip each time.
_REQUIRED_PACKAGES = {
    "datasets": "datasets",
    "pandas": "pandas",
    "numpy": "numpy",
    "scikit-learn": "sklearn",
    "langdetect": "langdetect",
}

# Install only the packages that cannot be imported yet.
for pkg, _module_name in _REQUIRED_PACKAGES.items():
    try:
        __import__(_module_name)
    except ImportError:
        install(pkg)
import pandas as pd
import numpy as np
from datasets import load_dataset
from langdetect import detect, LangDetectException
# ─────────────────────────────────────────────
# WORD LISTS
# ─────────────────────────────────────────────
# Gendered pronouns, titles and role nouns. turn4_bias_and_language looks
# these up in lower-cased row text to count rows that mention gender.
GENDER_BIAS_WORDS = [
    "he", "she", "him", "her", "his", "hers", "man", "woman", "men", "women",
    "male", "female", "boy", "girl", "husband", "wife", "father", "mother",
    "son", "daughter", "brother", "sister", "mr", "mrs", "ms", "sir", "madam",
    "king", "queen", "prince", "princess", "actor", "actress", "waiter", "waitress",
    "policeman", "policewoman", "fireman", "stewardess", "mankind", "manpower"
]
# Race, ethnicity, religion and migration terms, used the same way by
# turn4_bias_and_language for the "racial_bias" category.
RACIAL_BIAS_WORDS = [
    "black", "white", "asian", "hispanic", "latino", "latina", "african",
    "caucasian", "arab", "jewish", "muslim", "christian", "hindu", "immigrant",
    "refugee", "foreigner", "native", "minority", "ethnic", "race", "racial",
    "colored", "indigenous", "tribal"
]
# Culturally loaded descriptors for the "cultural_bias" category of
# turn4_bias_and_language. All entries are lower-case.
CULTURAL_BIAS_WORDS = [
    "western", "eastern", "american", "european", "third world", "developing",
    "primitive", "civilized", "savage", "barbarian", "exotic", "oriental",
    "traditional", "modern", "backward", "progressive", "uncivilized"
]
# Insults and aggressive words. turn3_label_noise_and_toxicity flags a row
# as toxic when any of these appears as a whole word in its lower-cased text.
# Several entries also appear in NEGATIVE_WORDS.
TOXIC_WORDS = [
    "idiot", "stupid", "moron", "loser", "trash", "garbage", "hate", "kill",
    "die", "retard", "freak", "ugly", "fat", "dumb", "worthless", "scum",
    "disgusting", "pathetic", "monster", "pervert", "creep"
]
# Regular expressions applied with re.search to lower-cased text by
# turn3_label_noise_and_toxicity; a row is recorded for the first pattern
# that matches. Patterns are written in lower case for that reason.
HATE_SPEECH_PATTERNS = [
    r'\b(all|those|these)\s+(black|white|asian|muslim|jewish|gay|trans)\s+(people|guys|men|women)\s+(are|should|must|deserve)\b',
    r'\b(go back to)\s+\w+',
    r'\b(ban all)\s+\w+',
    r'\b(they|them)\s+(don\'t belong|should leave|are inferior|are criminals)\b',
]
# Positive-sentiment lexicon for the label-noise heuristic in
# turn3_label_noise_and_toxicity (text with >= 3 positive and 0 negative
# hits but a negative label is reported as noisy).
POSITIVE_WORDS = [
    "great", "excellent", "amazing", "wonderful", "fantastic", "best", "good",
    "beautiful", "smart", "brilliant", "awesome", "perfect", "love", "superb",
    "outstanding", "exceptional", "magnificent", "enjoyed", "loved"
]
# Negative-sentiment lexicon for the same heuristic (>= 3 negative and 0
# positive hits but a positive label is reported as noisy).
NEGATIVE_WORDS = [
    "terrible", "awful", "horrible", "disgusting", "hate", "worst", "bad",
    "poor", "ugly", "stupid", "dumb", "trash", "garbage", "useless",
    "worthless", "pathetic", "loser", "failure", "boring", "waste", "dreadful"
]
# ─────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────
def clean_dataset_name(name: str) -> tuple:
    """Split user input into a dataset id and an optional config name.

    Accepted forms:
        - plain name:     ``imdb``
        - name + config:  ``cardiffnlp/tweet_eval hate``
        - full HF URL:    ``https://huggingface.co/datasets/cardiffnlp/tweet_eval``
          (optionally followed by a space and a config name)

    Args:
        name: Raw text typed by the user.

    Returns:
        ``(dataset_name, config)`` where ``config`` is ``None`` unless the
        input consists of exactly two whitespace-separated tokens.
    """
    name = name.strip()
    if name.startswith("http"):
        # Reduce a Hugging Face dataset URL to the dataset path. The pattern
        # is anchored so only the leading URL prefix is removed.
        name = re.sub(r"^https?://huggingface\.co/datasets/", "", name)

    parts = name.split()
    if len(parts) == 2:
        dataset, config = parts
        # Strip slashes per token: "…/tweet_eval/ hate" must not yield
        # "…/tweet_eval/" as the dataset id.
        return dataset.strip("/"), config
    return name.strip("/"), None
# Datasets that cannot be loaded without a config name, mapped to the config
# used when the caller does not supply one.
_CONFIG_DEFAULTS = {
    "cardiffnlp/tweet_eval": "hate",
    "nyu-mll/glue": "sst2",
    "glue": "sst2",
}

# Order in which splits are tried by both fetch helpers.
_SPLIT_ORDER = ("train", "test", "validation")


def _resolve_config(dataset_name, config):
    """Return *config*, or the known default for *dataset_name* when unset."""
    if config is None:
        return _CONFIG_DEFAULTS.get(dataset_name)
    return config


def _stream_rows(dataset_name, config, split, num_rows):
    """Stream at most *num_rows* rows of one split into a list of row dicts."""
    if config:
        ds = load_dataset(dataset_name, config, split=split, streaming=True)
    else:
        ds = load_dataset(dataset_name, split=split, streaming=True)
    rows = []
    for i, row in enumerate(ds):
        if i >= num_rows:
            break
        rows.append(row)
    return rows


def fetch_dataset_rows(dataset_name: str, config: str = None, num_rows: int = 100):
    """Load the first rows of a dataset, trying train, test, then validation.

    Args:
        dataset_name: Hugging Face dataset id.
        config: Optional config name; when ``None`` a built-in default is
            used for datasets known to require one.
        num_rows: Maximum number of rows to stream.

    Returns:
        ``(DataFrame, split_name)`` for the first split that yields rows, or
        ``(empty DataFrame, None)`` when no split could be loaded.
    """
    label = f"{dataset_name}" + (f" ({config})" if config else "")
    print(f"\n📦 Loading dataset: {label} ...")
    config = _resolve_config(dataset_name, config)

    for split in _SPLIT_ORDER:
        try:
            rows = _stream_rows(dataset_name, config, split, num_rows)
            df = pd.DataFrame(rows) if rows else None
        except Exception as e:
            # Best effort: a missing or broken split must not stop the others.
            print(f" ⚠️ Split '{split}' failed: {str(e)[:80]}")
            continue
        if df is not None:
            print(f"✅ Loaded {len(df)} rows, {len(df.columns)} columns from '{split}' split")
            print(f" Columns: {list(df.columns)}")
            return df, split

    print("❌ Could not load any split.")
    return pd.DataFrame(), None


def fetch_second_split(dataset_name: str, first_split: str, config: str = None, num_rows: int = 50):
    """Load rows from a split other than *first_split* for the leakage check.

    The same default-config lookup as ``fetch_dataset_rows`` is applied, so a
    dataset that was loaded through a default config is compared against the
    matching config of its other split instead of failing silently.

    Args:
        dataset_name: Hugging Face dataset id.
        first_split: Split already loaded; it is skipped.
        config: Optional config name (``None`` falls back to the default map).
        num_rows: Maximum number of rows to stream.

    Returns:
        ``(DataFrame, split_name)`` for the first other split that yields
        rows, or ``(empty DataFrame, None)`` when none is available.
    """
    config = _resolve_config(dataset_name, config)
    for split in _SPLIT_ORDER:
        if split == first_split:
            continue
        try:
            rows = _stream_rows(dataset_name, config, split, num_rows)
            if rows:
                return pd.DataFrame(rows), split
        except Exception:
            # Best effort: many datasets simply lack a second split.
            continue
    return pd.DataFrame(), None
def get_text_columns(df: pd.DataFrame):
    """Return the columns that mostly hold free text.

    A column qualifies when it has object or string dtype and more than 30%
    of its non-null values are ``str`` instances longer than 20 characters.
    Checking for string dtypes as well as ``object`` keeps detection working
    when pandas stores text in a dedicated string dtype.

    Args:
        df: Sample of the dataset.

    Returns:
        List of qualifying column names, in DataFrame column order.
    """
    text_cols = []
    for col in df.columns:
        series = df[col]
        is_textual = (pd.api.types.is_object_dtype(series)
                      or pd.api.types.is_string_dtype(series))
        if not is_textual:
            continue
        values = series.dropna().tolist()
        if not values:
            # An all-null column has no share of long strings.
            continue
        long_count = sum(1 for v in values if isinstance(v, str) and len(v) > 20)
        if long_count / len(values) > 0.3:
            text_cols.append(col)
    return text_cols
def get_label_columns(df: pd.DataFrame):
    """Return the columns that look like categorical labels.

    A column qualifies when it has between 2 and 20 distinct non-null values.
    Columns whose cells are unhashable (lists, dicts) are skipped rather
    than raising from ``nunique()``.

    Args:
        df: Sample of the dataset.

    Returns:
        List of qualifying column names, in DataFrame column order.
    """
    label_cols = []
    for col in df.columns:
        try:
            distinct = df[col].nunique()
        except TypeError:
            # Unhashable cell values: such a column cannot be a label column.
            continue
        if 1 < distinct <= 20:
            label_cols.append(col)
    return label_cols
def similarity_ratio(s1: str, s2: str) -> float:
    """Return the difflib similarity of two strings, ignoring case and edge whitespace.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        ``SequenceMatcher.ratio()`` of the lower-cased, stripped inputs, a
        float between 0.0 and 1.0.
    """
    def _normalise(raw):
        return raw.lower().strip()

    matcher = difflib.SequenceMatcher(None, _normalise(s1), _normalise(s2))
    return matcher.ratio()
def detect_language(text: str) -> str:
    """Detect the language of *text* with langdetect.

    Args:
        text: Text to classify; it is converted with ``str()`` first.

    Returns:
        The language code returned by ``langdetect.detect``, or
        ``"unknown"`` when detection raises ``LangDetectException``.
    """
    # langdetect is randomised; without a fixed seed short or ambiguous texts
    # can be classified differently on each run, which would make the
    # language score of the audit non-reproducible.
    from langdetect import DetectorFactory
    DetectorFactory.seed = 0

    try:
        return detect(str(text))
    except LangDetectException:
        return "unknown"
# ─────────────────────────────────────────────
# TURN 1 — Dataset Overview + Column Profiling
# ─────────────────────────────────────────────
def turn1_overview(df: pd.DataFrame, dataset_name: str, split: str) -> dict:
    """Turn 1: profile every column and detect text and label columns.

    Args:
        df: Sample of the dataset.
        dataset_name: Dataset id, copied into the result.
        split: Name of the split *df* was loaded from.

    Returns:
        JSON-serialisable dict with per-column statistics, the detected text
        and label columns, and initial warning flags. The dict is also
        printed as JSON.
    """
    print("\n" + "=" * 60)
    print("TURN 1 — Dataset Overview & Column Profiling")
    print("=" * 60)

    total_rows = len(df)
    columns_info = {}
    for col in df.columns:
        series = df[col]
        present = series.dropna()
        missing = int(series.isnull().sum())
        try:
            nuniq = int(series.nunique())
        except TypeError:
            # Cells are unhashable (lists/dicts): count distinct string forms
            # instead of aborting the whole overview.
            nuniq = int(present.astype(str).nunique())

        avg_len = None
        is_textual = (pd.api.types.is_object_dtype(series)
                      or pd.api.types.is_string_dtype(series))
        if is_textual and not present.empty:
            # Skipped for all-null columns, whose mean would be NaN and
            # would be emitted as invalid JSON.
            avg_len = round(float(present.astype(str).apply(len).mean()), 1)

        columns_info[col] = {
            "dtype": str(series.dtype),
            "missing": missing,
            "missing_rate": round(missing / max(total_rows, 1), 3),
            "unique_values": nuniq,
            "avg_text_length": avg_len,
            "sample": [str(s)[:60] for s in present.iloc[:3].tolist()],
        }

    text_cols = get_text_columns(df)
    label_cols = get_label_columns(df)

    flags = []
    if total_rows < 20:
        flags.append("Very small dataset — statistical checks may be unreliable")
    if not text_cols:
        flags.append("No long-text columns — bias/toxicity checks will be limited")
    if not label_cols:
        flags.append("No label columns — label noise check will be skipped")

    result = {
        "turn": 1,
        "turn_name": "dataset_overview",
        "dataset_name": dataset_name,
        "split_used": split,
        "total_rows": total_rows,
        "total_columns": len(df.columns),
        "column_names": list(df.columns),
        "columns_info": columns_info,
        "text_columns_detected": text_cols,
        "label_columns_detected": label_cols,
        "initial_flags": flags,
        "status": "completed"
    }
    print(json.dumps(result, indent=2))
    return result
# ─────────────────────────────────────────────
# TURN 2 — Near-Duplicates + Annotation Inconsistency
# ─────────────────────────────────────────────
def turn2_near_dupes_and_annotation(df: pd.DataFrame) -> dict:
    """Turn 2: find near-duplicate texts and similar texts with different labels.

    The first detected text column is compared pairwise over at most the
    first 150 non-null rows. A pair is a near-duplicate when its similarity
    is in [0.85, 1.0), and an annotation inconsistency when its similarity
    is >= 0.80 and the two labels differ.

    ``row_i`` / ``row_j`` in the output are positions within the non-null
    texts, not DataFrame index labels.

    Args:
        df: Sample of the dataset.

    Returns:
        JSON-serialisable dict with counts, up to 10 samples of each issue
        and a severity level. The dict is also printed as JSON.
    """
    print("\n" + "=" * 60)
    print("TURN 2 — Near-Duplicate Detection + Annotation Inconsistency")
    print("=" * 60)

    text_cols = get_text_columns(df)
    label_cols = get_label_columns(df)

    near_dupes = []
    annotation_issues = []

    if text_cols:
        text_col = text_cols[0]
        # Never use the text column as its own label column.
        label_col = next((c for c in label_cols if c != text_col), None)

        # Drop rows with missing text from texts AND labels together so that
        # position i refers to the same row in both lists.
        kept = df[df[text_col].notna()]
        texts = kept[text_col].astype(str).tolist()
        labels = kept[label_col].astype(str).tolist() if label_col is not None else None

        limit = min(len(texts), 150)
        for i in range(limit):
            for j in range(i + 1, limit):
                # One similarity computation serves both checks.
                ratio = similarity_ratio(texts[i], texts[j])
                if 0.85 <= ratio < 1.0:
                    near_dupes.append({
                        "row_i": i, "row_j": j,
                        "similarity": round(ratio, 3),
                        "text_i": texts[i][:80],
                        "text_j": texts[j][:80]
                    })
                if labels is not None and ratio >= 0.80 and labels[i] != labels[j]:
                    annotation_issues.append({
                        "row_i": i, "row_j": j,
                        "similarity": round(ratio, 3),
                        "text_i": texts[i][:80],
                        "text_j": texts[j][:80],
                        "label_i": labels[i],
                        "label_j": labels[j],
                        "issue": "Similar texts have different labels"
                    })

    total = len(near_dupes) + len(annotation_issues)
    severity = "HIGH" if total > 8 else "MEDIUM" if total > 3 else "LOW"

    result = {
        "turn": 2,
        "turn_name": "near_duplicates_and_annotation_inconsistency",
        "near_duplicates_found": len(near_dupes),
        "near_duplicate_pairs": near_dupes[:10],
        "annotation_inconsistencies_found": len(annotation_issues),
        "annotation_inconsistency_samples": annotation_issues[:10],
        "total_issues": total,
        "severity": severity,
        "status": "completed"
    }
    print(json.dumps(result, indent=2))
    return result
# ─────────────────────────────────────────────
# TURN 3 — Label Noise + Toxicity / Hate Speech
# ─────────────────────────────────────────────
def turn3_label_noise_and_toxicity(df: pd.DataFrame) -> dict:
    """Turn 3: estimate label noise and flag toxic or hateful rows.

    Label noise is only evaluated when the label values look like sentiment
    labels (``positive``/``negative``/``pos``/``neg``/``0``/``1``), with
    ``0`` treated as negative and ``1`` as positive. A row is noisy when its
    text contains at least three words of one polarity, none of the other,
    and carries the opposite label. Lexicon words are matched as whole
    words, the same way toxic words are, so that e.g. "bad" does not match
    "badge".

    Args:
        df: Sample of the dataset.

    Returns:
        JSON-serialisable dict with the label distribution, noisy-label,
        toxicity and hate-speech findings, their rates and a severity level.
        The dict is also printed as JSON.
    """
    print("\n" + "=" * 60)
    print("TURN 3 — Label Noise + Toxicity / Hate Speech Detection")
    print("=" * 60)

    text_cols = get_text_columns(df)
    label_cols = get_label_columns(df)
    text_col = text_cols[0] if text_cols else None
    # Never use the text column as its own label column.
    label_col = next((c for c in label_cols if c != text_col), None)

    # ── Label distribution / class imbalance ──
    label_distribution = {}
    class_imbalance = False
    is_sentiment = False
    if label_col is not None:
        counts = df[label_col].value_counts().to_dict()
        label_distribution = {str(k): int(v) for k, v in counts.items()}
        if counts:
            max_c, min_c = max(counts.values()), min(counts.values())
            class_imbalance = bool(max_c > 0 and (min_c / max_c) < 0.3)
        label_vals = [str(v).lower() for v in df[label_col].dropna().unique()]
        is_sentiment = any(
            v in label_vals for v in ["positive", "negative", "0", "1", "pos", "neg"]
        )

    noisy_labels = []
    toxic_rows = []
    hate_speech_rows = []

    if text_col is not None:
        # Compile once instead of once per row.
        token_re = re.compile(r'\b\w+\b')
        hate_res = [(pattern, re.compile(pattern)) for pattern in HATE_SPEECH_PATTERNS]

        for idx, row in df.iterrows():
            try:
                row_id = int(idx)
            except (TypeError, ValueError):
                # Non-integer index label: report it verbatim.
                row_id = str(idx)

            raw_text = str(row[text_col])
            text = raw_text.lower()
            words = set(token_re.findall(text))

            # ── Label noise ──
            if is_sentiment:
                label = str(row[label_col]).lower()
                pos = sum(1 for w in POSITIVE_WORDS if w in words)
                neg = sum(1 for w in NEGATIVE_WORDS if w in words)
                if pos >= 3 and neg == 0 and label in ["negative", "neg", "0"]:
                    noisy_labels.append({"row": row_id, "text": raw_text[:100],
                                         "label": str(row[label_col]),
                                         "issue": "Strong positive text → negative label"})
                elif neg >= 3 and pos == 0 and label in ["positive", "pos", "1"]:
                    noisy_labels.append({"row": row_id, "text": raw_text[:100],
                                         "label": str(row[label_col]),
                                         "issue": "Strong negative text → positive label"})

            # ── Toxicity ──
            hits = [w for w in TOXIC_WORDS if w in words]
            if hits:
                toxic_rows.append({"row": row_id, "text": raw_text[:100],
                                   "toxic_words_found": hits[:5]})

            # ── Hate speech: record the first matching pattern only ──
            for pattern, compiled in hate_res:
                if compiled.search(text):
                    hate_speech_rows.append({"row": row_id,
                                             "text": raw_text[:100],
                                             "pattern_matched": pattern})
                    break

    noise_rate = round(len(noisy_labels) / max(len(df), 1), 3)
    tox_rate = round(len(toxic_rows) / max(len(df), 1), 3)
    if noise_rate > 0.1 or tox_rate > 0.1:
        severity = "HIGH"
    elif noise_rate > 0.03 or tox_rate > 0.03:
        severity = "MEDIUM"
    else:
        severity = "LOW"

    result = {
        "turn": 3,
        "turn_name": "label_noise_and_toxicity",
        "label_distribution": label_distribution,
        "class_imbalance_detected": class_imbalance,
        "noisy_labels_found": len(noisy_labels),
        "noisy_label_samples": noisy_labels[:10],
        "label_noise_rate": noise_rate,
        "toxic_rows_found": len(toxic_rows),
        "toxic_samples": toxic_rows[:10],
        "hate_speech_rows_found": len(hate_speech_rows),
        "hate_speech_samples": hate_speech_rows[:5],
        "toxicity_rate": tox_rate,
        "severity": severity,
        "status": "completed"
    }
    print(json.dumps(result, indent=2))
    return result
# ─────────────────────────────────────────────
# TURN 4 — Bias Detection + Linguistic Diversity
# ─────────────────────────────────────────────
def turn4_bias_and_language(df: pd.DataFrame) -> dict:
    """Turn 4: count bias-term mentions and measure language diversity.

    For each bias category, ``mentions`` is the number of rows whose text
    contains at least one term of that category. Single-word terms are
    matched against the row's word set; multi-word terms such as
    "third world" are matched as whole phrases in the text.

    Language is detected on at most the first 100 non-null texts. When no
    text could be analysed, ``english_percentage`` is reported as 1.0 so
    that a dataset without text is not treated as non-English.

    Args:
        df: Sample of the dataset.

    Returns:
        JSON-serialisable dict with the bias report, bias rate/score/severity
        and language statistics. The dict is also printed as JSON.
    """
    print("\n" + "=" * 60)
    print("TURN 4 — Bias Detection + Linguistic Diversity")
    print("=" * 60)

    def build_matchers(lexicon):
        # Pair each term with a phrase regex, or None for single words.
        matchers = []
        for term in lexicon:
            if " " in term:
                phrase = r"\s+".join(re.escape(part) for part in term.split())
                matchers.append((term, re.compile(r"\b" + phrase + r"\b")))
            else:
                matchers.append((term, None))
        return matchers

    def find_terms(matchers, text, words):
        # Lexicon order is preserved so counter tie-breaking stays stable.
        return [term for term, pattern in matchers
                if (pattern.search(text) if pattern is not None else term in words)]

    lexicons = {
        "gender_bias": GENDER_BIAS_WORDS,
        "racial_bias": RACIAL_BIAS_WORDS,
        "cultural_bias": CULTURAL_BIAS_WORDS,
    }
    bias_report = {
        category: {"mentions": 0, "words_found": {}, "affected_rows": []}
        for category in lexicons
    }

    text_cols = get_text_columns(df)
    text_col = text_cols[0] if text_cols else None

    if text_col is not None:
        token_re = re.compile(r'\b\w+\b')
        matchers = {category: build_matchers(lex) for category, lex in lexicons.items()}
        counters = {category: Counter() for category in lexicons}

        for idx, value in df[text_col].items():
            try:
                row_id = int(idx)
            except (TypeError, ValueError):
                # Non-integer index label: report it verbatim.
                row_id = str(idx)
            text = str(value).lower()
            words = set(token_re.findall(text))

            for category in lexicons:
                found = find_terms(matchers[category], text, words)
                if not found:
                    continue
                entry = bias_report[category]
                entry["mentions"] += 1
                counters[category].update(found)
                if len(entry["affected_rows"]) < 5:
                    entry["affected_rows"].append(row_id)

        for category in lexicons:
            bias_report[category]["words_found"] = dict(counters[category].most_common(10))

    total_bias = sum(v["mentions"] for v in bias_report.values())
    # Normalised by rows x 3 categories, so the rate lies in [0, 1].
    bias_rate = round(total_bias / max(len(df) * 3, 1), 3)
    bias_score = round(max(0.0, 1.0 - bias_rate), 3)

    # ── Linguistic Diversity ──
    lang_counts = Counter()
    non_english = []
    if text_col is not None:
        sample = df[text_col].dropna().astype(str).tolist()[:100]
        for i, text in enumerate(sample):
            lang = detect_language(text)
            lang_counts[lang] += 1
            if lang not in ("en", "unknown") and len(non_english) < 5:
                non_english.append({"row": i, "text": text[:80], "detected_lang": lang})

    total_detected = sum(lang_counts.values())
    if total_detected:
        english_pct = round(lang_counts.get("en", 0) / total_detected, 3)
    else:
        # Nothing analysed: report "no evidence of non-English content".
        english_pct = 1.0
    is_multilingual = any(lang not in ("en", "unknown") for lang in lang_counts)

    result = {
        "turn": 4,
        "turn_name": "bias_and_linguistic_diversity",
        "bias_report": bias_report,
        "total_bias_mentions": total_bias,
        "bias_rate": bias_rate,
        "bias_score": bias_score,
        "bias_severity": "HIGH" if bias_rate > 0.3 else "MEDIUM" if bias_rate > 0.1 else "LOW",
        "language_distribution": dict(lang_counts),
        "english_percentage": english_pct,
        "is_multilingual": is_multilingual,
        "non_english_samples": non_english,
        "language_diversity_flag": is_multilingual,
        "status": "completed"
    }
    print(json.dumps(result, indent=2))
    return result
# ─────────────────────────────────────────────
# TURN 5 — Data Leakage + Domain Drift + Final Report
# ─────────────────────────────────────────────
def _find_leakage_pairs(texts1, texts2, first_split, second_split):
    """Return every cross-split text pair whose similarity is >= 0.90."""
    pairs = []
    for i, first_text in enumerate(texts1):
        for j, second_text in enumerate(texts2):
            ratio = similarity_ratio(first_text, second_text)
            if ratio >= 0.90:
                pairs.append({
                    f"{first_split}_row": i, f"{second_split}_row": j,
                    "similarity": round(ratio, 3),
                    f"{first_split}_text": first_text[:80],
                    f"{second_split}_text": second_text[:80]
                })
    return pairs


def _measure_domain_drift(texts):
    """Compare the two halves of *texts*; return ``(drift_detected, details)``.

    Drift is reported when the vocabulary Jaccard overlap of the halves is
    below 0.40 or their average text lengths differ by more than 50
    characters. Fewer than 20 texts yields ``(False, {})``.
    """
    if len(texts) < 20:
        return False, {}

    mid = len(texts) // 2
    first_half, second_half = texts[:mid], texts[mid:]

    # float(): np.mean returns NumPy scalars; keep plain Python numbers so
    # the comparisons below produce plain bools.
    avg_len_first = round(float(np.mean([len(t) for t in first_half])), 1)
    avg_len_second = round(float(np.mean([len(t) for t in second_half])), 1)

    def vocab(txts):
        found = set()
        for t in txts:
            found.update(re.findall(r'\b\w+\b', t.lower()))
        return found

    v1, v2 = vocab(first_half), vocab(second_half)
    jaccard = round(len(v1 & v2) / max(len(v1 | v2), 1), 3)
    length_gap = abs(avg_len_first - avg_len_second)

    # bool(): a numpy.bool_ here would make json.dumps raise TypeError.
    drift_detected = bool(jaccard < 0.40 or length_gap > 50)
    details = {
        "avg_text_length_first_half": avg_len_first,
        "avg_text_length_second_half": avg_len_second,
        "length_drift": round(length_gap, 1),
        "vocabulary_overlap_jaccard": jaccard,
        "drift_detected": drift_detected,
        "interpretation": (
            "Significant topic/domain drift — dataset may not be uniform."
            if drift_detected else
            "No significant drift — dataset appears topically consistent."
        )
    }
    return drift_detected, details


def turn5_leakage_drift_report(
    df: pd.DataFrame, dataset_name: str, first_split: str,
    t1: dict, t2: dict, t3: dict, t4: dict,
    config: str = None
) -> dict:
    """Turn 5: check split leakage and domain drift, then build the final report.

    Args:
        df: Sample of the dataset (the first split).
        dataset_name: Dataset id, used to fetch a second split.
        first_split: Name of the split *df* was loaded from.
        t1: Result of turn 1 (embedded in the report).
        t2: Result of turn 2 (near-duplicate and annotation counts are read).
        t3: Result of turn 3 (label-noise and toxicity rates are read).
        t4: Result of turn 4 (bias score and English percentage are read).
        config: Optional dataset config name for the second-split fetch.

    Returns:
        JSON-serialisable final audit report containing per-check scores
        (each clamped to [0.01, 0.99]), the weighted audit score, a verdict,
        recommendations and the results of turns 1-4. The dict is also
        printed as JSON.
    """
    print("\n" + "=" * 60)
    print("TURN 5 — Data Leakage + Domain Drift + Final Audit Report")
    print("=" * 60)

    text_cols = get_text_columns(df)
    text_col = text_cols[0] if text_cols else None

    # ── Data Leakage ──
    leakage_pairs = []
    leakage_rate = 0.0
    print(" 🔍 Fetching second split for leakage check...")
    df2, split2 = fetch_second_split(dataset_name, first_split, config=config, num_rows=50)
    if text_col is not None and not df2.empty:
        texts1 = df[text_col].dropna().astype(str).tolist()[:50]
        texts2 = df2[text_col].dropna().astype(str).tolist()[:50] if text_col in df2.columns else []
        leakage_pairs = _find_leakage_pairs(texts1, texts2, first_split, split2)
        # Rate = share of first-split rows with at least one match. Counting
        # pairs instead would let one row matching several others push the
        # rate above 1.
        leaked_rows = {pair[f"{first_split}_row"] for pair in leakage_pairs}
        leakage_rate = round(len(leaked_rows) / max(len(texts1), 1), 3)

    # ── Domain Drift ──
    drift_detected = False
    drift_details = {}
    if text_col is not None:
        drift_detected, drift_details = _measure_domain_drift(
            df[text_col].dropna().astype(str).tolist()
        )

    # ── Scoring ──
    row_count = max(len(df), 1)
    dup_score = clamp_score(1.0 - (t2.get("near_duplicates_found", 0) / row_count) * 5)
    ann_score = clamp_score(1.0 - (t2.get("annotation_inconsistencies_found", 0) / row_count) * 10)
    label_score = clamp_score(1.0 - t3.get("label_noise_rate", 0) * 10)
    tox_score = clamp_score(1.0 - t3.get("toxicity_rate", 0) * 5)
    bias_score = clamp_score(t4.get("bias_score", 1.0))
    lang_score = clamp_score(t4.get("english_percentage", 1.0))
    leakage_score = clamp_score(1.0 - leakage_rate * 5)
    drift_score = clamp_score(0.6 if drift_detected else 1.0)

    # Weights sum to 1.0.
    audit_score = clamp_score(
        dup_score * 0.12 +
        ann_score * 0.13 +
        label_score * 0.15 +
        tox_score * 0.15 +
        bias_score * 0.15 +
        lang_score * 0.10 +
        leakage_score * 0.10 +
        drift_score * 0.10
    )

    # ── Verdict ──
    if audit_score >= 0.85:
        verdict = "EXCELLENT"
        summary = "Dataset passes all hard quality checks. Ready for training."
    elif audit_score >= 0.70:
        verdict = "GOOD"
        summary = "Dataset is usable but has some issues worth addressing."
    elif audit_score >= 0.50:
        verdict = "NEEDS_WORK"
        summary = "Multiple quality/bias issues found. Significant cleanup needed."
    else:
        verdict = "POOR"
        summary = "Serious quality, bias, or leakage issues. Not recommended without major fixes."

    # ── Recommendations ──
    recs = []
    if dup_score < 0.8:
        recs.append("Remove near-duplicate rows to prevent overfitting.")
    if ann_score < 0.8:
        recs.append("Re-annotate inconsistently labelled similar texts.")
    if label_score < 0.8:
        recs.append("Review and correct noisy labels.")
    if tox_score < 0.8:
        recs.append("Filter toxic/hate speech content before training.")
    if bias_score < 0.7:
        recs.append("Audit and balance gender, racial, and cultural representation.")
    if lang_score < 0.8:
        recs.append("Filter non-English rows if multilingual content is unintentional.")
    if leakage_score < 0.8:
        recs.append("Data leakage detected between splits — re-split the dataset.")
    if drift_detected:
        recs.append("Topic drift detected — verify dataset was collected from a consistent source.")
    if not recs:
        recs.append("Dataset passed all checks. Consider expanding size for better generalization.")

    result = {
        "task_id": "task3_hard",
        "turn": 5,
        "turn_name": "leakage_drift_final_report",
        "dataset_name": dataset_name,
        "total_rows_audited": len(df),
        "data_leakage": {
            "splits_compared": [first_split, split2] if split2 else [first_split],
            "leakage_pairs_found": len(leakage_pairs),
            "leakage_pairs_sample": leakage_pairs[:5],
            "leakage_rate": leakage_rate
        },
        "domain_drift": drift_details,
        "scores": {
            "near_duplicate_score": dup_score,
            "annotation_consistency_score": ann_score,
            "label_noise_score": label_score,
            "toxicity_score": tox_score,
            "bias_score": bias_score,
            "language_consistency_score": lang_score,
            "data_leakage_score": leakage_score,
            "domain_drift_score": drift_score
        },
        "audit_score": audit_score,
        "verdict": verdict,
        "summary": summary,
        "recommendations": recs,
        "turn_results": {
            "turn1_overview": t1,
            "turn2_near_dupes_annotation": t2,
            "turn3_label_noise_toxicity": t3,
            "turn4_bias_language": t4
        },
        "status": "completed"
    }
    print(json.dumps(result, indent=2))
    return result
# ─────────────────────────────────────────────
# MAIN
# ─────────────────────────────────────────────
def main():
    """Run the interactive five-turn audit and save the final report.

    Prompts for a dataset name, runs turns 1-5 with an Enter prompt between
    turns, prints the final JSON and writes it to ``task3_output.json``.
    Exits with status 1 when the dataset cannot be loaded.
    """
    print("=" * 60)
    print(" TASK 3 — BIAS & QUALITY AUDIT (HARD, 5-TURN)")
    print(" Checks: Near-Dupes | Annotation Inconsistency |")
    print(" Label Noise | Toxicity | Bias | Language | Leakage | Drift")
    print("=" * 60)

    requested = input("\nEnter HuggingFace dataset name (e.g. imdb OR cardiffnlp/tweet_eval hate): ").strip()
    if not requested:
        requested = "imdb"
    dataset_name, config = clean_dataset_name(requested)
    print(f" 📌 Dataset : {dataset_name}")
    if config:
        print(f" 📌 Config : {config}")

    df, split = fetch_dataset_rows(dataset_name, config=config, num_rows=100)
    if df.empty:
        print("❌ Could not load dataset.")
        print(" Tips:")
        print(" • Use just the name, e.g. imdb or dair-ai/emotion")
        print(" • For tweet_eval use: cardiffnlp/tweet_eval hate")
        print(" • Do NOT paste the full URL")
        # sys.exit rather than the site-provided exit(), which is not
        # guaranteed to exist (e.g. under ``python -S``).
        sys.exit(1)

    t1 = turn1_overview(df, dataset_name, split)
    input("\n⏎ Press Enter → Turn 2 (Near-Dupes + Annotation)...")
    t2 = turn2_near_dupes_and_annotation(df)
    input("\n⏎ Press Enter → Turn 3 (Label Noise + Toxicity)...")
    t3 = turn3_label_noise_and_toxicity(df)
    input("\n⏎ Press Enter → Turn 4 (Bias + Language)...")
    t4 = turn4_bias_and_language(df)
    input("\n⏎ Press Enter → Turn 5 (Leakage + Drift + Final Report)...")
    final = turn5_leakage_drift_report(df, dataset_name, split, t1, t2, t3, t4, config=config)

    print("\n" + "=" * 60)
    print("✅ TASK 3 COMPLETE — Copy the JSON below into grader3.py")
    print("=" * 60)
    print(json.dumps(final, indent=2))

    # Explicit encoding so the file does not depend on the platform locale.
    with open("task3_output.json", "w", encoding="utf-8") as f:
        json.dump(final, f, indent=2)
    print("\n📄 Output saved to: task3_output.json")


if __name__ == "__main__":
    main()