"""
task3_hard.py
=============
Task 3 — Bias & Quality Audit (Hard, 5-Turn)
OpenEnv Project | Meta × Hugging Face Hackathon

What it does:
    Runs a 5-turn deep audit on a HuggingFace dataset covering the hardest
    quality and bias checks. Column profiling is followed by eight scored
    checks that are combined into one weighted audit score.

Turns:
    Turn 1: Dataset overview + column profiling
    Turn 2: Near-duplicate detection + annotation inconsistency
    Turn 3: Label noise + toxicity / hate speech
    Turn 4: Bias detection (gender/racial/cultural) + linguistic diversity
    Turn 5: Data leakage + domain drift + final audit report

Usage:
    python task3_hard.py
    → Enter dataset name (e.g. dair-ai/emotion)
    → Press Enter to progress through each turn
    → Output saved to task3_output.json
    → Copy JSON output into grader3.py

Compatible with: Google Colab, Python 3.8+

Requirements:
    pip install datasets pandas numpy scikit-learn langdetect
    (difflib is part of the standard library and needs no installation.)
"""

import json
import re
import difflib
import warnings
import subprocess
import sys
from collections import Counter, defaultdict

# Silence library warnings before the heavy third-party imports below run.
warnings.filterwarnings("ignore")


def clamp_score(x):
    """Clamp *x* into [0.01, 0.99] and round it to three decimals.

    Scores are kept strictly inside the open interval (0, 1) so that a
    component can never be reported as a perfect 1.0 or an absolute 0.0.
    """
    return round(max(0.01, min(0.99, x)), 3)


# ─────────────────────────────────────────────
# INSTALL DEPS
# ─────────────────────────────────────────────

def install(pkg):
    """Install *pkg* quietly with pip, using the running interpreter.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    subprocess.check_call([sys.executable, "-m", "pip", "install", pkg, "-q"])


# pip distribution name -> importable module name. The two differ for
# scikit-learn (module ``sklearn``); deriving the module name by replacing
# "-" with "_" never succeeds for it, which re-ran pip on every start-up.
_REQUIRED_PACKAGES = {
    "datasets": "datasets",
    "pandas": "pandas",
    "numpy": "numpy",
    "scikit-learn": "sklearn",
    "langdetect": "langdetect",
}

for _dist_name, _module_name in _REQUIRED_PACKAGES.items():
    try:
        __import__(_module_name)
    except ImportError:
        install(_dist_name)

import pandas as pd
import numpy as np
from datasets import load_dataset
from langdetect import DetectorFactory, LangDetectException, detect

# langdetect is non-deterministic unless seeded; fix the seed so that two
# runs over the same rows report the same language distribution.
DetectorFactory.seed = 0
# ─────────────────────────────────────────────
# WORD LISTS
# ─────────────────────────────────────────────
# Plain keyword lexicons used by the heuristic checks. All entries are
# lower-case; order is preserved because hit lists are reported in it.

GENDER_BIAS_WORDS = [
    # pronouns
    "he", "she", "him", "her", "his", "hers",
    # generic gendered nouns
    "man", "woman", "men", "women", "male", "female", "boy", "girl",
    # family roles
    "husband", "wife", "father", "mother",
    "son", "daughter", "brother", "sister",
    # titles and forms of address
    "mr", "mrs", "ms", "sir", "madam",
    "king", "queen", "prince", "princess",
    # gendered occupations
    "actor", "actress", "waiter", "waitress",
    "policeman", "policewoman", "fireman", "stewardess",
    # gendered collective terms
    "mankind", "manpower",
]

RACIAL_BIAS_WORDS = [
    # race / ethnicity descriptors
    "black", "white", "asian", "hispanic", "latino", "latina",
    "african", "caucasian", "arab",
    # religion
    "jewish", "muslim", "christian", "hindu",
    # migration / belonging
    "immigrant", "refugee", "foreigner", "native", "minority",
    # generic terms
    "ethnic", "race", "racial", "colored", "indigenous", "tribal",
]

CULTURAL_BIAS_WORDS = [
    # regional framing
    "western", "eastern", "american", "european", "third world",
    # development / value-laden framing
    "developing", "primitive", "civilized", "savage", "barbarian",
    "exotic", "oriental", "traditional", "modern",
    "backward", "progressive", "uncivilized",
]

TOXIC_WORDS = [
    "idiot", "stupid", "moron", "loser", "trash", "garbage", "hate",
    "kill", "die", "retard", "freak", "ugly", "fat", "dumb",
    "worthless", "scum", "disgusting", "pathetic",
    "monster", "pervert", "creep",
]

# Regular expressions applied to lower-cased text.
HATE_SPEECH_PATTERNS = [
    # sweeping statements about a group
    r'\b(all|those|these)\s+(black|white|asian|muslim|jewish|gay|trans)\s+(people|guys|men|women)\s+(are|should|must|deserve)\b',
    # exclusionary commands
    r'\b(go back to)\s+\w+',
    r'\b(ban all)\s+\w+',
    # othering statements
    r'\b(they|them)\s+(don\'t belong|should leave|are inferior|are criminals)\b',
]

POSITIVE_WORDS = [
    "great", "excellent", "amazing", "wonderful", "fantastic", "best",
    "good", "beautiful", "smart", "brilliant", "awesome", "perfect", "love",
    "superb", "outstanding", "exceptional", "magnificent",
    "enjoyed", "loved",
]

NEGATIVE_WORDS = [
    "terrible", "awful", "horrible", "disgusting", "hate", "worst",
    "bad", "poor", "ugly", "stupid", "dumb", "trash", "garbage",
    "useless", "worthless", "pathetic", "loser", "failure",
    "boring", "waste", "dreadful",
]
# ─────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────

# Datasets that cannot be loaded without a config name, mapped to the config
# used when the caller does not supply one. Shared by both fetch helpers so
# the leakage check loads the same config as the primary split.
_CONFIG_DEFAULTS = {
    "cardiffnlp/tweet_eval": "hate",
    "nyu-mll/glue": "sst2",
    "glue": "sst2",
}

# Order in which splits are tried.
_SPLIT_ORDER = ("train", "test", "validation")

# Upper bound on rows compared pairwise in Turn 2 (the comparison is O(n²)).
_PAIRWISE_LIMIT = 150

# Label values that make Turn 3 treat the label column as sentiment.
_NEGATIVE_LABELS = ("negative", "neg", "0")
_POSITIVE_LABELS = ("positive", "pos", "1")
_SENTIMENT_LABELS = frozenset(_NEGATIVE_LABELS + _POSITIVE_LABELS)

_WORD_RE = re.compile(r'\b\w+\b')


def clean_dataset_name(name: str) -> tuple:
    """Normalise user input into ``(dataset_name, config_or_None)``.

    Accepts either:
      - plain name:     imdb
      - name + config:  cardiffnlp/tweet_eval hate
      - full HF URL:    https://huggingface.co/datasets/cardiffnlp/tweet_eval

    Input that does not split into exactly two whitespace-separated parts is
    returned unchanged (after stripping) with a ``None`` config.
    """
    name = name.strip()

    # Strip a full HF URL down to just the dataset path.
    if name.startswith("http"):
        name = re.sub(r"https?://huggingface\.co/datasets/", "", name).strip("/")

    # A config may follow the name after a space, e.g. "cardiffnlp/tweet_eval hate".
    parts = name.split()
    if len(parts) == 2:
        return parts[0], parts[1]
    return name, None


def _take(iterable, n):
    """Return the first *n* items of *iterable* as a list, consuming no more."""
    taken = []
    if n <= 0:
        return taken
    for item in iterable:
        taken.append(item)
        if len(taken) >= n:
            break
    return taken


def _stream_rows(dataset_name, config, split, num_rows):
    """Stream at most *num_rows* rows of one split and return them as a list."""
    if config:
        ds = load_dataset(dataset_name, config, split=split, streaming=True)
    else:
        ds = load_dataset(dataset_name, split=split, streaming=True)
    return _take(ds, num_rows)


def fetch_dataset_rows(dataset_name: str, config: str = None, num_rows: int = 100):
    """Fetch rows — tries train, then test, then validation split.

    Args:
        dataset_name: HuggingFace dataset identifier.
        config: Optional config name; when omitted a known default is used
            for datasets listed in ``_CONFIG_DEFAULTS``.
        num_rows: Maximum number of rows to stream.

    Returns:
        ``(DataFrame, split_name)`` for the first split that yields rows, or
        ``(empty DataFrame, None)`` when every split fails.
    """
    label = f"{dataset_name}" + (f" ({config})" if config else "")
    print(f"\n📦 Loading dataset: {label} ...")

    if config is None:
        config = _CONFIG_DEFAULTS.get(dataset_name)

    for split in _SPLIT_ORDER:
        try:
            rows = _stream_rows(dataset_name, config, split, num_rows)
            if rows:
                df = pd.DataFrame(rows)
                print(f"✅ Loaded {len(df)} rows, {len(df.columns)} columns from '{split}' split")
                print(f" Columns: {list(df.columns)}")
                return df, split
        except Exception as e:
            # Deliberate best-effort: a missing split or loader error only
            # means the next split is tried.
            print(f" ⚠️ Split '{split}' failed: {str(e)[:80]}")
            continue

    print("❌ Could not load any split.")
    return pd.DataFrame(), None


def fetch_second_split(dataset_name: str, first_split: str, config: str = None, num_rows: int = 50):
    """Fetch a different split for the data leakage check.

    The same default config as ``fetch_dataset_rows`` is applied when
    *config* is ``None``; otherwise config-only datasets could never be
    loaded here and the leakage check would be skipped silently.

    Returns:
        ``(DataFrame, split_name)`` for the first split other than
        *first_split* that yields rows, or ``(empty DataFrame, None)``.
    """
    if config is None:
        config = _CONFIG_DEFAULTS.get(dataset_name)

    for split in _SPLIT_ORDER:
        if split == first_split:
            continue
        try:
            rows = _stream_rows(dataset_name, config, split, num_rows)
            if rows:
                return pd.DataFrame(rows), split
        except Exception:
            # Best-effort: an unavailable split is simply skipped.
            continue
    return pd.DataFrame(), None


def _is_textual_dtype(series) -> bool:
    """True for object columns and for pandas' dedicated string dtype."""
    return series.dtype == object or isinstance(series.dtype, pd.StringDtype)


def _safe_nunique(series) -> int:
    """Count distinct values, tolerating unhashable cells (lists, dicts).

    ``Series.nunique`` raises ``TypeError`` on unhashable values; in that
    case distinct values are counted by their string representation.
    """
    try:
        return int(series.nunique())
    except TypeError:
        return int(series.dropna().astype(str).nunique())


def get_text_columns(df: pd.DataFrame):
    """Return columns where more than 30% of non-null values are strings
    longer than 20 characters."""
    selected = []
    for c in df.columns:
        column = df[c]
        if not _is_textual_dtype(column):
            continue
        values = column.dropna().tolist()
        if not values:
            continue
        long_strings = sum(1 for v in values if isinstance(v, str) and len(v) > 20)
        if long_strings / len(values) > 0.3:
            selected.append(c)
    return selected


def get_label_columns(df: pd.DataFrame):
    """Return columns that look like labels: between 2 and 20 distinct values.

    Columns already classified as long text are excluded (a text column with
    few distinct rows is not a label), and so are columns holding unhashable
    values, for which a distinct count cannot be computed directly.
    """
    text_columns = set(get_text_columns(df))
    selected = []
    for c in df.columns:
        if c in text_columns:
            continue
        try:
            distinct = df[c].nunique()
        except TypeError:
            continue
        if 1 < distinct <= 20:
            selected.append(c)
    return selected


def similarity_ratio(s1: str, s2: str) -> float:
    """Return the difflib similarity (0.0–1.0) of two strings, ignoring case
    and surrounding whitespace."""
    return difflib.SequenceMatcher(None, s1.lower().strip(), s2.lower().strip()).ratio()


def detect_language(text: str) -> str:
    """Return the language code detected for *text*, or ``"unknown"`` when
    detection fails."""
    try:
        return detect(str(text))
    except LangDetectException:
        return "unknown"


def _iter_text_rows(df, text_col):
    """Yield ``(row_position, text)`` for every row whose text is not missing.

    Positions refer to the full DataFrame, so reported row numbers stay valid
    when missing texts are skipped.
    """
    present = df[text_col].notna().tolist()
    for position, (value, is_present) in enumerate(zip(df[text_col].tolist(), present)):
        if is_present:
            yield position, str(value)


def _tokens(text):
    """Return the set of word tokens in *text*."""
    return set(_WORD_RE.findall(text))


def _match_terms(terms, text, tokens):
    """Return the entries of *terms* present in *text*, in lexicon order.

    Single words must match a whole token (so "hate" does not fire on
    "whatever"); multi-word entries such as "third world" are matched as a
    phrase, since they can never appear in a set of single tokens.
    """
    found = []
    for term in terms:
        if " " in term:
            phrase = r"\b" + r"\s+".join(re.escape(part) for part in term.split()) + r"\b"
            if re.search(phrase, text):
                found.append(term)
        elif term in tokens:
            found.append(term)
    return found


# ─────────────────────────────────────────────
# TURN 1 — Dataset Overview + Column Profiling
# ─────────────────────────────────────────────

def turn1_overview(df: pd.DataFrame, dataset_name: str, split: str) -> dict:
    """Profile every column and flag conditions that limit later turns.

    Prints the result as JSON and returns it. Per column it reports dtype,
    missing count/rate, distinct count, average string length (textual
    columns only) and up to three truncated sample values.
    """
    print("\n" + "="*60)
    print("TURN 1 — Dataset Overview & Column Profiling")
    print("="*60)

    row_count = len(df)
    columns_info = {}
    for col in df.columns:
        column = df[col]
        non_null = column.dropna()
        missing = int(column.isnull().sum())

        avg_len = None
        if _is_textual_dtype(column) and not non_null.empty:
            # float() keeps the value a plain Python number for JSON output.
            avg_len = round(float(non_null.astype(str).apply(len).mean()), 1)

        columns_info[col] = {
            "dtype": str(column.dtype),
            "missing": missing,
            "missing_rate": round(missing / max(row_count, 1), 3),
            "unique_values": _safe_nunique(column),
            "avg_text_length": avg_len,
            "sample": [str(s)[:60] for s in non_null.iloc[:3].tolist()]
        }

    text_cols = get_text_columns(df)
    label_cols = get_label_columns(df)

    flags = []
    if row_count < 20:
        flags.append("Very small dataset — statistical checks may be unreliable")
    if not text_cols:
        flags.append("No long-text columns — bias/toxicity checks will be limited")
    if not label_cols:
        flags.append("No label columns — label noise check will be skipped")

    result = {
        "turn": 1,
        "turn_name": "dataset_overview",
        "dataset_name": dataset_name,
        "split_used": split,
        "total_rows": row_count,
        "total_columns": len(df.columns),
        "column_names": list(df.columns),
        "columns_info": columns_info,
        "text_columns_detected": text_cols,
        "label_columns_detected": label_cols,
        "initial_flags": flags,
        "status": "completed"
    }
    print(json.dumps(result, indent=2))
    return result


# ─────────────────────────────────────────────
# TURN 2 — Near-Duplicates + Annotation Inconsistency
# ─────────────────────────────────────────────

def turn2_near_dupes_and_annotation(df: pd.DataFrame) -> dict:
    """Find near-duplicate texts and similar texts with conflicting labels.

    Only the first text column and first label column are inspected, and at
    most ``_PAIRWISE_LIMIT`` non-missing texts are compared pairwise.

    - Near-duplicate: similarity in [0.85, 1.0).
    - Annotation inconsistency: similarity >= 0.80 with different labels.

    Texts and labels are taken from the same rows, so a missing text cannot
    shift labels onto the wrong text. ``row_i``/``row_j`` are DataFrame row
    positions. Prints the result as JSON and returns it.
    """
    print("\n" + "="*60)
    print("TURN 2 — Near-Duplicate Detection + Annotation Inconsistency")
    print("="*60)

    text_cols = get_text_columns(df)
    label_cols = get_label_columns(df)

    near_dupes = []
    annotation_issues = []

    if text_cols:
        text_col = text_cols[0]
        labels = df[label_cols[0]].astype(str).tolist() if label_cols else None
        rows = _take(_iter_text_rows(df, text_col), _PAIRWISE_LIMIT)

        # One similarity computation per pair feeds both checks.
        for a, (row_i, text_i) in enumerate(rows):
            for row_j, text_j in rows[a + 1:]:
                ratio = similarity_ratio(text_i, text_j)

                if 0.85 <= ratio < 1.0:
                    near_dupes.append({
                        "row_i": row_i,
                        "row_j": row_j,
                        "similarity": round(ratio, 3),
                        "text_i": text_i[:80],
                        "text_j": text_j[:80]
                    })

                if labels is not None and ratio >= 0.80 and labels[row_i] != labels[row_j]:
                    annotation_issues.append({
                        "row_i": row_i,
                        "row_j": row_j,
                        "similarity": round(ratio, 3),
                        "text_i": text_i[:80],
                        "text_j": text_j[:80],
                        "label_i": labels[row_i],
                        "label_j": labels[row_j],
                        "issue": "Similar texts have different labels"
                    })

    total = len(near_dupes) + len(annotation_issues)
    severity = "HIGH" if total > 8 else "MEDIUM" if total > 3 else "LOW"

    result = {
        "turn": 2,
        "turn_name": "near_duplicates_and_annotation_inconsistency",
        "near_duplicates_found": len(near_dupes),
        "near_duplicate_pairs": near_dupes[:10],
        "annotation_inconsistencies_found": len(annotation_issues),
        "annotation_inconsistency_samples": annotation_issues[:10],
        "total_issues": total,
        "severity": severity,
        "status": "completed"
    }
    print(json.dumps(result, indent=2))
    return result


# ─────────────────────────────────────────────
# TURN 3 — Label Noise + Toxicity / Hate Speech
# ─────────────────────────────────────────────

def turn3_label_noise_and_toxicity(df: pd.DataFrame) -> dict:
    """Estimate label noise and flag toxic / hate-speech rows.

    - Class imbalance: smallest class is under 30% of the largest.
    - Label noise: only when the label values include one of
      ``_SENTIMENT_LABELS``. A row is noisy when its text holds at least
      three lexicon words of one polarity, none of the other, and the label
      says the opposite. Note "0"/"1" are read as negative/positive, which
      is a heuristic and may not hold for every dataset.
    - Toxicity: any ``TOXIC_WORDS`` entry appears as a whole word.
    - Hate speech: first matching ``HATE_SPEECH_PATTERNS`` regex per row.

    Severity is driven by the label-noise and toxicity rates only. Prints
    the result as JSON and returns it.
    """
    print("\n" + "="*60)
    print("TURN 3 — Label Noise + Toxicity / Hate Speech Detection")
    print("="*60)

    text_cols = get_text_columns(df)
    label_cols = get_label_columns(df)
    text_col = text_cols[0] if text_cols else None
    label_col = label_cols[0] if label_cols else None

    # ── Label distribution ──
    label_distribution = {}
    class_imbalance = False
    if label_col is not None:
        counts = df[label_col].value_counts().to_dict()
        label_distribution = {str(k): int(v) for k, v in counts.items()}
        if counts:
            largest, smallest = max(counts.values()), min(counts.values())
            # bool() guards against a numpy boolean, which json cannot encode.
            class_imbalance = bool(largest > 0 and (smallest / largest) < 0.3)

    # ── Label Noise ──
    noisy_labels = []
    if text_col is not None and label_col is not None:
        label_vals = {str(v).lower() for v in df[label_col].dropna().unique()}
        if not label_vals.isdisjoint(_SENTIMENT_LABELS):
            raw_labels = df[label_col].tolist()
            for row, raw_text in _iter_text_rows(df, text_col):
                text = raw_text.lower()
                tokens = _tokens(text)
                label = str(raw_labels[row]).lower()
                # Whole-word matching, consistent with the toxicity check:
                # substring matching counted "whatever" as "hate".
                pos = len(_match_terms(POSITIVE_WORDS, text, tokens))
                neg = len(_match_terms(NEGATIVE_WORDS, text, tokens))
                if pos >= 3 and neg == 0 and label in _NEGATIVE_LABELS:
                    issue = "Strong positive text → negative label"
                elif neg >= 3 and pos == 0 and label in _POSITIVE_LABELS:
                    issue = "Strong negative text → positive label"
                else:
                    continue
                noisy_labels.append({
                    "row": row,
                    "text": raw_text[:100],
                    "label": str(raw_labels[row]),
                    "issue": issue
                })

    # ── Toxicity & Hate Speech ──
    toxic_rows = []
    hate_speech_rows = []
    if text_col is not None:
        for row, raw_text in _iter_text_rows(df, text_col):
            text = raw_text.lower()
            hits = _match_terms(TOXIC_WORDS, text, _tokens(text))
            if hits:
                toxic_rows.append({
                    "row": row,
                    "text": raw_text[:100],
                    "toxic_words_found": hits[:5]
                })
            for pattern in HATE_SPEECH_PATTERNS:
                if re.search(pattern, text):
                    hate_speech_rows.append({
                        "row": row,
                        "text": raw_text[:100],
                        "pattern_matched": pattern
                    })
                    break  # one entry per row is enough

    noise_rate = round(len(noisy_labels) / max(len(df), 1), 3)
    tox_rate = round(len(toxic_rows) / max(len(df), 1), 3)
    if noise_rate > 0.1 or tox_rate > 0.1:
        severity = "HIGH"
    elif noise_rate > 0.03 or tox_rate > 0.03:
        severity = "MEDIUM"
    else:
        severity = "LOW"

    result = {
        "turn": 3,
        "turn_name": "label_noise_and_toxicity",
        "label_distribution": label_distribution,
        "class_imbalance_detected": class_imbalance,
        "noisy_labels_found": len(noisy_labels),
        "noisy_label_samples": noisy_labels[:10],
        "label_noise_rate": noise_rate,
        "toxic_rows_found": len(toxic_rows),
        "toxic_samples": toxic_rows[:10],
        "hate_speech_rows_found": len(hate_speech_rows),
        "hate_speech_samples": hate_speech_rows[:5],
        "toxicity_rate": tox_rate,
        "severity": severity,
        "status": "completed"
    }
    print(json.dumps(result, indent=2))
    return result


# ─────────────────────────────────────────────
# TURN 4 — Bias Detection + Linguistic Diversity
# ─────────────────────────────────────────────

def turn4_bias_and_language(df: pd.DataFrame) -> dict:
    """Count bias-lexicon mentions and profile the languages in the text.

    For each category (gender, racial, cultural) a row counts as one mention
    when it contains at least one lexicon term. ``bias_rate`` is the total
    mentions divided by ``rows * 3`` and ``bias_score`` is ``1 - bias_rate``.
    Language detection runs on the first 100 non-missing texts. Prints the
    result as JSON and returns it.
    """
    print("\n" + "="*60)
    print("TURN 4 — Bias Detection + Linguistic Diversity")
    print("="*60)

    text_cols = get_text_columns(df)
    text_col = text_cols[0] if text_cols else None

    # Lexicons are looked up here (not at import time) so this block only
    # needs them once the function actually runs.
    categories = (
        ("gender_bias", GENDER_BIAS_WORDS),
        ("racial_bias", RACIAL_BIAS_WORDS),
        ("cultural_bias", CULTURAL_BIAS_WORDS),
    )
    bias_report = {
        name: {"mentions": 0, "words_found": {}, "affected_rows": []}
        for name, _ in categories
    }
    term_counts = {name: Counter() for name, _ in categories}

    if text_col is not None:
        for row, raw_text in _iter_text_rows(df, text_col):
            text = raw_text.lower()
            tokens = _tokens(text)
            for name, lexicon in categories:
                found = _match_terms(lexicon, text, tokens)
                if not found:
                    continue
                entry = bias_report[name]
                entry["mentions"] += 1
                term_counts[name].update(found)
                if len(entry["affected_rows"]) < 5:
                    entry["affected_rows"].append(row)

        for name, _ in categories:
            bias_report[name]["words_found"] = dict(term_counts[name].most_common(10))

    total_bias = sum(v["mentions"] for v in bias_report.values())
    bias_rate = round(total_bias / max(len(df) * 3, 1), 3)
    bias_score = round(max(0.0, 1.0 - bias_rate), 3)

    # ── Linguistic Diversity ──
    lang_counts = Counter()
    non_english = []
    if text_col is not None:
        for row, text in _take(_iter_text_rows(df, text_col), 100):
            lang = detect_language(text)
            lang_counts[lang] += 1
            if lang not in ("en", "unknown") and len(non_english) < 5:
                non_english.append({"row": row, "text": text[:80], "detected_lang": lang})

    total_detected = sum(lang_counts.values())
    english_pct = round(lang_counts.get("en", 0) / max(total_detected, 1), 3)
    is_multilingual = any(lang not in ("en", "unknown") for lang in lang_counts)

    if bias_rate > 0.3:
        bias_severity = "HIGH"
    elif bias_rate > 0.1:
        bias_severity = "MEDIUM"
    else:
        bias_severity = "LOW"

    result = {
        "turn": 4,
        "turn_name": "bias_and_linguistic_diversity",
        "bias_report": bias_report,
        "total_bias_mentions": total_bias,
        "bias_rate": bias_rate,
        "bias_score": bias_score,
        "bias_severity": bias_severity,
        "language_distribution": dict(lang_counts),
        "english_percentage": english_pct,
        "is_multilingual": is_multilingual,
        "non_english_samples": non_english,
        "language_diversity_flag": is_multilingual,
        "status": "completed"
    }
    print(json.dumps(result, indent=2))
    return result


# ─────────────────────────────────────────────
# TURN 5 — Data Leakage + Domain Drift + Final Report
# ─────────────────────────────────────────────

def _mean_length(texts):
    """Average character length of *texts*, rounded to one decimal.

    Plain Python arithmetic keeps the value (and anything compared against
    it) a builtin type that ``json`` can serialise.
    """
    return round(sum(len(t) for t in texts) / max(len(texts), 1), 1)


def _vocabulary(texts):
    """Set of lower-cased word tokens appearing anywhere in *texts*."""
    words = set()
    for t in texts:
        words.update(_WORD_RE.findall(t.lower()))
    return words


def turn5_leakage_drift_report(
    df: pd.DataFrame,
    dataset_name: str,
    first_split: str,
    t1: dict, t2: dict, t3: dict, t4: dict,
    config: str = None
) -> dict:
    """Check cross-split leakage and domain drift, then score the dataset.

    Args:
        df: Rows of the audited split.
        dataset_name: HuggingFace dataset identifier (used to load a second split).
        first_split: Name of the split *df* was loaded from.
        t1, t2, t3, t4: Results returned by turns 1–4.
        config: Optional dataset config forwarded to the second-split loader.

    Leakage compares the first 50 non-missing texts of each split and records
    pairs with similarity >= 0.90. Drift compares the first and second half
    of *df* (needs at least 20 texts) and fires when vocabulary Jaccard
    overlap is below 0.40 or average length differs by more than 50
    characters. Eight component scores are clamped and combined with fixed
    weights summing to 1.0. Prints the final report as JSON and returns it.
    """
    print("\n" + "="*60)
    print("TURN 5 — Data Leakage + Domain Drift + Final Audit Report")
    print("="*60)

    text_cols = get_text_columns(df)
    text_col = text_cols[0] if text_cols else None

    # ── Data Leakage ──
    leakage_pairs = []
    leakage_rate = 0.0

    print(" 🔍 Fetching second split for leakage check...")
    df2, split2 = fetch_second_split(dataset_name, first_split, config=config, num_rows=50)

    if not df2.empty and text_col is not None:
        rows1 = _take(_iter_text_rows(df, text_col), 50)
        rows2 = _take(_iter_text_rows(df2, text_col), 50) if text_col in df2.columns else []
        for row1, text1 in rows1:
            for row2, text2 in rows2:
                ratio = similarity_ratio(text1, text2)
                if ratio >= 0.90:
                    leakage_pairs.append({
                        f"{first_split}_row": row1,
                        f"{split2}_row": row2,
                        "similarity": round(ratio, 3),
                        f"{first_split}_text": text1[:80],
                        f"{split2}_text": text2[:80]
                    })
        leakage_rate = round(len(leakage_pairs) / max(len(rows1), 1), 3)

    # ── Domain Drift ──
    drift_detected = False
    drift_details = {}

    if text_col is not None:
        texts = [text for _, text in _iter_text_rows(df, text_col)]
        if len(texts) >= 20:
            mid = len(texts) // 2
            first_half, second_half = texts[:mid], texts[mid:]

            avg_len_first = _mean_length(first_half)
            avg_len_second = _mean_length(second_half)
            length_gap = abs(avg_len_first - avg_len_second)

            v1, v2 = _vocabulary(first_half), _vocabulary(second_half)
            jaccard = round(len(v1 & v2) / max(len(v1 | v2), 1), 3)

            # Must be a builtin bool: a numpy boolean here made json.dumps
            # raise TypeError whenever the Jaccard test did not short-circuit.
            drift_detected = bool(jaccard < 0.40 or length_gap > 50)
            drift_details = {
                "avg_text_length_first_half": avg_len_first,
                "avg_text_length_second_half": avg_len_second,
                "length_drift": round(length_gap, 1),
                "vocabulary_overlap_jaccard": jaccard,
                "drift_detected": drift_detected,
                "interpretation": (
                    "Significant topic/domain drift — dataset may not be uniform."
                    if drift_detected else
                    "No significant drift — dataset appears topically consistent."
                )
            }

    # ── Scoring ──
    rows_audited = max(len(df), 1)
    dup_score = clamp_score(1.0 - (t2.get("near_duplicates_found", 0) / rows_audited) * 5)
    ann_score = clamp_score(1.0 - (t2.get("annotation_inconsistencies_found", 0) / rows_audited) * 10)
    label_score = clamp_score(1.0 - t3.get("label_noise_rate", 0) * 10)
    tox_score = clamp_score(1.0 - t3.get("toxicity_rate", 0) * 5)
    bias_score = clamp_score(t4.get("bias_score", 1.0))
    lang_score = clamp_score(t4.get("english_percentage", 1.0))
    leakage_score = clamp_score(1.0 - leakage_rate * 5)
    drift_score = clamp_score(0.6 if drift_detected else 1.0)

    # Weights sum to 1.0.
    audit_score = clamp_score(
        dup_score * 0.12 +
        ann_score * 0.13 +
        label_score * 0.15 +
        tox_score * 0.15 +
        bias_score * 0.15 +
        lang_score * 0.10 +
        leakage_score * 0.10 +
        drift_score * 0.10
    )

    # Verdict
    if audit_score >= 0.85:
        verdict = "EXCELLENT"
        summary = "Dataset passes all hard quality checks. Ready for training."
    elif audit_score >= 0.70:
        verdict = "GOOD"
        summary = "Dataset is usable but has some issues worth addressing."
    elif audit_score >= 0.50:
        verdict = "NEEDS_WORK"
        summary = "Multiple quality/bias issues found. Significant cleanup needed."
    else:
        verdict = "POOR"
        summary = "Serious quality, bias, or leakage issues. Not recommended without major fixes."

    # Recommendations
    recs = []
    if dup_score < 0.8:
        recs.append("Remove near-duplicate rows to prevent overfitting.")
    if ann_score < 0.8:
        recs.append("Re-annotate inconsistently labelled similar texts.")
    if label_score < 0.8:
        recs.append("Review and correct noisy labels.")
    if tox_score < 0.8:
        recs.append("Filter toxic/hate speech content before training.")
    if bias_score < 0.7:
        recs.append("Audit and balance gender, racial, and cultural representation.")
    if lang_score < 0.8:
        recs.append("Filter non-English rows if multilingual content is unintentional.")
    if leakage_score < 0.8:
        recs.append("Data leakage detected between splits — re-split the dataset.")
    if drift_detected:
        recs.append("Topic drift detected — verify dataset was collected from a consistent source.")
    if not recs:
        recs.append("Dataset passed all checks. Consider expanding size for better generalization.")

    result = {
        "task_id": "task3_hard",
        "turn": 5,
        "turn_name": "leakage_drift_final_report",
        "dataset_name": dataset_name,
        "total_rows_audited": len(df),
        "data_leakage": {
            "splits_compared": [first_split, split2] if split2 else [first_split],
            "leakage_pairs_found": len(leakage_pairs),
            "leakage_pairs_sample": leakage_pairs[:5],
            "leakage_rate": leakage_rate
        },
        "domain_drift": drift_details,
        "scores": {
            "near_duplicate_score": dup_score,
            "annotation_consistency_score": ann_score,
            "label_noise_score": label_score,
            "toxicity_score": tox_score,
            "bias_score": bias_score,
            "language_consistency_score": lang_score,
            "data_leakage_score": leakage_score,
            "domain_drift_score": drift_score
        },
        "audit_score": audit_score,
        "verdict": verdict,
        "summary": summary,
        "recommendations": recs,
        "turn_results": {
            "turn1_overview": t1,
            "turn2_near_dupes_annotation": t2,
            "turn3_label_noise_toxicity": t3,
            "turn4_bias_language": t4
        },
        "status": "completed"
    }
    print(json.dumps(result, indent=2))
    return result
# ─────────────────────────────────────────────
# MAIN
# ─────────────────────────────────────────────

def main():
    """Run the interactive 5-turn audit and save the final report.

    Prompts for a dataset name (defaults to ``imdb`` on empty input), loads
    up to 100 rows, runs turns 1–5 with an Enter prompt between turns, prints
    the final JSON and writes it to ``task3_output.json``. Exits with status
    1 when no split of the dataset could be loaded.
    """
    print("=" * 60)
    print(" TASK 3 — BIAS & QUALITY AUDIT (HARD, 5-TURN)")
    print(" Checks: Near-Dupes | Annotation Inconsistency |")
    print(" Label Noise | Toxicity | Bias | Language | Leakage | Drift")
    print("=" * 60)

    requested = input("\nEnter HuggingFace dataset name (e.g. imdb OR cardiffnlp/tweet_eval hate): ").strip()
    if not requested:
        requested = "imdb"

    dataset_name, config = clean_dataset_name(requested)
    print(f" 📌 Dataset : {dataset_name}")
    if config:
        print(f" 📌 Config : {config}")

    df, split = fetch_dataset_rows(dataset_name, config=config, num_rows=100)

    if df.empty:
        print("❌ Could not load dataset.")
        print(" Tips:")
        print(" • Use just the name, e.g. imdb or dair-ai/emotion")
        print(" • For tweet_eval use: cardiffnlp/tweet_eval hate")
        print(" • Do NOT paste the full URL")
        # sys.exit, not the site-provided exit(): the latter is meant for the
        # interactive shell and is absent when the site module is not loaded.
        sys.exit(1)

    t1 = turn1_overview(df, dataset_name, split)

    input("\n⏎ Press Enter → Turn 2 (Near-Dupes + Annotation)...")
    t2 = turn2_near_dupes_and_annotation(df)

    input("\n⏎ Press Enter → Turn 3 (Label Noise + Toxicity)...")
    t3 = turn3_label_noise_and_toxicity(df)

    input("\n⏎ Press Enter → Turn 4 (Bias + Language)...")
    t4 = turn4_bias_and_language(df)

    input("\n⏎ Press Enter → Turn 5 (Leakage + Drift + Final Report)...")
    final = turn5_leakage_drift_report(df, dataset_name, split, t1, t2, t3, t4, config=config)

    print("\n" + "=" * 60)
    print("✅ TASK 3 COMPLETE — Copy the JSON below into grader3.py")
    print("=" * 60)
    print(json.dumps(final, indent=2))

    # Explicit encoding so the file does not depend on the platform default.
    with open("task3_output.json", "w", encoding="utf-8") as f:
        json.dump(final, f, indent=2)
    print("\n📄 Output saved to: task3_output.json")


if __name__ == "__main__":
    main()