"""
task3_hard.py
=============
Task 3 — Bias & Quality Audit (Hard, 5-Turn)
OpenEnv Project | Meta × Hugging Face Hackathon

What it does:
  Runs a 5-turn deep audit on a HuggingFace dataset
  covering the hardest quality and bias checks.

Checks (9 across 5 turns):
  Turn 1: Column profiling
  Turn 2: Near-duplicate detection + Annotation inconsistency
  Turn 3: Label noise + Toxicity + Hate speech
  Turn 4: Bias detection (gender/racial/cultural) + Linguistic diversity
  Turn 5: Data leakage + Domain drift + Final audit report

Usage:
  python task3_hard.py
  → Enter dataset name (e.g. dair-ai/emotion)
  → Press Enter to progress through each turn
  → Output saved to task3_output.json
  → Copy JSON output into grader3.py

Requirements:
  pip install datasets pandas numpy scikit-learn langdetect
"""
"""
Task 3 — Bias & Quality Audit (HARD)
=====================================
A 5-turn multi-step agent task with 8 hard checks.

Turn 1 → Dataset Overview + Column Profiling
Turn 2 → Near-Duplicate Detection + Annotation Inconsistency
Turn 3 → Label Noise + Toxicity / Hate Speech Detection
Turn 4 → Bias Detection + Linguistic Diversity
Turn 5 → Data Leakage + Domain Drift + Final Audit Report

Compatible with: Google Colab, Python 3.8+
Install: pip install datasets pandas numpy scikit-learn langdetect  (difflib ships with the Python standard library)
"""

import json
import re
import difflib
import warnings
import subprocess
import sys
from collections import Counter, defaultdict

warnings.filterwarnings("ignore")

def clamp_score(x):
    """Bound a score to the range [0.01, 0.99] and round it to three decimals."""
    capped = min(0.99, x)
    floored = max(0.01, capped)
    return round(floored, 3)
# ─────────────────────────────────────────────
# INSTALL DEPS
# ─────────────────────────────────────────────

def install(pkg):
    """Quietly pip-install *pkg* using the interpreter that runs this script.

    Raises subprocess.CalledProcessError when pip exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", pkg, "-q"]
    subprocess.check_call(pip_command)

# Map each pip distribution name to the module name it is imported under.
# The two differ for scikit-learn (imported as ``sklearn``): deriving the module
# name by swapping "-" for "_" made that import fail on every run, which in turn
# triggered a needless ``pip install`` each time the script started.
_REQUIRED_PACKAGES = {
    "datasets": "datasets",
    "pandas": "pandas",
    "numpy": "numpy",
    "scikit-learn": "sklearn",
    "langdetect": "langdetect",
}

for pkg, module_name in _REQUIRED_PACKAGES.items():
    try:
        __import__(module_name)
    except ImportError:
        # Best-effort bootstrap (e.g. for notebooks): install the missing
        # distribution so the regular imports that follow can succeed.
        install(pkg)

import pandas as pd
import numpy as np
from datasets import load_dataset
from langdetect import detect, LangDetectException

# ─────────────────────────────────────────────
# WORD LISTS
# ─────────────────────────────────────────────

# Gendered pronouns, titles and role nouns. turn4_bias_and_language counts the
# rows whose text mentions any of these terms (all entries are lowercase).
GENDER_BIAS_WORDS = [
    "he", "she", "him", "her", "his", "hers", "man", "woman", "men", "women",
    "male", "female", "boy", "girl", "husband", "wife", "father", "mother",
    "son", "daughter", "brother", "sister", "mr", "mrs", "ms", "sir", "madam",
    "king", "queen", "prince", "princess", "actor", "actress", "waiter", "waitress",
    "policeman", "policewoman", "fireman", "stewardess", "mankind", "manpower"
]

# Race, ethnicity and religion related terms used by the bias check in
# turn4_bias_and_language.
RACIAL_BIAS_WORDS = [
    "black", "white", "asian", "hispanic", "latino", "latina", "african",
    "caucasian", "arab", "jewish", "muslim", "christian", "hindu", "immigrant",
    "refugee", "foreigner", "native", "minority", "ethnic", "race", "racial",
    "colored", "indigenous", "tribal"
]

# Culturally loaded terms used by the bias check in turn4_bias_and_language.
# NOTE: "third world" is the only multi-word entry in these lists; a lookup
# against single-word tokens cannot find it, so matching code has to treat
# phrases explicitly.
CULTURAL_BIAS_WORDS = [
    "western", "eastern", "american", "european", "third world", "developing",
    "primitive", "civilized", "savage", "barbarian", "exotic", "oriental",
    "traditional", "modern", "backward", "progressive", "uncivilized"
]

# Insults and aggressive vocabulary used by the toxicity check in
# turn3_label_noise_and_toxicity.
TOXIC_WORDS = [
    "idiot", "stupid", "moron", "loser", "trash", "garbage", "hate", "kill",
    "die", "retard", "freak", "ugly", "fat", "dumb", "worthless", "scum",
    "disgusting", "pathetic", "monster", "pervert", "creep"
]

# Regular expressions for group-targeted hostile phrasing. They are applied with
# re.search to lowercased text, which is why the patterns are written in
# lowercase only.
HATE_SPEECH_PATTERNS = [
    r'\b(all|those|these)\s+(black|white|asian|muslim|jewish|gay|trans)\s+(people|guys|men|women)\s+(are|should|must|deserve)\b',
    r'\b(go back to)\s+\w+',
    r'\b(ban all)\s+\w+',
    r'\b(they|them)\s+(don\'t belong|should leave|are inferior|are criminals)\b',
]

# Positive sentiment cues counted by the label-noise heuristic in
# turn3_label_noise_and_toxicity.
POSITIVE_WORDS = [
    "great", "excellent", "amazing", "wonderful", "fantastic", "best", "good",
    "beautiful", "smart", "brilliant", "awesome", "perfect", "love", "superb",
    "outstanding", "exceptional", "magnificent", "enjoyed", "loved"
]

# Negative sentiment cues counted by the same label-noise heuristic. Several
# entries (e.g. "hate", "disgusting") also appear in TOXIC_WORDS.
NEGATIVE_WORDS = [
    "terrible", "awful", "horrible", "disgusting", "hate", "worst", "bad",
    "poor", "ugly", "stupid", "dumb", "trash", "garbage", "useless",
    "worthless", "pathetic", "loser", "failure", "boring", "waste", "dreadful"
]

# ─────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────

def clean_dataset_name(name: str) -> tuple:
    """
    Normalise what the user typed into a (dataset_name, config) pair.

    Recognised inputs:
      - a bare dataset id:          imdb
      - dataset id plus a config:   cardiffnlp/tweet_eval hate
      - a HuggingFace dataset URL:  https://huggingface.co/datasets/cardiffnlp/tweet_eval

    The config element is None unless the input is exactly two
    whitespace-separated tokens.
    """
    cleaned = name.strip()

    # Reduce a full HuggingFace URL to the "owner/dataset" path it points at.
    if cleaned.startswith("http"):
        without_prefix = re.sub(r"https?://huggingface\.co/datasets/", "", cleaned)
        cleaned = without_prefix.strip("/")

    # "dataset config" -> two tokens; anything else is passed through untouched.
    tokens = cleaned.split()
    if len(tokens) != 2:
        return cleaned, None
    dataset, config = tokens
    return dataset, config


def fetch_dataset_rows(dataset_name: str, config: str = None, num_rows: int = 100):
    """Stream up to *num_rows* rows from the first split that loads.

    Splits are tried in the order train, test, validation. Returns a
    (DataFrame, split_name) pair, or (empty DataFrame, None) when every split
    fails or yields no rows.
    """
    config_suffix = f" ({config})" if config else ""
    label = f"{dataset_name}" + config_suffix
    print(f"\n📦 Loading dataset: {label} ...")

    # Datasets known to need a config, with the config used when none is given.
    CONFIG_DEFAULTS = {
        "cardiffnlp/tweet_eval": "hate",
        "nyu-mll/glue": "sst2",
        "glue": "sst2",
    }

    if config is None:
        config = CONFIG_DEFAULTS.get(dataset_name)

    # The config is only passed positionally when one is available.
    load_args = (dataset_name, config) if config else (dataset_name,)

    for split in ("train", "test", "validation"):
        try:
            ds = load_dataset(*load_args, split=split, streaming=True)
            rows = []
            for row in ds:
                if len(rows) >= num_rows:
                    break
                rows.append(row)
            if rows:
                df = pd.DataFrame(rows)
                print(f"✅ Loaded {len(df)} rows, {len(df.columns)} columns from '{split}' split")
                print(f"   Columns: {list(df.columns)}")
                return df, split
        except Exception as err:
            # Best effort: report the failure briefly and move on to the next split.
            print(f"   ⚠️  Split '{split}' failed: {str(err)[:80]}")
            continue

    print("❌ Could not load any split.")
    return pd.DataFrame(), None


def fetch_second_split(dataset_name: str, first_split: str, config: str = None, num_rows: int = 50):
    """Stream up to *num_rows* rows from a split other than *first_split*.

    Used by the data-leakage check. Splits are tried in the order train, test,
    validation, skipping *first_split*.

    Args:
        dataset_name: HuggingFace dataset id.
        first_split: Name of the split that was already loaded.
        config: Optional dataset config. When None, the same per-dataset
            default that fetch_dataset_rows uses is applied.
        num_rows: Maximum number of rows to read.

    Returns:
        (DataFrame, split_name), or (empty DataFrame, None) when no other split
        could be loaded.
    """
    # fetch_dataset_rows resolves a default config internally but does not hand
    # it back to the caller, so the caller passes config=None here for datasets
    # such as tweet_eval/glue. Without the same defaults every load below would
    # fail and the leakage check would be skipped silently.
    config_defaults = {
        "cardiffnlp/tweet_eval": "hate",
        "nyu-mll/glue": "sst2",
        "glue": "sst2",
    }
    if config is None:
        config = config_defaults.get(dataset_name)

    load_args = (dataset_name, config) if config else (dataset_name,)

    for split in ("train", "test", "validation"):
        if split == first_split:
            continue
        try:
            ds = load_dataset(*load_args, split=split, streaming=True)
            rows = []
            for i, row in enumerate(ds):
                if i >= num_rows:
                    break
                rows.append(row)
            if rows:
                return pd.DataFrame(rows), split
        except Exception:
            # Best effort: a missing or broken split just means trying the next one.
            continue
    return pd.DataFrame(), None


def get_text_columns(df: pd.DataFrame):
    """Return the columns that look like free text.

    A column qualifies when it has an object or string dtype and more than 30%
    of its non-null values are strings longer than 20 characters. Columns with
    no non-null values never qualify.
    """
    text_cols = []
    for col in df.columns:
        series = df[col]
        # Accept pandas' dedicated string dtypes as well as plain object columns;
        # checking `dtype == object` alone misses string-typed columns.
        is_textual = (pd.api.types.is_object_dtype(series.dtype)
                      or pd.api.types.is_string_dtype(series.dtype))
        if not is_textual:
            continue

        values = series.dropna()
        if values.empty:
            continue

        long_strings = sum(1 for v in values if isinstance(v, str) and len(v) > 20)
        if long_strings / len(values) > 0.3:
            text_cols.append(col)
    return text_cols


def get_label_columns(df: pd.DataFrame):
    """Return the columns that look categorical: between 2 and 20 distinct values.

    Columns whose cells are unhashable (lists, dicts, ...) are skipped instead
    of raising, because they cannot be counted and are not usable as labels.
    """
    label_cols = []
    for col in df.columns:
        try:
            distinct = df[col].nunique()
        except TypeError:
            # nunique() hashes every cell; list/dict cells make it raise.
            continue
        if 1 < distinct <= 20:
            label_cols.append(col)
    return label_cols


def similarity_ratio(s1: str, s2: str) -> float:
    """Return the difflib similarity (0.0 to 1.0) of two strings.

    Both strings are lowercased and stripped of surrounding whitespace first.
    """
    left = s1.lower().strip()
    right = s2.lower().strip()
    matcher = difflib.SequenceMatcher(isjunk=None, a=left, b=right)
    return matcher.ratio()


def detect_language(text: str) -> str:
    """Return the language code langdetect reports for *text*.

    Returns "unknown" when langdetect cannot decide (it raises
    LangDetectException, e.g. for text without usable features).
    """
    # langdetect samples randomly and can return different answers for the same
    # short or ambiguous text. Pinning the seed makes the audit reproducible.
    from langdetect import DetectorFactory
    DetectorFactory.seed = 0

    try:
        return detect(str(text))
    except LangDetectException:
        return "unknown"


# ─────────────────────────────────────────────
# TURN 1 — Dataset Overview + Column Profiling
# ─────────────────────────────────────────────

def turn1_overview(df: pd.DataFrame, dataset_name: str, split: str) -> dict:
    """Turn 1: profile every column and flag obvious limitations.

    Args:
        df: Sample of the dataset under audit.
        dataset_name: Dataset id, echoed into the result.
        split: Name of the split *df* was read from, echoed into the result.

    Returns:
        A JSON-serialisable dict with per-column statistics (dtype, missing
        count/rate, distinct values, average text length, up to three sample
        values), the detected text and label columns, and initial warning
        flags. The dict is also printed as JSON.
    """
    print("\n" + "="*60)
    print("TURN 1 — Dataset Overview & Column Profiling")
    print("="*60)

    row_count = len(df)
    columns_info = {}
    for col in df.columns:
        series = df[col]
        present = series.dropna()
        missing = int(series.isnull().sum())

        try:
            nuniq = int(series.nunique())
        except TypeError:
            # Cells such as lists or dicts are unhashable; count distinct
            # string representations rather than aborting the whole audit.
            nuniq = int(present.astype(str).nunique())

        # Accept pandas string dtypes as well as plain object columns.
        is_textual = (pd.api.types.is_object_dtype(series.dtype)
                      or pd.api.types.is_string_dtype(series.dtype))
        avg_len = None
        if is_textual and not present.empty:
            # Guarding on `present` keeps an all-null column from producing NaN,
            # which would be written out as invalid JSON.
            avg_len = round(float(present.astype(str).map(len).mean()), 1)

        columns_info[col] = {
            "dtype": str(series.dtype),
            "missing": missing,
            "missing_rate": round(missing / max(row_count, 1), 3),
            "unique_values": nuniq,
            "avg_text_length": avg_len,
            "sample": [str(s)[:60] for s in present.iloc[:3].tolist()]
        }

    text_cols  = get_text_columns(df)
    label_cols = get_label_columns(df)

    flags = []
    if row_count < 20:
        flags.append("Very small dataset — statistical checks may be unreliable")
    if not text_cols:
        flags.append("No long-text columns — bias/toxicity checks will be limited")
    if not label_cols:
        flags.append("No label columns — label noise check will be skipped")

    result = {
        "turn": 1,
        "turn_name": "dataset_overview",
        "dataset_name": dataset_name,
        "split_used": split,
        "total_rows": row_count,
        "total_columns": len(df.columns),
        "column_names": list(df.columns),
        "columns_info": columns_info,
        "text_columns_detected": text_cols,
        "label_columns_detected": label_cols,
        "initial_flags": flags,
        "status": "completed"
    }
    print(json.dumps(result, indent=2))
    return result


# ─────────────────────────────────────────────
# TURN 2 — Near-Duplicates + Annotation Inconsistency
# ─────────────────────────────────────────────

def turn2_near_dupes_and_annotation(df: pd.DataFrame) -> dict:
    """Turn 2: find near-duplicate texts and similar texts with different labels.

    Only the first detected text column and the first detected label column are
    used, and at most the first 150 rows with non-null text are compared
    pairwise.

    Returns:
        A JSON-serialisable dict with the counts, up to 10 sample pairs of each
        kind, the combined issue total and a LOW/MEDIUM/HIGH severity. The
        "row_i"/"row_j" values are positions within the rows that have non-null
        text. The dict is also printed as JSON.
    """
    print("\n" + "="*60)
    print("TURN 2 — Near-Duplicate Detection + Annotation Inconsistency")
    print("="*60)

    text_cols  = get_text_columns(df)
    label_cols = get_label_columns(df)

    near_dupes = []
    annotation_issues = []

    if text_cols:
        text_col = text_cols[0]

        # Filter rows once so texts and labels come from the same rows. Dropping
        # nulls from the text column alone would shift texts against labels as
        # soon as one text is missing.
        rows  = df[df[text_col].notna()].head(150)
        texts = rows[text_col].astype(str).tolist()
        labels = rows[label_cols[0]].astype(str).tolist() if label_cols else None

        count = len(texts)
        for i in range(count):
            for j in range(i + 1, count):
                # One similarity computation serves both checks.
                ratio = similarity_ratio(texts[i], texts[j])
                if ratio < 0.80:
                    continue

                # Near-duplicate: very similar but not identical.
                if 0.85 <= ratio < 1.0:
                    near_dupes.append({
                        "row_i": i, "row_j": j,
                        "similarity": round(ratio, 3),
                        "text_i": texts[i][:80],
                        "text_j": texts[j][:80]
                    })

                # Annotation inconsistency: similar texts, different labels.
                if labels is not None and labels[i] != labels[j]:
                    annotation_issues.append({
                        "row_i": i, "row_j": j,
                        "similarity": round(ratio, 3),
                        "text_i": texts[i][:80],
                        "text_j": texts[j][:80],
                        "label_i": labels[i],
                        "label_j": labels[j],
                        "issue": "Similar texts have different labels"
                    })

    total    = len(near_dupes) + len(annotation_issues)
    severity = "HIGH" if total > 8 else "MEDIUM" if total > 3 else "LOW"

    result = {
        "turn": 2,
        "turn_name": "near_duplicates_and_annotation_inconsistency",
        "near_duplicates_found": len(near_dupes),
        "near_duplicate_pairs": near_dupes[:10],
        "annotation_inconsistencies_found": len(annotation_issues),
        "annotation_inconsistency_samples": annotation_issues[:10],
        "total_issues": total,
        "severity": severity,
        "status": "completed"
    }
    print(json.dumps(result, indent=2))
    return result


# ─────────────────────────────────────────────
# TURN 3 — Label Noise + Toxicity / Hate Speech
# ─────────────────────────────────────────────

def turn3_label_noise_and_toxicity(df: pd.DataFrame) -> dict:
    """Turn 3: label distribution, label-noise heuristic, toxicity and hate speech.

    Only the first detected text column and the first detected label column are
    used.

    * Class imbalance is flagged when the rarest label has fewer than 30% of
      the rows of the most common label.
    * Label noise is only checked when the label values look like sentiment
      labels ("positive"/"negative"/"pos"/"neg"/"0"/"1"): a row is flagged when
      it contains at least three distinct cue words of one polarity, none of
      the other, and carries the opposite label.
    * A row is toxic when any TOXIC_WORDS entry occurs as a whole word, and is
      hate speech when any HATE_SPEECH_PATTERNS regex matches.

    Returns:
        A JSON-serialisable dict with counts, rates, samples and a
        LOW/MEDIUM/HIGH severity. The dict is also printed as JSON.
    """
    print("\n" + "="*60)
    print("TURN 3 — Label Noise + Toxicity / Hate Speech Detection")
    print("="*60)

    text_cols  = get_text_columns(df)
    label_cols = get_label_columns(df)
    label_col  = label_cols[0] if label_cols else None

    def row_id(idx):
        # Report integer row ids where possible; a non-numeric index label is
        # kept as a string so the row still shows up instead of being dropped.
        try:
            return int(idx)
        except (TypeError, ValueError):
            return str(idx)

    # ── Label distribution ──
    label_distribution = {}
    class_imbalance    = False
    if label_col is not None:
        counts = df[label_col].value_counts().to_dict()
        label_distribution = {str(k): int(v) for k, v in counts.items()}
        if counts:
            max_c, min_c = max(counts.values()), min(counts.values())
            class_imbalance = bool(max_c > 0 and (min_c / max_c) < 0.3)

    # ── Is the label column a sentiment label? ──
    is_sentiment = False
    if text_cols and label_col is not None:
        label_vals = [str(v).lower() for v in df[label_col].dropna().unique()]
        is_sentiment = any(v in label_vals for v in ["positive", "negative", "0", "1", "pos", "neg"])

    noisy_labels     = []
    toxic_rows       = []
    hate_speech_rows = []

    if text_cols:
        text_col   = text_cols[0]
        raw_texts  = df[text_col].tolist()
        raw_labels = df[label_col].tolist() if is_sentiment else [None] * len(raw_texts)
        # Compile once; the original pattern string is what gets reported.
        hate_patterns = [(p, re.compile(p)) for p in HATE_SPEECH_PATTERNS]

        for idx, raw_text, raw_label in zip(df.index, raw_texts, raw_labels):
            snippet = str(raw_text)[:100]
            text    = str(raw_text).lower()
            words   = set(re.findall(r'\b\w+\b', text))

            # ── Label noise ──
            if is_sentiment:
                label = str(raw_label).lower()
                # Count cues as whole words, like the toxicity check does.
                # Substring matching counted "bad" inside "badge" and counted
                # "loved" twice (as "love" and "loved").
                pos = sum(1 for w in POSITIVE_WORDS if w in words)
                neg = sum(1 for w in NEGATIVE_WORDS if w in words)
                if pos >= 3 and neg == 0 and label in ["negative", "neg", "0"]:
                    noisy_labels.append({"row": row_id(idx), "text": snippet,
                                         "label": str(raw_label),
                                         "issue": "Strong positive text → negative label"})
                elif neg >= 3 and pos == 0 and label in ["positive", "pos", "1"]:
                    noisy_labels.append({"row": row_id(idx), "text": snippet,
                                         "label": str(raw_label),
                                         "issue": "Strong negative text → positive label"})

            # ── Toxicity ──
            hits = [w for w in TOXIC_WORDS if w in words]
            if hits:
                toxic_rows.append({"row": row_id(idx), "text": snippet,
                                   "toxic_words_found": hits[:5]})

            # ── Hate speech (first matching pattern only) ──
            for pattern, compiled in hate_patterns:
                if compiled.search(text):
                    hate_speech_rows.append({"row": row_id(idx),
                                             "text": snippet,
                                             "pattern_matched": pattern})
                    break

    noise_rate = round(len(noisy_labels) / max(len(df), 1), 3)
    tox_rate   = round(len(toxic_rows)   / max(len(df), 1), 3)
    if noise_rate > 0.1 or tox_rate > 0.1:
        severity = "HIGH"
    elif noise_rate > 0.03 or tox_rate > 0.03:
        severity = "MEDIUM"
    else:
        severity = "LOW"

    result = {
        "turn": 3,
        "turn_name": "label_noise_and_toxicity",
        "label_distribution": label_distribution,
        "class_imbalance_detected": class_imbalance,
        "noisy_labels_found": len(noisy_labels),
        "noisy_label_samples": noisy_labels[:10],
        "label_noise_rate": noise_rate,
        "toxic_rows_found": len(toxic_rows),
        "toxic_samples": toxic_rows[:10],
        "hate_speech_rows_found": len(hate_speech_rows),
        "hate_speech_samples": hate_speech_rows[:5],
        "toxicity_rate": tox_rate,
        "severity": severity,
        "status": "completed"
    }
    print(json.dumps(result, indent=2))
    return result


# ─────────────────────────────────────────────
# TURN 4 — Bias Detection + Linguistic Diversity
# ─────────────────────────────────────────────

def _find_bias_terms(text: str, tokens: set, terms: list) -> list:
    """Return the entries of *terms* that occur in a lowercased text.

    Single words are looked up in *tokens* (the text's word set); entries that
    contain whitespace are searched for in *text* as whole phrases.
    """
    found = []
    for term in terms:
        parts = term.split()
        if len(parts) > 1:
            # A phrase can never be a member of a single-word token set, so it
            # is matched against the text, tolerating any run of whitespace.
            phrase = r"\b" + r"\s+".join(re.escape(p) for p in parts) + r"\b"
            if re.search(phrase, text):
                found.append(term)
        elif term in tokens:
            found.append(term)
    return found


def turn4_bias_and_language(df: pd.DataFrame) -> dict:
    """Turn 4: count bias-related term mentions and profile languages.

    Only the first detected text column is used.

    * Bias: for each category (gender, racial, cultural) count the rows that
      mention at least one listed term, keep the 10 most frequent terms and up
      to 5 affected row ids. ``bias_rate`` is the total of those row counts
      divided by (3 × number of rows); ``bias_score`` is ``1 - bias_rate``.
    * Language: run language detection on the first 100 non-null texts and
      report the distribution, the share detected as English and up to 5
      non-English samples.

    Returns:
        A JSON-serialisable dict, which is also printed as JSON.
    """
    print("\n" + "="*60)
    print("TURN 4 — Bias Detection + Linguistic Diversity")
    print("="*60)

    text_cols   = get_text_columns(df)
    bias_report = {
        "gender_bias":   {"mentions": 0, "words_found": {}, "affected_rows": []},
        "racial_bias":   {"mentions": 0, "words_found": {}, "affected_rows": []},
        "cultural_bias": {"mentions": 0, "words_found": {}, "affected_rows": []}
    }

    def row_id(idx):
        # Keep integer row ids where possible; fall back to the label's string
        # form so a non-numeric index does not make the row disappear.
        try:
            return int(idx)
        except (TypeError, ValueError):
            return str(idx)

    if text_cols:
        text_col   = text_cols[0]
        categories = (
            ("gender_bias",   GENDER_BIAS_WORDS),
            ("racial_bias",   RACIAL_BIAS_WORDS),
            ("cultural_bias", CULTURAL_BIAS_WORDS),
        )
        term_counts = {name: Counter() for name, _ in categories}

        for idx, raw_text in zip(df.index, df[text_col].tolist()):
            text   = str(raw_text).lower()
            tokens = set(re.findall(r'\b\w+\b', text))
            for name, terms in categories:
                hits = _find_bias_terms(text, tokens, terms)
                if not hits:
                    continue
                entry = bias_report[name]
                entry["mentions"] += 1          # counts rows, not occurrences
                term_counts[name].update(hits)
                if len(entry["affected_rows"]) < 5:
                    entry["affected_rows"].append(row_id(idx))

        for name, _ in categories:
            bias_report[name]["words_found"] = dict(term_counts[name].most_common(10))

    total_bias = sum(v["mentions"] for v in bias_report.values())
    bias_rate  = round(total_bias / max(len(df) * 3, 1), 3)
    bias_score = round(max(0.0, 1.0 - bias_rate), 3)

    # ── Linguistic Diversity ──
    lang_counts = Counter()
    non_english = []

    if text_cols:
        text_col = text_cols[0]
        sample   = df[text_col].dropna().astype(str).tolist()[:100]
        for i, text in enumerate(sample):
            lang = detect_language(text)
            lang_counts[lang] += 1
            if lang not in ("en", "unknown") and len(non_english) < 5:
                non_english.append({"row": i, "text": text[:80], "detected_lang": lang})

    total_detected  = sum(lang_counts.values())
    english_pct     = round(lang_counts.get("en", 0) / max(total_detected, 1), 3)
    is_multilingual = any(lang not in ("en", "unknown") for lang in lang_counts)

    if bias_rate > 0.3:
        bias_severity = "HIGH"
    elif bias_rate > 0.1:
        bias_severity = "MEDIUM"
    else:
        bias_severity = "LOW"

    result = {
        "turn": 4,
        "turn_name": "bias_and_linguistic_diversity",
        "bias_report": bias_report,
        "total_bias_mentions": total_bias,
        "bias_rate": bias_rate,
        "bias_score": bias_score,
        "bias_severity": bias_severity,
        "language_distribution": dict(lang_counts),
        "english_percentage": english_pct,
        "is_multilingual": is_multilingual,
        "non_english_samples": non_english,
        "language_diversity_flag": is_multilingual,
        "status": "completed"
    }
    print(json.dumps(result, indent=2))
    return result


# ─────────────────────────────────────────────
# TURN 5 — Data Leakage + Domain Drift + Final Report
# ─────────────────────────────────────────────

def turn5_leakage_drift_report(
    df: pd.DataFrame, dataset_name: str, first_split: str,
    t1: dict, t2: dict, t3: dict, t4: dict,
    config: str = None
) -> dict:
    """Turn 5: cross-split leakage, domain drift, scoring and the final report.

    Args:
        df: Sample of the dataset under audit (read from *first_split*).
        dataset_name: Dataset id, used to fetch a second split.
        first_split: Name of the split *df* came from.
        t1, t2, t3, t4: Result dicts of turns 1-4; t2-t4 feed the scores and
            all four are embedded in the report.
        config: Optional dataset config, forwarded to fetch_second_split.

    Returns:
        A JSON-serialisable dict holding the leakage and drift findings, eight
        component scores, the weighted ``audit_score``, a verdict, a summary,
        recommendations and the earlier turn results. It is also printed as
        JSON.
    """
    print("\n" + "="*60)
    print("TURN 5 — Data Leakage + Domain Drift + Final Audit Report")
    print("="*60)

    text_cols     = get_text_columns(df)
    leakage_pairs = []
    leakage_rate  = 0.0

    # ── Data Leakage ──
    # Compare up to 50 texts from each split; a similarity of 0.90 or more
    # counts as a leak.
    print("  🔍 Fetching second split for leakage check...")
    df2, split2 = fetch_second_split(dataset_name, first_split, config=config, num_rows=50)

    if not df2.empty and text_cols:
        text_col = text_cols[0]
        texts1   = df[text_col].dropna().astype(str).tolist()[:50]
        texts2   = df2[text_col].dropna().astype(str).tolist()[:50] if text_col in df2.columns else []
        leaked_rows = set()
        for i, first_text in enumerate(texts1):
            for j, second_text in enumerate(texts2):
                ratio = similarity_ratio(first_text, second_text)
                if ratio >= 0.90:
                    leaked_rows.add(i)
                    leakage_pairs.append({
                        f"{first_split}_row": i, f"{split2}_row": j,
                        "similarity": round(ratio, 3),
                        f"{first_split}_text": first_text[:80],
                        f"{split2}_text": second_text[:80]
                    })
        # Share of first-split rows that leak. Dividing the number of pairs by
        # the row count could exceed 1.0 when one row matches several rows of
        # the other split.
        leakage_rate = round(len(leaked_rows) / max(len(texts1), 1), 3)

    # ── Domain Drift ──
    # Compare the first and second half of the sample on average text length
    # and vocabulary overlap.
    drift_detected = False
    drift_details  = {}

    if text_cols:
        text_col = text_cols[0]
        texts    = df[text_col].dropna().astype(str).tolist()
        if len(texts) >= 20:
            mid         = len(texts) // 2
            first_half  = texts[:mid]
            second_half = texts[mid:]
            # float(...) keeps numpy scalars out of the report.
            avg_len_first  = round(float(np.mean([len(t) for t in first_half])), 1)
            avg_len_second = round(float(np.mean([len(t) for t in second_half])), 1)
            length_gap     = abs(avg_len_first - avg_len_second)

            def vocab(txts):
                seen = set()
                for t in txts:
                    seen.update(re.findall(r'\b\w+\b', t.lower()))
                return seen

            v1, v2  = vocab(first_half), vocab(second_half)
            jaccard = round(len(v1 & v2) / max(len(v1 | v2), 1), 3)
            # bool(...) matters: comparing a numpy float yields numpy.bool_,
            # which json.dumps refuses to serialise.
            drift_detected = bool(jaccard < 0.40 or length_gap > 50)

            drift_details = {
                "avg_text_length_first_half":  avg_len_first,
                "avg_text_length_second_half": avg_len_second,
                "length_drift": round(length_gap, 1),
                "vocabulary_overlap_jaccard": jaccard,
                "drift_detected": drift_detected,
                "interpretation": (
                    "Significant topic/domain drift — dataset may not be uniform."
                    if drift_detected else
                    "No significant drift — dataset appears topically consistent."
                )
            }

    # ── Scoring ──
    # Each component is clamped to [0.01, 0.99]; the multipliers set how fast a
    # problem rate drives its score down.
    row_count     = max(len(df), 1)
    dup_score     = clamp_score(1.0 - (t2.get("near_duplicates_found", 0) / row_count) * 5)
    ann_score     = clamp_score(1.0 - (t2.get("annotation_inconsistencies_found", 0) / row_count) * 10)
    label_score   = clamp_score(1.0 - t3.get("label_noise_rate", 0) * 10)
    tox_score     = clamp_score(1.0 - t3.get("toxicity_rate", 0) * 5)
    bias_score    = clamp_score(t4.get("bias_score", 1.0))
    lang_score    = clamp_score(t4.get("english_percentage", 1.0))
    leakage_score = clamp_score(1.0 - leakage_rate * 5)
    drift_score   = clamp_score(0.6 if drift_detected else 1.0)

    # Weighted average; the weights sum to 1.0.
    audit_score = clamp_score(
        dup_score * 0.12 +
        ann_score * 0.13 +
        label_score * 0.15 +
        tox_score * 0.15 +
        bias_score * 0.15 +
        lang_score * 0.10 +
        leakage_score * 0.10 +
        drift_score * 0.10
    )

    # Verdict
    if audit_score >= 0.85:
        verdict = "EXCELLENT"
        summary = "Dataset passes all hard quality checks. Ready for training."
    elif audit_score >= 0.70:
        verdict = "GOOD"
        summary = "Dataset is usable but has some issues worth addressing."
    elif audit_score >= 0.50:
        verdict = "NEEDS_WORK"
        summary = "Multiple quality/bias issues found. Significant cleanup needed."
    else:
        verdict = "POOR"
        summary = "Serious quality, bias, or leakage issues. Not recommended without major fixes."

    # Recommendations, in a fixed order; each is added when its condition holds.
    advice = [
        (dup_score     < 0.8, "Remove near-duplicate rows to prevent overfitting."),
        (ann_score     < 0.8, "Re-annotate inconsistently labelled similar texts."),
        (label_score   < 0.8, "Review and correct noisy labels."),
        (tox_score     < 0.8, "Filter toxic/hate speech content before training."),
        (bias_score    < 0.7, "Audit and balance gender, racial, and cultural representation."),
        (lang_score    < 0.8, "Filter non-English rows if multilingual content is unintentional."),
        (leakage_score < 0.8, "Data leakage detected between splits — re-split the dataset."),
        (drift_detected,      "Topic drift detected — verify dataset was collected from a consistent source."),
    ]
    recs = [message for triggered, message in advice if triggered]
    if not recs:
        recs.append("Dataset passed all checks. Consider expanding size for better generalization.")

    result = {
        "task_id": "task3_hard",
        "turn": 5,
        "turn_name": "leakage_drift_final_report",
        "dataset_name": dataset_name,
        "total_rows_audited": len(df),

        "data_leakage": {
            "splits_compared": [first_split, split2] if split2 else [first_split],
            "leakage_pairs_found": len(leakage_pairs),
            "leakage_pairs_sample": leakage_pairs[:5],
            "leakage_rate": leakage_rate
        },

        "domain_drift": drift_details,

        "scores": {
            "near_duplicate_score":         dup_score,
            "annotation_consistency_score": ann_score,
            "label_noise_score":            label_score,
            "toxicity_score":               tox_score,
            "bias_score":                   bias_score,
            "language_consistency_score":   lang_score,
            "data_leakage_score":           leakage_score,
            "domain_drift_score":           drift_score
        },

        "audit_score": audit_score,
        "verdict": verdict,
        "summary": summary,
        "recommendations": recs,

        "turn_results": {
            "turn1_overview":              t1,
            "turn2_near_dupes_annotation": t2,
            "turn3_label_noise_toxicity":  t3,
            "turn4_bias_language":         t4
        },

        "status": "completed"
    }

    print(json.dumps(result, indent=2))
    return result


# ─────────────────────────────────────────────
# MAIN
# ─────────────────────────────────────────────

# Interactive driver: ask for a dataset, run the five turns with a pause between
# them, then print the final report and save it to task3_output.json.
if __name__ == "__main__":
    print("=" * 60)
    print("  TASK 3 — BIAS & QUALITY AUDIT (HARD, 5-TURN)")
    print("  Checks: Near-Dupes | Annotation Inconsistency |")
    print("  Label Noise | Toxicity | Bias | Language | Leakage | Drift")
    print("=" * 60)

    # An empty answer falls back to the imdb dataset.
    user_entry = input("\nEnter HuggingFace dataset name (e.g. imdb  OR  cardiffnlp/tweet_eval hate): ").strip()
    if not user_entry:
        user_entry = "imdb"

    dataset_name, config = clean_dataset_name(user_entry)
    print(f"   📌 Dataset : {dataset_name}")
    if config:
        print(f"   📌 Config  : {config}")

    df, split = fetch_dataset_rows(dataset_name, config=config, num_rows=100)

    if df.empty:
        print("❌ Could not load dataset.")
        print("   Tips:")
        print("   • Use just the name, e.g.  imdb  or  dair-ai/emotion")
        print("   • For tweet_eval use:  cardiffnlp/tweet_eval hate")
        print("   • Do NOT paste the full URL")
        # sys.exit is always available; the bare exit() helper is injected by
        # the site module and is missing when Python runs without it.
        sys.exit(1)

    t1 = turn1_overview(df, dataset_name, split)
    input("\n⏎  Press Enter → Turn 2 (Near-Dupes + Annotation)...")

    t2 = turn2_near_dupes_and_annotation(df)
    input("\n⏎  Press Enter → Turn 3 (Label Noise + Toxicity)...")

    t3 = turn3_label_noise_and_toxicity(df)
    input("\n⏎  Press Enter → Turn 4 (Bias + Language)...")

    t4 = turn4_bias_and_language(df)
    input("\n⏎  Press Enter → Turn 5 (Leakage + Drift + Final Report)...")

    final = turn5_leakage_drift_report(df, dataset_name, split, t1, t2, t3, t4, config=config)

    print("\n" + "=" * 60)
    print("✅ TASK 3 COMPLETE — Copy the JSON below into grader3.py")
    print("=" * 60)
    print(json.dumps(final, indent=2))

    # Explicit encoding so the file is written the same way on every platform.
    with open("task3_output.json", "w", encoding="utf-8") as f:
        json.dump(final, f, indent=2)
    print("\n📄 Output saved to: task3_output.json")