Spaces:

Adithya765
/

dataset-quality-env-fixed

Sleeping

App Files Files Community

dataset-quality-env-fixed / tasks /task3_hard.py

Adithya765

final fix: clamp all scores strictly

8fd29dc about 2 months ago

raw

history blame contribute delete

30.8 kB

	"""
	task3_hard.py
	=============
	Task 3 — Bias & Quality Audit (Hard, 5-Turn)
	OpenEnv Project | Meta × Hugging Face Hackathon

	What it does:
	Runs a 5-turn deep audit on a HuggingFace dataset
	covering the hardest quality and bias checks.

	Checks (9 across 5 turns):
	Turn 1: Column profiling
	Turn 2: Near-duplicate detection + Annotation inconsistency
	Turn 3: Label noise + Toxicity + Hate speech
	Turn 4: Bias detection (gender/racial/cultural) + Linguistic diversity
	Turn 5: Data leakage + Domain drift + Final audit report

	Usage:
	python task3_hard.py
	→ Enter dataset name (e.g. dair-ai/emotion)
	→ Press Enter to progress through each turn
	→ Output saved to task3_output.json
	→ Copy JSON output into grader3.py

	Requirements:
	pip install datasets pandas numpy scikit-learn langdetect
	"""
	"""
	Task 3 — Bias & Quality Audit (HARD)
	=====================================
	A 5-turn multi-step agent task with 8 hard checks.

	Turn 1 → Dataset Overview + Column Profiling
	Turn 2 → Near-Duplicate Detection + Annotation Inconsistency
	Turn 3 → Label Noise + Toxicity / Hate Speech Detection
	Turn 4 → Bias Detection + Linguistic Diversity
	Turn 5 → Data Leakage + Domain Drift + Final Audit Report

	Compatible with: Google Colab, Python 3.8+
	Install: pip install datasets pandas numpy scikit-learn langdetect  (difflib is part of the standard library and needs no install)
	"""

	import json
	import re
	import difflib
	import warnings
	import subprocess
	import sys
	from collections import Counter, defaultdict

	warnings.filterwarnings("ignore")

	def clamp_score(x):
	    """Clamp *x* into the range [0.01, 0.99] and round it to 3 decimals."""
	    capped = min(0.99, x)        # upper bound first
	    floored = max(0.01, capped)  # then the lower bound
	    return round(floored, 3)
	# ─────────────────────────────────────────────
	# INSTALL DEPS
	# ─────────────────────────────────────────────

	def install(pkg):
	    """Install *pkg* quietly with pip, using the interpreter running this script.

	    Raises subprocess.CalledProcessError when pip exits with a non-zero status.
	    """
	    pip_command = [sys.executable, "-m", "pip", "install", pkg, "-q"]
	    subprocess.check_call(pip_command)

	# pip distribution name -> importable module name.
	# The two differ for scikit-learn (imported as ``sklearn``); deriving the
	# module name by replacing "-" with "_" made that import fail on every run
	# and triggered a needless ``pip install`` each time the script started.
	_REQUIRED_PACKAGES = {
	    "datasets": "datasets",
	    "pandas": "pandas",
	    "numpy": "numpy",
	    "scikit-learn": "sklearn",
	    "langdetect": "langdetect",
	}

	for pkg, _module_name in _REQUIRED_PACKAGES.items():
	    try:
	        __import__(_module_name)
	    except ImportError:
	        # Only install what is genuinely missing.
	        install(pkg)

	import pandas as pd
	import numpy as np
	from datasets import load_dataset
	from langdetect import detect, LangDetectException

	# ─────────────────────────────────────────────
	# WORD LISTS
	# ─────────────────────────────────────────────

	# Gendered vocabulary looked up (as whole lowercase tokens) by the bias check.
	GENDER_BIAS_WORDS = [
	    # pronouns
	    "he", "she", "him", "her", "his", "hers",
	    # generic person nouns
	    "man", "woman", "men", "women", "male", "female", "boy", "girl",
	    # family roles
	    "husband", "wife", "father", "mother",
	    "son", "daughter", "brother", "sister",
	    # titles / forms of address
	    "mr", "mrs", "ms", "sir", "madam",
	    # royalty
	    "king", "queen", "prince", "princess",
	    # gender-marked occupations
	    "actor", "actress", "waiter", "waitress",
	    "policeman", "policewoman", "fireman", "stewardess",
	    # generic-masculine compounds
	    "mankind", "manpower",
	]

	# Race / ethnicity / religion / migration vocabulary used by the bias check.
	RACIAL_BIAS_WORDS = [
	    # race and ethnicity
	    "black", "white", "asian", "hispanic", "latino", "latina",
	    "african", "caucasian", "arab",
	    # religion
	    "jewish", "muslim", "christian", "hindu",
	    # migration / group status
	    "immigrant", "refugee", "foreigner", "native", "minority",
	    # generic race terms
	    "ethnic", "race", "racial", "colored", "indigenous", "tribal",
	]

	# Culture-loaded vocabulary used by the bias check.
	# Note: "third world" is the only multi-word entry in this list.
	CULTURAL_BIAS_WORDS = [
	    # regions
	    "western", "eastern", "american", "european",
	    # development framing
	    "third world", "developing",
	    # value judgements
	    "primitive", "civilized", "savage", "barbarian", "exotic", "oriental",
	    "traditional", "modern", "backward", "progressive", "uncivilized",
	]

	# Insults and aggressive terms matched as whole tokens by the toxicity check.
	TOXIC_WORDS = [
	    "idiot", "stupid", "moron", "loser",
	    "trash", "garbage", "hate", "kill",
	    "die", "retard", "freak", "ugly",
	    "fat", "dumb", "worthless", "scum",
	    "disgusting", "pathetic", "monster", "pervert",
	    "creep",
	]

	# Regexes applied with re.search() to lowercased text by the hate-speech check.
	# The alternations must use a bare "|": an escaped "\|" matches a literal pipe
	# character, which made every alternative group unmatchable.
	HATE_SPEECH_PATTERNS = [
	    # "<all|those|these> <group> <people|guys|men|women> <are|should|must|deserve>"
	    r'\b(all|those|these)\s+(black|white|asian|muslim|jewish|gay|trans)\s+(people|guys|men|women)\s+(are|should|must|deserve)\b',
	    # "go back to <somewhere>"
	    r'\b(go back to)\s+\w+',
	    # "ban all <something>"
	    r'\b(ban all)\s+\w+',
	    # "<they|them> <exclusionary or dehumanising predicate>"
	    r'\b(they|them)\s+(don\'t belong|should leave|are inferior|are criminals)\b',
	]

	# Positive-sentiment lexicon used by the label-noise heuristic.
	POSITIVE_WORDS = [
	    "great", "excellent", "amazing", "wonderful", "fantastic",
	    "best", "good", "beautiful", "smart", "brilliant",
	    "awesome", "perfect", "love", "superb", "outstanding",
	    "exceptional", "magnificent", "enjoyed", "loved",
	]

	# Negative-sentiment lexicon used by the label-noise heuristic.
	NEGATIVE_WORDS = [
	    "terrible", "awful", "horrible", "disgusting", "hate",
	    "worst", "bad", "poor", "ugly", "stupid",
	    "dumb", "trash", "garbage", "useless", "worthless",
	    "pathetic", "loser", "failure", "boring", "waste",
	    "dreadful",
	]

	# ─────────────────────────────────────────────
	# HELPERS
	# ─────────────────────────────────────────────

	def clean_dataset_name(name: str) -> tuple:
	    """
	    Normalise user input into ``(dataset_name, config_or_None)``.

	    Accepted forms:
	    - plain name:       imdb
	    - name + config:    cardiffnlp/tweet_eval hate
	    - full HF URL:      https://huggingface.co/datasets/cardiffnlp/tweet_eval
	    - HF URL copied from a sub-page of the dataset, e.g.
	      ``.../cardiffnlp/tweet_eval/tree/main`` or ``.../imdb?row=3``; the page
	      suffix, query string and fragment are dropped.

	    A config is recognised only when the input consists of exactly two
	    whitespace-separated tokens; otherwise the config is ``None``.
	    """
	    # Path segments that belong to the Hub web UI rather than to a dataset id.
	    hub_page_segments = {"tree", "blob", "resolve", "viewer", "discussions", "commits"}

	    name = name.strip()
	    if name.startswith("http"):
	        name, replaced = re.subn(r"https?://huggingface\.co/datasets/", "", name)
	        if replaced:
	            # Only the first whitespace-separated token is URL path; whatever
	            # follows (an optional config) is kept exactly as typed.
	            pieces = re.split(r"(\s+)", name, maxsplit=1)
	            url_path = re.split(r"[?#]", pieces[0], maxsplit=1)[0]
	            kept = []
	            for segment in url_path.strip("/").split("/"):
	                # Never cut at the first segment: it is always part of the id.
	                if kept and segment in hub_page_segments:
	                    break
	                kept.append(segment)
	            pieces[0] = "/".join(kept)
	            name = "".join(pieces)
	        name = name.strip("/")

	    # Check if user passed config after a space e.g. "cardiffnlp/tweet_eval hate"
	    parts = name.split()
	    if len(parts) == 2:
	        return parts[0], parts[1]
	    return name, None


	def fetch_dataset_rows(dataset_name: str, config: str = None, num_rows: int = 100):
	    """Stream up to *num_rows* rows from the first split that yields data.

	    Splits are tried in the order train, test, validation. When *config* is
	    None, a built-in default config is used for the datasets listed below.

	    Returns ``(DataFrame, split_name)``, or ``(empty DataFrame, None)`` when no
	    split could be loaded. Load errors are printed, not raised.
	    """
	    shown_name = f"{dataset_name}"
	    if config:
	        shown_name += f" ({config})"
	    print(f"\n📦 Loading dataset: {shown_name} ...")

	    # Known datasets that require a config — default config map
	    fallback_configs = {
	        "cardiffnlp/tweet_eval": "hate",
	        "nyu-mll/glue": "sst2",
	        "glue": "sst2",
	    }
	    if config is None:
	        config = fallback_configs.get(dataset_name)

	    # The config is passed positionally only when one is set.
	    positional = (dataset_name, config) if config else (dataset_name,)

	    for split in ("train", "test", "validation"):
	        try:
	            stream = load_dataset(*positional, split=split, streaming=True)
	            collected = []
	            for record in stream:
	                if len(collected) >= num_rows:
	                    break
	                collected.append(record)
	            if not collected:
	                # Split exists but produced nothing; try the next one.
	                continue
	            df = pd.DataFrame(collected)
	            print(f"✅ Loaded {len(df)} rows, {len(df.columns)} columns from '{split}' split")
	            print(f" Columns: {list(df.columns)}")
	            return df, split
	        except Exception as e:
	            print(f" ⚠️ Split '{split}' failed: {str(e)[:80]}")
	    print("❌ Could not load any split.")
	    return pd.DataFrame(), None


	def fetch_second_split(dataset_name: str, first_split: str, config: str = None, num_rows: int = 50):
	    """Fetch up to *num_rows* rows from a split other than *first_split*.

	    Used for the data-leakage check. Splits are tried in the order train,
	    test, validation, skipping *first_split*.

	    Returns ``(DataFrame, split_name)``, or ``(empty DataFrame, None)`` when no
	    other split could be loaded. Load errors are printed, not raised.
	    """
	    # Same defaults as fetch_dataset_rows: callers pass the user's raw config
	    # (often None), and without this fallback every split of a dataset that
	    # requires a config fails to load, silently skipping the leakage check.
	    default_configs = {
	        "cardiffnlp/tweet_eval": "hate",
	        "nyu-mll/glue": "sst2",
	        "glue": "sst2",
	    }
	    if config is None:
	        config = default_configs.get(dataset_name)

	    for split in ["train", "test", "validation"]:
	        if split == first_split:
	            continue
	        try:
	            if config:
	                ds = load_dataset(dataset_name, config, split=split, streaming=True)
	            else:
	                ds = load_dataset(dataset_name, split=split, streaming=True)
	            rows = []
	            for i, row in enumerate(ds):
	                if i >= num_rows:
	                    break
	                rows.append(row)
	            if rows:
	                return pd.DataFrame(rows), split
	        except Exception as e:
	            # Best effort: report the failure (as fetch_dataset_rows does) and
	            # move on to the next candidate split.
	            print(f" ⚠️ Split '{split}' failed: {str(e)[:80]}")
	            continue
	    return pd.DataFrame(), None


	def get_text_columns(df: pd.DataFrame):
	    """Return the names of columns that look like free text.

	    A column qualifies when it has object dtype or a pandas string dtype, and
	    more than 30% of its non-null values are ``str`` instances longer than
	    20 characters. Columns with no non-null values never qualify.
	    """
	    text_columns = []
	    for col in df.columns:
	        series = df[col]
	        # Dedicated string dtypes (e.g. "string") do not compare equal to
	        # ``object``, so they need their own check or they would be ignored.
	        holds_strings = series.dtype == object or pd.api.types.is_string_dtype(series.dtype)
	        if not holds_strings:
	            continue
	        values = series.dropna()
	        if values.empty:
	            continue
	        long_text_share = sum(isinstance(v, str) and len(v) > 20 for v in values) / len(values)
	        if long_text_share > 0.3:
	            text_columns.append(col)
	    return text_columns


	def get_label_columns(df: pd.DataFrame):
	    """Return the names of columns with 2 to 20 distinct non-null values.

	    Columns whose cells are unhashable (lists, dicts) are skipped: pandas
	    cannot count their distinct values and raises ``TypeError``.
	    """
	    label_columns = []
	    for col in df.columns:
	        try:
	            distinct = df[col].nunique()
	        except TypeError:
	            # Unhashable cell values — not usable as a categorical label.
	            continue
	        if 1 < distinct <= 20:
	            label_columns.append(col)
	    return label_columns


	def similarity_ratio(s1: str, s2: str) -> float:
	    """Return the difflib similarity (0.0 to 1.0) of two strings.

	    Both strings are lowercased and stripped of surrounding whitespace
	    before they are compared.
	    """
	    left = s1.lower().strip()
	    right = s2.lower().strip()
	    matcher = difflib.SequenceMatcher(None, left, right)
	    return matcher.ratio()


	def detect_language(text: str) -> str:
	    """Return langdetect's language code for *text*, or "unknown" on failure.

	    langdetect is non-deterministic on short or ambiguous text unless its seed
	    is fixed, so the seed is pinned before each detection to keep the
	    language statistics (and the scores derived from them) reproducible.
	    """
	    # Function-scope import keeps this fix self-contained.
	    from langdetect import DetectorFactory

	    DetectorFactory.seed = 0
	    try:
	        return detect(str(text))
	    except LangDetectException:
	        # Raised when no language can be detected for the text.
	        return "unknown"


	# ─────────────────────────────────────────────
	# TURN 1 — Dataset Overview + Column Profiling
	# ─────────────────────────────────────────────

	def turn1_overview(df: pd.DataFrame, dataset_name: str, split: str) -> dict:
	    """Turn 1: profile every column and flag obvious limitations.

	    For each column the report holds dtype, missing count and rate, number of
	    distinct values, average string length (object columns only) and up to
	    three sample values truncated to 60 characters.

	    Prints the result as JSON and returns it as a dict.
	    """
	    print("\n" + "="*60)
	    print("TURN 1 — Dataset Overview & Column Profiling")
	    print("="*60)

	    row_count = len(df)
	    columns_info = {}
	    for col in df.columns:
	        series = df[col]
	        present = series.dropna()
	        missing = int(series.isnull().sum())
	        try:
	            nuniq = int(series.nunique())
	        except TypeError:
	            # Cells holding lists/dicts are unhashable; count distinct
	            # string representations instead of crashing the whole turn.
	            nuniq = int(present.astype(str).nunique())
	        avg_len = None
	        # An all-null column would yield NaN here, which json.dumps emits as
	        # the non-standard token NaN; keep None in that case.
	        if series.dtype == object and not present.empty:
	            avg_len = round(float(present.astype(str).apply(len).mean()), 1)
	        columns_info[col] = {
	            "dtype": str(series.dtype),
	            "missing": missing,
	            "missing_rate": round(missing / max(row_count, 1), 3),
	            "unique_values": nuniq,
	            "avg_text_length": avg_len,
	            "sample": [str(s)[:60] for s in present.iloc[:3].tolist()]
	        }

	    text_cols = get_text_columns(df)
	    label_cols = get_label_columns(df)

	    flags = []
	    if row_count < 20:
	        flags.append("Very small dataset — statistical checks may be unreliable")
	    if not text_cols:
	        flags.append("No long-text columns — bias/toxicity checks will be limited")
	    if not label_cols:
	        flags.append("No label columns — label noise check will be skipped")

	    result = {
	        "turn": 1,
	        "turn_name": "dataset_overview",
	        "dataset_name": dataset_name,
	        "split_used": split,
	        "total_rows": row_count,
	        "total_columns": len(df.columns),
	        "column_names": list(df.columns),
	        "columns_info": columns_info,
	        "text_columns_detected": text_cols,
	        "label_columns_detected": label_cols,
	        "initial_flags": flags,
	        "status": "completed"
	    }
	    print(json.dumps(result, indent=2))
	    return result


	# ─────────────────────────────────────────────
	# TURN 2 — Near-Duplicates + Annotation Inconsistency
	# ─────────────────────────────────────────────

	def turn2_near_dupes_and_annotation(df: pd.DataFrame, max_rows: int = 150) -> dict:
	    """Turn 2: find near-duplicate texts and similar texts with different labels.

	    Only the first detected text column and the first detected label column
	    are used. Every pair among the first *max_rows* non-null texts is compared
	    once with ``similarity_ratio``:

	    - 0.85 <= ratio < 1.0               -> near-duplicate pair
	    - ratio >= 0.80 and labels differ   -> annotation inconsistency

	    ``row_i`` / ``row_j`` are positions within the non-null texts, not
	    DataFrame index labels. Prints the result as JSON and returns it.
	    """
	    print("\n" + "="*60)
	    print("TURN 2 — Near-Duplicate Detection + Annotation Inconsistency")
	    print("="*60)

	    text_cols = get_text_columns(df)
	    label_cols = get_label_columns(df)

	    near_dupes = []
	    annotation_issues = []

	    if text_cols:
	        text_col = text_cols[0]
	        # Select texts and labels with the same mask: dropping null texts
	        # alone shifts every later label by one position per missing text.
	        has_text = df[text_col].notna()
	        texts = df.loc[has_text, text_col].astype(str).tolist()
	        labels = None
	        if label_cols:
	            labels = df.loc[has_text, label_cols[0]].astype(str).tolist()

	        limit = min(len(texts), max_rows)
	        for i in range(limit):
	            for j in range(i + 1, limit):
	                # One similarity computation serves both checks.
	                ratio = similarity_ratio(texts[i], texts[j])

	                # ── Near-Duplicates ── (exact duplicates are excluded)
	                if 0.85 <= ratio < 1.0:
	                    near_dupes.append({
	                        "row_i": i, "row_j": j,
	                        "similarity": round(ratio, 3),
	                        "text_i": texts[i][:80],
	                        "text_j": texts[j][:80]
	                    })

	                # ── Annotation Inconsistency ──
	                if labels is not None and ratio >= 0.80 and labels[i] != labels[j]:
	                    annotation_issues.append({
	                        "row_i": i, "row_j": j,
	                        "similarity": round(ratio, 3),
	                        "text_i": texts[i][:80],
	                        "text_j": texts[j][:80],
	                        "label_i": labels[i],
	                        "label_j": labels[j],
	                        "issue": "Similar texts have different labels"
	                    })

	    total = len(near_dupes) + len(annotation_issues)
	    severity = "HIGH" if total > 8 else "MEDIUM" if total > 3 else "LOW"

	    result = {
	        "turn": 2,
	        "turn_name": "near_duplicates_and_annotation_inconsistency",
	        "near_duplicates_found": len(near_dupes),
	        "near_duplicate_pairs": near_dupes[:10],
	        "annotation_inconsistencies_found": len(annotation_issues),
	        "annotation_inconsistency_samples": annotation_issues[:10],
	        "total_issues": total,
	        "severity": severity,
	        "status": "completed"
	    }
	    print(json.dumps(result, indent=2))
	    return result


	# ─────────────────────────────────────────────
	# TURN 3 — Label Noise + Toxicity / Hate Speech
	# ─────────────────────────────────────────────

	def turn3_label_noise_and_toxicity(df: pd.DataFrame) -> dict:
	    """Turn 3: label distribution, label-noise heuristic, toxicity and hate speech.

	    Uses the first detected text column and the first detected label column.

	    - Class imbalance: rarest / most frequent label count below 0.3.
	    - Label noise (only when the labels look like sentiment labels): at least
	      3 positive lexicon words and no negative ones under a negative label,
	      or the reverse.
	    - Toxicity: any TOXIC_WORDS entry present as a whole token.
	    - Hate speech: first HATE_SPEECH_PATTERNS regex that matches.

	    Prints the result as JSON and returns it as a dict.
	    """
	    print("\n" + "="*60)
	    print("TURN 3 — Label Noise + Toxicity / Hate Speech Detection")
	    print("="*60)

	    text_cols = get_text_columns(df)
	    label_cols = get_label_columns(df)
	    text_col = text_cols[0] if text_cols else None
	    label_col = label_cols[0] if label_cols else None

	    # ── Label distribution / class imbalance ──
	    label_distribution = {}
	    class_imbalance = False
	    if label_col is not None:
	        counts = df[label_col].value_counts().to_dict()
	        label_distribution = {str(k): int(v) for k, v in counts.items()}
	        if counts:
	            max_c, min_c = max(counts.values()), min(counts.values())
	            class_imbalance = bool(max_c > 0 and (min_c / max_c) < 0.3)

	    # The label-noise heuristic only applies to sentiment-style labels.
	    is_sentiment = False
	    if text_col is not None and label_col is not None:
	        label_vals = {str(v).lower() for v in df[label_col].dropna().unique()}
	        is_sentiment = any(
	            v in label_vals for v in ("positive", "negative", "0", "1", "pos", "neg")
	        )

	    noisy_labels = []
	    toxic_rows = []
	    hate_speech_rows = []

	    if text_col is not None:
	        labels = df[label_col] if is_sentiment else [None] * len(df)
	        for idx, raw_text, raw_label in zip(df.index, df[text_col], labels):
	            try:
	                row_id = int(idx)
	            except (TypeError, ValueError):
	                # Rows whose index label is not an integer cannot be reported.
	                continue

	            snippet = str(raw_text)[:100]
	            text = str(raw_text).lower()
	            words = set(re.findall(r'\b\w+\b', text))

	            # ── Label Noise ──
	            if is_sentiment:
	                label = str(raw_label).lower()
	                # Whole-token matching, like the toxicity check below:
	                # substring matching counted "hate" inside "whatever" and
	                # "bad" inside "badge".
	                pos = sum(1 for w in POSITIVE_WORDS if w in words)
	                neg = sum(1 for w in NEGATIVE_WORDS if w in words)
	                if pos >= 3 and neg == 0 and label in ["negative", "neg", "0"]:
	                    noisy_labels.append({"row": row_id, "text": snippet,
	                                         "label": str(raw_label),
	                                         "issue": "Strong positive text → negative label"})
	                elif neg >= 3 and pos == 0 and label in ["positive", "pos", "1"]:
	                    noisy_labels.append({"row": row_id, "text": snippet,
	                                         "label": str(raw_label),
	                                         "issue": "Strong negative text → positive label"})

	            # ── Toxicity & Hate Speech ──
	            hits = [w for w in TOXIC_WORDS if w in words]
	            if hits:
	                toxic_rows.append({"row": row_id, "text": snippet,
	                                   "toxic_words_found": hits[:5]})
	            for pattern in HATE_SPEECH_PATTERNS:
	                if re.search(pattern, text):
	                    hate_speech_rows.append({"row": row_id,
	                                             "text": snippet,
	                                             "pattern_matched": pattern})
	                    break  # one report per row is enough

	    noise_rate = round(len(noisy_labels) / max(len(df), 1), 3)
	    tox_rate = round(len(toxic_rows) / max(len(df), 1), 3)
	    if noise_rate > 0.1 or tox_rate > 0.1:
	        severity = "HIGH"
	    elif noise_rate > 0.03 or tox_rate > 0.03:
	        severity = "MEDIUM"
	    else:
	        severity = "LOW"

	    result = {
	        "turn": 3,
	        "turn_name": "label_noise_and_toxicity",
	        "label_distribution": label_distribution,
	        "class_imbalance_detected": class_imbalance,
	        "noisy_labels_found": len(noisy_labels),
	        "noisy_label_samples": noisy_labels[:10],
	        "label_noise_rate": noise_rate,
	        "toxic_rows_found": len(toxic_rows),
	        "toxic_samples": toxic_rows[:10],
	        "hate_speech_rows_found": len(hate_speech_rows),
	        "hate_speech_samples": hate_speech_rows[:5],
	        "toxicity_rate": tox_rate,
	        "severity": severity,
	        "status": "completed"
	    }
	    print(json.dumps(result, indent=2))
	    return result


	# ─────────────────────────────────────────────
	# TURN 4 — Bias Detection + Linguistic Diversity
	# ─────────────────────────────────────────────

	def turn4_bias_and_language(df: pd.DataFrame) -> dict:
	    """Turn 4: count bias-vocabulary mentions and profile detected languages.

	    Uses the first detected text column.

	    - Bias: per category (gender / racial / cultural) the number of rows
	      mentioning at least one lexicon entry, the 10 most frequent entries and
	      up to 5 affected row ids. ``bias_rate`` is total mentions divided by
	      (rows x 3 categories); ``bias_score`` is ``1 - bias_rate``.
	    - Language: ``detect_language`` is run on the first 100 non-null texts.
	      ``row`` in ``non_english_samples`` is the position within those texts.

	    Prints the result as JSON and returns it as a dict.
	    """
	    print("\n" + "="*60)
	    print("TURN 4 — Bias Detection + Linguistic Diversity")
	    print("="*60)

	    text_cols = get_text_columns(df)
	    lexicons = {
	        "gender_bias": GENDER_BIAS_WORDS,
	        "racial_bias": RACIAL_BIAS_WORDS,
	        "cultural_bias": CULTURAL_BIAS_WORDS,
	    }
	    bias_report = {
	        key: {"mentions": 0, "words_found": {}, "affected_rows": []}
	        for key in lexicons
	    }

	    if text_cols:
	        text_col = text_cols[0]
	        # A multi-word entry such as "third world" can never equal a single
	        # token, so phrases are matched against the text with word boundaries.
	        phrase_patterns = {
	            term: re.compile(r"\b" + r"\s+".join(re.escape(p) for p in term.split()) + r"\b")
	            for terms in lexicons.values()
	            for term in terms
	            if " " in term
	        }
	        counters = {key: Counter() for key in lexicons}

	        for idx, raw_text in zip(df.index, df[text_col]):
	            try:
	                row_id = int(idx)
	            except (TypeError, ValueError):
	                # Rows whose index label is not an integer cannot be reported.
	                continue
	            text = str(raw_text).lower()
	            tokens = set(re.findall(r'\b\w+\b', text))
	            for key, terms in lexicons.items():
	                hits = [
	                    term for term in terms
	                    if (phrase_patterns[term].search(text)
	                        if term in phrase_patterns else term in tokens)
	                ]
	                if not hits:
	                    continue
	                entry = bias_report[key]
	                entry["mentions"] += 1
	                counters[key].update(hits)
	                if len(entry["affected_rows"]) < 5:
	                    entry["affected_rows"].append(row_id)

	        for key, counter in counters.items():
	            bias_report[key]["words_found"] = dict(counter.most_common(10))

	    total_bias = sum(v["mentions"] for v in bias_report.values())
	    bias_rate = round(total_bias / max(len(df) * 3, 1), 3)
	    bias_score = round(max(0.0, 1.0 - bias_rate), 3)

	    # ── Linguistic Diversity ──
	    lang_counts = Counter()
	    non_english = []

	    if text_cols:
	        text_col = text_cols[0]
	        sample = df[text_col].dropna().astype(str).tolist()[:100]
	        for i, text in enumerate(sample):
	            lang = detect_language(text)
	            lang_counts[lang] += 1
	            if lang not in ("en", "unknown") and len(non_english) < 5:
	                non_english.append({"row": i, "text": text[:80], "detected_lang": lang})

	    total_detected = sum(lang_counts.values())
	    english_pct = round(lang_counts.get("en", 0) / max(total_detected, 1), 3)
	    is_multilingual = any(lang not in ("en", "unknown") for lang in lang_counts)

	    if bias_rate > 0.3:
	        bias_severity = "HIGH"
	    elif bias_rate > 0.1:
	        bias_severity = "MEDIUM"
	    else:
	        bias_severity = "LOW"

	    result = {
	        "turn": 4,
	        "turn_name": "bias_and_linguistic_diversity",
	        "bias_report": bias_report,
	        "total_bias_mentions": total_bias,
	        "bias_rate": bias_rate,
	        "bias_score": bias_score,
	        "bias_severity": bias_severity,
	        "language_distribution": dict(lang_counts),
	        "english_percentage": english_pct,
	        "is_multilingual": is_multilingual,
	        "non_english_samples": non_english,
	        "language_diversity_flag": is_multilingual,
	        "status": "completed"
	    }
	    print(json.dumps(result, indent=2))
	    return result


	# ─────────────────────────────────────────────
	# TURN 5 — Data Leakage + Domain Drift + Final Report
	# ─────────────────────────────────────────────

	def turn5_leakage_drift_report(
	    df: pd.DataFrame, dataset_name: str, first_split: str,
	    t1: dict, t2: dict, t3: dict, t4: dict,
	    config: str = None
	) -> dict:
	    """Turn 5: leakage check, drift check, scoring and the final audit report.

	    - Data leakage: the first 50 texts of *df* are compared with the first 50
	      texts of another split; pairs with similarity >= 0.90 are reported.
	      ``leakage_rate`` is the number of pairs divided by the number of texts
	      taken from *df*.
	    - Domain drift (needs at least 20 texts): the texts are split in half;
	      drift is flagged when the vocabulary Jaccard overlap is below 0.40 or
	      the average text lengths differ by more than 50 characters.
	    - Scoring: eight sub-scores, each passed through ``clamp_score``, are
	      combined into a weighted ``audit_score`` that selects the verdict.

	    *t1* to *t4* are the result dicts of the previous turns; they are embedded
	    under ``turn_results``. Prints the report as JSON and returns it.
	    """
	    print("\n" + "="*60)
	    print("TURN 5 — Data Leakage + Domain Drift + Final Audit Report")
	    print("="*60)

	    text_cols = get_text_columns(df)
	    leakage_pairs = []
	    leakage_rate = 0.0
	    split2 = None

	    # ── Data Leakage ──
	    print(" 🔍 Fetching second split for leakage check...")
	    df2, split2 = fetch_second_split(dataset_name, first_split, config=config, num_rows=50)

	    if not df2.empty and text_cols:
	        text_col = text_cols[0]
	        texts1 = df[text_col].dropna().astype(str).tolist()[:50]
	        # The other split may lack the text column; then there is nothing to compare.
	        texts2 = df2[text_col].dropna().astype(str).tolist()[:50] if text_col in df2.columns else []
	        for i, t1_text in enumerate(texts1):
	            for j, t2_text in enumerate(texts2):
	                ratio = similarity_ratio(t1_text, t2_text)
	                if ratio >= 0.90:
	                    leakage_pairs.append({
	                        f"{first_split}_row": i, f"{split2}_row": j,
	                        "similarity": round(ratio, 3),
	                        f"{first_split}_text": t1_text[:80],
	                        f"{split2}_text": t2_text[:80]
	                    })
	        leakage_rate = round(len(leakage_pairs) / max(len(texts1), 1), 3)

	    # ── Domain Drift ──
	    drift_detected = False
	    drift_details = {}

	    if text_cols:
	        text_col = text_cols[0]
	        texts = df[text_col].dropna().astype(str).tolist()
	        if len(texts) >= 20:
	            mid = len(texts) // 2
	            first_half = texts[:mid]
	            second_half = texts[mid:]
	            # float(...) keeps plain Python numbers in the report.
	            avg_len_first = round(float(np.mean([len(t) for t in first_half])), 1)
	            avg_len_second = round(float(np.mean([len(t) for t in second_half])), 1)

	            def vocab(txts):
	                """Return the set of lowercase word tokens found in *txts*."""
	                w = set()
	                for t in txts:
	                    w.update(re.findall(r'\b\w+\b', t.lower()))
	                return w

	            v1, v2 = vocab(first_half), vocab(second_half)
	            # Set union is "|"; the escaped "\|" found here was a syntax error.
	            jaccard = round(len(v1 & v2) / max(len(v1 | v2), 1), 3)
	            # bool(...) guards against a numpy boolean, which json.dumps
	            # cannot serialise.
	            drift_detected = bool(jaccard < 0.40 or abs(avg_len_first - avg_len_second) > 50)

	            drift_details = {
	                "avg_text_length_first_half": avg_len_first,
	                "avg_text_length_second_half": avg_len_second,
	                "length_drift": round(abs(avg_len_first - avg_len_second), 1),
	                "vocabulary_overlap_jaccard": jaccard,
	                "drift_detected": drift_detected,
	                "interpretation": (
	                    "Significant topic/domain drift — dataset may not be uniform."
	                    if drift_detected else
	                    "No significant drift — dataset appears topically consistent."
	                )
	            }

	    # ── Scoring ──
	    row_count = max(len(df), 1)
	    dup_score = clamp_score(1.0 - (t2.get("near_duplicates_found", 0) / row_count) * 5)
	    ann_score = clamp_score(1.0 - (t2.get("annotation_inconsistencies_found", 0) / row_count) * 10)
	    label_score = clamp_score(1.0 - t3.get("label_noise_rate", 0) * 10)
	    tox_score = clamp_score(1.0 - t3.get("toxicity_rate", 0) * 5)
	    bias_score = clamp_score(t4.get("bias_score", 1.0))
	    lang_score = clamp_score(t4.get("english_percentage", 1.0))
	    leakage_score = clamp_score(1.0 - leakage_rate * 5)
	    drift_score = clamp_score(0.6 if drift_detected else 1.0)

	    # The weights sum to 1.0.
	    audit_score = clamp_score(
	        dup_score * 0.12 +
	        ann_score * 0.13 +
	        label_score * 0.15 +
	        tox_score * 0.15 +
	        bias_score * 0.15 +
	        lang_score * 0.10 +
	        leakage_score * 0.10 +
	        drift_score * 0.10
	    )

	    # Verdict
	    if audit_score >= 0.85:
	        verdict = "EXCELLENT"
	        summary = "Dataset passes all hard quality checks. Ready for training."
	    elif audit_score >= 0.70:
	        verdict = "GOOD"
	        summary = "Dataset is usable but has some issues worth addressing."
	    elif audit_score >= 0.50:
	        verdict = "NEEDS_WORK"
	        summary = "Multiple quality/bias issues found. Significant cleanup needed."
	    else:
	        verdict = "POOR"
	        summary = "Serious quality, bias, or leakage issues. Not recommended without major fixes."

	    # Recommendations
	    recs = []
	    if dup_score < 0.8:
	        recs.append("Remove near-duplicate rows to prevent overfitting.")
	    if ann_score < 0.8:
	        recs.append("Re-annotate inconsistently labelled similar texts.")
	    if label_score < 0.8:
	        recs.append("Review and correct noisy labels.")
	    if tox_score < 0.8:
	        recs.append("Filter toxic/hate speech content before training.")
	    if bias_score < 0.7:
	        recs.append("Audit and balance gender, racial, and cultural representation.")
	    if lang_score < 0.8:
	        recs.append("Filter non-English rows if multilingual content is unintentional.")
	    if leakage_score < 0.8:
	        recs.append("Data leakage detected between splits — re-split the dataset.")
	    if drift_detected:
	        recs.append("Topic drift detected — verify dataset was collected from a consistent source.")
	    if not recs:
	        recs.append("Dataset passed all checks. Consider expanding size for better generalization.")

	    result = {
	        "task_id": "task3_hard",
	        "turn": 5,
	        "turn_name": "leakage_drift_final_report",
	        "dataset_name": dataset_name,
	        "total_rows_audited": len(df),

	        "data_leakage": {
	            "splits_compared": [first_split, split2] if split2 else [first_split],
	            "leakage_pairs_found": len(leakage_pairs),
	            "leakage_pairs_sample": leakage_pairs[:5],
	            "leakage_rate": leakage_rate
	        },

	        "domain_drift": drift_details,

	        "scores": {
	            "near_duplicate_score": dup_score,
	            "annotation_consistency_score": ann_score,
	            "label_noise_score": label_score,
	            "toxicity_score": tox_score,
	            "bias_score": bias_score,
	            "language_consistency_score": lang_score,
	            "data_leakage_score": leakage_score,
	            "domain_drift_score": drift_score
	        },

	        "audit_score": audit_score,
	        "verdict": verdict,
	        "summary": summary,
	        "recommendations": recs,

	        "turn_results": {
	            "turn1_overview": t1,
	            "turn2_near_dupes_annotation": t2,
	            "turn3_label_noise_toxicity": t3,
	            "turn4_bias_language": t4
	        },

	        "status": "completed"
	    }

	    print(json.dumps(result, indent=2))
	    return result


	# ─────────────────────────────────────────────
	# MAIN
	# ─────────────────────────────────────────────

	if __name__ == "__main__":
	    # Interactive driver: ask for a dataset, run the five turns in order
	    # (pausing for Enter between them), then print and save the final report.
	    print("=" * 60)
	    print(" TASK 3 — BIAS & QUALITY AUDIT (HARD, 5-TURN)")
	    # "|" is written plainly: "\|" is an invalid escape sequence in a normal
	    # string and printed a stray backslash.
	    print(" Checks: Near-Dupes | Annotation Inconsistency |")
	    print(" Label Noise | Toxicity | Bias | Language | Leakage | Drift")
	    print("=" * 60)

	    user_entry = input("\nEnter HuggingFace dataset name (e.g. imdb OR cardiffnlp/tweet_eval hate): ").strip()
	    if not user_entry:
	        user_entry = "imdb"  # default dataset when nothing is typed

	    dataset_name, config = clean_dataset_name(user_entry)
	    print(f" 📌 Dataset : {dataset_name}")
	    if config:
	        print(f" 📌 Config : {config}")

	    df, split = fetch_dataset_rows(dataset_name, config=config, num_rows=100)

	    if df.empty:
	        print("❌ Could not load dataset.")
	        print(" Tips:")
	        print(" • Use just the name, e.g. imdb or dair-ai/emotion")
	        print(" • For tweet_eval use: cardiffnlp/tweet_eval hate")
	        print(" • Do NOT paste the full URL")
	        # sys.exit is always available; the bare exit() helper is injected by
	        # the site module and is meant for interactive sessions.
	        sys.exit(1)

	    t1 = turn1_overview(df, dataset_name, split)
	    input("\n⏎ Press Enter → Turn 2 (Near-Dupes + Annotation)...")

	    t2 = turn2_near_dupes_and_annotation(df)
	    input("\n⏎ Press Enter → Turn 3 (Label Noise + Toxicity)...")

	    t3 = turn3_label_noise_and_toxicity(df)
	    input("\n⏎ Press Enter → Turn 4 (Bias + Language)...")

	    t4 = turn4_bias_and_language(df)
	    input("\n⏎ Press Enter → Turn 5 (Leakage + Drift + Final Report)...")

	    final = turn5_leakage_drift_report(df, dataset_name, split, t1, t2, t3, t4, config=config)

	    print("\n" + "=" * 60)
	    print("✅ TASK 3 COMPLETE — Copy the JSON below into grader3.py")
	    print("=" * 60)
	    print(json.dumps(final, indent=2))

	    # Explicit encoding so the file does not depend on the platform default.
	    with open("task3_output.json", "w", encoding="utf-8") as f:
	        json.dump(final, f, indent=2)
	    print("\n📄 Output saved to: task3_output.json")