"""
task3_hard.py
=============
Task 3 — Bias & Quality Audit (Hard, 5-Turn)
OpenEnv Project | Meta × Hugging Face Hackathon

What it does:
    Runs a 5-turn deep audit on a HuggingFace dataset
    covering the hardest quality and bias checks.

Checks (9 across 5 turns):
    Turn 1: Column profiling
    Turn 2: Near-duplicate detection + Annotation inconsistency
    Turn 3: Label noise + Toxicity + Hate speech
    Turn 4: Bias detection (gender/racial/cultural) + Linguistic diversity
    Turn 5: Data leakage + Domain drift + Final audit report

Usage:
    python task3_hard.py
    → Enter dataset name (e.g. dair-ai/emotion)
    → Press Enter to progress through each turn
    → Output saved to task3_output.json
    → Copy JSON output into grader3.py

Requirements:
    pip install datasets pandas numpy scikit-learn langdetect
"""
"""
Task 3 — Bias & Quality Audit (HARD)
=====================================
A 5-turn multi-step agent task with 8 hard checks.

Turn 1 — Dataset Overview + Column Profiling
Turn 2 — Near-Duplicate Detection + Annotation Inconsistency
Turn 3 — Label Noise + Toxicity / Hate Speech Detection
Turn 4 — Bias Detection + Linguistic Diversity
Turn 5 — Data Leakage + Domain Drift + Final Audit Report

Compatible with: Google Colab, Python 3.8+
Install: pip install datasets pandas numpy scikit-learn langdetect
(difflib is part of the Python standard library and needs no installation.)
"""
|
|
import difflib
import importlib
import itertools
import json
import re
import subprocess
import sys
import warnings
from collections import Counter, defaultdict
|
|
# Suppress every Python warning for the whole process so the per-turn JSON
# printed to the console is not interleaved with warning text.
# NOTE: this is global and also hides deprecation warnings.
warnings.filterwarnings("ignore")
|
|
def clamp_score(x: float) -> float:
    """Clamp *x* into [0.01, 0.99] and round the result to 3 decimals.

    The bounds keep a score from ever being reported as exactly 0 or 1.
    """
    return round(max(0.01, min(0.99, x)), 3)
|
|
def install(pkg: str) -> None:
    """Quietly pip-install *pkg* with the interpreter that runs this script.

    Using ``sys.executable -m pip`` guarantees the package lands in the same
    environment the script is executing in.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    subprocess.check_call([sys.executable, "-m", "pip", "install", pkg, "-q"])
|
|
# pip distribution name -> importable module name.  The two differ for
# scikit-learn (imported as ``sklearn``): deriving the module name by swapping
# "-" for "_" yields ``scikit_learn``, which never imports, so the package
# would be reinstalled on every run.
_REQUIRED_PACKAGES = {
    "datasets": "datasets",
    "pandas": "pandas",
    "numpy": "numpy",
    "scikit-learn": "sklearn",
    "langdetect": "langdetect",
}

for _pip_name, _module_name in _REQUIRED_PACKAGES.items():
    try:
        importlib.import_module(_module_name)
    except ImportError:
        install(_pip_name)
        # Make the freshly installed package visible to the import system.
        importlib.invalidate_caches()


import pandas as pd
import numpy as np
from datasets import load_dataset
from langdetect import detect, LangDetectException
|
|
# ---------------------------------------------------------------------------
# Keyword lexicons used by the audit turns.
# All entries are lower-case; callers lower-case the text before matching.
# A hit only means the term occurs in the text, not that the text is biased
# or toxic.
# ---------------------------------------------------------------------------

# Gendered pronouns, titles and role nouns.
GENDER_BIAS_WORDS = [
    "he", "she", "him", "her", "his", "hers", "man", "woman", "men", "women",
    "male", "female", "boy", "girl", "husband", "wife", "father", "mother",
    "son", "daughter", "brother", "sister", "mr", "mrs", "ms", "sir", "madam",
    "king", "queen", "prince", "princess", "actor", "actress", "waiter", "waitress",
    "policeman", "policewoman", "fireman", "stewardess", "mankind", "manpower"
]


# Race, ethnicity, religion and migration-status terms.
RACIAL_BIAS_WORDS = [
    "black", "white", "asian", "hispanic", "latino", "latina", "african",
    "caucasian", "arab", "jewish", "muslim", "christian", "hindu", "immigrant",
    "refugee", "foreigner", "native", "minority", "ethnic", "race", "racial",
    "colored", "indigenous", "tribal"
]


# Terms that frame cultures or regions.  "third world" is a two-word phrase.
CULTURAL_BIAS_WORDS = [
    "western", "eastern", "american", "european", "third world", "developing",
    "primitive", "civilized", "savage", "barbarian", "exotic", "oriental",
    "traditional", "modern", "backward", "progressive", "uncivilized"
]


# Insults and aggressive vocabulary used for the toxicity check.
TOXIC_WORDS = [
    "idiot", "stupid", "moron", "loser", "trash", "garbage", "hate", "kill",
    "die", "retard", "freak", "ugly", "fat", "dumb", "worthless", "scum",
    "disgusting", "pathetic", "monster", "pervert", "creep"
]


# Regular expressions (written for lower-cased text) for hate-speech phrasing.
HATE_SPEECH_PATTERNS = [
    r'\b(all|those|these)\s+(black|white|asian|muslim|jewish|gay|trans)\s+(people|guys|men|women)\s+(are|should|must|deserve)\b',
    r'\b(go back to)\s+\w+',
    r'\b(ban all)\s+\w+',
    r'\b(they|them)\s+(don\'t belong|should leave|are inferior|are criminals)\b',
]


# Sentiment lexicons used by the label-noise heuristic.
POSITIVE_WORDS = [
    "great", "excellent", "amazing", "wonderful", "fantastic", "best", "good",
    "beautiful", "smart", "brilliant", "awesome", "perfect", "love", "superb",
    "outstanding", "exceptional", "magnificent", "enjoyed", "loved"
]


NEGATIVE_WORDS = [
    "terrible", "awful", "horrible", "disgusting", "hate", "worst", "bad",
    "poor", "ugly", "stupid", "dumb", "trash", "garbage", "useless",
    "worthless", "pathetic", "loser", "failure", "boring", "waste", "dreadful"
]
|
|
# Hub page sections that can follow the dataset id in a pasted URL.
_HUB_SUBPAGES = frozenset(
    {"viewer", "tree", "blob", "resolve", "discussions", "commits"}
)


def clean_dataset_name(name: str) -> tuple:
    """Normalise user input into ``(dataset_name, config_or_None)``.

    Accepted forms:
      - plain name:     imdb
      - name + config:  cardiffnlp/tweet_eval hate
      - full HF URL:    https://huggingface.co/datasets/cardiffnlp/tweet_eval

    For URLs the scheme/host prefix, any ``?query`` or ``#fragment`` and any
    trailing hub sub-page (``/viewer/...``, ``/tree/...``) are removed so only
    the dataset id is left.

    Input that is not exactly "name" or "name config" is returned unchanged
    as the name with ``None`` for the config.
    """
    name = name.strip()

    if name.startswith("http"):
        name = re.sub(r"^https?://huggingface\.co/datasets/", "", name)
        # Query strings and fragments are never part of a dataset id.
        name = re.split(r"[?#]", name, maxsplit=1)[0].strip("/")
        segments = name.split("/")
        # Cut at the first sub-page marker.  Position 0 is always part of the
        # id, so a dataset whose first segment is e.g. "viewer" is kept.
        for pos, segment in enumerate(segments):
            if pos > 0 and segment in _HUB_SUBPAGES:
                segments = segments[:pos]
                break
        name = "/".join(segments)

    parts = name.split()
    if len(parts) == 2:
        return parts[0], parts[1]
    return name, None
|
|
|
|
# Config used when the caller gives none for a dataset that needs one.
_CONFIG_DEFAULTS = {
    "cardiffnlp/tweet_eval": "hate",
    "nyu-mll/glue": "sst2",
    "glue": "sst2",
}

# Splits are tried in this order by both fetch helpers.
_SPLIT_ORDER = ("train", "test", "validation")


def _resolve_config(dataset_name, config):
    """Return *config*, or the known default for *dataset_name* if it is None."""
    if config is None:
        return _CONFIG_DEFAULTS.get(dataset_name)
    return config


def _stream_rows(dataset_name, config, split, num_rows):
    """Stream one split and return at most *num_rows* rows as a list."""
    if config:
        ds = load_dataset(dataset_name, config, split=split, streaming=True)
    else:
        ds = load_dataset(dataset_name, split=split, streaming=True)
    return list(itertools.islice(ds, max(num_rows, 0)))


def fetch_dataset_rows(dataset_name: str, config: str = None, num_rows: int = 100):
    """Load up to *num_rows* rows from the first split that yields data.

    Splits are tried in the order train, test, validation.  When *config* is
    None, a built-in default config is used for datasets listed in
    ``_CONFIG_DEFAULTS``.

    Returns:
        ``(DataFrame, split_name)`` on success, or ``(empty DataFrame, None)``
        when no split could be loaded.
    """
    label = f"{dataset_name}" + (f" ({config})" if config else "")
    print(f"\n📦 Loading dataset: {label} ...")

    config = _resolve_config(dataset_name, config)

    for split in _SPLIT_ORDER:
        try:
            rows = _stream_rows(dataset_name, config, split, num_rows)
        except Exception as e:
            # The loader can raise many unrelated error types (missing split,
            # network, auth); report the failure and try the next split.
            print(f"  ⚠️ Split '{split}' failed: {str(e)[:80]}")
            continue
        if rows:
            df = pd.DataFrame(rows)
            print(f"✅ Loaded {len(df)} rows, {len(df.columns)} columns from '{split}' split")
            print(f"   Columns: {list(df.columns)}")
            return df, split
    print("❌ Could not load any split.")
    return pd.DataFrame(), None


def fetch_second_split(dataset_name: str, first_split: str, config: str = None, num_rows: int = 50):
    """Load up to *num_rows* rows from a split other than *first_split*.

    Used by the data-leakage check.  The same default-config resolution as
    ``fetch_dataset_rows`` is applied so that datasets which require a config
    can be loaded here too when the caller passes ``config=None``.

    Returns:
        ``(DataFrame, split_name)``, or ``(empty DataFrame, None)`` when no
        other split could be loaded.
    """
    config = _resolve_config(dataset_name, config)

    for split in _SPLIT_ORDER:
        if split == first_split:
            continue
        try:
            rows = _stream_rows(dataset_name, config, split, num_rows)
        except Exception:
            # Best effort: a missing or unloadable split is expected here.
            continue
        if rows:
            return pd.DataFrame(rows), split
    return pd.DataFrame(), None
|
|
|
|
def get_text_columns(df: pd.DataFrame):
    """Return the columns that look like free text.

    A column qualifies when it has object or string dtype and more than 30%
    of its non-null values are ``str`` instances longer than 20 characters.
    Columns with no non-null values are skipped.
    """
    text_cols = []
    for col in df.columns:
        series = df[col]
        # Check both dtypes: text may be stored as object or as a dedicated
        # pandas string dtype, and ``dtype == object`` misses the latter.
        if not (pd.api.types.is_object_dtype(series)
                or pd.api.types.is_string_dtype(series)):
            continue
        values = series.dropna()
        if values.empty:
            continue
        long_count = sum(1 for v in values if isinstance(v, str) and len(v) > 20)
        if long_count / len(values) > 0.3:
            text_cols.append(col)
    return text_cols
|
|
|
|
def get_label_columns(df: pd.DataFrame):
    """Return the columns that look like categorical labels.

    A column qualifies when it has between 2 and 20 distinct non-null values.
    Columns whose cells are unhashable (lists, dicts) cannot be counted and
    are skipped instead of raising.
    """
    label_cols = []
    for col in df.columns:
        try:
            distinct = df[col].nunique()
        except TypeError:
            # nunique() hashes the cells; list/dict cells are not hashable.
            continue
        if 1 < distinct <= 20:
            label_cols.append(col)
    return label_cols
|
|
|
|
def similarity_ratio(s1: str, s2: str) -> float:
    """Return the difflib similarity of two strings in the range [0.0, 1.0].

    Both strings are lower-cased and stripped of surrounding whitespace
    first, so differences in case or padding do not lower the score.
    """
    left = s1.lower().strip()
    right = s2.lower().strip()
    return difflib.SequenceMatcher(None, left, right).ratio()
|
|
|
|
# langdetect samples randomly, so the same text can get different results
# between runs.  Pinning the seed makes the language statistics (and the
# score derived from them) reproducible.
from langdetect import DetectorFactory

DetectorFactory.seed = 0


def detect_language(text: str) -> str:
    """Return the language code langdetect assigns to *text*.

    Returns ``"unknown"`` when langdetect raises ``LangDetectException``.
    """
    try:
        return detect(str(text))
    except LangDetectException:
        return "unknown"
|
|
def turn1_overview(df: pd.DataFrame, dataset_name: str, split: str) -> dict:
    """Turn 1: profile every column and flag obvious limitations.

    For each column this records dtype, missing count/rate, number of
    distinct values, average string length (object columns only) and up to
    three sample values truncated to 60 characters.

    Returns:
        The turn-1 result dict, which is also printed as JSON.
    """
    print("\n" + "=" * 60)
    print("TURN 1 — Dataset Overview & Column Profiling")
    print("=" * 60)

    total_rows = len(df)
    columns_info = {}
    for col in df.columns:
        series = df[col]
        present = series.dropna()
        missing = int(series.isnull().sum())
        try:
            nuniq = int(series.nunique())
        except TypeError:
            # Cells such as lists/dicts are unhashable; count their string
            # representations instead of crashing the whole turn.
            nuniq = int(present.astype(str).nunique())
        avg_len = None
        # An all-null column has no mean length; keep None rather than NaN,
        # which is not valid JSON.
        if series.dtype == object and not present.empty:
            avg_len = round(float(present.astype(str).apply(len).mean()), 1)
        columns_info[col] = {
            "dtype": str(series.dtype),
            "missing": missing,
            "missing_rate": round(missing / max(total_rows, 1), 3),
            "unique_values": nuniq,
            "avg_text_length": avg_len,
            "sample": [str(s)[:60] for s in present.iloc[:3].tolist()],
        }

    text_cols = get_text_columns(df)
    label_cols = get_label_columns(df)

    flags = []
    if total_rows < 20:
        flags.append("Very small dataset — statistical checks may be unreliable")
    if not text_cols:
        flags.append("No long-text columns — bias/toxicity checks will be limited")
    if not label_cols:
        flags.append("No label columns — label noise check will be skipped")

    result = {
        "turn": 1,
        "turn_name": "dataset_overview",
        "dataset_name": dataset_name,
        "split_used": split,
        "total_rows": total_rows,
        "total_columns": len(df.columns),
        "column_names": list(df.columns),
        "columns_info": columns_info,
        "text_columns_detected": text_cols,
        "label_columns_detected": label_cols,
        "initial_flags": flags,
        "status": "completed"
    }
    print(json.dumps(result, indent=2))
    return result
|
|
def turn2_near_dupes_and_annotation(df: pd.DataFrame) -> dict:
    """Turn 2: find near-duplicate texts and similar texts with different labels.

    Only the first detected text column and the first detected label column
    are used, and at most the first 150 rows with a non-null text are
    compared pairwise.

    - Near duplicate: similarity in [0.85, 1.0) (exact duplicates excluded).
    - Annotation inconsistency: similarity >= 0.80 and different labels.

    ``row_i`` / ``row_j`` are positions within the non-null text rows.

    Returns:
        The turn-2 result dict, which is also printed as JSON.
    """
    print("\n" + "=" * 60)
    print("TURN 2 — Near-Duplicate Detection + Annotation Inconsistency")
    print("=" * 60)

    text_cols = get_text_columns(df)
    label_cols = get_label_columns(df)

    near_dupes = []
    annotation_issues = []

    if text_cols:
        text_col = text_cols[0]
        # Filter rows once so that texts and labels stay aligned: dropping
        # null texts without dropping the matching labels would pair each
        # text after a gap with the wrong label.
        rows = df[df[text_col].notna()].head(150)
        texts = rows[text_col].astype(str).tolist()
        labels = rows[label_cols[0]].astype(str).tolist() if label_cols else None

        # One pass over all pairs serves both checks; the similarity
        # computation dominates the runtime.
        for (i, text_i), (j, text_j) in itertools.combinations(enumerate(texts), 2):
            ratio = similarity_ratio(text_i, text_j)
            if 0.85 <= ratio < 1.0:
                near_dupes.append({
                    "row_i": i, "row_j": j,
                    "similarity": round(ratio, 3),
                    "text_i": text_i[:80],
                    "text_j": text_j[:80]
                })
            if labels is not None and ratio >= 0.80 and labels[i] != labels[j]:
                annotation_issues.append({
                    "row_i": i, "row_j": j,
                    "similarity": round(ratio, 3),
                    "text_i": text_i[:80],
                    "text_j": text_j[:80],
                    "label_i": labels[i],
                    "label_j": labels[j],
                    "issue": "Similar texts have different labels"
                })

    total = len(near_dupes) + len(annotation_issues)
    if total > 8:
        severity = "HIGH"
    elif total > 3:
        severity = "MEDIUM"
    else:
        severity = "LOW"

    result = {
        "turn": 2,
        "turn_name": "near_duplicates_and_annotation_inconsistency",
        "near_duplicates_found": len(near_dupes),
        "near_duplicate_pairs": near_dupes[:10],
        "annotation_inconsistencies_found": len(annotation_issues),
        "annotation_inconsistency_samples": annotation_issues[:10],
        "total_issues": total,
        "severity": severity,
        "status": "completed"
    }
    print(json.dumps(result, indent=2))
    return result
|
|
def turn3_label_noise_and_toxicity(df: pd.DataFrame) -> dict:
    """Turn 3: label distribution, label-noise heuristic, toxicity and hate speech.

    Only the first detected text column and first detected label column are
    used.

    - Class imbalance: smallest class has < 30% of the largest class's rows.
    - Label noise: only evaluated when the label values include one of
      positive/negative/pos/neg/0/1.  A row is flagged when its text contains
      >= 3 distinct words from one sentiment lexicon, none from the other,
      and carries the opposite label.
      NOTE(review): "0"/"1" are assumed to mean negative/positive, which is
      not true for every binary dataset; confirm before trusting the result.
    - Toxicity: text contains at least one whole word from TOXIC_WORDS.
    - Hate speech: text matches one of HATE_SPEECH_PATTERNS.

    Returns:
        The turn-3 result dict, which is also printed as JSON.
    """
    print("\n" + "=" * 60)
    print("TURN 3 — Label Noise + Toxicity / Hate Speech Detection")
    print("=" * 60)

    text_cols = get_text_columns(df)
    label_cols = get_label_columns(df)
    word_re = re.compile(r'\b\w+\b')

    def row_id(idx):
        # Keep integer row labels as ints; anything else is reported as text
        # so that the row is not lost and the result stays JSON-serialisable.
        return int(idx) if isinstance(idx, (int, np.integer)) else str(idx)

    noisy_labels = []
    label_distribution = {}
    class_imbalance = False

    if label_cols:
        label_col = label_cols[0]
        counts = df[label_col].value_counts().to_dict()
        label_distribution = {str(k): int(v) for k, v in counts.items()}
        if counts:
            max_c, min_c = max(counts.values()), min(counts.values())
            class_imbalance = bool(max_c > 0 and (min_c / max_c) < 0.3)

    if text_cols and label_cols:
        text_col = text_cols[0]
        label_col = label_cols[0]
        label_vals = {str(v).lower() for v in df[label_col].dropna().unique()}
        is_sentiment = not label_vals.isdisjoint(
            {"positive", "negative", "0", "1", "pos", "neg"}
        )

        if is_sentiment:
            positive_words = set(POSITIVE_WORDS)
            negative_words = set(NEGATIVE_WORDS)
            for idx, raw_text, raw_label in zip(df.index, df[text_col], df[label_col]):
                # Match whole words, like the toxicity check below.  Substring
                # matching would count "bad" in "badge" or "love" in "glove".
                tokens = set(word_re.findall(str(raw_text).lower()))
                label = str(raw_label).lower()
                pos = len(tokens & positive_words)
                neg = len(tokens & negative_words)
                if pos >= 3 and neg == 0 and label in ("negative", "neg", "0"):
                    issue = "Strong positive text → negative label"
                elif neg >= 3 and pos == 0 and label in ("positive", "pos", "1"):
                    issue = "Strong negative text → positive label"
                else:
                    continue
                noisy_labels.append({
                    "row": row_id(idx),
                    "text": str(raw_text)[:100],
                    "label": str(raw_label),
                    "issue": issue,
                })

    toxic_rows = []
    hate_speech_rows = []

    if text_cols:
        text_col = text_cols[0]
        hate_patterns = [re.compile(p) for p in HATE_SPEECH_PATTERNS]
        for idx, raw_text in zip(df.index, df[text_col]):
            text = str(raw_text).lower()
            words = set(word_re.findall(text))
            hits = [w for w in TOXIC_WORDS if w in words]
            if hits:
                toxic_rows.append({"row": row_id(idx),
                                   "text": str(raw_text)[:100],
                                   "toxic_words_found": hits[:5]})
            # Record each row at most once, under the first matching pattern.
            for pattern in hate_patterns:
                if pattern.search(text):
                    hate_speech_rows.append({"row": row_id(idx),
                                             "text": str(raw_text)[:100],
                                             "pattern_matched": pattern.pattern})
                    break

    noise_rate = round(len(noisy_labels) / max(len(df), 1), 3)
    tox_rate = round(len(toxic_rows) / max(len(df), 1), 3)
    if noise_rate > 0.1 or tox_rate > 0.1:
        severity = "HIGH"
    elif noise_rate > 0.03 or tox_rate > 0.03:
        severity = "MEDIUM"
    else:
        severity = "LOW"

    result = {
        "turn": 3,
        "turn_name": "label_noise_and_toxicity",
        "label_distribution": label_distribution,
        "class_imbalance_detected": class_imbalance,
        "noisy_labels_found": len(noisy_labels),
        "noisy_label_samples": noisy_labels[:10],
        "label_noise_rate": noise_rate,
        "toxic_rows_found": len(toxic_rows),
        "toxic_samples": toxic_rows[:10],
        "hate_speech_rows_found": len(hate_speech_rows),
        "hate_speech_samples": hate_speech_rows[:5],
        "toxicity_rate": tox_rate,
        "severity": severity,
        "status": "completed"
    }
    print(json.dumps(result, indent=2))
    return result
|
|
def _match_bias_terms(terms, text, tokens):
    """Return the entries of *terms* that occur in the text.

    Single words are looked up in the token set; multi-word phrases (such as
    "third world") can never equal one token, so they are searched in the
    lower-cased text with word boundaries.
    """
    found = []
    for term in terms:
        if " " in term:
            if re.search(r"\b" + re.escape(term) + r"\b", text):
                found.append(term)
        elif term in tokens:
            found.append(term)
    return found


def turn4_bias_and_language(df: pd.DataFrame) -> dict:
    """Turn 4: count bias-lexicon mentions and profile the languages used.

    Only the first detected text column is analysed.

    - Bias: for each category (gender, racial, cultural) count the rows that
      mention at least one lexicon term, the 10 most frequent terms and up to
      5 affected row ids.  ``bias_rate`` is mentions / (rows * 3) and
      ``bias_score`` is ``1 - bias_rate``.
    - Language: run language detection on up to 100 non-null texts.
      ``english_percentage`` is the share detected as "en"; when no text was
      sampled it is 1.0 so that a dataset without text is not penalised.

    Returns:
        The turn-4 result dict, which is also printed as JSON.
    """
    print("\n" + "=" * 60)
    print("TURN 4 — Bias Detection + Linguistic Diversity")
    print("=" * 60)

    text_cols = get_text_columns(df)
    # Insertion order fixes the order of categories in the report.
    lexicons = {
        "gender_bias": GENDER_BIAS_WORDS,
        "racial_bias": RACIAL_BIAS_WORDS,
        "cultural_bias": CULTURAL_BIAS_WORDS,
    }
    bias_report = {
        category: {"mentions": 0, "words_found": {}, "affected_rows": []}
        for category in lexicons
    }

    if text_cols:
        text_col = text_cols[0]
        term_counts = {category: Counter() for category in lexicons}

        for idx, raw_text in zip(df.index, df[text_col]):
            text = str(raw_text).lower()
            tokens = set(re.findall(r'\b\w+\b', text))
            row = int(idx) if isinstance(idx, (int, np.integer)) else str(idx)
            for category, terms in lexicons.items():
                hits = _match_bias_terms(terms, text, tokens)
                if not hits:
                    continue
                entry = bias_report[category]
                entry["mentions"] += 1
                term_counts[category].update(hits)
                if len(entry["affected_rows"]) < 5:
                    entry["affected_rows"].append(row)

        for category, counter in term_counts.items():
            bias_report[category]["words_found"] = dict(counter.most_common(10))

    total_bias = sum(v["mentions"] for v in bias_report.values())
    bias_rate = round(total_bias / max(len(df) * 3, 1), 3)
    bias_score = round(max(0.0, 1.0 - bias_rate), 3)

    lang_counts = Counter()
    non_english = []

    if text_cols:
        text_col = text_cols[0]
        sample = df[text_col].dropna().astype(str).tolist()[:100]
        for i, text in enumerate(sample):
            lang = detect_language(text)
            lang_counts[lang] += 1
            if lang not in ("en", "unknown") and len(non_english) < 5:
                non_english.append({"row": i, "text": text[:80], "detected_lang": lang})

    total_detected = sum(lang_counts.values())
    if total_detected:
        english_pct = round(lang_counts.get("en", 0) / total_detected, 3)
    else:
        # Nothing was analysed: report the neutral value instead of 0.0,
        # which the final report would read as "entirely non-English".
        english_pct = 1.0
    is_multilingual = any(lang not in ("en", "unknown") for lang in lang_counts)

    if bias_rate > 0.3:
        bias_severity = "HIGH"
    elif bias_rate > 0.1:
        bias_severity = "MEDIUM"
    else:
        bias_severity = "LOW"

    result = {
        "turn": 4,
        "turn_name": "bias_and_linguistic_diversity",
        "bias_report": bias_report,
        "total_bias_mentions": total_bias,
        "bias_rate": bias_rate,
        "bias_score": bias_score,
        "bias_severity": bias_severity,
        "language_distribution": dict(lang_counts),
        "english_percentage": english_pct,
        "is_multilingual": is_multilingual,
        "non_english_samples": non_english,
        "language_diversity_flag": is_multilingual,
        "status": "completed"
    }
    print(json.dumps(result, indent=2))
    return result
|
|
def _find_leakage_pairs(texts_a, texts_b, split_a, split_b):
    """Return every cross-split text pair whose similarity is >= 0.90."""
    pairs = []
    for i, text_a in enumerate(texts_a):
        for j, text_b in enumerate(texts_b):
            ratio = similarity_ratio(text_a, text_b)
            if ratio >= 0.90:
                pairs.append({
                    f"{split_a}_row": i, f"{split_b}_row": j,
                    "similarity": round(ratio, 3),
                    f"{split_a}_text": text_a[:80],
                    f"{split_b}_text": text_b[:80]
                })
    return pairs


def _measure_drift(texts):
    """Compare the first and second half of *texts*.

    Drift is reported when the vocabulary Jaccard overlap of the halves is
    below 0.40 or their average text lengths differ by more than 50
    characters.  Returns ``(drift_detected, details_dict)`` using plain
    Python types only, because NumPy booleans are rejected by ``json.dumps``.
    """
    mid = len(texts) // 2
    first_half, second_half = texts[:mid], texts[mid:]
    avg_len_first = round(float(np.mean([len(t) for t in first_half])), 1)
    avg_len_second = round(float(np.mean([len(t) for t in second_half])), 1)

    def vocab(txts):
        words = set()
        for t in txts:
            words.update(re.findall(r'\b\w+\b', t.lower()))
        return words

    v1, v2 = vocab(first_half), vocab(second_half)
    jaccard = round(len(v1 & v2) / max(len(v1 | v2), 1), 3)
    length_gap = abs(avg_len_first - avg_len_second)
    drift_detected = bool(jaccard < 0.40 or length_gap > 50)

    details = {
        "avg_text_length_first_half": avg_len_first,
        "avg_text_length_second_half": avg_len_second,
        "length_drift": round(length_gap, 1),
        "vocabulary_overlap_jaccard": jaccard,
        "drift_detected": drift_detected,
        "interpretation": (
            "Significant topic/domain drift — dataset may not be uniform."
            if drift_detected else
            "No significant drift — dataset appears topically consistent."
        )
    }
    return drift_detected, details


def turn5_leakage_drift_report(
    df: pd.DataFrame, dataset_name: str, first_split: str,
    t1: dict, t2: dict, t3: dict, t4: dict,
    config: str = None
) -> dict:
    """Turn 5: leakage check, drift check and the final scored audit report.

    - Leakage: the first 50 texts of *df* are compared with the first 50
      texts of another split; pairs with similarity >= 0.90 are reported.
      ``leakage_rate`` is the share of compared *df* texts with at least one
      match, so it stays within [0, 1].
    - Drift: see ``_measure_drift``; skipped when fewer than 20 texts exist.
    - Scores: eight sub-scores, each clamped by ``clamp_score``, combined
      with fixed weights into ``audit_score``, which selects the verdict.

    Args:
        df: Rows of the audited split.
        dataset_name: Dataset id, used to fetch the second split.
        first_split: Name of the split *df* was loaded from.
        t1, t2, t3, t4: Result dicts of turns 1-4 (embedded in the report).
        config: Optional dataset config passed to the second-split fetch.

    Returns:
        The final report dict, which is also printed as JSON.
    """
    print("\n" + "=" * 60)
    print("TURN 5 — Data Leakage + Domain Drift + Final Audit Report")
    print("=" * 60)

    text_cols = get_text_columns(df)
    leakage_pairs = []
    leakage_rate = 0.0

    print("  🔍 Fetching second split for leakage check...")
    df2, split2 = fetch_second_split(dataset_name, first_split, config=config, num_rows=50)

    if not df2.empty and text_cols:
        text_col = text_cols[0]
        texts1 = df[text_col].dropna().astype(str).tolist()[:50]
        texts2 = df2[text_col].dropna().astype(str).tolist()[:50] if text_col in df2.columns else []
        leakage_pairs = _find_leakage_pairs(texts1, texts2, first_split, split2)
        # Count each leaked row once; a row matching several rows of the
        # other split must not push the rate above 1.
        leaked_rows = {pair[f"{first_split}_row"] for pair in leakage_pairs}
        leakage_rate = round(len(leaked_rows) / max(len(texts1), 1), 3)

    drift_detected = False
    drift_details = {}

    if text_cols:
        texts = df[text_cols[0]].dropna().astype(str).tolist()
        if len(texts) >= 20:
            drift_detected, drift_details = _measure_drift(texts)

    n_rows = max(len(df), 1)
    dup_score = clamp_score(1.0 - (t2.get("near_duplicates_found", 0) / n_rows) * 5)
    ann_score = clamp_score(1.0 - (t2.get("annotation_inconsistencies_found", 0) / n_rows) * 10)
    label_score = clamp_score(1.0 - t3.get("label_noise_rate", 0) * 10)
    tox_score = clamp_score(1.0 - t3.get("toxicity_rate", 0) * 5)
    bias_score = clamp_score(t4.get("bias_score", 1.0))
    lang_score = clamp_score(t4.get("english_percentage", 1.0))
    leakage_score = clamp_score(1.0 - leakage_rate * 5)
    drift_score = clamp_score(0.6 if drift_detected else 1.0)

    # Weights sum to 1.0.
    audit_score = clamp_score(
        dup_score * 0.12 +
        ann_score * 0.13 +
        label_score * 0.15 +
        tox_score * 0.15 +
        bias_score * 0.15 +
        lang_score * 0.10 +
        leakage_score * 0.10 +
        drift_score * 0.10
    )

    if audit_score >= 0.85:
        verdict = "EXCELLENT"
        summary = "Dataset passes all hard quality checks. Ready for training."
    elif audit_score >= 0.70:
        verdict = "GOOD"
        summary = "Dataset is usable but has some issues worth addressing."
    elif audit_score >= 0.50:
        verdict = "NEEDS_WORK"
        summary = "Multiple quality/bias issues found. Significant cleanup needed."
    else:
        verdict = "POOR"
        summary = "Serious quality, bias, or leakage issues. Not recommended without major fixes."

    recs = []
    if dup_score < 0.8:
        recs.append("Remove near-duplicate rows to prevent overfitting.")
    if ann_score < 0.8:
        recs.append("Re-annotate inconsistently labelled similar texts.")
    if label_score < 0.8:
        recs.append("Review and correct noisy labels.")
    if tox_score < 0.8:
        recs.append("Filter toxic/hate speech content before training.")
    if bias_score < 0.7:
        recs.append("Audit and balance gender, racial, and cultural representation.")
    if lang_score < 0.8:
        recs.append("Filter non-English rows if multilingual content is unintentional.")
    if leakage_score < 0.8:
        recs.append("Data leakage detected between splits — re-split the dataset.")
    if drift_detected:
        recs.append("Topic drift detected — verify dataset was collected from a consistent source.")
    if not recs:
        recs.append("Dataset passed all checks. Consider expanding size for better generalization.")

    result = {
        "task_id": "task3_hard",
        "turn": 5,
        "turn_name": "leakage_drift_final_report",
        "dataset_name": dataset_name,
        "total_rows_audited": len(df),

        "data_leakage": {
            "splits_compared": [first_split, split2] if split2 else [first_split],
            "leakage_pairs_found": len(leakage_pairs),
            "leakage_pairs_sample": leakage_pairs[:5],
            "leakage_rate": leakage_rate
        },

        "domain_drift": drift_details,

        "scores": {
            "near_duplicate_score": dup_score,
            "annotation_consistency_score": ann_score,
            "label_noise_score": label_score,
            "toxicity_score": tox_score,
            "bias_score": bias_score,
            "language_consistency_score": lang_score,
            "data_leakage_score": leakage_score,
            "domain_drift_score": drift_score
        },

        "audit_score": audit_score,
        "verdict": verdict,
        "summary": summary,
        "recommendations": recs,

        "turn_results": {
            "turn1_overview": t1,
            "turn2_near_dupes_annotation": t2,
            "turn3_label_noise_toxicity": t3,
            "turn4_bias_language": t4
        },

        "status": "completed"
    }

    print(json.dumps(result, indent=2))
    return result
|
|
def main() -> None:
    """Run the interactive 5-turn audit and save the final report.

    Prompts for a dataset name (defaults to ``imdb`` on empty input), runs
    turns 1-5 with an Enter prompt between turns, prints the final JSON and
    writes it to ``task3_output.json``.  Exits with status 1 when the dataset
    cannot be loaded.
    """
    print("=" * 60)
    print("  TASK 3 — BIAS & QUALITY AUDIT (HARD, 5-TURN)")
    print("  Checks: Near-Dupes | Annotation Inconsistency |")
    print("  Label Noise | Toxicity | Bias | Language | Leakage | Drift")
    print("=" * 60)

    user_input = input(
        "\nEnter HuggingFace dataset name (e.g. imdb OR cardiffnlp/tweet_eval hate): "
    ).strip()
    if not user_input:
        user_input = "imdb"

    dataset_name, config = clean_dataset_name(user_input)
    print(f"  📌 Dataset : {dataset_name}")
    if config:
        print(f"  📌 Config : {config}")

    df, split = fetch_dataset_rows(dataset_name, config=config, num_rows=100)

    if df.empty:
        print("❌ Could not load dataset.")
        print("  Tips:")
        print("  • Use just the name, e.g. imdb or dair-ai/emotion")
        print("  • For tweet_eval use: cardiffnlp/tweet_eval hate")
        print("  • Do NOT paste the full URL")
        # sys.exit is always available; the bare exit() helper is injected by
        # the site module and is missing when Python runs with -S.
        sys.exit(1)

    t1 = turn1_overview(df, dataset_name, split)
    input("\n⏎ Press Enter → Turn 2 (Near-Dupes + Annotation)...")

    t2 = turn2_near_dupes_and_annotation(df)
    input("\n⏎ Press Enter → Turn 3 (Label Noise + Toxicity)...")

    t3 = turn3_label_noise_and_toxicity(df)
    input("\n⏎ Press Enter → Turn 4 (Bias + Language)...")

    t4 = turn4_bias_and_language(df)
    input("\n⏎ Press Enter → Turn 5 (Leakage + Drift + Final Report)...")

    final = turn5_leakage_drift_report(df, dataset_name, split, t1, t2, t3, t4, config=config)

    print("\n" + "=" * 60)
    print("✅ TASK 3 COMPLETE — Copy the JSON below into grader3.py")
    print("=" * 60)
    print(json.dumps(final, indent=2))

    # Explicit encoding keeps the file identical across platforms.
    with open("task3_output.json", "w", encoding="utf-8") as f:
        json.dump(final, f, indent=2)
    print("\n📁 Output saved to: task3_output.json")


if __name__ == "__main__":
    main()