import pandas as pd
import numpy as np
from typing import Dict, Any


def generate_task(dataset_size: int = 1000, dirt_level: float = 0.3, seed: int = 42) -> pd.DataFrame:
    """Generate a dirty dataset for the AutoClean AI task.

    The frame contains missing values, duplicated rows, inconsistent types,
    outliers and messy text.

    Args:
        dataset_size: Number of base rows generated before duplicates are
            appended.
        dirt_level: Scales every corruption step: each cell is blanked with
            probability ``dirt_level * 0.4``, a ``dirt_level * 0.25`` fraction
            of rows is duplicated, and ``dataset_size * dirt_level * 0.1``
            outliers are written into each numeric column.
        seed: Seed for all random draws. The default reproduces the dataset
            this function has always produced.

    Returns:
        A shuffled DataFrame with the columns ``id``, ``age``, ``income``,
        ``gender``, ``join_date``, ``score``, ``comments`` and ``category``.

    Raises:
        ValueError: If ``dataset_size`` is negative or ``dirt_level`` does not
            yield a valid missing-value probability.
    """
    if dataset_size < 0:
        raise ValueError(f"dataset_size must be non-negative, got {dataset_size}")

    missing_prob = dirt_level * 0.4
    if not 0 <= missing_prob <= 1:
        raise ValueError(
            f"dirt_level must be between 0 and 2.5 so that the missing-value "
            f"probability (dirt_level * 0.4) lies in [0, 1], got {dirt_level}"
        )

    # A private generator instead of np.random.seed(): it yields the same
    # stream as the seeded global RNG but leaves the caller's global NumPy
    # random state untouched.
    rng = np.random.RandomState(seed)

    # NOTE: the order of the draws below determines the generated values;
    # reordering them changes the dataset.
    data = {
        'id': np.arange(dataset_size),
        'age': rng.normal(35, 12, dataset_size).astype(int),
        'income': rng.lognormal(10, 1, dataset_size).astype(int),
        'gender': rng.choice(
            ['Male', 'Female', 'male', 'female', 'M', 'F', None],
            dataset_size,
            p=[0.3, 0.3, 0.1, 0.1, 0.05, 0.05, 0.1],
        ),
        'join_date': pd.date_range('2020-01-01', periods=dataset_size).tolist(),
        'score': rng.normal(50, 15, dataset_size),
        'comments': rng.choice(['Good', 'Excellent', 'Bad', 'Average', ' ', None, ' '], dataset_size),
        'category': rng.choice(['A', 'B', 'C', 'D', None], dataset_size, p=[0.25, 0.25, 0.25, 0.2, 0.05]),
    }
    df = pd.DataFrame(data)

    # Add missing values: blank out cells at random across every column
    # (including 'id').
    mask = rng.choice([True, False], size=df.shape, p=[missing_prob, 1 - missing_prob])
    df = df.mask(mask)

    # Add duplicates by re-appending a random sample of existing rows.
    duplicates = df.sample(frac=dirt_level * 0.25, random_state=seed)
    df = pd.concat([df, duplicates], ignore_index=True)

    # Add outliers: overwrite a few cells per numeric column with ten times
    # the column mean. The count is based on dataset_size, not on the
    # post-duplication row count.
    outlier_count = int(dataset_size * dirt_level * 0.1)
    for col in ['age', 'income', 'score']:
        outliers_idx = rng.choice(df.index, size=outlier_count, replace=False)
        df.loc[outliers_idx, col] = df[col].mean() * 10

    # Mess up data types: turn roughly 10% of ages into strings and roughly
    # 15% of incomes into "$"-prefixed strings.
    # NOTE: values are stringified as-is, so a missing cell can become the
    # literal text 'nan' / '$nan'.
    df['age'] = df['age'].apply(lambda x: str(x) if rng.random_sample() < 0.1 else x)
    df['income'] = df['income'].apply(lambda x: f"${x}" if rng.random_sample() < 0.15 else x)

    # Shuffle so the appended duplicates are not clustered at the end.
    return df.sample(frac=1, random_state=seed).reset_index(drop=True)


def get_task_description() -> Dict[str, Any]:
    """Return the static description of the data cleaning challenge.

    Returns:
        A dict with the task ``name``, its ``goal``, the ``success_threshold``,
        the ``max_steps`` value and the list of ``allowed_actions``.
    """
    return {
        "name": "AutoClean AI Data Cleaning Challenge",
        "goal": "Maximize the dataset cleanliness score by applying optimal cleaning operations",
        "success_threshold": 0.95,
        "max_steps": 50,
        "allowed_actions": [
            "fill_missing",
            "remove_duplicates",
            "normalize",
            "fix_types",
            "remove_outliers",
            "drop_column",
            "encode_categorical",
            "handle_text",
        ],
    }