File size: 2,507 Bytes
8d6e02d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import pandas as pd
import numpy as np
from typing import Dict, Any


def generate_task(dataset_size: int = 1000, dirt_level: float = 0.3) -> pd.DataFrame:
    """
    Generate a dirty dataset for the AutoClean AI task.

    Contains: missing values, duplicates, inconsistent types, outliers, messy text.

    The result is deterministic for a given pair of arguments. All random
    draws come from a private generator seeded with 42, so NumPy's global
    random state is left untouched for the caller.

    Args:
        dataset_size: Number of base rows generated before duplicates are
            appended.
        dirt_level: Scales how much corruption is injected. Each cell is
            blanked with probability ``dirt_level * 0.4``, a
            ``dirt_level * 0.25`` fraction of the rows is appended again as
            duplicates, and ``int(dataset_size * dirt_level * 0.1)`` rows per
            numeric column are overwritten with an outlier.

    Returns:
        A shuffled DataFrame with a fresh ``0..n-1`` index and the columns
        ``id``, ``age``, ``income``, ``gender``, ``join_date``, ``score``,
        ``comments`` and ``category``.
    """
    # A private legacy RandomState seeded with 42 produces the same stream as
    # np.random.seed(42) followed by the module-level functions, but without
    # reseeding the process-wide generator that other code may rely on.
    rng = np.random.RandomState(42)

    # NOTE: the order of the draws below determines the generated values;
    # keep it stable so the dataset stays reproducible.
    data = {
        'id': np.arange(dataset_size),
        'age': rng.normal(35, 12, dataset_size).astype(int),
        'income': rng.lognormal(10, 1, dataset_size).astype(int),
        'gender': rng.choice(['Male', 'Female', 'male', 'female', 'M', 'F', None], dataset_size,
                             p=[0.3, 0.3, 0.1, 0.1, 0.05, 0.05, 0.1]),
        'join_date': pd.date_range('2020-01-01', periods=dataset_size).tolist(),
        'score': rng.normal(50, 15, dataset_size),
        'comments': rng.choice(['Good', 'Excellent', 'Bad', 'Average', ' ', None, '  '], dataset_size),
        'category': rng.choice(['A', 'B', 'C', 'D', None], dataset_size, p=[0.25, 0.25, 0.25, 0.2, 0.05]),
    }

    df = pd.DataFrame(data)

    # Add missing values: every cell (in every column) is blanked independently.
    missing_prob = dirt_level * 0.4
    mask = rng.choice([True, False], size=df.shape, p=[missing_prob, 1 - missing_prob])
    df = df.mask(mask)

    # Add duplicates: re-append a random fraction of the rows.
    duplicates = df.sample(frac=dirt_level * 0.25, random_state=42)
    df = pd.concat([df, duplicates], ignore_index=True)

    # Add outliers: overwrite a few rows per numeric column with 10x the mean.
    numeric_cols = ['age', 'income', 'score']
    n_outliers = int(dataset_size * dirt_level * 0.1)
    for col in numeric_cols:
        outliers_idx = rng.choice(df.index, size=n_outliers, replace=False)
        # The outlier value is a float. If masking happened to leave an
        # integer column without any NaN it is still integer-typed, and
        # pandas has deprecated silently upcasting on assignment (PDEP-6),
        # so make the column float explicitly before writing into it.
        if len(outliers_idx) > 0 and not pd.api.types.is_float_dtype(df[col]):
            df[col] = df[col].astype(float)
        df.loc[outliers_idx, col] = df[col].mean() * 10

    # Mess up data types: turn a random subset of values into strings.
    df['age'] = df['age'].apply(lambda x: str(x) if rng.random_sample() < 0.1 else x)
    df['income'] = df['income'].apply(lambda x: f"${x}" if rng.random_sample() < 0.15 else x)

    # Shuffle so the appended duplicates are not clustered at the end.
    return df.sample(frac=1, random_state=42).reset_index(drop=True)


def get_task_description() -> Dict[str, Any]:
    """
    Return the static metadata describing the AutoClean AI challenge.

    The mapping holds the task name, its goal statement, the success
    threshold, the step limit and the list of allowed action names.
    A new dict (and a new action list) is built on every call.
    """
    permitted_actions = [
        "fill_missing",
        "remove_duplicates",
        "normalize",
        "fix_types",
        "remove_outliers",
        "drop_column",
        "encode_categorical",
        "handle_text",
    ]

    description: Dict[str, Any] = {}
    description["name"] = "AutoClean AI Data Cleaning Challenge"
    description["goal"] = "Maximize the dataset cleanliness score by applying optimal cleaning operations"
    description["success_threshold"] = 0.95
    description["max_steps"] = 50
    description["allowed_actions"] = permitted_actions
    return description