# Source: uploaded to the Hugging Face Hub by sairaj2.
# Commit 188937b (verified): "Upload folder using huggingface_hub"
import pandas as pd
import numpy as np
from typing import Dict, Any
def generate_task(dataset_size: int = 1000, dirt_level: float = 0.3) -> pd.DataFrame:
    """
    Generate a dirty dataset for the AutoClean AI task.

    Contains: missing values, duplicates, inconsistent types, outliers, messy text.

    Args:
        dataset_size: Number of base rows generated before duplicates are appended.
        dirt_level: Scales every corruption step. Each cell is blanked with
            probability ``dirt_level * 0.4``, a ``dirt_level * 0.25`` fraction of
            the rows is appended again as duplicates, and
            ``int(dataset_size * dirt_level * 0.1)`` rows per numeric column are
            overwritten with an outlier.

    Returns:
        A shuffled DataFrame with the columns ``id``, ``age``, ``income``,
        ``gender``, ``join_date``, ``score``, ``comments`` and ``category``.

    Raises:
        ValueError: If ``dirt_level`` gives a missing-value probability outside
            the range [0, 1].

    Note:
        NumPy's global random generator is reseeded with 42 on every call. The
        output is therefore reproducible for given arguments, but the caller's
        global random state is reset as a side effect.
    """
    missing_prob = dirt_level * 0.4
    if not 0.0 <= missing_prob <= 1.0:
        # Fail early with a readable message instead of NumPy's generic
        # "probabilities are not non-negative" error further down.
        raise ValueError(
            f"dirt_level must be between 0 and 2.5, got {dirt_level!r}"
        )

    np.random.seed(42)
    data = {
        'id': np.arange(dataset_size),
        'age': np.random.normal(35, 12, dataset_size).astype(int),
        'income': np.random.lognormal(10, 1, dataset_size).astype(int),
        'gender': np.random.choice(['Male', 'Female', 'male', 'female', 'M', 'F', None], dataset_size,
                                   p=[0.3, 0.3, 0.1, 0.1, 0.05, 0.05, 0.1]),
        'join_date': pd.date_range('2020-01-01', periods=dataset_size).tolist(),
        'score': np.random.normal(50, 15, dataset_size),
        'comments': np.random.choice(['Good', 'Excellent', 'Bad', 'Average', ' ', None, ' '], dataset_size),
        'category': np.random.choice(['A', 'B', 'C', 'D', None], dataset_size, p=[0.25, 0.25, 0.25, 0.2, 0.05])
    }
    df = pd.DataFrame(data)

    # Add missing values: every cell (in every column) is blanked independently.
    mask = np.random.choice([True, False], size=df.shape, p=[missing_prob, 1 - missing_prob])
    df = df.mask(mask)

    # Add duplicates
    duplicates = df.sample(frac=dirt_level * 0.25, random_state=42)
    df = pd.concat([df, duplicates], ignore_index=True)

    # Add outliers: ten times the column mean, written into randomly chosen rows.
    numeric_cols = ['age', 'income', 'score']
    for col in numeric_cols:
        outliers_idx = np.random.choice(df.index, size=int(dataset_size * dirt_level * 0.1), replace=False)
        df.loc[outliers_idx, col] = df[col].mean() * 10

    # Mess up data types. Missing cells are left alone so they stay detectable
    # as missing instead of turning into the text tokens "nan" / "$nan".
    df['age'] = _stringify_some(df['age'], 0.1, "{}")
    df['income'] = _stringify_some(df['income'], 0.15, "${}")

    return df.sample(frac=1, random_state=42).reset_index(drop=True)


def _stringify_some(values: pd.Series, probability: float, template: str) -> pd.Series:
    """Render a random subset of the non-missing entries of *values* as text."""

    def corrupt(value):
        # Draw for every row, missing or not, so the random stream (and thus
        # every later random decision) does not depend on where the gaps are.
        selected = np.random.random() < probability
        if selected and pd.notna(value):
            return template.format(value)
        return value

    return values.apply(corrupt)
def get_task_description() -> Dict[str, Any]:
    """Return the static metadata describing the AutoClean AI task.

    The mapping holds the task name, its goal statement, the success
    threshold, the step budget and the list of allowed action names.
    """
    permitted_operations = [
        "fill_missing",
        "remove_duplicates",
        "normalize",
        "fix_types",
        "remove_outliers",
        "drop_column",
        "encode_categorical",
        "handle_text",
    ]
    description: Dict[str, Any] = dict(
        name="AutoClean AI Data Cleaning Challenge",
        goal="Maximize the dataset cleanliness score by applying optimal cleaning operations",
        success_threshold=0.95,
        max_steps=50,
        allowed_actions=permitted_operations,
    )
    return description