# File: env/tasks.py
# Hugging Face Hub page header (not part of the program):
#   sairaj2 - "Upload folder using huggingface_hub" (commit f3a0522, verified)
"""
OpenEnv Data Cleaning Environment - Task Definitions
Defines tasks with dataset configurations and grading criteria.
"""
from typing import List, Dict, Any
try:
from .models import TaskConfig
except ImportError: # pragma: no cover - supports direct execution from env/
from models import TaskConfig
def get_tasks() -> List[str]:
    """Return the IDs of every task this module defines.

    This is the OpenEnv-compatible task registration hook.

    Returns:
        A newly created list holding the task IDs, ordered easy, medium,
        hard. Callers may mutate it without affecting later calls.
    """
    registered_ids = ["easy_001", "medium_001", "hard_001"]
    return registered_ids
def get_task_config(task_id: str) -> TaskConfig:
    """Look up the configuration for a single task.

    Every call builds fresh ``TaskConfig`` objects for all three tasks and
    returns the one registered under ``task_id``.

    Args:
        task_id: One of the IDs reported by ``get_tasks()``.

    Returns:
        The ``TaskConfig`` registered under ``task_id``.

    Raises:
        ValueError: If ``task_id`` is not a known task ID. The message
            lists the available IDs.
    """
    # Easy: only nulls and duplicates are injected into the dataset.
    easy_config = TaskConfig(
        name="Basic Data Cleaning",
        task_id="easy_001",
        difficulty="easy",
        description="Basic data cleaning: drop nulls and remove duplicates",
        dataset_config={
            "rows": 100,
            "null_percentage": 0.1,
            "duplicate_count": 5,
            "columns": ["id", "name", "age", "email", "salary"],
        },
        expected_actions=["drop_nulls", "remove_duplicates"],
        # Grading weights add up to 1.0.
        grading_criteria={
            "null_handling": 0.4,
            "duplicate_handling": 0.4,
            "efficiency": 0.2,
        },
        grader="/grade/easy_001",
    )

    # Medium: adds invalid emails and outliers, plus a "department" column.
    medium_config = TaskConfig(
        name="Intermediate Data Cleaning",
        task_id="medium_001",
        difficulty="medium",
        description="Intermediate cleaning: handle nulls, validate emails, remove outliers",
        dataset_config={
            "rows": 200,
            "null_percentage": 0.15,
            "duplicate_count": 10,
            "invalid_email_count": 20,
            "outlier_count": 8,
            "columns": ["id", "name", "age", "email", "salary", "department"],
        },
        expected_actions=["fill_nulls", "validate_email", "outlier_removal"],
        # Grading weights add up to 1.0.
        grading_criteria={
            "null_handling": 0.25,
            "email_validation": 0.3,
            "outlier_handling": 0.25,
            "efficiency": 0.2,
        },
        grader="/grade/medium_001",
    )

    # Hard: full pipeline, including type issues and two extra columns.
    hard_config = TaskConfig(
        name="Advanced Data Cleaning",
        task_id="hard_001",
        difficulty="hard",
        description="Advanced cleaning: full pipeline with type conversion and normalization",
        dataset_config={
            "rows": 500,
            "null_percentage": 0.2,
            "duplicate_count": 25,
            "invalid_email_count": 40,
            "outlier_count": 20,
            "type_issues": True,
            "columns": [
                "id",
                "name",
                "age",
                "email",
                "salary",
                "department",
                "join_date",
                "score",
            ],
        },
        expected_actions=[
            "drop_nulls",
            "fill_nulls",
            "remove_duplicates",
            "validate_email",
            "convert_types",
            "outlier_removal",
            "normalize",
        ],
        # Grading weights add up to 1.0.
        grading_criteria={
            "null_handling": 0.15,
            "duplicate_handling": 0.1,
            "email_validation": 0.15,
            "type_conversion": 0.2,
            "outlier_handling": 0.2,
            "normalization": 0.1,
            "efficiency": 0.1,
        },
        grader="/grade/hard_001",
    )

    registry = {
        "easy_001": easy_config,
        "medium_001": medium_config,
        "hard_001": hard_config,
    }

    if task_id in registry:
        return registry[task_id]

    raise ValueError(f"Unknown task: {task_id}. Available: {get_tasks()}")
def get_all_task_configs() -> Dict[str, TaskConfig]:
    """Return every task configuration, keyed by task ID.

    Returns:
        A new dict with one entry per ID from ``get_tasks()``, in that
        order, each value obtained from ``get_task_config()``.
    """
    configs_by_id: Dict[str, TaskConfig] = {}
    for known_id in get_tasks():
        configs_by_id[known_id] = get_task_config(known_id)
    return configs_by_id