odse / core /tasks /cleaning_task.py
simeetnayan's picture
Upload folder using huggingface_hub
fede53c verified
"""Data Cleaning task implementation

Supported actions: ``impute``, ``drop_columns``, ``drop_rows``, ``submit``.

Reward logic:
    reward = (new_accuracy - old_accuracy) * 10 - 0.01 (step_penalty)

Termination:
    * All null values are eliminated
    * Maximum step count reached

TODO: Work on further termination conditions for different types of data.
"""
from __future__ import annotations
from typing import Any, Dict, Set
import pandas as pd
from ..models import (
Action,
Difficulty,
DropColumnAction,
DropRowAction,
ImputeAction,
TaskType,
)
from .base_task import BaseTask
class CleaningTask(BaseTask):
    """Clean a dirty dataset by removing or imputing missing values.

    Dataset-modifying actions handled here are listed in
    ``SUPPORTED_ACTIONS``. Every step is rewarded with the accuracy change it
    produced, scaled by ``REWARD_SCALE``, minus a flat ``STEP_PENALTY``.
    """

    TASK_TYPE = TaskType.DATA_CLEANING
    SUPPORTED_ACTIONS: Set[str] = {"impute", "drop_columns", "drop_rows"}

    # Reward shaping constants; subclasses may override them.
    REWARD_SCALE = 10
    STEP_PENALTY = 0.01

    # Action application
    def apply_action(self, action: Action) -> None:  # noqa: D401
        """Apply ``action`` to a copy of the dataset and store the result.

        * ``ImputeAction``: fill nulls in ``action.column`` (see ``_impute``).
        * ``DropColumnAction``: drop ``action.column`` if it exists.
        * ``DropRowAction``: drop rows where ``action.column`` is null, if the
          column exists.

        A missing column leaves the data unchanged, but the action is still
        passed to ``data_state.apply_update`` under its label. Any other
        action type is ignored without touching the data state.
        """
        df = self.data_state.df.copy()

        if isinstance(action, ImputeAction):
            df = self._impute(
                df, action.column, action.strategy, action.fill_value
            )
            label = f"impute({action.column}, {action.strategy})"
        elif isinstance(action, DropColumnAction):
            if action.column in df.columns:
                df = df.drop(columns=[action.column])
            label = f"drop_column({action.column})"
        elif isinstance(action, DropRowAction):
            if action.column in df.columns:
                df = df.dropna(subset=[action.column])
            label = f"drop_row({action.column})"
        else:
            # Unsupported action; should not happen due to prior validation.
            return

        self.data_state.apply_update(df, action_name=label)

    # Reward
    def calculate_reward(
        self, old_accuracy: float, new_accuracy: float, action: Action
    ) -> float:
        """Return the step reward: scaled accuracy gain minus the step penalty.

        ``action`` is accepted for interface compatibility and is not used.
        """
        accuracy_gain = new_accuracy - old_accuracy
        return accuracy_gain * self.REWARD_SCALE - self.STEP_PENALTY

    # Termination
    def is_done(self) -> bool:
        """Return True once no nulls remain or the step budget is exhausted."""
        # TODO: Add more termination conditions. Reconsider the total_nulls
        # condition, and end the episode if the dataset has no rows/columns
        # left.
        if self.data_state.total_nulls == 0:
            return True
        return self.step_count >= self.max_steps

    # Grading
    def grade(self) -> Dict[str, Any]:
        """Score the final dataset and return a details dictionary.

        Half of the score is awarded when no nulls remain, the other half when
        the accuracy is strictly above a difficulty-dependent threshold.

        Returns:
            A dict with ``nulls_check`` and ``accuracy_check`` (each
            ``"passed"`` or ``"failed"``), ``nulls_remaining`` (the null count,
            only present when the null check failed), ``final_accuracy``,
            ``score``, ``steps_taken`` and ``action_history``.
        """
        score = 0.0
        details: Dict[str, Any] = {}

        # 50% - no nulls remaining
        nulls = self.data_state.total_nulls
        if nulls == 0:
            score += 0.5
            details["nulls_check"] = "passed"
        else:
            # Use the same key as the passing branch so consumers can always
            # read ``nulls_check``; report the actual count separately.
            details["nulls_check"] = "failed"
            details["nulls_remaining"] = int(nulls)

        # 50% - accuracy above threshold (scales with difficulty)
        accuracy = self.calculate_accuracy()
        details["final_accuracy"] = round(accuracy, 4)
        threshold = {
            Difficulty.EASY: 0.75,
            Difficulty.MEDIUM: 0.80,
            Difficulty.HARD: 0.85,
        }.get(self.difficulty, 0.80)
        if accuracy > threshold:
            score += 0.5
            details["accuracy_check"] = "passed"
        else:
            details["accuracy_check"] = "failed"

        details["score"] = min(score, 1.0)
        details["steps_taken"] = self.step_count
        details["action_history"] = list(self.data_state.history)
        return details

    # Goal
    def get_goal_description(self) -> str:
        """Return the natural-language goal shown for this task."""
        # Action names here must match SUPPORTED_ACTIONS exactly.
        return (
            "DATA CLEANING: Remove all missing values from the dataset while "
            "maintaining or improving model accuracy. Use impute, drop_columns, "
            "or drop_rows actions. Submit when finished."
        )

    # Helpers
    @staticmethod
    def _impute(
        df: pd.DataFrame,
        column: str,
        strategy: str,
        fill_value: Any = None,
    ) -> pd.DataFrame:
        """Fill nulls in ``column`` according to ``strategy``.

        Args:
            df: Input frame; it is never modified in place.
            column: Name of the column to impute.
            strategy: ``"mean"`` or ``"median"`` (numeric columns only),
                ``"mode"``, or ``"constant"``.
            fill_value: Value used by the ``"constant"`` strategy.

        Returns:
            ``df`` itself when ``column`` is missing; otherwise a copy, with
            nulls filled when a fill value could be determined. Unknown
            strategies, mean/median on non-numeric columns, ``"mode"`` on a
            column with no non-null values, and ``"constant"`` without a
            ``fill_value`` all leave the data unchanged.
        """
        if column not in df.columns:
            return df

        result = df.copy()
        series = result[column]
        is_numeric = pd.api.types.is_numeric_dtype(series)

        imputed_value: Any = None
        if strategy == "mean" and is_numeric:
            imputed_value = series.mean()
        elif strategy == "median" and is_numeric:
            imputed_value = series.median()
        elif strategy == "mode":
            modes = series.mode()
            # mode() is empty when the column has no non-null values; in that
            # case there is nothing to fill with.
            if not modes.empty:
                imputed_value = modes.iloc[0]
        elif strategy == "constant":
            imputed_value = fill_value
        else:
            # Unsupported strategy or non-numeric column for mean/median.
            return result

        if imputed_value is not None:
            result[column] = series.fillna(imputed_value)
        return result