"""Data Cleaning task implementation Supported actions: ``impute``, ``drop_columns``, ``drop_rows``, ``submit``. Reward logic: reward = (new_accuracy - old_accuracy)*10 - 0.01 (step_penalty) Termination: * All null values are eliminated * Maximum step count reached TODO: Work on further termination conditions for different types of data """ from __future__ import annotations from typing import Any, Dict, Set import pandas as pd from ..models import ( Action, Difficulty, DropColumnAction, DropRowAction, ImputeAction, TaskType, ) from .base_task import BaseTask class CleaningTask(BaseTask): """Clean a dirty dataset by removing / imputing missing values.""" TASK_TYPE = TaskType.DATA_CLEANING SUPPORTED_ACTIONS: Set[str] = {"impute", "drop_columns", "drop_rows"} # Action application def apply_action(self, action: Action) -> None: #noqa: D401 df= self.data_state.df.copy() if isinstance(action, ImputeAction): df = self._impute( df, action.column, action.strategy, action.fill_value ) label = f"impute({action.column}, {action.strategy})" elif isinstance(action, DropColumnAction): if action.column in df.columns: df = df.drop(columns=[action.column]) label = f"drop_column({action.column})" elif isinstance(action, DropRowAction): if action.column in df.columns: df = df.dropna(subset=[action.column]) label = f"drop_row({action.column})" else: return # unsupported action, should not happen due to prior validation self.data_state.apply_update(df, action_name=label) #Reward def calculate_reward(self, old_accuracy: float, new_accuracy: float, action: Action) -> float: accuracy_gain=new_accuracy-old_accuracy return accuracy_gain*10 - 0.01 #Termination def is_done(self) -> bool: #TODO: Add more termination conditions #Think more about total_nulls, maybe remove this condition, #If dataset has no rows/columns left, end the episode if self.data_state.total_nulls == 0: return True if self.step_count >= self.max_steps: return True return False # Grading def grade(self) -> Dict[str, Any]: score = 0.0 details: Dict[str, Any] = {} # 50% - no nulls remaining nulls = self.data_state.total_nulls if nulls == 0: score += 0.5 details["nulls_check"] = "passed" else: details["nulls_remaining"] = "failed" # 50% - accuracy above threshold (scales with difficulty) accuracy = self.calculate_accuracy() details["final_accuracy"] = round(accuracy, 4) threshold = { Difficulty.EASY: 0.75, Difficulty.MEDIUM: 0.80, Difficulty.HARD: 0.85 }.get(self.difficulty, 0.80) if accuracy > threshold: score += 0.5 details["accuracy_check"] = "passed" else: details["accuracy_check"] = "failed" details["score"] = min(score, 1.0) details["steps_taken"] = self.step_count details["action_history"] = list(self.data_state.history) return details # Goal def get_goal_description(self) -> str: return ( "DATA CLEANINING: Remove all missing values from the dataset while " "maintaining or improving model accuracy. Use impute, drop_column, " "or drop_rows actions. Submit when finished." ) # Helpers @staticmethod def _impute( df: pd.DataFrame, column: str, strategy: str, fill_value: Any = None ) -> pd.DataFrame: if column not in df.columns: return df df = df.copy() imputed_value = None if strategy == "mean" and pd.api.types.is_numeric_dtype(df[column]): imputed_value = df[column].mean() elif strategy == "median" and pd.api.types.is_numeric_dtype(df[column]): imputed_value = df[column].median() elif strategy == "mode": imputed_value = df[column].mode() if len(imputed_value) > 0: imputed_value = imputed_value.iloc[0] elif strategy == "constant": if fill_value is not None: imputed_value = fill_value else: # Unsupported strategy or non-numeric column for mean/median return df if imputed_value is not None: df[column] = df[column].fillna(imputed_value) return df