Spaces:
Running
Running
| import copy | |
| from typing import List, Dict, Any | |
class DataCleaningTask:
    """A data-cleaning exercise: a starting dataset plus the expected result.

    Instances hold four attributes, all set by ``__init__``:
    ``name``, ``description``, ``initial_data`` and ``target_data``.
    Datasets are lists of row dictionaries.
    """

    def __init__(
        self,
        name: str,
        description: str,
        initial_data: List[Dict[str, Any]],
        target_data: List[Dict[str, Any]],
    ):
        """Store the task metadata and private copies of both datasets.

        Args:
            name: Label of the task.
            description: Text describing the cleaning steps to perform.
            initial_data: Rows the task starts from.
            target_data: Rows a submission is graded against.
        """
        self.name = name
        self.description = description
        # Deep copies: later mutation of the caller's lists or row dicts
        # must not change what the task hands out or grades against.
        self.initial_data = copy.deepcopy(initial_data)
        self.target_data = copy.deepcopy(target_data)

    def grader(self, current_data: List[Dict[str, Any]]) -> float:
        """Score a submitted dataset against ``target_data``.

        Rows are compared with ``==`` and matched as a multiset: each target
        row can be matched by at most one submitted row, and row order is
        ignored. The score is the F1 value of row-level precision and recall.

        Args:
            current_data: The submitted rows.

        Returns:
            A float in ``[0.0, 1.0]``. An empty submission scores 1.0 when the
            target is empty as well and 0.0 otherwise; ``None`` scores 0.0.
        """
        if not current_data:
            # Nothing submitted is only correct when nothing was expected.
            both_empty = current_data is not None and not self.target_data
            return 1.0 if both_empty else 0.0

        # Consume target rows as they are matched so that duplicated
        # submitted rows cannot be credited more than once.
        unmatched = list(self.target_data)
        match_count = 0
        for row in current_data:
            try:
                unmatched.remove(row)
            except ValueError:
                continue
            match_count += 1

        precision = match_count / len(current_data)
        recall = match_count / len(self.target_data) if self.target_data else 0.0
        if precision + recall == 0:
            return 0.0

        score = 2 * (precision * recall) / (precision + recall)
        # Clamp to guard the documented range against float rounding.
        return max(0.0, min(1.0, float(score)))
# Task 1 (easy): the starting rows contain one exact duplicate that must go.
TASK_1_INITIAL = [
    {"id": 1, "name": "Alice"},
    {"id": 1, "name": "Alice"},  # exact copy of the row above
    {"id": 2, "name": "Bob"},
]

# Expected result: one row per distinct record.
TASK_1_TARGET = [
    {"id": 1, "name": "Alice"},
    {"id": 2, "name": "Bob"},
]

task_easy = DataCleaningTask(
    name="Easy",
    description="Remove exact duplicate rows from the dataset.",
    initial_data=TASK_1_INITIAL,
    target_data=TASK_1_TARGET,
)
# Task 2 (medium): one missing email to fill in and one exact duplicate row.
TASK_2_INITIAL = [
    {"id": 1, "name": "Charlie", "email": None},
    {"id": 2, "name": "Dave", "email": "dave@example.com"},
    {"id": 2, "name": "Dave", "email": "dave@example.com"},  # exact copy of the row above
]

# Expected result: the missing email becomes "unknown" and the duplicate is gone.
TASK_2_TARGET = [
    {"id": 1, "name": "Charlie", "email": "unknown"},
    {"id": 2, "name": "Dave", "email": "dave@example.com"},
]

task_medium = DataCleaningTask(
    name="Medium",
    description=(
        "Impute missing 'email' variables with 'unknown' "
        "and drop exact duplicate rows."
    ),
    initial_data=TASK_2_INITIAL,
    target_data=TASK_2_TARGET,
)
# Task 3 (hard): reformat dates, fill a missing score, keep only active rows.
TASK_3_INITIAL = [
    {"id": 1, "date": "12/31/2023", "status": "active", "score": None},
    {"id": 2, "date": "01/15/2024", "status": "inactive", "score": 85},
    {"id": 3, "date": "02/20/2024", "status": "active", "score": 90},
]

# Expected result: the inactive row (id 2) is dropped, dates are YYYY-MM-DD,
# and the missing score is 0.
TASK_3_TARGET = [
    {"id": 1, "date": "2023-12-31", "status": "active", "score": 0},
    {"id": 3, "date": "2024-02-20", "status": "active", "score": 90},
]

task_hard = DataCleaningTask(
    name="Hard",
    description=(
        "Format 'date' from MM/DD/YYYY to YYYY-MM-DD format. "
        "Fill missing 'score' with 0. "
        "Filter to keep only rows where 'status' is 'active'."
    ),
    initial_data=TASK_3_INITIAL,
    target_data=TASK_3_TARGET,
)
# Module-level registry of every task, listed from easiest to hardest, so
# callers can pick one at random or iterate through them in order.
TASKS = [
    task_easy,
    task_medium,
    task_hard,
]