# Repository page header (not part of the program):
#   uploaded by kousiksasmal -- "Upload folder using huggingface_hub"
#   commit d283c7f (verified)
import copy
from typing import List, Dict, Any
class DataCleaningTask:
    """A data-cleaning exercise: a starting dataset and the expected cleaned result.

    Attributes:
        name: Short label for the task.
        description: Instructions describing the cleaning to perform.
        initial_data: Rows the task starts from (deep copy of the argument).
        target_data: Rows a correct solution must produce (deep copy of the
            argument).
    """

    def __init__(self, name: str, description: str, initial_data: List[Dict[str, Any]], target_data: List[Dict[str, Any]]):
        """Store the task metadata and private copies of both datasets.

        Both datasets are deep-copied so later mutation of the caller's lists
        or rows cannot change the task definition.
        """
        self.name = name
        self.description = description
        self.initial_data = copy.deepcopy(initial_data)
        self.target_data = copy.deepcopy(target_data)

    def grader(self, current_data: List[Dict[str, Any]]) -> float:
        """Score a submitted dataset against the target as an F1 value in [0.0, 1.0].

        Rows are compared by equality and matched as a multiset: row order is
        ignored, and each target row can be matched by at most one submitted
        row, so surplus duplicates in the submission lower precision.

        Args:
            current_data: The submitted rows.

        Returns:
            The harmonic mean of precision (matched / submitted rows) and
            recall (matched / target rows). An empty submission scores 1.0
            when the target is empty too, and 0.0 otherwise.
        """
        if not current_data:
            # An empty submission is exactly right only when nothing was expected.
            return 0.0 if self.target_data else 1.0

        # Shallow copy: rows are removed as they are matched, so the stored
        # target list itself must not be touched.
        unmatched = list(self.target_data)
        match_count = 0
        for row in current_data:
            try:
                unmatched.remove(row)
            except ValueError:
                # Row is not (or no longer) available in the target.
                continue
            match_count += 1

        precision = match_count / len(current_data)
        recall = match_count / len(self.target_data) if self.target_data else 0.0
        if precision + recall == 0:
            return 0.0

        score = 2 * (precision * recall) / (precision + recall)
        # Clamp as a guard against floating-point drift outside [0, 1].
        return max(0.0, min(1.0, float(score)))
# Task 1 (Easy): the only cleaning step is dropping exact duplicate rows.
TASK_1_INITIAL = [
    {"id": 1, "name": "Alice"},
    # Exact repeat of the row above; a correct solution removes it.
    {"id": 1, "name": "Alice"},
    {"id": 2, "name": "Bob"},
]

# Expected result: one copy of each distinct row.
TASK_1_TARGET = [
    {"id": 1, "name": "Alice"},
    {"id": 2, "name": "Bob"},
]

task_easy = DataCleaningTask(
    name="Easy",
    description="Remove exact duplicate rows from the dataset.",
    initial_data=TASK_1_INITIAL,
    target_data=TASK_1_TARGET,
)
# Task 2 (Medium): fill in missing emails, then drop exact duplicate rows.
TASK_2_INITIAL = [
    # Missing email; the target replaces it with "unknown".
    {"id": 1, "name": "Charlie", "email": None},
    {"id": 2, "name": "Dave", "email": "dave@example.com"},
    # Exact repeat of the row above; a correct solution removes it.
    {"id": 2, "name": "Dave", "email": "dave@example.com"},
]

# Expected result: the email is imputed and the duplicate is gone.
TASK_2_TARGET = [
    {"id": 1, "name": "Charlie", "email": "unknown"},
    {"id": 2, "name": "Dave", "email": "dave@example.com"},
]

task_medium = DataCleaningTask(
    name="Medium",
    description=(
        "Impute missing 'email' variables with 'unknown' "
        "and drop exact duplicate rows."
    ),
    initial_data=TASK_2_INITIAL,
    target_data=TASK_2_TARGET,
)
# Task 3 (Hard): reformat dates, fill missing scores, keep only active rows.
TASK_3_INITIAL = [
    # Missing score; the target replaces it with 0.
    {"id": 1, "date": "12/31/2023", "status": "active", "score": None},
    # Inactive row; absent from the target.
    {"id": 2, "date": "01/15/2024", "status": "inactive", "score": 85},
    {"id": 3, "date": "02/20/2024", "status": "active", "score": 90},
]

# Expected result: ISO-style dates, no missing scores, active rows only.
TASK_3_TARGET = [
    {"id": 1, "date": "2023-12-31", "status": "active", "score": 0},
    {"id": 3, "date": "2024-02-20", "status": "active", "score": 90},
]

task_hard = DataCleaningTask(
    name="Hard",
    description=(
        "Format 'date' from MM/DD/YYYY to YYYY-MM-DD format. "
        "Fill missing 'score' with 0. "
        "Filter to keep only rows where 'status' is 'active'."
    ),
    initial_data=TASK_3_INITIAL,
    target_data=TASK_3_TARGET,
)
# Module-level registry of all tasks, listed from easiest to hardest, so
# callers can pick one at random or iterate through them in sequence.
TASKS = [
    task_easy,
    task_medium,
    task_hard,
]