Spaces:

kousiksasmal
/

data_cleaning_env

Running

App Files Files Community

data_cleaning_env / server /tasks.py

kousiksasmal

Upload folder using huggingface_hub

d283c7f verified 3 days ago

raw

history blame contribute delete

3.22 kB

	import copy
	from typing import List, Dict, Any

	class DataCleaningTask:
	    """A data-cleaning exercise: a starting dataset plus the expected cleaned result.

	    Attributes:
	        name: Short label for the task.
	        description: Instructions describing the cleaning to perform.
	        initial_data: Private deep copy of the rows the task starts from.
	        target_data: Private deep copy of the rows a perfect solution yields.
	    """

	    def __init__(
	        self,
	        name: str,
	        description: str,
	        initial_data: List[Dict[str, Any]],
	        target_data: List[Dict[str, Any]],
	    ) -> None:
	        """Store the task metadata and independent copies of both datasets.

	        Args:
	            name: Short label for the task.
	            description: Instructions describing the cleaning to perform.
	            initial_data: Rows the task starts from.
	            target_data: Rows expected after cleaning.
	        """
	        self.name = name
	        self.description = description
	        # Deep copies keep the task immune to later mutation of the
	        # caller's lists or of the row dictionaries inside them.
	        self.initial_data = copy.deepcopy(initial_data)
	        self.target_data = copy.deepcopy(target_data)

	    def grader(self, current_data: List[Dict[str, Any]]) -> float:
	        """Score a submitted dataset against the target as an F1 value.

	        Rows are compared with ``==`` as whole dictionaries and matched as a
	        multiset: each target row can be claimed by at most one submitted
	        row, so surplus duplicates lower precision. Row order is ignored.

	        Args:
	            current_data: The submitted rows. ``None`` is treated as a
	                missing submission.

	        Returns:
	            A float in [0.0, 1.0]: the harmonic mean of precision and recall
	            over exactly matching rows. 1.0 when the submission and the
	            target are both empty; 0.0 when the submission is ``None``,
	            when only one side is empty, or when no rows match.
	        """
	        if current_data is None:
	            return 0.0

	        if not current_data or not self.target_data:
	            # With nothing to compare row by row, the answer is perfect only
	            # when an empty submission meets an empty target.
	            return 1.0 if not current_data and not self.target_data else 0.0

	        # Rows are dicts (unhashable), so matching is done against a
	        # shrinking list; removing a matched row stops it being reused.
	        unmatched = list(self.target_data)
	        match_count = 0
	        for row in current_data:
	            if row in unmatched:
	                match_count += 1
	                unmatched.remove(row)

	        if match_count == 0:
	            # Precision and recall are both zero; avoid dividing by zero.
	            return 0.0

	        precision = match_count / len(current_data)
	        recall = match_count / len(self.target_data)
	        score = 2 * (precision * recall) / (precision + recall)
	        return max(0.0, min(1.0, float(score)))


	# Task 1 (Easy): the dataset holds one repeated row that must be removed.
	TASK_1_INITIAL = [
	    {"id": 1, "name": "Alice"},
	    {"id": 1, "name": "Alice"},  # exact copy of the row above
	    {"id": 2, "name": "Bob"},
	]
	# Expected result: one row per distinct record.
	TASK_1_TARGET = [
	    {"id": 1, "name": "Alice"},
	    {"id": 2, "name": "Bob"},
	]
	task_easy = DataCleaningTask(
	    name="Easy",
	    description="Remove exact duplicate rows from the dataset.",
	    initial_data=TASK_1_INITIAL,
	    target_data=TASK_1_TARGET,
	)

	# Task 2 (Medium): a missing email must be filled in and a repeated row removed.
	TASK_2_INITIAL = [
	    {"id": 1, "name": "Charlie", "email": None},
	    {"id": 2, "name": "Dave", "email": "dave@example.com"},
	    {"id": 2, "name": "Dave", "email": "dave@example.com"},  # exact copy of the row above
	]
	# Expected result: the None email becomes "unknown" and the copy is gone.
	TASK_2_TARGET = [
	    {"id": 1, "name": "Charlie", "email": "unknown"},
	    {"id": 2, "name": "Dave", "email": "dave@example.com"},
	]
	task_medium = DataCleaningTask(
	    name="Medium",
	    description="Impute missing 'email' variables with 'unknown' and drop exact duplicate rows.",
	    initial_data=TASK_2_INITIAL,
	    target_data=TASK_2_TARGET,
	)

	# Task 3 (Hard): reformat dates, fill a missing score, and keep only active rows.
	TASK_3_INITIAL = [
	    {"id": 1, "date": "12/31/2023", "status": "active", "score": None},
	    {"id": 2, "date": "01/15/2024", "status": "inactive", "score": 85},
	    {"id": 3, "date": "02/20/2024", "status": "active", "score": 90},
	]
	# Expected result: ISO-style dates, the None score replaced by 0, and the
	# inactive row (id 2) dropped.
	TASK_3_TARGET = [
	    {"id": 1, "date": "2023-12-31", "status": "active", "score": 0},
	    {"id": 3, "date": "2024-02-20", "status": "active", "score": 90},
	]
	task_hard = DataCleaningTask(
	    name="Hard",
	    description="Format 'date' from MM/DD/YYYY to YYYY-MM-DD format. Fill missing 'score' with 0. Filter to keep only rows where 'status' is 'active'.",
	    initial_data=TASK_3_INITIAL,
	    target_data=TASK_3_TARGET,
	)

	# Module-level registry of every task, ordered easy -> medium -> hard, for
	# callers that pick one at random or walk through them in sequence.
	TASKS = [
	    task_easy,
	    task_medium,
	    task_hard,
	]