Spaces:
Sleeping
Sleeping
| # tasks.py | |
| # ───────────────────────────────────────────── | |
| # Task definitions for all 3 difficulty levels. | |
| # Each task has a description, a dataset, and a grader. | |
| # ───────────────────────────────────────────── | |
| import pandas as pd | |
| from typing import Dict, Any, Tuple, Optional | |
| from config import ( | |
| TASK_EASY, TASK_MEDIUM, TASK_HARD, TASK_IDS, MAX_STEPS | |
| ) | |
| from datasets import ( | |
| generate_easy_dataset, | |
| generate_medium_dataset, | |
| generate_hard_dataset, | |
| detect_issues, | |
| ) | |
| from graders import grade_task1, grade_task2, grade_task3 | |
| from models import TaskInfo | |
| # ── Task Registry ───────────────────────────── | |
def _registry_entry(difficulty, description, issues, hints):
    """Assemble one TASK_REGISTRY record for the given difficulty constant.

    The task id is looked up in TASK_IDS and the step budget is the shared
    MAX_STEPS value; the remaining fields are passed through unchanged.
    """
    return {
        "task_id": TASK_IDS[difficulty],
        "difficulty": difficulty,
        "description": description,
        "max_steps": MAX_STEPS,
        "issues": issues,
        "hints": hints,
    }


# Static metadata for every task, keyed by difficulty constant. Each record
# carries: task_id, difficulty, description, max_steps, issues, hints.
TASK_REGISTRY = {
    TASK_EASY: _registry_entry(
        TASK_EASY,
        description=(
            "Fix a 10-row employee dataset: "
            "fill missing values in 'age' and 'salary', "
            "convert 'age' from string to integer."
        ),
        issues=[
            "missing_values in age and salary columns",
            "wrong dtype: age should be int not string",
        ],
        hints=[
            "Use fill_missing to handle null values",
            "Use fix_dtype to convert age to int",
            "Call submit when the dataset looks clean",
        ],
    ),
    TASK_MEDIUM: _registry_entry(
        TASK_MEDIUM,
        description=(
            "Fix a 20-row dataset: "
            "remove duplicate rows, "
            "handle salary outliers, "
            "standardize country values to 'United States' or 'UK'."
        ),
        issues=[
            "duplicate rows present",
            "salary column has extreme outliers",
            "country has inconsistent values (USA, US, America, etc.)",
        ],
        hints=[
            "Use drop_duplicates first",
            "Use remove_outliers on salary column",
            "Use standardize_values to unify country names",
        ],
    ),
    TASK_HARD: _registry_entry(
        TASK_HARD,
        description=(
            "Fix a 30-row dataset with all issues: "
            "nulls, duplicates, outliers, "
            "bad column names (trailing spaces, uppercase, special chars), "
            "inconsistent country values, "
            "and invalid dept_id values (referential integrity)."
        ),
        issues=[
            "missing values in multiple columns",
            "bad column names: 'Full Name ', 'AGE', 'salary$', 'COUNTRY'",
            "salary outliers and negative values",
            "inconsistent country values",
            "invalid dept_id: 99 does not exist in lookup table",
        ],
        hints=[
            "Start by renaming bad column names",
            "Then fix nulls, duplicates, and outliers",
            "Standardize country values",
            "Fix invalid dept_id values last",
        ],
    ),
}
| # ── Task Class ──────────────────────────────── | |
class Task:
    """One cleaning task: registry metadata plus its dataset state and grader."""

    def __init__(self, difficulty: str):
        """Bind the task to a difficulty level.

        Raises:
            ValueError: if ``difficulty`` is not a key of TASK_REGISTRY.
        """
        if difficulty not in TASK_REGISTRY:
            raise ValueError(f"Unknown difficulty: {difficulty}")

        meta = TASK_REGISTRY[difficulty]
        self.difficulty = difficulty
        self.meta = meta
        self.task_id = meta["task_id"]
        self.description = meta["description"]
        self.max_steps = meta["max_steps"]
        self.hints = meta["hints"]

        # Dataset state: stays empty until reset() is called.
        self.original_df: Optional[pd.DataFrame] = None
        self.current_df: Optional[pd.DataFrame] = None
        self.lookup_df: Optional[pd.DataFrame] = None
        self.issues: list = []

    def reset(self) -> pd.DataFrame:
        """Regenerate the dataset for this difficulty and return the working copy.

        The generated frame is kept in ``original_df``; ``current_df`` is a
        copy of it. The hard task's generator also yields a second frame,
        which is stored in ``lookup_df``. ``issues`` is recomputed from the
        new working copy.
        """
        level = self.difficulty
        if level == TASK_EASY:
            self.original_df = generate_easy_dataset()
        elif level == TASK_MEDIUM:
            self.original_df = generate_medium_dataset()
        elif level == TASK_HARD:
            self.original_df, self.lookup_df = generate_hard_dataset()

        working = self.original_df.copy()
        self.current_df = working
        self.issues = detect_issues(working, self.task_id)
        return working

    def grade(self) -> Tuple[float, Dict[str, Any]]:
        """Score the current dataset with the grader matching this difficulty.

        Returns ``(0.0, {"error": ...})`` when reset() has not been called yet,
        and ``(0.0, {})`` if the difficulty matches none of the known levels.
        """
        before, after = self.original_df, self.current_df
        if after is None or before is None:
            return 0.0, {"error": "Task not initialized, call reset() first"}

        level = self.difficulty
        if level == TASK_EASY:
            return grade_task1(before, after)
        if level == TASK_MEDIUM:
            return grade_task2(before, after)
        if level == TASK_HARD:
            return grade_task3(before, after, self.lookup_df)
        return 0.0, {}

    def update_issues(self):
        """Recompute ``issues`` from the current dataset; no-op before reset()."""
        if self.current_df is None:
            return
        self.issues = detect_issues(self.current_df, self.task_id)

    def get_info(self) -> TaskInfo:
        """Build a TaskInfo summary; ``score`` is None when no dataset is loaded."""
        if self.current_df is not None:
            score, _ = self.grade()
        else:
            score = None

        return TaskInfo(
            task_id=self.task_id,
            difficulty=self.difficulty,
            description=self.description,
            max_steps=self.max_steps,
            score=score,
        )
| # ── Task Factory ────────────────────────────── | |
def get_task(difficulty: str) -> Task:
    """Construct a new Task for ``difficulty``.

    Any exception raised by the Task constructor propagates to the caller.
    """
    task = Task(difficulty)
    return task
def get_all_tasks() -> Dict[str, Task]:
    """Build one fresh Task per difficulty level, keyed by difficulty constant."""
    tasks = {}
    for level in (TASK_EASY, TASK_MEDIUM, TASK_HARD):
        tasks[level] = Task(level)
    return tasks