# tasks.py
# ─────────────────────────────────────────────
# Task definitions for all 3 difficulty levels
# Each task has a description, dataset, grader
# ─────────────────────────────────────────────

import pandas as pd
from typing import Dict, Any, Tuple, Optional

from config import (
    TASK_EASY, TASK_MEDIUM, TASK_HARD,
    TASK_IDS, MAX_STEPS
)
from datasets import (
    generate_easy_dataset,
    generate_medium_dataset,
    generate_hard_dataset,
    detect_issues,
)
from graders import grade_task1, grade_task2, grade_task3
from models import TaskInfo


# ── Task Registry ─────────────────────────────
# Static metadata per difficulty level. Every entry carries the same keys
# ("task_id", "difficulty", "description", "max_steps", "issues", "hints");
# Task.__init__ reads all of them except "issues".

TASK_REGISTRY = {
    TASK_EASY: {
        "task_id": TASK_IDS[TASK_EASY],
        "difficulty": TASK_EASY,
        "description": (
            "Fix a 10-row employee dataset: "
            "fill missing values in 'age' and 'salary', "
            "convert 'age' from string to integer."
        ),
        "max_steps": MAX_STEPS,
        "issues": [
            "missing_values in age and salary columns",
            "wrong dtype: age should be int not string",
        ],
        "hints": [
            "Use fill_missing to handle null values",
            "Use fix_dtype to convert age to int",
            "Call submit when the dataset looks clean",
        ],
    },
    TASK_MEDIUM: {
        "task_id": TASK_IDS[TASK_MEDIUM],
        "difficulty": TASK_MEDIUM,
        "description": (
            "Fix a 20-row dataset: "
            "remove duplicate rows, "
            "handle salary outliers, "
            "standardize country values to 'United States' or 'UK'."
        ),
        "max_steps": MAX_STEPS,
        "issues": [
            "duplicate rows present",
            "salary column has extreme outliers",
            "country has inconsistent values (USA, US, America, etc.)",
        ],
        "hints": [
            "Use drop_duplicates first",
            "Use remove_outliers on salary column",
            "Use standardize_values to unify country names",
        ],
    },
    TASK_HARD: {
        "task_id": TASK_IDS[TASK_HARD],
        "difficulty": TASK_HARD,
        "description": (
            "Fix a 30-row dataset with all issues: "
            "nulls, duplicates, outliers, "
            "bad column names (trailing spaces, uppercase, special chars), "
            "inconsistent country values, "
            "and invalid dept_id values (referential integrity)."
        ),
        "max_steps": MAX_STEPS,
        "issues": [
            "missing values in multiple columns",
            "bad column names: 'Full Name ', 'AGE', 'salary$', 'COUNTRY'",
            "salary outliers and negative values",
            "inconsistent country values",
            "invalid dept_id: 99 does not exist in lookup table",
        ],
        "hints": [
            "Start by renaming bad column names",
            "Then fix nulls, duplicates, and outliers",
            "Standardize country values",
            "Fix invalid dept_id values last",
        ],
    },
}


# ── Task Class ────────────────────────────────

class Task:
    """One task instance: registry metadata plus its working dataset.

    The dataset attributes stay ``None`` until :meth:`reset` is called.
    """

    def __init__(self, difficulty: str):
        """Bind the task to a registry entry.

        Args:
            difficulty: A key of ``TASK_REGISTRY``.

        Raises:
            ValueError: If ``difficulty`` is not a key of ``TASK_REGISTRY``.
        """
        if difficulty not in TASK_REGISTRY:
            raise ValueError(f"Unknown difficulty: {difficulty}")

        entry = TASK_REGISTRY[difficulty]
        self.difficulty = difficulty
        self.meta = entry
        self.task_id = entry["task_id"]
        self.description = entry["description"]
        self.max_steps = entry["max_steps"]
        # NOTE: this is the registry's own list object, not a copy.
        self.hints = entry["hints"]

        # Dataset state (populated by reset())
        self.original_df: Optional[pd.DataFrame] = None
        self.current_df: Optional[pd.DataFrame] = None
        self.lookup_df: Optional[pd.DataFrame] = None
        self.issues: list = []

    def reset(self) -> pd.DataFrame:
        """Generate a fresh dataset for this difficulty and return the working copy.

        Stores the generated frame in ``original_df``, a copy of it in
        ``current_df``, and refreshes ``issues`` via ``detect_issues``.
        Only the hard generator yields a second frame, kept in ``lookup_df``.
        """
        if self.difficulty == TASK_HARD:
            # The hard generator returns a (dataset, lookup table) pair.
            self.original_df, self.lookup_df = generate_hard_dataset()
        else:
            # Built per call so the generator names are resolved at call time.
            single_frame_generators = {
                TASK_EASY: generate_easy_dataset,
                TASK_MEDIUM: generate_medium_dataset,
            }
            generator = single_frame_generators.get(self.difficulty)
            if generator is not None:
                self.original_df = generator()

        self.current_df = self.original_df.copy()
        self.issues = detect_issues(self.current_df, self.task_id)
        return self.current_df

    def grade(self) -> Tuple[float, Dict[str, Any]]:
        """Grade the current dataset against the original.

        Returns:
            Whatever the difficulty-specific grader returns (annotated as a
            ``(score, details)`` pair). If the task has not been reset yet,
            returns ``0.0`` with an ``"error"`` entry instead.
        """
        not_ready = self.current_df is None or self.original_df is None
        if not_ready:
            return 0.0, {"error": "Task not initialized, call reset() first"}

        if self.difficulty == TASK_HARD:
            # Only the hard grader receives the lookup table.
            return grade_task3(
                self.original_df,
                self.current_df,
                self.lookup_df,
            )

        # Built per call so the grader names are resolved at call time.
        two_frame_graders = {
            TASK_EASY: grade_task1,
            TASK_MEDIUM: grade_task2,
        }
        grader = two_frame_graders.get(self.difficulty)
        if grader is None:
            # Fallback for a difficulty none of the graders cover.
            return 0.0, {}
        return grader(self.original_df, self.current_df)

    def update_issues(self):
        """Re-run ``detect_issues`` on the current dataset, if one exists."""
        if self.current_df is None:
            return
        self.issues = detect_issues(self.current_df, self.task_id)

    def get_info(self) -> TaskInfo:
        """Build a ``TaskInfo`` describing this task.

        ``score`` is the first element of :meth:`grade` when a dataset is
        loaded, otherwise ``None``.
        """
        if self.current_df is None:
            score = None
        else:
            score, _details = self.grade()

        return TaskInfo(
            task_id=self.task_id,
            difficulty=self.difficulty,
            description=self.description,
            max_steps=self.max_steps,
            score=score,
        )


# ── Task Factory ──────────────────────────────

def get_task(difficulty: str) -> Task:
    """Construct a new ``Task`` for ``difficulty``.

    Raises:
        ValueError: Propagated from ``Task`` for an unknown difficulty.
    """
    task = Task(difficulty)
    return task


def get_all_tasks() -> Dict[str, Task]:
    """Return a new ``Task`` per difficulty, keyed by difficulty.

    Keys are inserted in easy, medium, hard order.
    """
    levels = (TASK_EASY, TASK_MEDIUM, TASK_HARD)
    return {level: Task(level) for level in levels}