Spaces:
Sleeping
Sleeping
"""
Task Definitions — Realistic datasets with known data-quality issues.
=====================================================================
Each task provides:
    dirty_data  — the messy rows the agent starts with
    clean_data  — ground-truth rows (used by the grader)
    issues      — list describing every problem to fix
    max_steps   — action budget
    description — human-readable goal
"""
from __future__ import annotations

import copy
from typing import Any, Dict, List
# ── helpers ────────────────────────────────────────────────────────────────
# One entry of a task's ``issues`` list (keys used below: row, col, type, desc, fix).
IssueDict = Dict[str, Any]
# One record of a dataset: column name -> cell value.
Row = Dict[str, Any]
# ── TASK 1 — EASY: Customer Contact Cleanup ────────────────────────────────
# Starting rows for the easy task. Problem rows (0-based index):
#   1 -> empty email, 2 -> phone contains letters, 3 -> repeats row 0 under a new id,
#   4 -> negative age, 7 -> email without a top-level domain.
_EASY_DIRTY: List[Row] = [
    {"id": 1, "name": "John Smith", "email": "john.smith@gmail.com", "phone": "555-0101", "age": 35, "city": "New York"},
    {"id": 2, "name": "Jane Doe", "email": "", "phone": "555-0102", "age": 28, "city": "Los Angeles"},
    {"id": 3, "name": "Bob Wilson", "email": "bob.w@yahoo.com", "phone": "555-ABCD", "age": 42, "city": "Chicago"},
    {"id": 4, "name": "John Smith", "email": "john.smith@gmail.com", "phone": "555-0101", "age": 35, "city": "New York"},
    {"id": 5, "name": "Alice Brown", "email": "alice.b@hotmail.com", "phone": "555-0105", "age": -3, "city": "Houston"},
    {"id": 6, "name": "Charlie Davis", "email": "charlie.d@gmail.com", "phone": "555-0106", "age": 31, "city": "Phoenix"},
    {"id": 7, "name": "Eva Martinez", "email": "eva.m@outlook.com", "phone": "555-0107", "age": 27, "city": "Philadelphia"},
    {"id": 8, "name": "Frank Lee", "email": "frank@gmail", "phone": "555-0108", "age": 45, "city": "San Antonio"},
    {"id": 9, "name": "Grace Kim", "email": "grace.k@yahoo.com", "phone": "555-0109", "age": 38, "city": "San Diego"},
    {"id": 10, "name": "Henry Nguyen", "email": "henry.n@gmail.com", "phone": "555-0110", "age": 52, "city": "Dallas"},
]
# Ground-truth rows for the easy task (nine rows: the duplicated record is removed).
_EASY_CLEAN: List[Row] = [
    {"id": 1, "name": "John Smith", "email": "john.smith@gmail.com", "phone": "555-0101", "age": 35, "city": "New York"},
    {"id": 2, "name": "Jane Doe", "email": "jane.doe@email.com", "phone": "555-0102", "age": 28, "city": "Los Angeles"},
    {"id": 3, "name": "Bob Wilson", "email": "bob.w@yahoo.com", "phone": "555-0103", "age": 42, "city": "Chicago"},
    # row 3 (id 4, duplicate of row 0) deleted -- indices are 0-based, matching the issue list
    {"id": 5, "name": "Alice Brown", "email": "alice.b@hotmail.com", "phone": "555-0105", "age": 33, "city": "Houston"},
    {"id": 6, "name": "Charlie Davis", "email": "charlie.d@gmail.com", "phone": "555-0106", "age": 31, "city": "Phoenix"},
    {"id": 7, "name": "Eva Martinez", "email": "eva.m@outlook.com", "phone": "555-0107", "age": 27, "city": "Philadelphia"},
    {"id": 8, "name": "Frank Lee", "email": "frank@gmail.com", "phone": "555-0108", "age": 45, "city": "San Antonio"},
    {"id": 9, "name": "Grace Kim", "email": "grace.k@yahoo.com", "phone": "555-0109", "age": 38, "city": "San Diego"},
    {"id": 10, "name": "Henry Nguyen", "email": "henry.n@gmail.com", "phone": "555-0110", "age": 52, "city": "Dallas"},
]
# Issue list for the easy task. "row" is a 0-based index into the dirty rows;
# "col" is None and "fix" is "__DELETE__" when the whole row should be removed.
# Fix values are written as strings even for numeric columns.
_EASY_ISSUES: List[IssueDict] = [
    {"row": 1, "col": "email", "type": "missing_value", "desc": "Missing email address", "fix": "jane.doe@email.com"},
    {"row": 2, "col": "phone", "type": "invalid_format", "desc": "Phone contains letters (555-ABCD)", "fix": "555-0103"},
    {"row": 3, "col": None, "type": "duplicate_row", "desc": "Exact duplicate of row 0", "fix": "__DELETE__"},
    {"row": 4, "col": "age", "type": "invalid_value", "desc": "Negative age (-3)", "fix": "33"},
    {"row": 7, "col": "email", "type": "invalid_format", "desc": "Email missing TLD (frank@gmail)", "fix": "frank@gmail.com"},
]
# ── TASK 2 — MEDIUM: E-commerce Order Normalisation ────────────────────────
# Starting rows for the medium task. Problem rows (0-based index):
#   2, 7, 11 -> non-ISO date formats; 4, 12 -> dashed product codes;
#   5 -> negative quantity and "$1,234.56" price; 8 -> status typo;
#   9 -> repeats row 1 under a new order_id; 10 -> empty price.
_MED_DIRTY: List[Row] = [
    {"order_id": "ORD-001", "customer": "Acme Corp", "product": "P100", "quantity": 10, "price": "249.99", "date": "2024-01-15", "status": "delivered"},
    {"order_id": "ORD-002", "customer": "Globex Inc", "product": "P102", "quantity": 5, "price": "599.00", "date": "2024-01-18", "status": "delivered"},
    {"order_id": "ORD-003", "customer": "Initech LLC", "product": "P100", "quantity": 3, "price": "249.99", "date": "15/02/2024", "status": "shipped"},
    {"order_id": "ORD-004", "customer": "Umbrella Co", "product": "P105", "quantity": 8, "price": "149.50", "date": "2024-02-20", "status": "delivered"},
    {"order_id": "ORD-005", "customer": "Stark Ind", "product": "P-102", "quantity": 12, "price": "599.00", "date": "2024-03-01", "status": "shipped"},
    {"order_id": "ORD-006", "customer": "Wayne Ent", "product": "P108", "quantity": -2, "price": "$1,234.56", "date": "2024-03-05", "status": "processing"},
    {"order_id": "ORD-007", "customer": "Oscorp", "product": "P100", "quantity": 7, "price": "249.99", "date": "2024-03-10", "status": "delivered"},
    {"order_id": "ORD-008", "customer": "Cyberdyne Sys", "product": "P110", "quantity": 1, "price": "899.00", "date": "2024.03.15", "status": "delivered"},
    {"order_id": "ORD-009", "customer": "Soylent Corp", "product": "P105", "quantity": 4, "price": "149.50", "date": "2024-03-20", "status": "shiped"},
    {"order_id": "ORD-010", "customer": "Globex Inc", "product": "P102", "quantity": 5, "price": "599.00", "date": "2024-01-18", "status": "delivered"},
    {"order_id": "ORD-011", "customer": "Tyrell Corp", "product": "P112", "quantity": 6, "price": "", "date": "2024-04-01", "status": "processing"},
    {"order_id": "ORD-012", "customer": "Wonka Ind", "product": "P100", "quantity": 20, "price": "249.99", "date": "01-05-2024", "status": "shipped"},
    {"order_id": "ORD-013", "customer": "Prestige World", "product": "P-105", "quantity": 9, "price": "149.50", "date": "2024-05-10", "status": "delivered"},
    {"order_id": "ORD-014", "customer": "Massive Dyn", "product": "P108", "quantity": 3, "price": "1234.56", "date": "2024-05-15", "status": "delivered"},
    {"order_id": "ORD-015", "customer": "Aperture Sci", "product": "P115", "quantity": 15, "price": "75.00", "date": "2024-06-01", "status": "shipped"},
]
# Ground-truth rows for the medium task (fourteen rows: the duplicated order is removed).
# Dates are YYYY-MM-DD, prices are plain decimal strings, product codes have no dash.
_MED_CLEAN: List[Row] = [
    {"order_id": "ORD-001", "customer": "Acme Corp", "product": "P100", "quantity": 10, "price": "249.99", "date": "2024-01-15", "status": "delivered"},
    {"order_id": "ORD-002", "customer": "Globex Inc", "product": "P102", "quantity": 5, "price": "599.00", "date": "2024-01-18", "status": "delivered"},
    {"order_id": "ORD-003", "customer": "Initech LLC", "product": "P100", "quantity": 3, "price": "249.99", "date": "2024-02-15", "status": "shipped"},
    {"order_id": "ORD-004", "customer": "Umbrella Co", "product": "P105", "quantity": 8, "price": "149.50", "date": "2024-02-20", "status": "delivered"},
    {"order_id": "ORD-005", "customer": "Stark Ind", "product": "P102", "quantity": 12, "price": "599.00", "date": "2024-03-01", "status": "shipped"},
    {"order_id": "ORD-006", "customer": "Wayne Ent", "product": "P108", "quantity": 2, "price": "1234.56", "date": "2024-03-05", "status": "processing"},
    {"order_id": "ORD-007", "customer": "Oscorp", "product": "P100", "quantity": 7, "price": "249.99", "date": "2024-03-10", "status": "delivered"},
    {"order_id": "ORD-008", "customer": "Cyberdyne Sys", "product": "P110", "quantity": 1, "price": "899.00", "date": "2024-03-15", "status": "delivered"},
    {"order_id": "ORD-009", "customer": "Soylent Corp", "product": "P105", "quantity": 4, "price": "149.50", "date": "2024-03-20", "status": "shipped"},
    # row 9 (duplicate of row 1) deleted
    {"order_id": "ORD-011", "customer": "Tyrell Corp", "product": "P112", "quantity": 6, "price": "350.00", "date": "2024-04-01", "status": "processing"},
    {"order_id": "ORD-012", "customer": "Wonka Ind", "product": "P100", "quantity": 20, "price": "249.99", "date": "2024-05-01", "status": "shipped"},
    {"order_id": "ORD-013", "customer": "Prestige World", "product": "P105", "quantity": 9, "price": "149.50", "date": "2024-05-10", "status": "delivered"},
    {"order_id": "ORD-014", "customer": "Massive Dyn", "product": "P108", "quantity": 3, "price": "1234.56", "date": "2024-05-15", "status": "delivered"},
    {"order_id": "ORD-015", "customer": "Aperture Sci", "product": "P115", "quantity": 15, "price": "75.00", "date": "2024-06-01", "status": "shipped"},
]
# Issue list for the medium task. "row" is a 0-based index into the dirty rows
# (row 5 appears twice because two of its columns are wrong); "col" is None and
# "fix" is "__DELETE__" for the row that should be removed.
_MED_ISSUES: List[IssueDict] = [
    {"row": 2, "col": "date", "type": "inconsistent_format", "desc": "Date in DD/MM/YYYY format instead of YYYY-MM-DD", "fix": "2024-02-15"},
    {"row": 4, "col": "product", "type": "inconsistent_format", "desc": "Product code has dash (P-102 vs P102)", "fix": "P102"},
    {"row": 5, "col": "quantity", "type": "invalid_value", "desc": "Negative quantity (-2)", "fix": "2"},
    {"row": 5, "col": "price", "type": "inconsistent_format", "desc": "Price has $ and comma ($1,234.56)", "fix": "1234.56"},
    {"row": 7, "col": "date", "type": "inconsistent_format", "desc": "Date uses dots (2024.03.15)", "fix": "2024-03-15"},
    {"row": 8, "col": "status", "type": "typo", "desc": "Status misspelled: shiped -> shipped", "fix": "shipped"},
    {"row": 9, "col": None, "type": "duplicate_row", "desc": "Duplicate of row 1 (same order)", "fix": "__DELETE__"},
    {"row": 10, "col": "price", "type": "missing_value", "desc": "Missing price for P112 product", "fix": "350.00"},
    {"row": 11, "col": "date", "type": "inconsistent_format", "desc": "Date in DD-MM-YYYY format", "fix": "2024-05-01"},
    {"row": 12, "col": "product", "type": "inconsistent_format", "desc": "Product code has dash (P-105 vs P105)", "fix": "P105"},
]
# ── TASK 3 — HARD: Employee Records Audit ──────────────────────────────────
# Starting rows for the hard task. Problem rows (0-based index):
#   1 -> age vs birth_date, 3 -> dept_code vs department, 6 -> near-duplicate of row 5,
#   8 -> salary out of line with role, 10 -> start_date 2025-08-01, 11 -> upper-case email,
#   12 -> empty department, 13 -> name is the string "NULL", 14 -> negative salary,
#   16 -> start_date 1899-01-01, 18 -> manager_id E999 has no matching emp_id.
_HARD_DIRTY: List[Row] = [
    {"emp_id": "E001", "name": "Sarah Johnson", "email": "sarah.j@company.com", "birth_date": "1985-06-12", "age": 39, "department": "Engineering", "dept_code": "ENG", "role": "Senior Engineer", "salary": 125000, "start_date": "2015-03-01", "manager_id": "E010"},
    {"emp_id": "E002", "name": "Michael Chen", "email": "michael.c@company.com", "birth_date": "1990-03-15", "age": 28, "department": "Engineering", "dept_code": "ENG", "role": "Junior Developer", "salary": 72000, "start_date": "2022-07-15", "manager_id": "E001"},
    {"emp_id": "E003", "name": "Emily Watson", "email": "emily.w@company.com", "birth_date": "1988-11-22", "age": 36, "department": "Marketing", "dept_code": "MKT", "role": "Marketing Manager", "salary": 98000, "start_date": "2018-01-10", "manager_id": "E010"},
    {"emp_id": "E004", "name": "David Park", "email": "david.p@company.com", "birth_date": "1992-07-04", "age": 32, "department": "Engineering", "dept_code": "MKT", "role": "Software Engineer", "salary": 105000, "start_date": "2020-09-01", "manager_id": "E001"},
    {"emp_id": "E005", "name": "Lisa Rodriguez", "email": "lisa.r@company.com", "birth_date": "1995-01-30", "age": 29, "department": "Sales", "dept_code": "SAL", "role": "Sales Representative", "salary": 65000, "start_date": "2023-02-14", "manager_id": "E008"},
    {"emp_id": "E006", "name": "James O'Brien", "email": "james.ob@company.com", "birth_date": "1987-09-18", "age": 37, "department": "Finance", "dept_code": "FIN", "role": "Financial Analyst", "salary": 88000, "start_date": "2019-05-20", "manager_id": "E010"},
    {"emp_id": "E007", "name": "James Obrien", "email": "james.ob@company.com", "birth_date": "1987-09-18", "age": 37, "department": "Finance", "dept_code": "FIN", "role": "Financial Analyst", "salary": 88000, "start_date": "2019-05-20", "manager_id": "E010"},
    {"emp_id": "E008", "name": "Rachel Green", "email": "rachel.g@company.com", "birth_date": "1983-04-05", "age": 41, "department": "Sales", "dept_code": "SAL", "role": "Sales Director", "salary": 140000, "start_date": "2014-11-01", "manager_id": "E010"},
    {"emp_id": "E009", "name": "Tom Anderson", "email": "tom.a@company.com", "birth_date": "1991-12-25", "age": 33, "department": "Engineering", "dept_code": "ENG", "role": "Junior Developer", "salary": 250000, "start_date": "2023-06-01", "manager_id": "E001"},
    {"emp_id": "E010", "name": "Patricia Moore", "email": "patricia.m@company.com", "birth_date": "1978-02-14", "age": 46, "department": "Executive", "dept_code": "EXE", "role": "VP of Operations", "salary": 185000, "start_date": "2010-01-15", "manager_id": ""},
    {"emp_id": "E011", "name": "Kevin Hall", "email": "kevin.h@company.com", "birth_date": "1993-08-07", "age": 31, "department": "Marketing", "dept_code": "MKT", "role": "Content Specialist", "salary": 62000, "start_date": "2025-08-01", "manager_id": "E003"},
    {"emp_id": "E012", "name": "Amy Liu", "email": "AMY.LIU@COMPANY.COM", "birth_date": "1994-05-19", "age": 30, "department": "Engineering", "dept_code": "ENG", "role": "QA Engineer", "salary": 82000, "start_date": "2021-04-12", "manager_id": "E001"},
    {"emp_id": "E013", "name": "Robert Taylor", "email": "robert.t@company.com", "birth_date": "1986-10-31", "age": 38, "department": "", "dept_code": "SAL", "role": "Account Manager", "salary": 78000, "start_date": "2020-01-06", "manager_id": "E008"},
    {"emp_id": "E014", "name": "NULL", "email": "nina.s@company.com", "birth_date": "1997-03-22", "age": 27, "department": "Finance", "dept_code": "FIN", "role": "Junior Analyst", "salary": 58000, "start_date": "2024-01-08", "manager_id": "E006"},
    {"emp_id": "E015", "name": "Carlos Mendez", "email": "carlos.m@company.com", "birth_date": "1989-07-16", "age": 35, "department": "Engineering", "dept_code": "ENG", "role": "DevOps Engineer", "salary": -95000, "start_date": "2019-10-01", "manager_id": "E001"},
    {"emp_id": "E016", "name": "Sophie Turner", "email": "sophie.t@company.com", "birth_date": "1996-11-03", "age": 28, "department": "Marketing", "dept_code": "MKT", "role": "Social Media Mgr", "salary": 60000, "start_date": "2022-03-15", "manager_id": "E003"},
    {"emp_id": "E017", "name": "Alex Rivera", "email": "alex.r@company.com", "birth_date": "1984-01-28", "age": 40, "department": "Sales", "dept_code": "SAL", "role": "Regional Manager", "salary": 110000, "start_date": "1899-01-01", "manager_id": "E008"},
    {"emp_id": "E018", "name": "Diana Foster", "email": "diana.f@company.com", "birth_date": "1991-06-09", "age": 33, "department": "Finance", "dept_code": "FIN", "role": "Senior Accountant", "salary": 92000, "start_date": "2017-08-21", "manager_id": "E006"},
    {"emp_id": "E019", "name": "Brandon White", "email": "brandon.w@company.com", "birth_date": "1998-04-14", "age": 26, "department": "Engineering", "dept_code": "ENG", "role": "Intern", "salary": 45000, "start_date": "2024-06-01", "manager_id": "E999"},
    {"emp_id": "E020", "name": "Maria Gonzalez", "email": "maria.g@company.com", "birth_date": "1982-12-01", "age": 42, "department": "Executive", "dept_code": "EXE", "role": "CFO", "salary": 210000, "start_date": "2012-04-01", "manager_id": ""},
]
# Ground-truth rows for the hard task (nineteen rows: the near-duplicate E007 record is removed).
_HARD_CLEAN: List[Row] = [
    {"emp_id": "E001", "name": "Sarah Johnson", "email": "sarah.j@company.com", "birth_date": "1985-06-12", "age": 39, "department": "Engineering", "dept_code": "ENG", "role": "Senior Engineer", "salary": 125000, "start_date": "2015-03-01", "manager_id": "E010"},
    {"emp_id": "E002", "name": "Michael Chen", "email": "michael.c@company.com", "birth_date": "1990-03-15", "age": 34, "department": "Engineering", "dept_code": "ENG", "role": "Junior Developer", "salary": 72000, "start_date": "2022-07-15", "manager_id": "E001"},
    {"emp_id": "E003", "name": "Emily Watson", "email": "emily.w@company.com", "birth_date": "1988-11-22", "age": 36, "department": "Marketing", "dept_code": "MKT", "role": "Marketing Manager", "salary": 98000, "start_date": "2018-01-10", "manager_id": "E010"},
    {"emp_id": "E004", "name": "David Park", "email": "david.p@company.com", "birth_date": "1992-07-04", "age": 32, "department": "Engineering", "dept_code": "ENG", "role": "Software Engineer", "salary": 105000, "start_date": "2020-09-01", "manager_id": "E001"},
    {"emp_id": "E005", "name": "Lisa Rodriguez", "email": "lisa.r@company.com", "birth_date": "1995-01-30", "age": 29, "department": "Sales", "dept_code": "SAL", "role": "Sales Representative", "salary": 65000, "start_date": "2023-02-14", "manager_id": "E008"},
    {"emp_id": "E006", "name": "James O'Brien", "email": "james.ob@company.com", "birth_date": "1987-09-18", "age": 37, "department": "Finance", "dept_code": "FIN", "role": "Financial Analyst", "salary": 88000, "start_date": "2019-05-20", "manager_id": "E010"},
    # row 6 (near-duplicate of row 5) deleted
    {"emp_id": "E008", "name": "Rachel Green", "email": "rachel.g@company.com", "birth_date": "1983-04-05", "age": 41, "department": "Sales", "dept_code": "SAL", "role": "Sales Director", "salary": 140000, "start_date": "2014-11-01", "manager_id": "E010"},
    {"emp_id": "E009", "name": "Tom Anderson", "email": "tom.a@company.com", "birth_date": "1991-12-25", "age": 33, "department": "Engineering", "dept_code": "ENG", "role": "Junior Developer", "salary": 75000, "start_date": "2023-06-01", "manager_id": "E001"},
    {"emp_id": "E010", "name": "Patricia Moore", "email": "patricia.m@company.com", "birth_date": "1978-02-14", "age": 46, "department": "Executive", "dept_code": "EXE", "role": "VP of Operations", "salary": 185000, "start_date": "2010-01-15", "manager_id": ""},
    {"emp_id": "E011", "name": "Kevin Hall", "email": "kevin.h@company.com", "birth_date": "1993-08-07", "age": 31, "department": "Marketing", "dept_code": "MKT", "role": "Content Specialist", "salary": 62000, "start_date": "2024-08-01", "manager_id": "E003"},
    {"emp_id": "E012", "name": "Amy Liu", "email": "amy.liu@company.com", "birth_date": "1994-05-19", "age": 30, "department": "Engineering", "dept_code": "ENG", "role": "QA Engineer", "salary": 82000, "start_date": "2021-04-12", "manager_id": "E001"},
    {"emp_id": "E013", "name": "Robert Taylor", "email": "robert.t@company.com", "birth_date": "1986-10-31", "age": 38, "department": "Sales", "dept_code": "SAL", "role": "Account Manager", "salary": 78000, "start_date": "2020-01-06", "manager_id": "E008"},
    {"emp_id": "E014", "name": "Nina Sharma", "email": "nina.s@company.com", "birth_date": "1997-03-22", "age": 27, "department": "Finance", "dept_code": "FIN", "role": "Junior Analyst", "salary": 58000, "start_date": "2024-01-08", "manager_id": "E006"},
    {"emp_id": "E015", "name": "Carlos Mendez", "email": "carlos.m@company.com", "birth_date": "1989-07-16", "age": 35, "department": "Engineering", "dept_code": "ENG", "role": "DevOps Engineer", "salary": 95000, "start_date": "2019-10-01", "manager_id": "E001"},
    {"emp_id": "E016", "name": "Sophie Turner", "email": "sophie.t@company.com", "birth_date": "1996-11-03", "age": 28, "department": "Marketing", "dept_code": "MKT", "role": "Social Media Mgr", "salary": 60000, "start_date": "2022-03-15", "manager_id": "E003"},
    {"emp_id": "E017", "name": "Alex Rivera", "email": "alex.r@company.com", "birth_date": "1984-01-28", "age": 40, "department": "Sales", "dept_code": "SAL", "role": "Regional Manager", "salary": 110000, "start_date": "2016-09-01", "manager_id": "E008"},
    {"emp_id": "E018", "name": "Diana Foster", "email": "diana.f@company.com", "birth_date": "1991-06-09", "age": 33, "department": "Finance", "dept_code": "FIN", "role": "Senior Accountant", "salary": 92000, "start_date": "2017-08-21", "manager_id": "E006"},
    {"emp_id": "E019", "name": "Brandon White", "email": "brandon.w@company.com", "birth_date": "1998-04-14", "age": 26, "department": "Engineering", "dept_code": "ENG", "role": "Intern", "salary": 45000, "start_date": "2024-06-01", "manager_id": "E001"},
    {"emp_id": "E020", "name": "Maria Gonzalez", "email": "maria.g@company.com", "birth_date": "1982-12-01", "age": 42, "department": "Executive", "dept_code": "EXE", "role": "CFO", "salary": 210000, "start_date": "2012-04-01", "manager_id": ""},
]
# Issue list for the hard task. "row" is a 0-based index into the dirty rows;
# "col" is None and "fix" is "__DELETE__" for the row that should be removed.
# Fix values are written as strings even for numeric columns.
_HARD_ISSUES: List[IssueDict] = [
    {"row": 1, "col": "age", "type": "cross_field", "desc": "Age 28 inconsistent with birth_date 1990-03-15 (should be ~34)", "fix": "34"},
    {"row": 3, "col": "dept_code", "type": "cross_field", "desc": "dept_code MKT but department is Engineering", "fix": "ENG"},
    {"row": 6, "col": None, "type": "near_duplicate", "desc": "Near-duplicate of row 5 (James Obrien vs James O'Brien)", "fix": "__DELETE__"},
    {"row": 8, "col": "salary", "type": "anomalous_value", "desc": "Salary $250k for Junior Developer (expected $60k-$85k)", "fix": "75000"},
    {"row": 10, "col": "start_date", "type": "future_date", "desc": "Start date 2025-08-01 is in the future", "fix": "2024-08-01"},
    {"row": 11, "col": "email", "type": "inconsistent_format", "desc": "Email in ALL CAPS vs lowercase convention", "fix": "amy.liu@company.com"},
    {"row": 12, "col": "department", "type": "missing_value", "desc": "Department empty but dept_code is SAL", "fix": "Sales"},
    {"row": 13, "col": "name", "type": "placeholder_value", "desc": "Name is literal 'NULL' string instead of real name", "fix": "Nina Sharma"},
    {"row": 14, "col": "salary", "type": "invalid_value", "desc": "Negative salary (-95000)", "fix": "95000"},
    {"row": 16, "col": "start_date", "type": "anomalous_value", "desc": "Start date 1899-01-01 is clearly wrong", "fix": "2016-09-01"},
    {"row": 18, "col": "manager_id", "type": "referential", "desc": "manager_id E999 does not exist in employee list", "fix": "E001"},
]
# ── public registry ────────────────────────────────────────────────────────
# Maps a task name to its definition. Every definition carries the same keys:
# name, title, difficulty, description, dirty_data, clean_data, issues,
# max_steps and columns. The data entries reference the module-level lists
# directly, so callers that mutate rows should work on a copy.
TASKS: Dict[str, Dict[str, Any]] = {
    "easy": {
        "name": "easy",
        "title": "Customer Contact Cleanup",
        "difficulty": "easy",
        "description": (
            "You are a data-quality analyst. A customer-contacts spreadsheet has "
            "been imported with several obvious errors: missing e-mails, invalid "
            "phone numbers, duplicate rows, and impossible ages. "
            "Identify and fix every issue. Use the available actions to correct "
            "each problem, then submit when you believe the data is clean."
        ),
        "dirty_data": _EASY_DIRTY,
        "clean_data": _EASY_CLEAN,
        "issues": _EASY_ISSUES,
        "max_steps": 15,
        "columns": ["id", "name", "email", "phone", "age", "city"],
    },
    "medium": {
        "name": "medium",
        "title": "E-commerce Order Normalisation",
        "difficulty": "medium",
        "description": (
            "You are a data engineer preparing an orders export for a BI dashboard. "
            "The dataset has mixed date formats (YYYY-MM-DD, DD/MM/YYYY, YYYY.MM.DD, DD-MM-YYYY), "
            "inconsistent price formatting, product-code variants (P100 vs P-100), "
            "a typo in a status field, a duplicate order, negative quantities, "
            "and missing values. Normalise every field so the data is consistent, "
            "then submit."
        ),
        "dirty_data": _MED_DIRTY,
        "clean_data": _MED_CLEAN,
        "issues": _MED_ISSUES,
        "max_steps": 25,
        "columns": ["order_id", "customer", "product", "quantity", "price", "date", "status"],
    },
    "hard": {
        "name": "hard",
        "title": "Employee Records Audit",
        "difficulty": "hard",
        "description": (
            "You are auditing an HR database before a compliance review. "
            "The employee records contain subtle cross-field inconsistencies "
            "(age vs birth-date mismatches, department vs dept-code conflicts), "
            "near-duplicate employees with slightly different name spellings, "
            "anomalous salary values for the given role, future or impossible dates, "
            "placeholder 'NULL' strings, ALL-CAPS email addresses, missing departments, "
            "and referential-integrity violations (manager_id pointing to non-existent employees). "
            "Find and fix all issues, then submit."
        ),
        "dirty_data": _HARD_DIRTY,
        "clean_data": _HARD_CLEAN,
        "issues": _HARD_ISSUES,
        "max_steps": 35,
        "columns": ["emp_id", "name", "email", "birth_date", "age", "department",
                    "dept_code", "role", "salary", "start_date", "manager_id"],
    },
}
def get_task(name: str) -> Dict[str, Any]:
    """Return a deep copy of a task definition so mutations are isolated.

    Args:
        name: Key of the task in ``TASKS``.

    Returns:
        An independent copy of the task dict; changing its rows or issue
        entries does not touch the module-level definitions.

    Raises:
        ValueError: If ``name`` is not a key of ``TASKS``. The message lists
            the available task names.
    """
    if name not in TASKS:
        raise ValueError(f"Unknown task '{name}'. Choose from: {list(TASKS)}")
    # deepcopy, not a shallow copy: the nested row and issue dicts are shared
    # module-level objects and must not be mutated through the returned task.
    return copy.deepcopy(TASKS[name])