Spaces:
Sleeping
Sleeping
| """ | |
| Task definitions for the Data Cleaning OpenEnv environment. | |
| Defines three tasks at easy, medium, and hard difficulty levels, each with | |
| complete ground truth data, schema definitions, and corruption specifications. | |
| SELF-CONTAINED: All ground truth data is inlined. No external imports for data. | |
| """ | |
| from __future__ import annotations | |
| from dataclasses import dataclass, field | |
| from typing import Any | |
| # ============================================================================ | |
| # Utility Probe dataclass | |
| # ============================================================================ | |
@dataclass
class UtilityProbe:
    """A downstream analytics probe run against the cleaned data.

    A probe checks that the cleaned dataset produces the correct *aggregate*
    result (for example a unique count or a per-group average), not merely
    that individual cells match the ground truth.

    The class must be a dataclass: the module builds probes with keyword
    arguments and relies on ``field(default_factory=dict)`` for ``params``.
    """

    # Short identifier of the probe, e.g. "unique_customer_count".
    name: str
    # Human-readable explanation, e.g. "Count of unique customers after dedup".
    description: str
    # Name of the probe function to run (stored as a string, not a callable).
    query_fn: str
    # Keyword parameters for the probe function, e.g. {"column": "email"}.
    # A factory is used so instances never share one mutable dict.
    params: dict[str, Any] = field(default_factory=dict)
    # The correct answer, as computed from the ground truth.
    expected_result: Any = None
| # ============================================================================ | |
| # Task dataclass | |
| # ============================================================================ | |
@dataclass
class Task:
    """Complete definition of one data-cleaning task.

    The class must be a dataclass: tasks are constructed with keyword
    arguments and the two trailing list fields rely on
    ``field(default_factory=list)`` so instances never share a mutable list.
    """

    # Registry key; ``register_task`` stores the task under this id.
    task_id: str
    # Human-readable task title.
    name: str
    # Difficulty label; the tasks in this module use "easy" and "medium".
    difficulty: str
    # Free-text description of the task.
    description: str
    # The clean reference rows, one dict per row.
    ground_truth: list[dict[str, Any]]
    # Schema dict; the schemas in this module carry the keys
    # "primary_key", "expected_types" and "constraints".
    schema: dict[str, Any]
    # Corruption specifications, one dict per injected corruption.
    corruptions: list[dict[str, Any]]
    # Step limit for the task -- presumably the per-episode action budget;
    # confirm against the environment that consumes it.
    max_steps: int
    # Aggregate-level probes evaluated on the cleaned data.
    utility_probes: list[UtilityProbe] = field(default_factory=list)
    # Pairs of strings identifying ambiguous cells -- presumably
    # (entity id, column); confirm against the grader.
    ambiguous_cells: list[tuple[str, str]] = field(default_factory=list)
| # ============================================================================ | |
| # Task registry | |
| # ============================================================================ | |
# Module-wide lookup table of every registered task, keyed by ``task_id``.
_TASK_REGISTRY: dict[str, Task] = {}


def register_task(task: Task) -> None:
    """Store *task* in the global registry under its ``task_id``.

    Registering a second task with the same id replaces the earlier entry.
    """
    registry_key = task.task_id
    _TASK_REGISTRY[registry_key] = task
def get_task(task_id: str) -> Task:
    """Look up a registered task by its ``task_id``.

    Raises ``KeyError`` (whose message lists the registered ids) when no task
    has been registered under *task_id*.
    """
    try:
        return _TASK_REGISTRY[task_id]
    except KeyError:
        known_ids = list(_TASK_REGISTRY)
        raise KeyError(
            f"Task '{task_id}' not found. Available: {known_ids}"
        ) from None
def list_tasks() -> list[dict[str, Any]]:
    """Describe every registered task as a plain metadata dict.

    Returns one dict per task, in registration order, holding the task's
    identifying fields plus the sizes of its ground truth, corruption list
    and probe list.
    """

    def _summarize(task: Task) -> dict[str, Any]:
        # Key order is part of the output shape; keep it stable.
        return {
            "task_id": task.task_id,
            "name": task.name,
            "difficulty": task.difficulty,
            "description": task.description,
            "num_ground_truth_rows": len(task.ground_truth),
            "num_corruptions": len(task.corruptions),
            "max_steps": task.max_steps,
            "num_utility_probes": len(task.utility_probes),
        }

    return list(map(_summarize, _TASK_REGISTRY.values()))
| # ############################################################################ | |
| # EASY TASK: Customer Contact Cleanup | |
| # ############################################################################ | |
# Clean reference rows for the easy task, written as a column header plus one
# tuple per contact; each tuple is zipped with the header into a row dict.
_EASY_GROUND_TRUTH: list[dict[str, Any]] = [
    dict(
        zip(
            (
                "_entity_id",
                "id",
                "first_name",
                "last_name",
                "email",
                "phone",
                "signup_date",
                "state",
            ),
            contact,
        )
    )
    for contact in (
        ("CONTACT001", 1, "Alice", "Morgan", "alice.morgan@example.com",
         "(555) 123-4567", "2022-01-15", "CA"),
        ("CONTACT002", 2, "Brian", "Cho", "brian.cho@example.com",
         "(555) 234-5678", "2022-03-22", "NY"),
        ("CONTACT003", 3, "Carmen", "Reyes", "carmen.reyes@example.com",
         "(555) 345-6789", "2022-06-10", "TX"),
        ("CONTACT004", 4, "David", "Novak", "david.novak@example.com",
         "(555) 456-7890", "2022-08-05", "FL"),
        ("CONTACT005", 5, "Elena", "Petrova", "elena.petrova@example.com",
         "(555) 567-8901", "2023-01-18", "WA"),
        ("CONTACT006", 6, "Frank", "Oduya", "frank.oduya@example.com",
         "(555) 678-9012", "2023-04-30", "IL"),
        ("CONTACT007", 7, "Grace", "Kim", "grace.kim@example.com",
         "(555) 789-0123", "2023-07-14", "OR"),
        ("CONTACT008", 8, "Hassan", "Ali", "hassan.ali@example.com",
         "(555) 890-1234", "2023-10-02", "CO"),
    )
]
# Schema for the easy task: primary key, a type tag per column, and the
# per-column constraints (regex/format strings and the allowed state codes).
_EASY_SCHEMA: dict[str, Any] = {
    "primary_key": "id",
    "expected_types": {
        "id": "int",
        "first_name": "str",
        "last_name": "str",
        "email": "email",
        "phone": "phone",
        "signup_date": "date",
        "state": "str",
    },
    "constraints": {
        "email": {
            "format": r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z]{2,}$",
        },
        "phone": {
            "format": r"^\(\d{3}\) \d{3}-\d{4}$",
        },
        "signup_date": {
            "format": "YYYY-MM-DD",
        },
        "state": {
            # Two-letter state codes, ten per line.
            "allowed_values": (
                "AL AK AZ AR CA CO CT DE FL GA "
                "HI ID IL IN IA KS KY LA ME MD "
                "MA MI MN MS MO MT NE NV NH NJ "
                "NM NY NC ND OH OK OR PA RI SC "
                "SD TN TX UT VT VA WA WV WI WY"
            ).split(),
        },
    },
}
# Corruption specs for the easy task. Each spec names the affected contact
# via "target_entity_id" and records the clean ("original") and dirty
# ("corrupted") cell values.
_EASY_CORRUPTIONS: list[dict[str, Any]] = [
    # Typos (two entries): adjacent letters swapped inside a name.
    {
        "type": "char_swap",
        "targets": [
            {"row_idx": 1, "field": "first_name"},
        ],
        "description": "Swapped letters in first_name for contact 2 (Brian -> Biran)",
        "target_entity_id": "CONTACT002",
        "original": "Brian",
        "corrupted": "Biran",
    },
    {
        "type": "char_swap",
        "targets": [
            {"row_idx": 5, "field": "last_name"},
        ],
        "description": "Transposed letters in last name for contact 6 (Oduya -> Oduay)",
        "target_entity_id": "CONTACT006",
        "original": "Oduya",
        "corrupted": "Oduay",
    },

    # Dates (two entries): signup_date rewritten in a non-ISO layout.
    {
        "type": "format_randomize",
        "column": "signup_date",
        "row_indices": [0],
        "description": "Date reformatted to MM/DD/YYYY for contact 1",
        "target_entity_id": "CONTACT001",
        "original": "2022-01-15",
        "corrupted": "01/15/2022",
    },
    {
        "type": "format_randomize",
        "column": "signup_date",
        "row_indices": [4],
        "description": "Date reformatted to DD-Mon-YYYY for contact 5",
        "target_entity_id": "CONTACT005",
        "original": "2023-01-18",
        "corrupted": "18-Jan-2023",
    },

    # Missing value (one entry): email blanked out.
    {
        "type": "null_inject",
        "targets": [
            {"row_idx": 3, "field": "email"},
        ],
        "description": "Email set to None for contact 4",
        "target_entity_id": "CONTACT004",
        "original": "david.novak@example.com",
        "corrupted": None,
    },

    # Formatting removed (one entry): phone reduced to bare digits.
    {
        "type": "format_strip",
        "column": "phone",
        "row_indices": [2],
        "description": "Phone stripped of formatting for contact 3",
        "target_entity_id": "CONTACT003",
        "original": "(555) 345-6789",
        "corrupted": "5553456789",
    },

    # Letter case (two entries): one value uppercased, one lowercased.
    {
        "type": "case_corrupt",
        "targets": [
            {"row_idx": 6, "field": "first_name"},
        ],
        "description": "First name uppercased for contact 7",
        "target_entity_id": "CONTACT007",
        "original": "Grace",
        "corrupted": "GRACE",
    },
    {
        "type": "case_corrupt",
        "targets": [
            {"row_idx": 7, "field": "state"},
        ],
        "description": "State lowercased for contact 8",
        "target_entity_id": "CONTACT008",
        "original": "CO",
        "corrupted": "co",
    },
]
# Aggregate probes for the easy task, listed as
# (name, description, query function name, parameters, expected result).
_EASY_PROBES: list[UtilityProbe] = [
    UtilityProbe(
        name=probe_name,
        description=summary,
        query_fn=fn_name,
        params=fn_params,
        expected_result=expected,
    )
    for probe_name, summary, fn_name, fn_params, expected in (
        (
            "unique_email_count",
            "Count of unique non-null email addresses after cleaning",
            "unique_count",
            {"column": "email"},
            8,
        ),
        (
            "state_distribution",
            "Count of contacts per state",
            "distribution",
            {"column": "state"},
            {
                "CA": 1,
                "NY": 1,
                "TX": 1,
                "FL": 1,
                "WA": 1,
                "IL": 1,
                "OR": 1,
                "CO": 1,
            },
        ),
    )
]
# Easy task definition. The ground truth has 8 rows and 7 columns, with
# ``id`` counted among the 7 -- the same way the medium task counts its key
# column -- so the description states the column count on that basis.
_EASY_TASK = Task(
    task_id="easy_contacts",
    name="Customer Contact Cleanup",
    difficulty="easy",
    description=(
        "Clean a small customer contact list with 8 rows and 7 columns (including id). "
        "Corruptions include character swaps (typos), randomised date formats, "
        "a null injection, a stripped phone number, and case corruptions. "
        "There are NO duplicate rows in this task."
    ),
    ground_truth=_EASY_GROUND_TRUTH,
    schema=_EASY_SCHEMA,
    corruptions=_EASY_CORRUPTIONS,
    max_steps=30,
    utility_probes=_EASY_PROBES,
)

# Make the task discoverable through get_task() / list_tasks().
register_task(_EASY_TASK)
| # ############################################################################ | |
| # MEDIUM TASK: Employee Records Reconciliation | |
| # ############################################################################ | |
# Clean reference rows for the medium task, written as a column header plus
# one tuple per employee; each tuple is zipped with the header into a row dict.
_MEDIUM_GROUND_TRUTH: list[dict[str, Any]] = [
    dict(
        zip(
            (
                "_entity_id",
                "emp_id",
                "first_name",
                "last_name",
                "email",
                "phone",
                "department",
                "hire_date",
                "salary",
                "office_city",
                "office_zip",
            ),
            employee,
        )
    )
    for employee in (
        ("EMP001", 1001, "Robert", "Chen", "r.chen@company.com",
         "(415) 555-0142", "Engineering", "2019-03-15", 145000.0,
         "San Francisco", "94105"),
        ("EMP002", 1002, "Jennifer", "Okafor", "j.okafor@company.com",
         "(212) 555-0387", "Marketing", "2020-07-22", 112000.0,
         "New York", "10013"),
        ("EMP003", 1003, "William", "Hernandez", "w.hernandez@company.com",
         "(512) 555-0219", "Sales", "2021-01-10", 98000.0,
         "Austin", "78701"),
        ("EMP004", 1004, "Priya", "Patel", "p.patel@company.com",
         "(206) 555-0463", "Engineering", "2019-11-04", 162000.0,
         "Seattle", "98101"),
        ("EMP005", 1005, "Michael", "Thompson", "m.thompson@company.com",
         "(312) 555-0578", "Finance", "2020-04-18", 131000.0,
         "Chicago", "60601"),
        ("EMP006", 1006, "Aisha", "Washington", "a.washington@company.com",
         "(303) 555-0691", "HR", "2022-02-28", 95000.0,
         "Denver", "80202"),
        ("EMP007", 1007, "Carlos", "Rivera", "c.rivera@company.com",
         "(415) 555-0734", "Engineering", "2021-06-14", 138000.0,
         "San Francisco", "94107"),
        ("EMP008", 1008, "Sarah", "Kim", "s.kim@company.com",
         "(212) 555-0856", "Marketing", "2023-01-09", 87000.0,
         "New York", "10001"),
        ("EMP009", 1009, "James", "O'Brien", "j.obrien@company.com",
         "(512) 555-0923", "Operations", "2019-08-21", 105000.0,
         "Austin", "78702"),
        ("EMP010", 1010, "Wei", "Zhang", "w.zhang@company.com",
         "(206) 555-0147", "Engineering", "2020-10-05", 155000.0,
         "Seattle", "98104"),
        ("EMP011", 1011, "Maria", "Gonzalez", "m.gonzalez@company.com",
         "(303) 555-0268", "Sales", "2022-05-16", 91000.0,
         "Denver", "80203"),
        ("EMP012", 1012, "David", "Nakamura", "d.nakamura@company.com",
         "(312) 555-0395", "Finance", "2021-09-01", 118000.0,
         "Chicago", "60602"),
        ("EMP013", 1013, "Fatima", "Al-Rashidi", "f.al-rashidi@company.com",
         "(415) 555-0512", "Engineering", "2023-03-20", 128000.0,
         "San Francisco", "94103"),
        ("EMP014", 1014, "Marcus", "Johnson", "m.johnson@company.com",
         "(212) 555-0643", "Operations", "2020-12-07", 102000.0,
         "New York", "10016"),
        ("EMP015", 1015, "Elena", "Petrov", "e.petrov@company.com",
         "(512) 555-0178", "HR", "2024-01-15", 88000.0,
         "Austin", "78703"),
        ("EMP016", 1016, "Andre", "Williams", "a.williams@company.com",
         "(206) 555-0834", "Sales", "2022-08-30", 96000.0,
         "Seattle", "98102"),
        ("EMP017", 1017, "Lisa", "Svensson", "l.svensson@company.com",
         "(312) 555-0956", "Finance", "2023-07-11", 108000.0,
         "Chicago", "60603"),
    )
]
# Schema for the medium task: primary key, a type tag per column, and the
# per-column constraints (regex/format strings, salary bounds, and the
# closed vocabularies for department and office city).
_MEDIUM_SCHEMA: dict[str, Any] = {
    "primary_key": "emp_id",
    "expected_types": {
        "emp_id": "int",
        "first_name": "str",
        "last_name": "str",
        "email": "email",
        "phone": "phone",
        "department": "str",
        "hire_date": "date",
        "salary": "float",
        "office_city": "str",
        "office_zip": "str",
    },
    "constraints": {
        "email": {
            "format": r"^[a-z]\.[a-z\-]+@company\.com$",
        },
        "phone": {
            "format": r"^\(\d{3}\) \d{3}-\d{4}$",
        },
        "hire_date": {
            "format": "YYYY-MM-DD",
        },
        "salary": {
            "min": 55000.0,
            "max": 185000.0,
        },
        "department": {
            "allowed_values": [
                "Engineering",
                "Marketing",
                "Sales",
                "HR",
                "Finance",
                "Operations",
            ],
        },
        "office_city": {
            "allowed_values": [
                "San Francisco",
                "New York",
                "Chicago",
                "Austin",
                "Seattle",
                "Denver",
            ],
        },
        "office_zip": {
            "format": r"^\d{5}$",
        },
    },
}
# Corruption specs for the medium task. Row-targeted specs identify the
# affected employee through "target_entity_id"/"target_emp_id"; duplicate
# specs carry the complete noisy copy in "corrupted_row".
_MEDIUM_CORRUPTIONS: list[dict[str, Any]] = [
    # --- duplicate_with_noise: 3 rows (nickname + one other noisy field) ---
    # "noise_fields" lists exactly the fields whose value in "corrupted_row"
    # differs from the source row in the ground truth.
    {
        "type": "duplicate_with_noise",
        "source_indices": [0],
        "noise_fields": ["first_name", "phone"],
        "description": "Duplicate of EMP001 (Robert Chen) as 'Bob Chen' with transposed phone digits",
        "source_entity_id": "EMP001",
        "source_emp_id": 1001,
        "corrupted_row": {
            "emp_id": 1001,
            "first_name": "Bob",
            "last_name": "Chen",
            "email": "r.chen@company.com",
            "phone": "(415) 555-0124",
            "department": "Engineering",
            "hire_date": "2019-03-15",
            "salary": 145000.00,
            "office_city": "San Francisco",
            "office_zip": "94105",
        },
    },
    {
        "type": "duplicate_with_noise",
        "source_indices": [2],
        # The phone is unchanged in this duplicate; the noisy fields are the
        # nickname and the salary (98500 vs 98000).
        "noise_fields": ["first_name", "salary"],
        "description": "Duplicate of EMP003 (William Hernandez) as 'Will Hernandez' with slightly different salary",
        "source_entity_id": "EMP003",
        "source_emp_id": 1003,
        "corrupted_row": {
            "emp_id": 1003,
            "first_name": "Will",
            "last_name": "Hernandez",
            "email": "w.hernandez@company.com",
            "phone": "(512) 555-0219",
            "department": "Sales",
            "hire_date": "2021-01-10",
            "salary": 98500.00,
            "office_city": "Austin",
            "office_zip": "78701",
        },
    },
    {
        "type": "duplicate_with_noise",
        "source_indices": [4],
        # The phone is unchanged in this duplicate; the noisy fields are the
        # nickname and the hire date (MM/DD/YYYY instead of YYYY-MM-DD).
        "noise_fields": ["first_name", "hire_date"],
        "description": "Duplicate of EMP005 (Michael Thompson) as 'Mike Thompson' with wrong date format",
        "source_entity_id": "EMP005",
        "source_emp_id": 1005,
        "corrupted_row": {
            "emp_id": 1005,
            "first_name": "Mike",
            "last_name": "Thompson",
            "email": "m.thompson@company.com",
            "phone": "(312) 555-0578",
            "department": "Finance",
            "hire_date": "04/18/2020",
            "salary": 131000.00,
            "office_city": "Chicago",
            "office_zip": "60601",
        },
    },
    # --- format_randomize dates: 4 ---
    {
        "type": "format_randomize",
        "column": "hire_date",
        "row_indices": [10],
        "description": "Hire date in DD-MM-YYYY for EMP011 (Maria Gonzalez)",
        "target_entity_id": "EMP011",
        "target_emp_id": 1011,
        "field": "hire_date",
        "original": "2022-05-16",
        "corrupted": "16-05-2022",
    },
    {
        "type": "format_randomize",
        "column": "hire_date",
        "row_indices": [14],
        "description": "Hire date in MM/DD/YYYY for EMP015 (Elena Petrov)",
        "target_entity_id": "EMP015",
        "target_emp_id": 1015,
        "field": "hire_date",
        "original": "2024-01-15",
        "corrupted": "01/15/2024",
    },
    {
        "type": "format_randomize",
        "column": "hire_date",
        "row_indices": [7],
        "description": "Hire date in Mon DD, YYYY for EMP008 (Sarah Kim)",
        "target_entity_id": "EMP008",
        "target_emp_id": 1008,
        "field": "hire_date",
        "original": "2023-01-09",
        "corrupted": "Jan 09, 2023",
    },
    {
        "type": "format_randomize",
        "column": "hire_date",
        "row_indices": [16],
        "description": "Hire date in DD/MM/YYYY for EMP017 (Lisa Svensson)",
        "target_entity_id": "EMP017",
        "target_emp_id": 1017,
        "field": "hire_date",
        "original": "2023-07-11",
        "corrupted": "11/07/2023",
    },
    # --- null_inject: 3 ---
    {
        "type": "null_inject",
        "targets": [{"row_idx": 5, "field": "phone"}],
        "description": "Phone set to None for EMP006 (Aisha Washington)",
        "target_entity_id": "EMP006",
        "target_emp_id": 1006,
        "original": "(303) 555-0691",
        "corrupted": None,
    },
    {
        "type": "null_inject",
        "targets": [{"row_idx": 11, "field": "department"}],
        "description": "Department set to None for EMP012 (David Nakamura)",
        "target_entity_id": "EMP012",
        "target_emp_id": 1012,
        "original": "Finance",
        "corrupted": None,
    },
    {
        "type": "null_inject",
        "targets": [{"row_idx": 15, "field": "email"}],
        "description": "Email set to None for EMP016 (Andre Williams)",
        "target_entity_id": "EMP016",
        "target_emp_id": 1016,
        "original": "a.williams@company.com",
        "corrupted": None,
    },
    # --- value_variation: 2 misspellings (one department, one last name) ---
    {
        "type": "value_variation",
        "column": "department",
        "mapping": {"Engineering": ["Engneering"]},
        "description": "Department 'Engineering' misspelled as 'Engneering' for EMP007 (Carlos Rivera)",
        "target_entity_id": "EMP007",
        "target_emp_id": 1007,
        "field": "department",
        "original": "Engineering",
        "corrupted": "Engneering",
    },
    {
        "type": "value_variation",
        "column": "last_name",
        "mapping": {"Okafor": ["Okafro"]},
        "description": "Last name 'Okafor' transposed as 'Okafro' for EMP002 (Jennifer Okafor)",
        "target_entity_id": "EMP002",
        "target_emp_id": 1002,
        "field": "last_name",
        "original": "Okafor",
        "corrupted": "Okafro",
    },
    # --- state_expand: 2 ---
    # NOTE(review): despite the type name, these two entries corrupt a city
    # name and a department name, not a state. The "type" values are left
    # untouched because whatever consumes them is not visible here.
    {
        "type": "state_expand",
        "row_indices": [12],
        "description": "Office city 'San Francisco' misspelled as 'San Fransisco' for EMP013 (Fatima Al-Rashidi)",
        "target_entity_id": "EMP013",
        "target_emp_id": 1013,
        "field": "office_city",
        "original": "San Francisco",
        "corrupted": "San Fransisco",
    },
    {
        "type": "state_expand",
        "row_indices": [13],
        "description": "Department 'Operations' corrupted with extra space as 'Operat ions' for EMP014 (Marcus Johnson)",
        "target_entity_id": "EMP014",
        "target_emp_id": 1014,
        "field": "department",
        "original": "Operations",
        "corrupted": "Operat ions",
    },
    # --- format_randomize salary: 1 (sign flipped, violating the salary minimum) ---
    {
        "type": "format_randomize",
        "column": "salary",
        "row_indices": [7],
        "description": "Salary turned negative for EMP008 (Sarah Kim) - invalid value",
        "target_entity_id": "EMP008",
        "target_emp_id": 1008,
        "field": "salary",
        "original": 87000.00,
        "corrupted": -87000.00,
    },
]
# Aggregate probes for the medium task, listed as
# (name, description, query function name, parameters, expected result).
_MEDIUM_PROBES: list[UtilityProbe] = [
    UtilityProbe(
        name=probe_name,
        description=summary,
        query_fn=fn_name,
        params=fn_params,
        expected_result=expected,
    )
    for probe_name, summary, fn_name, fn_params, expected in (
        (
            "unique_employee_count",
            "Count of unique employees after deduplication",
            "unique_count",
            {"column": "emp_id"},
            17,
        ),
        (
            "department_salary_avg",
            "Average salary per department",
            "avg_by_group",
            {"value_col": "salary", "group_col": "department"},
            {
                "Engineering": 145600.00,
                "Finance": 119000.00,
                "HR": 91500.00,
                "Marketing": 99500.00,
                "Operations": 103500.00,
                "Sales": 95000.00,
            },
        ),
        (
            "engineering_headcount",
            "Number of employees in the Engineering department",
            "count_where",
            {"column": "department", "value": "Engineering"},
            5,
        ),
    )
]
# Medium task definition: 17 clean employee rows, plus the corruption specs
# (including 3 noisy duplicates) and the aggregate probes declared for it.
_MEDIUM_TASK = Task(
    task_id="medium_employees",
    difficulty="medium",
    name="Employee Records Reconciliation",
    max_steps=60,
    ground_truth=_MEDIUM_GROUND_TRUTH,
    schema=_MEDIUM_SCHEMA,
    corruptions=_MEDIUM_CORRUPTIONS,
    utility_probes=_MEDIUM_PROBES,
    description=(
        "Reconcile a set of 17 unique employee records (10 columns) "
        "that have been corrupted with 3 near-duplicate rows "
        "(nickname variants), date format inconsistencies, "
        "null injections, department name typos, city misspellings, "
        "and an invalid salary. "
        "The dirty dataset has 20 rows."
    ),
)

# Make the task discoverable through get_task() / list_tasks().
register_task(_MEDIUM_TASK)
| # ############################################################################ | |
| # HARD TASK: Multi-Source Patient Registry | |
| # ############################################################################ | |
| _HARD_GROUND_TRUTH: list[dict[str, Any]] = [ | |
| # --- DUPLICATE CLUSTER CANDIDATES (patients 1-6) --- | |
| { | |
| "_entity_id": "PAT001", | |
| "patient_id": 1, | |
| "first_name": "William", | |
| "last_name": "Thompson", | |
| "dob": "1958-03-14", | |
| "gender": "M", | |
| "phone": "312-555-0147", | |
| "email": "william.thompson@gmail.com", | |
| "address_line": "742 North Michigan Ave", | |
| "city": "Chicago", | |
| "state": "IL", | |
| "zip": "60601", | |
| "insurance_provider": "Blue Cross", | |
| "insurance_id": "BC-4471583", | |
| "last_visit_date": "2024-01-15", | |
| }, | |
| { | |
| "_entity_id": "PAT002", | |
| "patient_id": 2, | |
| "first_name": "Robert", | |
| "last_name": "Martinez", | |
| "dob": "1975-07-22", | |
| "gender": "M", | |
| "phone": "206-555-0293", | |
| "email": "robert.martinez@outlook.com", | |
| "address_line": "1501 Pike Place", | |
| "city": "Seattle", | |
| "state": "WA", | |
| "zip": "98101", | |
| "insurance_provider": "UnitedHealth", | |
| "insurance_id": "UH-8823146", | |
| "last_visit_date": "2024-03-08", | |
| }, | |
| { | |
| "_entity_id": "PAT003", | |
| "patient_id": 3, | |
| "first_name": "Elizabeth", | |
| "last_name": "O'Brien", | |
| "dob": "1982-11-05", | |
| "gender": "F", | |
| "phone": "617-555-0381", | |
| "email": "elizabeth.obrien@yahoo.com", | |
| "address_line": "88 Beacon Street", | |
| "city": "Boston", | |
| "state": "MA", | |
| "zip": "02101", | |
| "insurance_provider": "Aetna", | |
| "insurance_id": "AE-5539274", | |
| "last_visit_date": "2023-09-20", | |
| }, | |
| { | |
| "_entity_id": "PAT004", | |
| "patient_id": 4, | |
| "first_name": "Jennifer", | |
| "last_name": "Nguyen", | |
| "dob": "1990-04-18", | |
| "gender": "F", | |
| "phone": "503-555-0462", | |
| "email": "jennifer.nguyen@gmail.com", | |
| "address_line": "2200 NW Burnside Rd", | |
| "city": "Portland", | |
| "state": "OR", | |
| "zip": "97201", | |
| "insurance_provider": "Cigna", | |
| "insurance_id": "CI-3317892", | |
| "last_visit_date": "2024-06-11", | |
| }, | |
| { | |
| "_entity_id": "PAT005", | |
| "patient_id": 5, | |
| "first_name": "James", | |
| "last_name": "Kowalski", | |
| "dob": "1967-09-30", | |
| "gender": "M", | |
| "phone": "303-555-0518", | |
| "email": "james.kowalski@aol.com", | |
| "address_line": "1600 Colfax Ave", | |
| "city": "Denver", | |
| "state": "CO", | |
| "zip": "80201", | |
| "insurance_provider": "Medicare", | |
| "insurance_id": "MC-6642018", | |
| "last_visit_date": "2024-02-28", | |
| }, | |
| { | |
| "_entity_id": "PAT006", | |
| "patient_id": 6, | |
| "first_name": "Katherine", | |
| "last_name": "Patel", | |
| "dob": "1988-01-27", | |
| "gender": "F", | |
| "phone": "404-555-0637", | |
| "email": "katherine.patel@hotmail.com", | |
| "address_line": "350 Peachtree St NE", | |
| "city": "Atlanta", | |
| "state": "GA", | |
| "zip": "30301", | |
| "insurance_provider": "Blue Cross", | |
| "insurance_id": "BC-9918453", | |
| "last_visit_date": "2023-12-05", | |
| }, | |
| # --- FALSE POSITIVE PAIR 1: Two different "Michael Davis" --- | |
| { | |
| "_entity_id": "PAT007", | |
| "patient_id": 7, | |
| "first_name": "Michael", | |
| "last_name": "Davis", | |
| "dob": "1972-06-10", | |
| "gender": "M", | |
| "phone": "310-555-0744", | |
| "email": "michael.davis72@gmail.com", | |
| "address_line": "456 Rodeo Drive", | |
| "city": "Beverly Hills", | |
| "state": "CA", | |
| "zip": "90210", | |
| "insurance_provider": "UnitedHealth", | |
| "insurance_id": "UH-1157329", | |
| "last_visit_date": "2024-04-22", | |
| }, | |
| { | |
| "_entity_id": "PAT008", | |
| "patient_id": 8, | |
| "first_name": "Michael", | |
| "last_name": "Davis", | |
| "dob": "1995-02-14", | |
| "gender": "M", | |
| "phone": "305-555-0856", | |
| "email": "mdavis95@yahoo.com", | |
| "address_line": "900 Biscayne Blvd", | |
| "city": "Miami", | |
| "state": "FL", | |
| "zip": "33101", | |
| "insurance_provider": "Aetna", | |
| "insurance_id": "AE-7743201", | |
| "last_visit_date": "2023-11-30", | |
| }, | |
| # --- FALSE POSITIVE PAIR 2: Two different "Maria Garcia" --- | |
| { | |
| "_entity_id": "PAT009", | |
| "patient_id": 9, | |
| "first_name": "Maria", | |
| "last_name": "Garcia", | |
| "dob": "1960-08-03", | |
| "gender": "F", | |
| "phone": "512-555-0912", | |
| "email": "maria.garcia60@gmail.com", | |
| "address_line": "1100 Congress Ave", | |
| "city": "Austin", | |
| "state": "TX", | |
| "zip": "73301", | |
| "insurance_provider": "Medicare", | |
| "insurance_id": "MC-2204857", | |
| "last_visit_date": "2024-05-14", | |
| }, | |
| { | |
| "_entity_id": "PAT010", | |
| "patient_id": 10, | |
| "first_name": "Maria", | |
| "last_name": "Garcia", | |
| "dob": "1985-12-19", | |
| "gender": "F", | |
| "phone": "212-555-1023", | |
| "email": "mgarcia.nyc@outlook.com", | |
| "address_line": "250 West 34th St", | |
| "city": "New York", | |
| "state": "NY", | |
| "zip": "10001", | |
| "insurance_provider": "Cigna", | |
| "insurance_id": "CI-8856134", | |
| "last_visit_date": "2023-08-27", | |
| }, | |
| # --- REMAINING UNIQUE PATIENTS (11-30) --- | |
| { | |
| "_entity_id": "PAT011", | |
| "patient_id": 11, | |
| "first_name": "David", | |
| "last_name": "Chen", | |
| "dob": "1979-05-11", | |
| "gender": "M", | |
| "phone": "415-555-1134", | |
| "email": "david.chen@gmail.com", | |
| "address_line": "580 Market Street", | |
| "city": "San Francisco", | |
| "state": "CA", | |
| "zip": "94105", | |
| "insurance_provider": "Blue Cross", | |
| "insurance_id": "BC-3364721", | |
| "last_visit_date": "2024-07-03", | |
| }, | |
| { | |
| "_entity_id": "PAT012", | |
| "patient_id": 12, | |
| "first_name": "Sarah", | |
| "last_name": "Johnson", | |
| "dob": "1993-02-28", | |
| "gender": "F", | |
| "phone": "713-555-1245", | |
| "email": "sarah.johnson93@yahoo.com", | |
| "address_line": "4200 Westheimer Rd", | |
| "city": "Houston", | |
| "state": "TX", | |
| "zip": "77027", | |
| "insurance_provider": "UnitedHealth", | |
| "insurance_id": "UH-5578934", | |
| "last_visit_date": "2023-10-18", | |
| }, | |
| { | |
| "_entity_id": "PAT013", | |
| "patient_id": 13, | |
| "first_name": "Thomas", | |
| "last_name": "Wilson", | |
| "dob": "1945-10-07", | |
| "gender": "M", | |
| "phone": "602-555-1356", | |
| "email": "tom.wilson45@aol.com", | |
| "address_line": "3300 N Central Ave", | |
| "city": "Phoenix", | |
| "state": "AZ", | |
| "zip": "85012", | |
| "insurance_provider": "Medicare", | |
| "insurance_id": "MC-1192746", | |
| "last_visit_date": "2024-01-09", | |
| }, | |
| # --- STATISTICALLY UNUSUAL BUT VALID: Male named Ashley --- | |
| { | |
| "_entity_id": "PAT014", | |
| "patient_id": 14, | |
| "first_name": "Ashley", | |
| "last_name": "Richardson", | |
| "dob": "1970-12-22", | |
| "gender": "M", | |
| "phone": "615-555-1467", | |
| "email": "ashley.richardson@gmail.com", | |
| "address_line": "210 Broadway", | |
| "city": "Nashville", | |
| "state": "TN", | |
| "zip": "37201", | |
| "insurance_provider": "Aetna", | |
| "insurance_id": "AE-4426183", | |
| "last_visit_date": "2024-03-19", | |
| }, | |
| { | |
| "_entity_id": "PAT015", | |
| "patient_id": 15, | |
| "first_name": "Patricia", | |
| "last_name": "Lee", | |
| "dob": "1952-04-15", | |
| "gender": "F", | |
| "phone": "702-555-1578", | |
| "email": "patricia.lee@outlook.com", | |
| "address_line": "3600 Las Vegas Blvd S", | |
| "city": "Las Vegas", | |
| "state": "NV", | |
| "zip": "89109", | |
| "insurance_provider": "Medicare", | |
| "insurance_id": "MC-8835492", | |
| "last_visit_date": "2023-07-25", | |
| }, | |
| { | |
| "_entity_id": "PAT016", | |
| "patient_id": 16, | |
| "first_name": "Daniel", | |
| "last_name": "Brown", | |
| "dob": "1998-08-09", | |
| "gender": "M", | |
| "phone": "214-555-1689", | |
| "email": "daniel.brown98@gmail.com", | |
| "address_line": "1700 Pacific Ave", | |
| "city": "Dallas", | |
| "state": "TX", | |
| "zip": "75201", | |
| "insurance_provider": "Cigna", | |
| "insurance_id": "CI-2293847", | |
| "last_visit_date": "2024-08-01", | |
| }, | |
| { | |
| "_entity_id": "PAT017", | |
| "patient_id": 17, | |
| "first_name": "Linda", | |
| "last_name": "Anderson", | |
| "dob": "1963-01-30", | |
| "gender": "F", | |
| "phone": "952-555-1791", | |
| "email": "linda.anderson@yahoo.com", | |
| "address_line": "800 Nicollet Mall", | |
| "city": "Minneapolis", | |
| "state": "MN", | |
| "zip": "55402", | |
| "insurance_provider": "Blue Cross", | |
| "insurance_id": "BC-6671835", | |
| "last_visit_date": "2024-05-30", | |
| }, | |
| { | |
| "_entity_id": "PAT018", | |
| "patient_id": 18, | |
| "first_name": "Christopher", | |
| "last_name": "Taylor", | |
| "dob": "1984-06-14", | |
| "gender": "M", | |
| "phone": "704-555-1802", | |
| "email": "chris.taylor84@gmail.com", | |
| "address_line": "401 N Tryon St", | |
| "city": "Charlotte", | |
| "state": "NC", | |
| "zip": "28202", | |
| "insurance_provider": "UnitedHealth", | |
| "insurance_id": "UH-3349128", | |
| "last_visit_date": "2023-12-12", | |
| }, | |
| { | |
| "_entity_id": "PAT019", | |
| "patient_id": 19, | |
| "first_name": "Nancy", | |
| "last_name": "White", | |
| "dob": "1948-09-21", | |
| "gender": "F", | |
| "phone": "314-555-1913", | |
| "email": "nancy.white48@hotmail.com", | |
| "address_line": "100 Washington Ave", | |
| "city": "Saint Louis", | |
| "state": "MO", | |
| "zip": "63101", | |
| "insurance_provider": "Medicare", | |
| "insurance_id": "MC-4457631", | |
| "last_visit_date": "2024-02-14", | |
| }, | |
| { | |
| "_entity_id": "PAT020", | |
| "patient_id": 20, | |
| "first_name": "Kevin", | |
| "last_name": "Harris", | |
| "dob": "2001-03-05", | |
| "gender": "M", | |
| "phone": "407-555-2024", | |
| "email": "kevin.harris01@gmail.com", | |
| "address_line": "525 S Orange Ave", | |
| "city": "Orlando", | |
| "state": "FL", | |
| "zip": "32801", | |
| "insurance_provider": "Aetna", | |
| "insurance_id": "AE-9914567", | |
| "last_visit_date": "2024-06-28", | |
| }, | |
| { | |
| "_entity_id": "PAT021", | |
| "patient_id": 21, | |
| "first_name": "Susan", | |
| "last_name": "Clark", | |
| "dob": "1977-11-13", | |
| "gender": "F", | |
| "phone": "412-555-2135", | |
| "email": "susan.clark77@outlook.com", | |
| "address_line": "600 Grant St", | |
| "city": "Pittsburgh", | |
| "state": "PA", | |
| "zip": "15219", | |
| "insurance_provider": "Cigna", | |
| "insurance_id": "CI-5582719", | |
| "last_visit_date": "2023-09-05", | |
| }, | |
| # --- STATISTICALLY UNUSUAL BUT VALID: Female named Jordan --- | |
| { | |
| "_entity_id": "PAT022", | |
| "patient_id": 22, | |
| "first_name": "Jordan", | |
| "last_name": "Mitchell", | |
| "dob": "1996-07-08", | |
| "gender": "F", | |
| "phone": "619-555-2246", | |
| "email": "jordan.mitchell96@gmail.com", | |
| "address_line": "750 B Street", | |
| "city": "San Diego", | |
| "state": "CA", | |
| "zip": "92101", | |
| "insurance_provider": "Blue Cross", | |
| "insurance_id": "BC-7728364", | |
| "last_visit_date": "2024-04-10", | |
| }, | |
| { | |
| "_entity_id": "PAT023", | |
| "patient_id": 23, | |
| "first_name": "Richard", | |
| "last_name": "Lopez", | |
| "dob": "1955-12-01", | |
| "gender": "M", | |
| "phone": "210-555-2357", | |
| "email": "richard.lopez55@aol.com", | |
| "address_line": "300 Alamo Plaza", | |
| "city": "San Antonio", | |
| "state": "TX", | |
| "zip": "78205", | |
| "insurance_provider": "Medicare", | |
| "insurance_id": "MC-3346285", | |
| "last_visit_date": "2024-01-22", | |
| }, | |
| { | |
| "_entity_id": "PAT024", | |
| "patient_id": 24, | |
| "first_name": "Angela", | |
| "last_name": "Robinson", | |
| "dob": "1989-05-26", | |
| "gender": "F", | |
| "phone": "317-555-2468", | |
| "email": "angela.robinson@yahoo.com", | |
| "address_line": "200 E Washington St", | |
| "city": "Indianapolis", | |
| "state": "IN", | |
| "zip": "46204", | |
| "insurance_provider": "UnitedHealth", | |
| "insurance_id": "UH-6693241", | |
| "last_visit_date": "2023-11-08", | |
| }, | |
| { | |
| "_entity_id": "PAT025", | |
| "patient_id": 25, | |
| "first_name": "Steven", | |
| "last_name": "Walker", | |
| "dob": "1941-07-19", | |
| "gender": "M", | |
| "phone": "216-555-2579", | |
| "email": "steven.walker@gmail.com", | |
| "address_line": "1100 Superior Ave", | |
| "city": "Cleveland", | |
| "state": "OH", | |
| "zip": "44114", | |
| "insurance_provider": "Medicare", | |
| "insurance_id": "MC-1128574", | |
| "last_visit_date": "2024-03-05", | |
| }, | |
| { | |
| "_entity_id": "PAT026", | |
| "patient_id": 26, | |
| "first_name": "Michelle", | |
| "last_name": "Young", | |
| "dob": "2003-10-31", | |
| "gender": "F", | |
| "phone": "504-555-2681", | |
| "email": "michelle.young03@outlook.com", | |
| "address_line": "800 Canal St", | |
| "city": "New Orleans", | |
| "state": "LA", | |
| "zip": "70112", | |
| "insurance_provider": "Aetna", | |
| "insurance_id": "AE-8847392", | |
| "last_visit_date": "2024-07-19", | |
| }, | |
| # --- STATISTICALLY UNUSUAL BUT VALID: Male named Shannon --- | |
| { | |
| "_entity_id": "PAT027", | |
| "patient_id": 27, | |
| "first_name": "Shannon", | |
| "last_name": "Burke", | |
| "dob": "1974-02-08", | |
| "gender": "M", | |
| "phone": "816-555-2792", | |
| "email": "shannon.burke@gmail.com", | |
| "address_line": "1200 Main St", | |
| "city": "Kansas City", | |
| "state": "MO", | |
| "zip": "64105", | |
| "insurance_provider": "Cigna", | |
| "insurance_id": "CI-4431957", | |
| "last_visit_date": "2023-08-14", | |
| }, | |
| { | |
| "_entity_id": "PAT028", | |
| "patient_id": 28, | |
| "first_name": "Dorothy", | |
| "last_name": "Hall", | |
| "dob": "1943-06-17", | |
| "gender": "F", | |
| "phone": "414-555-2903", | |
| "email": "dorothy.hall43@yahoo.com", | |
| "address_line": "500 W Wisconsin Ave", | |
| "city": "Milwaukee", | |
| "state": "WI", | |
| "zip": "53203", | |
| "insurance_provider": "Medicare", | |
| "insurance_id": "MC-5563418", | |
| "last_visit_date": "2024-02-07", | |
| }, | |
| { | |
| "_entity_id": "PAT029", | |
| "patient_id": 29, | |
| "first_name": "Brian", | |
| "last_name": "Kim", | |
| "dob": "1992-09-14", | |
| "gender": "M", | |
| "phone": "571-555-3014", | |
| "email": "brian.kim92@gmail.com", | |
| "address_line": "1750 Tysons Blvd", | |
| "city": "Tysons", | |
| "state": "VA", | |
| "zip": "22102", | |
| "insurance_provider": "Blue Cross", | |
| "insurance_id": "BC-2215847", | |
| "last_visit_date": "2024-05-02", | |
| }, | |
| { | |
| "_entity_id": "PAT030", | |
| "patient_id": 30, | |
| "first_name": "Laura", | |
| "last_name": "Scott", | |
| "dob": "1999-01-23", | |
| "gender": "F", | |
| "phone": "919-555-3125", | |
| "email": "laura.scott99@hotmail.com", | |
| "address_line": "400 Fayetteville St", | |
| "city": "Raleigh", | |
| "state": "NC", | |
| "zip": "27601", | |
| "insurance_provider": "UnitedHealth", | |
| "insurance_id": "UH-7741926", | |
| "last_visit_date": "2023-10-31", | |
| }, | |
| # --- NEW PATIENTS (PAT031-PAT050) --- | |
| # --- GENDER-NEUTRAL NAME TRAP: Morgan (M) --- | |
| { | |
| "_entity_id": "PAT031", | |
| "patient_id": 31, | |
| "first_name": "Morgan", | |
| "last_name": "Fletcher", | |
| "dob": "1986-04-12", | |
| "gender": "M", | |
| "phone": "253-555-3201", | |
| "email": "morgan.fletcher86@gmail.com", | |
| "address_line": "1900 Pacific Ave", | |
| "city": "Tacoma", | |
| "state": "WA", | |
| "zip": "98402", | |
| "insurance_provider": "Cigna", | |
| "insurance_id": "CI-7712548", | |
| "last_visit_date": "2024-03-22", | |
| }, | |
| # --- FALSE POSITIVE PAIR 3: Two "David Kim" (PAT032 & PAT033) --- | |
| { | |
| "_entity_id": "PAT032", | |
| "patient_id": 32, | |
| "first_name": "David", | |
| "last_name": "Kim", | |
| "dob": "1988-07-20", | |
| "gender": "M", | |
| "phone": "425-555-3312", | |
| "email": "david.kim88@gmail.com", | |
| "address_line": "300 108th Ave NE", | |
| "city": "Bellevue", | |
| "state": "WA", | |
| "zip": "98004", | |
| "insurance_provider": "UnitedHealth", | |
| "insurance_id": "UH-4423891", | |
| "last_visit_date": "2024-05-18", | |
| }, | |
| { | |
| "_entity_id": "PAT033", | |
| "patient_id": 33, | |
| "first_name": "David", | |
| "last_name": "Kim", | |
| "dob": "1990-11-03", | |
| "gender": "M", | |
| "phone": "206-555-3423", | |
| "email": "dkim90@outlook.com", | |
| "address_line": "815 Pine St", | |
| "city": "Seattle", | |
| "state": "WA", | |
| "zip": "98101", | |
| "insurance_provider": "UnitedHealth", | |
| "insurance_id": "UH-6638172", | |
| "last_visit_date": "2024-01-29", | |
| }, | |
| # --- DUPLICATE CLUSTER: Christopher -> typo variants (PAT034) --- | |
| { | |
| "_entity_id": "PAT034", | |
| "patient_id": 34, | |
| "first_name": "Christopher", | |
| "last_name": "Reeves", | |
| "dob": "1976-08-25", | |
| "gender": "M", | |
| "phone": "813-555-3534", | |
| "email": "christopher.reeves@gmail.com", | |
| "address_line": "1400 N Dale Mabry Hwy", | |
| "city": "Tampa", | |
| "state": "FL", | |
| "zip": "33607", | |
| "insurance_provider": "Blue Cross", | |
| "insurance_id": "BC-5547832", | |
| "last_visit_date": "2024-06-05", | |
| }, | |
| # --- GENDER-NEUTRAL NAME TRAP: Avery (M) --- | |
| { | |
| "_entity_id": "PAT035", | |
| "patient_id": 35, | |
| "first_name": "Avery", | |
| "last_name": "Simmons", | |
| "dob": "1994-02-17", | |
| "gender": "M", | |
| "phone": "678-555-3645", | |
| "email": "avery.simmons94@yahoo.com", | |
| "address_line": "2500 Peachtree Rd NW", | |
| "city": "Atlanta", | |
| "state": "GA", | |
| "zip": "30305", | |
| "insurance_provider": "Aetna", | |
| "insurance_id": "AE-3318724", | |
| "last_visit_date": "2023-11-15", | |
| }, | |
| # --- FALSE POSITIVE PAIR 4: Two "Sarah Williams" (PAT036 & PAT037) --- | |
| { | |
| "_entity_id": "PAT036", | |
| "patient_id": 36, | |
| "first_name": "Sarah", | |
| "last_name": "Williams", | |
| "dob": "1983-09-14", | |
| "gender": "F", | |
| "phone": "312-555-3756", | |
| "email": "sarah.williams83@gmail.com", | |
| "address_line": "55 E Monroe St", | |
| "city": "Chicago", | |
| "state": "IL", | |
| "zip": "60603", | |
| "insurance_provider": "Blue Cross", | |
| "insurance_id": "BC-8834291", | |
| "last_visit_date": "2024-04-01", | |
| }, | |
| { | |
| "_entity_id": "PAT037", | |
| "patient_id": 37, | |
| "first_name": "Sarah", | |
| "last_name": "Williams", | |
| "dob": "1978-03-22", | |
| "gender": "F", | |
| "phone": "773-555-3867", | |
| "email": "swilliams78@yahoo.com", | |
| "address_line": "4700 N Lincoln Ave", | |
| "city": "Chicago", | |
| "state": "IL", | |
| "zip": "60625", | |
| "insurance_provider": "Cigna", | |
| "insurance_id": "CI-2247163", | |
| "last_visit_date": "2023-07-19", | |
| }, | |
| # --- DUPLICATE CLUSTER: Alexandra -> misspelling variant (PAT038) --- | |
| { | |
| "_entity_id": "PAT038", | |
| "patient_id": 38, | |
| "first_name": "Alexandra", | |
| "last_name": "Petrov", | |
| "dob": "1991-06-08", | |
| "gender": "F", | |
| "phone": "480-555-3978", | |
| "email": "alexandra.petrov@gmail.com", | |
| "address_line": "7100 E Camelback Rd", | |
| "city": "Scottsdale", | |
| "state": "AZ", | |
| "zip": "85251", | |
| "insurance_provider": "UnitedHealth", | |
| "insurance_id": "UH-9917453", | |
| "last_visit_date": "2024-02-20", | |
| }, | |
| # --- GENDER-NEUTRAL NAME TRAP: Casey (F) --- | |
| { | |
| "_entity_id": "PAT039", | |
| "patient_id": 39, | |
| "first_name": "Casey", | |
| "last_name": "Morgan", | |
| "dob": "2000-01-30", | |
| "gender": "F", | |
| "phone": "720-555-4089", | |
| "email": "casey.morgan00@outlook.com", | |
| "address_line": "1600 Stout St", | |
| "city": "Denver", | |
| "state": "CO", | |
| "zip": "80202", | |
| "insurance_provider": "Medicare", | |
| "insurance_id": "MC-7724316", | |
| "last_visit_date": "2024-08-10", | |
| }, | |
| # --- DUPLICATE CLUSTER: Patricia -> typo variants (PAT040) --- | |
| { | |
| "_entity_id": "PAT040", | |
| "patient_id": 40, | |
| "first_name": "Patricia", | |
| "last_name": "Hernandez", | |
| "dob": "1969-12-04", | |
| "gender": "F", | |
| "phone": "520-555-4190", | |
| "email": "patricia.hernandez@gmail.com", | |
| "address_line": "150 N Stone Ave", | |
| "city": "Tucson", | |
| "state": "AZ", | |
| "zip": "85701", | |
| "insurance_provider": "Aetna", | |
| "insurance_id": "AE-6641258", | |
| "last_visit_date": "2024-01-17", | |
| }, | |
| { | |
| "_entity_id": "PAT041", | |
| "patient_id": 41, | |
| "first_name": "Gregory", | |
| "last_name": "Adams", | |
| "dob": "1957-03-19", | |
| "gender": "M", | |
| "phone": "860-555-4201", | |
| "email": "gregory.adams57@aol.com", | |
| "address_line": "250 Constitution Plaza", | |
| "city": "Hartford", | |
| "state": "CT", | |
| "zip": "06103", | |
| "insurance_provider": "Medicare", | |
| "insurance_id": "MC-3392841", | |
| "last_visit_date": "2024-07-08", | |
| }, | |
| { | |
| "_entity_id": "PAT042", | |
| "patient_id": 42, | |
| "first_name": "Samantha", | |
| "last_name": "Rivera", | |
| "dob": "1997-08-22", | |
| "gender": "F", | |
| "phone": "505-555-4312", | |
| "email": "samantha.rivera97@gmail.com", | |
| "address_line": "400 Central Ave SW", | |
| "city": "Albuquerque", | |
| "state": "NM", | |
| "zip": "87102", | |
| "insurance_provider": "Cigna", | |
| "insurance_id": "CI-8812347", | |
| "last_visit_date": "2023-12-29", | |
| }, | |
| # --- GENDER-NEUTRAL NAME TRAP: Dana (M) --- | |
| { | |
| "_entity_id": "PAT043", | |
| "patient_id": 43, | |
| "first_name": "Dana", | |
| "last_name": "Crawford", | |
| "dob": "1965-11-28", | |
| "gender": "M", | |
| "phone": "901-555-4423", | |
| "email": "dana.crawford65@hotmail.com", | |
| "address_line": "203 Beale St", | |
| "city": "Memphis", | |
| "state": "TN", | |
| "zip": "38103", | |
| "insurance_provider": "Blue Cross", | |
| "insurance_id": "BC-1148273", | |
| "last_visit_date": "2024-04-15", | |
| }, | |
| # --- FALSE POSITIVE PAIR 5: Two "James Lee" (PAT044 & PAT045) --- | |
| { | |
| "_entity_id": "PAT044", | |
| "patient_id": 44, | |
| "first_name": "James", | |
| "last_name": "Lee", | |
| "dob": "1981-05-09", | |
| "gender": "M", | |
| "phone": "510-555-4534", | |
| "email": "james.lee81@gmail.com", | |
| "address_line": "1901 Harrison St", | |
| "city": "Oakland", | |
| "state": "CA", | |
| "zip": "94612", | |
| "insurance_provider": "Aetna", | |
| "insurance_id": "AE-5523918", | |
| "last_visit_date": "2024-06-22", | |
| }, | |
| { | |
| "_entity_id": "PAT045", | |
| "patient_id": 45, | |
| "first_name": "James", | |
| "last_name": "Lee", | |
| "dob": "1982-10-31", | |
| "gender": "M", | |
| "phone": "408-555-4645", | |
| "email": "jlee82@yahoo.com", | |
| "address_line": "225 W Santa Clara St", | |
| "city": "San Jose", | |
| "state": "CA", | |
| "zip": "95113", | |
| "insurance_provider": "Aetna", | |
| "insurance_id": "AE-7739482", | |
| "last_visit_date": "2023-09-14", | |
| }, | |
| { | |
| "_entity_id": "PAT046", | |
| "patient_id": 46, | |
| "first_name": "Theresa", | |
| "last_name": "Nguyen", | |
| "dob": "1973-07-14", | |
| "gender": "F", | |
| "phone": "832-555-4756", | |
| "email": "theresa.nguyen73@gmail.com", | |
| "address_line": "2100 Travis St", | |
| "city": "Houston", | |
| "state": "TX", | |
| "zip": "77002", | |
| "insurance_provider": "UnitedHealth", | |
| "insurance_id": "UH-2248637", | |
| "last_visit_date": "2024-05-07", | |
| }, | |
| # --- GENDER-NEUTRAL NAME TRAP: Robin (F) --- | |
| { | |
| "_entity_id": "PAT047", | |
| "patient_id": 47, | |
| "first_name": "Robin", | |
| "last_name": "Blackwell", | |
| "dob": "1980-05-16", | |
| "gender": "F", | |
| "phone": "336-555-4867", | |
| "email": "robin.blackwell@outlook.com", | |
| "address_line": "300 N Greene St", | |
| "city": "Greensboro", | |
| "state": "NC", | |
| "zip": "27401", | |
| "insurance_provider": "Medicare", | |
| "insurance_id": "MC-4458923", | |
| "last_visit_date": "2024-03-11", | |
| }, | |
| # --- DUPLICATE CLUSTER: Catherine -> spelling variants (PAT048) --- | |
| # NOTE: PAT006 is Katherine Patel (different person!). Agent must NOT | |
| # merge PAT048's duplicates with PAT006. | |
| { | |
| "_entity_id": "PAT048", | |
| "patient_id": 48, | |
| "first_name": "Catherine", | |
| "last_name": "Brooks", | |
| "dob": "1987-09-03", | |
| "gender": "F", | |
| "phone": "614-555-4978", | |
| "email": "catherine.brooks@gmail.com", | |
| "address_line": "100 E Broad St", | |
| "city": "Columbus", | |
| "state": "OH", | |
| "zip": "43215", | |
| "insurance_provider": "Blue Cross", | |
| "insurance_id": "BC-6693147", | |
| "last_visit_date": "2024-07-25", | |
| }, | |
| { | |
| "_entity_id": "PAT049", | |
| "patient_id": 49, | |
| "first_name": "Raymond", | |
| "last_name": "Foster", | |
| "dob": "1950-02-11", | |
| "gender": "M", | |
| "phone": "502-555-5089", | |
| "email": "raymond.foster50@aol.com", | |
| "address_line": "700 W Main St", | |
| "city": "Louisville", | |
| "state": "KY", | |
| "zip": "40202", | |
| "insurance_provider": "Medicare", | |
| "insurance_id": "MC-8817294", | |
| "last_visit_date": "2024-02-19", | |
| }, | |
| { | |
| "_entity_id": "PAT050", | |
| "patient_id": 50, | |
| "first_name": "Heather", | |
| "last_name": "Sanchez", | |
| "dob": "2004-06-21", | |
| "gender": "F", | |
| "phone": "515-555-5190", | |
| "email": "heather.sanchez04@gmail.com", | |
| "address_line": "1000 Walnut St", | |
| "city": "Des Moines", | |
| "state": "IA", | |
| "zip": "50309", | |
| "insurance_provider": "Cigna", | |
| "insurance_id": "CI-3347291", | |
| "last_visit_date": "2023-10-05", | |
| }, | |
| ] | |
# ----------------------------------------------------------------------------
# Schema for the hard task (patient records).
#
# Four sections:
#   * "primary_key"       - column that identifies a row.
#   * "expected_types"    - one type tag per column.
#   * "constraints"       - per-column rules (format / allowed_values / min / max).
#   * "cross_field_rules" - lookup tables relating two columns of the same row.
#
# NOTE(review): the ground-truth rows also carry an "_entity_id" key that is
# deliberately not listed here - presumably internal bookkeeping rather than a
# data column; confirm against the consumer of this schema.
# ----------------------------------------------------------------------------
_HARD_SCHEMA: dict[str, Any] = {
    "primary_key": "patient_id",
    # Type tag per column. "date", "phone" and "email" are not Python types;
    # how each tag is checked is decided by the code that reads this schema
    # (not visible in this part of the file).
    "expected_types": {
        "patient_id": "int",
        "first_name": "str",
        "last_name": "str",
        "dob": "date",
        "gender": "str",
        "phone": "phone",
        "email": "email",
        "address_line": "str",
        "city": "str",
        "state": "str",
        # Zip codes stay strings so leading zeros survive (e.g. "06103").
        "zip": "str",
        "insurance_provider": "str",
        "insurance_id": "str",
        "last_visit_date": "date",
    },
    # Per-column constraints. "format" holds either a date layout tag
    # ("YYYY-MM-DD") or an anchored regular expression.
    "constraints": {
        # min/max are ISO date strings; whether the bounds are inclusive is up
        # to the consumer - TODO confirm.
        "dob": {"format": "YYYY-MM-DD", "min": "1940-01-01", "max": "2005-12-31"},
        "gender": {"allowed_values": ["M", "F"]},
        # Canonical phone layout: NNN-NNN-NNNN.
        "phone": {"format": r"^\d{3}-\d{3}-\d{4}$"},
        # Accepts a single-label domain plus TLD only (e.g. "name@gmail.com").
        "email": {"format": r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z]{2,}$"},
        # The 50 two-letter state codes, upper case; DC and territories are
        # not included.
        "state": {
            "allowed_values": [
                "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
                "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
                "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
                "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
                "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY",
            ],
        },
        # Exactly five digits.
        "zip": {"format": r"^\d{5}$"},
        "insurance_provider": {
            "allowed_values": ["Blue Cross", "UnitedHealth", "Aetna", "Cigna", "Medicare"],
        },
        # Two-letter prefix, dash, seven digits. The accepted prefixes are the
        # values of cross_field_rules["insurance_prefix_map"] below.
        "insurance_id": {
            "format": r"^(BC|UH|AE|CI|MC)-\d{7}$",
        },
        "last_visit_date": {"format": "YYYY-MM-DD", "min": "2023-01-01", "max": "2024-12-31"},
    },
    "cross_field_rules": {
        # zip -> city the zip belongs to; the reference for "zip-city
        # mismatch" corruptions.
        "zip_city_map": {
            "60601": "Chicago",
            "60602": "Chicago",
            "60603": "Chicago",
            "60625": "Chicago",
            "98101": "Seattle",
            "02101": "Boston",
            "97201": "Portland",
            "80201": "Denver",
            "80202": "Denver",
            "30301": "Atlanta",
            "30305": "Atlanta",
            "90210": "Beverly Hills",
            "33101": "Miami",
            "73301": "Austin",
            "10001": "New York",
            "94105": "San Francisco",
            "77027": "Houston",
            "77002": "Houston",
            "85012": "Phoenix",
            "37201": "Nashville",
            "89109": "Las Vegas",
            "75201": "Dallas",
            "55402": "Minneapolis",
            "28202": "Charlotte",
            "63101": "Saint Louis",
            "32801": "Orlando",
            "15219": "Pittsburgh",
            "92101": "San Diego",
            "78205": "San Antonio",
            "46204": "Indianapolis",
            "44114": "Cleveland",
            "70112": "New Orleans",
            "64105": "Kansas City",
            "53203": "Milwaukee",
            "22102": "Tysons",
            "27601": "Raleigh",
            "98402": "Tacoma",
            "98004": "Bellevue",
            "33607": "Tampa",
            "85251": "Scottsdale",
            "85701": "Tucson",
            "06103": "Hartford",
            "87102": "Albuquerque",
            "38103": "Memphis",
            "94612": "Oakland",
            "95113": "San Jose",
            "27401": "Greensboro",
            "43215": "Columbus",
            "40202": "Louisville",
            "50309": "Des Moines",
        },
        # insurance_provider -> prefix its insurance_id must start with; the
        # reference for "insurance ID prefix mismatch" corruptions.
        "insurance_prefix_map": {
            "Blue Cross": "BC",
            "UnitedHealth": "UH",
            "Aetna": "AE",
            "Cigna": "CI",
            "Medicare": "MC",
        },
    },
}
| _HARD_CORRUPTIONS: list[dict[str, Any]] = [ | |
| # ========================================================================= | |
| # ORIGINAL 6 duplicate clusters (nickname variants) creating ~10 extra rows | |
| # Duplicate patient_ids start at 51 (PAT031-PAT050 are real patients now) | |
| # ========================================================================= | |
| { | |
| "type": "duplicate_cluster", | |
| "source_indices": [0], | |
| "cluster_sizes": [2], | |
| "noise_fields": ["first_name", "phone", "address_line"], | |
| "description": "William Thompson -> 'Bill Thompson' and 'Wm Thompson'", | |
| "source_entity_id": "PAT001", | |
| "source_patient_id": 1, | |
| "duplicates": [ | |
| { | |
| "new_patient_id": 51, | |
| "changes": { | |
| "first_name": "Bill", | |
| "email": "bill.thompson@gmail.com", | |
| "phone": "312-555-0148", | |
| "address_line": "742 N Michigan Ave", | |
| "last_visit_date": "2024-01-16", | |
| }, | |
| }, | |
| { | |
| "new_patient_id": 52, | |
| "changes": { | |
| "first_name": "Wm", | |
| "email": "wm.thompson@yahoo.com", | |
| "phone": "(312) 555-0147", | |
| "insurance_id": "BC-4471583", | |
| "zip": "60602", | |
| }, | |
| }, | |
| ], | |
| }, | |
| { | |
| "type": "duplicate_cluster", | |
| "source_indices": [1], | |
| "cluster_sizes": [1], | |
| "noise_fields": ["first_name", "phone", "address_line"], | |
| "description": "Robert Martinez -> 'Bob Martinez'", | |
| "source_entity_id": "PAT002", | |
| "source_patient_id": 2, | |
| "duplicates": [ | |
| { | |
| "new_patient_id": 53, | |
| "changes": { | |
| "first_name": "Bob", | |
| "email": "bob.martinez@outlook.com", | |
| "address_line": "1501 Pike Pl", | |
| "last_visit_date": "2024-03-09", | |
| "insurance_id": "UH-8823146", | |
| }, | |
| }, | |
| ], | |
| }, | |
| { | |
| "type": "duplicate_cluster", | |
| "source_indices": [2], | |
| "cluster_sizes": [2], | |
| "noise_fields": ["first_name", "phone", "address_line"], | |
| "description": "Elizabeth O'Brien -> 'Liz OBrien' and 'Beth O Brien'", | |
| "source_entity_id": "PAT003", | |
| "source_patient_id": 3, | |
| "duplicates": [ | |
| { | |
| "new_patient_id": 54, | |
| "changes": { | |
| "first_name": "Liz", | |
| "last_name": "OBrien", | |
| "email": "liz.obrien@yahoo.com", | |
| "phone": "617-555-0382", | |
| "address_line": "88 Beacon St", | |
| }, | |
| }, | |
| { | |
| "new_patient_id": 55, | |
| "changes": { | |
| "first_name": "Beth", | |
| "last_name": "O Brien", | |
| "email": "beth.obrien@gmail.com", | |
| "dob": "1982-11-05", | |
| "insurance_id": "AE-5539274", | |
| }, | |
| }, | |
| ], | |
| }, | |
| { | |
| "type": "duplicate_cluster", | |
| "source_indices": [3], | |
| "cluster_sizes": [1], | |
| "noise_fields": ["first_name", "phone", "address_line"], | |
| "description": "Jennifer Nguyen -> 'Jen Nguyen'", | |
| "source_entity_id": "PAT004", | |
| "source_patient_id": 4, | |
| "duplicates": [ | |
| { | |
| "new_patient_id": 56, | |
| "changes": { | |
| "first_name": "Jen", | |
| "email": "jen.nguyen@gmail.com", | |
| "phone": "503-555-0463", | |
| "address_line": "2200 NW Burnside Road", | |
| "last_visit_date": "2024-06-12", | |
| }, | |
| }, | |
| ], | |
| }, | |
| { | |
| "type": "duplicate_cluster", | |
| "source_indices": [4], | |
| "cluster_sizes": [1], | |
| "noise_fields": ["first_name", "phone", "address_line"], | |
| "description": "James Kowalski -> 'Jim Kowalski'", | |
| "source_entity_id": "PAT005", | |
| "source_patient_id": 5, | |
| "duplicates": [ | |
| { | |
| "new_patient_id": 57, | |
| "changes": { | |
| "first_name": "Jim", | |
| "email": "jim.kowalski@aol.com", | |
| "address_line": "1600 Colfax Avenue", | |
| "phone": "3035550518", | |
| "insurance_id": "MC-6642018", | |
| }, | |
| }, | |
| ], | |
| }, | |
| { | |
| "type": "duplicate_cluster", | |
| "source_indices": [5], | |
| "cluster_sizes": [2], | |
| "noise_fields": ["first_name", "phone", "address_line"], | |
| "description": "Katherine Patel -> 'Kate Patel' and 'Kathy Patel'", | |
| "source_entity_id": "PAT006", | |
| "source_patient_id": 6, | |
| "duplicates": [ | |
| { | |
| "new_patient_id": 58, | |
| "changes": { | |
| "first_name": "Kate", | |
| "email": "kate.patel@hotmail.com", | |
| "phone": "404-555-0638", | |
| "city": "atlanta", | |
| "state": "ga", | |
| }, | |
| }, | |
| { | |
| "new_patient_id": 59, | |
| "changes": { | |
| "first_name": "Kathy", | |
| "email": "kathy.patel@gmail.com", | |
| "address_line": "350 Peachtree Street NE", | |
| "dob": "1988-01-27", | |
| "insurance_id": "BC-9918453", | |
| }, | |
| }, | |
| ], | |
| }, | |
| # ========================================================================= | |
| # NEW 4 duplicate clusters (TYPO-based, much harder than nicknames) | |
| # ========================================================================= | |
| { | |
| "type": "duplicate_cluster", | |
| "source_indices": [33], | |
| "cluster_sizes": [2], | |
| "noise_fields": ["first_name", "address_line"], | |
| "description": "Christopher Reeves -> 'Christpher Reeves' (dropped 'o') and 'Chistopher Reeves' (dropped 'r')", | |
| "source_entity_id": "PAT034", | |
| "source_patient_id": 34, | |
| "duplicates": [ | |
| { | |
| "new_patient_id": 60, | |
| "changes": { | |
| "first_name": "Christpher", | |
| "email": "christpher.reeves@gmail.com", | |
| "phone": "813-555-3535", | |
| "address_line": "1400 N Dale Mabry", | |
| }, | |
| }, | |
| { | |
| "new_patient_id": 61, | |
| "changes": { | |
| "first_name": "Chistopher", | |
| "email": "chistopher.reeves@yahoo.com", | |
| "address_line": "1400 North Dale Mabry Hwy", | |
| "last_visit_date": "2024-06-06", | |
| }, | |
| }, | |
| ], | |
| }, | |
| { | |
| "type": "duplicate_cluster", | |
| "source_indices": [37], | |
| "cluster_sizes": [1], | |
| "noise_fields": ["first_name"], | |
| "description": "Alexandra Petrov -> 'Alessandra Petrov' (common misspelling/Italian variant)", | |
| "source_entity_id": "PAT038", | |
| "source_patient_id": 38, | |
| "duplicates": [ | |
| { | |
| "new_patient_id": 62, | |
| "changes": { | |
| "first_name": "Alessandra", | |
| "email": "alessandra.petrov@gmail.com", | |
| "phone": "480-555-3979", | |
| "address_line": "7100 E Camelback Road", | |
| }, | |
| }, | |
| ], | |
| }, | |
| { | |
| "type": "duplicate_cluster", | |
| "source_indices": [39], | |
| "cluster_sizes": [2], | |
| "noise_fields": ["first_name", "address_line"], | |
| "description": "Patricia Hernandez -> 'Patricla Hernandez' (typo i->l) and 'Patrica Hernandez' (dropped 'i')", | |
| "source_entity_id": "PAT040", | |
| "source_patient_id": 40, | |
| "duplicates": [ | |
| { | |
| "new_patient_id": 63, | |
| "changes": { | |
| "first_name": "Patricla", | |
| "email": "patricla.hernandez@gmail.com", | |
| "phone": "520-555-4191", | |
| "address_line": "150 North Stone Ave", | |
| }, | |
| }, | |
| { | |
| "new_patient_id": 64, | |
| "changes": { | |
| "first_name": "Patrica", | |
| "email": "patrica.hernandez@yahoo.com", | |
| "address_line": "150 N Stone Avenue", | |
| "last_visit_date": "2024-01-18", | |
| }, | |
| }, | |
| ], | |
| }, | |
| { | |
| "type": "duplicate_cluster", | |
| "source_indices": [47], | |
| "cluster_sizes": [2], | |
| "noise_fields": ["first_name", "address_line"], | |
| "description": ( | |
| "Catherine Brooks -> 'Katherine Brooks' (C->K variant) and 'Catharine Brooks' (e->a variant). " | |
| "TRAP: PAT006 is Katherine Patel - agent must NOT merge these with PAT006!" | |
| ), | |
| "source_entity_id": "PAT048", | |
| "source_patient_id": 48, | |
| "duplicates": [ | |
| { | |
| "new_patient_id": 65, | |
| "changes": { | |
| "first_name": "Katherine", | |
| "email": "katherine.brooks@gmail.com", | |
| "phone": "614-555-4979", | |
| "address_line": "100 East Broad St", | |
| }, | |
| }, | |
| { | |
| "new_patient_id": 66, | |
| "changes": { | |
| "first_name": "Catharine", | |
| "email": "catharine.brooks@yahoo.com", | |
| "address_line": "100 E Broad Street", | |
| "last_visit_date": "2024-07-26", | |
| }, | |
| }, | |
| ], | |
| }, | |
| # ========================================================================= | |
| # ORIGINAL cross_field_corrupt: zip-city mismatches | |
| # ========================================================================= | |
| { | |
| "type": "cross_field_corrupt", | |
| "row_indices": [17], | |
| "description": "Zip-city mismatch: Charlotte patient 18 given Raleigh zip 27601", | |
| "target_entity_id": "PAT018", | |
| "target_patient_id": 18, | |
| "field": "zip", | |
| "original": "28202", | |
| "corrupted": "27601", | |
| }, | |
| { | |
| "type": "cross_field_corrupt", | |
| "row_indices": [23], | |
| "description": "Zip-city mismatch: Indianapolis patient 24 given Chicago zip 60601", | |
| "target_entity_id": "PAT024", | |
| "target_patient_id": 24, | |
| "field": "zip", | |
| "original": "46204", | |
| "corrupted": "60601", | |
| }, | |
| # --- NEW zip-city mismatches for new patients --- | |
| { | |
| "type": "cross_field_corrupt", | |
| "row_indices": [30], | |
| "description": "Zip-city mismatch: Tacoma patient 31 given Seattle zip 98101", | |
| "target_entity_id": "PAT031", | |
| "target_patient_id": 31, | |
| "field": "zip", | |
| "original": "98402", | |
| "corrupted": "98101", | |
| }, | |
| { | |
| "type": "cross_field_corrupt", | |
| "row_indices": [41], | |
| "description": "Zip-city mismatch: Albuquerque patient 42 given Tucson zip 85701", | |
| "target_entity_id": "PAT042", | |
| "target_patient_id": 42, | |
| "field": "zip", | |
| "original": "87102", | |
| "corrupted": "85701", | |
| }, | |
| { | |
| "type": "cross_field_corrupt", | |
| "row_indices": [48], | |
| "description": "Zip-city mismatch: Louisville patient 49 given Columbus zip 43215", | |
| "target_entity_id": "PAT049", | |
| "target_patient_id": 49, | |
| "field": "zip", | |
| "original": "40202", | |
| "corrupted": "43215", | |
| }, | |
| { | |
| "type": "cross_field_corrupt", | |
| "row_indices": [45], | |
| "description": "Zip-city mismatch: Houston patient 46 given Dallas zip 75201", | |
| "target_entity_id": "PAT046", | |
| "target_patient_id": 46, | |
| "field": "zip", | |
| "original": "77002", | |
| "corrupted": "75201", | |
| }, | |
| # ========================================================================= | |
| # ORIGINAL cross_field_corrupt: insurance ID prefix mismatches | |
| # ========================================================================= | |
| { | |
| "type": "cross_field_corrupt", | |
| "row_indices": [20], | |
| "description": "Insurance ID prefix mismatch: Cigna patient 21 given BC prefix", | |
| "target_entity_id": "PAT021", | |
| "target_patient_id": 21, | |
| "field": "insurance_id", | |
| "original": "CI-5582719", | |
| "corrupted": "BC-5582719", | |
| }, | |
| { | |
| "type": "cross_field_corrupt", | |
| "row_indices": [28], | |
| "description": "Insurance ID prefix mismatch: Blue Cross patient 29 given AE prefix", | |
| "target_entity_id": "PAT029", | |
| "target_patient_id": 29, | |
| "field": "insurance_id", | |
| "original": "BC-2215847", | |
| "corrupted": "AE-2215847", | |
| }, | |
| # --- NEW insurance prefix mismatches --- | |
| { | |
| "type": "cross_field_corrupt", | |
| "row_indices": [40], | |
| "description": "Insurance ID prefix mismatch: Medicare patient 41 given UH prefix", | |
| "target_entity_id": "PAT041", | |
| "target_patient_id": 41, | |
| "field": "insurance_id", | |
| "original": "MC-3392841", | |
| "corrupted": "UH-3392841", | |
| }, | |
| { | |
| "type": "cross_field_corrupt", | |
| "row_indices": [34], | |
| "description": "Insurance ID prefix mismatch: Aetna patient 35 given CI prefix", | |
| "target_entity_id": "PAT035", | |
| "target_patient_id": 35, | |
| "field": "insurance_id", | |
| "original": "AE-3318724", | |
| "corrupted": "CI-3318724", | |
| }, | |
| { | |
| "type": "cross_field_corrupt", | |
| "row_indices": [49], | |
| "description": "Insurance ID prefix mismatch: Cigna patient 50 given MC prefix", | |
| "target_entity_id": "PAT050", | |
| "target_patient_id": 50, | |
| "field": "insurance_id", | |
| "original": "CI-3347291", | |
| "corrupted": "MC-3347291", | |
| }, | |
| # ========================================================================= | |
| # ORIGINAL cross_field_corrupt: gender format | |
| # ========================================================================= | |
| { | |
| "type": "cross_field_corrupt", | |
| "row_indices": [11], | |
| "description": "Gender format corruption: patient 12 'F' -> 'Female'", | |
| "target_entity_id": "PAT012", | |
| "target_patient_id": 12, | |
| "field": "gender", | |
| "original": "F", | |
| "corrupted": "Female", | |
| }, | |
| # ========================================================================= | |
| # ORIGINAL impossible_date: date format corruptions | |
| # ========================================================================= | |
| { | |
| "type": "impossible_date", | |
| "targets": [{"row_idx": 11, "field": "dob", "corrupt_type": "format"}], | |
| "description": "DOB reformatted to MM/DD/YYYY for patient 12", | |
| "target_entity_id": "PAT012", | |
| "target_patient_id": 12, | |
| "field": "dob", | |
| "original": "1993-02-28", | |
| "corrupted": "02/28/1993", | |
| }, | |
| { | |
| "type": "impossible_date", | |
| "targets": [{"row_idx": 15, "field": "last_visit_date", "corrupt_type": "format"}], | |
| "description": "Last visit date reformatted to MM-DD-YYYY for patient 16", | |
| "target_entity_id": "PAT016", | |
| "target_patient_id": 16, | |
| "field": "last_visit_date", | |
| "original": "2024-08-01", | |
| "corrupted": "08-01-2024", | |
| }, | |
| { | |
| "type": "impossible_date", | |
| "targets": [{"row_idx": 22, "field": "dob", "corrupt_type": "format"}], | |
| "description": "DOB reformatted to 'Dec 1, 1955' for patient 23", | |
| "target_entity_id": "PAT023", | |
| "target_patient_id": 23, | |
| "field": "dob", | |
| "original": "1955-12-01", | |
| "corrupted": "Dec 1, 1955", | |
| }, | |
| { | |
| "type": "impossible_date", | |
| "targets": [{"row_idx": 27, "field": "last_visit_date", "corrupt_type": "format"}], | |
| "description": "Last visit date reformatted to M/D/YYYY for patient 28", | |
| "target_entity_id": "PAT028", | |
| "target_patient_id": 28, | |
| "field": "last_visit_date", | |
| "original": "2024-02-07", | |
| "corrupted": "2/7/2024", | |
| }, | |
| # --- NEW date format corruptions for new patients --- | |
| { | |
| "type": "impossible_date", | |
| "targets": [{"row_idx": 33, "field": "dob", "corrupt_type": "format"}], | |
| "description": "DOB reformatted to DD/MM/YYYY for patient 34", | |
| "target_entity_id": "PAT034", | |
| "target_patient_id": 34, | |
| "field": "dob", | |
| "original": "1976-08-25", | |
| "corrupted": "25/08/1976", | |
| }, | |
| { | |
| "type": "impossible_date", | |
| "targets": [{"row_idx": 37, "field": "last_visit_date", "corrupt_type": "format"}], | |
| "description": "Last visit date reformatted to 'Feb 20, 2024' for patient 38", | |
| "target_entity_id": "PAT038", | |
| "target_patient_id": 38, | |
| "field": "last_visit_date", | |
| "original": "2024-02-20", | |
| "corrupted": "Feb 20, 2024", | |
| }, | |
| { | |
| "type": "impossible_date", | |
| "targets": [{"row_idx": 42, "field": "dob", "corrupt_type": "format"}], | |
| "description": "DOB reformatted to MM-DD-YYYY for patient 43", | |
| "target_entity_id": "PAT043", | |
| "target_patient_id": 43, | |
| "field": "dob", | |
| "original": "1965-11-28", | |
| "corrupted": "11-28-1965", | |
| }, | |
| { | |
| "type": "impossible_date", | |
| "targets": [{"row_idx": 46, "field": "last_visit_date", "corrupt_type": "format"}], | |
| "description": "Last visit date reformatted to D/M/YYYY for patient 47", | |
| "target_entity_id": "PAT047", | |
| "target_patient_id": 47, | |
| "field": "last_visit_date", | |
| "original": "2024-03-11", | |
| "corrupted": "11/3/2024", | |
| }, | |
| { | |
| "type": "impossible_date", | |
| "targets": [{"row_idx": 49, "field": "dob", "corrupt_type": "format"}], | |
| "description": "DOB reformatted to 'Jun 21, 2004' for patient 50", | |
| "target_entity_id": "PAT050", | |
| "target_patient_id": 50, | |
| "field": "dob", | |
| "original": "2004-06-21", | |
| "corrupted": "Jun 21, 2004", | |
| }, | |
| # --- NEW subtle DOB off-by-one corruptions (very hard to detect) --- | |
| { | |
| "type": "impossible_date", | |
| "targets": [{"row_idx": 31, "field": "dob", "corrupt_type": "off_by_one"}], | |
| "description": "DOB day off by 1: patient 32 '1988-07-20' -> '1988-07-21'", | |
| "target_entity_id": "PAT032", | |
| "target_patient_id": 32, | |
| "field": "dob", | |
| "original": "1988-07-20", | |
| "corrupted": "1988-07-21", | |
| }, | |
| { | |
| "type": "impossible_date", | |
| "targets": [{"row_idx": 43, "field": "dob", "corrupt_type": "off_by_one"}], | |
| "description": "DOB day off by 1: patient 44 '1981-05-09' -> '1981-05-10'", | |
| "target_entity_id": "PAT044", | |
| "target_patient_id": 44, | |
| "field": "dob", | |
| "original": "1981-05-09", | |
| "corrupted": "1981-05-10", | |
| }, | |
| # ========================================================================= | |
| # ORIGINAL insurance_id_mismatch: missing/corrupted + gender mismatches | |
| # ========================================================================= | |
| { | |
| "type": "insurance_id_mismatch", | |
| "row_indices": [26], | |
| "description": "Patient 27 insurance_id set to empty string", | |
| "target_entity_id": "PAT027", | |
| "target_patient_id": 27, | |
| "field": "insurance_id", | |
| "original": "CI-4431957", | |
| "corrupted": "", | |
| }, | |
| { | |
| "type": "insurance_id_mismatch", | |
| "row_indices": [15], | |
| "description": "Patient 16 gender 'M' -> 'male' (format mismatch)", | |
| "target_entity_id": "PAT016", | |
| "target_patient_id": 16, | |
| "field": "gender", | |
| "original": "M", | |
| "corrupted": "male", | |
| }, | |
| { | |
| "type": "insurance_id_mismatch", | |
| "row_indices": [22], | |
| "description": "Patient 23 gender 'M' -> 'm' (case mismatch)", | |
| "target_entity_id": "PAT023", | |
| "target_patient_id": 23, | |
| "field": "gender", | |
| "original": "M", | |
| "corrupted": "m", | |
| }, | |
| # ========================================================================= | |
| # ORIGINAL null_inject_contextual: missing values and whitespace | |
| # ========================================================================= | |
| { | |
| "type": "null_inject_contextual", | |
| "targets": [{"row_idx": 13, "field": "email"}], | |
| "description": "Email set to None for patient 14 (Ashley Richardson)", | |
| "target_entity_id": "PAT014", | |
| "target_patient_id": 14, | |
| "field": "email", | |
| "original": "ashley.richardson@gmail.com", | |
| "corrupted": None, | |
| }, | |
| { | |
| "type": "null_inject_contextual", | |
| "targets": [{"row_idx": 21, "field": "phone"}], | |
| "description": "Phone set to None for patient 22 (Jordan Mitchell)", | |
| "target_entity_id": "PAT022", | |
| "target_patient_id": 22, | |
| "field": "phone", | |
| "original": "619-555-2246", | |
| "corrupted": None, | |
| }, | |
| { | |
| "type": "null_inject_contextual", | |
| "targets": [{"row_idx": 29, "field": "address_line"}], | |
| "description": "Address set to None for patient 30 (Laura Scott)", | |
| "target_entity_id": "PAT030", | |
| "target_patient_id": 30, | |
| "field": "address_line", | |
| "original": "400 Fayetteville St", | |
| "corrupted": None, | |
| }, | |
| # --- NEW null injections for new patients --- | |
| { | |
| "type": "null_inject_contextual", | |
| "targets": [{"row_idx": 40, "field": "phone"}], | |
| "description": "Phone set to None for patient 41 (Gregory Adams)", | |
| "target_entity_id": "PAT041", | |
| "target_patient_id": 41, | |
| "field": "phone", | |
| "original": "860-555-4201", | |
| "corrupted": None, | |
| }, | |
| { | |
| "type": "null_inject_contextual", | |
| "targets": [{"row_idx": 47, "field": "email"}], | |
| "description": "Email set to None for patient 48 (Catherine Brooks)", | |
| "target_entity_id": "PAT048", | |
| "target_patient_id": 48, | |
| "field": "email", | |
| "original": "catherine.brooks@gmail.com", | |
| "corrupted": None, | |
| }, | |
| { | |
| "type": "null_inject_contextual", | |
| "targets": [{"row_idx": 38, "field": "insurance_id"}], | |
| "description": "Insurance ID set to None for patient 39 (Casey Morgan)", | |
| "target_entity_id": "PAT039", | |
| "target_patient_id": 39, | |
| "field": "insurance_id", | |
| "original": "MC-7724316", | |
| "corrupted": None, | |
| }, | |
| # --- ORIGINAL whitespace corruptions --- | |
| { | |
| "type": "null_inject_contextual", | |
| "targets": [{"row_idx": 10, "field": "first_name"}], | |
| "description": "First name padded with spaces for patient 11 (David Chen)", | |
| "target_entity_id": "PAT011", | |
| "target_patient_id": 11, | |
| "field": "first_name", | |
| "original": "David", | |
| "corrupted": " David ", | |
| }, | |
| { | |
| "type": "null_inject_contextual", | |
| "targets": [{"row_idx": 14, "field": "email"}], | |
| "description": "Email with extra space for patient 15 (Patricia Lee)", | |
| "target_entity_id": "PAT015", | |
| "target_patient_id": 15, | |
| "field": "email", | |
| "original": "patricia.lee@outlook.com", | |
| "corrupted": "patricia.lee @outlook.com", | |
| }, | |
| # --- NEW whitespace corruptions --- | |
| { | |
| "type": "null_inject_contextual", | |
| "targets": [{"row_idx": 44, "field": "last_name"}], | |
| "description": "Last name with trailing space for patient 45 (James Lee)", | |
| "target_entity_id": "PAT045", | |
| "target_patient_id": 45, | |
| "field": "last_name", | |
| "original": "Lee", | |
| "corrupted": "Lee ", | |
| }, | |
| { | |
| "type": "null_inject_contextual", | |
| "targets": [{"row_idx": 39, "field": "first_name"}], | |
| "description": "First name with leading tab for patient 40 (Patricia Hernandez)", | |
| "target_entity_id": "PAT040", | |
| "target_patient_id": 40, | |
| "field": "first_name", | |
| "original": "Patricia", | |
| "corrupted": "\tPatricia", | |
| }, | |
| { | |
| "type": "null_inject_contextual", | |
| "targets": [{"row_idx": 48, "field": "city"}], | |
| "description": "City with trailing whitespace for patient 49 (Raymond Foster)", | |
| "target_entity_id": "PAT049", | |
| "target_patient_id": 49, | |
| "field": "city", | |
| "original": "Louisville", | |
| "corrupted": "Louisville ", | |
| }, | |
| # ========================================================================= | |
| # ORIGINAL false_positive_duplicate: 2 pairs (already in ground truth) | |
| # ========================================================================= | |
| { | |
| "type": "false_positive_duplicate", | |
| "pairs": [[6, 7]], | |
| "description": ( | |
| "Two different 'Michael Davis' patients (PAT007 and PAT008) share the same " | |
| "name but have different DOB, location, insurance. Must NOT be merged." | |
| ), | |
| "entity_ids": ["PAT007", "PAT008"], | |
| "patient_ids": [7, 8], | |
| "distinguishing_fields": ["dob", "city", "state", "zip", "email", "insurance_provider", "insurance_id"], | |
| }, | |
| { | |
| "type": "false_positive_duplicate", | |
| "pairs": [[8, 9]], | |
| "description": ( | |
| "Two different 'Maria Garcia' patients (PAT009 and PAT010) share the same " | |
| "name but have different DOB, location, insurance. Must NOT be merged." | |
| ), | |
| "entity_ids": ["PAT009", "PAT010"], | |
| "patient_ids": [9, 10], | |
| "distinguishing_fields": ["dob", "city", "state", "zip", "email", "insurance_provider", "insurance_id"], | |
| }, | |
| # --- NEW false_positive_duplicate: 3 harder pairs --- | |
| { | |
| "type": "false_positive_duplicate", | |
| "pairs": [[31, 32]], | |
| "description": ( | |
| "Two different 'David Kim' patients (PAT032 and PAT033) - SAME insurance " | |
| "provider (UnitedHealth), SAME state (WA), DOBs only 2 years apart. " | |
| "Distinguishable by different insurance IDs, different cities, different DOB. " | |
| "Must NOT be merged." | |
| ), | |
| "entity_ids": ["PAT032", "PAT033"], | |
| "patient_ids": [32, 33], | |
| "distinguishing_fields": ["dob", "city", "zip", "email", "insurance_id", "phone"], | |
| }, | |
| { | |
| "type": "false_positive_duplicate", | |
| "pairs": [[35, 36]], | |
| "description": ( | |
| "Two different 'Sarah Williams' patients (PAT036 and PAT037) - SAME city " | |
| "(Chicago), SAME state (IL). DOBs 5 years apart, different insurance. " | |
| "Must NOT be merged." | |
| ), | |
| "entity_ids": ["PAT036", "PAT037"], | |
| "patient_ids": [36, 37], | |
| "distinguishing_fields": ["dob", "zip", "address_line", "email", "insurance_provider", "insurance_id", "phone"], | |
| }, | |
| { | |
| "type": "false_positive_duplicate", | |
| "pairs": [[43, 44]], | |
| "description": ( | |
| "Two different 'James Lee' patients (PAT044 and PAT045) - DOBs only 1 year " | |
| "apart, SAME state (CA), SAME insurance provider (Aetna). Distinguishable " | |
| "by different insurance IDs, different cities. Must NOT be merged." | |
| ), | |
| "entity_ids": ["PAT044", "PAT045"], | |
| "patient_ids": [44, 45], | |
| "distinguishing_fields": ["dob", "city", "zip", "email", "insurance_id", "phone"], | |
| }, | |
| # ========================================================================= | |
| # ORIGINAL address_variation: phone format variations | |
| # ========================================================================= | |
| { | |
| "type": "address_variation", | |
| "row_indices": [10], | |
| "description": "Phone format changed to (XXX) XXX-XXXX for patient 11", | |
| "target_entity_id": "PAT011", | |
| "target_patient_id": 11, | |
| "field": "phone", | |
| "original": "415-555-1134", | |
| "corrupted": "(415) 555-1134", | |
| }, | |
| { | |
| "type": "address_variation", | |
| "row_indices": [14], | |
| "description": "Phone format changed to XXX.XXX.XXXX for patient 15", | |
| "target_entity_id": "PAT015", | |
| "target_patient_id": 15, | |
| "field": "phone", | |
| "original": "702-555-1578", | |
| "corrupted": "702.555.1578", | |
| }, | |
| { | |
| "type": "address_variation", | |
| "row_indices": [19], | |
| "description": "Phone stripped of dashes for patient 20", | |
| "target_entity_id": "PAT020", | |
| "target_patient_id": 20, | |
| "field": "phone", | |
| "original": "407-555-2024", | |
| "corrupted": "4075552024", | |
| }, | |
| { | |
| "type": "address_variation", | |
| "row_indices": [24], | |
| "description": "Phone with country code prefix for patient 25", | |
| "target_entity_id": "PAT025", | |
| "target_patient_id": 25, | |
| "field": "phone", | |
| "original": "216-555-2579", | |
| "corrupted": "+1-216-555-2579", | |
| }, | |
| # --- NEW phone format variations --- | |
| { | |
| "type": "address_variation", | |
| "row_indices": [33], | |
| "description": "Phone format changed to (XXX) XXX-XXXX for patient 34", | |
| "target_entity_id": "PAT034", | |
| "target_patient_id": 34, | |
| "field": "phone", | |
| "original": "813-555-3534", | |
| "corrupted": "(813) 555-3534", | |
| }, | |
| { | |
| "type": "address_variation", | |
| "row_indices": [41], | |
| "description": "Phone format changed to XXX.XXX.XXXX for patient 42", | |
| "target_entity_id": "PAT042", | |
| "target_patient_id": 42, | |
| "field": "phone", | |
| "original": "505-555-4312", | |
| "corrupted": "505.555.4312", | |
| }, | |
| { | |
| "type": "address_variation", | |
| "row_indices": [45], | |
| "description": "Phone stripped of dashes for patient 46", | |
| "target_entity_id": "PAT046", | |
| "target_patient_id": 46, | |
| "field": "phone", | |
| "original": "832-555-4756", | |
| "corrupted": "8325554756", | |
| }, | |
| { | |
| "type": "address_variation", | |
| "row_indices": [48], | |
| "description": "Phone with country code prefix for patient 49", | |
| "target_entity_id": "PAT049", | |
| "target_patient_id": 49, | |
| "field": "phone", | |
| "original": "502-555-5089", | |
| "corrupted": "+1-502-555-5089", | |
| }, | |
| # ========================================================================= | |
| # ORIGINAL case corruptions | |
| # ========================================================================= | |
| { | |
| "type": "case_corrupt", | |
| "targets": [{"row_idx": 12, "field": "first_name"}], | |
| "description": "First name uppercased for patient 13 (Thomas -> THOMAS)", | |
| "target_entity_id": "PAT013", | |
| "target_patient_id": 13, | |
| "original": "Thomas", | |
| "corrupted": "THOMAS", | |
| }, | |
| { | |
| "type": "case_corrupt", | |
| "targets": [{"row_idx": 16, "field": "city"}], | |
| "description": "City lowercased for patient 17 (Minneapolis -> minneapolis)", | |
| "target_entity_id": "PAT017", | |
| "target_patient_id": 17, | |
| "original": "Minneapolis", | |
| "corrupted": "minneapolis", | |
| }, | |
| { | |
| "type": "case_corrupt", | |
| "targets": [{"row_idx": 18, "field": "state"}], | |
| "description": "State lowercased for patient 19 (MO -> mo)", | |
| "target_entity_id": "PAT019", | |
| "target_patient_id": 19, | |
| "original": "MO", | |
| "corrupted": "mo", | |
| }, | |
| { | |
| "type": "case_corrupt", | |
| "targets": [{"row_idx": 25, "field": "last_name"}], | |
| "description": "Last name uppercased for patient 26 (Young -> YOUNG)", | |
| "target_entity_id": "PAT026", | |
| "target_patient_id": 26, | |
| "original": "Young", | |
| "corrupted": "YOUNG", | |
| }, | |
| # --- NEW case corruptions --- | |
| { | |
| "type": "case_corrupt", | |
| "targets": [{"row_idx": 37, "field": "last_name"}], | |
| "description": "Last name lowercased for patient 38 (Petrov -> petrov)", | |
| "target_entity_id": "PAT038", | |
| "target_patient_id": 38, | |
| "original": "Petrov", | |
| "corrupted": "petrov", | |
| }, | |
| { | |
| "type": "case_corrupt", | |
| "targets": [{"row_idx": 42, "field": "first_name"}], | |
| "description": "First name uppercased for patient 43 (Dana -> DANA)", | |
| "target_entity_id": "PAT043", | |
| "target_patient_id": 43, | |
| "original": "Dana", | |
| "corrupted": "DANA", | |
| }, | |
| { | |
| "type": "case_corrupt", | |
| "targets": [{"row_idx": 47, "field": "city"}], | |
| "description": "City lowercased for patient 48 (Columbus -> columbus)", | |
| "target_entity_id": "PAT048", | |
| "target_patient_id": 48, | |
| "original": "Columbus", | |
| "corrupted": "columbus", | |
| }, | |
| { | |
| "type": "case_corrupt", | |
| "targets": [{"row_idx": 49, "field": "state"}], | |
| "description": "State lowercased for patient 50 (IA -> ia)", | |
| "target_entity_id": "PAT050", | |
| "target_patient_id": 50, | |
| "original": "IA", | |
| "corrupted": "ia", | |
| }, | |
| # ========================================================================= | |
| # ORIGINAL address whitespace corruptions | |
| # ========================================================================= | |
| { | |
| "type": "address_variation", | |
| "row_indices": [19], | |
| "description": "Extra spaces in address for patient 20 (525 S Orange Ave -> 525 S Orange Ave)", | |
| "target_entity_id": "PAT020", | |
| "target_patient_id": 20, | |
| "field": "address_line", | |
| "original": "525 S Orange Ave", | |
| "corrupted": "525 S Orange Ave", | |
| }, | |
| { | |
| "type": "address_variation", | |
| "row_indices": [25], | |
| "description": "Extra space in city for patient 26 (New Orleans -> New Orleans)", | |
| "target_entity_id": "PAT026", | |
| "target_patient_id": 26, | |
| "field": "city", | |
| "original": "New Orleans", | |
| "corrupted": "New Orleans", | |
| }, | |
| # ========================================================================= | |
| # NEW email domain typo corruptions (subtle) | |
| # ========================================================================= | |
| { | |
| "type": "address_variation", | |
| "row_indices": [34], | |
| "description": "Email domain typo for patient 35 (yahoo.com -> yaho.com)", | |
| "target_entity_id": "PAT035", | |
| "target_patient_id": 35, | |
| "field": "email", | |
| "original": "avery.simmons94@yahoo.com", | |
| "corrupted": "avery.simmons94@yaho.com", | |
| }, | |
| { | |
| "type": "address_variation", | |
| "row_indices": [41], | |
| "description": "Email domain typo for patient 42 (gmail.com -> gmial.com)", | |
| "target_entity_id": "PAT042", | |
| "target_patient_id": 42, | |
| "field": "email", | |
| "original": "samantha.rivera97@gmail.com", | |
| "corrupted": "samantha.rivera97@gmial.com", | |
| }, | |
| { | |
| "type": "address_variation", | |
| "row_indices": [48], | |
| "description": "Email domain typo for patient 49 (aol.com -> aol.cm)", | |
| "target_entity_id": "PAT049", | |
| "target_patient_id": 49, | |
| "field": "email", | |
| "original": "raymond.foster50@aol.com", | |
| "corrupted": "raymond.foster50@aol.cm", | |
| }, | |
| # ========================================================================= | |
| # NEW state full-name instead of abbreviation corruptions | |
| # ========================================================================= | |
| { | |
| "type": "cross_field_corrupt", | |
| "row_indices": [33], | |
| "description": "State full name instead of abbreviation for patient 34 (FL -> Florida)", | |
| "target_entity_id": "PAT034", | |
| "target_patient_id": 34, | |
| "field": "state", | |
| "original": "FL", | |
| "corrupted": "Florida", | |
| }, | |
| { | |
| "type": "cross_field_corrupt", | |
| "row_indices": [43], | |
| "description": "State full name instead of abbreviation for patient 44 (CA -> California)", | |
| "target_entity_id": "PAT044", | |
| "target_patient_id": 44, | |
| "field": "state", | |
| "original": "CA", | |
| "corrupted": "California", | |
| }, | |
| { | |
| "type": "cross_field_corrupt", | |
| "row_indices": [40], | |
| "description": "State full name instead of abbreviation for patient 41 (CT -> Connecticut)", | |
| "target_entity_id": "PAT041", | |
| "target_patient_id": 41, | |
| "field": "state", | |
| "original": "CT", | |
| "corrupted": "Connecticut", | |
| }, | |
| # ========================================================================= | |
| # ORIGINAL + NEW valid_unusual: gender/name traps (NOT errors) | |
| # ========================================================================= | |
| { | |
| "type": "valid_unusual", | |
| "description": "Ashley (M) - historically male name, VALID. Do NOT correct.", | |
| "entity_id": "PAT014", | |
| "patient_id": 14, | |
| "first_name": "Ashley", | |
| "gender": "M", | |
| "note": "Ashley was historically a male name; this is valid.", | |
| }, | |
| { | |
| "type": "valid_unusual", | |
| "description": "Jordan (F) - gender-neutral name, VALID. Do NOT correct.", | |
| "entity_id": "PAT022", | |
| "patient_id": 22, | |
| "first_name": "Jordan", | |
| "gender": "F", | |
| "note": "Jordan is gender-neutral; valid for female patients.", | |
| }, | |
| { | |
| "type": "valid_unusual", | |
| "description": "Shannon (M) - historically male Irish name, VALID. Do NOT correct.", | |
| "entity_id": "PAT027", | |
| "patient_id": 27, | |
| "first_name": "Shannon", | |
| "gender": "M", | |
| "note": "Shannon was historically a male Irish name; this is valid.", | |
| }, | |
| { | |
| "type": "valid_unusual", | |
| "description": "Morgan (M) - gender-neutral name, VALID. Do NOT correct.", | |
| "entity_id": "PAT031", | |
| "patient_id": 31, | |
| "first_name": "Morgan", | |
| "gender": "M", | |
| "note": "Morgan is gender-neutral; valid for male patients.", | |
| }, | |
| { | |
| "type": "valid_unusual", | |
| "description": "Avery (M) - gender-neutral name, VALID. Do NOT correct.", | |
| "entity_id": "PAT035", | |
| "patient_id": 35, | |
| "first_name": "Avery", | |
| "gender": "M", | |
| "note": "Avery is gender-neutral; valid for male patients.", | |
| }, | |
| { | |
| "type": "valid_unusual", | |
| "description": "Casey (F) - gender-neutral but often male, VALID. Do NOT correct.", | |
| "entity_id": "PAT039", | |
| "patient_id": 39, | |
| "first_name": "Casey", | |
| "gender": "F", | |
| "note": "Casey is gender-neutral; valid for female patients.", | |
| }, | |
| { | |
| "type": "valid_unusual", | |
| "description": "Dana (M) - gender-neutral name, VALID. Do NOT correct.", | |
| "entity_id": "PAT043", | |
| "patient_id": 43, | |
| "first_name": "Dana", | |
| "gender": "M", | |
| "note": "Dana is gender-neutral; valid for male patients.", | |
| }, | |
| { | |
| "type": "valid_unusual", | |
| "description": "Robin (F) - gender-neutral name, VALID. Do NOT correct.", | |
| "entity_id": "PAT047", | |
| "patient_id": 47, | |
| "first_name": "Robin", | |
| "gender": "F", | |
| "note": "Robin is gender-neutral; valid for female patients.", | |
| }, | |
| ] | |
# Downstream analytics probes for the hard patient-registry task. Each
# expected_result is the aggregate a correctly cleaned table should reproduce.
_HARD_PROBES: list[UtilityProbe] = [
    # Distinct patient_id values remaining once duplicates are merged.
    UtilityProbe(
        name="unique_patient_count",
        query_fn="unique_count",
        params={"column": "patient_id"},
        expected_result=50,
        description="Count of unique patients after deduplication",
    ),
    # Per-provider head count; the five buckets add up to the 50 patients.
    UtilityProbe(
        name="insurance_provider_distribution",
        query_fn="distribution",
        params={"column": "insurance_provider"},
        expected_result={
            "Blue Cross": 10,
            "UnitedHealth": 10,
            "Aetna": 9,
            "Cigna": 9,
            "Medicare": 12,
        },
        description="Count of patients per insurance provider",
    ),
    # Per-city head count. Every listed city starts at one patient and the
    # multi-patient cities are overridden afterwards; overriding an existing
    # key keeps its original position, so key order follows the tuple below.
    UtilityProbe(
        name="patients_per_city",
        query_fn="distribution",
        params={"column": "city"},
        expected_result={
            **dict.fromkeys(
                (
                    "Chicago",
                    "Seattle",
                    "Boston",
                    "Portland",
                    "Denver",
                    "Atlanta",
                    "Beverly Hills",
                    "Miami",
                    "Austin",
                    "New York",
                    "San Francisco",
                    "Houston",
                    "Phoenix",
                    "Nashville",
                    "Las Vegas",
                    "Dallas",
                    "Minneapolis",
                    "Charlotte",
                    "Saint Louis",
                    "Orlando",
                    "Pittsburgh",
                    "San Diego",
                    "San Antonio",
                    "Indianapolis",
                    "Cleveland",
                    "New Orleans",
                    "Kansas City",
                    "Milwaukee",
                    "Tysons",
                    "Raleigh",
                    "Tacoma",
                    "Bellevue",
                    "Tampa",
                    "Scottsdale",
                    "Tucson",
                    "Hartford",
                    "Albuquerque",
                    "Memphis",
                    "Oakland",
                    "San Jose",
                    "Greensboro",
                    "Columbus",
                    "Louisville",
                    "Des Moines",
                ),
                1,
            ),
            # Cities with more than one patient.
            "Chicago": 3,
            "Seattle": 2,
            "Denver": 2,
            "Atlanta": 2,
            "Houston": 2,
        },
        description="Count of patients per city",
    ),
    # Mean age per gender code, with age taken as 2026 minus the birth year.
    UtilityProbe(
        name="avg_age_by_gender",
        query_fn="avg_by_group",
        params={
            "value_col": "dob",
            "group_col": "gender",
            "transform": "year_age_2026",
        },
        expected_result={"F": 44.80, "M": 51.00},
        description="Average age (2026 - birth year) grouped by gender",
    ),
]
# Hard-difficulty task: bundles the patient-registry ground truth, schema,
# corruption specs and utility probes, then adds the task to the registry.
_HARD_TASK = Task(
    task_id="hard_patients",
    name="Multi-Source Patient Registry",
    difficulty="hard",
    # The description is split at clause boundaries; the adjacent literals
    # concatenate into a single paragraph.
    description=(
        "Clean and deduplicate a multi-source patient registry "
        "with 50 unique patients (14 columns). "
        "The dirty dataset contains ~70 rows due to 10 duplicate clusters "
        "(6 nickname-based, 4 typo-based including a "
        "Catherine/Katherine cross-patient trap). "
        "Includes 5 false-positive duplicate pairs "
        "(same name, genuinely different people with subtle overlaps "
        "in location, insurance, or DOB) "
        "that must NOT be merged, "
        "8 gender/name traps that are valid and must NOT be corrected, "
        "cross-field validation rules (zip-city map, insurance prefix map), "
        "date and phone format inconsistencies, "
        "DOB off-by-one corruptions, "
        "email domain typos, "
        "state full-name expansions, "
        "null injections, "
        "whitespace corruptions, "
        "and insurance ID mismatches."
    ),
    ground_truth=_HARD_GROUND_TRUTH,
    schema=_HARD_SCHEMA,
    corruptions=_HARD_CORRUPTIONS,
    max_steps=80,
    utility_probes=_HARD_PROBES,
    # Cells whose correct value is genuinely debatable. An ideal agent
    # should escalate these instead of "fixing" them.
    ambiguous_cells=[
        # Gender/name traps: the recorded gender is unusual for the first
        # name but valid.
        *(
            (entity_id, "gender")
            for entity_id in (
                "PAT014",  # Ashley (M) - historically male name
                "PAT022",  # Jordan (F) - gender-neutral name
                "PAT027",  # Shannon (M) - historically male Irish name
                "PAT031",  # Morgan (M) - gender-neutral name
                "PAT035",  # Avery (M) - gender-neutral name
                "PAT039",  # Casey (F) - gender-neutral but often male
                "PAT043",  # Dana (M) - gender-neutral name
                "PAT047",  # Robin (F) - gender-neutral name
            )
        ),
        # O'Brien vs OBrien vs O Brien - apostrophe handling is debatable.
        ("PAT003", "last_name"),
    ],
)

register_task(_HARD_TASK)