""" Task definitions for the Data Cleaning OpenEnv environment. Defines three tasks at easy, medium, and hard difficulty levels, each with complete ground truth data, schema definitions, and corruption specifications. SELF-CONTAINED: All ground truth data is inlined. No external imports for data. """ from __future__ import annotations from dataclasses import dataclass, field from typing import Any # ============================================================================ # Utility Probe dataclass # ============================================================================ @dataclass class UtilityProbe: """A downstream analytics probe that validates the cleaned data produces correct aggregate results, not just cell-level correctness.""" name: str # e.g., "unique_customer_count" description: str # e.g., "Count of unique customers after dedup" query_fn: str # Name of the probe function to run params: dict[str, Any] = field(default_factory=dict) # e.g., {"column": "email"} expected_result: Any = None # The correct answer from ground truth # ============================================================================ # Task dataclass # ============================================================================ @dataclass class Task: task_id: str name: str difficulty: str description: str ground_truth: list[dict[str, Any]] schema: dict[str, Any] corruptions: list[dict[str, Any]] max_steps: int utility_probes: list[UtilityProbe] = field(default_factory=list) ambiguous_cells: list[tuple[str, str]] = field(default_factory=list) # ============================================================================ # Task registry # ============================================================================ _TASK_REGISTRY: dict[str, Task] = {} def register_task(task: Task) -> None: """Register a task in the global registry, keyed by task_id.""" _TASK_REGISTRY[task.task_id] = task def get_task(task_id: str) -> Task: """Retrieve a registered task by its task_id. Raises ``KeyError`` if the task_id is not found. """ if task_id not in _TASK_REGISTRY: raise KeyError(f"Task '{task_id}' not found. Available: {list(_TASK_REGISTRY)}") return _TASK_REGISTRY[task_id] def list_tasks() -> list[dict[str, Any]]: """Return a list of metadata dicts (one per registered task).""" return [ { "task_id": t.task_id, "name": t.name, "difficulty": t.difficulty, "description": t.description, "num_ground_truth_rows": len(t.ground_truth), "num_corruptions": len(t.corruptions), "max_steps": t.max_steps, "num_utility_probes": len(t.utility_probes), } for t in _TASK_REGISTRY.values() ] # ############################################################################ # EASY TASK: Customer Contact Cleanup # ############################################################################ _EASY_GROUND_TRUTH: list[dict[str, Any]] = [ { "_entity_id": "CONTACT001", "id": 1, "first_name": "Alice", "last_name": "Morgan", "email": "alice.morgan@example.com", "phone": "(555) 123-4567", "signup_date": "2022-01-15", "state": "CA", }, { "_entity_id": "CONTACT002", "id": 2, "first_name": "Brian", "last_name": "Cho", "email": "brian.cho@example.com", "phone": "(555) 234-5678", "signup_date": "2022-03-22", "state": "NY", }, { "_entity_id": "CONTACT003", "id": 3, "first_name": "Carmen", "last_name": "Reyes", "email": "carmen.reyes@example.com", "phone": "(555) 345-6789", "signup_date": "2022-06-10", "state": "TX", }, { "_entity_id": "CONTACT004", "id": 4, "first_name": "David", "last_name": "Novak", "email": "david.novak@example.com", "phone": "(555) 456-7890", "signup_date": "2022-08-05", "state": "FL", }, { "_entity_id": "CONTACT005", "id": 5, "first_name": "Elena", "last_name": "Petrova", "email": "elena.petrova@example.com", "phone": "(555) 567-8901", "signup_date": "2023-01-18", "state": "WA", }, { "_entity_id": "CONTACT006", "id": 6, "first_name": "Frank", "last_name": "Oduya", "email": "frank.oduya@example.com", "phone": "(555) 678-9012", "signup_date": "2023-04-30", "state": "IL", }, { "_entity_id": "CONTACT007", "id": 7, "first_name": "Grace", "last_name": "Kim", "email": "grace.kim@example.com", "phone": "(555) 789-0123", "signup_date": "2023-07-14", "state": "OR", }, { "_entity_id": "CONTACT008", "id": 8, "first_name": "Hassan", "last_name": "Ali", "email": "hassan.ali@example.com", "phone": "(555) 890-1234", "signup_date": "2023-10-02", "state": "CO", }, ] _EASY_SCHEMA: dict[str, Any] = { "primary_key": "id", "expected_types": { "id": "int", "first_name": "str", "last_name": "str", "email": "email", "phone": "phone", "signup_date": "date", "state": "str", }, "constraints": { "email": {"format": r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z]{2,}$"}, "phone": {"format": r"^\(\d{3}\) \d{3}-\d{4}$"}, "signup_date": {"format": "YYYY-MM-DD"}, "state": { "allowed_values": [ "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY", ], }, }, } _EASY_CORRUPTIONS: list[dict[str, Any]] = [ # --- char_swap: 2 typos --- { "type": "char_swap", "targets": [{"row_idx": 1, "field": "first_name"}], "description": "Swapped letters in first_name for contact 2 (Brian -> Biran)", "target_entity_id": "CONTACT002", "original": "Brian", "corrupted": "Biran", }, { "type": "char_swap", "targets": [{"row_idx": 5, "field": "last_name"}], "description": "Transposed letters in last name for contact 6 (Oduya -> Oduay)", "target_entity_id": "CONTACT006", "original": "Oduya", "corrupted": "Oduay", }, # --- format_randomize: 2 dates --- { "type": "format_randomize", "column": "signup_date", "row_indices": [0], "description": "Date reformatted to MM/DD/YYYY for contact 1", "target_entity_id": "CONTACT001", "original": "2022-01-15", "corrupted": "01/15/2022", }, { "type": "format_randomize", "column": "signup_date", "row_indices": [4], "description": "Date reformatted to DD-Mon-YYYY for contact 5", "target_entity_id": "CONTACT005", "original": "2023-01-18", "corrupted": "18-Jan-2023", }, # --- null_inject: 1 --- { "type": "null_inject", "targets": [{"row_idx": 3, "field": "email"}], "description": "Email set to None for contact 4", "target_entity_id": "CONTACT004", "original": "david.novak@example.com", "corrupted": None, }, # --- format_strip: 1 phone --- { "type": "format_strip", "column": "phone", "row_indices": [2], "description": "Phone stripped of formatting for contact 3", "target_entity_id": "CONTACT003", "original": "(555) 345-6789", "corrupted": "5553456789", }, # --- case_corrupt: 2 --- { "type": "case_corrupt", "targets": [{"row_idx": 6, "field": "first_name"}], "description": "First name uppercased for contact 7", "target_entity_id": "CONTACT007", "original": "Grace", "corrupted": "GRACE", }, { "type": "case_corrupt", "targets": [{"row_idx": 7, "field": "state"}], "description": "State lowercased for contact 8", "target_entity_id": "CONTACT008", "original": "CO", "corrupted": "co", }, ] _EASY_PROBES: list[UtilityProbe] = [ UtilityProbe( name="unique_email_count", description="Count of unique non-null email addresses after cleaning", query_fn="unique_count", params={"column": "email"}, expected_result=8, ), UtilityProbe( name="state_distribution", description="Count of contacts per state", query_fn="distribution", params={"column": "state"}, expected_result={ "CA": 1, "NY": 1, "TX": 1, "FL": 1, "WA": 1, "IL": 1, "OR": 1, "CO": 1, }, ), ] _EASY_TASK = Task( task_id="easy_contacts", name="Customer Contact Cleanup", difficulty="easy", description=( "Clean a small customer contact list with 8 rows and 7 columns plus id. " "Corruptions include character swaps (typos), randomised date formats, " "a null injection, a stripped phone number, and case corruptions. " "There are NO duplicate rows in this task." ), ground_truth=_EASY_GROUND_TRUTH, schema=_EASY_SCHEMA, corruptions=_EASY_CORRUPTIONS, max_steps=30, utility_probes=_EASY_PROBES, ) register_task(_EASY_TASK) # ############################################################################ # MEDIUM TASK: Employee Records Reconciliation # ############################################################################ _MEDIUM_GROUND_TRUTH: list[dict[str, Any]] = [ { "_entity_id": "EMP001", "emp_id": 1001, "first_name": "Robert", "last_name": "Chen", "email": "r.chen@company.com", "phone": "(415) 555-0142", "department": "Engineering", "hire_date": "2019-03-15", "salary": 145000.00, "office_city": "San Francisco", "office_zip": "94105", }, { "_entity_id": "EMP002", "emp_id": 1002, "first_name": "Jennifer", "last_name": "Okafor", "email": "j.okafor@company.com", "phone": "(212) 555-0387", "department": "Marketing", "hire_date": "2020-07-22", "salary": 112000.00, "office_city": "New York", "office_zip": "10013", }, { "_entity_id": "EMP003", "emp_id": 1003, "first_name": "William", "last_name": "Hernandez", "email": "w.hernandez@company.com", "phone": "(512) 555-0219", "department": "Sales", "hire_date": "2021-01-10", "salary": 98000.00, "office_city": "Austin", "office_zip": "78701", }, { "_entity_id": "EMP004", "emp_id": 1004, "first_name": "Priya", "last_name": "Patel", "email": "p.patel@company.com", "phone": "(206) 555-0463", "department": "Engineering", "hire_date": "2019-11-04", "salary": 162000.00, "office_city": "Seattle", "office_zip": "98101", }, { "_entity_id": "EMP005", "emp_id": 1005, "first_name": "Michael", "last_name": "Thompson", "email": "m.thompson@company.com", "phone": "(312) 555-0578", "department": "Finance", "hire_date": "2020-04-18", "salary": 131000.00, "office_city": "Chicago", "office_zip": "60601", }, { "_entity_id": "EMP006", "emp_id": 1006, "first_name": "Aisha", "last_name": "Washington", "email": "a.washington@company.com", "phone": "(303) 555-0691", "department": "HR", "hire_date": "2022-02-28", "salary": 95000.00, "office_city": "Denver", "office_zip": "80202", }, { "_entity_id": "EMP007", "emp_id": 1007, "first_name": "Carlos", "last_name": "Rivera", "email": "c.rivera@company.com", "phone": "(415) 555-0734", "department": "Engineering", "hire_date": "2021-06-14", "salary": 138000.00, "office_city": "San Francisco", "office_zip": "94107", }, { "_entity_id": "EMP008", "emp_id": 1008, "first_name": "Sarah", "last_name": "Kim", "email": "s.kim@company.com", "phone": "(212) 555-0856", "department": "Marketing", "hire_date": "2023-01-09", "salary": 87000.00, "office_city": "New York", "office_zip": "10001", }, { "_entity_id": "EMP009", "emp_id": 1009, "first_name": "James", "last_name": "O'Brien", "email": "j.obrien@company.com", "phone": "(512) 555-0923", "department": "Operations", "hire_date": "2019-08-21", "salary": 105000.00, "office_city": "Austin", "office_zip": "78702", }, { "_entity_id": "EMP010", "emp_id": 1010, "first_name": "Wei", "last_name": "Zhang", "email": "w.zhang@company.com", "phone": "(206) 555-0147", "department": "Engineering", "hire_date": "2020-10-05", "salary": 155000.00, "office_city": "Seattle", "office_zip": "98104", }, { "_entity_id": "EMP011", "emp_id": 1011, "first_name": "Maria", "last_name": "Gonzalez", "email": "m.gonzalez@company.com", "phone": "(303) 555-0268", "department": "Sales", "hire_date": "2022-05-16", "salary": 91000.00, "office_city": "Denver", "office_zip": "80203", }, { "_entity_id": "EMP012", "emp_id": 1012, "first_name": "David", "last_name": "Nakamura", "email": "d.nakamura@company.com", "phone": "(312) 555-0395", "department": "Finance", "hire_date": "2021-09-01", "salary": 118000.00, "office_city": "Chicago", "office_zip": "60602", }, { "_entity_id": "EMP013", "emp_id": 1013, "first_name": "Fatima", "last_name": "Al-Rashidi", "email": "f.al-rashidi@company.com", "phone": "(415) 555-0512", "department": "Engineering", "hire_date": "2023-03-20", "salary": 128000.00, "office_city": "San Francisco", "office_zip": "94103", }, { "_entity_id": "EMP014", "emp_id": 1014, "first_name": "Marcus", "last_name": "Johnson", "email": "m.johnson@company.com", "phone": "(212) 555-0643", "department": "Operations", "hire_date": "2020-12-07", "salary": 102000.00, "office_city": "New York", "office_zip": "10016", }, { "_entity_id": "EMP015", "emp_id": 1015, "first_name": "Elena", "last_name": "Petrov", "email": "e.petrov@company.com", "phone": "(512) 555-0178", "department": "HR", "hire_date": "2024-01-15", "salary": 88000.00, "office_city": "Austin", "office_zip": "78703", }, { "_entity_id": "EMP016", "emp_id": 1016, "first_name": "Andre", "last_name": "Williams", "email": "a.williams@company.com", "phone": "(206) 555-0834", "department": "Sales", "hire_date": "2022-08-30", "salary": 96000.00, "office_city": "Seattle", "office_zip": "98102", }, { "_entity_id": "EMP017", "emp_id": 1017, "first_name": "Lisa", "last_name": "Svensson", "email": "l.svensson@company.com", "phone": "(312) 555-0956", "department": "Finance", "hire_date": "2023-07-11", "salary": 108000.00, "office_city": "Chicago", "office_zip": "60603", }, ] _MEDIUM_SCHEMA: dict[str, Any] = { "primary_key": "emp_id", "expected_types": { "emp_id": "int", "first_name": "str", "last_name": "str", "email": "email", "phone": "phone", "department": "str", "hire_date": "date", "salary": "float", "office_city": "str", "office_zip": "str", }, "constraints": { "email": {"format": r"^[a-z]\.[a-z\-]+@company\.com$"}, "phone": {"format": r"^\(\d{3}\) \d{3}-\d{4}$"}, "hire_date": {"format": "YYYY-MM-DD"}, "salary": {"min": 55000.0, "max": 185000.0}, "department": { "allowed_values": [ "Engineering", "Marketing", "Sales", "HR", "Finance", "Operations", ], }, "office_city": { "allowed_values": [ "San Francisco", "New York", "Chicago", "Austin", "Seattle", "Denver", ], }, "office_zip": {"format": r"^\d{5}$"}, }, } _MEDIUM_CORRUPTIONS: list[dict[str, Any]] = [ # --- duplicate_with_noise: 3 rows (nickname + slight field noise) --- { "type": "duplicate_with_noise", "source_indices": [0], "noise_fields": ["first_name", "phone"], "description": "Duplicate of EMP001 (Robert Chen) as 'Bob Chen' with transposed phone digits", "source_entity_id": "EMP001", "source_emp_id": 1001, "corrupted_row": { "emp_id": 1001, "first_name": "Bob", "last_name": "Chen", "email": "r.chen@company.com", "phone": "(415) 555-0124", "department": "Engineering", "hire_date": "2019-03-15", "salary": 145000.00, "office_city": "San Francisco", "office_zip": "94105", }, }, { "type": "duplicate_with_noise", "source_indices": [2], "noise_fields": ["first_name", "phone"], "description": "Duplicate of EMP003 (William Hernandez) as 'Will Hernandez' with slightly different salary", "source_entity_id": "EMP003", "source_emp_id": 1003, "corrupted_row": { "emp_id": 1003, "first_name": "Will", "last_name": "Hernandez", "email": "w.hernandez@company.com", "phone": "(512) 555-0219", "department": "Sales", "hire_date": "2021-01-10", "salary": 98500.00, "office_city": "Austin", "office_zip": "78701", }, }, { "type": "duplicate_with_noise", "source_indices": [4], "noise_fields": ["first_name", "phone"], "description": "Duplicate of EMP005 (Michael Thompson) as 'Mike Thompson' with wrong date format", "source_entity_id": "EMP005", "source_emp_id": 1005, "corrupted_row": { "emp_id": 1005, "first_name": "Mike", "last_name": "Thompson", "email": "m.thompson@company.com", "phone": "(312) 555-0578", "department": "Finance", "hire_date": "04/18/2020", "salary": 131000.00, "office_city": "Chicago", "office_zip": "60601", }, }, # --- format_randomize dates: 4 --- { "type": "format_randomize", "column": "hire_date", "row_indices": [10], "description": "Hire date in DD-MM-YYYY for EMP011 (Maria Gonzalez)", "target_entity_id": "EMP011", "target_emp_id": 1011, "field": "hire_date", "original": "2022-05-16", "corrupted": "16-05-2022", }, { "type": "format_randomize", "column": "hire_date", "row_indices": [14], "description": "Hire date in MM/DD/YYYY for EMP015 (Elena Petrov)", "target_entity_id": "EMP015", "target_emp_id": 1015, "field": "hire_date", "original": "2024-01-15", "corrupted": "01/15/2024", }, { "type": "format_randomize", "column": "hire_date", "row_indices": [7], "description": "Hire date in Mon DD, YYYY for EMP008 (Sarah Kim)", "target_entity_id": "EMP008", "target_emp_id": 1008, "field": "hire_date", "original": "2023-01-09", "corrupted": "Jan 09, 2023", }, { "type": "format_randomize", "column": "hire_date", "row_indices": [16], "description": "Hire date in DD/MM/YYYY for EMP017 (Lisa Svensson)", "target_entity_id": "EMP017", "target_emp_id": 1017, "field": "hire_date", "original": "2023-07-11", "corrupted": "11/07/2023", }, # --- null_inject: 3 --- { "type": "null_inject", "targets": [{"row_idx": 5, "field": "phone"}], "description": "Phone set to None for EMP006 (Aisha Washington)", "target_entity_id": "EMP006", "target_emp_id": 1006, "original": "(303) 555-0691", "corrupted": None, }, { "type": "null_inject", "targets": [{"row_idx": 11, "field": "department"}], "description": "Department set to None for EMP012 (David Nakamura)", "target_entity_id": "EMP012", "target_emp_id": 1012, "original": "Finance", "corrupted": None, }, { "type": "null_inject", "targets": [{"row_idx": 15, "field": "email"}], "description": "Email set to None for EMP016 (Andre Williams)", "target_entity_id": "EMP016", "target_emp_id": 1016, "original": "a.williams@company.com", "corrupted": None, }, # --- value_variation: department name variations --- { "type": "value_variation", "column": "department", "mapping": {"Engineering": ["Engneering"]}, "description": "Department 'Engineering' misspelled as 'Engneering' for EMP007 (Carlos Rivera)", "target_entity_id": "EMP007", "target_emp_id": 1007, "field": "department", "original": "Engineering", "corrupted": "Engneering", }, { "type": "value_variation", "column": "last_name", "mapping": {"Okafor": ["Okafro"]}, "description": "Last name 'Okafor' transposed as 'Okafro' for EMP002 (Jennifer Okafor)", "target_entity_id": "EMP002", "target_emp_id": 1002, "field": "last_name", "original": "Okafor", "corrupted": "Okafro", }, # --- state_expand: 2 --- { "type": "state_expand", "row_indices": [12], "description": "Office city 'San Francisco' misspelled as 'San Fransisco' for EMP013 (Fatima Al-Rashidi)", "target_entity_id": "EMP013", "target_emp_id": 1013, "field": "office_city", "original": "San Francisco", "corrupted": "San Fransisco", }, { "type": "state_expand", "row_indices": [13], "description": "Department 'Operations' corrupted with extra space as 'Operat ions' for EMP014 (Marcus Johnson)", "target_entity_id": "EMP014", "target_emp_id": 1014, "field": "department", "original": "Operations", "corrupted": "Operat ions", }, # --- format_randomize salary --- { "type": "format_randomize", "column": "salary", "row_indices": [7], "description": "Salary turned negative for EMP008 (Sarah Kim) - invalid value", "target_entity_id": "EMP008", "target_emp_id": 1008, "field": "salary", "original": 87000.00, "corrupted": -87000.00, }, ] _MEDIUM_PROBES: list[UtilityProbe] = [ UtilityProbe( name="unique_employee_count", description="Count of unique employees after deduplication", query_fn="unique_count", params={"column": "emp_id"}, expected_result=17, ), UtilityProbe( name="department_salary_avg", description="Average salary per department", query_fn="avg_by_group", params={"value_col": "salary", "group_col": "department"}, expected_result={ "Engineering": 145600.00, "Finance": 119000.00, "HR": 91500.00, "Marketing": 99500.00, "Operations": 103500.00, "Sales": 95000.00, }, ), UtilityProbe( name="engineering_headcount", description="Number of employees in the Engineering department", query_fn="count_where", params={"column": "department", "value": "Engineering"}, expected_result=5, ), ] _MEDIUM_TASK = Task( task_id="medium_employees", name="Employee Records Reconciliation", difficulty="medium", description=( "Reconcile a set of 17 unique employee records (10 columns) that have " "been corrupted with 3 near-duplicate rows (nickname variants), " "date format inconsistencies, null injections, department name typos, " "city misspellings, and an invalid salary. The dirty dataset has 20 rows." ), ground_truth=_MEDIUM_GROUND_TRUTH, schema=_MEDIUM_SCHEMA, corruptions=_MEDIUM_CORRUPTIONS, max_steps=60, utility_probes=_MEDIUM_PROBES, ) register_task(_MEDIUM_TASK) # ############################################################################ # HARD TASK: Multi-Source Patient Registry # ############################################################################ _HARD_GROUND_TRUTH: list[dict[str, Any]] = [ # --- DUPLICATE CLUSTER CANDIDATES (patients 1-6) --- { "_entity_id": "PAT001", "patient_id": 1, "first_name": "William", "last_name": "Thompson", "dob": "1958-03-14", "gender": "M", "phone": "312-555-0147", "email": "william.thompson@gmail.com", "address_line": "742 North Michigan Ave", "city": "Chicago", "state": "IL", "zip": "60601", "insurance_provider": "Blue Cross", "insurance_id": "BC-4471583", "last_visit_date": "2024-01-15", }, { "_entity_id": "PAT002", "patient_id": 2, "first_name": "Robert", "last_name": "Martinez", "dob": "1975-07-22", "gender": "M", "phone": "206-555-0293", "email": "robert.martinez@outlook.com", "address_line": "1501 Pike Place", "city": "Seattle", "state": "WA", "zip": "98101", "insurance_provider": "UnitedHealth", "insurance_id": "UH-8823146", "last_visit_date": "2024-03-08", }, { "_entity_id": "PAT003", "patient_id": 3, "first_name": "Elizabeth", "last_name": "O'Brien", "dob": "1982-11-05", "gender": "F", "phone": "617-555-0381", "email": "elizabeth.obrien@yahoo.com", "address_line": "88 Beacon Street", "city": "Boston", "state": "MA", "zip": "02101", "insurance_provider": "Aetna", "insurance_id": "AE-5539274", "last_visit_date": "2023-09-20", }, { "_entity_id": "PAT004", "patient_id": 4, "first_name": "Jennifer", "last_name": "Nguyen", "dob": "1990-04-18", "gender": "F", "phone": "503-555-0462", "email": "jennifer.nguyen@gmail.com", "address_line": "2200 NW Burnside Rd", "city": "Portland", "state": "OR", "zip": "97201", "insurance_provider": "Cigna", "insurance_id": "CI-3317892", "last_visit_date": "2024-06-11", }, { "_entity_id": "PAT005", "patient_id": 5, "first_name": "James", "last_name": "Kowalski", "dob": "1967-09-30", "gender": "M", "phone": "303-555-0518", "email": "james.kowalski@aol.com", "address_line": "1600 Colfax Ave", "city": "Denver", "state": "CO", "zip": "80201", "insurance_provider": "Medicare", "insurance_id": "MC-6642018", "last_visit_date": "2024-02-28", }, { "_entity_id": "PAT006", "patient_id": 6, "first_name": "Katherine", "last_name": "Patel", "dob": "1988-01-27", "gender": "F", "phone": "404-555-0637", "email": "katherine.patel@hotmail.com", "address_line": "350 Peachtree St NE", "city": "Atlanta", "state": "GA", "zip": "30301", "insurance_provider": "Blue Cross", "insurance_id": "BC-9918453", "last_visit_date": "2023-12-05", }, # --- FALSE POSITIVE PAIR 1: Two different "Michael Davis" --- { "_entity_id": "PAT007", "patient_id": 7, "first_name": "Michael", "last_name": "Davis", "dob": "1972-06-10", "gender": "M", "phone": "310-555-0744", "email": "michael.davis72@gmail.com", "address_line": "456 Rodeo Drive", "city": "Beverly Hills", "state": "CA", "zip": "90210", "insurance_provider": "UnitedHealth", "insurance_id": "UH-1157329", "last_visit_date": "2024-04-22", }, { "_entity_id": "PAT008", "patient_id": 8, "first_name": "Michael", "last_name": "Davis", "dob": "1995-02-14", "gender": "M", "phone": "305-555-0856", "email": "mdavis95@yahoo.com", "address_line": "900 Biscayne Blvd", "city": "Miami", "state": "FL", "zip": "33101", "insurance_provider": "Aetna", "insurance_id": "AE-7743201", "last_visit_date": "2023-11-30", }, # --- FALSE POSITIVE PAIR 2: Two different "Maria Garcia" --- { "_entity_id": "PAT009", "patient_id": 9, "first_name": "Maria", "last_name": "Garcia", "dob": "1960-08-03", "gender": "F", "phone": "512-555-0912", "email": "maria.garcia60@gmail.com", "address_line": "1100 Congress Ave", "city": "Austin", "state": "TX", "zip": "73301", "insurance_provider": "Medicare", "insurance_id": "MC-2204857", "last_visit_date": "2024-05-14", }, { "_entity_id": "PAT010", "patient_id": 10, "first_name": "Maria", "last_name": "Garcia", "dob": "1985-12-19", "gender": "F", "phone": "212-555-1023", "email": "mgarcia.nyc@outlook.com", "address_line": "250 West 34th St", "city": "New York", "state": "NY", "zip": "10001", "insurance_provider": "Cigna", "insurance_id": "CI-8856134", "last_visit_date": "2023-08-27", }, # --- REMAINING UNIQUE PATIENTS (11-30) --- { "_entity_id": "PAT011", "patient_id": 11, "first_name": "David", "last_name": "Chen", "dob": "1979-05-11", "gender": "M", "phone": "415-555-1134", "email": "david.chen@gmail.com", "address_line": "580 Market Street", "city": "San Francisco", "state": "CA", "zip": "94105", "insurance_provider": "Blue Cross", "insurance_id": "BC-3364721", "last_visit_date": "2024-07-03", }, { "_entity_id": "PAT012", "patient_id": 12, "first_name": "Sarah", "last_name": "Johnson", "dob": "1993-02-28", "gender": "F", "phone": "713-555-1245", "email": "sarah.johnson93@yahoo.com", "address_line": "4200 Westheimer Rd", "city": "Houston", "state": "TX", "zip": "77027", "insurance_provider": "UnitedHealth", "insurance_id": "UH-5578934", "last_visit_date": "2023-10-18", }, { "_entity_id": "PAT013", "patient_id": 13, "first_name": "Thomas", "last_name": "Wilson", "dob": "1945-10-07", "gender": "M", "phone": "602-555-1356", "email": "tom.wilson45@aol.com", "address_line": "3300 N Central Ave", "city": "Phoenix", "state": "AZ", "zip": "85012", "insurance_provider": "Medicare", "insurance_id": "MC-1192746", "last_visit_date": "2024-01-09", }, # --- STATISTICALLY UNUSUAL BUT VALID: Male named Ashley --- { "_entity_id": "PAT014", "patient_id": 14, "first_name": "Ashley", "last_name": "Richardson", "dob": "1970-12-22", "gender": "M", "phone": "615-555-1467", "email": "ashley.richardson@gmail.com", "address_line": "210 Broadway", "city": "Nashville", "state": "TN", "zip": "37201", "insurance_provider": "Aetna", "insurance_id": "AE-4426183", "last_visit_date": "2024-03-19", }, { "_entity_id": "PAT015", "patient_id": 15, "first_name": "Patricia", "last_name": "Lee", "dob": "1952-04-15", "gender": "F", "phone": "702-555-1578", "email": "patricia.lee@outlook.com", "address_line": "3600 Las Vegas Blvd S", "city": "Las Vegas", "state": "NV", "zip": "89109", "insurance_provider": "Medicare", "insurance_id": "MC-8835492", "last_visit_date": "2023-07-25", }, { "_entity_id": "PAT016", "patient_id": 16, "first_name": "Daniel", "last_name": "Brown", "dob": "1998-08-09", "gender": "M", "phone": "214-555-1689", "email": "daniel.brown98@gmail.com", "address_line": "1700 Pacific Ave", "city": "Dallas", "state": "TX", "zip": "75201", "insurance_provider": "Cigna", "insurance_id": "CI-2293847", "last_visit_date": "2024-08-01", }, { "_entity_id": "PAT017", "patient_id": 17, "first_name": "Linda", "last_name": "Anderson", "dob": "1963-01-30", "gender": "F", "phone": "952-555-1791", "email": "linda.anderson@yahoo.com", "address_line": "800 Nicollet Mall", "city": "Minneapolis", "state": "MN", "zip": "55402", "insurance_provider": "Blue Cross", "insurance_id": "BC-6671835", "last_visit_date": "2024-05-30", }, { "_entity_id": "PAT018", "patient_id": 18, "first_name": "Christopher", "last_name": "Taylor", "dob": "1984-06-14", "gender": "M", "phone": "704-555-1802", "email": "chris.taylor84@gmail.com", "address_line": "401 N Tryon St", "city": "Charlotte", "state": "NC", "zip": "28202", "insurance_provider": "UnitedHealth", "insurance_id": "UH-3349128", "last_visit_date": "2023-12-12", }, { "_entity_id": "PAT019", "patient_id": 19, "first_name": "Nancy", "last_name": "White", "dob": "1948-09-21", "gender": "F", "phone": "314-555-1913", "email": "nancy.white48@hotmail.com", "address_line": "100 Washington Ave", "city": "Saint Louis", "state": "MO", "zip": "63101", "insurance_provider": "Medicare", "insurance_id": "MC-4457631", "last_visit_date": "2024-02-14", }, { "_entity_id": "PAT020", "patient_id": 20, "first_name": "Kevin", "last_name": "Harris", "dob": "2001-03-05", "gender": "M", "phone": "407-555-2024", "email": "kevin.harris01@gmail.com", "address_line": "525 S Orange Ave", "city": "Orlando", "state": "FL", "zip": "32801", "insurance_provider": "Aetna", "insurance_id": "AE-9914567", "last_visit_date": "2024-06-28", }, { "_entity_id": "PAT021", "patient_id": 21, "first_name": "Susan", "last_name": "Clark", "dob": "1977-11-13", "gender": "F", "phone": "412-555-2135", "email": "susan.clark77@outlook.com", "address_line": "600 Grant St", "city": "Pittsburgh", "state": "PA", "zip": "15219", "insurance_provider": "Cigna", "insurance_id": "CI-5582719", "last_visit_date": "2023-09-05", }, # --- STATISTICALLY UNUSUAL BUT VALID: Female named Jordan --- { "_entity_id": "PAT022", "patient_id": 22, "first_name": "Jordan", "last_name": "Mitchell", "dob": "1996-07-08", "gender": "F", "phone": "619-555-2246", "email": "jordan.mitchell96@gmail.com", "address_line": "750 B Street", "city": "San Diego", "state": "CA", "zip": "92101", "insurance_provider": "Blue Cross", "insurance_id": "BC-7728364", "last_visit_date": "2024-04-10", }, { "_entity_id": "PAT023", "patient_id": 23, "first_name": "Richard", "last_name": "Lopez", "dob": "1955-12-01", "gender": "M", "phone": "210-555-2357", "email": "richard.lopez55@aol.com", "address_line": "300 Alamo Plaza", "city": "San Antonio", "state": "TX", "zip": "78205", "insurance_provider": "Medicare", "insurance_id": "MC-3346285", "last_visit_date": "2024-01-22", }, { "_entity_id": "PAT024", "patient_id": 24, "first_name": "Angela", "last_name": "Robinson", "dob": "1989-05-26", "gender": "F", "phone": "317-555-2468", "email": "angela.robinson@yahoo.com", "address_line": "200 E Washington St", "city": "Indianapolis", "state": "IN", "zip": "46204", "insurance_provider": "UnitedHealth", "insurance_id": "UH-6693241", "last_visit_date": "2023-11-08", }, { "_entity_id": "PAT025", "patient_id": 25, "first_name": "Steven", "last_name": "Walker", "dob": "1941-07-19", "gender": "M", "phone": "216-555-2579", "email": "steven.walker@gmail.com", "address_line": "1100 Superior Ave", "city": "Cleveland", "state": "OH", "zip": "44114", "insurance_provider": "Medicare", "insurance_id": "MC-1128574", "last_visit_date": "2024-03-05", }, { "_entity_id": "PAT026", "patient_id": 26, "first_name": "Michelle", "last_name": "Young", "dob": "2003-10-31", "gender": "F", "phone": "504-555-2681", "email": "michelle.young03@outlook.com", "address_line": "800 Canal St", "city": "New Orleans", "state": "LA", "zip": "70112", "insurance_provider": "Aetna", "insurance_id": "AE-8847392", "last_visit_date": "2024-07-19", }, # --- STATISTICALLY UNUSUAL BUT VALID: Male named Shannon --- { "_entity_id": "PAT027", "patient_id": 27, "first_name": "Shannon", "last_name": "Burke", "dob": "1974-02-08", "gender": "M", "phone": "816-555-2792", "email": "shannon.burke@gmail.com", "address_line": "1200 Main St", "city": "Kansas City", "state": "MO", "zip": "64105", "insurance_provider": "Cigna", "insurance_id": "CI-4431957", "last_visit_date": "2023-08-14", }, { "_entity_id": "PAT028", "patient_id": 28, "first_name": "Dorothy", "last_name": "Hall", "dob": "1943-06-17", "gender": "F", "phone": "414-555-2903", "email": "dorothy.hall43@yahoo.com", "address_line": "500 W Wisconsin Ave", "city": "Milwaukee", "state": "WI", "zip": "53203", "insurance_provider": "Medicare", "insurance_id": "MC-5563418", "last_visit_date": "2024-02-07", }, { "_entity_id": "PAT029", "patient_id": 29, "first_name": "Brian", "last_name": "Kim", "dob": "1992-09-14", "gender": "M", "phone": "571-555-3014", "email": "brian.kim92@gmail.com", "address_line": "1750 Tysons Blvd", "city": "Tysons", "state": "VA", "zip": "22102", "insurance_provider": "Blue Cross", "insurance_id": "BC-2215847", "last_visit_date": "2024-05-02", }, { "_entity_id": "PAT030", "patient_id": 30, "first_name": "Laura", "last_name": "Scott", "dob": "1999-01-23", "gender": "F", "phone": "919-555-3125", "email": "laura.scott99@hotmail.com", "address_line": "400 Fayetteville St", "city": "Raleigh", "state": "NC", "zip": "27601", "insurance_provider": "UnitedHealth", "insurance_id": "UH-7741926", "last_visit_date": "2023-10-31", }, # --- NEW PATIENTS (PAT031-PAT050) --- # --- GENDER-NEUTRAL NAME TRAP: Morgan (M) --- { "_entity_id": "PAT031", "patient_id": 31, "first_name": "Morgan", "last_name": "Fletcher", "dob": "1986-04-12", "gender": "M", "phone": "253-555-3201", "email": "morgan.fletcher86@gmail.com", "address_line": "1900 Pacific Ave", "city": "Tacoma", "state": "WA", "zip": "98402", "insurance_provider": "Cigna", "insurance_id": "CI-7712548", "last_visit_date": "2024-03-22", }, # --- FALSE POSITIVE PAIR 3: Two "David Kim" (PAT032 & PAT033) --- { "_entity_id": "PAT032", "patient_id": 32, "first_name": "David", "last_name": "Kim", "dob": "1988-07-20", "gender": "M", "phone": "425-555-3312", "email": "david.kim88@gmail.com", "address_line": "300 108th Ave NE", "city": "Bellevue", "state": "WA", "zip": "98004", "insurance_provider": "UnitedHealth", "insurance_id": "UH-4423891", "last_visit_date": "2024-05-18", }, { "_entity_id": "PAT033", "patient_id": 33, "first_name": "David", "last_name": "Kim", "dob": "1990-11-03", "gender": "M", "phone": "206-555-3423", "email": "dkim90@outlook.com", "address_line": "815 Pine St", "city": "Seattle", "state": "WA", "zip": "98101", "insurance_provider": "UnitedHealth", "insurance_id": "UH-6638172", "last_visit_date": "2024-01-29", }, # --- DUPLICATE CLUSTER: Christopher -> typo variants (PAT034) --- { "_entity_id": "PAT034", "patient_id": 34, "first_name": "Christopher", "last_name": "Reeves", "dob": "1976-08-25", "gender": "M", "phone": "813-555-3534", "email": "christopher.reeves@gmail.com", "address_line": "1400 N Dale Mabry Hwy", "city": "Tampa", "state": "FL", "zip": "33607", "insurance_provider": "Blue Cross", "insurance_id": "BC-5547832", "last_visit_date": "2024-06-05", }, # --- GENDER-NEUTRAL NAME TRAP: Avery (M) --- { "_entity_id": "PAT035", "patient_id": 35, "first_name": "Avery", "last_name": "Simmons", "dob": "1994-02-17", "gender": "M", "phone": "678-555-3645", "email": "avery.simmons94@yahoo.com", "address_line": "2500 Peachtree Rd NW", "city": "Atlanta", "state": "GA", "zip": "30305", "insurance_provider": "Aetna", "insurance_id": "AE-3318724", "last_visit_date": "2023-11-15", }, # --- FALSE POSITIVE PAIR 4: Two "Sarah Williams" (PAT036 & PAT037) --- { "_entity_id": "PAT036", "patient_id": 36, "first_name": "Sarah", "last_name": "Williams", "dob": "1983-09-14", "gender": "F", "phone": "312-555-3756", "email": "sarah.williams83@gmail.com", "address_line": "55 E Monroe St", "city": "Chicago", "state": "IL", "zip": "60603", "insurance_provider": "Blue Cross", "insurance_id": "BC-8834291", "last_visit_date": "2024-04-01", }, { "_entity_id": "PAT037", "patient_id": 37, "first_name": "Sarah", "last_name": "Williams", "dob": "1978-03-22", "gender": "F", "phone": "773-555-3867", "email": "swilliams78@yahoo.com", "address_line": "4700 N Lincoln Ave", "city": "Chicago", "state": "IL", "zip": "60625", "insurance_provider": "Cigna", "insurance_id": "CI-2247163", "last_visit_date": "2023-07-19", }, # --- DUPLICATE CLUSTER: Alexandra -> misspelling variant (PAT038) --- { "_entity_id": "PAT038", "patient_id": 38, "first_name": "Alexandra", "last_name": "Petrov", "dob": "1991-06-08", "gender": "F", "phone": "480-555-3978", "email": "alexandra.petrov@gmail.com", "address_line": "7100 E Camelback Rd", "city": "Scottsdale", "state": "AZ", "zip": "85251", "insurance_provider": "UnitedHealth", "insurance_id": "UH-9917453", "last_visit_date": "2024-02-20", }, # --- GENDER-NEUTRAL NAME TRAP: Casey (F) --- { "_entity_id": "PAT039", "patient_id": 39, "first_name": "Casey", "last_name": "Morgan", "dob": "2000-01-30", "gender": "F", "phone": "720-555-4089", "email": "casey.morgan00@outlook.com", "address_line": "1600 Stout St", "city": "Denver", "state": "CO", "zip": "80202", "insurance_provider": "Medicare", "insurance_id": "MC-7724316", "last_visit_date": "2024-08-10", }, # --- DUPLICATE CLUSTER: Patricia -> typo variants (PAT040) --- { "_entity_id": "PAT040", "patient_id": 40, "first_name": "Patricia", "last_name": "Hernandez", "dob": "1969-12-04", "gender": "F", "phone": "520-555-4190", "email": "patricia.hernandez@gmail.com", "address_line": "150 N Stone Ave", "city": "Tucson", "state": "AZ", "zip": "85701", "insurance_provider": "Aetna", "insurance_id": "AE-6641258", "last_visit_date": "2024-01-17", }, { "_entity_id": "PAT041", "patient_id": 41, "first_name": "Gregory", "last_name": "Adams", "dob": "1957-03-19", "gender": "M", "phone": "860-555-4201", "email": "gregory.adams57@aol.com", "address_line": "250 Constitution Plaza", "city": "Hartford", "state": "CT", "zip": "06103", "insurance_provider": "Medicare", "insurance_id": "MC-3392841", "last_visit_date": "2024-07-08", }, { "_entity_id": "PAT042", "patient_id": 42, "first_name": "Samantha", "last_name": "Rivera", "dob": "1997-08-22", "gender": "F", "phone": "505-555-4312", "email": "samantha.rivera97@gmail.com", "address_line": "400 Central Ave SW", "city": "Albuquerque", "state": "NM", "zip": "87102", "insurance_provider": "Cigna", "insurance_id": "CI-8812347", "last_visit_date": "2023-12-29", }, # --- GENDER-NEUTRAL NAME TRAP: Dana (M) --- { "_entity_id": "PAT043", "patient_id": 43, "first_name": "Dana", "last_name": "Crawford", "dob": "1965-11-28", "gender": "M", "phone": "901-555-4423", "email": "dana.crawford65@hotmail.com", "address_line": "203 Beale St", "city": "Memphis", "state": "TN", "zip": "38103", "insurance_provider": "Blue Cross", "insurance_id": "BC-1148273", "last_visit_date": "2024-04-15", }, # --- FALSE POSITIVE PAIR 5: Two "James Lee" (PAT044 & PAT045) --- { "_entity_id": "PAT044", "patient_id": 44, "first_name": "James", "last_name": "Lee", "dob": "1981-05-09", "gender": "M", "phone": "510-555-4534", "email": "james.lee81@gmail.com", "address_line": "1901 Harrison St", "city": "Oakland", "state": "CA", "zip": "94612", "insurance_provider": "Aetna", "insurance_id": "AE-5523918", "last_visit_date": "2024-06-22", }, { "_entity_id": "PAT045", "patient_id": 45, "first_name": "James", "last_name": "Lee", "dob": "1982-10-31", "gender": "M", "phone": "408-555-4645", "email": "jlee82@yahoo.com", "address_line": "225 W Santa Clara St", "city": "San Jose", "state": "CA", "zip": "95113", "insurance_provider": "Aetna", "insurance_id": "AE-7739482", "last_visit_date": "2023-09-14", }, { "_entity_id": "PAT046", "patient_id": 46, "first_name": "Theresa", "last_name": "Nguyen", "dob": "1973-07-14", "gender": "F", "phone": "832-555-4756", "email": "theresa.nguyen73@gmail.com", "address_line": "2100 Travis St", "city": "Houston", "state": "TX", "zip": "77002", "insurance_provider": "UnitedHealth", "insurance_id": "UH-2248637", "last_visit_date": "2024-05-07", }, # --- GENDER-NEUTRAL NAME TRAP: Robin (F) --- { "_entity_id": "PAT047", "patient_id": 47, "first_name": "Robin", "last_name": "Blackwell", "dob": "1980-05-16", "gender": "F", "phone": "336-555-4867", "email": "robin.blackwell@outlook.com", "address_line": "300 N Greene St", "city": "Greensboro", "state": "NC", "zip": "27401", "insurance_provider": "Medicare", "insurance_id": "MC-4458923", "last_visit_date": "2024-03-11", }, # --- DUPLICATE CLUSTER: Catherine -> spelling variants (PAT048) --- # NOTE: PAT006 is Katherine Patel (different person!). Agent must NOT # merge PAT048's duplicates with PAT006. { "_entity_id": "PAT048", "patient_id": 48, "first_name": "Catherine", "last_name": "Brooks", "dob": "1987-09-03", "gender": "F", "phone": "614-555-4978", "email": "catherine.brooks@gmail.com", "address_line": "100 E Broad St", "city": "Columbus", "state": "OH", "zip": "43215", "insurance_provider": "Blue Cross", "insurance_id": "BC-6693147", "last_visit_date": "2024-07-25", }, { "_entity_id": "PAT049", "patient_id": 49, "first_name": "Raymond", "last_name": "Foster", "dob": "1950-02-11", "gender": "M", "phone": "502-555-5089", "email": "raymond.foster50@aol.com", "address_line": "700 W Main St", "city": "Louisville", "state": "KY", "zip": "40202", "insurance_provider": "Medicare", "insurance_id": "MC-8817294", "last_visit_date": "2024-02-19", }, { "_entity_id": "PAT050", "patient_id": 50, "first_name": "Heather", "last_name": "Sanchez", "dob": "2004-06-21", "gender": "F", "phone": "515-555-5190", "email": "heather.sanchez04@gmail.com", "address_line": "1000 Walnut St", "city": "Des Moines", "state": "IA", "zip": "50309", "insurance_provider": "Cigna", "insurance_id": "CI-3347291", "last_visit_date": "2023-10-05", }, ] _HARD_SCHEMA: dict[str, Any] = { "primary_key": "patient_id", "expected_types": { "patient_id": "int", "first_name": "str", "last_name": "str", "dob": "date", "gender": "str", "phone": "phone", "email": "email", "address_line": "str", "city": "str", "state": "str", "zip": "str", "insurance_provider": "str", "insurance_id": "str", "last_visit_date": "date", }, "constraints": { "dob": {"format": "YYYY-MM-DD", "min": "1940-01-01", "max": "2005-12-31"}, "gender": {"allowed_values": ["M", "F"]}, "phone": {"format": r"^\d{3}-\d{3}-\d{4}$"}, "email": {"format": r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z]{2,}$"}, "state": { "allowed_values": [ "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY", ], }, "zip": {"format": r"^\d{5}$"}, "insurance_provider": { "allowed_values": ["Blue Cross", "UnitedHealth", "Aetna", "Cigna", "Medicare"], }, "insurance_id": { "format": r"^(BC|UH|AE|CI|MC)-\d{7}$", }, "last_visit_date": {"format": "YYYY-MM-DD", "min": "2023-01-01", "max": "2024-12-31"}, }, "cross_field_rules": { "zip_city_map": { "60601": "Chicago", "60602": "Chicago", "60603": "Chicago", "60625": "Chicago", "98101": "Seattle", "02101": "Boston", "97201": "Portland", "80201": "Denver", "80202": "Denver", "30301": "Atlanta", "30305": "Atlanta", "90210": "Beverly Hills", "33101": "Miami", "73301": "Austin", "10001": "New York", "94105": "San Francisco", "77027": "Houston", "77002": "Houston", "85012": "Phoenix", "37201": "Nashville", "89109": "Las Vegas", "75201": "Dallas", "55402": "Minneapolis", "28202": "Charlotte", "63101": "Saint Louis", "32801": "Orlando", "15219": "Pittsburgh", "92101": "San Diego", "78205": "San Antonio", "46204": "Indianapolis", "44114": "Cleveland", "70112": "New Orleans", "64105": "Kansas City", "53203": "Milwaukee", "22102": "Tysons", "27601": "Raleigh", "98402": "Tacoma", "98004": "Bellevue", "33607": "Tampa", "85251": "Scottsdale", "85701": "Tucson", "06103": "Hartford", "87102": "Albuquerque", "38103": "Memphis", "94612": "Oakland", "95113": "San Jose", "27401": "Greensboro", "43215": "Columbus", "40202": "Louisville", "50309": "Des Moines", }, "insurance_prefix_map": { "Blue Cross": "BC", "UnitedHealth": "UH", "Aetna": "AE", "Cigna": "CI", "Medicare": "MC", }, }, } _HARD_CORRUPTIONS: list[dict[str, Any]] = [ # ========================================================================= # ORIGINAL 6 duplicate clusters (nickname variants) creating ~10 extra rows # Duplicate patient_ids start at 51 (PAT031-PAT050 are real patients now) # ========================================================================= { "type": "duplicate_cluster", "source_indices": [0], "cluster_sizes": [2], "noise_fields": ["first_name", "phone", "address_line"], "description": "William Thompson -> 'Bill Thompson' and 'Wm Thompson'", "source_entity_id": "PAT001", "source_patient_id": 1, "duplicates": [ { "new_patient_id": 51, "changes": { "first_name": "Bill", "email": "bill.thompson@gmail.com", "phone": "312-555-0148", "address_line": "742 N Michigan Ave", "last_visit_date": "2024-01-16", }, }, { "new_patient_id": 52, "changes": { "first_name": "Wm", "email": "wm.thompson@yahoo.com", "phone": "(312) 555-0147", "insurance_id": "BC-4471583", "zip": "60602", }, }, ], }, { "type": "duplicate_cluster", "source_indices": [1], "cluster_sizes": [1], "noise_fields": ["first_name", "phone", "address_line"], "description": "Robert Martinez -> 'Bob Martinez'", "source_entity_id": "PAT002", "source_patient_id": 2, "duplicates": [ { "new_patient_id": 53, "changes": { "first_name": "Bob", "email": "bob.martinez@outlook.com", "address_line": "1501 Pike Pl", "last_visit_date": "2024-03-09", "insurance_id": "UH-8823146", }, }, ], }, { "type": "duplicate_cluster", "source_indices": [2], "cluster_sizes": [2], "noise_fields": ["first_name", "phone", "address_line"], "description": "Elizabeth O'Brien -> 'Liz OBrien' and 'Beth O Brien'", "source_entity_id": "PAT003", "source_patient_id": 3, "duplicates": [ { "new_patient_id": 54, "changes": { "first_name": "Liz", "last_name": "OBrien", "email": "liz.obrien@yahoo.com", "phone": "617-555-0382", "address_line": "88 Beacon St", }, }, { "new_patient_id": 55, "changes": { "first_name": "Beth", "last_name": "O Brien", "email": "beth.obrien@gmail.com", "dob": "1982-11-05", "insurance_id": "AE-5539274", }, }, ], }, { "type": "duplicate_cluster", "source_indices": [3], "cluster_sizes": [1], "noise_fields": ["first_name", "phone", "address_line"], "description": "Jennifer Nguyen -> 'Jen Nguyen'", "source_entity_id": "PAT004", "source_patient_id": 4, "duplicates": [ { "new_patient_id": 56, "changes": { "first_name": "Jen", "email": "jen.nguyen@gmail.com", "phone": "503-555-0463", "address_line": "2200 NW Burnside Road", "last_visit_date": "2024-06-12", }, }, ], }, { "type": "duplicate_cluster", "source_indices": [4], "cluster_sizes": [1], "noise_fields": ["first_name", "phone", "address_line"], "description": "James Kowalski -> 'Jim Kowalski'", "source_entity_id": "PAT005", "source_patient_id": 5, "duplicates": [ { "new_patient_id": 57, "changes": { "first_name": "Jim", "email": "jim.kowalski@aol.com", "address_line": "1600 Colfax Avenue", "phone": "3035550518", "insurance_id": "MC-6642018", }, }, ], }, { "type": "duplicate_cluster", "source_indices": [5], "cluster_sizes": [2], "noise_fields": ["first_name", "phone", "address_line"], "description": "Katherine Patel -> 'Kate Patel' and 'Kathy Patel'", "source_entity_id": "PAT006", "source_patient_id": 6, "duplicates": [ { "new_patient_id": 58, "changes": { "first_name": "Kate", "email": "kate.patel@hotmail.com", "phone": "404-555-0638", "city": "atlanta", "state": "ga", }, }, { "new_patient_id": 59, "changes": { "first_name": "Kathy", "email": "kathy.patel@gmail.com", "address_line": "350 Peachtree Street NE", "dob": "1988-01-27", "insurance_id": "BC-9918453", }, }, ], }, # ========================================================================= # NEW 4 duplicate clusters (TYPO-based, much harder than nicknames) # ========================================================================= { "type": "duplicate_cluster", "source_indices": [33], "cluster_sizes": [2], "noise_fields": ["first_name", "address_line"], "description": "Christopher Reeves -> 'Christpher Reeves' (dropped 'o') and 'Chistopher Reeves' (dropped 'r')", "source_entity_id": "PAT034", "source_patient_id": 34, "duplicates": [ { "new_patient_id": 60, "changes": { "first_name": "Christpher", "email": "christpher.reeves@gmail.com", "phone": "813-555-3535", "address_line": "1400 N Dale Mabry", }, }, { "new_patient_id": 61, "changes": { "first_name": "Chistopher", "email": "chistopher.reeves@yahoo.com", "address_line": "1400 North Dale Mabry Hwy", "last_visit_date": "2024-06-06", }, }, ], }, { "type": "duplicate_cluster", "source_indices": [37], "cluster_sizes": [1], "noise_fields": ["first_name"], "description": "Alexandra Petrov -> 'Alessandra Petrov' (common misspelling/Italian variant)", "source_entity_id": "PAT038", "source_patient_id": 38, "duplicates": [ { "new_patient_id": 62, "changes": { "first_name": "Alessandra", "email": "alessandra.petrov@gmail.com", "phone": "480-555-3979", "address_line": "7100 E Camelback Road", }, }, ], }, { "type": "duplicate_cluster", "source_indices": [39], "cluster_sizes": [2], "noise_fields": ["first_name", "address_line"], "description": "Patricia Hernandez -> 'Patricla Hernandez' (typo i->l) and 'Patrica Hernandez' (dropped 'i')", "source_entity_id": "PAT040", "source_patient_id": 40, "duplicates": [ { "new_patient_id": 63, "changes": { "first_name": "Patricla", "email": "patricla.hernandez@gmail.com", "phone": "520-555-4191", "address_line": "150 North Stone Ave", }, }, { "new_patient_id": 64, "changes": { "first_name": "Patrica", "email": "patrica.hernandez@yahoo.com", "address_line": "150 N Stone Avenue", "last_visit_date": "2024-01-18", }, }, ], }, { "type": "duplicate_cluster", "source_indices": [47], "cluster_sizes": [2], "noise_fields": ["first_name", "address_line"], "description": ( "Catherine Brooks -> 'Katherine Brooks' (C->K variant) and 'Catharine Brooks' (e->a variant). " "TRAP: PAT006 is Katherine Patel - agent must NOT merge these with PAT006!" ), "source_entity_id": "PAT048", "source_patient_id": 48, "duplicates": [ { "new_patient_id": 65, "changes": { "first_name": "Katherine", "email": "katherine.brooks@gmail.com", "phone": "614-555-4979", "address_line": "100 East Broad St", }, }, { "new_patient_id": 66, "changes": { "first_name": "Catharine", "email": "catharine.brooks@yahoo.com", "address_line": "100 E Broad Street", "last_visit_date": "2024-07-26", }, }, ], }, # ========================================================================= # ORIGINAL cross_field_corrupt: zip-city mismatches # ========================================================================= { "type": "cross_field_corrupt", "row_indices": [17], "description": "Zip-city mismatch: Charlotte patient 18 given Raleigh zip 27601", "target_entity_id": "PAT018", "target_patient_id": 18, "field": "zip", "original": "28202", "corrupted": "27601", }, { "type": "cross_field_corrupt", "row_indices": [23], "description": "Zip-city mismatch: Indianapolis patient 24 given Chicago zip 60601", "target_entity_id": "PAT024", "target_patient_id": 24, "field": "zip", "original": "46204", "corrupted": "60601", }, # --- NEW zip-city mismatches for new patients --- { "type": "cross_field_corrupt", "row_indices": [30], "description": "Zip-city mismatch: Tacoma patient 31 given Seattle zip 98101", "target_entity_id": "PAT031", "target_patient_id": 31, "field": "zip", "original": "98402", "corrupted": "98101", }, { "type": "cross_field_corrupt", "row_indices": [41], "description": "Zip-city mismatch: Albuquerque patient 42 given Tucson zip 85701", "target_entity_id": "PAT042", "target_patient_id": 42, "field": "zip", "original": "87102", "corrupted": "85701", }, { "type": "cross_field_corrupt", "row_indices": [48], "description": "Zip-city mismatch: Louisville patient 49 given Columbus zip 43215", "target_entity_id": "PAT049", "target_patient_id": 49, "field": "zip", "original": "40202", "corrupted": "43215", }, { "type": "cross_field_corrupt", "row_indices": [45], "description": "Zip-city mismatch: Houston patient 46 given Dallas zip 75201", "target_entity_id": "PAT046", "target_patient_id": 46, "field": "zip", "original": "77002", "corrupted": "75201", }, # ========================================================================= # ORIGINAL cross_field_corrupt: insurance ID prefix mismatches # ========================================================================= { "type": "cross_field_corrupt", "row_indices": [20], "description": "Insurance ID prefix mismatch: Cigna patient 21 given BC prefix", "target_entity_id": "PAT021", "target_patient_id": 21, "field": "insurance_id", "original": "CI-5582719", "corrupted": "BC-5582719", }, { "type": "cross_field_corrupt", "row_indices": [28], "description": "Insurance ID prefix mismatch: Blue Cross patient 29 given AE prefix", "target_entity_id": "PAT029", "target_patient_id": 29, "field": "insurance_id", "original": "BC-2215847", "corrupted": "AE-2215847", }, # --- NEW insurance prefix mismatches --- { "type": "cross_field_corrupt", "row_indices": [40], "description": "Insurance ID prefix mismatch: Medicare patient 41 given UH prefix", "target_entity_id": "PAT041", "target_patient_id": 41, "field": "insurance_id", "original": "MC-3392841", "corrupted": "UH-3392841", }, { "type": "cross_field_corrupt", "row_indices": [34], "description": "Insurance ID prefix mismatch: Aetna patient 35 given CI prefix", "target_entity_id": "PAT035", "target_patient_id": 35, "field": "insurance_id", "original": "AE-3318724", "corrupted": "CI-3318724", }, { "type": "cross_field_corrupt", "row_indices": [49], "description": "Insurance ID prefix mismatch: Cigna patient 50 given MC prefix", "target_entity_id": "PAT050", "target_patient_id": 50, "field": "insurance_id", "original": "CI-3347291", "corrupted": "MC-3347291", }, # ========================================================================= # ORIGINAL cross_field_corrupt: gender format # ========================================================================= { "type": "cross_field_corrupt", "row_indices": [11], "description": "Gender format corruption: patient 12 'F' -> 'Female'", "target_entity_id": "PAT012", "target_patient_id": 12, "field": "gender", "original": "F", "corrupted": "Female", }, # ========================================================================= # ORIGINAL impossible_date: date format corruptions # ========================================================================= { "type": "impossible_date", "targets": [{"row_idx": 11, "field": "dob", "corrupt_type": "format"}], "description": "DOB reformatted to MM/DD/YYYY for patient 12", "target_entity_id": "PAT012", "target_patient_id": 12, "field": "dob", "original": "1993-02-28", "corrupted": "02/28/1993", }, { "type": "impossible_date", "targets": [{"row_idx": 15, "field": "last_visit_date", "corrupt_type": "format"}], "description": "Last visit date reformatted to MM-DD-YYYY for patient 16", "target_entity_id": "PAT016", "target_patient_id": 16, "field": "last_visit_date", "original": "2024-08-01", "corrupted": "08-01-2024", }, { "type": "impossible_date", "targets": [{"row_idx": 22, "field": "dob", "corrupt_type": "format"}], "description": "DOB reformatted to 'Dec 1, 1955' for patient 23", "target_entity_id": "PAT023", "target_patient_id": 23, "field": "dob", "original": "1955-12-01", "corrupted": "Dec 1, 1955", }, { "type": "impossible_date", "targets": [{"row_idx": 27, "field": "last_visit_date", "corrupt_type": "format"}], "description": "Last visit date reformatted to M/D/YYYY for patient 28", "target_entity_id": "PAT028", "target_patient_id": 28, "field": "last_visit_date", "original": "2024-02-07", "corrupted": "2/7/2024", }, # --- NEW date format corruptions for new patients --- { "type": "impossible_date", "targets": [{"row_idx": 33, "field": "dob", "corrupt_type": "format"}], "description": "DOB reformatted to DD/MM/YYYY for patient 34", "target_entity_id": "PAT034", "target_patient_id": 34, "field": "dob", "original": "1976-08-25", "corrupted": "25/08/1976", }, { "type": "impossible_date", "targets": [{"row_idx": 37, "field": "last_visit_date", "corrupt_type": "format"}], "description": "Last visit date reformatted to 'Feb 20, 2024' for patient 38", "target_entity_id": "PAT038", "target_patient_id": 38, "field": "last_visit_date", "original": "2024-02-20", "corrupted": "Feb 20, 2024", }, { "type": "impossible_date", "targets": [{"row_idx": 42, "field": "dob", "corrupt_type": "format"}], "description": "DOB reformatted to MM-DD-YYYY for patient 43", "target_entity_id": "PAT043", "target_patient_id": 43, "field": "dob", "original": "1965-11-28", "corrupted": "11-28-1965", }, { "type": "impossible_date", "targets": [{"row_idx": 46, "field": "last_visit_date", "corrupt_type": "format"}], "description": "Last visit date reformatted to D/M/YYYY for patient 47", "target_entity_id": "PAT047", "target_patient_id": 47, "field": "last_visit_date", "original": "2024-03-11", "corrupted": "11/3/2024", }, { "type": "impossible_date", "targets": [{"row_idx": 49, "field": "dob", "corrupt_type": "format"}], "description": "DOB reformatted to 'Jun 21, 2004' for patient 50", "target_entity_id": "PAT050", "target_patient_id": 50, "field": "dob", "original": "2004-06-21", "corrupted": "Jun 21, 2004", }, # --- NEW subtle DOB off-by-one corruptions (very hard to detect) --- { "type": "impossible_date", "targets": [{"row_idx": 31, "field": "dob", "corrupt_type": "off_by_one"}], "description": "DOB day off by 1: patient 32 '1988-07-20' -> '1988-07-21'", "target_entity_id": "PAT032", "target_patient_id": 32, "field": "dob", "original": "1988-07-20", "corrupted": "1988-07-21", }, { "type": "impossible_date", "targets": [{"row_idx": 43, "field": "dob", "corrupt_type": "off_by_one"}], "description": "DOB day off by 1: patient 44 '1981-05-09' -> '1981-05-10'", "target_entity_id": "PAT044", "target_patient_id": 44, "field": "dob", "original": "1981-05-09", "corrupted": "1981-05-10", }, # ========================================================================= # ORIGINAL insurance_id_mismatch: missing/corrupted + gender mismatches # ========================================================================= { "type": "insurance_id_mismatch", "row_indices": [26], "description": "Patient 27 insurance_id set to empty string", "target_entity_id": "PAT027", "target_patient_id": 27, "field": "insurance_id", "original": "CI-4431957", "corrupted": "", }, { "type": "insurance_id_mismatch", "row_indices": [15], "description": "Patient 16 gender 'M' -> 'male' (format mismatch)", "target_entity_id": "PAT016", "target_patient_id": 16, "field": "gender", "original": "M", "corrupted": "male", }, { "type": "insurance_id_mismatch", "row_indices": [22], "description": "Patient 23 gender 'M' -> 'm' (case mismatch)", "target_entity_id": "PAT023", "target_patient_id": 23, "field": "gender", "original": "M", "corrupted": "m", }, # ========================================================================= # ORIGINAL null_inject_contextual: missing values and whitespace # ========================================================================= { "type": "null_inject_contextual", "targets": [{"row_idx": 13, "field": "email"}], "description": "Email set to None for patient 14 (Ashley Richardson)", "target_entity_id": "PAT014", "target_patient_id": 14, "field": "email", "original": "ashley.richardson@gmail.com", "corrupted": None, }, { "type": "null_inject_contextual", "targets": [{"row_idx": 21, "field": "phone"}], "description": "Phone set to None for patient 22 (Jordan Mitchell)", "target_entity_id": "PAT022", "target_patient_id": 22, "field": "phone", "original": "619-555-2246", "corrupted": None, }, { "type": "null_inject_contextual", "targets": [{"row_idx": 29, "field": "address_line"}], "description": "Address set to None for patient 30 (Laura Scott)", "target_entity_id": "PAT030", "target_patient_id": 30, "field": "address_line", "original": "400 Fayetteville St", "corrupted": None, }, # --- NEW null injections for new patients --- { "type": "null_inject_contextual", "targets": [{"row_idx": 40, "field": "phone"}], "description": "Phone set to None for patient 41 (Gregory Adams)", "target_entity_id": "PAT041", "target_patient_id": 41, "field": "phone", "original": "860-555-4201", "corrupted": None, }, { "type": "null_inject_contextual", "targets": [{"row_idx": 47, "field": "email"}], "description": "Email set to None for patient 48 (Catherine Brooks)", "target_entity_id": "PAT048", "target_patient_id": 48, "field": "email", "original": "catherine.brooks@gmail.com", "corrupted": None, }, { "type": "null_inject_contextual", "targets": [{"row_idx": 38, "field": "insurance_id"}], "description": "Insurance ID set to None for patient 39 (Casey Morgan)", "target_entity_id": "PAT039", "target_patient_id": 39, "field": "insurance_id", "original": "MC-7724316", "corrupted": None, }, # --- ORIGINAL whitespace corruptions --- { "type": "null_inject_contextual", "targets": [{"row_idx": 10, "field": "first_name"}], "description": "First name padded with spaces for patient 11 (David Chen)", "target_entity_id": "PAT011", "target_patient_id": 11, "field": "first_name", "original": "David", "corrupted": " David ", }, { "type": "null_inject_contextual", "targets": [{"row_idx": 14, "field": "email"}], "description": "Email with extra space for patient 15 (Patricia Lee)", "target_entity_id": "PAT015", "target_patient_id": 15, "field": "email", "original": "patricia.lee@outlook.com", "corrupted": "patricia.lee @outlook.com", }, # --- NEW whitespace corruptions --- { "type": "null_inject_contextual", "targets": [{"row_idx": 44, "field": "last_name"}], "description": "Last name with trailing space for patient 45 (James Lee)", "target_entity_id": "PAT045", "target_patient_id": 45, "field": "last_name", "original": "Lee", "corrupted": "Lee ", }, { "type": "null_inject_contextual", "targets": [{"row_idx": 39, "field": "first_name"}], "description": "First name with leading tab for patient 40 (Patricia Hernandez)", "target_entity_id": "PAT040", "target_patient_id": 40, "field": "first_name", "original": "Patricia", "corrupted": "\tPatricia", }, { "type": "null_inject_contextual", "targets": [{"row_idx": 48, "field": "city"}], "description": "City with trailing whitespace for patient 49 (Raymond Foster)", "target_entity_id": "PAT049", "target_patient_id": 49, "field": "city", "original": "Louisville", "corrupted": "Louisville ", }, # ========================================================================= # ORIGINAL false_positive_duplicate: 2 pairs (already in ground truth) # ========================================================================= { "type": "false_positive_duplicate", "pairs": [[6, 7]], "description": ( "Two different 'Michael Davis' patients (PAT007 and PAT008) share the same " "name but have different DOB, location, insurance. Must NOT be merged." ), "entity_ids": ["PAT007", "PAT008"], "patient_ids": [7, 8], "distinguishing_fields": ["dob", "city", "state", "zip", "email", "insurance_provider", "insurance_id"], }, { "type": "false_positive_duplicate", "pairs": [[8, 9]], "description": ( "Two different 'Maria Garcia' patients (PAT009 and PAT010) share the same " "name but have different DOB, location, insurance. Must NOT be merged." ), "entity_ids": ["PAT009", "PAT010"], "patient_ids": [9, 10], "distinguishing_fields": ["dob", "city", "state", "zip", "email", "insurance_provider", "insurance_id"], }, # --- NEW false_positive_duplicate: 3 harder pairs --- { "type": "false_positive_duplicate", "pairs": [[31, 32]], "description": ( "Two different 'David Kim' patients (PAT032 and PAT033) - SAME insurance " "provider (UnitedHealth), SAME state (WA), DOBs only 2 years apart. " "Distinguishable by different insurance IDs, different cities, different DOB. " "Must NOT be merged." ), "entity_ids": ["PAT032", "PAT033"], "patient_ids": [32, 33], "distinguishing_fields": ["dob", "city", "zip", "email", "insurance_id", "phone"], }, { "type": "false_positive_duplicate", "pairs": [[35, 36]], "description": ( "Two different 'Sarah Williams' patients (PAT036 and PAT037) - SAME city " "(Chicago), SAME state (IL). DOBs 5 years apart, different insurance. " "Must NOT be merged." ), "entity_ids": ["PAT036", "PAT037"], "patient_ids": [36, 37], "distinguishing_fields": ["dob", "zip", "address_line", "email", "insurance_provider", "insurance_id", "phone"], }, { "type": "false_positive_duplicate", "pairs": [[43, 44]], "description": ( "Two different 'James Lee' patients (PAT044 and PAT045) - DOBs only 1 year " "apart, SAME state (CA), SAME insurance provider (Aetna). Distinguishable " "by different insurance IDs, different cities. Must NOT be merged." ), "entity_ids": ["PAT044", "PAT045"], "patient_ids": [44, 45], "distinguishing_fields": ["dob", "city", "zip", "email", "insurance_id", "phone"], }, # ========================================================================= # ORIGINAL address_variation: phone format variations # ========================================================================= { "type": "address_variation", "row_indices": [10], "description": "Phone format changed to (XXX) XXX-XXXX for patient 11", "target_entity_id": "PAT011", "target_patient_id": 11, "field": "phone", "original": "415-555-1134", "corrupted": "(415) 555-1134", }, { "type": "address_variation", "row_indices": [14], "description": "Phone format changed to XXX.XXX.XXXX for patient 15", "target_entity_id": "PAT015", "target_patient_id": 15, "field": "phone", "original": "702-555-1578", "corrupted": "702.555.1578", }, { "type": "address_variation", "row_indices": [19], "description": "Phone stripped of dashes for patient 20", "target_entity_id": "PAT020", "target_patient_id": 20, "field": "phone", "original": "407-555-2024", "corrupted": "4075552024", }, { "type": "address_variation", "row_indices": [24], "description": "Phone with country code prefix for patient 25", "target_entity_id": "PAT025", "target_patient_id": 25, "field": "phone", "original": "216-555-2579", "corrupted": "+1-216-555-2579", }, # --- NEW phone format variations --- { "type": "address_variation", "row_indices": [33], "description": "Phone format changed to (XXX) XXX-XXXX for patient 34", "target_entity_id": "PAT034", "target_patient_id": 34, "field": "phone", "original": "813-555-3534", "corrupted": "(813) 555-3534", }, { "type": "address_variation", "row_indices": [41], "description": "Phone format changed to XXX.XXX.XXXX for patient 42", "target_entity_id": "PAT042", "target_patient_id": 42, "field": "phone", "original": "505-555-4312", "corrupted": "505.555.4312", }, { "type": "address_variation", "row_indices": [45], "description": "Phone stripped of dashes for patient 46", "target_entity_id": "PAT046", "target_patient_id": 46, "field": "phone", "original": "832-555-4756", "corrupted": "8325554756", }, { "type": "address_variation", "row_indices": [48], "description": "Phone with country code prefix for patient 49", "target_entity_id": "PAT049", "target_patient_id": 49, "field": "phone", "original": "502-555-5089", "corrupted": "+1-502-555-5089", }, # ========================================================================= # ORIGINAL case corruptions # ========================================================================= { "type": "case_corrupt", "targets": [{"row_idx": 12, "field": "first_name"}], "description": "First name uppercased for patient 13 (Thomas -> THOMAS)", "target_entity_id": "PAT013", "target_patient_id": 13, "original": "Thomas", "corrupted": "THOMAS", }, { "type": "case_corrupt", "targets": [{"row_idx": 16, "field": "city"}], "description": "City lowercased for patient 17 (Minneapolis -> minneapolis)", "target_entity_id": "PAT017", "target_patient_id": 17, "original": "Minneapolis", "corrupted": "minneapolis", }, { "type": "case_corrupt", "targets": [{"row_idx": 18, "field": "state"}], "description": "State lowercased for patient 19 (MO -> mo)", "target_entity_id": "PAT019", "target_patient_id": 19, "original": "MO", "corrupted": "mo", }, { "type": "case_corrupt", "targets": [{"row_idx": 25, "field": "last_name"}], "description": "Last name uppercased for patient 26 (Young -> YOUNG)", "target_entity_id": "PAT026", "target_patient_id": 26, "original": "Young", "corrupted": "YOUNG", }, # --- NEW case corruptions --- { "type": "case_corrupt", "targets": [{"row_idx": 37, "field": "last_name"}], "description": "Last name lowercased for patient 38 (Petrov -> petrov)", "target_entity_id": "PAT038", "target_patient_id": 38, "original": "Petrov", "corrupted": "petrov", }, { "type": "case_corrupt", "targets": [{"row_idx": 42, "field": "first_name"}], "description": "First name uppercased for patient 43 (Dana -> DANA)", "target_entity_id": "PAT043", "target_patient_id": 43, "original": "Dana", "corrupted": "DANA", }, { "type": "case_corrupt", "targets": [{"row_idx": 47, "field": "city"}], "description": "City lowercased for patient 48 (Columbus -> columbus)", "target_entity_id": "PAT048", "target_patient_id": 48, "original": "Columbus", "corrupted": "columbus", }, { "type": "case_corrupt", "targets": [{"row_idx": 49, "field": "state"}], "description": "State lowercased for patient 50 (IA -> ia)", "target_entity_id": "PAT050", "target_patient_id": 50, "original": "IA", "corrupted": "ia", }, # ========================================================================= # ORIGINAL address whitespace corruptions # ========================================================================= { "type": "address_variation", "row_indices": [19], "description": "Extra spaces in address for patient 20 (525 S Orange Ave -> 525 S Orange Ave)", "target_entity_id": "PAT020", "target_patient_id": 20, "field": "address_line", "original": "525 S Orange Ave", "corrupted": "525 S Orange Ave", }, { "type": "address_variation", "row_indices": [25], "description": "Extra space in city for patient 26 (New Orleans -> New Orleans)", "target_entity_id": "PAT026", "target_patient_id": 26, "field": "city", "original": "New Orleans", "corrupted": "New Orleans", }, # ========================================================================= # NEW email domain typo corruptions (subtle) # ========================================================================= { "type": "address_variation", "row_indices": [34], "description": "Email domain typo for patient 35 (yahoo.com -> yaho.com)", "target_entity_id": "PAT035", "target_patient_id": 35, "field": "email", "original": "avery.simmons94@yahoo.com", "corrupted": "avery.simmons94@yaho.com", }, { "type": "address_variation", "row_indices": [41], "description": "Email domain typo for patient 42 (gmail.com -> gmial.com)", "target_entity_id": "PAT042", "target_patient_id": 42, "field": "email", "original": "samantha.rivera97@gmail.com", "corrupted": "samantha.rivera97@gmial.com", }, { "type": "address_variation", "row_indices": [48], "description": "Email domain typo for patient 49 (aol.com -> aol.cm)", "target_entity_id": "PAT049", "target_patient_id": 49, "field": "email", "original": "raymond.foster50@aol.com", "corrupted": "raymond.foster50@aol.cm", }, # ========================================================================= # NEW state full-name instead of abbreviation corruptions # ========================================================================= { "type": "cross_field_corrupt", "row_indices": [33], "description": "State full name instead of abbreviation for patient 34 (FL -> Florida)", "target_entity_id": "PAT034", "target_patient_id": 34, "field": "state", "original": "FL", "corrupted": "Florida", }, { "type": "cross_field_corrupt", "row_indices": [43], "description": "State full name instead of abbreviation for patient 44 (CA -> California)", "target_entity_id": "PAT044", "target_patient_id": 44, "field": "state", "original": "CA", "corrupted": "California", }, { "type": "cross_field_corrupt", "row_indices": [40], "description": "State full name instead of abbreviation for patient 41 (CT -> Connecticut)", "target_entity_id": "PAT041", "target_patient_id": 41, "field": "state", "original": "CT", "corrupted": "Connecticut", }, # ========================================================================= # ORIGINAL + NEW valid_unusual: gender/name traps (NOT errors) # ========================================================================= { "type": "valid_unusual", "description": "Ashley (M) - historically male name, VALID. Do NOT correct.", "entity_id": "PAT014", "patient_id": 14, "first_name": "Ashley", "gender": "M", "note": "Ashley was historically a male name; this is valid.", }, { "type": "valid_unusual", "description": "Jordan (F) - gender-neutral name, VALID. Do NOT correct.", "entity_id": "PAT022", "patient_id": 22, "first_name": "Jordan", "gender": "F", "note": "Jordan is gender-neutral; valid for female patients.", }, { "type": "valid_unusual", "description": "Shannon (M) - historically male Irish name, VALID. Do NOT correct.", "entity_id": "PAT027", "patient_id": 27, "first_name": "Shannon", "gender": "M", "note": "Shannon was historically a male Irish name; this is valid.", }, { "type": "valid_unusual", "description": "Morgan (M) - gender-neutral name, VALID. Do NOT correct.", "entity_id": "PAT031", "patient_id": 31, "first_name": "Morgan", "gender": "M", "note": "Morgan is gender-neutral; valid for male patients.", }, { "type": "valid_unusual", "description": "Avery (M) - gender-neutral name, VALID. Do NOT correct.", "entity_id": "PAT035", "patient_id": 35, "first_name": "Avery", "gender": "M", "note": "Avery is gender-neutral; valid for male patients.", }, { "type": "valid_unusual", "description": "Casey (F) - gender-neutral but often male, VALID. Do NOT correct.", "entity_id": "PAT039", "patient_id": 39, "first_name": "Casey", "gender": "F", "note": "Casey is gender-neutral; valid for female patients.", }, { "type": "valid_unusual", "description": "Dana (M) - gender-neutral name, VALID. Do NOT correct.", "entity_id": "PAT043", "patient_id": 43, "first_name": "Dana", "gender": "M", "note": "Dana is gender-neutral; valid for male patients.", }, { "type": "valid_unusual", "description": "Robin (F) - gender-neutral name, VALID. Do NOT correct.", "entity_id": "PAT047", "patient_id": 47, "first_name": "Robin", "gender": "F", "note": "Robin is gender-neutral; valid for female patients.", }, ] _HARD_PROBES: list[UtilityProbe] = [ UtilityProbe( name="unique_patient_count", description="Count of unique patients after deduplication", query_fn="unique_count", params={"column": "patient_id"}, expected_result=50, ), UtilityProbe( name="insurance_provider_distribution", description="Count of patients per insurance provider", query_fn="distribution", params={"column": "insurance_provider"}, expected_result={ "Blue Cross": 10, "UnitedHealth": 10, "Aetna": 9, "Cigna": 9, "Medicare": 12, }, ), UtilityProbe( name="patients_per_city", description="Count of patients per city", query_fn="distribution", params={"column": "city"}, expected_result={ "Chicago": 3, "Seattle": 2, "Boston": 1, "Portland": 1, "Denver": 2, "Atlanta": 2, "Beverly Hills": 1, "Miami": 1, "Austin": 1, "New York": 1, "San Francisco": 1, "Houston": 2, "Phoenix": 1, "Nashville": 1, "Las Vegas": 1, "Dallas": 1, "Minneapolis": 1, "Charlotte": 1, "Saint Louis": 1, "Orlando": 1, "Pittsburgh": 1, "San Diego": 1, "San Antonio": 1, "Indianapolis": 1, "Cleveland": 1, "New Orleans": 1, "Kansas City": 1, "Milwaukee": 1, "Tysons": 1, "Raleigh": 1, "Tacoma": 1, "Bellevue": 1, "Tampa": 1, "Scottsdale": 1, "Tucson": 1, "Hartford": 1, "Albuquerque": 1, "Memphis": 1, "Oakland": 1, "San Jose": 1, "Greensboro": 1, "Columbus": 1, "Louisville": 1, "Des Moines": 1, }, ), UtilityProbe( name="avg_age_by_gender", description="Average age (2026 - birth year) grouped by gender", query_fn="avg_by_group", params={"value_col": "dob", "group_col": "gender", "transform": "year_age_2026"}, expected_result={"F": 44.80, "M": 51.00}, ), ] _HARD_TASK = Task( task_id="hard_patients", name="Multi-Source Patient Registry", difficulty="hard", description=( "Clean and deduplicate a multi-source patient registry with 50 unique " "patients (14 columns). The dirty dataset contains ~70 rows due to 10 " "duplicate clusters (6 nickname-based, 4 typo-based including a " "Catherine/Katherine cross-patient trap). Includes 5 false-positive " "duplicate pairs (same name, genuinely different people with subtle " "overlaps in location, insurance, or DOB) that must NOT be merged, " "8 gender/name traps that are valid and must NOT be corrected, " "cross-field validation rules (zip-city map, insurance prefix map), " "date and phone format inconsistencies, DOB off-by-one corruptions, " "email domain typos, state full-name expansions, null injections, " "whitespace corruptions, and insurance ID mismatches." ), ground_truth=_HARD_GROUND_TRUTH, schema=_HARD_SCHEMA, corruptions=_HARD_CORRUPTIONS, max_steps=80, utility_probes=_HARD_PROBES, # Genuinely ambiguous cells where the correct value is debatable. # Gender/name traps: unusual but valid gender assignments. # An ideal agent should escalate these rather than "fix" them. ambiguous_cells=[ ("PAT014", "gender"), # Ashley (M) - historically male name ("PAT022", "gender"), # Jordan (F) - gender-neutral name ("PAT027", "gender"), # Shannon (M) - historically male Irish name ("PAT031", "gender"), # Morgan (M) - gender-neutral name ("PAT035", "gender"), # Avery (M) - gender-neutral name ("PAT039", "gender"), # Casey (F) - gender-neutral but often male ("PAT043", "gender"), # Dana (M) - gender-neutral name ("PAT047", "gender"), # Robin (F) - gender-neutral name ("PAT003", "last_name"), # O'Brien vs OBrien vs O Brien - apostrophe handling debatable ], ) register_task(_HARD_TASK)