# dataclean-env / server / tasks.py
# Anuj424614's picture
# Upload folder using huggingface_hub
# 8345e43 verified
"""
Task definitions for the Data Cleaning OpenEnv environment.
Defines three tasks at easy, medium, and hard difficulty levels, each with
complete ground truth data, schema definitions, and corruption specifications.
SELF-CONTAINED: All ground truth data is inlined. No external imports for data.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
# ============================================================================
# Utility Probe dataclass
# ============================================================================
@dataclass
class UtilityProbe:
    """Aggregate-level check run against a cleaned dataset.

    A probe describes a downstream analytics query whose result on the
    cleaned data should match the ground truth, so cleaning is judged on
    aggregate outcomes and not only on cell-level correctness.

    Attributes:
        name: Probe identifier, e.g. ``"unique_customer_count"``.
        description: Human-readable summary, e.g. "Count of unique
            customers after dedup".
        query_fn: Name of the probe function to run.
        params: Parameters for the probe, e.g. ``{"column": "email"}``.
            Each instance gets its own empty dict when omitted.
        expected_result: The correct answer from the ground truth.
    """

    name: str
    description: str
    query_fn: str
    params: dict[str, Any] = field(default_factory=dict)
    expected_result: Any = None
# ============================================================================
# Task dataclass
# ============================================================================
@dataclass
class Task:
    """Complete definition of one data-cleaning task.

    Attributes:
        task_id: Key under which the task is stored in the registry.
        name: Short display name.
        difficulty: Difficulty label (this module uses "easy", "medium"
            and "hard").
        description: Free-text summary of the task.
        ground_truth: Clean reference rows, one dict per record.
        schema: Schema definition; the tasks in this module provide
            "primary_key", "expected_types" and "constraints" entries.
        corruptions: Specifications of the corruptions applied to the
            ground truth, one dict each.
        max_steps: Step limit for the task.
            NOTE(review): presumably a per-episode budget - confirm
            against the environment that consumes it.
        utility_probes: Aggregate checks to run on the cleaned data;
            empty when omitted.
        ambiguous_cells: Pairs of strings identifying ambiguous cells;
            empty when omitted.
            NOTE(review): the meaning of each pair element is not shown
            in this module - confirm against the grader.
    """

    task_id: str
    name: str
    difficulty: str
    description: str
    ground_truth: list[dict[str, Any]]
    schema: dict[str, Any]
    corruptions: list[dict[str, Any]]
    max_steps: int
    utility_probes: list[UtilityProbe] = field(default_factory=list)
    ambiguous_cells: list[tuple[str, str]] = field(default_factory=list)
# ============================================================================
# Task registry
# ============================================================================
# Module-wide lookup table: task_id -> Task.
_TASK_REGISTRY: dict[str, Task] = {}


def register_task(task: Task) -> None:
    """Store *task* in the global registry under its ``task_id``.

    A task registered earlier with the same ``task_id`` is replaced.
    """
    _TASK_REGISTRY.update({task.task_id: task})
def get_task(task_id: str) -> Task:
    """Look up a registered task.

    Args:
        task_id: Identifier the task was registered under.

    Returns:
        The matching ``Task``.

    Raises:
        KeyError: If no task with that id is registered; the message
            lists the ids that are available.
    """
    try:
        return _TASK_REGISTRY[task_id]
    except KeyError:
        known_ids = list(_TASK_REGISTRY)
        # "from None" keeps the traceback limited to this error, as when
        # the membership was checked up front.
        raise KeyError(
            f"Task '{task_id}' not found. Available: {known_ids}"
        ) from None
def _summarize_task(task: Task) -> dict[str, Any]:
    """Build the metadata dict describing a single task."""
    summary: dict[str, Any] = {
        "task_id": task.task_id,
        "name": task.name,
        "difficulty": task.difficulty,
        "description": task.description,
    }
    # Sizes are reported instead of the (potentially large) payloads.
    summary["num_ground_truth_rows"] = len(task.ground_truth)
    summary["num_corruptions"] = len(task.corruptions)
    summary["max_steps"] = task.max_steps
    summary["num_utility_probes"] = len(task.utility_probes)
    return summary


def list_tasks() -> list[dict[str, Any]]:
    """Return one metadata dict per registered task, in registry order."""
    return [_summarize_task(task) for task in _TASK_REGISTRY.values()]
# ############################################################################
# EASY TASK: Customer Contact Cleanup
# ############################################################################
# Column order shared by every clean contact record below.
_EASY_CONTACT_COLUMNS: tuple[str, ...] = (
    "_entity_id",
    "id",
    "first_name",
    "last_name",
    "email",
    "phone",
    "signup_date",
    "state",
)

# One tuple per contact, positionally aligned with _EASY_CONTACT_COLUMNS.
_EASY_CONTACT_ROWS: tuple[tuple[Any, ...], ...] = (
    ("CONTACT001", 1, "Alice", "Morgan", "alice.morgan@example.com", "(555) 123-4567", "2022-01-15", "CA"),
    ("CONTACT002", 2, "Brian", "Cho", "brian.cho@example.com", "(555) 234-5678", "2022-03-22", "NY"),
    ("CONTACT003", 3, "Carmen", "Reyes", "carmen.reyes@example.com", "(555) 345-6789", "2022-06-10", "TX"),
    ("CONTACT004", 4, "David", "Novak", "david.novak@example.com", "(555) 456-7890", "2022-08-05", "FL"),
    ("CONTACT005", 5, "Elena", "Petrova", "elena.petrova@example.com", "(555) 567-8901", "2023-01-18", "WA"),
    ("CONTACT006", 6, "Frank", "Oduya", "frank.oduya@example.com", "(555) 678-9012", "2023-04-30", "IL"),
    ("CONTACT007", 7, "Grace", "Kim", "grace.kim@example.com", "(555) 789-0123", "2023-07-14", "OR"),
    ("CONTACT008", 8, "Hassan", "Ali", "hassan.ali@example.com", "(555) 890-1234", "2023-10-02", "CO"),
)

# Clean reference rows for the easy task: one dict per contact, keyed by
# column name in the order given above.
_EASY_GROUND_TRUTH: list[dict[str, Any]] = [
    dict(zip(_EASY_CONTACT_COLUMNS, contact)) for contact in _EASY_CONTACT_ROWS
]
# Schema for the easy contact list, assembled section by section.
_EASY_SCHEMA: dict[str, Any] = {"primary_key": "id"}

# Logical type expected for each column.
_EASY_SCHEMA["expected_types"] = {
    "id": "int",
    "first_name": "str",
    "last_name": "str",
    "email": "email",
    "phone": "phone",
    "signup_date": "date",
    "state": "str",
}

# Per-column validity rules: regex/format patterns and the set of
# two-letter state codes accepted in the "state" column.
_EASY_SCHEMA["constraints"] = {
    "email": {"format": r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z]{2,}$"},
    "phone": {"format": r"^\(\d{3}\) \d{3}-\d{4}$"},
    "signup_date": {"format": "YYYY-MM-DD"},
    "state": {
        "allowed_values": (
            "AL AK AZ AR CA CO CT DE FL GA "
            "HI ID IL IN IA KS KY LA ME MD "
            "MA MI MN MS MO MT NE NV NH NJ "
            "NM NY NC ND OH OK OR PA RI SC "
            "SD TN TX UT VT VA WA WV WI WY"
        ).split(),
    },
}
def _easy_cell_corruption(
    kind: str,
    *,
    row_idx: int,
    column: str,
    description: str,
    entity_id: str,
    original: Any,
    corrupted: Any,
) -> dict[str, Any]:
    """Build a corruption record that addresses one cell via "targets"."""
    return {
        "type": kind,
        "targets": [{"row_idx": row_idx, "field": column}],
        "description": description,
        "target_entity_id": entity_id,
        "original": original,
        "corrupted": corrupted,
    }


def _easy_column_corruption(
    kind: str,
    *,
    column: str,
    row_idx: int,
    description: str,
    entity_id: str,
    original: Any,
    corrupted: Any,
) -> dict[str, Any]:
    """Build a corruption record that addresses a column via "row_indices"."""
    return {
        "type": kind,
        "column": column,
        "row_indices": [row_idx],
        "description": description,
        "target_entity_id": entity_id,
        "original": original,
        "corrupted": corrupted,
    }


# Corruptions applied to the easy ground truth. Row indices are 0-based
# positions in _EASY_GROUND_TRUTH.
_EASY_CORRUPTIONS: list[dict[str, Any]] = [
    # Two typos (adjacent characters swapped).
    _easy_cell_corruption(
        "char_swap",
        row_idx=1,
        column="first_name",
        description="Swapped letters in first_name for contact 2 (Brian -> Biran)",
        entity_id="CONTACT002",
        original="Brian",
        corrupted="Biran",
    ),
    _easy_cell_corruption(
        "char_swap",
        row_idx=5,
        column="last_name",
        description="Transposed letters in last name for contact 6 (Oduya -> Oduay)",
        entity_id="CONTACT006",
        original="Oduya",
        corrupted="Oduay",
    ),
    # Two dates written in a non-ISO format.
    _easy_column_corruption(
        "format_randomize",
        column="signup_date",
        row_idx=0,
        description="Date reformatted to MM/DD/YYYY for contact 1",
        entity_id="CONTACT001",
        original="2022-01-15",
        corrupted="01/15/2022",
    ),
    _easy_column_corruption(
        "format_randomize",
        column="signup_date",
        row_idx=4,
        description="Date reformatted to DD-Mon-YYYY for contact 5",
        entity_id="CONTACT005",
        original="2023-01-18",
        corrupted="18-Jan-2023",
    ),
    # One missing value.
    _easy_cell_corruption(
        "null_inject",
        row_idx=3,
        column="email",
        description="Email set to None for contact 4",
        entity_id="CONTACT004",
        original="david.novak@example.com",
        corrupted=None,
    ),
    # One phone number with its punctuation removed.
    _easy_column_corruption(
        "format_strip",
        column="phone",
        row_idx=2,
        description="Phone stripped of formatting for contact 3",
        entity_id="CONTACT003",
        original="(555) 345-6789",
        corrupted="5553456789",
    ),
    # Two letter-case corruptions.
    _easy_cell_corruption(
        "case_corrupt",
        row_idx=6,
        column="first_name",
        description="First name uppercased for contact 7",
        entity_id="CONTACT007",
        original="Grace",
        corrupted="GRACE",
    ),
    _easy_cell_corruption(
        "case_corrupt",
        row_idx=7,
        column="state",
        description="State lowercased for contact 8",
        entity_id="CONTACT008",
        original="CO",
        corrupted="co",
    ),
]
# Every contact has a distinct email, so a fully cleaned table has 8.
_EASY_UNIQUE_EMAIL_PROBE = UtilityProbe(
    name="unique_email_count",
    description="Count of unique non-null email addresses after cleaning",
    query_fn="unique_count",
    params={"column": "email"},
    expected_result=8,
)

# Each of the eight contacts lives in a different state.
_EASY_STATE_DISTRIBUTION_PROBE = UtilityProbe(
    name="state_distribution",
    description="Count of contacts per state",
    query_fn="distribution",
    params={"column": "state"},
    expected_result=dict.fromkeys(
        ("CA", "NY", "TX", "FL", "WA", "IL", "OR", "CO"), 1
    ),
)

# Aggregate checks for the easy task, in evaluation order.
_EASY_PROBES: list[UtilityProbe] = [
    _EASY_UNIQUE_EMAIL_PROBE,
    _EASY_STATE_DISTRIBUTION_PROBE,
]
# Easy task definition. The description counts columns the same way the
# schema does: 7 columns in total, with "id" being one of them (the
# internal "_entity_id" key is not counted).
_EASY_TASK = Task(
    task_id="easy_contacts",
    name="Customer Contact Cleanup",
    difficulty="easy",
    description=(
        "Clean a small customer contact list with 8 rows and 7 columns "
        "(including id). "
        "Corruptions include character swaps (typos), randomised date formats, "
        "a null injection, a stripped phone number, and case corruptions. "
        "There are NO duplicate rows in this task."
    ),
    ground_truth=_EASY_GROUND_TRUTH,
    schema=_EASY_SCHEMA,
    corruptions=_EASY_CORRUPTIONS,
    max_steps=30,
    utility_probes=_EASY_PROBES,
)
# Make the task discoverable through get_task() / list_tasks().
register_task(_EASY_TASK)
# ############################################################################
# MEDIUM TASK: Employee Records Reconciliation
# ############################################################################
# Column order shared by every clean employee record below.
_MEDIUM_EMPLOYEE_COLUMNS: tuple[str, ...] = (
    "_entity_id",
    "emp_id",
    "first_name",
    "last_name",
    "email",
    "phone",
    "department",
    "hire_date",
    "salary",
    "office_city",
    "office_zip",
)

# One tuple per employee, positionally aligned with
# _MEDIUM_EMPLOYEE_COLUMNS. Salaries are floats; zip codes are strings.
_MEDIUM_EMPLOYEE_ROWS: tuple[tuple[Any, ...], ...] = (
    ("EMP001", 1001, "Robert", "Chen", "r.chen@company.com", "(415) 555-0142", "Engineering", "2019-03-15", 145000.0, "San Francisco", "94105"),
    ("EMP002", 1002, "Jennifer", "Okafor", "j.okafor@company.com", "(212) 555-0387", "Marketing", "2020-07-22", 112000.0, "New York", "10013"),
    ("EMP003", 1003, "William", "Hernandez", "w.hernandez@company.com", "(512) 555-0219", "Sales", "2021-01-10", 98000.0, "Austin", "78701"),
    ("EMP004", 1004, "Priya", "Patel", "p.patel@company.com", "(206) 555-0463", "Engineering", "2019-11-04", 162000.0, "Seattle", "98101"),
    ("EMP005", 1005, "Michael", "Thompson", "m.thompson@company.com", "(312) 555-0578", "Finance", "2020-04-18", 131000.0, "Chicago", "60601"),
    ("EMP006", 1006, "Aisha", "Washington", "a.washington@company.com", "(303) 555-0691", "HR", "2022-02-28", 95000.0, "Denver", "80202"),
    ("EMP007", 1007, "Carlos", "Rivera", "c.rivera@company.com", "(415) 555-0734", "Engineering", "2021-06-14", 138000.0, "San Francisco", "94107"),
    ("EMP008", 1008, "Sarah", "Kim", "s.kim@company.com", "(212) 555-0856", "Marketing", "2023-01-09", 87000.0, "New York", "10001"),
    ("EMP009", 1009, "James", "O'Brien", "j.obrien@company.com", "(512) 555-0923", "Operations", "2019-08-21", 105000.0, "Austin", "78702"),
    ("EMP010", 1010, "Wei", "Zhang", "w.zhang@company.com", "(206) 555-0147", "Engineering", "2020-10-05", 155000.0, "Seattle", "98104"),
    ("EMP011", 1011, "Maria", "Gonzalez", "m.gonzalez@company.com", "(303) 555-0268", "Sales", "2022-05-16", 91000.0, "Denver", "80203"),
    ("EMP012", 1012, "David", "Nakamura", "d.nakamura@company.com", "(312) 555-0395", "Finance", "2021-09-01", 118000.0, "Chicago", "60602"),
    ("EMP013", 1013, "Fatima", "Al-Rashidi", "f.al-rashidi@company.com", "(415) 555-0512", "Engineering", "2023-03-20", 128000.0, "San Francisco", "94103"),
    ("EMP014", 1014, "Marcus", "Johnson", "m.johnson@company.com", "(212) 555-0643", "Operations", "2020-12-07", 102000.0, "New York", "10016"),
    ("EMP015", 1015, "Elena", "Petrov", "e.petrov@company.com", "(512) 555-0178", "HR", "2024-01-15", 88000.0, "Austin", "78703"),
    ("EMP016", 1016, "Andre", "Williams", "a.williams@company.com", "(206) 555-0834", "Sales", "2022-08-30", 96000.0, "Seattle", "98102"),
    ("EMP017", 1017, "Lisa", "Svensson", "l.svensson@company.com", "(312) 555-0956", "Finance", "2023-07-11", 108000.0, "Chicago", "60603"),
)

# Clean reference rows for the medium task: one dict per employee, keyed
# by column name in the order given above.
_MEDIUM_GROUND_TRUTH: list[dict[str, Any]] = [
    dict(zip(_MEDIUM_EMPLOYEE_COLUMNS, employee))
    for employee in _MEDIUM_EMPLOYEE_ROWS
]
# Schema for the employee records, assembled section by section.
_MEDIUM_SCHEMA: dict[str, Any] = {"primary_key": "emp_id"}

# Logical type expected for each column.
_MEDIUM_SCHEMA["expected_types"] = {
    "emp_id": "int",
    "first_name": "str",
    "last_name": "str",
    "email": "email",
    "phone": "phone",
    "department": "str",
    "hire_date": "date",
    "salary": "float",
    "office_city": "str",
    "office_zip": "str",
}

# Per-column validity rules: format patterns, the salary range, and the
# closed vocabularies for department and office city.
_MEDIUM_SCHEMA["constraints"] = {
    "email": {"format": r"^[a-z]\.[a-z\-]+@company\.com$"},
    "phone": {"format": r"^\(\d{3}\) \d{3}-\d{4}$"},
    "hire_date": {"format": "YYYY-MM-DD"},
    "salary": {"min": 55000.0, "max": 185000.0},
    "department": {
        "allowed_values": [
            "Engineering",
            "Marketing",
            "Sales",
            "HR",
            "Finance",
            "Operations",
        ],
    },
    "office_city": {
        "allowed_values": [
            "San Francisco",
            "New York",
            "Chicago",
            "Austin",
            "Seattle",
            "Denver",
        ],
    },
    "office_zip": {"format": r"^\d{5}$"},
}
# Corruptions applied to the medium ground truth. Row indices are 0-based
# positions in _MEDIUM_GROUND_TRUTH. For duplicate rows, "noise_fields"
# names exactly the columns in which "corrupted_row" differs from its
# source record.
_MEDIUM_CORRUPTIONS: list[dict[str, Any]] = [
    # --- duplicate_with_noise: 3 rows (nickname + one other noisy field) ---
    {
        "type": "duplicate_with_noise",
        "source_indices": [0],
        "noise_fields": ["first_name", "phone"],
        "description": "Duplicate of EMP001 (Robert Chen) as 'Bob Chen' with transposed phone digits",
        "source_entity_id": "EMP001",
        "source_emp_id": 1001,
        "corrupted_row": {
            "emp_id": 1001,
            "first_name": "Bob",
            "last_name": "Chen",
            "email": "r.chen@company.com",
            "phone": "(415) 555-0124",
            "department": "Engineering",
            "hire_date": "2019-03-15",
            "salary": 145000.00,
            "office_city": "San Francisco",
            "office_zip": "94105",
        },
    },
    {
        "type": "duplicate_with_noise",
        "source_indices": [2],
        # The phone is identical to the source; the salary is what differs.
        "noise_fields": ["first_name", "salary"],
        "description": "Duplicate of EMP003 (William Hernandez) as 'Will Hernandez' with slightly different salary",
        "source_entity_id": "EMP003",
        "source_emp_id": 1003,
        "corrupted_row": {
            "emp_id": 1003,
            "first_name": "Will",
            "last_name": "Hernandez",
            "email": "w.hernandez@company.com",
            "phone": "(512) 555-0219",
            "department": "Sales",
            "hire_date": "2021-01-10",
            "salary": 98500.00,
            "office_city": "Austin",
            "office_zip": "78701",
        },
    },
    {
        "type": "duplicate_with_noise",
        "source_indices": [4],
        # The phone is identical to the source; the hire date is what differs.
        "noise_fields": ["first_name", "hire_date"],
        "description": "Duplicate of EMP005 (Michael Thompson) as 'Mike Thompson' with wrong date format",
        "source_entity_id": "EMP005",
        "source_emp_id": 1005,
        "corrupted_row": {
            "emp_id": 1005,
            "first_name": "Mike",
            "last_name": "Thompson",
            "email": "m.thompson@company.com",
            "phone": "(312) 555-0578",
            "department": "Finance",
            "hire_date": "04/18/2020",
            "salary": 131000.00,
            "office_city": "Chicago",
            "office_zip": "60601",
        },
    },
    # --- format_randomize dates: 4 ---
    {
        "type": "format_randomize",
        "column": "hire_date",
        "row_indices": [10],
        "description": "Hire date in DD-MM-YYYY for EMP011 (Maria Gonzalez)",
        "target_entity_id": "EMP011",
        "target_emp_id": 1011,
        "field": "hire_date",
        "original": "2022-05-16",
        "corrupted": "16-05-2022",
    },
    {
        "type": "format_randomize",
        "column": "hire_date",
        "row_indices": [14],
        "description": "Hire date in MM/DD/YYYY for EMP015 (Elena Petrov)",
        "target_entity_id": "EMP015",
        "target_emp_id": 1015,
        "field": "hire_date",
        "original": "2024-01-15",
        "corrupted": "01/15/2024",
    },
    {
        "type": "format_randomize",
        "column": "hire_date",
        "row_indices": [7],
        "description": "Hire date in Mon DD, YYYY for EMP008 (Sarah Kim)",
        "target_entity_id": "EMP008",
        "target_emp_id": 1008,
        "field": "hire_date",
        "original": "2023-01-09",
        "corrupted": "Jan 09, 2023",
    },
    {
        "type": "format_randomize",
        "column": "hire_date",
        "row_indices": [16],
        "description": "Hire date in DD/MM/YYYY for EMP017 (Lisa Svensson)",
        "target_entity_id": "EMP017",
        "target_emp_id": 1017,
        "field": "hire_date",
        "original": "2023-07-11",
        "corrupted": "11/07/2023",
    },
    # --- null_inject: 3 ---
    {
        "type": "null_inject",
        "targets": [{"row_idx": 5, "field": "phone"}],
        "description": "Phone set to None for EMP006 (Aisha Washington)",
        "target_entity_id": "EMP006",
        "target_emp_id": 1006,
        "original": "(303) 555-0691",
        "corrupted": None,
    },
    {
        "type": "null_inject",
        "targets": [{"row_idx": 11, "field": "department"}],
        "description": "Department set to None for EMP012 (David Nakamura)",
        "target_entity_id": "EMP012",
        "target_emp_id": 1012,
        "original": "Finance",
        "corrupted": None,
    },
    {
        "type": "null_inject",
        "targets": [{"row_idx": 15, "field": "email"}],
        "description": "Email set to None for EMP016 (Andre Williams)",
        "target_entity_id": "EMP016",
        "target_emp_id": 1016,
        "original": "a.williams@company.com",
        "corrupted": None,
    },
    # --- value_variation: 2 misspellings (one department, one last name) ---
    {
        "type": "value_variation",
        "column": "department",
        "mapping": {"Engineering": ["Engneering"]},
        "description": "Department 'Engineering' misspelled as 'Engneering' for EMP007 (Carlos Rivera)",
        "target_entity_id": "EMP007",
        "target_emp_id": 1007,
        "field": "department",
        "original": "Engineering",
        "corrupted": "Engneering",
    },
    {
        "type": "value_variation",
        "column": "last_name",
        "mapping": {"Okafor": ["Okafro"]},
        "description": "Last name 'Okafor' transposed as 'Okafro' for EMP002 (Jennifer Okafor)",
        "target_entity_id": "EMP002",
        "target_emp_id": 1002,
        "field": "last_name",
        "original": "Okafor",
        "corrupted": "Okafro",
    },
    # --- state_expand: 2 ---
    # NOTE(review): both entries carry the "state_expand" type although
    # they describe a city misspelling and a stray space in a department
    # name. The type is left untouched because consumers may dispatch on
    # it - confirm whether it should be renamed.
    {
        "type": "state_expand",
        "row_indices": [12],
        "description": "Office city 'San Francisco' misspelled as 'San Fransisco' for EMP013 (Fatima Al-Rashidi)",
        "target_entity_id": "EMP013",
        "target_emp_id": 1013,
        "field": "office_city",
        "original": "San Francisco",
        "corrupted": "San Fransisco",
    },
    {
        "type": "state_expand",
        "row_indices": [13],
        "description": "Department 'Operations' corrupted with extra space as 'Operat ions' for EMP014 (Marcus Johnson)",
        "target_entity_id": "EMP014",
        "target_emp_id": 1014,
        "field": "department",
        "original": "Operations",
        "corrupted": "Operat ions",
    },
    # --- format_randomize salary: 1 (sign flipped, below the schema minimum) ---
    {
        "type": "format_randomize",
        "column": "salary",
        "row_indices": [7],
        "description": "Salary turned negative for EMP008 (Sarah Kim) - invalid value",
        "target_entity_id": "EMP008",
        "target_emp_id": 1008,
        "field": "salary",
        "original": 87000.00,
        "corrupted": -87000.00,
    },
]
# After removing the three near-duplicates, 17 distinct emp_id values remain.
_MEDIUM_UNIQUE_EMPLOYEE_PROBE = UtilityProbe(
    name="unique_employee_count",
    description="Count of unique employees after deduplication",
    query_fn="unique_count",
    params={"column": "emp_id"},
    expected_result=17,
)

# Expected per-department mean salary over the 17 ground-truth rows.
_MEDIUM_DEPARTMENT_SALARY_PROBE = UtilityProbe(
    name="department_salary_avg",
    description="Average salary per department",
    query_fn="avg_by_group",
    params={"value_col": "salary", "group_col": "department"},
    expected_result={
        "Engineering": 145600.00,
        "Finance": 119000.00,
        "HR": 91500.00,
        "Marketing": 99500.00,
        "Operations": 103500.00,
        "Sales": 95000.00,
    },
)

# Five ground-truth employees belong to Engineering.
_MEDIUM_ENGINEERING_HEADCOUNT_PROBE = UtilityProbe(
    name="engineering_headcount",
    description="Number of employees in the Engineering department",
    query_fn="count_where",
    params={"column": "department", "value": "Engineering"},
    expected_result=5,
)

# Aggregate checks for the medium task, in evaluation order.
_MEDIUM_PROBES: list[UtilityProbe] = [
    _MEDIUM_UNIQUE_EMPLOYEE_PROBE,
    _MEDIUM_DEPARTMENT_SALARY_PROBE,
    _MEDIUM_ENGINEERING_HEADCOUNT_PROBE,
]
# Summary text for the medium task: 17 clean records plus 3 injected
# near-duplicates give the 20 dirty rows mentioned at the end.
_MEDIUM_TASK_DESCRIPTION = (
    "Reconcile a set of 17 unique employee records (10 columns) "
    "that have been corrupted with 3 near-duplicate rows "
    "(nickname variants), date format inconsistencies, "
    "null injections, department name typos, city misspellings, "
    "and an invalid salary. "
    "The dirty dataset has 20 rows."
)

# Medium task definition, assembled from the constants defined above.
_MEDIUM_TASK = Task(
    task_id="medium_employees",
    name="Employee Records Reconciliation",
    difficulty="medium",
    description=_MEDIUM_TASK_DESCRIPTION,
    max_steps=60,
    ground_truth=_MEDIUM_GROUND_TRUTH,
    corruptions=_MEDIUM_CORRUPTIONS,
    schema=_MEDIUM_SCHEMA,
    utility_probes=_MEDIUM_PROBES,
)
# Make the task discoverable through get_task() / list_tasks().
register_task(_MEDIUM_TASK)
# ############################################################################
# HARD TASK: Multi-Source Patient Registry
# ############################################################################
_HARD_GROUND_TRUTH: list[dict[str, Any]] = [
# --- DUPLICATE CLUSTER CANDIDATES (patients 1-6) ---
{
"_entity_id": "PAT001",
"patient_id": 1,
"first_name": "William",
"last_name": "Thompson",
"dob": "1958-03-14",
"gender": "M",
"phone": "312-555-0147",
"email": "william.thompson@gmail.com",
"address_line": "742 North Michigan Ave",
"city": "Chicago",
"state": "IL",
"zip": "60601",
"insurance_provider": "Blue Cross",
"insurance_id": "BC-4471583",
"last_visit_date": "2024-01-15",
},
{
"_entity_id": "PAT002",
"patient_id": 2,
"first_name": "Robert",
"last_name": "Martinez",
"dob": "1975-07-22",
"gender": "M",
"phone": "206-555-0293",
"email": "robert.martinez@outlook.com",
"address_line": "1501 Pike Place",
"city": "Seattle",
"state": "WA",
"zip": "98101",
"insurance_provider": "UnitedHealth",
"insurance_id": "UH-8823146",
"last_visit_date": "2024-03-08",
},
{
"_entity_id": "PAT003",
"patient_id": 3,
"first_name": "Elizabeth",
"last_name": "O'Brien",
"dob": "1982-11-05",
"gender": "F",
"phone": "617-555-0381",
"email": "elizabeth.obrien@yahoo.com",
"address_line": "88 Beacon Street",
"city": "Boston",
"state": "MA",
"zip": "02101",
"insurance_provider": "Aetna",
"insurance_id": "AE-5539274",
"last_visit_date": "2023-09-20",
},
{
"_entity_id": "PAT004",
"patient_id": 4,
"first_name": "Jennifer",
"last_name": "Nguyen",
"dob": "1990-04-18",
"gender": "F",
"phone": "503-555-0462",
"email": "jennifer.nguyen@gmail.com",
"address_line": "2200 NW Burnside Rd",
"city": "Portland",
"state": "OR",
"zip": "97201",
"insurance_provider": "Cigna",
"insurance_id": "CI-3317892",
"last_visit_date": "2024-06-11",
},
{
"_entity_id": "PAT005",
"patient_id": 5,
"first_name": "James",
"last_name": "Kowalski",
"dob": "1967-09-30",
"gender": "M",
"phone": "303-555-0518",
"email": "james.kowalski@aol.com",
"address_line": "1600 Colfax Ave",
"city": "Denver",
"state": "CO",
"zip": "80201",
"insurance_provider": "Medicare",
"insurance_id": "MC-6642018",
"last_visit_date": "2024-02-28",
},
{
"_entity_id": "PAT006",
"patient_id": 6,
"first_name": "Katherine",
"last_name": "Patel",
"dob": "1988-01-27",
"gender": "F",
"phone": "404-555-0637",
"email": "katherine.patel@hotmail.com",
"address_line": "350 Peachtree St NE",
"city": "Atlanta",
"state": "GA",
"zip": "30301",
"insurance_provider": "Blue Cross",
"insurance_id": "BC-9918453",
"last_visit_date": "2023-12-05",
},
# --- FALSE POSITIVE PAIR 1: Two different "Michael Davis" ---
{
"_entity_id": "PAT007",
"patient_id": 7,
"first_name": "Michael",
"last_name": "Davis",
"dob": "1972-06-10",
"gender": "M",
"phone": "310-555-0744",
"email": "michael.davis72@gmail.com",
"address_line": "456 Rodeo Drive",
"city": "Beverly Hills",
"state": "CA",
"zip": "90210",
"insurance_provider": "UnitedHealth",
"insurance_id": "UH-1157329",
"last_visit_date": "2024-04-22",
},
{
"_entity_id": "PAT008",
"patient_id": 8,
"first_name": "Michael",
"last_name": "Davis",
"dob": "1995-02-14",
"gender": "M",
"phone": "305-555-0856",
"email": "mdavis95@yahoo.com",
"address_line": "900 Biscayne Blvd",
"city": "Miami",
"state": "FL",
"zip": "33101",
"insurance_provider": "Aetna",
"insurance_id": "AE-7743201",
"last_visit_date": "2023-11-30",
},
# --- FALSE POSITIVE PAIR 2: Two different "Maria Garcia" ---
{
"_entity_id": "PAT009",
"patient_id": 9,
"first_name": "Maria",
"last_name": "Garcia",
"dob": "1960-08-03",
"gender": "F",
"phone": "512-555-0912",
"email": "maria.garcia60@gmail.com",
"address_line": "1100 Congress Ave",
"city": "Austin",
"state": "TX",
"zip": "73301",
"insurance_provider": "Medicare",
"insurance_id": "MC-2204857",
"last_visit_date": "2024-05-14",
},
{
"_entity_id": "PAT010",
"patient_id": 10,
"first_name": "Maria",
"last_name": "Garcia",
"dob": "1985-12-19",
"gender": "F",
"phone": "212-555-1023",
"email": "mgarcia.nyc@outlook.com",
"address_line": "250 West 34th St",
"city": "New York",
"state": "NY",
"zip": "10001",
"insurance_provider": "Cigna",
"insurance_id": "CI-8856134",
"last_visit_date": "2023-08-27",
},
# --- REMAINING UNIQUE PATIENTS (11-30) ---
{
"_entity_id": "PAT011",
"patient_id": 11,
"first_name": "David",
"last_name": "Chen",
"dob": "1979-05-11",
"gender": "M",
"phone": "415-555-1134",
"email": "david.chen@gmail.com",
"address_line": "580 Market Street",
"city": "San Francisco",
"state": "CA",
"zip": "94105",
"insurance_provider": "Blue Cross",
"insurance_id": "BC-3364721",
"last_visit_date": "2024-07-03",
},
{
"_entity_id": "PAT012",
"patient_id": 12,
"first_name": "Sarah",
"last_name": "Johnson",
"dob": "1993-02-28",
"gender": "F",
"phone": "713-555-1245",
"email": "sarah.johnson93@yahoo.com",
"address_line": "4200 Westheimer Rd",
"city": "Houston",
"state": "TX",
"zip": "77027",
"insurance_provider": "UnitedHealth",
"insurance_id": "UH-5578934",
"last_visit_date": "2023-10-18",
},
{
"_entity_id": "PAT013",
"patient_id": 13,
"first_name": "Thomas",
"last_name": "Wilson",
"dob": "1945-10-07",
"gender": "M",
"phone": "602-555-1356",
"email": "tom.wilson45@aol.com",
"address_line": "3300 N Central Ave",
"city": "Phoenix",
"state": "AZ",
"zip": "85012",
"insurance_provider": "Medicare",
"insurance_id": "MC-1192746",
"last_visit_date": "2024-01-09",
},
# --- STATISTICALLY UNUSUAL BUT VALID: Male named Ashley ---
{
"_entity_id": "PAT014",
"patient_id": 14,
"first_name": "Ashley",
"last_name": "Richardson",
"dob": "1970-12-22",
"gender": "M",
"phone": "615-555-1467",
"email": "ashley.richardson@gmail.com",
"address_line": "210 Broadway",
"city": "Nashville",
"state": "TN",
"zip": "37201",
"insurance_provider": "Aetna",
"insurance_id": "AE-4426183",
"last_visit_date": "2024-03-19",
},
{
"_entity_id": "PAT015",
"patient_id": 15,
"first_name": "Patricia",
"last_name": "Lee",
"dob": "1952-04-15",
"gender": "F",
"phone": "702-555-1578",
"email": "patricia.lee@outlook.com",
"address_line": "3600 Las Vegas Blvd S",
"city": "Las Vegas",
"state": "NV",
"zip": "89109",
"insurance_provider": "Medicare",
"insurance_id": "MC-8835492",
"last_visit_date": "2023-07-25",
},
{
"_entity_id": "PAT016",
"patient_id": 16,
"first_name": "Daniel",
"last_name": "Brown",
"dob": "1998-08-09",
"gender": "M",
"phone": "214-555-1689",
"email": "daniel.brown98@gmail.com",
"address_line": "1700 Pacific Ave",
"city": "Dallas",
"state": "TX",
"zip": "75201",
"insurance_provider": "Cigna",
"insurance_id": "CI-2293847",
"last_visit_date": "2024-08-01",
},
{
"_entity_id": "PAT017",
"patient_id": 17,
"first_name": "Linda",
"last_name": "Anderson",
"dob": "1963-01-30",
"gender": "F",
"phone": "952-555-1791",
"email": "linda.anderson@yahoo.com",
"address_line": "800 Nicollet Mall",
"city": "Minneapolis",
"state": "MN",
"zip": "55402",
"insurance_provider": "Blue Cross",
"insurance_id": "BC-6671835",
"last_visit_date": "2024-05-30",
},
{
"_entity_id": "PAT018",
"patient_id": 18,
"first_name": "Christopher",
"last_name": "Taylor",
"dob": "1984-06-14",
"gender": "M",
"phone": "704-555-1802",
"email": "chris.taylor84@gmail.com",
"address_line": "401 N Tryon St",
"city": "Charlotte",
"state": "NC",
"zip": "28202",
"insurance_provider": "UnitedHealth",
"insurance_id": "UH-3349128",
"last_visit_date": "2023-12-12",
},
{
"_entity_id": "PAT019",
"patient_id": 19,
"first_name": "Nancy",
"last_name": "White",
"dob": "1948-09-21",
"gender": "F",
"phone": "314-555-1913",
"email": "nancy.white48@hotmail.com",
"address_line": "100 Washington Ave",
"city": "Saint Louis",
"state": "MO",
"zip": "63101",
"insurance_provider": "Medicare",
"insurance_id": "MC-4457631",
"last_visit_date": "2024-02-14",
},
{
"_entity_id": "PAT020",
"patient_id": 20,
"first_name": "Kevin",
"last_name": "Harris",
"dob": "2001-03-05",
"gender": "M",
"phone": "407-555-2024",
"email": "kevin.harris01@gmail.com",
"address_line": "525 S Orange Ave",
"city": "Orlando",
"state": "FL",
"zip": "32801",
"insurance_provider": "Aetna",
"insurance_id": "AE-9914567",
"last_visit_date": "2024-06-28",
},
{
"_entity_id": "PAT021",
"patient_id": 21,
"first_name": "Susan",
"last_name": "Clark",
"dob": "1977-11-13",
"gender": "F",
"phone": "412-555-2135",
"email": "susan.clark77@outlook.com",
"address_line": "600 Grant St",
"city": "Pittsburgh",
"state": "PA",
"zip": "15219",
"insurance_provider": "Cigna",
"insurance_id": "CI-5582719",
"last_visit_date": "2023-09-05",
},
# --- STATISTICALLY UNUSUAL BUT VALID: Female named Jordan ---
{
"_entity_id": "PAT022",
"patient_id": 22,
"first_name": "Jordan",
"last_name": "Mitchell",
"dob": "1996-07-08",
"gender": "F",
"phone": "619-555-2246",
"email": "jordan.mitchell96@gmail.com",
"address_line": "750 B Street",
"city": "San Diego",
"state": "CA",
"zip": "92101",
"insurance_provider": "Blue Cross",
"insurance_id": "BC-7728364",
"last_visit_date": "2024-04-10",
},
{
"_entity_id": "PAT023",
"patient_id": 23,
"first_name": "Richard",
"last_name": "Lopez",
"dob": "1955-12-01",
"gender": "M",
"phone": "210-555-2357",
"email": "richard.lopez55@aol.com",
"address_line": "300 Alamo Plaza",
"city": "San Antonio",
"state": "TX",
"zip": "78205",
"insurance_provider": "Medicare",
"insurance_id": "MC-3346285",
"last_visit_date": "2024-01-22",
},
{
"_entity_id": "PAT024",
"patient_id": 24,
"first_name": "Angela",
"last_name": "Robinson",
"dob": "1989-05-26",
"gender": "F",
"phone": "317-555-2468",
"email": "angela.robinson@yahoo.com",
"address_line": "200 E Washington St",
"city": "Indianapolis",
"state": "IN",
"zip": "46204",
"insurance_provider": "UnitedHealth",
"insurance_id": "UH-6693241",
"last_visit_date": "2023-11-08",
},
{
"_entity_id": "PAT025",
"patient_id": 25,
"first_name": "Steven",
"last_name": "Walker",
"dob": "1941-07-19",
"gender": "M",
"phone": "216-555-2579",
"email": "steven.walker@gmail.com",
"address_line": "1100 Superior Ave",
"city": "Cleveland",
"state": "OH",
"zip": "44114",
"insurance_provider": "Medicare",
"insurance_id": "MC-1128574",
"last_visit_date": "2024-03-05",
},
{
"_entity_id": "PAT026",
"patient_id": 26,
"first_name": "Michelle",
"last_name": "Young",
"dob": "2003-10-31",
"gender": "F",
"phone": "504-555-2681",
"email": "michelle.young03@outlook.com",
"address_line": "800 Canal St",
"city": "New Orleans",
"state": "LA",
"zip": "70112",
"insurance_provider": "Aetna",
"insurance_id": "AE-8847392",
"last_visit_date": "2024-07-19",
},
# --- STATISTICALLY UNUSUAL BUT VALID: Male named Shannon ---
{
"_entity_id": "PAT027",
"patient_id": 27,
"first_name": "Shannon",
"last_name": "Burke",
"dob": "1974-02-08",
"gender": "M",
"phone": "816-555-2792",
"email": "shannon.burke@gmail.com",
"address_line": "1200 Main St",
"city": "Kansas City",
"state": "MO",
"zip": "64105",
"insurance_provider": "Cigna",
"insurance_id": "CI-4431957",
"last_visit_date": "2023-08-14",
},
{
"_entity_id": "PAT028",
"patient_id": 28,
"first_name": "Dorothy",
"last_name": "Hall",
"dob": "1943-06-17",
"gender": "F",
"phone": "414-555-2903",
"email": "dorothy.hall43@yahoo.com",
"address_line": "500 W Wisconsin Ave",
"city": "Milwaukee",
"state": "WI",
"zip": "53203",
"insurance_provider": "Medicare",
"insurance_id": "MC-5563418",
"last_visit_date": "2024-02-07",
},
{
"_entity_id": "PAT029",
"patient_id": 29,
"first_name": "Brian",
"last_name": "Kim",
"dob": "1992-09-14",
"gender": "M",
"phone": "571-555-3014",
"email": "brian.kim92@gmail.com",
"address_line": "1750 Tysons Blvd",
"city": "Tysons",
"state": "VA",
"zip": "22102",
"insurance_provider": "Blue Cross",
"insurance_id": "BC-2215847",
"last_visit_date": "2024-05-02",
},
{
"_entity_id": "PAT030",
"patient_id": 30,
"first_name": "Laura",
"last_name": "Scott",
"dob": "1999-01-23",
"gender": "F",
"phone": "919-555-3125",
"email": "laura.scott99@hotmail.com",
"address_line": "400 Fayetteville St",
"city": "Raleigh",
"state": "NC",
"zip": "27601",
"insurance_provider": "UnitedHealth",
"insurance_id": "UH-7741926",
"last_visit_date": "2023-10-31",
},
# --- NEW PATIENTS (PAT031-PAT050) ---
# --- GENDER-NEUTRAL NAME TRAP: Morgan (M) ---
{
"_entity_id": "PAT031",
"patient_id": 31,
"first_name": "Morgan",
"last_name": "Fletcher",
"dob": "1986-04-12",
"gender": "M",
"phone": "253-555-3201",
"email": "morgan.fletcher86@gmail.com",
"address_line": "1900 Pacific Ave",
"city": "Tacoma",
"state": "WA",
"zip": "98402",
"insurance_provider": "Cigna",
"insurance_id": "CI-7712548",
"last_visit_date": "2024-03-22",
},
# --- FALSE POSITIVE PAIR 3: Two "David Kim" (PAT032 & PAT033) ---
{
"_entity_id": "PAT032",
"patient_id": 32,
"first_name": "David",
"last_name": "Kim",
"dob": "1988-07-20",
"gender": "M",
"phone": "425-555-3312",
"email": "david.kim88@gmail.com",
"address_line": "300 108th Ave NE",
"city": "Bellevue",
"state": "WA",
"zip": "98004",
"insurance_provider": "UnitedHealth",
"insurance_id": "UH-4423891",
"last_visit_date": "2024-05-18",
},
{
"_entity_id": "PAT033",
"patient_id": 33,
"first_name": "David",
"last_name": "Kim",
"dob": "1990-11-03",
"gender": "M",
"phone": "206-555-3423",
"email": "dkim90@outlook.com",
"address_line": "815 Pine St",
"city": "Seattle",
"state": "WA",
"zip": "98101",
"insurance_provider": "UnitedHealth",
"insurance_id": "UH-6638172",
"last_visit_date": "2024-01-29",
},
# --- DUPLICATE CLUSTER: Christopher -> typo variants (PAT034) ---
{
"_entity_id": "PAT034",
"patient_id": 34,
"first_name": "Christopher",
"last_name": "Reeves",
"dob": "1976-08-25",
"gender": "M",
"phone": "813-555-3534",
"email": "christopher.reeves@gmail.com",
"address_line": "1400 N Dale Mabry Hwy",
"city": "Tampa",
"state": "FL",
"zip": "33607",
"insurance_provider": "Blue Cross",
"insurance_id": "BC-5547832",
"last_visit_date": "2024-06-05",
},
# --- GENDER-NEUTRAL NAME TRAP: Avery (M) ---
{
"_entity_id": "PAT035",
"patient_id": 35,
"first_name": "Avery",
"last_name": "Simmons",
"dob": "1994-02-17",
"gender": "M",
"phone": "678-555-3645",
"email": "avery.simmons94@yahoo.com",
"address_line": "2500 Peachtree Rd NW",
"city": "Atlanta",
"state": "GA",
"zip": "30305",
"insurance_provider": "Aetna",
"insurance_id": "AE-3318724",
"last_visit_date": "2023-11-15",
},
# --- FALSE POSITIVE PAIR 4: Two "Sarah Williams" (PAT036 & PAT037) ---
{
"_entity_id": "PAT036",
"patient_id": 36,
"first_name": "Sarah",
"last_name": "Williams",
"dob": "1983-09-14",
"gender": "F",
"phone": "312-555-3756",
"email": "sarah.williams83@gmail.com",
"address_line": "55 E Monroe St",
"city": "Chicago",
"state": "IL",
"zip": "60603",
"insurance_provider": "Blue Cross",
"insurance_id": "BC-8834291",
"last_visit_date": "2024-04-01",
},
{
"_entity_id": "PAT037",
"patient_id": 37,
"first_name": "Sarah",
"last_name": "Williams",
"dob": "1978-03-22",
"gender": "F",
"phone": "773-555-3867",
"email": "swilliams78@yahoo.com",
"address_line": "4700 N Lincoln Ave",
"city": "Chicago",
"state": "IL",
"zip": "60625",
"insurance_provider": "Cigna",
"insurance_id": "CI-2247163",
"last_visit_date": "2023-07-19",
},
# --- DUPLICATE CLUSTER: Alexandra -> misspelling variant (PAT038) ---
{
"_entity_id": "PAT038",
"patient_id": 38,
"first_name": "Alexandra",
"last_name": "Petrov",
"dob": "1991-06-08",
"gender": "F",
"phone": "480-555-3978",
"email": "alexandra.petrov@gmail.com",
"address_line": "7100 E Camelback Rd",
"city": "Scottsdale",
"state": "AZ",
"zip": "85251",
"insurance_provider": "UnitedHealth",
"insurance_id": "UH-9917453",
"last_visit_date": "2024-02-20",
},
# --- GENDER-NEUTRAL NAME TRAP: Casey (F) ---
{
"_entity_id": "PAT039",
"patient_id": 39,
"first_name": "Casey",
"last_name": "Morgan",
"dob": "2000-01-30",
"gender": "F",
"phone": "720-555-4089",
"email": "casey.morgan00@outlook.com",
"address_line": "1600 Stout St",
"city": "Denver",
"state": "CO",
"zip": "80202",
"insurance_provider": "Medicare",
"insurance_id": "MC-7724316",
"last_visit_date": "2024-08-10",
},
# --- DUPLICATE CLUSTER: Patricia -> typo variants (PAT040) ---
{
"_entity_id": "PAT040",
"patient_id": 40,
"first_name": "Patricia",
"last_name": "Hernandez",
"dob": "1969-12-04",
"gender": "F",
"phone": "520-555-4190",
"email": "patricia.hernandez@gmail.com",
"address_line": "150 N Stone Ave",
"city": "Tucson",
"state": "AZ",
"zip": "85701",
"insurance_provider": "Aetna",
"insurance_id": "AE-6641258",
"last_visit_date": "2024-01-17",
},
{
"_entity_id": "PAT041",
"patient_id": 41,
"first_name": "Gregory",
"last_name": "Adams",
"dob": "1957-03-19",
"gender": "M",
"phone": "860-555-4201",
"email": "gregory.adams57@aol.com",
"address_line": "250 Constitution Plaza",
"city": "Hartford",
"state": "CT",
"zip": "06103",
"insurance_provider": "Medicare",
"insurance_id": "MC-3392841",
"last_visit_date": "2024-07-08",
},
{
"_entity_id": "PAT042",
"patient_id": 42,
"first_name": "Samantha",
"last_name": "Rivera",
"dob": "1997-08-22",
"gender": "F",
"phone": "505-555-4312",
"email": "samantha.rivera97@gmail.com",
"address_line": "400 Central Ave SW",
"city": "Albuquerque",
"state": "NM",
"zip": "87102",
"insurance_provider": "Cigna",
"insurance_id": "CI-8812347",
"last_visit_date": "2023-12-29",
},
# --- GENDER-NEUTRAL NAME TRAP: Dana (M) ---
{
"_entity_id": "PAT043",
"patient_id": 43,
"first_name": "Dana",
"last_name": "Crawford",
"dob": "1965-11-28",
"gender": "M",
"phone": "901-555-4423",
"email": "dana.crawford65@hotmail.com",
"address_line": "203 Beale St",
"city": "Memphis",
"state": "TN",
"zip": "38103",
"insurance_provider": "Blue Cross",
"insurance_id": "BC-1148273",
"last_visit_date": "2024-04-15",
},
# --- FALSE POSITIVE PAIR 5: Two "James Lee" (PAT044 & PAT045) ---
{
"_entity_id": "PAT044",
"patient_id": 44,
"first_name": "James",
"last_name": "Lee",
"dob": "1981-05-09",
"gender": "M",
"phone": "510-555-4534",
"email": "james.lee81@gmail.com",
"address_line": "1901 Harrison St",
"city": "Oakland",
"state": "CA",
"zip": "94612",
"insurance_provider": "Aetna",
"insurance_id": "AE-5523918",
"last_visit_date": "2024-06-22",
},
{
"_entity_id": "PAT045",
"patient_id": 45,
"first_name": "James",
"last_name": "Lee",
"dob": "1982-10-31",
"gender": "M",
"phone": "408-555-4645",
"email": "jlee82@yahoo.com",
"address_line": "225 W Santa Clara St",
"city": "San Jose",
"state": "CA",
"zip": "95113",
"insurance_provider": "Aetna",
"insurance_id": "AE-7739482",
"last_visit_date": "2023-09-14",
},
{
"_entity_id": "PAT046",
"patient_id": 46,
"first_name": "Theresa",
"last_name": "Nguyen",
"dob": "1973-07-14",
"gender": "F",
"phone": "832-555-4756",
"email": "theresa.nguyen73@gmail.com",
"address_line": "2100 Travis St",
"city": "Houston",
"state": "TX",
"zip": "77002",
"insurance_provider": "UnitedHealth",
"insurance_id": "UH-2248637",
"last_visit_date": "2024-05-07",
},
# --- GENDER-NEUTRAL NAME TRAP: Robin (F) ---
{
"_entity_id": "PAT047",
"patient_id": 47,
"first_name": "Robin",
"last_name": "Blackwell",
"dob": "1980-05-16",
"gender": "F",
"phone": "336-555-4867",
"email": "robin.blackwell@outlook.com",
"address_line": "300 N Greene St",
"city": "Greensboro",
"state": "NC",
"zip": "27401",
"insurance_provider": "Medicare",
"insurance_id": "MC-4458923",
"last_visit_date": "2024-03-11",
},
# --- DUPLICATE CLUSTER: Catherine -> spelling variants (PAT048) ---
# NOTE: PAT006 is Katherine Patel (different person!). Agent must NOT
# merge PAT048's duplicates with PAT006.
{
"_entity_id": "PAT048",
"patient_id": 48,
"first_name": "Catherine",
"last_name": "Brooks",
"dob": "1987-09-03",
"gender": "F",
"phone": "614-555-4978",
"email": "catherine.brooks@gmail.com",
"address_line": "100 E Broad St",
"city": "Columbus",
"state": "OH",
"zip": "43215",
"insurance_provider": "Blue Cross",
"insurance_id": "BC-6693147",
"last_visit_date": "2024-07-25",
},
{
"_entity_id": "PAT049",
"patient_id": 49,
"first_name": "Raymond",
"last_name": "Foster",
"dob": "1950-02-11",
"gender": "M",
"phone": "502-555-5089",
"email": "raymond.foster50@aol.com",
"address_line": "700 W Main St",
"city": "Louisville",
"state": "KY",
"zip": "40202",
"insurance_provider": "Medicare",
"insurance_id": "MC-8817294",
"last_visit_date": "2024-02-19",
},
{
"_entity_id": "PAT050",
"patient_id": 50,
"first_name": "Heather",
"last_name": "Sanchez",
"dob": "2004-06-21",
"gender": "F",
"phone": "515-555-5190",
"email": "heather.sanchez04@gmail.com",
"address_line": "1000 Walnut St",
"city": "Des Moines",
"state": "IA",
"zip": "50309",
"insurance_provider": "Cigna",
"insurance_id": "CI-3347291",
"last_visit_date": "2023-10-05",
},
]
# Schema for the patient-records dataset (the "_HARD" task data): primary key,
# a type label per column, per-column value constraints, and lookup tables for
# rules that span two columns.
_HARD_SCHEMA: dict[str, Any] = {
    "primary_key": "patient_id",
    # Column name -> type label.
    "expected_types": {
        "patient_id": "int",
        "first_name": "str",
        "last_name": "str",
        "dob": "date",
        "gender": "str",
        "phone": "phone",
        "email": "email",
        "address_line": "str",
        "city": "str",
        "state": "str",
        "zip": "str",
        "insurance_provider": "str",
        "insurance_id": "str",
        "last_visit_date": "date",
    },
    # Column name -> constraint spec (format pattern, allowed values, and/or
    # inclusive-looking min/max bounds expressed as ISO date strings).
    "constraints": {
        "dob": {
            "format": "YYYY-MM-DD",
            "min": "1940-01-01",
            "max": "2005-12-31",
        },
        "gender": {"allowed_values": ["M", "F"]},
        "phone": {"format": r"^\d{3}-\d{3}-\d{4}$"},
        "email": {"format": r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z]{2,}$"},
        "state": {
            # The 50 two-letter US state codes, ten per row.
            "allowed_values": (
                "AL AK AZ AR CA CO CT DE FL GA "
                "HI ID IL IN IA KS KY LA ME MD "
                "MA MI MN MS MO MT NE NV NH NJ "
                "NM NY NC ND OH OK OR PA RI SC "
                "SD TN TX UT VT VA WA WV WI WY"
            ).split(),
        },
        "zip": {"format": r"^\d{5}$"},
        "insurance_provider": {
            "allowed_values": [
                "Blue Cross",
                "UnitedHealth",
                "Aetna",
                "Cigna",
                "Medicare",
            ],
        },
        # The prefix alternation lists the same codes as the values of
        # cross_field_rules["insurance_prefix_map"] below.
        "insurance_id": {"format": r"^(BC|UH|AE|CI|MC)-\d{7}$"},
        "last_visit_date": {
            "format": "YYYY-MM-DD",
            "min": "2023-01-01",
            "max": "2024-12-31",
        },
    },
    "cross_field_rules": {
        # Five-digit ZIP code -> city name. Keys are strings, so leading
        # zeros ("02101", "06103") are preserved.
        "zip_city_map": {
            "60601": "Chicago",
            "60602": "Chicago",
            "60603": "Chicago",
            "60625": "Chicago",
            "98101": "Seattle",
            "02101": "Boston",
            "97201": "Portland",
            "80201": "Denver",
            "80202": "Denver",
            "30301": "Atlanta",
            "30305": "Atlanta",
            "90210": "Beverly Hills",
            "33101": "Miami",
            "73301": "Austin",
            "10001": "New York",
            "94105": "San Francisco",
            "77027": "Houston",
            "77002": "Houston",
            "85012": "Phoenix",
            "37201": "Nashville",
            "89109": "Las Vegas",
            "75201": "Dallas",
            "55402": "Minneapolis",
            "28202": "Charlotte",
            "63101": "Saint Louis",
            "32801": "Orlando",
            "15219": "Pittsburgh",
            "92101": "San Diego",
            "78205": "San Antonio",
            "46204": "Indianapolis",
            "44114": "Cleveland",
            "70112": "New Orleans",
            "64105": "Kansas City",
            "53203": "Milwaukee",
            "22102": "Tysons",
            "27601": "Raleigh",
            "98402": "Tacoma",
            "98004": "Bellevue",
            "33607": "Tampa",
            "85251": "Scottsdale",
            "85701": "Tucson",
            "06103": "Hartford",
            "87102": "Albuquerque",
            "38103": "Memphis",
            "94612": "Oakland",
            "95113": "San Jose",
            "27401": "Greensboro",
            "43215": "Columbus",
            "40202": "Louisville",
            "50309": "Des Moines",
        },
        # Insurance provider name -> two-letter insurance_id prefix.
        "insurance_prefix_map": {
            "Blue Cross": "BC",
            "UnitedHealth": "UH",
            "Aetna": "AE",
            "Cigna": "CI",
            "Medicare": "MC",
        },
    },
}
_HARD_CORRUPTIONS: list[dict[str, Any]] = [
# =========================================================================
# ORIGINAL 6 duplicate clusters (nickname variants) creating 9 extra rows
# Duplicate patient_ids start at 51 (PAT031-PAT050 are real patients now)
# =========================================================================
{
"type": "duplicate_cluster",
"source_indices": [0],
"cluster_sizes": [2],
"noise_fields": ["first_name", "phone", "address_line"],
"description": "William Thompson -> 'Bill Thompson' and 'Wm Thompson'",
"source_entity_id": "PAT001",
"source_patient_id": 1,
"duplicates": [
{
"new_patient_id": 51,
"changes": {
"first_name": "Bill",
"email": "bill.thompson@gmail.com",
"phone": "312-555-0148",
"address_line": "742 N Michigan Ave",
"last_visit_date": "2024-01-16",
},
},
{
"new_patient_id": 52,
"changes": {
"first_name": "Wm",
"email": "wm.thompson@yahoo.com",
"phone": "(312) 555-0147",
"insurance_id": "BC-4471583",
"zip": "60602",
},
},
],
},
{
"type": "duplicate_cluster",
"source_indices": [1],
"cluster_sizes": [1],
"noise_fields": ["first_name", "phone", "address_line"],
"description": "Robert Martinez -> 'Bob Martinez'",
"source_entity_id": "PAT002",
"source_patient_id": 2,
"duplicates": [
{
"new_patient_id": 53,
"changes": {
"first_name": "Bob",
"email": "bob.martinez@outlook.com",
"address_line": "1501 Pike Pl",
"last_visit_date": "2024-03-09",
"insurance_id": "UH-8823146",
},
},
],
},
{
"type": "duplicate_cluster",
"source_indices": [2],
"cluster_sizes": [2],
"noise_fields": ["first_name", "phone", "address_line"],
"description": "Elizabeth O'Brien -> 'Liz OBrien' and 'Beth O Brien'",
"source_entity_id": "PAT003",
"source_patient_id": 3,
"duplicates": [
{
"new_patient_id": 54,
"changes": {
"first_name": "Liz",
"last_name": "OBrien",
"email": "liz.obrien@yahoo.com",
"phone": "617-555-0382",
"address_line": "88 Beacon St",
},
},
{
"new_patient_id": 55,
"changes": {
"first_name": "Beth",
"last_name": "O Brien",
"email": "beth.obrien@gmail.com",
"dob": "1982-11-05",
"insurance_id": "AE-5539274",
},
},
],
},
{
"type": "duplicate_cluster",
"source_indices": [3],
"cluster_sizes": [1],
"noise_fields": ["first_name", "phone", "address_line"],
"description": "Jennifer Nguyen -> 'Jen Nguyen'",
"source_entity_id": "PAT004",
"source_patient_id": 4,
"duplicates": [
{
"new_patient_id": 56,
"changes": {
"first_name": "Jen",
"email": "jen.nguyen@gmail.com",
"phone": "503-555-0463",
"address_line": "2200 NW Burnside Road",
"last_visit_date": "2024-06-12",
},
},
],
},
{
"type": "duplicate_cluster",
"source_indices": [4],
"cluster_sizes": [1],
"noise_fields": ["first_name", "phone", "address_line"],
"description": "James Kowalski -> 'Jim Kowalski'",
"source_entity_id": "PAT005",
"source_patient_id": 5,
"duplicates": [
{
"new_patient_id": 57,
"changes": {
"first_name": "Jim",
"email": "jim.kowalski@aol.com",
"address_line": "1600 Colfax Avenue",
"phone": "3035550518",
"insurance_id": "MC-6642018",
},
},
],
},
{
"type": "duplicate_cluster",
"source_indices": [5],
"cluster_sizes": [2],
"noise_fields": ["first_name", "phone", "address_line"],
"description": "Katherine Patel -> 'Kate Patel' and 'Kathy Patel'",
"source_entity_id": "PAT006",
"source_patient_id": 6,
"duplicates": [
{
"new_patient_id": 58,
"changes": {
"first_name": "Kate",
"email": "kate.patel@hotmail.com",
"phone": "404-555-0638",
"city": "atlanta",
"state": "ga",
},
},
{
"new_patient_id": 59,
"changes": {
"first_name": "Kathy",
"email": "kathy.patel@gmail.com",
"address_line": "350 Peachtree Street NE",
"dob": "1988-01-27",
"insurance_id": "BC-9918453",
},
},
],
},
# =========================================================================
# NEW 4 duplicate clusters (TYPO-based, much harder than nicknames)
# =========================================================================
{
"type": "duplicate_cluster",
"source_indices": [33],
"cluster_sizes": [2],
"noise_fields": ["first_name", "address_line"],
"description": "Christopher Reeves -> 'Christpher Reeves' (dropped 'o') and 'Chistopher Reeves' (dropped 'r')",
"source_entity_id": "PAT034",
"source_patient_id": 34,
"duplicates": [
{
"new_patient_id": 60,
"changes": {
"first_name": "Christpher",
"email": "christpher.reeves@gmail.com",
"phone": "813-555-3535",
"address_line": "1400 N Dale Mabry",
},
},
{
"new_patient_id": 61,
"changes": {
"first_name": "Chistopher",
"email": "chistopher.reeves@yahoo.com",
"address_line": "1400 North Dale Mabry Hwy",
"last_visit_date": "2024-06-06",
},
},
],
},
{
"type": "duplicate_cluster",
"source_indices": [37],
"cluster_sizes": [1],
"noise_fields": ["first_name"],
"description": "Alexandra Petrov -> 'Alessandra Petrov' (common misspelling/Italian variant)",
"source_entity_id": "PAT038",
"source_patient_id": 38,
"duplicates": [
{
"new_patient_id": 62,
"changes": {
"first_name": "Alessandra",
"email": "alessandra.petrov@gmail.com",
"phone": "480-555-3979",
"address_line": "7100 E Camelback Road",
},
},
],
},
{
"type": "duplicate_cluster",
"source_indices": [39],
"cluster_sizes": [2],
"noise_fields": ["first_name", "address_line"],
"description": "Patricia Hernandez -> 'Patricla Hernandez' (typo i->l) and 'Patrica Hernandez' (dropped 'i')",
"source_entity_id": "PAT040",
"source_patient_id": 40,
"duplicates": [
{
"new_patient_id": 63,
"changes": {
"first_name": "Patricla",
"email": "patricla.hernandez@gmail.com",
"phone": "520-555-4191",
"address_line": "150 North Stone Ave",
},
},
{
"new_patient_id": 64,
"changes": {
"first_name": "Patrica",
"email": "patrica.hernandez@yahoo.com",
"address_line": "150 N Stone Avenue",
"last_visit_date": "2024-01-18",
},
},
],
},
{
"type": "duplicate_cluster",
"source_indices": [47],
"cluster_sizes": [2],
"noise_fields": ["first_name", "address_line"],
"description": (
"Catherine Brooks -> 'Katherine Brooks' (C->K variant) and 'Catharine Brooks' (e->a variant). "
"TRAP: PAT006 is Katherine Patel - agent must NOT merge these with PAT006!"
),
"source_entity_id": "PAT048",
"source_patient_id": 48,
"duplicates": [
{
"new_patient_id": 65,
"changes": {
"first_name": "Katherine",
"email": "katherine.brooks@gmail.com",
"phone": "614-555-4979",
"address_line": "100 East Broad St",
},
},
{
"new_patient_id": 66,
"changes": {
"first_name": "Catharine",
"email": "catharine.brooks@yahoo.com",
"address_line": "100 E Broad Street",
"last_visit_date": "2024-07-26",
},
},
],
},
# =========================================================================
# ORIGINAL cross_field_corrupt: zip-city mismatches
# =========================================================================
{
"type": "cross_field_corrupt",
"row_indices": [17],
"description": "Zip-city mismatch: Charlotte patient 18 given Raleigh zip 27601",
"target_entity_id": "PAT018",
"target_patient_id": 18,
"field": "zip",
"original": "28202",
"corrupted": "27601",
},
{
"type": "cross_field_corrupt",
"row_indices": [23],
"description": "Zip-city mismatch: Indianapolis patient 24 given Chicago zip 60601",
"target_entity_id": "PAT024",
"target_patient_id": 24,
"field": "zip",
"original": "46204",
"corrupted": "60601",
},
# --- NEW zip-city mismatches for new patients ---
{
"type": "cross_field_corrupt",
"row_indices": [30],
"description": "Zip-city mismatch: Tacoma patient 31 given Seattle zip 98101",
"target_entity_id": "PAT031",
"target_patient_id": 31,
"field": "zip",
"original": "98402",
"corrupted": "98101",
},
{
"type": "cross_field_corrupt",
"row_indices": [41],
"description": "Zip-city mismatch: Albuquerque patient 42 given Tucson zip 85701",
"target_entity_id": "PAT042",
"target_patient_id": 42,
"field": "zip",
"original": "87102",
"corrupted": "85701",
},
{
"type": "cross_field_corrupt",
"row_indices": [48],
"description": "Zip-city mismatch: Louisville patient 49 given Columbus zip 43215",
"target_entity_id": "PAT049",
"target_patient_id": 49,
"field": "zip",
"original": "40202",
"corrupted": "43215",
},
{
"type": "cross_field_corrupt",
"row_indices": [45],
"description": "Zip-city mismatch: Houston patient 46 given Dallas zip 75201",
"target_entity_id": "PAT046",
"target_patient_id": 46,
"field": "zip",
"original": "77002",
"corrupted": "75201",
},
# =========================================================================
# ORIGINAL cross_field_corrupt: insurance ID prefix mismatches
# =========================================================================
{
"type": "cross_field_corrupt",
"row_indices": [20],
"description": "Insurance ID prefix mismatch: Cigna patient 21 given BC prefix",
"target_entity_id": "PAT021",
"target_patient_id": 21,
"field": "insurance_id",
"original": "CI-5582719",
"corrupted": "BC-5582719",
},
{
"type": "cross_field_corrupt",
"row_indices": [28],
"description": "Insurance ID prefix mismatch: Blue Cross patient 29 given AE prefix",
"target_entity_id": "PAT029",
"target_patient_id": 29,
"field": "insurance_id",
"original": "BC-2215847",
"corrupted": "AE-2215847",
},
# --- NEW insurance prefix mismatches ---
{
"type": "cross_field_corrupt",
"row_indices": [40],
"description": "Insurance ID prefix mismatch: Medicare patient 41 given UH prefix",
"target_entity_id": "PAT041",
"target_patient_id": 41,
"field": "insurance_id",
"original": "MC-3392841",
"corrupted": "UH-3392841",
},
{
"type": "cross_field_corrupt",
"row_indices": [34],
"description": "Insurance ID prefix mismatch: Aetna patient 35 given CI prefix",
"target_entity_id": "PAT035",
"target_patient_id": 35,
"field": "insurance_id",
"original": "AE-3318724",
"corrupted": "CI-3318724",
},
{
"type": "cross_field_corrupt",
"row_indices": [49],
"description": "Insurance ID prefix mismatch: Cigna patient 50 given MC prefix",
"target_entity_id": "PAT050",
"target_patient_id": 50,
"field": "insurance_id",
"original": "CI-3347291",
"corrupted": "MC-3347291",
},
# =========================================================================
# ORIGINAL cross_field_corrupt: gender format
# =========================================================================
{
"type": "cross_field_corrupt",
"row_indices": [11],
"description": "Gender format corruption: patient 12 'F' -> 'Female'",
"target_entity_id": "PAT012",
"target_patient_id": 12,
"field": "gender",
"original": "F",
"corrupted": "Female",
},
# =========================================================================
# ORIGINAL impossible_date: date format corruptions
# =========================================================================
{
"type": "impossible_date",
"targets": [{"row_idx": 11, "field": "dob", "corrupt_type": "format"}],
"description": "DOB reformatted to MM/DD/YYYY for patient 12",
"target_entity_id": "PAT012",
"target_patient_id": 12,
"field": "dob",
"original": "1993-02-28",
"corrupted": "02/28/1993",
},
{
"type": "impossible_date",
"targets": [{"row_idx": 15, "field": "last_visit_date", "corrupt_type": "format"}],
"description": "Last visit date reformatted to MM-DD-YYYY for patient 16",
"target_entity_id": "PAT016",
"target_patient_id": 16,
"field": "last_visit_date",
"original": "2024-08-01",
"corrupted": "08-01-2024",
},
{
"type": "impossible_date",
"targets": [{"row_idx": 22, "field": "dob", "corrupt_type": "format"}],
"description": "DOB reformatted to 'Dec 1, 1955' for patient 23",
"target_entity_id": "PAT023",
"target_patient_id": 23,
"field": "dob",
"original": "1955-12-01",
"corrupted": "Dec 1, 1955",
},
{
"type": "impossible_date",
"targets": [{"row_idx": 27, "field": "last_visit_date", "corrupt_type": "format"}],
"description": "Last visit date reformatted to M/D/YYYY for patient 28",
"target_entity_id": "PAT028",
"target_patient_id": 28,
"field": "last_visit_date",
"original": "2024-02-07",
"corrupted": "2/7/2024",
},
# --- NEW date format corruptions for new patients ---
{
"type": "impossible_date",
"targets": [{"row_idx": 33, "field": "dob", "corrupt_type": "format"}],
"description": "DOB reformatted to DD/MM/YYYY for patient 34",
"target_entity_id": "PAT034",
"target_patient_id": 34,
"field": "dob",
"original": "1976-08-25",
"corrupted": "25/08/1976",
},
{
"type": "impossible_date",
"targets": [{"row_idx": 37, "field": "last_visit_date", "corrupt_type": "format"}],
"description": "Last visit date reformatted to 'Feb 20, 2024' for patient 38",
"target_entity_id": "PAT038",
"target_patient_id": 38,
"field": "last_visit_date",
"original": "2024-02-20",
"corrupted": "Feb 20, 2024",
},
{
"type": "impossible_date",
"targets": [{"row_idx": 42, "field": "dob", "corrupt_type": "format"}],
"description": "DOB reformatted to MM-DD-YYYY for patient 43",
"target_entity_id": "PAT043",
"target_patient_id": 43,
"field": "dob",
"original": "1965-11-28",
"corrupted": "11-28-1965",
},
{
"type": "impossible_date",
"targets": [{"row_idx": 46, "field": "last_visit_date", "corrupt_type": "format"}],
"description": "Last visit date reformatted to D/M/YYYY for patient 47",
"target_entity_id": "PAT047",
"target_patient_id": 47,
"field": "last_visit_date",
"original": "2024-03-11",
"corrupted": "11/3/2024",
},
{
"type": "impossible_date",
"targets": [{"row_idx": 49, "field": "dob", "corrupt_type": "format"}],
"description": "DOB reformatted to 'Jun 21, 2004' for patient 50",
"target_entity_id": "PAT050",
"target_patient_id": 50,
"field": "dob",
"original": "2004-06-21",
"corrupted": "Jun 21, 2004",
},
# --- NEW subtle DOB off-by-one corruptions (very hard to detect) ---
{
"type": "impossible_date",
"targets": [{"row_idx": 31, "field": "dob", "corrupt_type": "off_by_one"}],
"description": "DOB day off by 1: patient 32 '1988-07-20' -> '1988-07-21'",
"target_entity_id": "PAT032",
"target_patient_id": 32,
"field": "dob",
"original": "1988-07-20",
"corrupted": "1988-07-21",
},
{
"type": "impossible_date",
"targets": [{"row_idx": 43, "field": "dob", "corrupt_type": "off_by_one"}],
"description": "DOB day off by 1: patient 44 '1981-05-09' -> '1981-05-10'",
"target_entity_id": "PAT044",
"target_patient_id": 44,
"field": "dob",
"original": "1981-05-09",
"corrupted": "1981-05-10",
},
# =========================================================================
# ORIGINAL insurance_id_mismatch: missing/corrupted + gender mismatches
# =========================================================================
{
"type": "insurance_id_mismatch",
"row_indices": [26],
"description": "Patient 27 insurance_id set to empty string",
"target_entity_id": "PAT027",
"target_patient_id": 27,
"field": "insurance_id",
"original": "CI-4431957",
"corrupted": "",
},
{
"type": "insurance_id_mismatch",
"row_indices": [15],
"description": "Patient 16 gender 'M' -> 'male' (format mismatch)",
"target_entity_id": "PAT016",
"target_patient_id": 16,
"field": "gender",
"original": "M",
"corrupted": "male",
},
{
"type": "insurance_id_mismatch",
"row_indices": [22],
"description": "Patient 23 gender 'M' -> 'm' (case mismatch)",
"target_entity_id": "PAT023",
"target_patient_id": 23,
"field": "gender",
"original": "M",
"corrupted": "m",
},
# =========================================================================
# ORIGINAL null_inject_contextual: missing values and whitespace
# =========================================================================
{
"type": "null_inject_contextual",
"targets": [{"row_idx": 13, "field": "email"}],
"description": "Email set to None for patient 14 (Ashley Richardson)",
"target_entity_id": "PAT014",
"target_patient_id": 14,
"field": "email",
"original": "ashley.richardson@gmail.com",
"corrupted": None,
},
{
"type": "null_inject_contextual",
"targets": [{"row_idx": 21, "field": "phone"}],
"description": "Phone set to None for patient 22 (Jordan Mitchell)",
"target_entity_id": "PAT022",
"target_patient_id": 22,
"field": "phone",
"original": "619-555-2246",
"corrupted": None,
},
{
"type": "null_inject_contextual",
"targets": [{"row_idx": 29, "field": "address_line"}],
"description": "Address set to None for patient 30 (Laura Scott)",
"target_entity_id": "PAT030",
"target_patient_id": 30,
"field": "address_line",
"original": "400 Fayetteville St",
"corrupted": None,
},
# --- NEW null injections for new patients ---
{
"type": "null_inject_contextual",
"targets": [{"row_idx": 40, "field": "phone"}],
"description": "Phone set to None for patient 41 (Gregory Adams)",
"target_entity_id": "PAT041",
"target_patient_id": 41,
"field": "phone",
"original": "860-555-4201",
"corrupted": None,
},
{
"type": "null_inject_contextual",
"targets": [{"row_idx": 47, "field": "email"}],
"description": "Email set to None for patient 48 (Catherine Brooks)",
"target_entity_id": "PAT048",
"target_patient_id": 48,
"field": "email",
"original": "catherine.brooks@gmail.com",
"corrupted": None,
},
{
"type": "null_inject_contextual",
"targets": [{"row_idx": 38, "field": "insurance_id"}],
"description": "Insurance ID set to None for patient 39 (Casey Morgan)",
"target_entity_id": "PAT039",
"target_patient_id": 39,
"field": "insurance_id",
"original": "MC-7724316",
"corrupted": None,
},
# --- ORIGINAL whitespace corruptions ---
{
"type": "null_inject_contextual",
"targets": [{"row_idx": 10, "field": "first_name"}],
"description": "First name padded with spaces for patient 11 (David Chen)",
"target_entity_id": "PAT011",
"target_patient_id": 11,
"field": "first_name",
"original": "David",
"corrupted": " David ",
},
{
"type": "null_inject_contextual",
"targets": [{"row_idx": 14, "field": "email"}],
"description": "Email with extra space for patient 15 (Patricia Lee)",
"target_entity_id": "PAT015",
"target_patient_id": 15,
"field": "email",
"original": "patricia.lee@outlook.com",
"corrupted": "patricia.lee @outlook.com",
},
# --- NEW whitespace corruptions ---
{
"type": "null_inject_contextual",
"targets": [{"row_idx": 44, "field": "last_name"}],
"description": "Last name with trailing space for patient 45 (James Lee)",
"target_entity_id": "PAT045",
"target_patient_id": 45,
"field": "last_name",
"original": "Lee",
"corrupted": "Lee ",
},
{
"type": "null_inject_contextual",
"targets": [{"row_idx": 39, "field": "first_name"}],
"description": "First name with leading tab for patient 40 (Patricia Hernandez)",
"target_entity_id": "PAT040",
"target_patient_id": 40,
"field": "first_name",
"original": "Patricia",
"corrupted": "\tPatricia",
},
{
"type": "null_inject_contextual",
"targets": [{"row_idx": 48, "field": "city"}],
"description": "City with trailing whitespace for patient 49 (Raymond Foster)",
"target_entity_id": "PAT049",
"target_patient_id": 49,
"field": "city",
"original": "Louisville",
"corrupted": "Louisville ",
},
# =========================================================================
# ORIGINAL false_positive_duplicate: 2 pairs (already in ground truth)
# =========================================================================
{
"type": "false_positive_duplicate",
"pairs": [[6, 7]],
"description": (
"Two different 'Michael Davis' patients (PAT007 and PAT008) share the same "
"name but have different DOB, location, insurance. Must NOT be merged."
),
"entity_ids": ["PAT007", "PAT008"],
"patient_ids": [7, 8],
"distinguishing_fields": ["dob", "city", "state", "zip", "email", "insurance_provider", "insurance_id"],
},
{
"type": "false_positive_duplicate",
"pairs": [[8, 9]],
"description": (
"Two different 'Maria Garcia' patients (PAT009 and PAT010) share the same "
"name but have different DOB, location, insurance. Must NOT be merged."
),
"entity_ids": ["PAT009", "PAT010"],
"patient_ids": [9, 10],
"distinguishing_fields": ["dob", "city", "state", "zip", "email", "insurance_provider", "insurance_id"],
},
# --- NEW false_positive_duplicate: 3 harder pairs ---
{
"type": "false_positive_duplicate",
"pairs": [[31, 32]],
"description": (
"Two different 'David Kim' patients (PAT032 and PAT033) - SAME insurance "
"provider (UnitedHealth), SAME state (WA), DOBs only 2 years apart. "
"Distinguishable by different insurance IDs, different cities, different DOB. "
"Must NOT be merged."
),
"entity_ids": ["PAT032", "PAT033"],
"patient_ids": [32, 33],
"distinguishing_fields": ["dob", "city", "zip", "email", "insurance_id", "phone"],
},
{
"type": "false_positive_duplicate",
"pairs": [[35, 36]],
"description": (
"Two different 'Sarah Williams' patients (PAT036 and PAT037) - SAME city "
"(Chicago), SAME state (IL). DOBs 5 years apart, different insurance. "
"Must NOT be merged."
),
"entity_ids": ["PAT036", "PAT037"],
"patient_ids": [36, 37],
"distinguishing_fields": ["dob", "zip", "address_line", "email", "insurance_provider", "insurance_id", "phone"],
},
{
"type": "false_positive_duplicate",
"pairs": [[43, 44]],
"description": (
"Two different 'James Lee' patients (PAT044 and PAT045) - DOBs only 1 year "
"apart, SAME state (CA), SAME insurance provider (Aetna). Distinguishable "
"by different insurance IDs, different cities. Must NOT be merged."
),
"entity_ids": ["PAT044", "PAT045"],
"patient_ids": [44, 45],
"distinguishing_fields": ["dob", "city", "zip", "email", "insurance_id", "phone"],
},
# =========================================================================
# ORIGINAL address_variation: phone format variations
# =========================================================================
{
"type": "address_variation",
"row_indices": [10],
"description": "Phone format changed to (XXX) XXX-XXXX for patient 11",
"target_entity_id": "PAT011",
"target_patient_id": 11,
"field": "phone",
"original": "415-555-1134",
"corrupted": "(415) 555-1134",
},
{
"type": "address_variation",
"row_indices": [14],
"description": "Phone format changed to XXX.XXX.XXXX for patient 15",
"target_entity_id": "PAT015",
"target_patient_id": 15,
"field": "phone",
"original": "702-555-1578",
"corrupted": "702.555.1578",
},
{
"type": "address_variation",
"row_indices": [19],
"description": "Phone stripped of dashes for patient 20",
"target_entity_id": "PAT020",
"target_patient_id": 20,
"field": "phone",
"original": "407-555-2024",
"corrupted": "4075552024",
},
{
"type": "address_variation",
"row_indices": [24],
"description": "Phone with country code prefix for patient 25",
"target_entity_id": "PAT025",
"target_patient_id": 25,
"field": "phone",
"original": "216-555-2579",
"corrupted": "+1-216-555-2579",
},
# --- NEW phone format variations ---
{
"type": "address_variation",
"row_indices": [33],
"description": "Phone format changed to (XXX) XXX-XXXX for patient 34",
"target_entity_id": "PAT034",
"target_patient_id": 34,
"field": "phone",
"original": "813-555-3534",
"corrupted": "(813) 555-3534",
},
{
"type": "address_variation",
"row_indices": [41],
"description": "Phone format changed to XXX.XXX.XXXX for patient 42",
"target_entity_id": "PAT042",
"target_patient_id": 42,
"field": "phone",
"original": "505-555-4312",
"corrupted": "505.555.4312",
},
{
"type": "address_variation",
"row_indices": [45],
"description": "Phone stripped of dashes for patient 46",
"target_entity_id": "PAT046",
"target_patient_id": 46,
"field": "phone",
"original": "832-555-4756",
"corrupted": "8325554756",
},
{
"type": "address_variation",
"row_indices": [48],
"description": "Phone with country code prefix for patient 49",
"target_entity_id": "PAT049",
"target_patient_id": 49,
"field": "phone",
"original": "502-555-5089",
"corrupted": "+1-502-555-5089",
},
# =========================================================================
# ORIGINAL case corruptions
# =========================================================================
{
"type": "case_corrupt",
"targets": [{"row_idx": 12, "field": "first_name"}],
"description": "First name uppercased for patient 13 (Thomas -> THOMAS)",
"target_entity_id": "PAT013",
"target_patient_id": 13,
"original": "Thomas",
"corrupted": "THOMAS",
},
{
"type": "case_corrupt",
"targets": [{"row_idx": 16, "field": "city"}],
"description": "City lowercased for patient 17 (Minneapolis -> minneapolis)",
"target_entity_id": "PAT017",
"target_patient_id": 17,
"original": "Minneapolis",
"corrupted": "minneapolis",
},
{
"type": "case_corrupt",
"targets": [{"row_idx": 18, "field": "state"}],
"description": "State lowercased for patient 19 (MO -> mo)",
"target_entity_id": "PAT019",
"target_patient_id": 19,
"original": "MO",
"corrupted": "mo",
},
{
"type": "case_corrupt",
"targets": [{"row_idx": 25, "field": "last_name"}],
"description": "Last name uppercased for patient 26 (Young -> YOUNG)",
"target_entity_id": "PAT026",
"target_patient_id": 26,
"original": "Young",
"corrupted": "YOUNG",
},
# --- NEW case corruptions ---
{
"type": "case_corrupt",
"targets": [{"row_idx": 37, "field": "last_name"}],
"description": "Last name lowercased for patient 38 (Petrov -> petrov)",
"target_entity_id": "PAT038",
"target_patient_id": 38,
"original": "Petrov",
"corrupted": "petrov",
},
{
"type": "case_corrupt",
"targets": [{"row_idx": 42, "field": "first_name"}],
"description": "First name uppercased for patient 43 (Dana -> DANA)",
"target_entity_id": "PAT043",
"target_patient_id": 43,
"original": "Dana",
"corrupted": "DANA",
},
{
"type": "case_corrupt",
"targets": [{"row_idx": 47, "field": "city"}],
"description": "City lowercased for patient 48 (Columbus -> columbus)",
"target_entity_id": "PAT048",
"target_patient_id": 48,
"original": "Columbus",
"corrupted": "columbus",
},
{
"type": "case_corrupt",
"targets": [{"row_idx": 49, "field": "state"}],
"description": "State lowercased for patient 50 (IA -> ia)",
"target_entity_id": "PAT050",
"target_patient_id": 50,
"original": "IA",
"corrupted": "ia",
},
# =========================================================================
# ORIGINAL address whitespace corruptions
# =========================================================================
{
"type": "address_variation",
"row_indices": [19],
"description": "Extra spaces in address for patient 20 (525 S Orange Ave -> 525 S Orange Ave)",
"target_entity_id": "PAT020",
"target_patient_id": 20,
"field": "address_line",
"original": "525 S Orange Ave",
"corrupted": "525 S Orange Ave",
},
{
"type": "address_variation",
"row_indices": [25],
"description": "Extra space in city for patient 26 (New Orleans -> New Orleans)",
"target_entity_id": "PAT026",
"target_patient_id": 26,
"field": "city",
"original": "New Orleans",
"corrupted": "New Orleans",
},
# =========================================================================
# NEW email domain typo corruptions (subtle)
# =========================================================================
{
"type": "address_variation",
"row_indices": [34],
"description": "Email domain typo for patient 35 (yahoo.com -> yaho.com)",
"target_entity_id": "PAT035",
"target_patient_id": 35,
"field": "email",
"original": "avery.simmons94@yahoo.com",
"corrupted": "avery.simmons94@yaho.com",
},
{
"type": "address_variation",
"row_indices": [41],
"description": "Email domain typo for patient 42 (gmail.com -> gmial.com)",
"target_entity_id": "PAT042",
"target_patient_id": 42,
"field": "email",
"original": "samantha.rivera97@gmail.com",
"corrupted": "samantha.rivera97@gmial.com",
},
{
"type": "address_variation",
"row_indices": [48],
"description": "Email domain typo for patient 49 (aol.com -> aol.cm)",
"target_entity_id": "PAT049",
"target_patient_id": 49,
"field": "email",
"original": "raymond.foster50@aol.com",
"corrupted": "raymond.foster50@aol.cm",
},
# =========================================================================
# NEW state full-name instead of abbreviation corruptions
# =========================================================================
{
"type": "cross_field_corrupt",
"row_indices": [33],
"description": "State full name instead of abbreviation for patient 34 (FL -> Florida)",
"target_entity_id": "PAT034",
"target_patient_id": 34,
"field": "state",
"original": "FL",
"corrupted": "Florida",
},
{
"type": "cross_field_corrupt",
"row_indices": [43],
"description": "State full name instead of abbreviation for patient 44 (CA -> California)",
"target_entity_id": "PAT044",
"target_patient_id": 44,
"field": "state",
"original": "CA",
"corrupted": "California",
},
{
"type": "cross_field_corrupt",
"row_indices": [40],
"description": "State full name instead of abbreviation for patient 41 (CT -> Connecticut)",
"target_entity_id": "PAT041",
"target_patient_id": 41,
"field": "state",
"original": "CT",
"corrupted": "Connecticut",
},
# =========================================================================
# ORIGINAL + NEW valid_unusual: gender/name traps (NOT errors)
# =========================================================================
{
"type": "valid_unusual",
"description": "Ashley (M) - historically male name, VALID. Do NOT correct.",
"entity_id": "PAT014",
"patient_id": 14,
"first_name": "Ashley",
"gender": "M",
"note": "Ashley was historically a male name; this is valid.",
},
{
"type": "valid_unusual",
"description": "Jordan (F) - gender-neutral name, VALID. Do NOT correct.",
"entity_id": "PAT022",
"patient_id": 22,
"first_name": "Jordan",
"gender": "F",
"note": "Jordan is gender-neutral; valid for female patients.",
},
{
"type": "valid_unusual",
"description": "Shannon (M) - historically male Irish name, VALID. Do NOT correct.",
"entity_id": "PAT027",
"patient_id": 27,
"first_name": "Shannon",
"gender": "M",
"note": "Shannon was historically a male Irish name; this is valid.",
},
{
"type": "valid_unusual",
"description": "Morgan (M) - gender-neutral name, VALID. Do NOT correct.",
"entity_id": "PAT031",
"patient_id": 31,
"first_name": "Morgan",
"gender": "M",
"note": "Morgan is gender-neutral; valid for male patients.",
},
{
"type": "valid_unusual",
"description": "Avery (M) - gender-neutral name, VALID. Do NOT correct.",
"entity_id": "PAT035",
"patient_id": 35,
"first_name": "Avery",
"gender": "M",
"note": "Avery is gender-neutral; valid for male patients.",
},
{
"type": "valid_unusual",
"description": "Casey (F) - gender-neutral but often male, VALID. Do NOT correct.",
"entity_id": "PAT039",
"patient_id": 39,
"first_name": "Casey",
"gender": "F",
"note": "Casey is gender-neutral; valid for female patients.",
},
{
"type": "valid_unusual",
"description": "Dana (M) - gender-neutral name, VALID. Do NOT correct.",
"entity_id": "PAT043",
"patient_id": 43,
"first_name": "Dana",
"gender": "M",
"note": "Dana is gender-neutral; valid for male patients.",
},
{
"type": "valid_unusual",
"description": "Robin (F) - gender-neutral name, VALID. Do NOT correct.",
"entity_id": "PAT047",
"patient_id": 47,
"first_name": "Robin",
"gender": "F",
"note": "Robin is gender-neutral; valid for female patients.",
},
]
# Downstream analytics probes for the hard patient-registry task.
#
# Each probe names the probe function to run (``query_fn``), the parameters
# passed to it, and the aggregate answer expected from the ground truth.
# Consistency note: both distribution probes below sum to 50, matching the
# expected unique patient count.
_HARD_PROBES: list[UtilityProbe] = [
    # --- Deduplication check: unique patient_id values ---
    UtilityProbe(
        name="unique_patient_count",
        description="Count of unique patients after deduplication",
        query_fn="unique_count",
        params={"column": "patient_id"},
        expected_result=50,
    ),
    # --- Insurance provider histogram (10 + 10 + 9 + 9 + 12 = 50) ---
    UtilityProbe(
        name="insurance_provider_distribution",
        description="Count of patients per insurance provider",
        query_fn="distribution",
        params={"column": "insurance_provider"},
        expected_result={
            "Blue Cross": 10,
            "UnitedHealth": 10,
            "Aetna": 9,
            "Cigna": 9,
            "Medicare": 12,
        },
    ),
    # --- City histogram: 44 distinct cities covering 50 patients ---
    # Keys use the canonical spelling/casing, so case- or whitespace-corrupted
    # city values would presumably land in the wrong bucket until repaired.
    UtilityProbe(
        name="patients_per_city",
        description="Count of patients per city",
        query_fn="distribution",
        params={"column": "city"},
        expected_result={
            "Chicago": 3,
            "Seattle": 2,
            "Boston": 1,
            "Portland": 1,
            "Denver": 2,
            "Atlanta": 2,
            "Beverly Hills": 1,
            "Miami": 1,
            "Austin": 1,
            "New York": 1,
            "San Francisco": 1,
            "Houston": 2,
            "Phoenix": 1,
            "Nashville": 1,
            "Las Vegas": 1,
            "Dallas": 1,
            "Minneapolis": 1,
            "Charlotte": 1,
            "Saint Louis": 1,
            "Orlando": 1,
            "Pittsburgh": 1,
            "San Diego": 1,
            "San Antonio": 1,
            "Indianapolis": 1,
            "Cleveland": 1,
            "New Orleans": 1,
            "Kansas City": 1,
            "Milwaukee": 1,
            "Tysons": 1,
            "Raleigh": 1,
            "Tacoma": 1,
            "Bellevue": 1,
            "Tampa": 1,
            "Scottsdale": 1,
            "Tucson": 1,
            "Hartford": 1,
            "Albuquerque": 1,
            "Memphis": 1,
            "Oakland": 1,
            "San Jose": 1,
            "Greensboro": 1,
            "Columbus": 1,
            "Louisville": 1,
            "Des Moines": 1,
        },
    ),
    # --- Mean age per gender, with age taken as 2026 minus birth year ---
    # NOTE(review): the "year_age_2026" transform is implemented elsewhere;
    # presumably it derives the age from the year part of "dob" - confirm.
    UtilityProbe(
        name="avg_age_by_gender",
        description="Average age (2026 - birth year) grouped by gender",
        query_fn="avg_by_group",
        params={
            "value_col": "dob",
            "group_col": "gender",
            "transform": "year_age_2026",
        },
        expected_result={"F": 44.80, "M": 51.00},
    ),
]
# Hard-difficulty task: the multi-source patient registry. Bundles the ground
# truth rows, schema, corruption specs and utility probes defined above into
# one Task, then adds it to the global task registry under its task_id.
_HARD_TASK = Task(
    task_id="hard_patients",
    name="Multi-Source Patient Registry",
    difficulty="hard",
    # The literal is split at clause boundaries; adjacent string literals are
    # concatenated, so every fragment carries its own trailing space.
    description=(
        "Clean and deduplicate a multi-source patient registry with "
        "50 unique patients (14 columns). "
        "The dirty dataset contains ~70 rows due to 10 duplicate clusters "
        "(6 nickname-based, 4 typo-based including a Catherine/Katherine "
        "cross-patient trap). "
        "Includes 5 false-positive duplicate pairs (same name, genuinely "
        "different people with subtle overlaps in location, insurance, or "
        "DOB) that must NOT be merged, "
        "8 gender/name traps that are valid and must NOT be corrected, "
        "cross-field validation rules (zip-city map, insurance prefix map), "
        "date and phone format inconsistencies, "
        "DOB off-by-one corruptions, "
        "email domain typos, "
        "state full-name expansions, "
        "null injections, "
        "whitespace corruptions, "
        "and insurance ID mismatches."
    ),
    ground_truth=_HARD_GROUND_TRUTH,
    schema=_HARD_SCHEMA,
    corruptions=_HARD_CORRUPTIONS,
    max_steps=80,
    utility_probes=_HARD_PROBES,
    # (entity_id, column) pairs whose correct value is genuinely debatable.
    # The first eight are the gender/name traps: unusual but valid gender
    # assignments. The last one is an apostrophe-handling question on a
    # surname. An ideal agent escalates these cells instead of "fixing" them.
    ambiguous_cells=[
        ("PAT014", "gender"),     # Ashley (M) - historically male name
        ("PAT022", "gender"),     # Jordan (F) - gender-neutral name
        ("PAT027", "gender"),     # Shannon (M) - historically male Irish name
        ("PAT031", "gender"),     # Morgan (M) - gender-neutral name
        ("PAT035", "gender"),     # Avery (M) - gender-neutral name
        ("PAT039", "gender"),     # Casey (F) - gender-neutral but often male
        ("PAT043", "gender"),     # Dana (M) - gender-neutral name
        ("PAT047", "gender"),     # Robin (F) - gender-neutral name
        ("PAT003", "last_name"),  # O'Brien vs OBrien vs O Brien - apostrophe handling debatable
    ],
)

# Make the task discoverable through the module-level registry.
register_task(_HARD_TASK)