# Provenance: uploaded by pujithapsx — "initial push" (commit e9084d7)
from pydantic import BaseModel, Field, field_validator, model_validator
from typing import Dict, List, Optional, Any, Union
from enum import Enum
# =========================================================
# ENUM
# =========================================================
class MatchingMode(str, Enum):
    """Supported matching modes"""

    # The ``str`` mix-in makes each member compare equal to its plain string
    # value, so ``MatchingMode.EMBEDDING == "embedding"`` holds.
    EMBEDDING = "embedding"
# =========================================================
# CONSTANTS
# =========================================================
# Sentinel strings that mean "no value supplied". Entries are lower-case;
# callers strip and lower-case a value before testing membership.
MISSING_PLACEHOLDERS = {
    "missing value",
    "missing",
    "na",
    "n/a",
    "null",
    "none",
    "-",
}
# ---------------------------------------------------------------------------
# Flat-format key → EntityRecord field name mapping.
# To support a new flat key in the future, just add it here.
# ---------------------------------------------------------------------------
# Upper-case flat key -> target name. Targets that start with "_" are not real
# model fields: they are markers telling the flat-to-nested converter to collect
# the value into a list (phones / emails) or into the single address entry.
_FLAT_KEY_MAP: Dict[str, str] = {
    # --- personal identifiers ---
    "GENDER": "gender",
    "NAME": "name",
    "FIRSTNAME": "firstname",
    "MIDDLENAME": "middlename",
    "LASTNAME": "lastname",
    "SPOUSENAME": "spousename",
    "MOTHERNAME": "mothername",
    "FATHERNAME": "fathername",
    "COMPANYNAME": "companyname",
    "PARENTCOMPANYNAME": "parentcompanyname",

    # --- identity documents ---
    "AADHAR": "aadhar",
    "PAN": "pan",
    "LICENSEID": "licenseid",
    "PASSPORTID": "passportid",
    "VOTERID": "voterid",

    # --- date of birth (two accepted spellings, one target) ---
    "BIRTHDATE": "dob",
    "DOB": "dob",

    # --- contact details: gathered into lists ---
    "PHONE": "_phone_flat",
    "EMAIL": "_email_flat",

    # --- address components: gathered into addresses[0] ---
    "ADDRESSLINE": "_addressline_flat",
    "CITY": "_city_flat",
    "STATE": "_state_flat",
    "ZIPCODE": "_zipcode_flat",
}
# Marker targets whose values belong to the single flat-format address entry.
_FLAT_ADDRESS_KEYS = {
    "_addressline_flat",
    "_city_flat",
    "_state_flat",
    "_zipcode_flat",
}
def _is_placeholder(val: Any) -> bool:
    """Tell whether *val* stands for "no data".

    ``None`` always counts as missing. Any other value is converted with
    ``str()``, stripped and lower-cased, then looked up in
    ``MISSING_PLACEHOLDERS``.
    """
    if val is not None:
        normalized = str(val).strip().lower()
        return normalized in MISSING_PLACEHOLDERS
    return True
def _normalize_flat_to_nested(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Convert a flat-format record (uppercase keys like ADDRESSLINE,
    BIRTHDATE …) into the nested EntityRecord format.

    Behaviour by input shape:

    * Nested record (has 'addresses' / 'phones' / 'emails') that carries no
      flat-only key: returned unchanged. This is the fast-path for the nested
      format that supports multiple addresses/phones/emails.
    * Nested record that ALSO carries flat-only keys (BIRTHDATE, PHONE, EMAIL,
      ADDRESSLINE, CITY, STATE, ZIPCODE): converted, and the flat contact /
      address values are appended to the nested lists instead of being
      silently dropped.
    * Pure flat record: converted.
    * Anything else: returned unchanged so Pydantic can validate it.

    Args:
        data: Raw mapping handed to the model before field validation.

    Returns:
        ``data`` itself when no conversion is needed, otherwise a new dict.
        The input mapping and its lists are never mutated.
    """
    # Flat keys are matched case-insensitively. A non-string key can never be
    # a flat key, so it is skipped here instead of crashing on ``.upper()``.
    upper_keys = {k.upper() for k in data if isinstance(k, str)}

    if "addresses" in data or "phones" in data or "emails" in data:
        # "Flat-only" keys are those whose target differs from the lower-cased
        # key (BIRTHDATE -> dob, PHONE -> _phone_flat, ...). Keys such as NAME
        # map straight onto a field name and need no conversion.
        flat_only = {
            key for key, field in _FLAT_KEY_MAP.items() if field != key.lower()
        }
        if upper_keys.isdisjoint(flat_only):
            return data  # Fast-path: already nested
    elif upper_keys.isdisjoint(_FLAT_KEY_MAP):
        return data  # Unrecognized — pass through and let Pydantic handle

    # ---- Convert flat → nested -------------------------------------------
    nested: Dict[str, Any] = {}
    address_parts: Dict[str, str] = {}
    phones: List[str] = []
    emails: List[str] = []

    for raw_key, raw_val in data.items():
        target = (
            _FLAT_KEY_MAP.get(raw_key.upper()) if isinstance(raw_key, str) else None
        )
        if target is None:
            # Unknown key (including an existing 'addresses' / 'phones' /
            # 'emails' entry) — keep it as-is.
            nested[raw_key] = raw_val
            continue
        if _is_placeholder(raw_val):
            continue
        if target == "_phone_flat":
            phones.append(str(raw_val).strip())
        elif target == "_email_flat":
            emails.append(str(raw_val).strip())
        elif target in _FLAT_ADDRESS_KEYS:
            # "_city_flat" -> "city", etc.
            addr_key = target.replace("_flat", "").lstrip("_")
            address_parts[addr_key] = str(raw_val).strip()
        else:
            nested[target] = raw_val

    collected = (
        ("addresses", [address_parts] if address_parts else []),
        ("phones", phones),
        ("emails", emails),
    )
    for key, extra in collected:
        if not extra:
            continue
        if key not in nested:
            nested[key] = extra
        elif isinstance(nested[key], list):
            # Mixed record: append flat values after the nested ones. A new
            # list is built so the caller's list is left untouched.
            nested[key] = [*nested[key], *extra]
        # Otherwise the nested value has an invalid type; leave it alone so
        # Pydantic reports the validation error.

    return nested
# =========================================================
# REQUEST MODELS
# =========================================================
class AddressRecord(BaseModel):
    """A single address entry."""

    addressline: str = Field(default="", description="Street address")
    city: str = Field(default="", description="City name")
    state: str = Field(default="", description="State name")
    zipcode: str = Field(default="", description="6-digit postal code (pincode)")

    @model_validator(mode="before")
    @classmethod
    def strip_address_placeholders(cls, values: Any) -> Any:
        """Replace placeholder values in address fields with empty string.

        Runs before field validation. Only dict input is rewritten; any other
        input is passed through untouched. ``None`` counts as a placeholder.
        """
        if isinstance(values, dict):
            return {
                k: ("" if _is_placeholder(v) else v)
                for k, v in values.items()
            }
        return values

    def is_empty(self) -> bool:
        """Return True when every field is blank — used to filter ghost entries.

        Whitespace-only values count as blank. This matches the stripped key
        that EntityRecord.clean_addresses builds when deduplicating, so an
        address made only of spaces is not kept as a ghost entry.
        """
        parts = (self.addressline, self.city, self.state, self.zipcode)
        return not any(part.strip() for part in parts)
class EntityRecord(BaseModel):
    """
    A single entity record with all possible fields.
    All fields are optional — only provided fields are matched.
    ── Multi-value fields ──────────────────────────────────────────
    addresses : List[AddressRecord]
    Send as many addresses as needed.
    Duplicates and all-blank entries are removed automatically.
    Matching uses best-of-N across all address combinations
    (handled by get_dynamic_fields + embedding_match_addresses
    in matching_service.py — no service changes needed).
    phones : List[str]
    Send as many phone numbers as needed.
    Duplicates and placeholder strings are removed automatically.
    Matching uses compare_phone_any_match (any-match across all phones).
    emails : List[str]
    Same as phones, uses compare_email_any_match.
    ── Input formats ───────────────────────────────────────────────
    Accepts BOTH nested format and flat uppercase-key format.
    Flat keys are transparently converted to nested via handle_flat_format.
    """

    # ---- Name fields -------------------------------------------------------
    name: str = Field(default="", description="Full name")
    firstname: str = Field(default="", description="First name")
    middlename: str = Field(default="", description="Middle name")
    lastname: str = Field(default="", description="Last name")

    # ---- Related person names ----------------------------------------------
    mothername: str = Field(default="", description="Mother's name")
    fathername: str = Field(default="", description="Father's name")
    spousename: str = Field(default="", description="Spouse's name")
    othername: str = Field(default="", description="Other/alias name")

    # ---- Personal info -----------------------------------------------------
    dob: str = Field(default="", description="Date of birth (various formats accepted)")
    gender: str = Field(default="", description="Gender (M/F/Male/Female/Other)")

    # ---- Identity documents ------------------------------------------------
    aadhar: str = Field(default="", alias="AADHAR", description="Aadhar number (12 digits)")
    pan: str = Field(default="", description="PAN number (AAAAA9999A)")
    licenseid: str = Field(default="", description="Driving license number")
    passportid: str = Field(default="", description="Passport number")
    voterid: str = Field(default="", description="Voter ID")

    # ---- Addresses — N entries supported -----------------------------------
    addresses: List[AddressRecord] = Field(
        default_factory=list,
        description=(
            "List of addresses. Send any number — duplicates and blank entries "
            "are removed. Matching uses best-of-N across all combinations."
        )
    )

    # ---- Contact — N entries supported -------------------------------------
    phones: List[str] = Field(
        default_factory=list,
        description=(
            "List of phone numbers. Send any number — duplicates and placeholders "
            "are removed. Matching uses any-match (match if any pair matches)."
        )
    )
    emails: List[str] = Field(
        default_factory=list,
        description=(
            "List of email addresses. Send any number — duplicates and placeholders "
            "are removed. Matching uses any-match."
        )
    )

    # ---- Employment --------------------------------------------------------
    companyname: str = Field(default="", description="Company/employer name")
    parentcompanyname: str = Field(default="", description="Parent company name")

    # ---- Custom fields -----------------------------------------------------
    custom_fields: Dict[str, str] = Field(
        default_factory=dict,
        description="Arbitrary key-value pairs for exact matching (e.g. MemberID, AccountNumber)"
    )

    # ── model_validator: runs BEFORE individual field validators ──────────
    @model_validator(mode="before")
    @classmethod
    def handle_flat_format(cls, values: Any) -> Any:
        """
        Transparently convert flat-format records (uppercase keys like
        ADDRESSLINE, BIRTHDATE, PHONE …) into the nested format.
        Non-dict input is returned unchanged; dict input is delegated to
        _normalize_flat_to_nested.
        """
        if isinstance(values, dict):
            return _normalize_flat_to_nested(values)
        return values

    # ── Scalar field placeholder cleanup ─────────────────────────────────
    @field_validator(
        "name", "firstname", "middlename", "lastname",
        "mothername", "fathername", "spousename", "othername",
        "dob", "gender", "aadhar", "pan", "licenseid",
        "passportid", "voterid", "companyname", "parentcompanyname",
        mode="before"
    )
    @classmethod
    def strip_missing_placeholders(cls, v):
        """Convert placeholder strings and ``None`` → empty string.

        ``None`` (JSON null) is treated as "missing", the same way the flat
        converter and AddressRecord treat it. Without this, a null scalar in a
        nested-format record would fail ``str`` validation. Values of any
        other type are returned untouched for Pydantic to validate.
        """
        if v is None or isinstance(v, str):
            return "" if _is_placeholder(v) else v
        return v

    # ── phones: deduplicate + strip placeholders ─────────────────────────
    @field_validator("phones", mode="before")
    @classmethod
    def clean_phones(cls, v):
        """Drop blank/placeholder phones and exact duplicates, keeping order.

        Non-list input is returned unchanged so Pydantic reports the error.
        """
        if not isinstance(v, list):
            return v
        seen, result = set(), []
        for item in v:
            s = str(item).strip()
            if s and s.lower() not in MISSING_PLACEHOLDERS and s not in seen:
                seen.add(s)
                result.append(s)
        return result

    # ── emails: deduplicate + strip placeholders ─────────────────────────
    @field_validator("emails", mode="before")
    @classmethod
    def clean_emails(cls, v):
        """Lower-case emails, then drop blanks, placeholders and duplicates.

        Non-list input is returned unchanged so Pydantic reports the error.
        """
        if not isinstance(v, list):
            return v
        seen, result = set(), []
        for item in v:
            s = str(item).strip().lower()
            if s and s not in MISSING_PLACEHOLDERS and s not in seen:
                seen.add(s)
                result.append(s)
        return result

    # ── addresses: remove empty entries + deduplicate ────────────────────
    @field_validator("addresses", mode="after")
    @classmethod
    def clean_addresses(cls, v: List[AddressRecord]) -> List[AddressRecord]:
        """
        Remove all-blank address entries and deduplicate by
        (addressline, city, state, zipcode) tuple.
        This prevents ghost entries from inflating match scores.
        """
        seen, result = set(), []
        for addr in v:
            if addr.is_empty():
                continue
            # Case- and whitespace-insensitive identity of the address.
            key = (
                addr.addressline.strip().lower(),
                addr.city.strip().lower(),
                addr.state.strip().lower(),
                addr.zipcode.strip(),
            )
            if key not in seen:
                seen.add(key)
                result.append(addr)
        return result

    # populate_by_name + an upper-casing alias generator let each field be
    # supplied either by its field name ("name") or its alias ("NAME").
    model_config = {
        "populate_by_name": True,
        "alias_generator": str.upper,
        "json_schema_extra": {
            "examples": [
                # ── Nested format: multiple addresses + phones ──
                {
                    "name": "RAJESH KUMAR SHARMA",
                    "firstname": "RAJESH",
                    "dob": "15-01-1990",
                    "aadhar": "234567890123",
                    "addresses": [
                        {
                            "addressline": "123 MG Road, Koramangala",
                            "city": "Bangalore",
                            "state": "Karnataka",
                            "zipcode": "560034"
                        },
                        {
                            "addressline": "45 Brigade Road",
                            "city": "Bangalore",
                            "state": "Karnataka",
                            "zipcode": "560025"
                        }
                    ],
                    "phones": ["9876543210", "9123456789"],
                    "emails": ["rajesh@example.com"]
                },
                # ── Flat format (single address/phone/email) ──
                {
                    "NAME": "RAJESH KUMAR SHARMA",
                    "BIRTHDATE": "15-01-1990",
                    "AADHAR": "234567890123",
                    "ADDRESSLINE": "123 MG Road, Koramangala",
                    "CITY": "Bangalore",
                    "STATE": "Karnataka",
                    "ZIPCODE": "560034",
                    "PHONE": "9876543210",
                    "EMAIL": "rajesh@example.com"
                }
            ]
        }
    }
class MatchRequest(BaseModel):
    """Request body for matching two entity records."""

    # Both records are required; each is validated as an EntityRecord.
    record1: EntityRecord = Field(..., description="First entity record")
    record2: EntityRecord = Field(..., description="Second entity record")

    # Optional; falls back to the only mode currently defined.
    mode: MatchingMode = Field(
        MatchingMode.EMBEDDING,
        description="Matching mode: 'embedding'",
    )

    # Example payloads published with the JSON schema.
    model_config = {
        "json_schema_extra": {
            "examples": [
                # Example 1 — nested format with several addresses and phones.
                # The records mix the upper-case "NAME" key with lower-case keys.
                {
                    "mode": "embedding",
                    "record1": {
                        "NAME": "RAJESH KUMAR SHARMA",
                        "dob": "15-01-1990",
                        "phones": ["9876543210", "9123456789"],
                        "emails": ["rajesh@example.com"],
                        "addresses": [
                            {
                                "addressline": "123 MG Road",
                                "city": "Bangalore",
                                "state": "Karnataka",
                                "zipcode": "560034",
                            },
                            {
                                "addressline": "45 Brigade Road",
                                "city": "Bangalore",
                                "state": "Karnataka",
                                "zipcode": "560025",
                            },
                        ],
                    },
                    "record2": {
                        "NAME": "RAJESH K SHARMA",
                        "dob": "15/01/1990",
                        "phones": ["9876543210"],
                        "emails": ["rajesh@example.com"],
                        "addresses": [
                            {
                                "addressline": "123 Mahatma Gandhi Rd",
                                "city": "Bengaluru",
                                "state": "KA",
                                "zipcode": "560034",
                            },
                            {
                                "addressline": "45 Brigade Road",
                                "city": "Bangalore",
                                "state": "Karnataka",
                                "zipcode": "560025",
                            },
                        ],
                    },
                },
                # Example 2 — flat upper-case format; unused fields carry the
                # "missing value" placeholder.
                {
                    "mode": "embedding",
                    "record1": {
                        "GENDER": "missing value",
                        "NAME": "RAJESH KUMAR SHARMA",
                        "FIRSTNAME": "missing value",
                        "MIDDLENAME": "missing value",
                        "LASTNAME": "missing value",
                        "SPOUSENAME": "missing value",
                        "MOTHERNAME": "missing value",
                        "FATHERNAME": "missing value",
                        "COMPANYNAME": "missing value",
                        "PARENTCOMPANYNAME": "missing value",
                        "AADHAR": "missing value",
                        "PAN": "missing value",
                        "LICENSEID": "missing value",
                        "PASSPORTID": "missing value",
                        "VOTERID": "missing value",
                        "ADDRESSLINE": "123 MG Road",
                        "BIRTHDATE": "15-01-1990",
                        "PHONE": "9876543210",
                        "EMAIL": "missing value",
                        "CITY": "Bangalore",
                        "STATE": "Karnataka",
                        "ZIPCODE": "560034",
                    },
                    "record2": {
                        "GENDER": "missing value",
                        "NAME": "RAJESH K SHARMA",
                        "FIRSTNAME": "missing value",
                        "MIDDLENAME": "missing value",
                        "LASTNAME": "missing value",
                        "SPOUSENAME": "missing value",
                        "MOTHERNAME": "missing value",
                        "FATHERNAME": "missing value",
                        "COMPANYNAME": "missing value",
                        "PARENTCOMPANYNAME": "missing value",
                        "AADHAR": "missing value",
                        "PAN": "missing value",
                        "LICENSEID": "missing value",
                        "PASSPORTID": "missing value",
                        "VOTERID": "missing value",
                        "ADDRESSLINE": "123 Mahatma Gandhi Rd",
                        "BIRTHDATE": "15/01/1990",
                        "PHONE": "9876543210",
                        "EMAIL": "missing value",
                        "CITY": "Bengaluru",
                        "STATE": "KA",
                        "ZIPCODE": "560034",
                    },
                },
            ]
        }
    }
class BatchMatchRequest(BaseModel):
    """Request body for batch matching (load testing)."""

    # Required list; validation rejects an empty list or more than 100 pairs.
    pairs: List[MatchRequest] = Field(
        ...,
        min_length=1,
        max_length=100,
        description="List of record pairs to match",
    )
# =========================================================
# RESPONSE MODELS
# =========================================================
class FieldScore(BaseModel):
    """Individual field matching result."""

    # Name of the field this score belongs to (required).
    field: str
    # Required; the annotation admits either a float or a string.
    score: Union[float, str] = Field(
        description="Numeric score (0-100) in embedding mode",
    )
class MatchResult(BaseModel):
    """Result of matching two entity records."""

    # All four fields are required (no defaults).
    overall_decision: str = Field(description="'Match' or 'No Match'")
    reason: str = Field(
        description="Human-readable explanation of the matching decision",
    )
    # Keyed by field name; each value may be a float or a string.
    field_scores: Dict[str, Union[float, str]] = Field(
        description="Per-field matching scores. Embedding: numeric 0-100.",
    )
    mode: str = Field(description="Matching mode used: 'embedding'")
class MatchResponse(BaseModel):
    """API response for a single match request."""

    model_config = {"populate_by_name": True}

    success: bool = True
    # Both default to None, so either may be omitted.
    result: Optional[MatchResult] = None
    error: Optional[str] = None
    # Required: no default is provided.
    processing_time_ms: float = Field(
        description="Time taken to process this match in milliseconds",
    )
class BatchMatchResponse(BaseModel):
    """API response for batch matching."""

    model_config = {"populate_by_name": True}

    success: bool = True

    # Counters — all required.
    total: int = Field(description="Total number of pairs submitted")
    completed: int = Field(description="Number of pairs successfully matched")
    failed: int = Field(description="Number of pairs that failed")

    # One MatchResponse per entry.
    results: List[MatchResponse] = Field(description="Individual match results")
    total_processing_time_ms: float = Field(
        description="Total processing time in milliseconds",
    )
class HealthResponse(BaseModel):
    """Health check response."""

    model_config = {"populate_by_name": True}

    status: str = Field(description="'healthy' or 'unhealthy'")
    # Defaults to "8.0" when the caller does not set it.
    version: str = Field(default="8.0", description="API version")
    # Required mapping of component name -> status string.
    components: Dict[str, str] = Field(
        description="Health status of individual components (csv_data, embedding_models)",
    )
class ErrorResponse(BaseModel):
    """Standard error response."""

    # Defaults to False.
    success: bool = False
    # Required error message.
    error: str
    # Optional extra detail; None when omitted.
    detail: Optional[str] = None