"""Pydantic request/response schemas for the entity-matching API."""

from enum import Enum
from typing import Any, Dict, List, Optional, Union

from pydantic import BaseModel, Field, field_validator, model_validator


# =========================================================
# ENUM
# =========================================================

class MatchingMode(str, Enum):
    """Supported matching modes."""

    EMBEDDING = "embedding"


# =========================================================
# CONSTANTS
# =========================================================

# Lower-cased sentinel strings that callers send to mean "no value".
MISSING_PLACEHOLDERS = {"missing value", "missing", "na", "n/a", "null", "none", "-"}

# ---------------------------------------------------------------------------
# Flat-format key → EntityRecord field name mapping.
# To support a new flat key in future, just add it here.
# Targets that start with "_" and end with "_flat" are internal markers: they
# are collected into the phones / emails / addresses lists instead of being
# copied to a scalar field.
# ---------------------------------------------------------------------------
_FLAT_KEY_MAP: Dict[str, str] = {
    # Personal identifiers
    "GENDER": "gender",
    "NAME": "name",
    "FIRSTNAME": "firstname",
    "MIDDLENAME": "middlename",
    "LASTNAME": "lastname",
    "SPOUSENAME": "spousename",
    "MOTHERNAME": "mothername",
    "FATHERNAME": "fathername",
    "COMPANYNAME": "companyname",
    "PARENTCOMPANYNAME": "parentcompanyname",
    # ID documents
    "AADHAR": "aadhar",
    "PAN": "pan",
    "LICENSEID": "licenseid",
    "PASSPORTID": "passportid",
    "VOTERID": "voterid",
    # DOB
    "BIRTHDATE": "dob",
    "DOB": "dob",
    # Contact — collected into lists
    "PHONE": "_phone_flat",
    "EMAIL": "_email_flat",
    # Address components — collected into addresses[0]
    "ADDRESSLINE": "_addressline_flat",
    "CITY": "_city_flat",
    "STATE": "_state_flat",
    "ZIPCODE": "_zipcode_flat",
}

# Internal address marker → AddressRecord field name.
_FLAT_ADDRESS_FIELD_NAMES: Dict[str, str] = {
    "_addressline_flat": "addressline",
    "_city_flat": "city",
    "_state_flat": "state",
    "_zipcode_flat": "zipcode",
}

_FLAT_ADDRESS_KEYS = set(_FLAT_ADDRESS_FIELD_NAMES)

# Presence of any of these keys marks a payload as already nested.
_NESTED_MARKER_KEYS = ("addresses", "phones", "emails")

# Scalar string fields of EntityRecord that get placeholder cleanup.
_SCALAR_TEXT_FIELDS = (
    "name", "firstname", "middlename", "lastname",
    "mothername", "fathername", "spousename", "othername",
    "dob", "gender",
    "aadhar", "pan", "licenseid", "passportid", "voterid",
    "companyname", "parentcompanyname",
)


def _is_placeholder(val: Any) -> bool:
    """Return True if *val* is None or a known missing/placeholder sentinel.

    The comparison is done on ``str(val)``, stripped and lower-cased, against
    ``MISSING_PLACEHOLDERS``.
    """
    if val is None:
        return True
    return str(val).strip().lower() in MISSING_PLACEHOLDERS


def _clean_string_list(items: List[Any], *, lowercase: bool = False) -> List[str]:
    """Stringify, strip and de-duplicate *items*, dropping blanks and placeholders.

    Order of first occurrence is preserved. With ``lowercase=True`` every entry
    is lower-cased before de-duplication (so de-duplication is
    case-insensitive); otherwise the original casing is kept and
    de-duplication is case-sensitive.
    """
    seen = set()
    cleaned: List[str] = []
    for item in items:
        text = str(item).strip()
        if lowercase:
            text = text.lower()
        if not text or text.lower() in MISSING_PLACEHOLDERS or text in seen:
            continue
        seen.add(text)
        cleaned.append(text)
    return cleaned


def _normalize_flat_to_nested(data: Dict[str, Any]) -> Dict[str, Any]:
    """Convert a flat-format record into the nested EntityRecord format.

    * If *data* contains an ``addresses``, ``phones`` or ``emails`` key
      (exact, lower-case) it is considered nested and returned unchanged.
      NOTE: on this fast path flat-only keys such as ``BIRTHDATE`` or
      ``ADDRESSLINE`` are NOT remapped.
    * Otherwise, if at least one key (compared case-insensitively) appears in
      ``_FLAT_KEY_MAP``, a new dict is built: scalar keys are renamed,
      ``PHONE`` / ``EMAIL`` become one-element lists, and the address parts
      are gathered into a single entry under ``addresses``. Mapped keys whose
      value is a placeholder (see ``_is_placeholder``) are dropped.
    * If no key is recognised, *data* is returned unchanged and Pydantic
      handles it.

    Returns:
        Either *data* itself (unchanged) or a new nested dict.
    """
    # Fast-path: already nested
    if any(marker in data for marker in _NESTED_MARKER_KEYS):
        return data

    # Non-string keys cannot be flat keys; skip them instead of crashing on
    # ``.upper()`` so that Pydantic reports them itself.
    upper_keys = {key.upper() for key in data if isinstance(key, str)}
    if upper_keys.isdisjoint(_FLAT_KEY_MAP):
        return data  # Unrecognized — pass through and let Pydantic handle

    # ---- Convert flat → nested -------------------------------------------
    nested: Dict[str, Any] = {}
    address_parts: Dict[str, str] = {}
    phones: List[str] = []
    emails: List[str] = []

    for raw_key, raw_val in data.items():
        target = (
            _FLAT_KEY_MAP.get(raw_key.upper()) if isinstance(raw_key, str) else None
        )

        if target is None:
            # Unknown key — copied through verbatim. It only takes effect if
            # it matches an EntityRecord field name or alias (for example
            # ``custom_fields`` or ``OTHERNAME``); it is not moved into
            # custom_fields automatically.
            nested[raw_key] = raw_val
            continue

        if _is_placeholder(raw_val):
            continue

        if target == "_phone_flat":
            phones.append(str(raw_val).strip())
        elif target == "_email_flat":
            emails.append(str(raw_val).strip())
        elif target in _FLAT_ADDRESS_FIELD_NAMES:
            address_parts[_FLAT_ADDRESS_FIELD_NAMES[target]] = str(raw_val).strip()
        else:
            nested[target] = raw_val

    # Only emit the list keys that actually received a value.
    if address_parts:
        nested["addresses"] = [address_parts]
    if phones:
        nested["phones"] = phones
    if emails:
        nested["emails"] = emails

    return nested


# =========================================================
# REQUEST MODELS
# =========================================================

class AddressRecord(BaseModel):
    """A single address entry."""

    addressline: str = Field(default="", description="Street address")
    city: str = Field(default="", description="City name")
    state: str = Field(default="", description="State name")
    zipcode: str = Field(default="", description="6-digit postal code (pincode)")

    @model_validator(mode="before")
    @classmethod
    def strip_address_placeholders(cls, values: Any) -> Any:
        """Replace placeholder values (including None) with an empty string.

        Applies to every key of a dict input; non-dict input is returned
        untouched for Pydantic to validate.
        """
        if not isinstance(values, dict):
            return values
        return {
            key: ("" if _is_placeholder(val) else val)
            for key, val in values.items()
        }

    def is_empty(self) -> bool:
        """Return True when every field is blank — used to filter ghost entries."""
        return not any([self.addressline, self.city, self.state, self.zipcode])


class EntityRecord(BaseModel):
    """
    A single entity record with all possible fields.
    All fields are optional — only provided fields are matched.

    ── Multi-value fields ──────────────────────────────────────────
    addresses : List[AddressRecord]
        Send as many addresses as needed. Duplicates and all-blank entries
        are removed automatically. Matching uses best-of-N across all
        address combinations (handled by get_dynamic_fields +
        embedding_match_addresses in matching_service.py — no service
        changes needed).

    phones : List[str]
        Send as many phone numbers as needed. Duplicates and placeholder
        strings are removed automatically. Matching uses
        compare_phone_any_match (any-match across all phones).

    emails : List[str]
        Same as phones, uses compare_email_any_match. Emails are
        lower-cased during cleanup.

    ── Input formats ───────────────────────────────────────────────
    Accepts BOTH nested format and flat uppercase-key format.
    Flat keys are transparently converted to nested via handle_flat_format.
    """

    # ---- Name fields -------------------------------------------------------
    name: str = Field(default="", description="Full name")
    firstname: str = Field(default="", description="First name")
    middlename: str = Field(default="", description="Middle name")
    lastname: str = Field(default="", description="Last name")

    # ---- Related person names ----------------------------------------------
    mothername: str = Field(default="", description="Mother's name")
    fathername: str = Field(default="", description="Father's name")
    spousename: str = Field(default="", description="Spouse's name")
    othername: str = Field(default="", description="Other/alias name")

    # ---- Personal info -----------------------------------------------------
    dob: str = Field(default="", description="Date of birth (various formats accepted)")
    gender: str = Field(default="", description="Gender (M/F/Male/Female/Other)")

    # ---- Identity documents ------------------------------------------------
    aadhar: str = Field(default="", alias="AADHAR", description="Aadhar number (12 digits)")
    pan: str = Field(default="", description="PAN number (AAAAA9999A)")
    licenseid: str = Field(default="", description="Driving license number")
    passportid: str = Field(default="", description="Passport number")
    voterid: str = Field(default="", description="Voter ID")

    # ---- Addresses — N entries supported -----------------------------------
    addresses: List[AddressRecord] = Field(
        default_factory=list,
        description=(
            "List of addresses. Send any number — duplicates and blank entries "
            "are removed. Matching uses best-of-N across all combinations."
        )
    )

    # ---- Contact — N entries supported -------------------------------------
    phones: List[str] = Field(
        default_factory=list,
        description=(
            "List of phone numbers. Send any number — duplicates and placeholders "
            "are removed. Matching uses any-match (match if any pair matches)."
        )
    )
    emails: List[str] = Field(
        default_factory=list,
        description=(
            "List of email addresses. Send any number — duplicates and placeholders "
            "are removed. Matching uses any-match."
        )
    )

    # ---- Employment --------------------------------------------------------
    companyname: str = Field(default="", description="Company/employer name")
    parentcompanyname: str = Field(default="", description="Parent company name")

    # ---- Custom fields -----------------------------------------------------
    custom_fields: Dict[str, str] = Field(
        default_factory=dict,
        description="Arbitrary key-value pairs for exact matching (e.g. MemberID, AccountNumber)"
    )

    # ── model_validator: runs BEFORE individual field validators ──────────
    @model_validator(mode="before")
    @classmethod
    def handle_flat_format(cls, values: Any) -> Any:
        """
        Transparently convert flat-format records (uppercase keys like
        ADDRESSLINE, BIRTHDATE, PHONE …) into the nested format.
        Already-nested data and non-dict input are returned unchanged.
        """
        if isinstance(values, dict):
            return _normalize_flat_to_nested(values)
        return values

    # ── Scalar field placeholder cleanup ─────────────────────────────────
    @field_validator(*_SCALAR_TEXT_FIELDS, mode="before")
    @classmethod
    def strip_missing_placeholders(cls, v: Any) -> Any:
        """Convert placeholder values → empty string.

        Delegates to ``_is_placeholder`` so scalar fields agree with the flat
        path and with AddressRecord: an explicit None (JSON null) counts as
        "missing" rather than failing string validation. Any other value is
        returned unchanged for Pydantic to validate.
        """
        return "" if _is_placeholder(v) else v

    # ── phones: deduplicate + strip placeholders ─────────────────────────
    @field_validator("phones", mode="before")
    @classmethod
    def clean_phones(cls, v: Any) -> Any:
        """Strip, de-duplicate and drop blank/placeholder phone entries.

        Non-list input is returned unchanged for Pydantic to validate.
        """
        if not isinstance(v, list):
            return v
        return _clean_string_list(v)

    # ── emails: deduplicate + strip placeholders ─────────────────────────
    @field_validator("emails", mode="before")
    @classmethod
    def clean_emails(cls, v: Any) -> Any:
        """Lower-case, strip, de-duplicate and drop blank/placeholder emails.

        Non-list input is returned unchanged for Pydantic to validate.
        """
        if not isinstance(v, list):
            return v
        return _clean_string_list(v, lowercase=True)

    # ── addresses: remove empty entries + deduplicate ────────────────────
    @field_validator("addresses", mode="after")
    @classmethod
    def clean_addresses(cls, v: List[AddressRecord]) -> List[AddressRecord]:
        """
        Remove all-blank address entries and deduplicate by
        (addressline, city, state, zipcode) tuple.
        This prevents ghost entries from inflating match scores.

        The comparison key is stripped (and lower-cased, except zipcode);
        the first occurrence is kept with its original values.
        """
        seen = set()
        unique: List[AddressRecord] = []
        for addr in v:
            if addr.is_empty():
                continue
            key = (
                addr.addressline.strip().lower(),
                addr.city.strip().lower(),
                addr.state.strip().lower(),
                addr.zipcode.strip(),
            )
            if key in seen:
                continue
            seen.add(key)
            unique.append(addr)
        return unique

    model_config = {
        # Fields may be populated by name or by their upper-case alias.
        "populate_by_name": True,
        "alias_generator": str.upper,
        "json_schema_extra": {
            "examples": [
                # ── Nested format: multiple addresses + phones ──
                {
                    "name": "RAJESH KUMAR SHARMA",
                    "firstname": "RAJESH",
                    "dob": "15-01-1990",
                    "aadhar": "234567890123",
                    "addresses": [
                        {
                            "addressline": "123 MG Road, Koramangala",
                            "city": "Bangalore",
                            "state": "Karnataka",
                            "zipcode": "560034"
                        },
                        {
                            "addressline": "45 Brigade Road",
                            "city": "Bangalore",
                            "state": "Karnataka",
                            "zipcode": "560025"
                        }
                    ],
                    "phones": ["9876543210", "9123456789"],
                    "emails": ["rajesh@example.com"]
                },
                # ── Flat format (single address/phone/email) ──
                {
                    "NAME": "RAJESH KUMAR SHARMA",
                    "BIRTHDATE": "15-01-1990",
                    "AADHAR": "234567890123",
                    "ADDRESSLINE": "123 MG Road, Koramangala",
                    "CITY": "Bangalore",
                    "STATE": "Karnataka",
                    "ZIPCODE": "560034",
                    "PHONE": "9876543210",
                    "EMAIL": "rajesh@example.com"
                }
            ]
        }
    }


class MatchRequest(BaseModel):
    """Request body for matching two entity records."""

    record1: EntityRecord = Field(..., description="First entity record")
    record2: EntityRecord = Field(..., description="Second entity record")
    mode: MatchingMode = Field(
        default=MatchingMode.EMBEDDING,
        description="Matching mode: 'embedding'"
    )

    model_config = {
        "json_schema_extra": {
            "examples": [
                # ── Example 1: Multiple addresses + phones (nested) ──────────────
                {
                    "mode": "embedding",
                    "record1": {
                        "NAME": "RAJESH KUMAR SHARMA",
                        "dob": "15-01-1990",
                        "phones": ["9876543210", "9123456789"],
                        "emails": ["rajesh@example.com"],
                        "addresses": [
                            {
                                "addressline": "123 MG Road",
                                "city": "Bangalore",
                                "state": "Karnataka",
                                "zipcode": "560034"
                            },
                            {
                                "addressline": "45 Brigade Road",
                                "city": "Bangalore",
                                "state": "Karnataka",
                                "zipcode": "560025"
                            }
                        ]
                    },
                    "record2": {
                        "NAME": "RAJESH K SHARMA",
                        "dob": "15/01/1990",
                        "phones": ["9876543210"],
                        "emails": ["rajesh@example.com"],
                        "addresses": [
                            {
                                "addressline": "123 Mahatma Gandhi Rd",
                                "city": "Bengaluru",
                                "state": "KA",
                                "zipcode": "560034"
                            },
                            {
                                "addressline": "45 Brigade Road",
                                "city": "Bangalore",
                                "state": "Karnataka",
                                "zipcode": "560025"
                            }
                        ]
                    }
                },
                # ── Example 2: Flat format ───────────────────────────────────────
                {
                    "mode": "embedding",
                    "record1": {
                        "GENDER": "missing value",
                        "NAME": "RAJESH KUMAR SHARMA",
                        "FIRSTNAME": "missing value",
                        "MIDDLENAME": "missing value",
                        "LASTNAME": "missing value",
                        "SPOUSENAME": "missing value",
                        "MOTHERNAME": "missing value",
                        "FATHERNAME": "missing value",
                        "COMPANYNAME": "missing value",
                        "PARENTCOMPANYNAME": "missing value",
                        "AADHAR": "missing value",
                        "PAN": "missing value",
                        "LICENSEID": "missing value",
                        "PASSPORTID": "missing value",
                        "VOTERID": "missing value",
                        "ADDRESSLINE": "123 MG Road",
                        "BIRTHDATE": "15-01-1990",
                        "PHONE": "9876543210",
                        "EMAIL": "missing value",
                        "CITY": "Bangalore",
                        "STATE": "Karnataka",
                        "ZIPCODE": "560034"
                    },
                    "record2": {
                        "GENDER": "missing value",
                        "NAME": "RAJESH K SHARMA",
                        "FIRSTNAME": "missing value",
                        "MIDDLENAME": "missing value",
                        "LASTNAME": "missing value",
                        "SPOUSENAME": "missing value",
                        "MOTHERNAME": "missing value",
                        "FATHERNAME": "missing value",
                        "COMPANYNAME": "missing value",
                        "PARENTCOMPANYNAME": "missing value",
                        "AADHAR": "missing value",
                        "PAN": "missing value",
                        "LICENSEID": "missing value",
                        "PASSPORTID": "missing value",
                        "VOTERID": "missing value",
                        "ADDRESSLINE": "123 Mahatma Gandhi Rd",
                        "BIRTHDATE": "15/01/1990",
                        "PHONE": "9876543210",
                        "EMAIL": "missing value",
                        "CITY": "Bengaluru",
                        "STATE": "KA",
                        "ZIPCODE": "560034"
                    }
                }
            ]
        }
    }


class BatchMatchRequest(BaseModel):
    """Request body for batch matching (load testing)."""

    # Between 1 and 100 pairs per request.
    pairs: List[MatchRequest] = Field(
        ...,
        description="List of record pairs to match",
        min_length=1,
        max_length=100
    )


# =========================================================
# RESPONSE MODELS
# =========================================================

class FieldScore(BaseModel):
    """Individual field matching result."""

    field: str
    score: Union[float, str] = Field(
        description="Numeric score (0-100) in embedding mode"
    )


class MatchResult(BaseModel):
    """Result of matching two entity records."""

    overall_decision: str = Field(description="'Match' or 'No Match'")
    reason: str = Field(description="Human-readable explanation of the matching decision")
    field_scores: Dict[str, Union[float, str]] = Field(
        description="Per-field matching scores. Embedding: numeric 0-100."
    )
    mode: str = Field(description="Matching mode used: 'embedding'")


class MatchResponse(BaseModel):
    """API response for a single match request."""

    success: bool = True
    result: Optional[MatchResult] = None
    error: Optional[str] = None
    processing_time_ms: float = Field(description="Time taken to process this match in milliseconds")

    model_config = {"populate_by_name": True}


class BatchMatchResponse(BaseModel):
    """API response for batch matching."""

    success: bool = True
    total: int = Field(description="Total number of pairs submitted")
    completed: int = Field(description="Number of pairs successfully matched")
    failed: int = Field(description="Number of pairs that failed")
    results: List[MatchResponse] = Field(description="Individual match results")
    total_processing_time_ms: float = Field(description="Total processing time in milliseconds")

    model_config = {"populate_by_name": True}


class HealthResponse(BaseModel):
    """Health check response."""

    status: str = Field(description="'healthy' or 'unhealthy'")
    version: str = Field(default="8.0", description="API version")
    components: Dict[str, str] = Field(
        description="Health status of individual components (csv_data, embedding_models)"
    )

    model_config = {"populate_by_name": True}


class ErrorResponse(BaseModel):
    """Standard error response."""

    success: bool = False
    error: str
    detail: Optional[str] = None