| from pydantic import BaseModel, Field, field_validator, model_validator |
| from typing import Dict, List, Optional, Any, Union |
| from enum import Enum |
|
|
|
|
| |
| |
| |
class MatchingMode(str, Enum):
    """Matching strategies accepted by the API.

    The enum mixes in ``str``, so each member compares equal to its plain
    string value (``MatchingMode.EMBEDDING == "embedding"``).
    """

    # Currently the only supported strategy.
    EMBEDDING = "embedding"
| |
|
|
| |
| |
| |
# Sentinel strings treated as "no value supplied". Entries are lower-case;
# callers compare against stripped, lower-cased input.
MISSING_PLACEHOLDERS = {
    "missing value",
    "missing",
    "na",
    "n/a",
    "null",
    "none",
    "-",
}
|
|
| |
| |
| |
| |
| _FLAT_KEY_MAP: Dict[str, str] = { |
| |
| "GENDER": "gender", |
| "NAME": "name", |
| "FIRSTNAME": "firstname", |
| "MIDDLENAME": "middlename", |
| "LASTNAME": "lastname", |
| "SPOUSENAME": "spousename", |
| "MOTHERNAME": "mothername", |
| "FATHERNAME": "fathername", |
| "COMPANYNAME": "companyname", |
| "PARENTCOMPANYNAME": "parentcompanyname", |
| |
| "AADHAR": "aadhar", |
| "PAN": "pan", |
| "LICENSEID": "licenseid", |
| "PASSPORTID": "passportid", |
| "VOTERID": "voterid", |
| |
| "BIRTHDATE": "dob", |
| "DOB": "dob", |
| |
| "PHONE": "_phone_flat", |
| "EMAIL": "_email_flat", |
| |
| "ADDRESSLINE": "_addressline_flat", |
| "CITY": "_city_flat", |
| "STATE": "_state_flat", |
| "ZIPCODE": "_zipcode_flat", |
| } |
|
|
| _FLAT_ADDRESS_KEYS = {"_addressline_flat", "_city_flat", "_state_flat", "_zipcode_flat"} |
|
|
|
|
def _is_placeholder(val: Any) -> bool:
    """Tell whether *val* stands for "no value".

    ``None`` always counts. Any other value is converted with ``str()``,
    stripped and lower-cased, then looked up in ``MISSING_PLACEHOLDERS``.
    """
    return val is None or str(val).strip().lower() in MISSING_PLACEHOLDERS
|
|
|
|
def _normalize_flat_to_nested(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Detect whether *data* is in flat format (uppercase keys like ADDRESSLINE,
    BIRTHDATE …) and, if so, convert it to the nested EntityRecord format.

    If data already looks nested (has 'addresses' / 'phones' / 'emails' keys)
    it is returned unchanged — this is the fast-path for the nested format
    that supports multiple addresses/phones/emails.

    Conversion rules for flat input:
      * keys are matched case-insensitively against ``_FLAT_KEY_MAP``;
      * keys that are not in the map (or are not strings) are copied through
        untouched;
      * placeholder values and blank strings are dropped, so an empty alias
        (e.g. ``"DOB": ""``) can never overwrite a real value supplied under
        another spelling (e.g. ``"BIRTHDATE"``);
      * PHONE / EMAIL values are collected into ``phones`` / ``emails`` lists;
      * ADDRESSLINE / CITY / STATE / ZIPCODE are merged into one entry of
        ``addresses``.

    Returns:
        The original dict when no conversion is needed, otherwise a new dict
        in nested form. The input dict is never mutated.
    """
    # Fast path: explicit nested keys mean the caller already uses the
    # multi-value format.
    if "addresses" in data or "phones" in data or "emails" in data:
        return data

    # Flat format is recognised by at least one known key, ignoring case.
    # Non-string keys cannot be flat keys and are skipped instead of crashing.
    upper_keys = {key.upper() for key in data if isinstance(key, str)}
    if upper_keys.isdisjoint(_FLAT_KEY_MAP):
        return data

    nested: Dict[str, Any] = {}
    address_parts: Dict[str, str] = {}
    phones: List[str] = []
    emails: List[str] = []

    for raw_key, raw_val in data.items():
        target = (
            _FLAT_KEY_MAP.get(raw_key.upper())
            if isinstance(raw_key, str)
            else None
        )

        if target is None:
            # Unknown key: pass through and let the model decide what to do.
            nested[raw_key] = raw_val
            continue

        # Missing values carry no information. Blank strings are treated the
        # same way; the model's defaults ("" / []) yield the identical result,
        # and skipping them prevents clobbering a real value for the same
        # target.
        if _is_placeholder(raw_val):
            continue
        if isinstance(raw_val, str) and not raw_val.strip():
            continue

        if target == "_phone_flat":
            phones.append(str(raw_val).strip())
        elif target == "_email_flat":
            emails.append(str(raw_val).strip())
        elif target in _FLAT_ADDRESS_KEYS:
            # "_city_flat" -> "city", matching the AddressRecord field names.
            addr_key = target.replace("_flat", "").lstrip("_")
            address_parts[addr_key] = str(raw_val).strip()
        else:
            nested[target] = raw_val

    # Only emit the multi-value keys when something was actually collected.
    if address_parts:
        nested["addresses"] = [address_parts]
    if phones:
        nested["phones"] = phones
    if emails:
        nested["emails"] = emails

    return nested
|
|
|
|
| |
| |
| |
class AddressRecord(BaseModel):
    """A single address entry. Every component is optional and defaults to ""."""

    addressline: str = Field(default="", description="Street address")
    city: str = Field(default="", description="City name")
    state: str = Field(default="", description="State name")
    zipcode: str = Field(default="", description="6-digit postal code (pincode)")

    @model_validator(mode="before")
    @classmethod
    def strip_address_placeholders(cls, values: Any) -> Any:
        """Replace placeholder values in address fields with an empty string.

        Only dict input is rewritten; anything else is handed back unchanged
        for normal validation.
        """
        if not isinstance(values, dict):
            return values
        return {
            key: ("" if _is_placeholder(val) else val)
            for key, val in values.items()
        }

    def is_empty(self) -> bool:
        """Return True when every field is blank — used to filter ghost entries.

        Whitespace-only values count as blank, so an address such as
        ``{"city": "   "}`` is reported as empty.
        """
        parts = (self.addressline, self.city, self.state, self.zipcode)
        return not any(part.strip() for part in parts)
|
|
|
|
class EntityRecord(BaseModel):
    """
    A single entity record with all possible fields.
    All fields are optional — only provided fields are matched.

    Multi-value fields
    ------------------
    addresses : List[AddressRecord]
        Send as many addresses as needed.
        Duplicates and all-blank entries are removed automatically.
        Matching uses best-of-N across all address combinations
        (handled by get_dynamic_fields + embedding_match_addresses
        in matching_service.py — no service changes needed).

    phones : List[str]
        Send as many phone numbers as needed.
        Duplicates and placeholder strings are removed automatically.
        Matching uses compare_phone_any_match (any-match across all phones).

    emails : List[str]
        Same as phones, uses compare_email_any_match.

    Input formats
    -------------
    Accepts BOTH nested format and flat uppercase-key format.
    Flat keys are transparently converted to nested via handle_flat_format.
    """

    # Person names
    name: str = Field(default="", description="Full name")
    firstname: str = Field(default="", description="First name")
    middlename: str = Field(default="", description="Middle name")
    lastname: str = Field(default="", description="Last name")

    # Related-person names
    mothername: str = Field(default="", description="Mother's name")
    fathername: str = Field(default="", description="Father's name")
    spousename: str = Field(default="", description="Spouse's name")
    othername: str = Field(default="", description="Other/alias name")

    # Demographics
    dob: str = Field(default="", description="Date of birth (various formats accepted)")
    gender: str = Field(default="", description="Gender (M/F/Male/Female/Other)")

    # Identity documents
    aadhar: str = Field(default="", alias="AADHAR", description="Aadhar number (12 digits)")
    pan: str = Field(default="", description="PAN number (AAAAA9999A)")
    licenseid: str = Field(default="", description="Driving license number")
    passportid: str = Field(default="", description="Passport number")
    voterid: str = Field(default="", description="Voter ID")

    # Addresses (multi-value)
    addresses: List[AddressRecord] = Field(
        default_factory=list,
        description=(
            "List of addresses. Send any number — duplicates and blank entries "
            "are removed. Matching uses best-of-N across all combinations."
        )
    )

    # Contact details (multi-value)
    phones: List[str] = Field(
        default_factory=list,
        description=(
            "List of phone numbers. Send any number — duplicates and placeholders "
            "are removed. Matching uses any-match (match if any pair matches)."
        )
    )
    emails: List[str] = Field(
        default_factory=list,
        description=(
            "List of email addresses. Send any number — duplicates and placeholders "
            "are removed. Matching uses any-match."
        )
    )

    # Organisation
    companyname: str = Field(default="", description="Company/employer name")
    parentcompanyname: str = Field(default="", description="Parent company name")

    # Free-form extras
    custom_fields: Dict[str, str] = Field(
        default_factory=dict,
        description="Arbitrary key-value pairs for exact matching (e.g. MemberID, AccountNumber)"
    )

    @model_validator(mode="before")
    @classmethod
    def handle_flat_format(cls, values: Any) -> Any:
        """
        Transparently convert flat-format records (uppercase keys like
        ADDRESSLINE, BIRTHDATE, PHONE …) into the nested format.
        Already-nested data is returned unchanged.
        """
        if isinstance(values, dict):
            return _normalize_flat_to_nested(values)
        return values

    @field_validator(
        "name", "firstname", "middlename", "lastname",
        "mothername", "fathername", "spousename", "othername",
        "dob", "gender", "aadhar", "pan", "licenseid",
        "passportid", "voterid", "companyname", "parentcompanyname",
        mode="before"
    )
    @classmethod
    def strip_missing_placeholders(cls, v):
        """Convert placeholder values → empty string.

        Uses the shared ``_is_placeholder`` helper so that an explicit
        ``None`` is treated as missing here too, exactly as it is for
        address fields and flat-format input.
        """
        if _is_placeholder(v):
            return ""
        return v

    @field_validator("phones", mode="before")
    @classmethod
    def clean_phones(cls, v):
        """Strip, drop blanks/placeholders, and de-duplicate phone numbers.

        ``None`` is treated as "no phones". Any other non-list input is
        returned untouched so that normal validation reports the type error.
        Order of first appearance is preserved.
        """
        if v is None:
            return []
        if not isinstance(v, list):
            return v
        seen, result = set(), []
        for item in v:
            s = str(item).strip()
            if s and s.lower() not in MISSING_PLACEHOLDERS and s not in seen:
                seen.add(s)
                result.append(s)
        return result

    @field_validator("emails", mode="before")
    @classmethod
    def clean_emails(cls, v):
        """Lower-case, strip, drop blanks/placeholders, and de-duplicate emails.

        ``None`` is treated as "no emails". Any other non-list input is
        returned untouched so that normal validation reports the type error.
        Order of first appearance is preserved.
        """
        if v is None:
            return []
        if not isinstance(v, list):
            return v
        seen, result = set(), []
        for item in v:
            # Lower-casing happens up front, so the comparison and the
            # stored value are both case-normalised.
            s = str(item).strip().lower()
            if s and s not in MISSING_PLACEHOLDERS and s not in seen:
                seen.add(s)
                result.append(s)
        return result

    @field_validator("addresses", mode="after")
    @classmethod
    def clean_addresses(cls, v: List[AddressRecord]) -> List[AddressRecord]:
        """
        Remove all-blank address entries and deduplicate by
        (addressline, city, state, zipcode) tuple.
        This prevents ghost entries from inflating match scores.
        """
        seen, result = set(), []
        for addr in v:
            key = (
                addr.addressline.strip().lower(),
                addr.city.strip().lower(),
                addr.state.strip().lower(),
                addr.zipcode.strip(),
            )
            # An entry whose normalised components are all empty is a ghost
            # entry; this also catches whitespace-only values.
            if not any(key):
                continue
            if key not in seen:
                seen.add(key)
                result.append(addr)
        return result

    model_config = {
        "populate_by_name": True,
        "alias_generator": str.upper,
        "json_schema_extra": {
            "examples": [
                # Nested format with multiple addresses / phones
                {
                    "name": "RAJESH KUMAR SHARMA",
                    "firstname": "RAJESH",
                    "dob": "15-01-1990",
                    "aadhar": "234567890123",
                    "addresses": [
                        {
                            "addressline": "123 MG Road, Koramangala",
                            "city": "Bangalore",
                            "state": "Karnataka",
                            "zipcode": "560034"
                        },
                        {
                            "addressline": "45 Brigade Road",
                            "city": "Bangalore",
                            "state": "Karnataka",
                            "zipcode": "560025"
                        }
                    ],
                    "phones": ["9876543210", "9123456789"],
                    "emails": ["rajesh@example.com"]
                },
                # Flat uppercase-key format
                {
                    "NAME": "RAJESH KUMAR SHARMA",
                    "BIRTHDATE": "15-01-1990",
                    "AADHAR": "234567890123",
                    "ADDRESSLINE": "123 MG Road, Koramangala",
                    "CITY": "Bangalore",
                    "STATE": "Karnataka",
                    "ZIPCODE": "560034",
                    "PHONE": "9876543210",
                    "EMAIL": "rajesh@example.com"
                }
            ]
        }
    }
|
|
|
|
class MatchRequest(BaseModel):
    """Payload for comparing one pair of entity records."""

    record1: EntityRecord = Field(..., description="First entity record")
    record2: EntityRecord = Field(..., description="Second entity record")
    mode: MatchingMode = Field(
        default=MatchingMode.EMBEDDING,
        description="Matching mode: 'embedding'",
    )

    # Schema examples only; they do not affect validation.
    model_config = {
        "json_schema_extra": {
            "examples": [
                # Example 1: nested records with several addresses and phones.
                {
                    "mode": "embedding",
                    "record1": {
                        "NAME": "RAJESH KUMAR SHARMA",
                        "dob": "15-01-1990",
                        "phones": ["9876543210", "9123456789"],
                        "emails": ["rajesh@example.com"],
                        "addresses": [
                            {
                                "addressline": "123 MG Road",
                                "city": "Bangalore",
                                "state": "Karnataka",
                                "zipcode": "560034",
                            },
                            {
                                "addressline": "45 Brigade Road",
                                "city": "Bangalore",
                                "state": "Karnataka",
                                "zipcode": "560025",
                            },
                        ],
                    },
                    "record2": {
                        "NAME": "RAJESH K SHARMA",
                        "dob": "15/01/1990",
                        "phones": ["9876543210"],
                        "emails": ["rajesh@example.com"],
                        "addresses": [
                            {
                                "addressline": "123 Mahatma Gandhi Rd",
                                "city": "Bengaluru",
                                "state": "KA",
                                "zipcode": "560034",
                            },
                            {
                                "addressline": "45 Brigade Road",
                                "city": "Bangalore",
                                "state": "Karnataka",
                                "zipcode": "560025",
                            },
                        ],
                    },
                },
                # Example 2: flat uppercase records using "missing value"
                # placeholders for absent fields.
                {
                    "mode": "embedding",
                    "record1": {
                        "GENDER": "missing value",
                        "NAME": "RAJESH KUMAR SHARMA",
                        "FIRSTNAME": "missing value",
                        "MIDDLENAME": "missing value",
                        "LASTNAME": "missing value",
                        "SPOUSENAME": "missing value",
                        "MOTHERNAME": "missing value",
                        "FATHERNAME": "missing value",
                        "COMPANYNAME": "missing value",
                        "PARENTCOMPANYNAME": "missing value",
                        "AADHAR": "missing value",
                        "PAN": "missing value",
                        "LICENSEID": "missing value",
                        "PASSPORTID": "missing value",
                        "VOTERID": "missing value",
                        "ADDRESSLINE": "123 MG Road",
                        "BIRTHDATE": "15-01-1990",
                        "PHONE": "9876543210",
                        "EMAIL": "missing value",
                        "CITY": "Bangalore",
                        "STATE": "Karnataka",
                        "ZIPCODE": "560034",
                    },
                    "record2": {
                        "GENDER": "missing value",
                        "NAME": "RAJESH K SHARMA",
                        "FIRSTNAME": "missing value",
                        "MIDDLENAME": "missing value",
                        "LASTNAME": "missing value",
                        "SPOUSENAME": "missing value",
                        "MOTHERNAME": "missing value",
                        "FATHERNAME": "missing value",
                        "COMPANYNAME": "missing value",
                        "PARENTCOMPANYNAME": "missing value",
                        "AADHAR": "missing value",
                        "PAN": "missing value",
                        "LICENSEID": "missing value",
                        "PASSPORTID": "missing value",
                        "VOTERID": "missing value",
                        "ADDRESSLINE": "123 Mahatma Gandhi Rd",
                        "BIRTHDATE": "15/01/1990",
                        "PHONE": "9876543210",
                        "EMAIL": "missing value",
                        "CITY": "Bengaluru",
                        "STATE": "KA",
                        "ZIPCODE": "560034",
                    },
                },
            ]
        }
    }
|
|
|
|
class BatchMatchRequest(BaseModel):
    """Payload carrying several match requests at once (load testing)."""

    # Between 1 and 100 pairs are accepted per request.
    pairs: List[MatchRequest] = Field(
        ...,
        min_length=1,
        max_length=100,
        description="List of record pairs to match",
    )
|
|
|
|
| |
| |
| |
class FieldScore(BaseModel):
    """Matching outcome for one field."""

    # Name of the field that was compared.
    field: str
    # Required; typed to allow either a float or a string.
    score: Union[float, str] = Field(description="Numeric score (0-100) in embedding mode")
|
|
|
|
class MatchResult(BaseModel):
    """Outcome of comparing two entity records."""

    overall_decision: str = Field(
        description="'Match' or 'No Match'",
    )
    reason: str = Field(
        description="Human-readable explanation of the matching decision",
    )
    # Keyed by field name.
    field_scores: Dict[str, Union[float, str]] = Field(
        description="Per-field matching scores. Embedding: numeric 0-100.",
    )
    mode: str = Field(
        description="Matching mode used: 'embedding'",
    )
|
|
|
|
class MatchResponse(BaseModel):
    """Envelope returned for a single match request."""

    model_config = {"populate_by_name": True}

    success: bool = True
    # Both default to None; callers set whichever applies.
    result: Optional[MatchResult] = None
    error: Optional[str] = None
    processing_time_ms: float = Field(
        description="Time taken to process this match in milliseconds",
    )
|
|
|
|
class BatchMatchResponse(BaseModel):
    """Envelope returned for a batch match request."""

    model_config = {"populate_by_name": True}

    success: bool = True

    # Aggregate counters for the batch.
    total: int = Field(description="Total number of pairs submitted")
    completed: int = Field(description="Number of pairs successfully matched")
    failed: int = Field(description="Number of pairs that failed")

    # Per-pair responses and overall timing.
    results: List[MatchResponse] = Field(description="Individual match results")
    total_processing_time_ms: float = Field(
        description="Total processing time in milliseconds",
    )
|
|
|
|
class HealthResponse(BaseModel):
    """Payload returned by the health-check endpoint."""

    model_config = {"populate_by_name": True}

    status: str = Field(description="'healthy' or 'unhealthy'")
    version: str = Field(default="8.0", description="API version")
    # Maps a component name to its status string.
    components: Dict[str, str] = Field(
        description="Health status of individual components (csv_data, embedding_models)",
    )
|
|
|
|
class ErrorResponse(BaseModel):
    """Uniform shape for error replies."""

    # Defaults to False for error payloads.
    success: bool = False
    # Required error message.
    error: str
    # Optional extra information.
    detail: Optional[str] = None