market-intelligence / src /dpi /simulator.py
jtlevine's picture
Clean up DPI commit: redundant state, dead field, stringly typing
febe111
"""
Simulated farmer registry — generates a DPI profile for every entry
in SAMPLE_FARMERS the first time `get_registry()` is called.
Deterministic: each farmer's profile is seeded by
`hashlib.md5(farmer_id)`, so the same 100 profiles appear on every
restart and across environments. Phone numbers match the farmer's
`FarmerPersona.farmer_id` via the same seed, which means DPI lookups
resolve consistently whether keyed by farmer_id or phone.
Design rule: **land record acreage must plausibly support the claimed
quintal yield** on the FarmerPersona. If Lakshmi grows 25 quintals of
rice, her land record needs to be at least 0.6 ha (rice yield ~40-50
q/ha in the Cauvery delta). The simulator enforces this so the credit
readiness assessment can trust the land record when cross-checking.
Similarly, KCC credit limits are scaled against land holdings using
realistic NABARD-style formulas (Rs 50k–150k per hectare for Tamil Nadu
crops), so "strong/moderate/not_yet" classifications end up grounded
in numbers a real lender would produce.
No DB, no network. Entire module loads in ~20 ms.
"""
from __future__ import annotations
import hashlib
import random
from typing import Dict, List, Optional
from config import MANDI_MAP, SAMPLE_FARMERS
from src.dpi.models import (
AadhaarProfile,
FarmerProfile,
KCCRecord,
LandRecord,
)
from src.geo import haversine_km
# ---------------------------------------------------------------------------
# Tamil name pool for local-script Aadhaar names. Kept flat — no family
# names per the common Tamil Nadu pattern where the father's name is the
# second identifier. Used only when the farmer's FarmerPersona doesn't
# already carry a local-script variant.
#
# Keys are English first names (the token that _local_name() looks up);
# values are the Tamil spellings written as \uXXXX escapes.
# NOTE(review): the caller visible in this file passes farmer.name straight
# to _local_name() without checking for an existing local-script variant —
# confirm the "used only when" statement above.
# ---------------------------------------------------------------------------
_TAMIL_LOCAL_NAMES: Dict[str, str] = {
    "Lakshmi": "\u0bb2\u0b9f\u0bcd\u0b9a\u0bc1\u0bae\u0bbf",
    "Kumar": "\u0b95\u0bc1\u0bae\u0bbe\u0bb0\u0bcd",
    "Meena": "\u0bae\u0bc0\u0ba9\u0bbe",
    "Arun": "\u0b85\u0bb0\u0bc1\u0ba3\u0bcd",
    "Bala": "\u0baa\u0bbe\u0bb2\u0bbe",
    "Divya": "\u0ba4\u0bbf\u0bb5\u0bcd\u0baf\u0bbe",
    "Ganesh": "\u0b95\u0ba3\u0bc7\u0bb7\u0bcd",
    "Priya": "\u0baa\u0bbf\u0bb0\u0bbf\u0baf\u0bbe",
    "Raja": "\u0bb0\u0bbe\u0b9c\u0bbe",
    "Selvi": "\u0b9a\u0bc6\u0bb2\u0bcd\u0bb5\u0bbf",
    "Vel": "\u0bb5\u0bc7\u0bb2\u0bcd",
    "Saroja": "\u0b9a\u0bb0\u0bcb\u0b9c\u0bbe",
    "Karthik": "\u0b95\u0bbe\u0bb0\u0bcd\u0ba4\u0bcd\u0ba4\u0bbf\u0b95\u0bcd",
    "Uma": "\u0b89\u0bae\u0bbe",
    "Murugan": "\u0bae\u0bc1\u0bb0\u0bc1\u0b95\u0ba9\u0bcd",
}
def _local_name(english_name: str) -> str:
    """Return the Tamil-script form of a farmer's first name.

    Only the first whitespace-separated token of ``english_name`` is
    looked up in ``_TAMIL_LOCAL_NAMES``. We don't try to transliterate
    programmatically — getting Tamil transliteration right is non-trivial
    and outside the scope of a demo registry.

    Args:
        english_name: The farmer's name in Latin script.

    Returns:
        The mapped Tamil name when the first token is a known key;
        otherwise a fixed Tamil placeholder meaning "farmer". The
        placeholder is also returned for an empty or whitespace-only
        name, so the field is never empty.
    """
    # Placeholder Tamil text: "farmer" (\u0bb5\u0bbf\u0bb5\u0b9a\u0bbe\u0baf\u0bbf)
    placeholder = "\u0bb5\u0bbf\u0bb5\u0b9a\u0bbe\u0baf\u0bbf"
    tokens = english_name.split()
    if not tokens:
        # ``"".split()`` is ``[]``; indexing it would raise IndexError and
        # abort generation of the whole registry over one blank name.
        return placeholder
    return _TAMIL_LOCAL_NAMES.get(tokens[0], placeholder)
# ---------------------------------------------------------------------------
# Crop yield assumptions used to size land records against claimed
# quintals. These are rough NABARD/ICAR baselines for Tamil Nadu dry +
# irrigated systems. Numbers err on the low side so the simulator
# generates land parcels that comfortably support the claimed yield.
# Units: quintals per hectare per season.
#
# Keyed by commodity id. _size_land_for_yield() falls back to 20.0 q/ha
# for any commodity that is not listed here.
# ---------------------------------------------------------------------------
_YIELD_Q_PER_HA: Dict[str, float] = {
    "RICE-SAMBA": 40.0,
    "TUR-FIN": 25.0,  # turmeric dry yield
    "GNUT-POD": 20.0,
    "COT-MCU": 15.0,
    "ONI-RED": 200.0,  # onion is very high yield per hectare
    "COP-DRY": 80.0,  # copra dried
    "MZE-YEL": 50.0,
    "URD-BLK": 7.0,  # pulses are low yield
    "MNG-GRN": 8.0,
    "BAN-ROB": 300.0,  # banana has very high per-hectare yield
}
# Credit limit per hectare for each commodity (rupees per hectare).
# Based on NABARD's scale-of-finance tables for Tamil Nadu cropping
# seasons, rounded to convenient figures. These drive KCC limits.
#
# Keyed by commodity id. _make_kcc() falls back to 60_000 per hectare for
# any commodity that is not listed here.
# NOTE(review): the values below span 45,000-180,000, slightly wider than
# the "Rs 50k-150k per hectare" range quoted in the module docstring —
# confirm which one is intended.
_KCC_PER_HA: Dict[str, float] = {
    "RICE-SAMBA": 70_000,
    "TUR-FIN": 140_000,  # turmeric is input-intensive
    "GNUT-POD": 55_000,
    "COT-MCU": 85_000,
    "ONI-RED": 95_000,
    "COP-DRY": 90_000,
    "MZE-YEL": 50_000,
    "URD-BLK": 45_000,
    "MNG-GRN": 45_000,
    "BAN-ROB": 180_000,
}
def _seed_rng(farmer_id: str) -> random.Random:
    """Build a reproducible ``random.Random`` for one farmer.

    The seed is the MD5 digest of the encoded ``farmer_id`` read as one
    big-endian integer, so the same id always produces the same random
    stream. MD5 serves only as a stable hash here, not as a security
    measure.
    """
    digest = hashlib.md5(farmer_id.encode()).digest()
    seed = int.from_bytes(digest, "big")
    return random.Random(seed)
def _make_masked_aadhaar(rng: random.Random) -> str:
    """Return a masked Aadhaar id that exposes only four trailing digits.

    Consumes a single draw from ``rng`` in the range 1000-9999.
    """
    last_four = rng.randint(1000, 9999)
    return "XXXX-XXXX-" + str(last_four)
def _make_phone(rng: random.Random) -> str:
    """Return a ``+91``-prefixed ten-digit mobile number drawn from ``rng``.

    The subscriber part is one draw in 7_000_000_000-9_999_999_999, so it
    always begins with 7, 8 or 9.
    """
    subscriber = rng.randint(7_000_000_000, 9_999_999_999)
    return "+91" + str(subscriber)
def _make_kcc_number(rng: random.Random) -> str:
    """Return a ``KCC-TN-`` identifier with a random six-digit suffix."""
    serial = rng.randint(100_000, 999_999)
    return "KCC-TN-%d" % serial
def _make_survey_number(rng: random.Random) -> str:
    """Return a survey number shaped like Tamil Nadu's, e.g. '142/3B'.

    Draw order is fixed so output stays reproducible for a given ``rng``:
    parcel (100-999), then subdivision digit (1-9), then letter (A-E).
    """
    parcel = rng.randint(100, 999)
    subdivision = rng.randint(1, 9)
    letter = rng.choice("ABCDE")
    return f"{parcel}/{subdivision}{letter}"
def _size_land_for_yield(commodity_id: str, claimed_quintals: float, rng: random.Random) -> float:
    """Return a plausible land area (hectares) that supports the claimed quintals.

    The minimum area is ``claimed_quintals`` divided by the commodity's
    per-hectare yield (20 q/ha when the commodity is not in
    ``_YIELD_Q_PER_HA``; yields below 1 are treated as 1). 20-80% headroom
    is added on top so the farmer looks like they have realistic slack — a
    real farmer growing 25 q of rice has more than the bare minimum 0.6 ha,
    because not all their land is at peak yield.

    Args:
        commodity_id: Commodity key used to look up the per-hectare yield.
        claimed_quintals: Quantity the farmer claims to produce.
        rng: Farmer-specific RNG; exactly one ``uniform`` draw is consumed.

    Returns:
        Area in hectares rounded to 2 decimals. For a positive claim the
        result is never smaller than the minimum area needed.
    """
    yield_per_ha = _YIELD_Q_PER_HA.get(commodity_id, 20.0)
    min_area = claimed_quintals / max(yield_per_ha, 1.0)
    headroom_factor = 1.0 + rng.uniform(0.2, 0.8)
    area = round(min_area * headroom_factor, 2)
    if min_area > 0 and area < min_area:
        # Rounding to 2 decimals can wipe out the headroom on very small
        # parcels (e.g. 0.004 ha rounds to 0.0). Rounding moves the value by
        # at most 0.005, so one extra 0.01 step restores area >= min_area.
        area = round(area + 0.01, 2)
    return area
def _make_land_record(
    farmer, commodity_id: str, area_ha: float, rng: random.Random
) -> LandRecord:
    """Build a simulated land record for the farmer's parcel.

    Args:
        farmer: Object exposing ``latitude`` and ``longitude``.
        commodity_id: Primary commodity registered on the parcel.
        area_ha: Parcel size in hectares, stored as given.
        rng: Farmer-specific RNG. Draw order (lat, lon, rotation roll,
            rotation pick, survey number, soil, irrigation) must not change
            or every generated profile shifts.

    Returns:
        A ``LandRecord`` whose ``nearest_mandi_id`` is left empty for the
        registry to fill in afterwards.
    """
    # Offset the parcel by up to 0.02 degrees per axis so it does not sit
    # exactly on the farmer's pin.
    parcel_lat = round(farmer.latitude + rng.uniform(-0.02, 0.02), 4)
    parcel_lon = round(farmer.longitude + rng.uniform(-0.02, 0.02), 4)

    # Typical rotation partners; commodities without an entry register only
    # the primary crop.
    typical_rotations = {
        "RICE-SAMBA": ["URD-BLK", "MNG-GRN"],
        "GNUT-POD": ["MZE-YEL"],
        "TUR-FIN": ["MZE-YEL"],
        "COT-MCU": ["GNUT-POD"],
        "MZE-YEL": ["URD-BLK"],
    }
    registered = [commodity_id]
    candidates = typical_rotations.get(commodity_id, [])
    if candidates:
        # The roll is only drawn when a pairing exists; 65% of those
        # farmers also register a secondary crop.
        if rng.random() < 0.65:
            registered.append(rng.choice(candidates))

    survey_number = _make_survey_number(rng)
    soil_type = rng.choice(
        ["alluvial", "red", "black cotton", "laterite", "sandy loam"]
    )
    irrigation_type = rng.choice(["canal", "borewell", "tank", "rainfed"])

    return LandRecord(
        survey_number=survey_number,
        area_hectares=area_ha,
        soil_type=soil_type,
        irrigation_type=irrigation_type,
        gps_lat=parcel_lat,
        gps_lon=parcel_lon,
        crops_registered=registered,
        nearest_mandi_id="",  # filled in by registry after load (mandi lookup)
    )
def _make_kcc(commodity_id: str, area_ha: float, rng: random.Random) -> KCCRecord:
    """Generate a KCC record whose credit limit scales with land holdings.

    Args:
        commodity_id: Commodity financed; selects the per-hectare limit
            from ``_KCC_PER_HA`` (60,000 when the commodity is not listed).
        area_ha: Land area in hectares.
        rng: Farmer-specific RNG. Draw order (utilization band, utilization,
            repayment roll, KCC number, payment month, payment day) must
            not change or every generated profile shifts.

    Returns:
        A ``KCCRecord`` financing only ``commodity_id``.
    """
    # Snap the raw limit to the nearest multiple of 5,000 (Python round(),
    # not a ceiling) because real KCC limits are round numbers, then apply
    # a 15,000 floor as the minimum meaningful limit.
    base_limit = _KCC_PER_HA.get(commodity_id, 60_000) * area_ha
    credit_limit = max(round(base_limit / 5_000) * 5_000, 15_000)

    # Utilization bands: 10% of farmers sit near the limit, 20% are lightly
    # used (recent disbursement), the remaining 70% carry a real balance.
    band_roll = rng.random()
    if band_roll >= 0.3:
        low, high = 0.35, 0.75
    elif band_roll >= 0.1:
        low, high = 0.05, 0.35
    else:
        low, high = 0.85, 0.98
    util_pct = rng.uniform(low, high)
    # Outstanding balance is rounded to the nearest 100 rupees.
    outstanding = round(credit_limit * util_pct / 100) * 100

    # Repayment status: 85% current, 12% overdue, 3% defaulted.
    status_roll = rng.random()
    if status_roll >= 0.97:
        repayment_status = "defaulted"
    elif status_roll >= 0.85:
        repayment_status = "overdue"
    else:
        repayment_status = "current"

    kcc_number = _make_kcc_number(rng)
    # Day is capped at 28 so the date is valid in every month.
    pay_month = rng.randint(1, 12)
    pay_day = rng.randint(1, 28)

    return KCCRecord(
        kcc_number=kcc_number,
        credit_limit=credit_limit,
        outstanding=outstanding,
        crops_financed=[commodity_id],
        repayment_status=repayment_status,
        last_payment_date=f"2025-{pay_month:02d}-{pay_day:02d}",
    )
# ---------------------------------------------------------------------------
# Registry
# ---------------------------------------------------------------------------
class SimulatedDPIRegistry:
    """In-memory registry holding one FarmerProfile per SAMPLE_FARMERS entry.

    All profiles are generated in the constructor and indexed two ways:
    by farmer_id (primary) and by phone (secondary). Phone -> profile
    lookups work because each farmer's phone comes from the RNG seeded
    with that same farmer_id.
    """

    def __init__(self):
        self._by_farmer_id: Dict[str, FarmerProfile] = {}
        self._by_phone: Dict[str, FarmerProfile] = {}
        self._generate_all()

    @staticmethod
    def _nearest_mandi_id(lat: float, lon: float) -> str:
        """Return the id of the MANDI_MAP entry closest to (lat, lon).

        Ties keep the first mandi encountered; an empty MANDI_MAP yields "".
        """
        closest_id = ""
        closest_km = float("inf")
        for mandi in MANDI_MAP.values():
            km = haversine_km(lat, lon, mandi.latitude, mandi.longitude)
            if km < closest_km:
                closest_id, closest_km = mandi.mandi_id, km
        return closest_id

    def _generate_all(self) -> None:
        """Build a profile for every sample farmer and fill both indexes."""
        for farmer in SAMPLE_FARMERS:
            # One RNG per farmer: the order of draws below fixes the profile.
            rng = _seed_rng(farmer.farmer_id)
            commodity = farmer.primary_commodity

            aadhaar = AadhaarProfile(
                aadhaar_id=_make_masked_aadhaar(rng),
                name=farmer.name,
                name_local=_local_name(farmer.name),
                phone=_make_phone(rng),
                district=farmer.location_name,
                state="Tamil Nadu",
                language="ta",
                dob_year=rng.randint(1965, 1995),
            )

            area_ha = _size_land_for_yield(commodity, farmer.quantity_quintals, rng)
            land = _make_land_record(farmer, commodity, area_ha, rng)
            # Bind the parcel to the mandi network by straight-line distance,
            # without needing a separate service.
            land.nearest_mandi_id = self._nearest_mandi_id(land.gps_lat, land.gps_lon)

            kcc = _make_kcc(commodity, area_ha, rng)

            profile = FarmerProfile(aadhaar=aadhaar, land_records=[land], kcc=kcc)
            self._by_farmer_id[farmer.farmer_id] = profile
            self._by_phone[aadhaar.phone] = profile

    # --- lookup API ---------------------------------------------------------

    def lookup_by_farmer_id(self, farmer_id: str) -> Optional[FarmerProfile]:
        """Return the profile stored under ``farmer_id``, or None if absent."""
        return self._by_farmer_id.get(farmer_id, None)

    def lookup_by_phone(self, phone: str) -> Optional[FarmerProfile]:
        """Return the profile stored under ``phone`` (exact match), or None."""
        return self._by_phone.get(phone, None)

    def list_profiles(self) -> List[FarmerProfile]:
        """Return a new list of all profiles in insertion order."""
        return [*self._by_farmer_id.values()]

    @property
    def profile_count(self) -> int:
        """Number of profiles indexed by farmer_id."""
        return len(self._by_farmer_id)
_REGISTRY: Optional[SimulatedDPIRegistry] = None


def get_registry() -> SimulatedDPIRegistry:
    """Return the module-level registry, building it on the first call.

    Later calls return the same instance that the first call created.
    """
    global _REGISTRY
    if _REGISTRY is not None:
        return _REGISTRY
    _REGISTRY = SimulatedDPIRegistry()
    return _REGISTRY