| |
| """Generate and seed ~2000 worker personas for the climate-risk pilot. |
| |
| Augments the 30 hand-crafted personas from populate_worker_personas.py with a |
| programmatically-generated population distributed proportionally across the 15 |
| Dar es Salaam neighborhoods by `worker_population_est`. Deterministic (seed=42) |
| so re-running produces the same population. |
| |
| Run once (or any time — idempotent via ON CONFLICT DO UPDATE): |
| |
| DATABASE_URL=... python3 scripts/populate_worker_population.py [TOTAL] |
| |
| Default TOTAL = 2000. |
| """ |
|
|
| import os |
| import random |
| import sys |
|
|
| import psycopg2 |
|
|
|
|
# Number of personas to generate: the first command-line argument when one is
# given, otherwise 2000.  Evaluated at import time, so a non-integer argument
# raises ValueError before main() runs.
if len(sys.argv) > 1:
    TARGET_TOTAL = int(sys.argv[1])
else:
    TARGET_TOTAL = 2000
|
|
|
|
| |
| |
| |
# Pilot neighborhoods, one tuple per zone:
#   (zone_id, display name, settlement type, estimated worker population)
# The settlement type is used as the lookup key into OCCUPATIONS and
# TASAF_PROB; the population estimate sets each zone's share of the generated
# personas.  List order matters: it fixes the order in which the seeded RNG is
# consumed.
ZONES = [
    ("DAR-BUG", "Buguruni", "informal", 22_000),
    ("DAR-JAN", "Jangwani", "informal", 25_000),
    ("DAR-KAR", "Kariakoo", "commercial", 12_000),
    ("DAR-KIG", "Kigamboni", "mixed", 15_000),
    ("DAR-KIN", "Kinondoni", "formal", 15_000),
    ("DAR-MAG", "Magomeni", "mixed", 18_000),
    ("DAR-MAS", "Masaki", "formal", 4_000),
    ("DAR-MBA", "Mbagala", "informal", 40_000),
    ("DAR-MIK", "Mikocheni", "formal", 8_000),
    ("DAR-MNZ", "Manzese", "informal", 35_000),
    ("DAR-MSA", "Msasani", "mixed", 8_000),
    ("DAR-TAN", "Tandale", "informal", 28_000),
    ("DAR-TEM", "Temeke", "mixed", 30_000),
    ("DAR-UBU", "Ubungo", "mixed", 20_000),
    ("DAR-VIN", "Vingunguti", "informal", 20_000),
]
|
|
# Pool of given names drawn from when a generated persona is male.  Order is
# significant: the seeded RNG picks by index, so reordering or resizing the
# list changes the generated population.
MALE_FIRST = [
    "Juma", "Hassan", "Mustafa", "Emmanuel", "Baraka", "Saidi", "Rashid",
    "Omari", "Peter", "Ibrahim", "Ally", "Athumani", "Joseph", "Hamisi",
    "Mohamed", "Daudi", "Yusuf", "Suleiman", "Rajabu", "Salim", "Amani",
    "Kassim", "Fred", "John", "George", "Samuel", "Henry", "James", "Paul",
    "Thomas", "Patrick", "Michael", "Charles", "Francis", "Robert", "Anthony",
    "Philip", "David", "Martin", "Simon", "Elias", "Joshua", "Samson",
    "Solomon", "Abraham", "Isaac", "Jacob", "Noah", "Joel", "Abdallah",
    "Issa", "Ramadhani", "Shabani", "Rweyongera", "Magufuli", "Mkapa",
]
|
|
# Pool of given names drawn from when a generated persona is female.  Order is
# significant: the seeded RNG picks by index, so reordering or resizing the
# list changes the generated population.
# NOTE(review): "Halima" is listed twice, which doubles its selection weight.
# Left in place because removing it would alter the seed=42 population;
# confirm whether the duplicate is intentional.
FEMALE_FIRST = [
    "Amina", "Fatuma", "Mariam", "Zainabu", "Halima", "Rehema", "Asha",
    "Khadija", "Tatu", "Bahati", "Rose", "Grace", "Neema", "Mwanahamisi",
    "Esther", "Fausta", "Lucia", "Anna", "Mary", "Tabitha", "Magdalena",
    "Joyce", "Agnes", "Imani", "Sara", "Sofia", "Upendo", "Salma", "Tumaini",
    "Husna", "Zena", "Mwajabu", "Mwanajuma", "Riziki", "Subira", "Jane",
    "Ruth", "Eva", "Sarah", "Elizabeth", "Rachel", "Rebecca", "Hannah",
    "Naomi", "Deborah", "Martha", "Lydia", "Priscilla", "Phoebe", "Mwajuma",
    "Asia", "Halima", "Shamsa", "Latifa", "Nuru", "Aisha",
]
|
|
# Pool of family names shared by all generated personas.  Order is
# significant: the seeded RNG picks by index, so reordering or resizing the
# list changes the generated population.
SURNAMES = [
    "Mwakalinga", "Shaaban", "Msuya", "Mushi", "Kimaro", "Kibona", "Maerere",
    "Mbwiliza", "Ngolwa", "Swai", "Ishengoma", "Mlelwa", "Athumani", "Hassan",
    "Juma", "Kilonzo", "Makawa", "Mlima", "Nkya", "Rweyemamu", "Sanga",
    "Temba", "Kaniki", "Mariki", "Lyimo", "Kikwete", "Mkumbo", "Msafiri",
    "Kateka", "Kimaryo", "Msigwa", "Mbowe", "Lema", "Mrema", "Kivuyo",
    "Bujiku", "Chijika", "Kilama", "Mbeya", "Matonya", "Njokopa", "Minja",
    "Mpemba", "Kilumbe", "Lugazia", "Chamshama", "Mkasa", "Mngumi", "Luhanga",
    "Mtelewa", "Kagaruki", "Ndimbo", "Ndaki", "Ngowi", "Chimanga", "Mollel",
]
|
|
|
|
# Occupations a generated worker can be assigned, keyed by the settlement type
# of the worker's zone (third field of each ZONES tuple).  Entry order within
# each list is significant because the seeded RNG picks by index.
OCCUPATIONS = {
    "informal": [
        "waste picker",
        "charcoal seller",
        "water seller",
        "water carrier",
        "day laborer",
        "street food cook",
        "charcoal trader",
        "fish vendor",
        "street tailor",
        "market porter",
        "domestic worker",
        "secondhand clothes vendor",
        "vegetable seller",
        "fruit hawker",
        "chapati cook",
        "sand carrier",
        "sack seller",
        "metalworker",
        "cobbler",
        "motorbike repair",
    ],
    "commercial": [
        "market porter",
        "stall vendor",
        "food vendor",
        "tailor",
        "shoe repairer",
        "small shop keeper",
        "hardware hawker",
        "spice vendor",
        "fabric seller",
        "cloth hawker",
        "fruit vendor",
        "fish seller",
        "wholesale porter",
        "grain seller",
        "appliance hawker",
        "cosmetics vendor",
    ],
    "mixed": [
        "boda-boda driver",
        "mama lishe",
        "car washer",
        "construction laborer",
        "gardener",
        "tailor",
        "fishmonger",
        "kiosk operator",
        "masonry helper",
        "painter",
        "welder",
        "glass cutter",
        "food stall operator",
        "yogurt seller",
        "nyama choma cook",
        "coconut vendor",
        "banana vendor",
    ],
    "formal": [
        "security guard",
        "gardener",
        "driver",
        "housekeeper",
        "cleaner",
        "office messenger",
        "office gardener",
        "maintenance worker",
        "receptionist",
        "groundskeeper",
        "caretaker",
        "watchman",
        "janitor",
        "valet",
        "doorman",
    ],
}
|
|
|
|
| |
# Mobile-money providers paired with relative selection weights (they sum to
# 100); passed to weighted_choice() when a worker is generated.
MOBILE_MONEY = [
    ("M-Pesa", 55),
    ("Tigo Pesa", 27),
    ("Airtel Money", 18),
]
|
|
| |
# Probability that a generated worker is marked tasaf_enrolled, keyed by the
# settlement type of the worker's zone.
TASAF_PROB = {
    "informal": 0.40,
    "commercial": 0.15,
    "mixed": 0.20,
    "formal": 0.05,
}
|
|
|
|
# DDL run by main() before any rows are written.  Both statements use
# IF NOT EXISTS, so executing them against an already-initialised database is
# a no-op.  workers.zone_id is declared as a foreign key to zones(zone_id), so
# a `zones` table is expected to exist already -- it is not created here.
# enrolled_at is filled by its column default; the UPSERT statement never
# writes it.
CREATE_TABLE = """
CREATE TABLE IF NOT EXISTS workers (
worker_id TEXT PRIMARY KEY,
name TEXT NOT NULL,
name_swahili TEXT,
nida_id TEXT,
phone TEXT NOT NULL,
zone_id TEXT NOT NULL REFERENCES zones(zone_id),
occupation TEXT NOT NULL,
age INTEGER,
years_outdoor INTEGER,
household_size INTEGER,
mobile_money TEXT,
tasaf_enrolled BOOLEAN DEFAULT false,
enrolled_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_workers_zone ON workers (zone_id);
"""
|
|
|
|
# Parameterised insert-or-update for one worker row.  The twelve %s
# placeholders follow the column list order, which is also the field order of
# the tuple returned by generate_worker().  When worker_id already exists,
# every listed column except worker_id is overwritten with the new values;
# enrolled_at is not in the column list and is left untouched on update.
UPSERT = """
INSERT INTO workers (
worker_id, name, name_swahili, nida_id, phone, zone_id, occupation,
age, years_outdoor, household_size, mobile_money, tasaf_enrolled
) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (worker_id) DO UPDATE SET
name = EXCLUDED.name,
name_swahili = EXCLUDED.name_swahili,
nida_id = EXCLUDED.nida_id,
phone = EXCLUDED.phone,
zone_id = EXCLUDED.zone_id,
occupation = EXCLUDED.occupation,
age = EXCLUDED.age,
years_outdoor = EXCLUDED.years_outdoor,
household_size = EXCLUDED.household_size,
mobile_money = EXCLUDED.mobile_money,
tasaf_enrolled = EXCLUDED.tasaf_enrolled
"""
|
|
|
|
def weighted_choice(rng: random.Random, weighted: list[tuple[str, int]]) -> str:
    """Pick one item with probability proportional to its weight.

    Exactly one ``rng.uniform`` draw is consumed per successful call, so a
    seeded generator yields a reproducible sequence of picks.

    Args:
        rng: Random source to draw from.
        weighted: ``(item, weight)`` pairs. Entries whose weight is zero or
            negative are never selected.

    Returns:
        The selected item.

    Raises:
        ValueError: If ``weighted`` is empty or the weights do not sum to a
            positive number.
    """
    if not weighted:
        raise ValueError("weighted_choice() needs at least one (item, weight) pair")
    total = sum(w for _, w in weighted)
    if total <= 0:
        raise ValueError("weighted_choice() needs a positive total weight")

    r = rng.uniform(0, total)
    last_eligible = None
    for item, w in weighted:
        if w <= 0:
            # A weightless entry must not win, even when the draw is exactly 0.
            continue
        last_eligible = item
        r -= w
        if r <= 0:
            return item
    # Only reachable through floating-point round-off in the subtractions:
    # settle on the last entry that actually carries weight.
    return last_eligible
|
|
|
|
def random_phone(rng: random.Random) -> str:
    """Return a partially masked phone string of the form ``+2557ppdXXtttt``.

    Three values are drawn from ``rng`` in this order: ``pp`` chosen from
    12-18, a single digit ``d`` (0-9), and a four-digit ``tttt`` (1000-9999).
    The two ``X`` characters are literal.
    """
    block = rng.choice([12, 13, 14, 15, 16, 17, 18])
    digit = rng.randint(0, 9)
    suffix = rng.randint(1000, 9999)
    pieces = ("+2557", str(block), str(digit), "XX", str(suffix))
    return "".join(pieces)
|
|
|
|
def random_nida(rng: random.Random) -> str:
    """Return a masked identifier string ``YYYYMM-XXXX-XXXX-NNNN``.

    Three values are drawn from ``rng`` in this order: a year (1965-2004), a
    month (1-12, zero-padded to two digits) and a four-digit tail
    (1000-9999).  The ``XXXX`` groups are literal.
    """
    yyyy = rng.randint(1965, 2004)
    mm = rng.randint(1, 12)
    last_four = rng.randint(1000, 9999)
    return "{}{:02d}-XXXX-XXXX-{}".format(yyyy, mm, last_four)
|
|
|
|
def generate_worker(rng: random.Random, zone_id: str, settlement: str,
                    index: int) -> tuple:
    """Build one synthetic worker row in the column order expected by UPSERT.

    Args:
        rng: Random source; values are drawn in a fixed order, so a seeded
            generator always produces the same worker.
        zone_id: Zone identifier; also the prefix of the worker id.
        settlement: Settlement type, used as the key into OCCUPATIONS and
            TASAF_PROB.
        index: Per-zone sequence number, zero-padded to four digits in the
            worker id.

    Returns:
        ``(worker_id, name, name_swahili, nida_id, phone, zone_id,
        occupation, age, years_outdoor, household_size, mobile_money,
        tasaf_enrolled)`` -- the third field is the given name alone.

    Raises:
        KeyError: If ``settlement`` is not a key of OCCUPATIONS / TASAF_PROB.
    """
    # 48% of draws take the female name pool.
    name_pool = FEMALE_FIRST if rng.random() < 0.48 else MALE_FIRST
    given = rng.choice(name_pool)
    family = rng.choice(SURNAMES)

    age = rng.randint(20, 60)
    # Outdoor years are capped by the years elapsed since age 16 (never
    # below 1) and by an absolute ceiling of 35.
    outdoor_cap = min(max(age - 16, 1), 35)
    years_outdoor = rng.randint(1, outdoor_cap)
    household_size = rng.randint(2, 10)

    occupation = rng.choice(OCCUPATIONS[settlement])
    wallet = weighted_choice(rng, MOBILE_MONEY)
    on_tasaf = rng.random() < TASAF_PROB[settlement]

    # NIDA is drawn before the phone number; the order fixes the RNG stream.
    nida = random_nida(rng)
    phone = random_phone(rng)

    return (
        f"{zone_id}-P{index:04d}",
        f"{given} {family}",
        given,
        nida,
        phone,
        zone_id,
        occupation,
        age,
        years_outdoor,
        household_size,
        wallet,
        on_tasaf,
    )
|
|
|
|
def main() -> int:
    """Generate the worker population and upsert it into the ``workers`` table.

    Each zone receives ``round(TARGET_TOTAL * pop / total_pop)`` personas
    (at least one), so the final count can differ slightly from
    ``TARGET_TOTAL``.  Generation uses ``random.Random(42)`` and therefore
    yields the same rows on every run.

    Returns:
        0 on success, 1 when the ``DATABASE_URL`` environment variable is
        missing or empty.
    """
    db_url = os.environ.get("DATABASE_URL")
    if not db_url:
        print("ERROR: DATABASE_URL not set. Export your Neon connection string.",
              file=sys.stderr)
        return 1

    # Imported here so the batching helper is only needed when we actually
    # talk to the database.
    from psycopg2.extras import execute_batch

    total_pop = sum(pop for _zone_id, _name, _settlement, pop in ZONES)
    rng = random.Random(42)
    workers: list[tuple] = []

    for zone_id, _name, settlement, pop in ZONES:
        count = max(1, round(TARGET_TOTAL * pop / total_pop))
        workers.extend(
            generate_worker(rng, zone_id, settlement, i) for i in range(count)
        )

    conn = psycopg2.connect(db_url)
    try:
        # `with conn` commits on success and rolls back on error, but it does
        # NOT close the connection -- hence the explicit close() below.
        with conn:
            with conn.cursor() as cur:
                cur.execute(CREATE_TABLE)
                # Group the upserts so each network round-trip carries many
                # statements instead of one per worker.
                execute_batch(cur, UPSERT, workers)
    finally:
        conn.close()

    print(f"Seeded {len(workers)} worker personas across {len(ZONES)} "
          f"neighborhoods (target was {TARGET_TOTAL})")
    return 0
|
|
|
|
| if __name__ == "__main__": |
| raise SystemExit(main()) |
|
|