#!/usr/bin/env python3 """ Synthetic Data Generator for CPI Tender Matcher Generates 40 tender documents across EN/FR as .txt files Run: python generate_data.py """ import json import random import os from datetime import datetime, timedelta random.seed(42) SECTORS = ["agritech", "healthtech", "cleantech", "edtech", "fintech", "wastetech"] BUDGETS = [ ("5,000", 5000), ("50,000", 50000), ("200,000", 200000), ("1,000,000", 1000000), ] REGIONS = ["East Africa", "West Africa", "Central Africa", "Southern Africa"] COUNTRIES = ["Rwanda", "Kenya", "Uganda", "Senegal", "DRC", "Ethiopia", "Tanzania", "Ghana", "Nigeria", "Cameroon"] ORGS = [ "African Development Bank", "USAID", "EU Delegation", "World Bank", "GIZ", "UNDP", "African Union", "Bill & Melinda Gates Foundation", "Mastercard Foundation", "Omidyar Network" ] EN_TEMPLATES = [ """GRANT OPPORTUNITY: {title} Issuing Organization: {org} Tender Reference: TND-{ref} Sector: {sector} Region: {region} Eligible Countries: {countries} OVERVIEW {org} invites applications from qualified organizations for the {title}. This grant supports innovative solutions in the {sector} space across {region}. BUDGET Total available funding: USD {budget_str} Maximum grant per applicant: USD {max_grant} ELIGIBILITY - Registered organizations operating in {region} - Minimum {min_employees} full-time employees - At least 1 year of operational history - Prior funding experience preferred: {prior_funding} OBJECTIVES This tender aims to: 1. Accelerate {sector} innovation in underserved communities 2. Support scalable and sustainable business models 3. Foster cross-border collaboration in {region} 4. Promote gender inclusion and youth employment APPLICATION REQUIREMENTS Applicants must submit: - Technical proposal (max 15 pages) - Budget breakdown - Organizational profile - Letters of support from local partners DEADLINE Application deadline: {deadline} Results announcement: {result_date} CONTACT For inquiries, contact: grants@{org_email}.org Reference: {ref} """, """FUNDING CALL: {title} Reference Number: FC-{ref} Funding Body: {org} Focus Area: {sector} Target Geography: {region} BACKGROUND Access to {sector} solutions remains limited across {region}. {org} is committed to bridging this gap through targeted grant support. GRANT DETAILS - Total envelope: USD {budget_str} - Individual awards: up to USD {max_grant} - Duration: 12–24 months WHO CAN APPLY Eligible applicants include: • Social enterprises and cooperatives in {countries} • NGOs with a proven track record in {sector} • University spin-offs and research centres • Minimum team size: {min_employees} employees EVALUATION CRITERIA Applications will be scored on: - Innovation and scalability (30%) - Impact on underserved populations (25%) - Financial sustainability (20%) - Team capability (15%) - Regional relevance (10%) KEY DATES Submission deadline: {deadline} Interview round: {result_date} SUBMIT AT: apply.{org_email}.org/FC-{ref} """ ] FR_TEMPLATES = [ """APPEL À CANDIDATURES : {title} Organisme émetteur : {org} Référence : TND-{ref} Secteur : {sector} Région : {region} Pays éligibles : {countries} PRÉSENTATION {org} lance un appel à candidatures pour le {title}. Ce financement soutient des solutions innovantes dans le domaine {sector} à travers {region}. BUDGET Enveloppe totale disponible : USD {budget_str} Subvention maximale par candidat : USD {max_grant} ÉLIGIBILITÉ - Organisations enregistrées opérant en {region} - Au moins {min_employees} employés à temps plein - Au moins 1 an d'existence - Expérience de financement antérieure souhaitée : {prior_funding} OBJECTIFS Cet appel vise à : 1. Accélérer l'innovation {sector} dans les communautés mal desservies 2. Soutenir des modèles économiques évolutifs et durables 3. Favoriser la coopération transfrontalière en {region} 4. Promouvoir l'inclusion des femmes et l'emploi des jeunes DOSSIER DE CANDIDATURE Les candidats doivent soumettre : - Proposition technique (15 pages max) - Détail budgétaire - Profil organisationnel - Lettres de soutien de partenaires locaux DATE LIMITE Date de soumission : {deadline} Annonce des résultats : {result_date} CONTACT Pour toute question : subventions@{org_email}.org Référence : {ref} """, """APPEL À PROJETS : {title} Numéro de référence : AP-{ref} Bailleur de fonds : {org} Domaine prioritaire : {sector} Zone géographique : {region} CONTEXTE L'accès aux solutions {sector} reste limité dans {region}. {org} s'engage à combler ce fossé grâce à un soutien ciblé. DÉTAILS DU FINANCEMENT - Enveloppe totale : USD {budget_str} - Subventions individuelles : jusqu'à USD {max_grant} - Durée : 12 à 24 mois QUI PEUT CANDIDATER Les candidats éligibles comprennent : • Entreprises sociales et coopératives en {countries} • ONG avec un historique prouvé dans {sector} • Start-ups universitaires et centres de recherche • Taille minimale de l'équipe : {min_employees} employés CRITÈRES D'ÉVALUATION Les dossiers seront notés sur : - Innovation et capacité à l'échelle (30%) - Impact sur les populations mal desservies (25%) - Viabilité financière (20%) - Compétences de l'équipe (15%) - Pertinence régionale (10%) CALENDRIER Date limite de soumission : {deadline} Entretiens : {result_date} SOUMISSION : candidatures.{org_email}.org/AP-{ref} """ ] SECTOR_TITLES_EN = { "agritech": ["Digital Agriculture Innovation Grant", "Precision Farming Support Fund", "Smallholder AgriTech Scale-Up Grant", "Agricultural Digitization Challenge"], "healthtech": ["Rural Health Technology Grant", "Community Health Innovation Fund", "Digital Health Access Programme", "Telemedicine Expansion Grant"], "cleantech": ["Clean Energy Access Fund", "Renewable Energy Scale-Up Grant", "Green Technology Innovation Award", "Solar Solutions Deployment Grant"], "edtech": ["Digital Learning Innovation Fund", "EdTech for Inclusion Grant", "Offline Education Technology Grant", "Rural Digital Literacy Programme"], "fintech": ["Financial Inclusion Innovation Grant", "Digital Finance Scale-Up Fund", "Cooperative Finance Technology Grant", "Mobile Money Expansion Award"], "wastetech": ["Circular Economy Innovation Grant", "Waste-to-Value Technology Fund", "Sustainable Waste Management Grant", "Biogas and Composting Scale-Up"] } SECTOR_TITLES_FR = { "agritech": ["Subvention pour l'Innovation Agricole Numérique", "Fonds de Soutien à l'Agriculture de Précision", "Programme AgriTech pour Petits Exploitants"], "healthtech": ["Subvention Technologie Santé Rurale", "Fonds Innovation Santé Communautaire", "Programme de Télémédecine Rurale"], "cleantech": ["Fonds d'Accès à l'Énergie Propre", "Subvention Énergie Renouvelable", "Prix Innovation Technologie Verte"], "edtech": ["Fonds Innovation Apprentissage Numérique", "Subvention EdTech pour l'Inclusion", "Programme Éducation Hors-Ligne"], "fintech": ["Subvention Inclusion Financière", "Fonds Finance Numérique", "Programme Finance Coopérative Mobile"], "wastetech": ["Subvention Économie Circulaire", "Fonds Valorisation des Déchets", "Programme Biogaz et Compostage"] } def random_deadline(days_min=30, days_max=120): future = datetime.now() + timedelta(days=random.randint(days_min, days_max)) return future.strftime("%d %B %Y") def random_result_date(deadline_str): deadline = datetime.strptime(deadline_str, "%d %B %Y") result = deadline + timedelta(days=random.randint(30, 60)) return result.strftime("%d %B %Y") def generate_tender(tender_id, lang, sector, budget_tuple): budget_str, budget_val = budget_tuple max_grant = budget_val // 2 is_fr = lang == "fr" if is_fr: title = random.choice(SECTOR_TITLES_FR[sector]) template = random.choice(FR_TEMPLATES) else: title = random.choice(SECTOR_TITLES_EN[sector]) template = random.choice(EN_TEMPLATES) org = random.choice(ORGS) region = random.choice(REGIONS) countries = ", ".join(random.sample(COUNTRIES, 3)) min_employees = random.choice([3, 5, 10, 15]) prior_funding = random.choice(["Not required", "Preferred", "Required"]) deadline = random_deadline() result_date = random_result_date(deadline) org_email = org.lower().replace(" ", "").replace("&", "and")[:15] ref = f"{tender_id:03d}{random.randint(100,999)}" content = template.format( title=title, org=org, ref=ref, sector=sector, region=region, countries=countries, budget_str=budget_str, max_grant=f"{max_grant:,}", min_employees=min_employees, prior_funding=prior_funding, deadline=deadline, result_date=result_date, org_email=org_email ) return { "id": f"T{tender_id:03d}", "title": title, "sector": sector, "budget": budget_val, "deadline": deadline, "region": region, "language": lang, "content": content } def main(): os.makedirs("data/tenders", exist_ok=True) tenders = [] tender_id = 1 # Generate 40 tenders: 60% EN, 40% FR # Ensure each sector has tenders in both languages plan = [] for sector in SECTORS: for budget in BUDGETS[:2]: # 2 budgets per sector = 12 EN plan.append(("en", sector, budget)) for sector in SECTORS: for budget in BUDGETS[2:]: # 2 budgets per sector = 12 FR ... adjust plan.append(("fr", sector, budget)) # Add 16 more EN tenders for 60/40 split extras_en = [] for sector in random.choices(SECTORS, k=8): extras_en.append(("en", sector, random.choice(BUDGETS))) extras_fr = [] for sector in random.choices(SECTORS, k=4): extras_fr.append(("fr", sector, random.choice(BUDGETS))) plan = plan + extras_en + extras_fr random.shuffle(plan) plan = plan[:40] for lang, sector, budget in plan: tender = generate_tender(tender_id, lang, sector, budget) tenders.append(tender) fname = f"data/tenders/{tender['id']}_{lang}_{sector}.txt" with open(fname, "w", encoding="utf-8") as f: f.write(tender["content"]) print(f" Generated: {fname}") tender_id += 1 # Save metadata meta = [{k: v for k, v in t.items() if k != "content"} for t in tenders] with open("data/tenders_meta.json", "w") as f: json.dump(meta, f, indent=2) # Generate gold_matches.csv (3 expert matches per profile) profiles = json.load(open("data/profiles.json")) gold_rows = ["profile_id,tender_id,rank"] sector_to_tenders = {} for t in tenders: sector_to_tenders.setdefault(t["sector"], []).append(t["id"]) for p in profiles: sector = p["sector"] candidates = sector_to_tenders.get(sector, []) if len(candidates) < 3: # fallback: any tender candidates = [t["id"] for t in tenders] chosen = random.sample(candidates, min(3, len(candidates))) for rank, tid in enumerate(chosen, 1): gold_rows.append(f"{p['id']},{tid},{rank}") with open("data/gold_matches.csv", "w") as f: f.write("\n".join(gold_rows)) print(f"\n✅ Generated {len(tenders)} tenders in data/tenders/") print(f"✅ Saved data/tenders_meta.json") print(f"✅ Saved data/gold_matches.csv") if __name__ == "__main__": main()