Tender_Matcher / generate_data.py
Samson NIYIZURUGERO
code migration
dffabb7
#!/usr/bin/env python3
"""
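Synthetic Data Generator for CPI Tender Matcher
Generates tender documents across EN/FR as .txt files in data/tenders/ (target: 40),
plus data/tenders_meta.json and data/gold_matches.csv (reads data/profiles.json).
Run: python generate_data.py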
"""
import json
import random
import os
from datetime import datetime, timedelta
# Seed the module-level RNG so every random draw made by this script follows
# the same sequence from one run to the next.
random.seed(42)

# Sector tags; also used as keys of the per-sector title tables.
SECTORS = [
    "agritech",
    "healthtech",
    "cleantech",
    "edtech",
    "fintech",
    "wastetech",
]

# (display string, integer amount) pairs; the display string is the amount
# rendered with thousands separators.
BUDGETS = [(f"{amount:,}", amount) for amount in (5000, 50000, 200000, 1000000)]

REGIONS = [
    "East Africa",
    "West Africa",
    "Central Africa",
    "Southern Africa",
]

COUNTRIES = [
    "Rwanda",
    "Kenya",
    "Uganda",
    "Senegal",
    "DRC",
    "Ethiopia",
    "Tanzania",
    "Ghana",
    "Nigeria",
    "Cameroon",
]

# Issuing organisations a tender can be attributed to.
ORGS = [
    "African Development Bank",
    "USAID",
    "EU Delegation",
    "World Bank",
    "GIZ",
    "UNDP",
    "African Union",
    "Bill & Melinda Gates Foundation",
    "Mastercard Foundation",
    "Omidyar Network",
]
# English tender document templates. Each entry is a str.format template;
# generate_tender() picks one at random and fills the {placeholders}.
# The template text is runtime data: the lines inside the triple-quoted
# strings must stay at column 0 and must not be re-indented.
EN_TEMPLATES = [
# Template 1 ("GRANT OPPORTUNITY"): uses title, org, ref, sector, region,
# countries, budget_str, max_grant, min_employees, prior_funding, deadline,
# result_date and org_email.
"""GRANT OPPORTUNITY: {title}
Issuing Organization: {org}
Tender Reference: TND-{ref}
Sector: {sector}
Region: {region}
Eligible Countries: {countries}
OVERVIEW
{org} invites applications from qualified organizations for the {title}. This grant supports innovative solutions in the {sector} space across {region}.
BUDGET
Total available funding: USD {budget_str}
Maximum grant per applicant: USD {max_grant}
ELIGIBILITY
- Registered organizations operating in {region}
- Minimum {min_employees} full-time employees
- At least 1 year of operational history
- Prior funding experience preferred: {prior_funding}
OBJECTIVES
This tender aims to:
1. Accelerate {sector} innovation in underserved communities
2. Support scalable and sustainable business models
3. Foster cross-border collaboration in {region}
4. Promote gender inclusion and youth employment
APPLICATION REQUIREMENTS
Applicants must submit:
- Technical proposal (max 15 pages)
- Budget breakdown
- Organizational profile
- Letters of support from local partners
DEADLINE
Application deadline: {deadline}
Results announcement: {result_date}
CONTACT
For inquiries, contact: grants@{org_email}.org
Reference: {ref}
""",
# Template 2 ("FUNDING CALL"): same placeholders as template 1 except that
# prior_funding is not referenced.
"""FUNDING CALL: {title}
Reference Number: FC-{ref}
Funding Body: {org}
Focus Area: {sector}
Target Geography: {region}
BACKGROUND
Access to {sector} solutions remains limited across {region}. {org} is committed to bridging this gap through targeted grant support.
GRANT DETAILS
- Total envelope: USD {budget_str}
- Individual awards: up to USD {max_grant}
- Duration: 12–24 months
WHO CAN APPLY
Eligible applicants include:
• Social enterprises and cooperatives in {countries}
• NGOs with a proven track record in {sector}
• University spin-offs and research centres
• Minimum team size: {min_employees} employees
EVALUATION CRITERIA
Applications will be scored on:
- Innovation and scalability (30%)
- Impact on underserved populations (25%)
- Financial sustainability (20%)
- Team capability (15%)
- Regional relevance (10%)
KEY DATES
Submission deadline: {deadline}
Interview round: {result_date}
SUBMIT AT: apply.{org_email}.org/FC-{ref}
"""
]
# French tender document templates. Each entry is a str.format template;
# generate_tender() picks one at random when lang == "fr" and fills the
# {placeholders}. The template text is runtime data: the lines inside the
# triple-quoted strings must stay at column 0 and must not be re-indented.
FR_TEMPLATES = [
# Template 1 ("APPEL À CANDIDATURES"): uses title, org, ref, sector, region,
# countries, budget_str, max_grant, min_employees, prior_funding, deadline,
# result_date and org_email.
"""APPEL À CANDIDATURES : {title}
Organisme émetteur : {org}
Référence : TND-{ref}
Secteur : {sector}
Région : {region}
Pays éligibles : {countries}
PRÉSENTATION
{org} lance un appel à candidatures pour le {title}. Ce financement soutient des solutions innovantes dans le domaine {sector} à travers {region}.
BUDGET
Enveloppe totale disponible : USD {budget_str}
Subvention maximale par candidat : USD {max_grant}
ÉLIGIBILITÉ
- Organisations enregistrées opérant en {region}
- Au moins {min_employees} employés à temps plein
- Au moins 1 an d'existence
- Expérience de financement antérieure souhaitée : {prior_funding}
OBJECTIFS
Cet appel vise à :
1. Accélérer l'innovation {sector} dans les communautés mal desservies
2. Soutenir des modèles économiques évolutifs et durables
3. Favoriser la coopération transfrontalière en {region}
4. Promouvoir l'inclusion des femmes et l'emploi des jeunes
DOSSIER DE CANDIDATURE
Les candidats doivent soumettre :
- Proposition technique (15 pages max)
- Détail budgétaire
- Profil organisationnel
- Lettres de soutien de partenaires locaux
DATE LIMITE
Date de soumission : {deadline}
Annonce des résultats : {result_date}
CONTACT
Pour toute question : subventions@{org_email}.org
Référence : {ref}
""",
# Template 2 ("APPEL À PROJETS"): same placeholders as template 1 except that
# prior_funding is not referenced.
"""APPEL À PROJETS : {title}
Numéro de référence : AP-{ref}
Bailleur de fonds : {org}
Domaine prioritaire : {sector}
Zone géographique : {region}
CONTEXTE
L'accès aux solutions {sector} reste limité dans {region}. {org} s'engage à combler ce fossé grâce à un soutien ciblé.
DÉTAILS DU FINANCEMENT
- Enveloppe totale : USD {budget_str}
- Subventions individuelles : jusqu'à USD {max_grant}
- Durée : 12 à 24 mois
QUI PEUT CANDIDATER
Les candidats éligibles comprennent :
• Entreprises sociales et coopératives en {countries}
• ONG avec un historique prouvé dans {sector}
• Start-ups universitaires et centres de recherche
• Taille minimale de l'équipe : {min_employees} employés
CRITÈRES D'ÉVALUATION
Les dossiers seront notés sur :
- Innovation et capacité à l'échelle (30%)
- Impact sur les populations mal desservies (25%)
- Viabilité financière (20%)
- Compétences de l'équipe (15%)
- Pertinence régionale (10%)
CALENDRIER
Date limite de soumission : {deadline}
Entretiens : {result_date}
SOUMISSION : candidatures.{org_email}.org/AP-{ref}
"""
]
# English tender titles keyed by sector tag (four candidate titles per sector).
# generate_tender() draws one at random for non-French tenders.
SECTOR_TITLES_EN = {
"agritech": ["Digital Agriculture Innovation Grant", "Precision Farming Support Fund", "Smallholder AgriTech Scale-Up Grant", "Agricultural Digitization Challenge"],
"healthtech": ["Rural Health Technology Grant", "Community Health Innovation Fund", "Digital Health Access Programme", "Telemedicine Expansion Grant"],
"cleantech": ["Clean Energy Access Fund", "Renewable Energy Scale-Up Grant", "Green Technology Innovation Award", "Solar Solutions Deployment Grant"],
"edtech": ["Digital Learning Innovation Fund", "EdTech for Inclusion Grant", "Offline Education Technology Grant", "Rural Digital Literacy Programme"],
"fintech": ["Financial Inclusion Innovation Grant", "Digital Finance Scale-Up Fund", "Cooperative Finance Technology Grant", "Mobile Money Expansion Award"],
"wastetech": ["Circular Economy Innovation Grant", "Waste-to-Value Technology Fund", "Sustainable Waste Management Grant", "Biogas and Composting Scale-Up"]
}
# French tender titles keyed by sector tag (three candidate titles per sector).
# generate_tender() draws one at random when lang == "fr".
SECTOR_TITLES_FR = {
"agritech": ["Subvention pour l'Innovation Agricole Numérique", "Fonds de Soutien à l'Agriculture de Précision", "Programme AgriTech pour Petits Exploitants"],
"healthtech": ["Subvention Technologie Santé Rurale", "Fonds Innovation Santé Communautaire", "Programme de Télémédecine Rurale"],
"cleantech": ["Fonds d'Accès à l'Énergie Propre", "Subvention Énergie Renouvelable", "Prix Innovation Technologie Verte"],
"edtech": ["Fonds Innovation Apprentissage Numérique", "Subvention EdTech pour l'Inclusion", "Programme Éducation Hors-Ligne"],
"fintech": ["Subvention Inclusion Financière", "Fonds Finance Numérique", "Programme Finance Coopérative Mobile"],
"wastetech": ["Subvention Économie Circulaire", "Fonds Valorisation des Déchets", "Programme Biogaz et Compostage"]
}
def random_deadline(days_min=30, days_max=120):
    """Return a deadline date string a random number of days after now.

    The offset is drawn with random.randint, so both bounds are inclusive.

    Args:
        days_min: Smallest number of days to add to the current time.
        days_max: Largest number of days to add to the current time.

    Returns:
        The resulting date formatted with "%d %B %Y".
    """
    offset = timedelta(days=random.randint(days_min, days_max))
    due = datetime.now() + offset
    return f"{due:%d %B %Y}"
def random_result_date(deadline_str):
    """Return a results date 30 to 60 days (inclusive) after the deadline.

    Args:
        deadline_str: Deadline formatted as "%d %B %Y".

    Returns:
        The results date in the same "%d %B %Y" format.

    Raises:
        ValueError: if deadline_str does not match the expected format.
    """
    date_format = "%d %B %Y"
    parsed_deadline = datetime.strptime(deadline_str, date_format)
    waiting_period = timedelta(days=random.randint(30, 60))
    return (parsed_deadline + waiting_period).strftime(date_format)
def generate_tender(tender_id, lang, sector, budget_tuple):
    """Build one synthetic tender record with its rendered document text.

    Args:
        tender_id: Integer used for the "T###" id and as the prefix of the
            tender reference.
        lang: "fr" selects the French titles and templates; any other value
            selects the English ones.
        sector: Key into the per-sector title tables.
        budget_tuple: (display string, integer value) pair for the budget.

    Returns:
        A dict with keys "id", "title", "sector", "budget", "deadline",
        "region", "language" and "content" (the formatted template).
    """
    budget_str, budget_val = budget_tuple

    # The random draws below are made in a fixed order so that a seeded run
    # keeps producing the same documents; do not reorder them.
    if lang == "fr":
        title_pool, template_pool = SECTOR_TITLES_FR, FR_TEMPLATES
    else:
        title_pool, template_pool = SECTOR_TITLES_EN, EN_TEMPLATES
    title = random.choice(title_pool[sector])
    template = random.choice(template_pool)

    org = random.choice(ORGS)
    region = random.choice(REGIONS)
    country_list = ", ".join(random.sample(COUNTRIES, 3))
    team_size = random.choice([3, 5, 10, 15])
    funding_pref = random.choice(["Not required", "Preferred", "Required"])
    deadline = random_deadline()
    results_on = random_result_date(deadline)

    # Domain-like slug for the contact lines: lowercase, spaces removed,
    # "&" spelled out, truncated to 15 characters.
    domain_slug = org.lower().replace(" ", "").replace("&", "and")[:15]
    ref = f"{tender_id:03d}{random.randint(100,999)}"

    fields = {
        "title": title,
        "org": org,
        "ref": ref,
        "sector": sector,
        "region": region,
        "countries": country_list,
        "budget_str": budget_str,
        # Each applicant can receive at most half of the total envelope.
        "max_grant": f"{budget_val // 2:,}",
        "min_employees": team_size,
        "prior_funding": funding_pref,
        "deadline": deadline,
        "result_date": results_on,
        "org_email": domain_slug,
    }

    record = {
        "id": f"T{tender_id:03d}",
        "title": title,
        "sector": sector,
        "budget": budget_val,
        "deadline": deadline,
        "region": region,
        "language": lang,
    }
    record["content"] = template.format(**fields)
    return record
def main():
    """Generate the tender corpus, its metadata file and the gold matches.

    Outputs (paths relative to the current working directory):
      * data/tenders/<id>_<lang>_<sector>.txt -- one file per tender
      * data/tenders_meta.json -- every tender field except "content"
      * data/gold_matches.csv -- up to 3 ranked tender ids per profile

    Reads data/profiles.json, which must already exist; each profile is
    accessed through its "id" and "sector" keys.

    Raises:
        FileNotFoundError: if data/profiles.json is missing.
    """
    os.makedirs("data/tenders", exist_ok=True)

    # Target corpus: 40 tenders split 60% EN / 40% FR (24 EN + 16 FR).
    total_tenders = 40
    en_target = total_tenders * 60 // 100
    fr_target = total_tenders - en_target

    # Base plan guarantees every sector appears in both languages: EN tenders
    # take the two smaller budgets, FR tenders the remaining (larger) ones.
    base_en = [("en", sector, budget) for sector in SECTORS for budget in BUDGETS[:2]]
    base_fr = [("fr", sector, budget) for sector in SECTORS for budget in BUDGETS[2:]]

    # Top up each language with random sector/budget picks until the targets
    # are met. The previous hard-coded k=8 for EN produced only 36 tenders
    # (20 EN / 16 FR) instead of the documented 40 at 60/40.
    extras_en = [
        ("en", sector, random.choice(BUDGETS))
        for sector in random.choices(SECTORS, k=max(0, en_target - len(base_en)))
    ]
    extras_fr = [
        ("fr", sector, random.choice(BUDGETS))
        for sector in random.choices(SECTORS, k=max(0, fr_target - len(base_fr)))
    ]

    plan = base_en + base_fr + extras_en + extras_fr
    random.shuffle(plan)
    plan = plan[:total_tenders]

    tenders = []
    for tender_id, (lang, sector, budget) in enumerate(plan, 1):
        tender = generate_tender(tender_id, lang, sector, budget)
        tenders.append(tender)
        fname = f"data/tenders/{tender['id']}_{lang}_{sector}.txt"
        with open(fname, "w", encoding="utf-8") as f:
            f.write(tender["content"])
        print(f" Generated: {fname}")

    # Save metadata (everything except the rendered document text).
    meta = [{k: v for k, v in t.items() if k != "content"} for t in tenders]
    with open("data/tenders_meta.json", "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)

    # Generate gold_matches.csv (3 matches per profile). The file is opened
    # in a context manager so the handle is closed, and with an explicit
    # encoding so the read does not depend on the platform default.
    with open("data/profiles.json", encoding="utf-8") as f:
        profiles = json.load(f)

    sector_to_tenders = {}
    for t in tenders:
        sector_to_tenders.setdefault(t["sector"], []).append(t["id"])

    gold_rows = ["profile_id,tender_id,rank"]
    for p in profiles:
        candidates = sector_to_tenders.get(p["sector"], [])
        if len(candidates) < 3:
            # Fallback: too few same-sector tenders, sample from all of them.
            candidates = [t["id"] for t in tenders]
        chosen = random.sample(candidates, min(3, len(candidates)))
        for rank, tid in enumerate(chosen, 1):
            gold_rows.append(f"{p['id']},{tid},{rank}")

    with open("data/gold_matches.csv", "w", encoding="utf-8") as f:
        f.write("\n".join(gold_rows))

    print(f"\n✅ Generated {len(tenders)} tenders in data/tenders/")
    print(f"✅ Saved data/tenders_meta.json")
    print(f"✅ Saved data/gold_matches.csv")


if __name__ == "__main__":
    main()