AliMustapha's picture
modiv
c8cf296
#!/usr/bin/env python3
__copyright__ = "Copyright (C) 2022 Davide Rossi"
__license__ = "GPL-3.0-or-later"
import pandas as pd
import numpy as np
import random
from collections import defaultdict
# Fixed seed for the module-level `random` generator used by every make_up_* function.
random.seed(42)
# Per-code cache filled by setup_code(): code -> (forename frame, forename
# cumulative frequencies, surname frame, surname cumulative frequencies),
# or None when the names file has no rows for that code.
generators = dict()
# Full names table; read from names_file on the first setup_code() call.
initial_ds = None
# Places table and the lookup tables derived from it; all built by setup_places().
places_ds = None
# place code -> summed population
population_by_place = None
# zone -> summed population
population_by_zone = None
# zone -> cumulative population shares of the zone's places
rel_population_cumsum_by_zone = None
# zone -> list of place codes; None doubles as the "places not loaded yet" marker
places_by_zone = None
# place code -> zone
zone_by_place = None
# Default tab-separated input files; override with setup_names_file() / setup_places_file().
names_file = 'names_codes.tab'
places_file = 'places.tab'
def setup_names_file(file):
    """Select the tab-separated names file that setup_code() reads.

    Args:
        file: Path of the names table (anything pandas.read_csv accepts).

    Selecting a different file drops the cached names table and the
    per-code generators, so the new file is used even when names were
    already generated from the previous one.
    """
    global names_file, initial_ds
    if file != names_file:
        # setup_code() only reads the file while initial_ds is None and only
        # builds a generator for codes missing from `generators`; without
        # this reset the new file would be silently ignored.
        initial_ds = None
        generators.clear()
    names_file = file
def setup_code(code):
    """Build and cache the name generator for one code.

    On first use the whole names table is read from ``names_file``
    (tab-separated, no header; columns name, type, nation, code, freq).
    The rows matching ``code`` are split into forenames and surnames and
    each subset is turned into a cumulative frequency table.

    Args:
        code: Value to match against the ``code`` column.

    Side effects:
        Sets ``generators[code]`` to ``None`` when no row matches, otherwise
        to ``(forename_frame, forename_cum_freq, surname_frame,
        surname_cum_freq)``.
    """
    global initial_ds
    if initial_ds is None:
        # keep_default_na=False keeps literal strings such as "NA" as text.
        initial_ds = pd.read_csv(names_file, sep='\t',
                                 names=['name', 'type', 'nation', 'code', 'freq'],
                                 keep_default_na=False)
    ds = initial_ds[initial_ds['code'] == code]
    if len(ds) == 0:
        generators[code] = None
        return
    ds_fore, ds_fore_cum_freq = _cumulative_name_table(ds[ds['type'] == 'forename'])
    ds_sur, ds_sur_cum_freq = _cumulative_name_table(ds[ds['type'] == 'surname'])
    generators[code] = (ds_fore, ds_fore_cum_freq, ds_sur, ds_sur_cum_freq)


def _cumulative_name_table(rows):
    """Return (frame, cum_freq list) with relative and cumulative frequencies."""
    # Work on a copy so the shared names table is never modified.
    frame = rows.copy()
    total = frame['freq'].sum()
    frame['freq'] = frame['freq'] / total
    frame = frame.sort_values('freq')
    cum_freq = frame['freq'].cumsum()
    # Rounding can leave the final value a hair below 1.0; a random draw in
    # [0, 1) above it would then index one past the end of the table.
    if len(cum_freq) and cum_freq.iloc[-1] < 1.0:
        cum_freq.iloc[-1] = 1.0
    frame['cum_freq'] = cum_freq
    frame.reset_index(drop=True, inplace=True)
    return frame, frame['cum_freq'].tolist()
def setup_places_file(file):
    """Select the tab-separated places file that setup_places() reads.

    Args:
        file: Path of the places table (anything pandas.read_csv accepts).

    Selecting a different file marks the places tables as not loaded, so
    the next lookup rebuilds them from the new file.
    """
    global places_file, places_by_zone
    if file != places_file:
        # places_by_zone being None is what triggers setup_places() in the
        # lookup functions; the other place tables are rebuilt with it.
        places_by_zone = None
    places_file = file
def setup_places():
    """Load ``places_file`` and build the per-place and per-zone lookup tables.

    The file is tab-separated with one header row, which is replaced by the
    column names given below. Only rows whose population is non-zero take
    part in the derived tables.

    Side effects (module globals):
        places_ds: the full table as read.
        population_by_place: place code -> summed population.
        population_by_zone: zone -> summed population.
        places_by_zone: zone -> list of place codes, in file order.
        zone_by_place: place code -> zone.
        rel_population_cumsum_by_zone: zone -> cumulative population shares
            of the zone's places, aligned with places_by_zone[zone].
    """
    global places_ds
    global population_by_place
    global population_by_zone
    global places_by_zone
    global zone_by_place
    global rel_population_cumsum_by_zone
    table = pd.read_csv(places_file, sep='\t', header=0, keep_default_na=False, na_values='',
                        names=['country', 'state_name', 'region', 'un_subregion', 'zone', 'tz',
                               'population', 'sovereignty_numeric', 'sovereignty', 'code',
                               'code3', 'code_num', 'cctdl', 'fore'],
                        dtype={'population': int, 'sovereignty_numeric': int, 'code_num': int})
    inhabited = table[table.population != 0]
    # Series.items() works on every pandas version; iteritems() was removed in 2.0.
    by_place = dict(inhabited.groupby('code')['population'].sum().items())
    by_zone = dict(inhabited.groupby('zone')['population'].sum().items())
    zone_places = {}
    place_zone = {}
    zone_cumsum = {}
    # sort=False keeps zones in order of first appearance in the file.
    for zone, rows in inhabited.groupby('zone', sort=False):
        # unique() already de-duplicates in file order; going through a set
        # would make the order depend on string hash randomization and
        # break reproducibility of the seeded draws.
        places = rows['code'].dropna().unique().tolist()
        for place in places:
            place_zone[place] = zone
        zone_total = float(by_zone[zone])
        shares = np.cumsum([by_place[place] / zone_total for place in places])
        # Rounding can leave the last share just below 1.0; a draw in [0, 1)
        # above it would index past the end of the place list.
        if len(shares) and shares[-1] < 1.0:
            shares[-1] = 1.0
        zone_places[zone] = places
        zone_cumsum[zone] = shares
    # Publish everything only after the tables are complete, so a failure
    # above never leaves the "loaded" marker set on partial data.
    places_ds = table
    population_by_place = by_place
    population_by_zone = by_zone
    zone_by_place = place_zone
    rel_population_cumsum_by_zone = zone_cumsum
    places_by_zone = zone_places
def make_up_a_name_for_zone(zone):
    """Return a random full name for a zone, picking the place by population.

    A place of the zone is drawn with probability proportional to its
    population and make_up_a_name() is asked for a name there; places
    without name data are skipped and another draw is made.

    Args:
        zone: Key of places_by_zone.

    Returns:
        The generated name, or None when no place of the zone has name data
        (the loop used to spin forever in that case).

    Raises:
        KeyError: if ``zone`` is not a known inhabited zone.
    """
    if places_by_zone is None:
        setup_places()
    places = places_by_zone[zone]
    cumulative_share = rel_population_cumsum_by_zone[zone]
    last_index = len(places) - 1
    # Places already found to have no name data; re-drawing one of them
    # costs a random number but no further lookup.
    nameless = set()
    while len(nameless) < len(places):
        drawn = int(np.searchsorted(cumulative_share, random.random()))
        # Guard against a cumulative share that ends just below 1.0.
        place = places[min(drawn, last_index)]
        if place in nameless:
            continue
        name = make_up_a_name(place)
        if name is not None:
            return name
        nameless.add(place)
    return None
def all_inhabited_zones():
    """Return the keys of places_by_zone, loading the places tables if needed.

    The result is the dict's key view, i.e. every zone that setup_places()
    registered.
    """
    if places_by_zone is not None:
        return places_by_zone.keys()
    # First use: build the tables, then read the freshly bound global.
    setup_places()
    return places_by_zone.keys()
def make_up_a_name(code, switch=False):
    """Return a random "forename surname" for a code, weighted by frequency.

    Implemented directly rather than through make_up_a_forename() and
    make_up_a_surname() because that is faster.

    Args:
        code: Code whose name tables to draw from; its generator is built
            through setup_code() on first use.
        switch: When true, return "surname forename" instead.

    Returns:
        The name, or None when there is no name data for ``code`` or when
        either the forename or the surname table is empty.
    """
    if code not in generators:
        setup_code(code)
    generator = generators[code]
    if generator is None:
        return None
    ds_fore, ds_fore_cum_freq, ds_sur, ds_sur_cum_freq = generator
    if len(ds_fore_cum_freq) == 0 or len(ds_sur_cum_freq) == 0:
        return None
    # The forename draw comes first so the sequence of random numbers used
    # stays the same as before. min() guards against a cumulative table
    # whose last value fell just below 1.0 through rounding.
    fore_index = min(int(np.searchsorted(ds_fore_cum_freq, random.random())),
                     len(ds_fore_cum_freq) - 1)
    forename = ds_fore.iloc[fore_index]['name']
    sur_index = min(int(np.searchsorted(ds_sur_cum_freq, random.random())),
                    len(ds_sur_cum_freq) - 1)
    surname = ds_sur.iloc[sur_index]['name']
    if switch:
        return surname + " " + forename
    return forename + " " + surname
def make_up_a_forename(code):
    """Return a random forename for a code, weighted by frequency.

    Args:
        code: Code whose forename table to draw from; its generator is
            built through setup_code() on first use.

    Returns:
        The forename, or None when there is no name data for ``code`` or
        its forename table is empty.
    """
    # Build first, test for None afterwards: setup_code() stores None for a
    # code without data, and unpacking that None used to raise TypeError on
    # the very first call for such a code.
    if code not in generators:
        setup_code(code)
    generator = generators[code]
    if generator is None:
        return None
    ds_fore, ds_fore_cum_freq, _, _ = generator
    if len(ds_fore_cum_freq) == 0:
        return None
    # min() guards against a cumulative table ending just below 1.0.
    index = min(int(np.searchsorted(ds_fore_cum_freq, random.random())),
                len(ds_fore_cum_freq) - 1)
    return ds_fore.iloc[index]['name']
def make_up_a_surname(code):
    """Return a random surname for a code, weighted by frequency.

    Args:
        code: Code whose surname table to draw from; its generator is built
            through setup_code() on first use.

    Returns:
        The surname, or None when there is no name data for ``code`` or its
        surname table is empty.
    """
    # Build first, test for None afterwards: setup_code() stores None for a
    # code without data, and unpacking that None used to raise TypeError on
    # the very first call for such a code.
    if code not in generators:
        setup_code(code)
    generator = generators[code]
    if generator is None:
        return None
    _, _, ds_sur, ds_sur_cum_freq = generator
    if len(ds_sur_cum_freq) == 0:
        return None
    # min() guards against a cumulative table ending just below 1.0.
    index = min(int(np.searchsorted(ds_sur_cum_freq, random.random())),
                len(ds_sur_cum_freq) - 1)
    return ds_sur.iloc[index]['name']