Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python3 | |
| __copyright__ = "Copyright (C) 2022 Davide Rossi" | |
| __license__ = "GPL-3.0-or-later" | |
| import pandas as pd | |
| import numpy as np | |
| import random | |
| from collections import defaultdict | |
# Seed the stdlib RNG once at import time so that name generation is
# repeatable from run to run.
random.seed(42)

# Default tab-separated input files; setup_names_file() and
# setup_places_file() rebind these.
names_file = 'names_codes.tab'
places_file = 'places.tab'

# Name data: the raw names table (read on first use by setup_code) and the
# per-code sampling tables built from it. A code maps to None when the names
# file has no rows for it.
initial_ds = None
generators = {}

# Place data, all filled in by setup_places(); None means "not loaded yet".
places_ds = None
population_by_place = None
population_by_zone = None
places_by_zone = None
zone_by_place = None
rel_population_cumsum_by_zone = None
def setup_names_file(file):
    """Select the tab-separated names file used by the name generators.

    Anything already loaded from a previous names file is discarded, so the
    next lookup reads *file* instead of silently reusing stale tables.

    Args:
        file: Path (or anything ``pandas.read_csv`` accepts) of the names
            file.
    """
    global names_file, initial_ds
    names_file = file
    # Drop the cached raw table and the per-code tables derived from it.
    # The dict is cleared in place so existing references to it stay valid.
    initial_ds = None
    generators.clear()
def _build_name_table(ds, kind):
    """Return ``(table, cum_freq)`` for the rows of *ds* whose type is *kind*.

    ``table`` holds the matching rows sorted by ascending relative frequency
    with a fresh 0..n-1 index; ``cum_freq`` is the running sum of those
    relative frequencies as a plain list, for use with ``np.searchsorted``.
    """
    # Explicit copy: columns are added/overwritten below and must not touch
    # (or warn about) the shared source frame.
    table = ds[ds['type'] == kind].copy()
    table['freq'] = table['freq'] / table['freq'].sum()
    table = table.sort_values('freq')
    table['cum_freq'] = table['freq'].cumsum()
    table.reset_index(drop=True, inplace=True)
    cum_freq = table['cum_freq'].tolist()
    if cum_freq:
        # Float round-off can leave the final sum a hair below 1.0; a random
        # draw above it would then map past the last row. Pin it to 1.0.
        cum_freq[-1] = 1.0
    return table, cum_freq


def setup_code(code):
    """Build and cache the forename/surname sampling tables for *code*.

    The names file is read on first use and kept in ``initial_ds``. The result
    is stored in ``generators[code]`` as
    ``(forenames, forename_cum_freq, surnames, surname_cum_freq)``, or as
    ``None`` when the names file has no rows for *code*.

    Args:
        code: Value matched against the ``code`` column of the names file.
    """
    global initial_ds, generators, names_file
    if initial_ds is None:
        # keep_default_na=False keeps literal strings such as "NA" as text.
        initial_ds = pd.read_csv(names_file, sep='\t',
                                 names=['name', 'type', 'nation', 'code', 'freq'],
                                 keep_default_na=False)
    ds = initial_ds[initial_ds['code'] == code]
    if ds.empty:
        generators[code] = None
        return
    ds_fore, ds_fore_cum_freq = _build_name_table(ds, 'forename')
    ds_sur, ds_sur_cum_freq = _build_name_table(ds, 'surname')
    generators[code] = (ds_fore, ds_fore_cum_freq, ds_sur, ds_sur_cum_freq)
def setup_places_file(file):
    """Select the tab-separated places file used by the zone functions.

    Place data loaded from a previous file is discarded; the zone functions
    reload lazily when ``places_by_zone`` is ``None``, so the next call reads
    *file*.

    Args:
        file: Path (or anything ``pandas.read_csv`` accepts) of the places
            file.
    """
    global places_file, places_ds
    global population_by_place, population_by_zone
    global places_by_zone, zone_by_place, rel_population_cumsum_by_zone
    places_file = file
    # Reset everything setup_places() fills in so no stale table survives.
    places_ds = None
    population_by_place = None
    population_by_zone = None
    places_by_zone = None
    zone_by_place = None
    rel_population_cumsum_by_zone = None
def setup_places():
    """Load the places file and build the per-zone sampling tables.

    Only rows with a non-zero population are used. Fills these module globals:

    * ``places_ds``: the raw table as read from ``places_file``.
    * ``population_by_place``: total population per ``code``.
    * ``population_by_zone``: total population per ``zone``.
    * ``places_by_zone``: list of codes per zone, in file order.
    * ``zone_by_place``: zone of each code (the last zone seen wins).
    * ``rel_population_cumsum_by_zone``: per zone, the cumulative population
      share of its places, aligned with ``places_by_zone[zone]`` and ending
      at exactly 1.0.
    """
    global places_ds
    global population_by_place
    global population_by_zone
    global places_by_zone
    global zone_by_place
    global rel_population_cumsum_by_zone

    # header=0 together with names= discards the file's own header row;
    # only empty fields are treated as missing, so "NA" stays a string.
    table = pd.read_csv(
        places_file, sep='\t', header=0, keep_default_na=False, na_values='',
        names=['country', 'state_name', 'region', 'un_subregion', 'zone', 'tz',
               'population', 'sovereignty_numeric', 'sovereignty', 'code',
               'code3', 'code_num', 'cctdl', 'fore'],
        dtype={'population': int, 'sovereignty_numeric': int, 'code_num': int})
    inhabited = table[table.population != 0]

    # Series.items() replaces iteritems(), which pandas 2.0 removed.
    by_place = dict(inhabited.groupby('code')['population'].sum().items())
    by_zone = dict(inhabited.groupby('zone')['population'].sum().items())

    zone_places = {}
    zone_cumsum = {}
    place_zone = {}
    for zone in inhabited['zone'].dropna().unique():
        # unique() keeps first-appearance order, so seeded runs are
        # reproducible (a set of strings is ordered differently per process).
        codes = inhabited.loc[inhabited.zone == zone, 'code'].dropna().unique().tolist()
        if not codes:
            continue
        for place in codes:
            place_zone[place] = zone
        zone_places[zone] = codes
        # Normalising by the running total's last entry makes the final
        # value exactly 1.0, so a draw from [0, 1) always maps to a place.
        running = np.cumsum([by_place[place] for place in codes], dtype=float)
        zone_cumsum[zone] = running / running[-1]

    # Publish only after everything was built, so a failure above leaves
    # places_by_zone untouched and the lazy loaders will try again.
    places_ds = table
    population_by_place = by_place
    population_by_zone = by_zone
    zone_by_place = place_zone
    rel_population_cumsum_by_zone = zone_cumsum
    places_by_zone = zone_places
def make_up_a_name_for_zone(zone):
    """Make up a full name for a randomly chosen place of *zone*.

    A place is drawn using the zone's cumulative population shares and a name
    is generated for it; places for which ``make_up_a_name`` returns ``None``
    are drawn again.

    Args:
        zone: Key of ``places_by_zone`` (loaded on first use).

    Returns:
        The generated name, or ``None`` when the zone has no places or none
        of them yields a name (previously this case looped forever).

    Raises:
        KeyError: If *zone* is not a known zone.
    """
    global places_by_zone
    global rel_population_cumsum_by_zone
    if places_by_zone is None:
        setup_places()
    places = places_by_zone[zone]
    rel_population = rel_population_cumsum_by_zone[zone]
    last_index = len(places) - 1
    without_names = set()
    while len(without_names) < len(places):
        # Clamp: if round-off leaves the last cumulative share below the
        # draw, searchsorted returns len(places).
        index = min(int(np.searchsorted(rel_population, random.random())), last_index)
        place = places[index]
        if place in without_names:
            continue
        name = make_up_a_name(place)
        if name is not None:
            return name
        without_names.add(place)
    return None
def all_inhabited_zones():
    """Return the keys view of ``places_by_zone``, loading places if needed."""
    if places_by_zone is not None:
        return places_by_zone.keys()
    # First use: setup_places() fills in places_by_zone.
    setup_places()
    return places_by_zone.keys()
def make_up_a_name(code, switch=False):
    """Make up a full name for *code*, weighted by name frequency.

    Args:
        code: Key into ``generators``; its tables are built on first use.
        switch: When true, return "surname forename" instead of
            "forename surname".

    Returns:
        The generated name, or ``None`` when there is no name data for
        *code* or either its forename or surname table is empty.
    """
    # Done inline: faster than combining make_up_a_forename and
    # make_up_a_surname.
    global generators
    if code not in generators:
        setup_code(code)
    tables = generators[code]
    if tables is None:
        return None
    ds_fore, ds_fore_cum_freq, ds_sur, ds_sur_cum_freq = tables
    if len(ds_fore) == 0 or len(ds_sur) == 0:
        # Nothing to draw from; indexing would raise IndexError.
        return None
    # Clamp both indices: if round-off leaves the last cumulative frequency
    # below the draw, searchsorted returns len(table).
    fore_index = min(int(np.searchsorted(ds_fore_cum_freq, random.random())),
                     len(ds_fore) - 1)
    sur_index = min(int(np.searchsorted(ds_sur_cum_freq, random.random())),
                    len(ds_sur) - 1)
    forename = ds_fore['name'].iloc[fore_index]
    surname = ds_sur['name'].iloc[sur_index]
    if switch:
        return surname + " " + forename
    return forename + " " + surname
def make_up_a_forename(code):
    """Make up a forename for *code*, weighted by name frequency.

    Args:
        code: Key into ``generators``; its tables are built on first use.

    Returns:
        The forename, or ``None`` when there is no name data for *code* or
        its forename table is empty.
    """
    global generators
    # Build the tables first: setup_code stores None for an unknown code, so
    # the None check must come after it (otherwise the first call for such a
    # code tries to unpack None).
    if code not in generators:
        setup_code(code)
    tables = generators[code]
    if tables is None:
        return None
    ds_fore, ds_fore_cum_freq, _, _ = tables
    if len(ds_fore) == 0:
        return None
    # Clamp: if round-off leaves the last cumulative frequency below the
    # draw, searchsorted returns len(table).
    index = min(int(np.searchsorted(ds_fore_cum_freq, random.random())),
                len(ds_fore) - 1)
    return ds_fore['name'].iloc[index]
def make_up_a_surname(code):
    """Make up a surname for *code*, weighted by name frequency.

    Args:
        code: Key into ``generators``; its tables are built on first use.

    Returns:
        The surname, or ``None`` when there is no name data for *code* or
        its surname table is empty.
    """
    global generators
    # Build the tables first: setup_code stores None for an unknown code, so
    # the None check must come after it (otherwise the first call for such a
    # code tries to unpack None).
    if code not in generators:
        setup_code(code)
    tables = generators[code]
    if tables is None:
        return None
    _, _, ds_sur, ds_sur_cum_freq = tables
    if len(ds_sur) == 0:
        return None
    # Clamp: if round-off leaves the last cumulative frequency below the
    # draw, searchsorted returns len(table).
    index = min(int(np.searchsorted(ds_sur_cum_freq, random.random())),
                len(ds_sur) - 1)
    return ds_sur['name'].iloc[index]