Spaces:
Runtime error
Runtime error
| __copyright__ = "Copyright (C) 2023 Ali Mustapha" | |
| __license__ = "GPL-3.0-or-later" | |
| import pandas as pd | |
| import numpy as np | |
| import tensorflow as tf | |
| import pickle | |
| from Dictionary_guesser.name_nation_guesser import NameNationGuesser | |
| import datetime | |
| from utils import data_utils | |
class RegionPredictor:
    """Predict an author's region and sub-region from their name and timezone.

    Combines a dictionary-based guesser (``NameNationGuesser``) with
    per-region Keras models stored under ``models_directory``. The
    dictionary guess, when available, takes precedence over the model
    prediction.
    """

    def __init__(self, models_directory, places_filename='Dictionary_guesser/places.tab'):
        # Dictionary-based guesser keyed on the "sub-region" column of places.tab.
        self.guesser = NameNationGuesser(
            places_filename=places_filename,
            guess_first_second_min_mag=None,
            place_column_name="sub-region",
        )
        self.models_directory = models_directory

    def load_model(self, path):
        """Load a saved Keras model plus its label encoder and decision thresholds.

        ``path`` is a directory prefix ending in '/'.

        Returns:
            tuple: (model, label_encoder, optF1, optROC) where optF1/optROC
            are the per-class probability thresholds optimised for F1 / ROC.
        """
        model = tf.keras.models.load_model(path + "bestmodel.tf")
        # Re-compile so predict/evaluate behave consistently after loading.
        model.compile(
            loss=tf.keras.losses.categorical_crossentropy,
            optimizer=tf.keras.optimizers.Adam(),
            metrics=['accuracy'])
        # NOTE(review): pickle.load must only be used on trusted artifacts;
        # these files are assumed to ship with the models.
        with open(path + 'label_encoder.pkl', 'rb') as file:
            label_encoder = pickle.load(file)
        with open(path + 'optimal_thresholds_f1.pkl', 'rb') as file:
            optF1 = pickle.load(file)
        with open(path + 'optimal_thresholds_ROC.pkl', 'rb') as file:
            optROC = pickle.load(file)
        return model, label_encoder, optF1, optROC

    def model_prediction(self, dataset, model, label_encoder, optF1=None, optROC=None):
        """Run ``model`` on ``dataset`` and decode thresholded predictions.

        Thresholds the class probabilities with the F1- and ROC-optimal
        per-class cutoffs before argmax, then maps class indices back to
        labels via ``label_encoder``.

        Returns:
            tuple: (y_pred_F1, y_pred_ROC); each is an empty list when the
            corresponding thresholds were not supplied.
        """
        input_full_name = np.asarray(dataset['Author']).astype('str')
        input_offset = np.asarray(dataset['Author_Timezone']).astype('float')
        predictions_proba = model.predict({
            "input_text": input_full_name,
            "input_offset": input_offset,
        }, verbose=0)
        y_pred_F1 = []
        y_pred_ROC = []
        if optF1 is not None:
            hits = (predictions_proba >= optF1).astype(int)
            y_pred_F1 = label_encoder.inverse_transform(np.argmax(hits, axis=1))
        if optROC is not None:
            hits = (predictions_proba >= optROC).astype(int)
            y_pred_ROC = label_encoder.inverse_transform(np.argmax(hits, axis=1))
        return y_pred_F1, y_pred_ROC

    def guess_zone(self, name, epoch, offset):
        """Dictionary-based sub-region guess for ``name``.

        ``epoch`` is a Unix timestamp of the author's first commit and
        ``offset`` their timezone offset; both feed the guesser's
        country-population prior.
        """
        dt = datetime.datetime.fromtimestamp(epoch)
        country_pop_map = self.guesser.country_pop_from_datetime(dt, offset)
        return self.guesser.guess_zone(name, country_pop_map=country_pop_map)

    def get_Dictionary_Based_Region(self, df):
        """Add dictionary-based 'sub-region_Dictionary'/'region_Dictionary' columns."""
        df['Commit_Seconds'] = df['First_Commit_Date'].apply(lambda x: x.timestamp())
        df['sub-region_Dictionary'] = df.apply(
            lambda row: self.guess_zone(row['Author'], row['Commit_Seconds'], row['Author_Timezone']),
            axis=1)
        # Normalise NaN to None so truthiness checks downstream work.
        df['sub-region_Dictionary'] = df['sub-region_Dictionary'].apply(
            lambda x: x if pd.notna(x) else None)
        code_to_region = self.guesser.places_data.set_index('sub-region')['region'].to_dict()
        df['region_Dictionary'] = df['sub-region_Dictionary'].map(code_to_region)
        return df

    def get_region(self, dataset):
        """Predict region and sub-region for every author in ``dataset``.

        Adds prediction columns and returns the dataset re-assembled from
        the per-region partitions. Dictionary guesses override model
        predictions in the final 'region-prediction' /
        'sub-region-prediction' columns.
        """
        def transform_string(name):
            # BUG FIX: str.maketrans(x, y) requires len(x) == len(y); the
            # original passed 6 separator characters against a single space
            # and raised ValueError. Map each separator to a space instead.
            name = name.translate({ord(c): " " for c in r"-._\/+"})
            # Strip all digits.
            name = name.translate(str.maketrans("", "", "0123456789"))
            name = data_utils.text_to_romanize(name)
            name = data_utils.remove_spaces_from_ends(name)
            return name

        dataset['Author'] = dataset['Author'].apply(transform_string)
        # Rescale the timezone offset by 60 (presumably seconds -> minutes,
        # matching what the models were trained on — TODO confirm).
        dataset["Author_Timezone"] = dataset["Author_Timezone"] / 60
        dataset = self.get_Dictionary_Based_Region(dataset)
        model, label_encoder, optF1, optROC = self.load_model(
            self.models_directory + "/region/files/")
        y_pred_F1, y_pred_ROC = self.model_prediction(
            dataset, model, label_encoder, optF1, optROC)
        dataset["region-prediction_F1"] = y_pred_F1
        dataset["region-prediction_ROC"] = y_pred_ROC
        dataset['region_Dictionary'] = dataset['region_Dictionary'].apply(
            lambda x: x if pd.notna(x) else None)
        # Dictionary result wins when present; otherwise use the F1-threshold model.
        dataset["region-prediction"] = dataset.apply(
            lambda row: row["region_Dictionary"] if row["region_Dictionary"]
            else row["region-prediction_F1"],
            axis=1)
        # Partition per region. BUG FIX: .copy() makes each partition an
        # independent frame — the original assigned new columns to slice
        # views (SettingWithCopyWarning, unreliable write-back).
        subsets = {
            region: dataset[dataset["region-prediction"] == region].copy()
            for region in ("Europe", "Africa", "Asia", "Americas", "Oceania")
        }
        # Regions with a dedicated sub-region model.
        for region in ("Europe", "Asia", "Americas"):
            part = subsets[region]
            if not part.empty:
                model, label_encoder, optF1, optROC = self.load_model(
                    self.models_directory + "/" + region + "/files/")
                y_pred_F1, y_pred_ROC = self.model_prediction(
                    part, model, label_encoder, optF1, optROC)
                part["sub-region-prediction_F1"] = y_pred_F1
                part["sub-region-prediction_ROC"] = y_pred_ROC
        if not subsets["Oceania"].empty:
            # Oceania is mapped to a single fixed sub-region.
            subsets["Oceania"]["sub-region-prediction_F1"] = "Australia and New Zealand"
            subsets["Oceania"]["sub-region-prediction_ROC"] = "Australia and New Zealand"
        if not subsets["Africa"].empty:
            # No sub-region model for Africa: reuse the region label.
            subsets["Africa"]["sub-region-prediction_F1"] = subsets["Africa"]["region-prediction"]
            subsets["Africa"]["sub-region-prediction_ROC"] = subsets["Africa"]["region-prediction"]
        # Preserve the original concatenation order.
        data = pd.concat([subsets["Europe"], subsets["Asia"], subsets["Oceania"],
                          subsets["Americas"], subsets["Africa"]])
        data["sub-region_Dictionary"] = data["sub-region_Dictionary"].apply(
            lambda x: x if pd.notna(x) else None)
        data["sub-region-prediction"] = data.apply(
            lambda row: row["sub-region_Dictionary"] if row["sub-region_Dictionary"]
            else row["sub-region-prediction_F1"],
            axis=1)
        return data