Spaces:
Sleeping
Sleeping
| import csv | |
| import json | |
| import os | |
| import pickle | |
| from datetime import datetime | |
| import pandas | |
| import pandas as pd | |
| from sklearn import metrics | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.model_selection import train_test_split | |
| from Pinpoint_Internal import Logger | |
class random_forest():
    """
    Random forest binary classifier together with its evaluation scores and
    file metadata.

    Everything below is a class-level default; instances overwrite these
    values when a model is trained or read back from a model file.
    """

    # Classifier object and its evaluation scores (None until trained/loaded)
    model = None
    accuracy = precision = recall = f_measure = None

    # Model metadata, populated on creation or when reading a model file
    original_name = creation_date = None

    # Version stamped into every newly created model file.
    # v0.1 - versioning added.
    # v0.2 - Added more LIWC scores and minkowski distance
    _FRAMEWORK_VERSION = 0.2

    # Version of the model currently held; replaced with the stored value
    # when a model file written by a different framework version is read.
    model_version = _FRAMEWORK_VERSION

    # Folder defaults; the constructor assigns the real locations
    _outputs_folder = _model_folder = None

    # Feature categories used in the model
    # Radical language: "RF-IDF" scores (presumably TF-IDF - TODO confirm), word embeddings
    RADICAL_LANGUAGE_ENABLED = True
    # Psychological signals: LIWC dictionaries, Minkowski distance
    PSYCHOLOGICAL_SIGNALS_ENABLED = True
    # Behavioural features: frequency of tweets, followers / following ratio, centrality
    BEHAVIOURAL_FEATURES_ENABLED = True
| def __init__(self, outputs_folder="outputs", model_folder=None): | |
| """ | |
| Constructor | |
| The random_forest() class can be initialised with outputs_folder() and model_folder(). The outputs folder is | |
| where output files are stored and the model folder is where the model will be created if not overwritten. | |
| """ | |
| if model_folder is None: | |
| model_folder = outputs_folder | |
| self._outputs_folder = outputs_folder | |
| self._model_folder = model_folder | |
| def get_features_as_df(self, features_file, force_new_dataset=True): | |
| """ | |
| Reads a JSON file file and converts to a Pandas dataframe that can be used to train and test the classifier. | |
| :param features_file: the location of the JSON features file to convert to a dataframe | |
| :param force_new_dataset: if true a new CSV file will be created even if one already exists. | |
| :return: a Pandas dataframe with the features. | |
| """ | |
| with open(features_file) as json_features_file: | |
| csv_file = "{}.csv".format(features_file) | |
| if force_new_dataset or not os.path.isfile(csv_file): | |
| features = json.load(json_features_file) | |
| # todo remove the data for the features not being used. | |
| filtered_list_after_filters_applied = [] | |
| # If any of the filters are not true remove the features not requested | |
| column_names = [] | |
| if self.PSYCHOLOGICAL_SIGNALS_ENABLED: | |
| column_names = column_names + ["clout", "analytic", "tone", "authentic", | |
| "anger", "sadness", "anxiety", | |
| "power", "reward", "risk", "achievement", "affiliation", | |
| "i_pronoun", "p_pronoun", | |
| "minkowski"] | |
| if self.BEHAVIOURAL_FEATURES_ENABLED: | |
| column_names = column_names + ['centrality'] | |
| if self.RADICAL_LANGUAGE_ENABLED: | |
| # Add column names | |
| column_names = column_names + ["cap_freq", "violent_freq"] | |
| # Add the two hundred vectors columns | |
| for iterator in range(1, 201): | |
| column_names.append("message_vector_{}".format(iterator)) | |
| column_names = column_names + ['is_extremist'] | |
| if not self.BEHAVIOURAL_FEATURES_ENABLED or not self.PSYCHOLOGICAL_SIGNALS_ENABLED or self.RADICAL_LANGUAGE_ENABLED: | |
| # Loops through list of dicts (messages) | |
| number_of_processed_messages = 0 | |
| for message in features: | |
| number_of_processed_messages = number_of_processed_messages + 1 | |
| Logger.logger.print_message( | |
| "Extracting information from message {} of {} in file {}".format( | |
| number_of_processed_messages, | |
| len(features), | |
| features_file), | |
| logging_level=1) | |
| # Loops through dict keys (usernames) | |
| for user in message.keys(): | |
| message_features = message[user] | |
| feature_dict = {} | |
| if self.PSYCHOLOGICAL_SIGNALS_ENABLED: | |
| # Summary variables | |
| feature_dict["clout"] = message_features["clout"] | |
| feature_dict["analytic"] = message_features["analytic"] | |
| feature_dict["tone"] = message_features["tone"] | |
| feature_dict["authentic"] = message_features["authentic"] | |
| # Emotional Analysis | |
| feature_dict["anger"] = message_features["anger"] | |
| feature_dict["sadness"] = message_features["sadness"] | |
| feature_dict["anxiety"] = message_features["anxiety"] | |
| # Personal Drives | |
| feature_dict["power"] = message_features["power"] | |
| feature_dict["reward"] = message_features["reward"] | |
| feature_dict["risk"] = message_features["risk"] | |
| feature_dict["achievement"] = message_features["achievement"] | |
| feature_dict["affiliation"] = message_features["affiliation"] | |
| # Personal Pronouns | |
| feature_dict["i_pronoun"] = message_features["i_pronoun"] | |
| feature_dict["p_pronoun"] = message_features["p_pronoun"] | |
| # Minkowski distance | |
| feature_dict["minkowski"] = message_features["minkowski"] | |
| if self.BEHAVIOURAL_FEATURES_ENABLED: | |
| #feature_dict['post_freq'] = message_features['post_freq'] | |
| #feature_dict['follower_freq'] = message_features['follower_freq'] | |
| feature_dict['centrality'] = message_features['centrality'] | |
| if self.RADICAL_LANGUAGE_ENABLED: | |
| feature_dict["message_vector"] = message_features["message_vector"] | |
| feature_dict["violent_freq"] = message_features["violent_freq"] | |
| feature_dict["cap_freq"] = message_features["cap_freq"] | |
| feature_dict['is_extremist'] = message_features['is_extremist'] | |
| user = {user: feature_dict} | |
| filtered_list_after_filters_applied.append(user) | |
| number_of_features = len(filtered_list_after_filters_applied) | |
| # Creates the columns for the data frame | |
| df = pd.DataFrame( | |
| columns=column_names) | |
| completed_features = 0 | |
| iterator = 0 | |
| error_count = 0 | |
| for message in features: | |
| # should only be one user per entry | |
| for user_id in message: | |
| feature_data = message[user_id] | |
| # ID is not included as it's hexidecimal and not float | |
| row = [] | |
| if self.PSYCHOLOGICAL_SIGNALS_ENABLED: | |
| clout = feature_data['clout'] | |
| analytic = feature_data['analytic'] | |
| tone = feature_data['tone'] | |
| authentic = feature_data['authentic'] | |
| anger = feature_data["anger"] | |
| sadness = feature_data["sadness"] | |
| anxiety = feature_data["anxiety"] | |
| power = feature_data["power"] | |
| reward = feature_data["reward"] | |
| risk = feature_data["risk"] | |
| achievement = feature_data["achievement"] | |
| affiliation = feature_data["affiliation"] | |
| i_pronoun = feature_data["i_pronoun"] | |
| p_pronoun = feature_data["p_pronoun"] | |
| minkowski = feature_data["minkowski"] | |
| row = row + [clout, analytic, tone, authentic, anger, sadness, anxiety, power, | |
| reward, risk, achievement, affiliation, i_pronoun, p_pronoun, minkowski] | |
| if self.BEHAVIOURAL_FEATURES_ENABLED: | |
| #post_freq = feature_data['post_freq'] | |
| #follower_freq = feature_data['follower_freq'] | |
| centrality = feature_data['centrality'] | |
| row = row + [#post_freq, follower_freq, | |
| centrality] | |
| if self.RADICAL_LANGUAGE_ENABLED: | |
| cap_freq = feature_data['cap_freq'] | |
| violent_freq = feature_data['violent_freq'] | |
| message_vector = feature_data['message_vector'] | |
| row = row + [cap_freq, violent_freq] + message_vector | |
| is_extremist = feature_data['is_extremist'] | |
| row = row + [is_extremist] | |
| try: | |
| df.loc[iterator] = row | |
| except ValueError as e: | |
| print(e) | |
| error_count = error_count + 1 | |
| pass # if error with value probably column mismatch which is down to taking a mesage with no data | |
| iterator = iterator + 1 | |
| completed_features = completed_features + 1 | |
| user_name = list(message.keys())[0] | |
| Logger.logger.print_message( | |
| "Added a message from user {} to data frame - {} messages of {} completed".format(user_name, | |
| completed_features, | |
| number_of_features), | |
| logging_level=1) | |
| Logger.logger.print_message("Total errors when creating data frame: {}".format(error_count), | |
| logging_level=1) | |
| # Replace boolean with float | |
| df.replace({False: 0, True: 1}, inplace=True) | |
| # Sets ID field | |
| df.index.name = "ID" | |
| df.to_csv("{}.csv".format(features_file)) | |
| else: | |
| df = pandas.read_csv(csv_file) | |
| return df | |
def create_model_info_output_file(self, location_of_output_file=None, training_data_csv_location=None):
    """
    Writes a plain-text summary of the current model: name, version, creation date, evaluation scores and
    one line per feature importance. Nothing is written when no model has been trained or loaded yet
    (i.e. creation_date is unset); a message is logged instead.

    :param location_of_output_file: The file to write the summary to. When omitted a timestamped
        "model-output-<date>.txt" file is created in the outputs folder.
    :param training_data_csv_location: The location of the training data csv. Its header row supplies the
        feature names; when omitted the features are listed by number.
    """
    if not self.creation_date:
        Logger.logger.print_message(
            "Model has not been trained, created, or loaded. Cannot output model data in this state.",
            logging_level=1)
        return

    Logger.logger.print_message("Creating model info text file")

    # Summary information
    sections = [
        "Model {}, version {}, created at {} \n".format(self.original_name, self.model_version,
                                                        self.creation_date),
        "\nAccuracy: {}\nRecall: {} \nPrecision: {}\nF-Measure: {}\n".format(self.accuracy, self.recall,
                                                                             self.precision, self.f_measure),
    ]

    # Header names are only available when the training CSV was supplied
    headers = None
    if training_data_csv_location:
        with open(training_data_csv_location, "r") as csv_file:
            headers = next(csv.reader(csv_file))

    # One line per feature importance score
    for position, importance in enumerate(self.model.feature_importances_):
        if training_data_csv_location:
            # Plus one to ignore ID field
            sections.append("\n{}: {}".format(headers[position + 1], importance))
        else:
            sections.append("\nFeature {}: {}".format(position, importance))

    # Fall back to a timestamped file in the outputs folder when no name was given
    file_name = location_of_output_file if location_of_output_file else os.path.join(
        self._outputs_folder, "model-output-{}.txt".format(datetime.today().strftime('%Y-%m-%d-%H%M%S')))

    with open(file_name, "w") as output_file:
        output_file.write("".join(sections))
| def train_model(self, features_file, force_new_dataset=True, model_location=None): | |
| """ | |
| Trains the model of the proveded data unless the model file already exists or if the force new dataset flag is True. | |
| :param features_file: the location of the feature file to be used to train the model | |
| :param force_new_dataset: If True a new dataset will be created and new model created even if a model already exists. | |
| :param model_location: the location to save the model file to | |
| """ | |
| # Sets model location based on default folder location and placeholder name if none was given | |
| if model_location is None: | |
| model_location = os.path.join(self._model_folder, "predictor.model") | |
| # if told to force the creation of a new dataset to train off or the model location does not exist then make a new model | |
| if force_new_dataset or not os.path.isfile(model_location): | |
| # Import train_test_split function | |
| feature_data = self.get_features_as_df(features_file, force_new_dataset) | |
| # Removes index column | |
| if "ID" in feature_data.keys(): | |
| feature_data.drop(feature_data.columns[0], axis=1, inplace=True) | |
| feature_data.reset_index(drop=True, inplace=True) | |
| y = feature_data[['is_extremist']] # Labels | |
| X = feature_data.drop(axis=1, labels=['is_extremist']) # Features | |
| # Split dataset into training set and test set | |
| X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # 80% training and 20% test | |
| # Create a Gaussian Classifier | |
| random_forest = RandomForestClassifier(n_estimators=100, max_depth=50, oob_score=True | |
| ) # class_weight={0:1,1:5} # A higher weight for the minority class (is_extreamist) | |
| # Train the model using the training sets y_pred=random_forest.predict(X_test) | |
| random_forest.fit(X_train, y_train.values.ravel()) | |
| y_pred = random_forest.predict(X_test) | |
| # Model Accuracy, how often is the classifier correct? | |
| self.accuracy = metrics.accuracy_score(y_test, y_pred) | |
| self.recall = metrics.recall_score(y_test, y_pred) | |
| self.precision = metrics.precision_score(y_test, y_pred) | |
| self.f_measure = metrics.f1_score(y_test, y_pred) | |
| Logger.logger.print_message("Accuracy: {}".format(self.accuracy), logging_level=1) | |
| Logger.logger.print_message("Recall: {}".format(self.recall), logging_level=1) | |
| Logger.logger.print_message("Precision: {}".format(self.precision), logging_level=1) | |
| Logger.logger.print_message("F-Measure: {}".format(self.f_measure), logging_level=1) | |
| self.model = random_forest | |
| self.original_name = model_location | |
| self.creation_date = datetime.today().strftime('%Y-%m-%d') | |
| # write model and accuracy to file to file | |
| model_data = {"model": self.model, | |
| "original_name": self.original_name, | |
| "creation_date": self.creation_date, | |
| "accuracy": self.accuracy, | |
| "recall": self.recall, | |
| "precision": self.precision, | |
| "f1": self.f_measure, | |
| "version": self._FRAMEWORK_VERSION | |
| } | |
| pickle.dump(model_data, open(model_location, "wb")) | |
| else: | |
| # Read model and accuracy from file | |
| saved_file = pickle.load(open(model_location, "rb")) | |
| self.accuracy = saved_file["accuracy"] | |
| self.recall = saved_file["recall"] | |
| self.precision = saved_file["precision"] | |
| self.f_measure = saved_file["f1"] | |
| self.model = saved_file["model"] | |
| self.model_version = saved_file["version"] | |
| self.original_name = saved_file["original_name"] | |
| self.creation_date = saved_file["creation_date"] | |
| # A check to identify if the loaded model is of the same version as the tooling | |
| if self.model_version is not self._FRAMEWORK_VERSION: | |
| Logger.logger.print_message("Model provided is of version {}, tooling is of " | |
| "version {}. Using the model may not work as expected." | |
| .format(self.model_version, self._FRAMEWORK_VERSION)) |