ShadowProgrammer
/

HateSpeechRegressor

Model card Files Files and versions

HateSpeechRegressor / train.py

ShadowProgrammer's picture

ShadowProgrammer

Upload 5 files

a7eddd3 over 2 years ago

3.15 kB

	# `time` is imported first so the clock can be started before the heavier
	# third-party imports below; their load time is what gets reported.
	import time

	print("Loading libraries...")
	start_time = time.time()

	import sklearn
	from sklearn.model_selection import train_test_split
	# NOTE(review): accuracy_score, confusion_matrix and classification_report are
	# not referenced in the visible part of this script; only mean_squared_error is.
	from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error
	from sklearn.neural_network import MLPRegressor
	from sklearn.feature_extraction.text import CountVectorizer
	import matplotlib.pyplot as plt
	import datasets
	import pickle

	# Elapsed wall-clock time since `start_time`, in milliseconds, rounded to 3 decimals.
	print(f"Libraries loaded in {round((time.time() - start_time) * 1000, 3)} ms.")
	# --- Vectorizer ---------------------------------------------------------
	print("Loading vectorizer...")
	start_time = time.time()

	# Bag-of-words vectorizer. It has no vocabulary yet: the vocabulary is
	# learned by `fit_transform` in the preprocessing step below, and the
	# vectorizer is pickled only after that point.
	count_vect = CountVectorizer()

	print(f"Vectorizer loaded in {round((time.time() - start_time) * 1000, 3)} ms.")

	# --- Configuration ------------------------------------------------------
	print("Setting configuration...")
	start_time = time.time()

	# Set configuration
	sklearn.set_config(working_memory=4096)
	# Maximum number of dataset rows used for training and evaluation.
	data_size = 100000


	print(f"Configuration set in {round((time.time() - start_time) * 1000, 3)} ms.")

	# --- Data loading -------------------------------------------------------
	print("Loading data...")
	start_time = time.time()

	# Load data
	dataset = datasets.load_dataset('ucberkeley-dlab/measuring-hate-speech', 'binary')
	df = dataset['train'].to_pandas()

	print(f"Data loaded in {round((time.time() - start_time) * 1000, 3)} ms.")
	print(df.head())

	# --- Preprocessing ------------------------------------------------------
	print("Preprocessing data...")
	start_time = time.time()

	# Extract text and labels
	X_text = df['text'][:data_size] # Assuming 'text' is the column containing the text data
	# Regression targets: one output per column listed here.
	y_columns = ['hate_speech_score', 'sentiment', 'respect', 'insult', 'humiliate', 'status', 'dehumanize', 'violence', 'genocide', 'attack_defend', 'hatespeech']
	y = df[y_columns][:data_size]
	# Missing target values are replaced with 0 so the regressor never sees NaN.
	y = y.fillna(0)

	# Convert text to vectors (this is where the vocabulary is learned)
	X = count_vect.fit_transform(X_text)

	print(f"Data preprocessed in {round((time.time() - start_time) * 1000, 3)} ms.")

	# --- Persist the fitted vectorizer --------------------------------------
	# The vectorizer must be saved *after* fitting: an unfitted CountVectorizer
	# carries no vocabulary, so a pickle of it cannot transform text at
	# inference time.
	print("Saving vectorizer...")
	start_time = time.time()

	# Save vectorizer; the context manager guarantees the file is flushed and closed.
	with open('vectorizer.pkl', 'wb') as vectorizer_file:
	    pickle.dump(count_vect, vectorizer_file)

	print(f"Vectorizer saved in {round((time.time() - start_time) * 1000, 3)} ms.")
	print("Splitting data...")
	start_time = time.time()
	# Split data: 25% of the rows are held out for evaluation, the rest is used
	# for training. No random_state is passed, so the split differs between runs.
	X_train, X_test, y_train, y_test = train_test_split(
	    X,
	    y,
	    test_size=0.25,
	)

	split_ms = round((time.time() - start_time) * 1000, 3)
	print(f"Data split in {split_ms} ms.")
	print("Training model...")
	start_time = time.time()

	# Hyper-parameters for the multi-output MLP regressor, kept in one place.
	# With a tolerance this small and n_iter_no_change larger than max_iter,
	# the no-improvement stopping rule cannot trigger before max_iter is reached.
	mlp_settings = {
	    'hidden_layer_sizes': (256, 128, 64, 32, 16),
	    'activation': 'relu',
	    'solver': 'adam',
	    'alpha': 0.0001,
	    'learning_rate_init': 0.003,
	    'max_iter': 100,
	    'tol': 0.000000000001,
	    'n_iter_no_change': 5000,
	    'early_stopping': False,
	    'verbose': True,
	}

	# Create MLPRegressor model and fit it on the training split
	mlp = MLPRegressor(**mlp_settings)
	mlp.fit(X_train, y_train)

	training_seconds = round((time.time() - start_time), 3)
	print(f"Model trained in {training_seconds} s.")
	print("Evaluating model...")

	# Score the model on the held-out split
	predictions = mlp.predict(X_test)
	test_mse = mean_squared_error(y_test, predictions)
	print("Mean squared error: ", test_mse)

	# Draw the training loss recorded by the model, one point per iteration
	loss_figure, loss_axes = plt.subplots()
	loss_axes.plot(mlp.loss_curve_)
	loss_axes.set_title("Loss curve")
	loss_axes.set_xlabel("Iteration")
	loss_axes.set_ylabel("Loss")
	plt.show()

	print("Done!")

	# Save the model to disk

	filename = 'model.pkl'
	# Use a context manager so the file is flushed and closed deterministically
	# instead of relying on garbage collection of an anonymous file object.
	with open(filename, 'wb') as model_file:
	    pickle.dump(mlp, model_file)

	# Test the model for fun :)
	# Raw example inputs are kept as plain strings so they can be used as
	# dictionary keys and shown next to their predictions.
	sample_sentences = ["Fuck you you stupid nigger", "You're a piece of shit", "Awesome!", "Oh my god, I never realized that!"]

	# Use transform(), not fit_transform(): refitting on these four sentences
	# would replace the learned vocabulary, and the resulting feature count
	# would no longer match what the trained model expects.
	sentences = count_vect.transform(sample_sentences)

	predictions = mlp.predict(sentences)
	# Map each raw sentence to its predicted target vector (one value per
	# target column the model was trained on). zip() avoids len() on the
	# sparse matrix, which raises TypeError.
	values = dict(zip(sample_sentences, predictions))
	print(values)