Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| from sklearn.feature_extraction.text import CountVectorizer | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import warnings | |
| warnings.filterwarnings("ignore", category=FutureWarning) | |
| import pickle | |
| def preprocess_bag_of_words(preprocessed_text_list): | |
| texts = [preprocessed["original_text"] for preprocessed in preprocessed_text_list] | |
| vectorizer = CountVectorizer() | |
| bag_of_words = vectorizer.fit_transform(texts) | |
| bow_df = pd.DataFrame(bag_of_words.toarray(), columns=vectorizer.get_feature_names_out()) | |
| return bow_df, vectorizer | |
| if __name__ == "__main__": | |
| preprocessed_data = pd.read_pickle('preprocessed_deeds.pkl') | |
| texts = preprocessed_data['original_text'] | |
| preprocessed_text_list = texts.apply(lambda x: {"original_text": x}).tolist() | |
| bow_df, vectorizer = preprocess_bag_of_words(preprocessed_text_list) | |
| X = bow_df | |
| y = preprocessed_data['is_racist'] | |
| X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) | |
| logistic_model = LogisticRegression(max_iter=1000) | |
| logistic_model.fit(X_train, y_train) | |
| # Save the model and vectorizer | |
| with open('vectorizer.pkl', 'wb') as vec_file: | |
| pickle.dump(vectorizer, vec_file) | |
| with open('logistic_model.pkl', 'wb') as model_file: | |
| pickle.dump(logistic_model, model_file) | |
| y_pred = logistic_model.predict(X_test) | |
| accuracy = accuracy_score(y_test, y_pred) | |
| print(f"Accuracy: {accuracy:.2f}") | |
| print("\nClassification Report:") | |
| print(classification_report(y_test, y_pred)) | |
| conf_matrix = confusion_matrix(y_test, y_pred) | |
| plt.figure(figsize=(6, 4)) | |
| sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=['Non-racist', 'Racist'], yticklabels=['Non-racist', 'Racist']) | |
| plt.title('Confusion Matrix') | |
| plt.xlabel('Predicted') | |
| plt.ylabel('Actual') | |
| plt.show() | |
| y_prob = logistic_model.predict_proba(X_test)[:, 1] | |
| fpr, tpr, _ = roc_curve(y_test, y_prob) | |
| roc_auc = auc(fpr, tpr) | |
| plt.figure(figsize=(6, 4)) | |
| plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})') | |
| plt.plot([0, 1], [0, 1], 'k--') | |
| plt.xlim([0.0, 1.0]) | |
| plt.ylim([0.0, 1.05]) | |
| plt.xlabel('False Positive Rate') | |
| plt.ylabel('True Positive Rate') | |
| plt.title('Receiver Operating Characteristic (ROC) Curve') | |
| plt.legend(loc="lower right") | |
| plt.show() | |
| feature_importance = pd.Series(logistic_model.coef_[0], index=vectorizer.get_feature_names_out()) | |
| top_features = feature_importance.nlargest(10) | |
| plt.figure(figsize=(8, 6)) | |
| top_features.plot(kind='barh', color='skyblue') | |
| plt.title('Top 10 Most Influential Words for Racist Classification') | |
| plt.xlabel('Coefficient Value') | |
| plt.ylabel('Word') | |
| plt.show() | |
| # Function to make predictions based on the trained model | |
| def predict(processed_text, vectorizer, logistic_model): | |
| bow_text = vectorizer.transform([processed_text["original_text"]]) | |
| prediction = logistic_model.predict(bow_text) | |
| return { | |
| 'is_racist': bool(prediction[0]), | |
| } |