"""model.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/15FRJX9FjEoohrUP2imym9WsfgDFmXYxe
"""
|
|
import pandas as pd

# Load the raw email dataset and take a quick look at its structure.
df = pd.read_csv('/content/combined_emails_with_natural_pii.csv')
print(df.columns)
# In a plain .py script a bare `df.head()` expression is silently discarded
# (it only renders in a notebook cell), so print it explicitly.
print(df.head())
|
|
| |
# `!pip install ...` is IPython/Colab shell magic and raises a SyntaxError
# when this file is run as a plain Python script. Install the dependency
# from a shell (or a notebook cell) instead:
#     pip install imbalanced-learn
|
|
# Standard library
import re

# Third-party
import joblib
import nltk
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

# Corpora required by the text-cleaning step below.
nltk.download('stopwords')
nltk.download('wordnet')
# Recent NLTK releases also need the Open Multilingual WordNet tables for
# WordNetLemmatizer to load; downloading it is harmless on older versions.
nltk.download('omw-1.4')
|
|
| |
# Re-read the dataset so this section runs independently of the exploratory
# cells above, then normalise the column names and drop rows that are
# missing either the email text or its label.
df = (
    pd.read_csv('/content/combined_emails_with_natural_pii.csv')
    .rename(columns={'email': 'email_body', 'type': 'category'})
    .dropna(subset=['email_body', 'category'])
)
|
|
| |
# Hoisted invariants: the original rebuilt the stop-word set, the lemmatizer,
# and every regex on each call, which dominates the cost of cleaning
# thousands of emails. Build them once at module level instead.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()
_NEWLINE_RE = re.compile(r'\n|\r')
_SUBJECT_RE = re.compile(r'Subject:')
_NON_ALPHA_RE = re.compile(r'[^a-z\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Normalise one email body for TF-IDF vectorisation.

    Steps (order preserved from the original: 'Subject:' is stripped before
    lowercasing, so only the exact-cased header token is removed): collapse
    newlines, drop the 'Subject:' marker, lowercase, keep only ASCII letters
    and spaces, collapse whitespace, then remove English stop words and
    lemmatize the surviving tokens.

    Args:
        text: raw email body (str).

    Returns:
        Cleaned, space-joined token string.
    """
    text = _NEWLINE_RE.sub(' ', text)
    text = _SUBJECT_RE.sub('', text)
    text = text.lower()
    text = _NON_ALPHA_RE.sub('', text)
    text = _WHITESPACE_RE.sub(' ', text)

    tokens = [
        _LEMMATIZER.lemmatize(word)
        for word in text.split()
        if word not in _STOP_WORDS
    ]
    return ' '.join(tokens)
|
|
# Clean every email body once, up front.
df['cleaned_text'] = df['email_body'].apply(clean_text)

# Map the string categories onto integer codes for the classifier;
# `le` is kept so predictions can be decoded back to names later.
le = LabelEncoder()
df['category_encoded'] = le.fit_transform(df['category'])

# Hold out 20% of the data for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    df['category_encoded'],
    test_size=0.2,
    random_state=42,
)
|
|
| |
# Unigram + bigram TF-IDF features, capped at 5000 terms. The vectorizer is
# fitted on the training split only; the test split is merely transformed.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Oversample minority classes in the training data only, so the test set
# keeps its natural class distribution.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_vec, y_train)
|
|
| |
# Small grid over forest size and tree depth, scored with weighted F1
# (robust to residual class imbalance) under 3-fold cross-validation.
clf = RandomForestClassifier(class_weight='balanced', random_state=42)
params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 20],
}
grid = GridSearchCV(
    clf,
    params,
    cv=3,
    scoring='f1_weighted',
    verbose=1,
    n_jobs=-1,
)
grid.fit(X_train_res, y_train_res)
|
|
| |
# Score the tuned model on the untouched test split and report per-class
# metrics with human-readable label names.
y_pred = grid.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=le.classes_))
|
|
| |
# Persist the fitted artefacts needed to run inference elsewhere: the best
# estimator from the grid search, the TF-IDF vectorizer, and the label
# encoder. (The original dumped the vectorizer twice; the redundant second
# dump has been removed.)
joblib.dump(grid.best_estimator_, 'rf_classifier.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(le, 'label_encoder.pkl')
|
|
|
|