| |
| """glove+bilstm.ipynb |
| |
| Automatically generated by Colab. |
| |
| Original file is located at |
| https://colab.research.google.com/drive/10fLw7V6G3vV_STF7KcWe8qcTvyLQq0NT |
| """ |
|
|
| import os |
| import numpy as np |
| import pandas as pd |
| import matplotlib.pyplot as plt |
| import seaborn as sns |
| from itertools import permutations |
|
|
| |
| from sklearn.model_selection import train_test_split |
| from sklearn.metrics import accuracy_score, balanced_accuracy_score |
| from sklearn.metrics import precision_score, recall_score, f1_score |
| from sklearn.metrics import roc_auc_score, confusion_matrix |
| from sklearn.metrics import roc_curve, precision_recall_curve |
|
|
| |
| from tensorflow.keras.preprocessing.text import Tokenizer |
| from tensorflow.keras.preprocessing.sequence import pad_sequences |
| from tensorflow.keras.models import Sequential |
| from tensorflow.keras.layers import Embedding, Bidirectional, LSTM |
| from tensorflow.keras.layers import Dense, Dropout |
|
|
# Root directory for every artifact this script produces.
base_path = "/root/output"

# Create the output sub-directories up front (idempotent).
for subdir in ("dataset_splits", "figures", "results_tables", "trained_models"):
    os.makedirs(os.path.join(base_path, subdir), exist_ok=True)

# Load the pre-cleaned dataset.
data_path = "/root/dataset.csv"
df = pd.read_csv(data_path)

df.head()

# Quick EDA: share of rows contributed by each language.
plt.figure(figsize=(6, 4))
df['language'].value_counts().plot.pie(autopct='%1.1f%%')
plt.title("Dataset Language Distribution")
plt.ylabel("")
plt.savefig(base_path + "/figures/language_distribution.png", dpi=300)
plt.show()
|
|
# Feature text, binary hate label, and per-row language tag.
X = df["clean_text"]
y = df["hate_label"]
lang = df["language"]

# Carve out a 30% held-out test set first, stratified on the label.
X_temp, X_test, y_temp, y_test, lang_temp, lang_test = train_test_split(
    X, y, lang, test_size=0.30, stratify=y, random_state=42)

# Split the remainder into train/val; 0.1428 of the 70% is ~10% overall.
X_train, X_val, y_train, y_val, lang_train, lang_val = train_test_split(
    X_temp, y_temp, lang_temp,
    test_size=0.1428,
    stratify=y_temp,
    random_state=42
)

# Persist each split so later experiments can reuse identical partitions.
splits = {
    "train": (X_train, y_train, lang_train),
    "val": (X_val, y_val, lang_val),
    "test": (X_test, y_test, lang_test),
}
for split_name, (texts, labels, langs) in splits.items():
    pd.DataFrame({"text": texts, "label": labels, "lang": langs}).to_csv(
        base_path + f"/dataset_splits/{split_name}.csv", index=False)

MAX_LEN = 100   # sequence length after padding/truncation
VOCAB = 50000   # keep only the 50k most frequent tokens

# Fit the vocabulary on the training split only, then index + pad all splits.
tokenizer = Tokenizer(num_words=VOCAB)
tokenizer.fit_on_texts(X_train)

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_LEN)
X_val_seq = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=MAX_LEN)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=MAX_LEN)
|
|
EMBEDDING_DIM = 300
glove_path = "/root/glove.6B.300d.txt"

# word -> 300-d float32 vector, parsed from the plain-text GloVe file
# (one "word v1 v2 ... v300" record per line).
embeddings_index = {}

with open(glove_path, encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = vector

print("Loaded %s word vectors." % len(embeddings_index))

word_index = tokenizer.word_index
# Fix: reuse the constant instead of repeating the magic number 300 —
# the two values must always agree with the GloVe file's dimensionality.
embedding_dim = EMBEDDING_DIM

# Row i holds the GloVe vector for the word with tokenizer index i.
# Index 0 (padding) and out-of-vocabulary words stay all-zero.
# NOTE(review): rows for indices >= VOCAB are never looked up, because
# texts_to_sequences drops tokens beyond num_words; the matrix could be
# truncated to save memory, kept full-size here to match the Embedding layer.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

for word, i in word_index.items():
    vector = embeddings_index.get(word)
    if vector is not None:
        embedding_matrix[i] = vector
|
|
|
|
| |
| |
| |
def build_model():
    """Construct and compile a fresh BiLSTM model with frozen GloVe embeddings."""
    layer_stack = [
        Embedding(
            input_dim=len(word_index) + 1,
            output_dim=embedding_dim,
            weights=[embedding_matrix],
            input_length=MAX_LEN,
            trainable=False,  # keep the pretrained GloVe vectors fixed
        ),
        Bidirectional(LSTM(128)),
        Dropout(0.5),
        Dense(64, activation="relu"),
        Dense(1, activation="sigmoid"),  # binary hate / non-hate probability
    ]
    model = Sequential(layer_stack)
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model
|
|
|
|
def evaluate_metrics(y_true, y_pred_prob):
    """Score hard predictions (probability > 0.5) with standard binary metrics.

    Returns a tuple: (accuracy, balanced accuracy, precision, recall,
    specificity, f1, roc_auc). ROC-AUC is computed from the raw
    probabilities; every other metric uses the thresholded labels.
    """
    y_pred = (y_pred_prob > 0.5).astype(int)
    acc = accuracy_score(y_true, y_pred)
    bal = balanced_accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred_prob)
    # Fix: pin labels=[0, 1] so the matrix is always 2x2 and the 4-way
    # unpack cannot fail when a test slice contains a single class.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    # Fix: guard the division so an all-positive slice (tn + fp == 0)
    # yields 0.0 instead of a divide-by-zero NaN.
    spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    return acc, bal, prec, rec, spec, f1, auc
|
|
|
|
def plot_training_curves(history, tag, base_path):
    """Save accuracy and loss curves for one training phase."""
    hist = history.history
    panels = [
        ("accuracy", "val_accuracy", "Accuracy", "Train Accuracy", "Val Accuracy"),
        ("loss", "val_loss", "Loss", "Train Loss", "Val Loss"),
    ]

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    for ax, (train_key, val_key, metric, train_lbl, val_lbl) in zip(axes, panels):
        ax.plot(hist[train_key], label=train_lbl)
        ax.plot(hist[val_key], label=val_lbl)
        ax.set_title(f"{tag} - {metric} Curve")
        ax.set_xlabel("Epoch")
        ax.set_ylabel(metric)
        ax.legend()
        ax.grid(True)

    plt.tight_layout()
    # Sanitise the tag into a filesystem-friendly stem.
    fname = tag.replace(" -> ", "_to_").replace(" ", "_")
    plt.savefig(os.path.join(base_path, f"{fname}_curves.png"), dpi=300)
    plt.show()
|
|
|
|
def plot_eval_charts(y_test, preds, tag, base_path):
    """Save confusion matrix, ROC, PR, and F1 curves after evaluation."""
    fname = tag.replace(" -> ", "_to_").replace(" ", "_")

    def _save(stem):
        # Shared save-and-show tail for every chart below.
        plt.savefig(os.path.join(base_path, f"{fname}_{stem}.png"), dpi=300)
        plt.show()

    # Confusion matrix at the 0.5 decision threshold.
    hard_preds = (preds > 0.5).astype(int)
    cm = confusion_matrix(y_test, hard_preds)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=["Non-Hate", "Hate"],
                yticklabels=["Non-Hate", "Hate"])
    plt.title(f"{tag} - Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    _save("cm")

    # ROC curve, AUC shown in the legend.
    fpr, tpr, _ = roc_curve(y_test, preds)
    auc_val = roc_auc_score(y_test, preds)
    plt.figure(figsize=(6, 4))
    plt.plot(fpr, tpr, label=f"AUC={auc_val:.4f}")
    plt.plot([0, 1], [0, 1], '--')
    plt.title(f"{tag} - ROC Curve")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend()
    plt.grid(True)
    _save("roc")

    # Precision-recall trade-off across thresholds.
    precision, recall, thresholds = precision_recall_curve(y_test, preds)
    plt.figure(figsize=(6, 4))
    plt.plot(recall, precision)
    plt.title(f"{tag} - Precision-Recall Curve")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.grid(True)
    _save("pr")

    # F1 as a function of threshold (epsilon avoids 0/0).
    f1_scores = (2 * precision * recall) / (precision + recall + 1e-8)
    plt.figure(figsize=(6, 4))
    plt.plot(thresholds, f1_scores[:-1])
    plt.title(f"{tag} - F1 Score vs Threshold")
    plt.xlabel("Threshold")
    plt.ylabel("F1 Score")
    plt.grid(True)
    _save("f1")
|
|
|
|
| |
| |
| |
| |
| |
print("\n" + "=" * 60)
print("PLAN B: Sequential Transfer Learning + Full Dataset Fine-tune")
print("=" * 60)

languages = ["english", "hindi", "hinglish"]

# One fixed shuffle of the full training set, reused by every strategy's
# final fine-tuning phase so all strategies see identical data order.
np.random.seed(42)
shuffle_idx = np.random.permutation(len(X_train_seq))
X_full_shuffled = np.ascontiguousarray(X_train_seq[shuffle_idx], dtype=np.int32)
y_full_shuffled = np.ascontiguousarray(y_train.values[shuffle_idx], dtype=np.float32)

# Pre-slice the test set per language so each training phase can be
# scored on the language it just trained on.
lang_test_idx, lang_test_X, lang_test_y = {}, {}, {}
for language in languages:
    mask = (lang_test.values == language)
    lang_test_idx[language] = mask
    lang_test_X[language] = X_test_seq[mask]
    lang_test_y[language] = y_test.values[mask]

cols = ["Strategy", "Phase", "Accuracy", "Balanced Acc",
        "Precision", "Recall", "Specificity", "F1", "ROC-AUC"]
|
|
# One experiment per ordering of the three languages: train sequentially on
# each language's slice (the transfer phases), then fine-tune on the full
# shuffled training set. Metrics for every phase are collected into one table.
for perm in permutations(languages):
    perm_name = " -> ".join(perm)
    strategy_name = perm_name + " -> Full"
    strategy_results = []  # one row of metrics per phase

    print(f"\n{'='*50}")
    print(f"Strategy: {strategy_name}")
    print(f"{'='*50}")

    # Per-strategy figure directory, e.g. figures/english_to_hindi_to_hinglish.
    strat_tag = perm_name.replace(" -> ", "_to_")
    strat_fig_path = base_path + f"/figures/{strat_tag}"
    os.makedirs(strat_fig_path, exist_ok=True)

    # Fresh model per strategy: phases within a strategy share weights,
    # strategies do not.
    model = build_model()

    # Phases 1..3: sequential language-by-language training.
    # NOTE(review): this loop rebinds the module-level name `lang` (the
    # language Series created earlier); harmless here because that Series
    # is not read again afterwards, but the variable deserves renaming.
    for lang in perm:
        idx = (lang_train == lang)
        X_lang = X_train_seq[idx]
        y_lang = y_train[idx]

        print(f" Training on: {lang} ({X_lang.shape[0]} samples)")

        # Continue training the SAME model (no re-initialisation between
        # languages) — this carry-over is the transfer-learning step.
        history = model.fit(
            X_lang, y_lang,
            validation_data=(X_val_seq, y_val),
            epochs=8,
            batch_size=32,
            verbose=1
        )

        plot_training_curves(history, f"{strat_tag} [{lang}]", strat_fig_path)

        # Evaluate on this language's test slice only.
        preds = model.predict(lang_test_X[lang]).flatten()
        acc, bal, prec, rec, spec, f1, auc = evaluate_metrics(lang_test_y[lang], preds)
        strategy_results.append([strategy_name, lang, acc, bal, prec, rec, spec, f1, auc])

        plot_eval_charts(lang_test_y[lang], preds, f"{strat_tag} [{lang}]", strat_fig_path)

        print(f" Acc={acc:.4f} F1={f1:.4f} AUC={auc:.4f}")

    # Phase 4: fine-tune on the full shuffled training set (larger batch).
    print(f" Training on: Full Dataset ({X_full_shuffled.shape[0]} samples, shuffled)")

    history_full = model.fit(
        X_full_shuffled, y_full_shuffled,
        validation_data=(X_val_seq, y_val),
        epochs=8,
        batch_size=64,
        verbose=1
    )

    plot_training_curves(history_full, f"{strat_tag} [Full]", strat_fig_path)

    # Final evaluation on the complete (all-language) test set.
    preds_full = model.predict(X_test_seq).flatten()
    acc, bal, prec, rec, spec, f1, auc = evaluate_metrics(y_test.values, preds_full)
    strategy_results.append([strategy_name, "Full", acc, bal, prec, rec, spec, f1, auc])

    plot_eval_charts(y_test.values, preds_full, f"{strat_tag} [Full]", strat_fig_path)

    print(f" Acc={acc:.4f} F1={f1:.4f} AUC={auc:.4f}")

    # Persist this strategy's per-phase metrics table; the combined report
    # at the end of the script re-reads these CSVs by the same stem.
    strat_df = pd.DataFrame(strategy_results, columns=cols)
    strat_df.to_csv(
        base_path + f"/results_tables/{strat_tag}_results.csv",
        index=False
    )

    print(f"\n Results for strategy: {strategy_name}")
    print(strat_df.to_string(index=False))

    # Save the final fine-tuned model (legacy HDF5 format).
    model.save(base_path + f"/trained_models/planB_{strat_tag}_Full.h5")
    print(f" Saved model: planB_{strat_tag}_Full.h5")
|
|
|
|
| |
| |
| |
# Merge every per-strategy results CSV into one combined table.
# The file stem rebuilt here ("a_to_b_to_c") mirrors strat_tag in the loop.
all_csv = [
    base_path + f"/results_tables/{('_to_'.join(perm))}_results.csv"
    for perm in permutations(languages)
]

combined_df = pd.concat([pd.read_csv(f) for f in all_csv], ignore_index=True)
combined_df.to_csv(base_path + "/results_tables/all_strategies_results.csv", index=False)

print("\n" + "="*60)
# Fix: the banner contained a mojibake character ("β") left over from a
# mis-encoded dash; replaced with a plain ASCII hyphen.
print("ALL STRATEGIES - COMBINED RESULTS")
print("="*60)
print(combined_df.to_string(index=False))
|
|