# -*- coding: utf-8 -*-
"""glove+bilstm.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/10fLw7V6G3vV_STF7KcWe8qcTvyLQq0NT
"""

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import permutations

# For train-test split and evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.metrics import roc_curve, precision_recall_curve

# Deep learning libraries
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM
from tensorflow.keras.layers import Dense, Dropout

# ------------------------------------------------------------
# Output locations: every artefact of the run is written below base_path.
# ------------------------------------------------------------
base_path = "/root/output"
for subdir in ("dataset_splits", "figures", "results_tables", "trained_models"):
    os.makedirs(os.path.join(base_path, subdir), exist_ok=True)

# ------------------------------------------------------------
# Load the dataset. The script reads three columns from it:
# "clean_text", "hate_label" and "language".
# ------------------------------------------------------------
data_path = "/root/dataset.csv"
df = pd.read_csv(data_path)
# A bare `df.head()` only renders inside a notebook; print it so the preview
# is also visible when this file is run as a plain script.
print(df.head())

# Pie chart of how many rows each language contributes.
fig = plt.figure(figsize=(6, 4))
df['language'].value_counts().plot.pie(autopct='%1.1f%%')
plt.title("Dataset Language Distribution")
plt.ylabel("")
plt.savefig(base_path+"/figures/language_distribution.png", dpi=300)
plt.show()
# Release the figure explicitly; with a non-interactive backend plt.show()
# does not close it.
plt.close(fig)

# Empty text cells in a CSV are read back by pandas as NaN (a float), which
# the tokenizer cannot process; normalise them to empty strings up front.
X = df["clean_text"].fillna("").astype(str)
y = df["hate_label"]
lang = df["language"]

# First split: hold out 30% of the rows as the test set, stratified on label.
X_temp, X_test, y_temp, y_test, lang_temp, lang_test = train_test_split(
    X, y, lang,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

# Second split: 0.1428 of the remaining 70% (about 10% of all rows) becomes
# the validation set, leaving roughly 60% for training.
X_train, X_val, y_train, y_val, lang_train, lang_val = train_test_split(
    X_temp, y_temp, lang_temp,
    test_size=0.1428,
    stratify=y_temp,
    random_state=42,
)

# Persist the training split so the exact partition can be inspected later.
pd.DataFrame({"text": X_train, "label": y_train, "lang": lang_train}).to_csv(
    base_path+"/dataset_splits/train.csv", index=False)
# Persist the validation and test splits next to the training split.
pd.DataFrame({"text": X_val, "label": y_val, "lang": lang_val}).to_csv(
    base_path+"/dataset_splits/val.csv", index=False)
pd.DataFrame({"text": X_test, "label": y_test, "lang": lang_test}).to_csv(
    base_path+"/dataset_splits/test.csv", index=False)

# ------------------------------------------------------------
# Tokenisation: the vocabulary is fitted on the training texts only, and
# every split is converted to fixed-length integer sequences.
# ------------------------------------------------------------
MAX_LEN = 100
VOCAB = 50000

tokenizer = Tokenizer(num_words=VOCAB)
tokenizer.fit_on_texts(X_train)

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_LEN)
X_val_seq = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=MAX_LEN)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=MAX_LEN)

# ------------------------------------------------------------
# GloVe vectors: each line of the file is "<word> <v1> <v2> ...".
# ------------------------------------------------------------
EMBEDDING_DIM = 300
glove_path = "/root/glove.6B.300d.txt"

embeddings_index = {}
with open(glove_path, encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = vector

print("Loaded %s word vectors." % len(embeddings_index))

word_index = tokenizer.word_index
# Single source of truth for the embedding width (was a duplicated literal).
embedding_dim = EMBEDDING_DIM

# Row i holds the GloVe vector of the word with tokenizer index i; words that
# have no GloVe entry (and the unused row 0) stay all-zero.
embedding_matrix = np.zeros((len(word_index)+1, embedding_dim))
for word, i in word_index.items():
    vector = embeddings_index.get(word)
    if vector is not None:
        embedding_matrix[i] = vector


# ============================================================
# Helper: build a fresh model (called once per permutation)
# ============================================================
def build_model():
    """Construct and compile a fresh BiLSTM model with frozen GloVe embeddings.

    Returns:
        A compiled ``Sequential`` model: Embedding (initialised from
        ``embedding_matrix``, not trainable) -> Bidirectional LSTM(128) ->
        Dropout(0.5) -> Dense(64, relu) -> Dense(1, sigmoid), using the Adam
        optimizer and binary cross-entropy loss.
    """
    m = Sequential()
    m.add(Embedding(
        input_dim=len(word_index)+1,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=MAX_LEN,
        trainable=False
    ))
    m.add(Bidirectional(LSTM(128)))
    m.add(Dropout(0.5))
    m.add(Dense(64, activation="relu"))
    m.add(Dense(1, activation="sigmoid"))
    m.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return m


def _has_both_classes(y_true):
    """Return True when ``y_true`` contains more than one distinct label."""
    return np.unique(np.asarray(y_true)).size > 1


def evaluate_metrics(y_true, y_pred_prob):
    """Compute classification metrics from predicted probabilities.

    Probabilities are thresholded at 0.5 to obtain hard 0/1 predictions.

    Args:
        y_true: Ground-truth binary labels (0/1).
        y_pred_prob: Predicted probability of the positive class per sample.

    Returns:
        Tuple ``(accuracy, balanced_accuracy, precision, recall, specificity,
        f1, roc_auc)``. ``roc_auc`` is NaN when ``y_true`` holds a single
        class, and ``specificity`` is NaN when there are no true negatives or
        false positives to compute it from.
    """
    y_true = np.asarray(y_true)
    y_pred_prob = np.asarray(y_pred_prob)
    y_pred = (y_pred_prob > 0.5).astype(int)

    acc = accuracy_score(y_true, y_pred)
    bal = balanced_accuracy_score(y_true, y_pred)
    # zero_division=0 yields the same value as the default but without the
    # warning when a class is never predicted.
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    # ROC-AUC is undefined with a single class; report NaN instead of raising.
    if _has_both_classes(y_true):
        auc = roc_auc_score(y_true, y_pred_prob)
    else:
        auc = float("nan")

    # Fixing the label set guarantees a 2x2 matrix, so the four-way unpacking
    # cannot fail when only one class shows up in this subset.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    negatives = tn + fp
    spec = tn / negatives if negatives > 0 else float("nan")

    return acc, bal, prec, rec, spec, f1, auc


def plot_training_curves(history, tag, base_path):
    """Save accuracy and loss curves for one training phase.

    Args:
        history: Object returned by ``model.fit``; its ``history`` dict must
            contain 'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
        tag: Human-readable phase label used in titles and in the file name.
        base_path: Directory the PNG is written to.
    """
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    axes[0].plot(history.history['accuracy'], label="Train Accuracy")
    axes[0].plot(history.history['val_accuracy'], label="Val Accuracy")
    axes[0].set_title(f"{tag} - Accuracy Curve")
    axes[0].set_xlabel("Epoch")
    axes[0].set_ylabel("Accuracy")
    axes[0].legend()
    axes[0].grid(True)

    axes[1].plot(history.history['loss'], label="Train Loss")
    axes[1].plot(history.history['val_loss'], label="Val Loss")
    axes[1].set_title(f"{tag} - Loss Curve")
    axes[1].set_xlabel("Epoch")
    axes[1].set_ylabel("Loss")
    axes[1].legend()
    axes[1].grid(True)

    plt.tight_layout()
    fname = tag.replace(" -> ", "_to_").replace(" ", "_")
    plt.savefig(os.path.join(base_path, f"{fname}_curves.png"), dpi=300)
    plt.show()
    # Close explicitly: this function runs once per phase of every strategy
    # and open figures would otherwise accumulate for the whole run.
    plt.close(fig)


def plot_eval_charts(y_test, preds, tag, base_path):
    """Save confusion matrix, ROC, PR, and F1 curves after evaluation.

    Args:
        y_test: Ground-truth binary labels (0/1).
        preds: Predicted positive-class probabilities as a NumPy array.
        tag: Human-readable label used in titles and in the file names.
        base_path: Directory the PNG files are written to.
    """
    fname = tag.replace(" -> ", "_to_").replace(" ", "_")

    # Confusion Matrix (labels fixed so it is always 2x2 and matches the ticks)
    cm = confusion_matrix(y_test, (preds > 0.5).astype(int), labels=[0, 1])
    fig = plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=["Non-Hate", "Hate"],
                yticklabels=["Non-Hate", "Hate"])
    plt.title(f"{tag} - Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.savefig(os.path.join(base_path, f"{fname}_cm.png"), dpi=300)
    plt.show()
    plt.close(fig)

    # ROC Curve (undefined when only one class is present, so skip it then)
    if _has_both_classes(y_test):
        fpr, tpr, _ = roc_curve(y_test, preds)
        auc_val = roc_auc_score(y_test, preds)
        fig = plt.figure(figsize=(6, 4))
        plt.plot(fpr, tpr, label=f"AUC={auc_val:.4f}")
        plt.plot([0, 1], [0, 1], '--')
        plt.title(f"{tag} - ROC Curve")
        plt.xlabel("FPR")
        plt.ylabel("TPR")
        plt.legend()
        plt.grid(True)
        plt.savefig(os.path.join(base_path, f"{fname}_roc.png"), dpi=300)
        plt.show()
        plt.close(fig)
    else:
        print(f"  Skipping ROC curve for {tag}: only one class in labels")

    # Precision-Recall Curve
    precision, recall, thresholds = precision_recall_curve(y_test, preds)
    fig = plt.figure(figsize=(6, 4))
    plt.plot(recall, precision)
    plt.title(f"{tag} - Precision-Recall Curve")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.grid(True)
    plt.savefig(os.path.join(base_path, f"{fname}_pr.png"), dpi=300)
    plt.show()
    plt.close(fig)

    # F1 Curve: precision/recall carry one more entry than thresholds, so the
    # last F1 value is dropped; the epsilon avoids 0/0.
    f1_scores = (2 * precision * recall) / (precision + recall + 1e-8)
    fig = plt.figure(figsize=(6, 4))
    plt.plot(thresholds, f1_scores[:-1])
    plt.title(f"{tag} - F1 Score vs Threshold")
    plt.xlabel("Threshold")
    plt.ylabel("F1 Score")
    plt.grid(True)
    plt.savefig(os.path.join(base_path, f"{fname}_f1.png"), dpi=300)
    plt.show()
    plt.close(fig)


# ============================================================
# PLAN B: All 6 permutations + final Full (Shuffled) fine-tune
# After each training phase -> evaluate on that language's test set
# After Full phase          -> evaluate on full test set
# ============================================================
print("\n" + "="*60)
print("PLAN B: Sequential Transfer Learning + Full Dataset Fine-tune")
print("="*60)

languages = ["english", "hindi", "hinglish"]

# Plain NumPy views of the labels, so every phase feeds the model the same
# kind of input (float32 arrays) instead of a mix of Series and arrays.
y_train_values = np.asarray(y_train.values, dtype=np.float32)
y_val_values = np.asarray(y_val.values, dtype=np.float32)
val_data = (X_val_seq, y_val_values)

# Pre-shuffle full training data once (same shuffle for all permutations)
np.random.seed(42)
shuffle_idx = np.random.permutation(len(X_train_seq))
X_full_shuffled = np.ascontiguousarray(X_train_seq[shuffle_idx], dtype=np.int32)
y_full_shuffled = np.ascontiguousarray(y_train.values[shuffle_idx], dtype=np.float32)

# Pre-build per-language train masks and test splits
lang_train_idx = {name: (lang_train.values == name) for name in languages}
lang_test_idx = {name: (lang_test.values == name) for name in languages}
lang_test_X = {name: X_test_seq[lang_test_idx[name]] for name in languages}
lang_test_y = {name: y_test.values[lang_test_idx[name]] for name in languages}

cols = ["Strategy", "Phase", "Accuracy", "Balanced Acc",
        "Precision", "Recall", "Specificity", "F1", "ROC-AUC"]

# Per-strategy result frames, kept in memory for the combined table below.
all_results = []

for perm in permutations(languages):
    perm_name = " -> ".join(perm)
    strategy_name = perm_name + " -> Full"
    strategy_results = []

    print(f"\n{'='*50}")
    print(f"Strategy: {strategy_name}")
    print(f"{'='*50}")

    # Make a clean folder per strategy for figures
    strat_tag = perm_name.replace(" -> ", "_to_")
    strat_fig_path = base_path + f"/figures/{strat_tag}"
    os.makedirs(strat_fig_path, exist_ok=True)

    # Model built ONCE per strategy: weights carry forward across all phases
    model = build_model()

    # -- Language phases -----------------------------------------
    # The loop variable is deliberately not called `lang`, which would
    # overwrite the module-level language column of the same name.
    for phase_lang in perm:
        mask = lang_train_idx[phase_lang]
        X_lang = X_train_seq[mask]
        y_lang = y_train_values[mask]

        print(f"  Training on: {phase_lang} ({X_lang.shape[0]} samples)")
        history = model.fit(
            X_lang, y_lang,
            validation_data=val_data,
            epochs=8,
            batch_size=32,
            verbose=1
        )

        # Train/Val accuracy + loss curves
        plot_training_curves(history, f"{strat_tag} [{phase_lang}]", strat_fig_path)

        # Evaluate on this language's test subset
        preds = model.predict(lang_test_X[phase_lang]).flatten()
        acc, bal, prec, rec, spec, f1, auc = evaluate_metrics(
            lang_test_y[phase_lang], preds)
        strategy_results.append(
            [strategy_name, phase_lang, acc, bal, prec, rec, spec, f1, auc])

        # Eval plots for this language
        plot_eval_charts(lang_test_y[phase_lang], preds,
                         f"{strat_tag} [{phase_lang}]", strat_fig_path)

        print(f"    Acc={acc:.4f}  F1={f1:.4f}  AUC={auc:.4f}")

    # -- Full phase ----------------------------------------------
    print(f"  Training on: Full Dataset ({X_full_shuffled.shape[0]} samples, shuffled)")
    history_full = model.fit(
        X_full_shuffled, y_full_shuffled,
        validation_data=val_data,
        epochs=8,
        batch_size=64,
        verbose=1
    )

    # Train/Val accuracy + loss curves for full phase
    plot_training_curves(history_full, f"{strat_tag} [Full]", strat_fig_path)

    # Evaluate on full test set
    preds_full = model.predict(X_test_seq).flatten()
    acc, bal, prec, rec, spec, f1, auc = evaluate_metrics(y_test.values, preds_full)
    strategy_results.append([strategy_name, "Full", acc, bal, prec, rec, spec, f1, auc])

    # Eval plots for full phase
    plot_eval_charts(y_test.values, preds_full, f"{strat_tag} [Full]", strat_fig_path)

    print(f"    Acc={acc:.4f}  F1={f1:.4f}  AUC={auc:.4f}")

    # Save per-strategy results table (4 rows: 3 langs + Full)
    strat_df = pd.DataFrame(strategy_results, columns=cols)
    strat_df.to_csv(
        base_path + f"/results_tables/{strat_tag}_results.csv", index=False
    )
    all_results.append(strat_df)
    print(f"\n  Results for strategy: {strategy_name}")
    print(strat_df.to_string(index=False))

    model.save(base_path + f"/trained_models/planB_{strat_tag}_Full.h5")
    print(f"  Saved model: planB_{strat_tag}_Full.h5")

# ============================================================
# COMBINED RESULTS TABLE (all 6 strategies x 4 phases = 24 rows)
# ============================================================
# Built from the frames collected above rather than re-reading the CSV files
# that were just written.
combined_df = pd.concat(all_results, ignore_index=True)
combined_df.to_csv(base_path + "/results_tables/all_strategies_results.csv", index=False)

print("\n" + "="*60)
print("ALL STRATEGIES — COMBINED RESULTS")
print("="*60)
print(combined_df.to_string(index=False))