# File size: 12,136 Bytes
#
# 47bafb1

# -*- coding: utf-8 -*-
"""glove+bilstm.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/10fLw7V6G3vV_STF7KcWe8qcTvyLQq0NT
"""

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import permutations

# For train-test split and evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.metrics import roc_curve, precision_recall_curve

# Deep learning libraries
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM
from tensorflow.keras.layers import Dense, Dropout

# Root folder for every artefact this script writes.
base_path = "/root/output"

# Create the four output sub-folders up front; exist_ok makes re-runs harmless.
for _subdir in ("dataset_splits", "figures", "results_tables", "trained_models"):
    os.makedirs(base_path + "/" + _subdir, exist_ok=True)

# Location of the labelled corpus. This script reads the columns
# "clean_text", "hate_label" and "language" from it.
data_path = "/root/dataset.csv"

df = pd.read_csv(data_path)

# A bare `df.head()` only renders inside a notebook cell; print it so the
# preview is still visible when this file runs as a plain script.
print(df.head())

# Pie chart of how many rows each language contributes to the dataset.
plt.figure(figsize=(6,4))
df['language'].value_counts().plot.pie(autopct='%1.1f%%')
plt.title("Dataset Language Distribution")
plt.ylabel("")
plt.savefig(base_path+"/figures/language_distribution.png", dpi=300)
plt.show()
# On a non-interactive backend plt.show() does not dispose of the figure,
# so close it explicitly to release its memory.
plt.close()

# Empty text cells are read back from CSV as NaN (a float), which is not a
# string and breaks text tokenization later on; normalise every cell to str.
X = df["clean_text"].fillna("").astype(str)
y = df["hate_label"]
lang = df["language"]

# Stage 1: hold out 30% of all rows as the test set, stratified on the label.
X_temp, X_test, y_temp, y_test, lang_temp, lang_test = train_test_split(
    X, y, lang, test_size=0.30, stratify=y, random_state=42)

# Stage 2: carve 14.28% out of the remaining 70% (about 10% of all rows) as
# the validation set, which leaves roughly 60% of the data for training.
X_train, X_val, y_train, y_val, lang_train, lang_val = train_test_split(
    X_temp, y_temp, lang_temp,
    test_size=0.1428,
    stratify=y_temp,
    random_state=42
)

# Persist the three splits (text, label, language) next to the other outputs.
pd.DataFrame({"text":X_train,"label":y_train,"lang":lang_train}).to_csv(
    base_path+"/dataset_splits/train.csv", index=False)

pd.DataFrame({"text":X_val,"label":y_val,"lang":lang_val}).to_csv(
    base_path+"/dataset_splits/val.csv", index=False)

pd.DataFrame({"text":X_test,"label":y_test,"lang":lang_test}).to_csv(
    base_path+"/dataset_splits/test.csv", index=False)

# MAX_LEN is the padded sequence length; VOCAB is passed to the tokenizer
# as its num_words setting.
MAX_LEN = 100
VOCAB = 50000

# The vocabulary is fitted on the training texts only.
tokenizer = Tokenizer(num_words=VOCAB)
tokenizer.fit_on_texts(X_train)


def _to_padded_ids(texts):
    """Convert texts to integer sequences and pad them with maxlen=MAX_LEN."""
    id_sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(id_sequences, maxlen=MAX_LEN)


X_train_seq = _to_padded_ids(X_train)
X_val_seq = _to_padded_ids(X_val)
X_test_seq = _to_padded_ids(X_test)

# Dimensionality of the GloVe vectors stored in the file below.
EMBEDDING_DIM = 300
glove_path = "/root/glove.6B.300d.txt"

# Maps each GloVe token to its float32 vector.
embeddings_index = {}
skipped_lines = 0

with open(glove_path, encoding="utf8") as f:
    for line in f:
        values = line.split()
        if len(values) != EMBEDDING_DIM + 1:
            # The token itself may contain whitespace (or be a whitespace
            # character), which a plain split() mangles. Peel exactly
            # EMBEDDING_DIM numeric fields off the right-hand side instead.
            values = line.rstrip("\n").rsplit(" ", EMBEDDING_DIM)
            if len(values) != EMBEDDING_DIM + 1:
                # Blank or truncated line: skip it rather than crash on
                # values[0] or store a vector of the wrong length.
                skipped_lines += 1
                continue
        word = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = vector

print("Loaded %s word vectors." % len(embeddings_index))
if skipped_lines:
    print("Skipped %s malformed lines." % skipped_lines)

word_index = tokenizer.word_index
# Reuse the dimension declared next to the GloVe path instead of repeating
# the literal, so the two values cannot drift apart.
embedding_dim = EMBEDDING_DIM

# One row per tokenizer index. The +1 leaves room for index 0, which
# word_index presumably never assigns (Keras reserves it for padding) —
# TODO confirm.
embedding_matrix = np.zeros((len(word_index)+1, embedding_dim))

# Copy the GloVe vector of every known word into its row; words that are
# missing from GloVe keep their all-zero row.
for word, i in word_index.items():
    vector = embeddings_index.get(word)
    if vector is not None:
        embedding_matrix[i] = vector


# ============================================================
# Helper: build a fresh model (called once per permutation)
# ============================================================
def build_model():
    """Return a newly compiled BiLSTM binary classifier on frozen GloVe vectors.

    Layer stack, in order: a non-trainable Embedding initialised from the
    module-level ``embedding_matrix``; a bidirectional LSTM with 128 units;
    Dropout(0.5); Dense(64, relu); and a single sigmoid output unit. The
    model is compiled with the Adam optimizer, binary cross-entropy loss and
    an accuracy metric.
    """
    frozen_glove = Embedding(
        input_dim=len(word_index)+1,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=MAX_LEN,
        trainable=False,
    )
    network = Sequential([
        frozen_glove,
        Bidirectional(LSTM(128)),
        Dropout(0.5),
        Dense(64, activation="relu"),
        Dense(1, activation="sigmoid"),
    ])
    network.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return network


def evaluate_metrics(y_true, y_pred_prob):
    """Compute binary-classification metrics from predicted probabilities.

    Args:
        y_true: Ground-truth labels, assumed to be encoded as 0/1.
        y_pred_prob: Predicted scores for the positive class, one per entry
            of ``y_true``. Scores strictly above 0.5 count as a positive
            prediction.

    Returns:
        Tuple ``(accuracy, balanced_accuracy, precision, recall,
        specificity, f1, roc_auc)``. ``specificity`` is NaN when ``y_true``
        has no negative samples, and ``roc_auc`` is NaN when ``y_true``
        contains a single class, because neither quantity is defined then.
    """
    # Accept plain sequences as well as arrays for the thresholding below.
    y_pred_prob = np.asarray(y_pred_prob)
    y_pred = (y_pred_prob > 0.5).astype(int)

    acc  = accuracy_score(y_true, y_pred)
    bal  = balanced_accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec  = recall_score(y_true, y_pred)
    f1   = f1_score(y_true, y_pred)

    # ROC-AUC needs both classes in y_true; report NaN instead of failing
    # on a subset that happens to contain only one of them.
    if len(np.unique(y_true)) > 1:
        auc = roc_auc_score(y_true, y_pred_prob)
    else:
        auc = float("nan")

    # Pin the label order so the matrix is always 2x2; without `labels` a
    # single observed class yields a 1x1 matrix and the unpacking fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    negatives = tn + fp
    spec = tn / negatives if negatives else float("nan")

    return acc, bal, prec, rec, spec, f1, auc


def plot_training_curves(history, tag, base_path):
    """Save and show side-by-side accuracy and loss curves for one phase.

    Args:
        history: Object returned by ``model.fit``; its ``history`` dict must
            contain the keys ``accuracy``, ``val_accuracy``, ``loss`` and
            ``val_loss``.
        tag: Label used in the plot titles. With ``" -> "`` replaced by
            ``"_to_"`` and spaces by underscores it also forms the file name.
        base_path: Directory that receives ``<tag>_curves.png``.
    """
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: accuracy; right panel: loss. Each shows train vs. val.
    for ax, key, name in zip(axes, ("accuracy", "loss"), ("Accuracy", "Loss")):
        ax.plot(history.history[key], label=f"Train {name}")
        ax.plot(history.history[f"val_{key}"], label=f"Val {name}")
        ax.set_title(f"{tag} - {name} Curve")
        ax.set_xlabel("Epoch")
        ax.set_ylabel(name)
        ax.legend()
        ax.grid(True)

    plt.tight_layout()
    fname = tag.replace(" -> ", "_to_").replace(" ", "_")
    plt.savefig(os.path.join(base_path, f"{fname}_curves.png"), dpi=300)
    plt.show()
    # On a non-interactive backend plt.show() does not dispose of the figure;
    # close it so repeated calls do not accumulate open figures in memory.
    plt.close(fig)


def plot_eval_charts(y_test, preds, tag, base_path):
    """Save and show confusion-matrix, ROC, precision-recall and F1 plots.

    Args:
        y_test: Ground-truth labels, assumed to be encoded as 0/1.
        preds: Predicted positive-class scores, one per entry of ``y_test``.
            The confusion matrix thresholds them at 0.5.
        tag: Label used in the plot titles. With ``" -> "`` replaced by
            ``"_to_"`` and spaces by underscores it also forms the file-name
            prefix.
        base_path: Directory that receives ``<tag>_cm.png``,
            ``<tag>_roc.png``, ``<tag>_pr.png`` and ``<tag>_f1.png``.
    """
    fname = tag.replace(" -> ", "_to_").replace(" ", "_")

    def _save_show_close(suffix):
        """Write the current figure to disk, display it, then release it."""
        plt.savefig(os.path.join(base_path, f"{fname}_{suffix}.png"), dpi=300)
        plt.show()
        # On a non-interactive backend plt.show() does not dispose of the
        # figure; close it so repeated calls do not pile up open figures.
        plt.close()

    # Confusion Matrix — labels are pinned so the grid is always 2x2 and
    # lines up with the two tick labels even if one class is absent.
    cm = confusion_matrix(y_test, (preds > 0.5).astype(int), labels=[0, 1])
    plt.figure(figsize=(6,4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=["Non-Hate","Hate"],
                yticklabels=["Non-Hate","Hate"])
    plt.title(f"{tag} - Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    _save_show_close("cm")

    # ROC Curve — AUC is undefined when y_test holds a single class, so
    # label the curve with NaN instead of failing in that case.
    fpr, tpr, _ = roc_curve(y_test, preds)
    if len(np.unique(y_test)) > 1:
        auc_val = roc_auc_score(y_test, preds)
    else:
        auc_val = float("nan")
    plt.figure(figsize=(6,4))
    plt.plot(fpr, tpr, label=f"AUC={auc_val:.4f}")
    plt.plot([0,1],[0,1],'--')
    plt.title(f"{tag} - ROC Curve")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend()
    plt.grid(True)
    _save_show_close("roc")

    # Precision-Recall Curve
    precision, recall, thresholds = precision_recall_curve(y_test, preds)
    plt.figure(figsize=(6,4))
    plt.plot(recall, precision)
    plt.title(f"{tag} - Precision-Recall Curve")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.grid(True)
    _save_show_close("pr")

    # F1 Curve — the epsilon avoids 0/0 where precision and recall are both
    # zero; the last F1 value is dropped so the series matches `thresholds`.
    f1_scores = (2 * precision * recall) / (precision + recall + 1e-8)
    plt.figure(figsize=(6,4))
    plt.plot(thresholds, f1_scores[:-1])
    plt.title(f"{tag} - F1 Score vs Threshold")
    plt.xlabel("Threshold")
    plt.ylabel("F1 Score")
    plt.grid(True)
    _save_show_close("f1")


# ============================================================
# PLAN B: All 6 permutations + final Full (Shuffled) fine-tune
#         After each training phase → evaluate on that language's test set
#         After Full phase → evaluate on full test set
# ============================================================
_rule = "=" * 60
print("\n" + _rule)
print("PLAN B: Sequential Transfer Learning + Full Dataset Fine-tune")
print(_rule)

# Language codes; every ordering of these becomes one training strategy.
languages = ["english", "hindi", "hinglish"]

# One fixed permutation of the training rows (NumPy seed 42), computed once
# so that every strategy's final phase sees the same shuffled data.
np.random.seed(42)
shuffle_idx     = np.random.permutation(len(X_train_seq))
X_full_shuffled = np.ascontiguousarray(X_train_seq[shuffle_idx], dtype=np.int32)
y_full_shuffled = np.ascontiguousarray(y_train.values[shuffle_idx], dtype=np.float32)

# Per-language views of the test set: boolean row mask, padded sequences
# and labels, each keyed by language code.
lang_test_idx = {}
lang_test_X = {}
lang_test_y = {}
for _code in languages:
    _mask = (lang_test.values == _code)
    lang_test_idx[_code] = _mask
    lang_test_X[_code] = X_test_seq[_mask]
    lang_test_y[_code] = y_test.values[_mask]

# Column order of every results table written below.
cols = [
    "Strategy", "Phase", "Accuracy", "Balanced Acc",
    "Precision", "Recall", "Specificity", "F1", "ROC-AUC",
]

# Run every ordering of the languages as a separate strategy. Within one
# strategy a single model is trained on each language in turn and finally on
# the full shuffled training set; metrics and figures are produced after
# every phase, and the model is saved at the end.
# NOTE(review): no TensorFlow/Keras seed is set in the visible code, so the
# strategies presumably also differ by random weight initialisation — confirm
# before comparing them.
for perm in permutations(languages):
    perm_name     = " -> ".join(perm)
    strategy_name = perm_name + " -> Full"
    strategy_results = []

    print(f"\n{'='*50}")
    print(f"Strategy: {strategy_name}")
    print(f"{'='*50}")

    # Make a clean folder per strategy for figures
    strat_tag = perm_name.replace(" -> ", "_to_")
    strat_fig_path = base_path + f"/figures/{strat_tag}"
    os.makedirs(strat_fig_path, exist_ok=True)

    # Model built ONCE — weights carry forward across all phases
    model = build_model()

    # ── Language phases ──────────────────────────────────────
    # The loop variable is deliberately not named `lang`: that name holds
    # the module-level language column and must not be overwritten by a str.
    for phase_lang in perm:
        # NumPy mask and labels, built the same way as the per-language
        # test subsets (positional, independent of the pandas index).
        idx    = (lang_train.values == phase_lang)
        X_lang = X_train_seq[idx]
        y_lang = y_train.values[idx]

        # Fail with a clear message instead of an obscure error inside fit().
        if X_lang.shape[0] == 0:
            raise ValueError(
                f"No training samples found for language '{phase_lang}'"
            )

        print(f"  Training on: {phase_lang} ({X_lang.shape[0]} samples)")

        # Validation always uses the full (all-language) validation split.
        history = model.fit(
            X_lang, y_lang,
            validation_data=(X_val_seq, y_val),
            epochs=8,
            batch_size=32,
            verbose=1
        )

        # Train/Val accuracy + loss curves
        plot_training_curves(history, f"{strat_tag} [{phase_lang}]", strat_fig_path)

        # Evaluate on this language's test subset
        preds = model.predict(lang_test_X[phase_lang]).flatten()
        acc, bal, prec, rec, spec, f1, auc = evaluate_metrics(lang_test_y[phase_lang], preds)
        strategy_results.append([strategy_name, phase_lang, acc, bal, prec, rec, spec, f1, auc])

        # Eval plots for this language
        plot_eval_charts(lang_test_y[phase_lang], preds, f"{strat_tag} [{phase_lang}]", strat_fig_path)

        print(f"    Acc={acc:.4f}  F1={f1:.4f}  AUC={auc:.4f}")

    # ── Full phase ───────────────────────────────────────────
    print(f"  Training on: Full Dataset ({X_full_shuffled.shape[0]} samples, shuffled)")

    history_full = model.fit(
        X_full_shuffled, y_full_shuffled,
        validation_data=(X_val_seq, y_val),
        epochs=8,
        batch_size=64,
        verbose=1
    )

    # Train/Val accuracy + loss curves for full phase
    plot_training_curves(history_full, f"{strat_tag} [Full]", strat_fig_path)

    # Evaluate on full test set
    preds_full = model.predict(X_test_seq).flatten()
    acc, bal, prec, rec, spec, f1, auc = evaluate_metrics(y_test.values, preds_full)
    strategy_results.append([strategy_name, "Full", acc, bal, prec, rec, spec, f1, auc])

    # Eval plots for full phase
    plot_eval_charts(y_test.values, preds_full, f"{strat_tag} [Full]", strat_fig_path)

    print(f"    Acc={acc:.4f}  F1={f1:.4f}  AUC={auc:.4f}")

    # Save per-strategy results table (4 rows: 3 langs + Full)
    strat_df = pd.DataFrame(strategy_results, columns=cols)
    strat_df.to_csv(
        base_path + f"/results_tables/{strat_tag}_results.csv",
        index=False
    )

    print(f"\n  Results for strategy: {strategy_name}")
    print(strat_df.to_string(index=False))

    model.save(base_path + f"/trained_models/planB_{strat_tag}_Full.h5")
    print(f"  Saved model: planB_{strat_tag}_Full.h5")


# ============================================================
# COMBINED RESULTS TABLE (all 6 strategies × 4 phases = 24 rows)
# ============================================================
# Rebuild the path of each per-strategy results table; the file names follow
# the "<lang>_to_<lang>_to_<lang>_results.csv" pattern used when saving them.
all_csv = []
for _order in permutations(languages):
    _table_name = "_to_".join(_order) + "_results.csv"
    all_csv.append(base_path + "/results_tables/" + _table_name)

# Stack all per-strategy tables into one frame and write it back out.
_strategy_frames = [pd.read_csv(_csv_path) for _csv_path in all_csv]
combined_df = pd.concat(_strategy_frames, ignore_index=True)
combined_df.to_csv(base_path + "/results_tables/all_strategies_results.csv", index=False)

_summary_rule = "=" * 60
print("\n" + _summary_rule)
print("ALL STRATEGIES — COMBINED RESULTS")
print(_summary_rule)
print(combined_df.to_string(index=False))