# File size: 13,250 Bytes

# 7e5f759

# -*- coding: utf-8 -*-
"""
Strategy: Hinglish -> Hindi -> English -> Full
- 50 epochs per phase (200 total)
- Evaluate on each individual language + full after every phase
- All figures: figsize=(8,6), dpi=300
- Output dir: /root/output_v2  (old output_v1 untouched)
"""

import os
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
                             precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix,
                             roc_curve, precision_recall_curve)

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# ── Paths ────────────────────────────────────────────────────────────────────
# Inputs (dataset CSV, pre-trained GloVe vectors) and the root directory that
# receives every artefact written by this script.
data_path = "/root/dataset.csv"
glove_path = "/root/glove.6B.300d.txt"
base_path = "/root/output_v2"

# Create the output sub-directories up front; exist_ok keeps re-runs from failing.
output_subdirs = ("dataset_splits", "figures", "results_tables", "trained_models")
for subdir_name in output_subdirs:
    os.makedirs(os.path.join(base_path, subdir_name), exist_ok=True)

# ── Load data ────────────────────────────────────────────────────────────────
df = pd.read_csv(data_path)

# Language distribution pie
plt.figure(figsize=(8, 6))
df['language'].value_counts().plot.pie(autopct='%1.1f%%')
plt.title("Dataset Language Distribution")
plt.ylabel("")
plt.savefig(os.path.join(base_path, "figures", "language_distribution.png"), dpi=300, bbox_inches="tight")
plt.close()

# pandas reads empty CSV fields back as NaN (a float). A text that was cleaned
# down to nothing would therefore reach the tokenizer as a non-string value and
# break its string handling, so missing texts are coerced to "" here.
X    = df["clean_text"].fillna("").astype(str)
y    = df["hate_label"]
lang = df["language"]

# ── Splits ───────────────────────────────────────────────────────────────────
# Stage 1: hold out 30 % of the rows as the test set, stratified on the label.
(X_temp, X_test,
 y_temp, y_test,
 lang_temp, lang_test) = train_test_split(
    X, y, lang,
    test_size=0.30, stratify=y, random_state=42,
)

# Stage 2: take 14.28 % of the remaining rows as the validation set,
# again stratified on the label.
(X_train, X_val,
 y_train, y_val,
 lang_train, lang_val) = train_test_split(
    X_temp, y_temp, lang_temp,
    test_size=0.1428, stratify=y_temp, random_state=42,
)

# Persist each partition (text, label, language) as its own CSV file.
splits_dir = os.path.join(base_path, "dataset_splits")
split_parts = {
    "train": (X_train, y_train, lang_train),
    "val": (X_val, y_val, lang_val),
    "test": (X_test, y_test, lang_test),
}
for split_name, (part_text, part_label, part_lang) in split_parts.items():
    split_frame = pd.DataFrame(
        {"text": part_text, "label": part_label, "lang": part_lang})
    split_frame.to_csv(os.path.join(splits_dir, f"{split_name}.csv"), index=False)

# ── Tokenise & pad ───────────────────────────────────────────────────────────
MAX_LEN = 100
VOCAB   = 50000

# The vocabulary is fitted on the training texts only.
tokenizer = Tokenizer(num_words=VOCAB)
tokenizer.fit_on_texts(X_train)


def _to_padded_ids(texts):
    """Convert texts to integer sequences and pad them with maxlen=MAX_LEN."""
    id_sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(id_sequences, maxlen=MAX_LEN)


X_train_seq, X_val_seq, X_test_seq = (
    _to_padded_ids(part) for part in (X_train, X_val, X_test)
)

# ── GloVe embeddings ─────────────────────────────────────────────────────────
EMBEDDING_DIM = 300
print("Loading GloVe …")
embeddings_index = {}
skipped_lines = 0
with open(glove_path, encoding="utf8") as f:
    for line in f:
        # Split from the right: the last EMBEDDING_DIM fields are the numbers,
        # whatever is left is the token. A token containing whitespace can
        # then no longer shift text into the numeric fields.
        parts = line.rstrip().rsplit(" ", EMBEDDING_DIM)
        if len(parts) != EMBEDDING_DIM + 1:
            # Blank line, header line, or a vector of a different dimension.
            skipped_lines += 1
            continue
        try:
            embeddings_index[parts[0]] = np.asarray(parts[1:], dtype="float32")
        except ValueError:
            # A field that is not a valid float: skip this line only.
            skipped_lines += 1
print(f"Loaded {len(embeddings_index):,} word vectors.")
if skipped_lines:
    print(f"Skipped {skipped_lines:,} malformed GloVe lines.")
if not embeddings_index:
    # Without this check a wrong file would silently yield an all-zero,
    # frozen embedding layer.
    raise ValueError(
        f"No {EMBEDDING_DIM}-dimensional vectors could be read from {glove_path}")

# Row i of the matrix holds the vector of the word with tokenizer index i;
# words without a pre-trained vector keep an all-zero row.
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
covered_words = 0
for word, i in word_index.items():
    vec = embeddings_index.get(word)
    if vec is not None:
        embedding_matrix[i] = vec
        covered_words += 1
# Low coverage means most inputs are zero vectors, so surface it.
print(f"GloVe coverage: {covered_words:,} of {len(word_index):,} vocabulary words.")

# ── Per-language test subsets ────────────────────────────────────────────────
languages = ["english", "hindi", "hinglish"]

# For each language keep the padded test sequences and the matching labels.
lang_test_X = {}
lang_test_y = {}
for lang_name in languages:
    in_lang = lang_test.values == lang_name  # boolean mask over the test rows
    lang_test_X[lang_name] = X_test_seq[in_lang]
    lang_test_y[lang_name] = y_test.values[in_lang]

# ── Helpers ──────────────────────────────────────────────────────────────────
def build_model():
    """Build and compile the BiLSTM binary classifier.

    Layer stack: embedding initialised from ``embedding_matrix`` and kept
    frozen (``trainable=False``) -> bidirectional LSTM with 128 units ->
    dropout 0.5 -> 64-unit ReLU dense layer -> single sigmoid output.

    Returns:
        The model, compiled with the Adam optimizer, binary cross-entropy
        loss and the accuracy metric.
    """
    vocab_rows = len(word_index) + 1

    net = Sequential()
    net.add(Embedding(vocab_rows, EMBEDDING_DIM,
                      weights=[embedding_matrix],
                      input_length=MAX_LEN,
                      trainable=False))
    net.add(Bidirectional(LSTM(128)))
    net.add(Dropout(0.5))
    net.add(Dense(64, activation="relu"))
    net.add(Dense(1, activation="sigmoid"))

    net.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return net


def evaluate_metrics(y_true, y_pred_prob):
    """Compute binary-classification metrics from predicted probabilities.

    Probabilities strictly greater than 0.5 are counted as class 1.

    Args:
        y_true: Ground-truth binary labels encoded as 0/1.
        y_pred_prob: Predicted probability of class 1, one value per label.

    Returns:
        Tuple ``(accuracy, balanced_accuracy, precision, recall,
        specificity, f1, roc_auc)``. Specificity is NaN when ``y_true``
        contains no negatives and ROC AUC is NaN when ``y_true`` contains a
        single class, since neither quantity is defined in that case.
    """
    y_true = np.asarray(y_true)
    y_pred_prob = np.asarray(y_pred_prob)
    y_pred = (y_pred_prob > 0.5).astype(int)

    acc  = accuracy_score(y_true, y_pred)
    bal  = balanced_accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec  = recall_score(y_true, y_pred, zero_division=0)
    f1   = f1_score(y_true, y_pred, zero_division=0)

    # labels=[0, 1] forces a 2x2 matrix even when one class is absent from
    # both arrays, so the four-way unpacking below cannot fail.
    tn, fp, _fn, _tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    negatives = tn + fp
    spec = tn / negatives if negatives > 0 else float("nan")

    # roc_auc_score raises ValueError when y_true holds only one class.
    if np.unique(y_true).size > 1:
        auc = roc_auc_score(y_true, y_pred_prob)
    else:
        auc = float("nan")

    return acc, bal, prec, rec, spec, f1, auc


def safe_tag(s):
    """Turn a human-readable tag into a filename-friendly token.

    Every " -> " separator becomes "_to_" first; each space that is still
    left afterwards becomes "_".
    """
    arrows_replaced = "_to_".join(s.split(" -> "))
    return "_".join(arrows_replaced.split(" "))


def plot_training_curves(history, tag, fig_dir):
    """Save side-by-side accuracy and loss curves for one training run.

    Args:
        history: Object whose ``history`` mapping provides the per-epoch
            ``accuracy``, ``val_accuracy``, ``loss`` and ``val_loss`` series.
        tag: Label used in both panel titles and, after ``safe_tag``, in the
            output file name.
        fig_dir: Directory that receives ``<safe_tag(tag)>_curves.png``.
    """
    per_epoch = history.history
    fig, axes = plt.subplots(1, 2, figsize=(8, 6))

    # One entry per panel:
    # (train key, validation key, train legend, validation legend, axis name)
    panels = (
        ("accuracy", "val_accuracy", "Train Acc", "Val Acc", "Accuracy"),
        ("loss", "val_loss", "Train Loss", "Val Loss", "Loss"),
    )
    for ax, (train_key, val_key, train_label, val_label, axis_name) in zip(axes, panels):
        ax.plot(per_epoch[train_key], label=train_label)
        ax.plot(per_epoch[val_key], label=val_label)
        ax.set_title(f"{tag} — {axis_name}")
        ax.set_xlabel("Epoch")
        ax.set_ylabel(axis_name)
        ax.legend()
        ax.grid(True)

    plt.tight_layout()
    out_path = os.path.join(fig_dir, f"{safe_tag(tag)}_curves.png")
    plt.savefig(out_path, dpi=300, bbox_inches="tight")
    plt.close()


def plot_eval_charts(y_true, preds, tag, fig_dir):
    """Write the evaluation charts for one (phase, test set) combination.

    Four PNG files are written to ``fig_dir``, each named after
    ``safe_tag(tag)``: confusion matrix (``_cm``), ROC curve (``_roc``),
    precision-recall curve (``_pr``) and F1 versus threshold (``_f1``).
    The ROC chart is skipped, with a printed notice, when ``y_true``
    contains a single class because the curve is undefined then.

    Args:
        y_true: Ground-truth binary labels encoded as 0/1.
        preds: Predicted probability of class 1, one value per label.
        tag: Label used in the chart titles and file names.
        fig_dir: Existing directory that receives the PNG files.
    """
    ftag = safe_tag(tag)
    y_true = np.asarray(y_true)
    preds = np.asarray(preds)

    # Confusion matrix. labels=[0, 1] keeps it 2x2 even if a class is absent,
    # so it always matches the two tick labels below.
    cm = confusion_matrix(y_true, (preds > 0.5).astype(int), labels=[0, 1])
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=["Non-Hate", "Hate"],
                yticklabels=["Non-Hate", "Hate"])
    plt.title(f"{tag} — Confusion Matrix")
    plt.xlabel("Predicted"); plt.ylabel("Actual")
    plt.savefig(os.path.join(fig_dir, f"{ftag}_cm.png"), dpi=300, bbox_inches="tight")
    plt.close()

    # ROC. roc_auc_score raises ValueError on single-class ground truth.
    if np.unique(y_true).size > 1:
        fpr, tpr, _ = roc_curve(y_true, preds)
        auc_val = roc_auc_score(y_true, preds)
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, label=f"AUC={auc_val:.4f}")
        plt.plot([0, 1], [0, 1], '--')
        plt.title(f"{tag} — ROC Curve")
        plt.xlabel("FPR"); plt.ylabel("TPR")
        plt.legend(); plt.grid(True)
        plt.savefig(os.path.join(fig_dir, f"{ftag}_roc.png"), dpi=300, bbox_inches="tight")
        plt.close()
    else:
        print(f"  [{tag}] ROC curve skipped: y_true contains a single class.")

    # Precision-Recall
    precision, recall, thresholds = precision_recall_curve(y_true, preds)
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision)
    plt.title(f"{tag} — Precision-Recall Curve")
    plt.xlabel("Recall"); plt.ylabel("Precision")
    plt.grid(True)
    plt.savefig(os.path.join(fig_dir, f"{ftag}_pr.png"), dpi=300, bbox_inches="tight")
    plt.close()

    # F1 vs Threshold. precision/recall carry one more entry than thresholds
    # (the final point has no threshold), hence the [:-1]. The small epsilon
    # avoids 0/0 where precision and recall are both zero.
    f1_scores = (2 * precision * recall) / (precision + recall + 1e-8)
    plt.figure(figsize=(8, 6))
    plt.plot(thresholds, f1_scores[:-1])
    plt.title(f"{tag} — F1 Score vs Threshold")
    plt.xlabel("Threshold"); plt.ylabel("F1 Score")
    plt.grid(True)
    plt.savefig(os.path.join(fig_dir, f"{ftag}_f1.png"), dpi=300, bbox_inches="tight")
    plt.close()


# ── Strategy ─────────────────────────────────────────────────────────────────
# Languages are trained on one after another in this order, followed by a
# final phase on the full training set.
STRATEGY   = ("hinglish", "hindi", "english")
EPOCHS     = 50   # epochs per phase
BATCH_LANG = 32   # batch size for the single-language phases
BATCH_FULL = 64   # batch size for the final full-data phase

# One phase per language plus the final full-data phase. Derived from STRATEGY
# so the reported total stays correct if the strategy is edited.
total_phases = len(STRATEGY) + 1

strategy_name = " -> ".join(STRATEGY) + " -> Full"
print("\n" + "=" * 60)
print(f"Strategy: {strategy_name}")
print(f"Epochs per phase: {EPOCHS}  (Total: {EPOCHS * total_phases})")
print("=" * 60)

# All figures of this run go into a sub-directory named after the strategy.
fig_dir = os.path.join(base_path, "figures", safe_tag(" -> ".join(STRATEGY)))
os.makedirs(fig_dir, exist_ok=True)

# Full training data (pre-shuffled, used in final phase)
np.random.seed(42)
shuffle_idx     = np.random.permutation(len(X_train_seq))
X_full_shuffled = np.ascontiguousarray(X_train_seq[shuffle_idx], dtype=np.int32)
y_full_shuffled = np.ascontiguousarray(y_train.values[shuffle_idx], dtype=np.float32)

# Column layout of the results table; the metric columns follow the order of
# the tuple returned by evaluate_metrics.
cols = ["Phase", "Eval_On", "Accuracy", "Balanced_Acc",
        "Precision", "Recall", "Specificity", "F1", "ROC_AUC"]
all_rows = []

# A single model instance is carried through every phase.
model = build_model()
model.summary()

# ── Language phases ──────────────────────────────────────────────────────────
# Labels are passed to Keras as float32 numpy arrays in every phase, matching
# what the full-data phase already uses.
y_train_arr = y_train.values.astype(np.float32)
y_val_arr   = y_val.values.astype(np.float32)


def _evaluate_phase(phase_name):
    """Score the current model after a phase and record the results.

    Evaluates on every per-language test subset and then on the full test
    set. For each target it appends ``[phase_name, target, *metrics]`` to
    ``all_rows``, writes the evaluation charts into ``fig_dir`` and prints a
    one-line summary.
    """
    targets = [(la, lang_test_X[la], lang_test_y[la]) for la in languages]
    targets.append(("full", X_test_seq, y_test.values))

    for eval_name, X_eval, y_eval in targets:
        preds = model.predict(X_eval).flatten()
        metrics = evaluate_metrics(y_eval, preds)
        all_rows.append([phase_name, eval_name] + list(metrics))
        plot_eval_charts(y_eval, preds,
                         f"Phase_{phase_name}_eval_{eval_name}", fig_dir)
        print(f"  eval on {eval_name:10s} | Acc={metrics[0]:.4f}  F1={metrics[5]:.4f}  AUC={metrics[6]:.4f}")


def _run_phase(phase_name, shown_as, X_phase, y_phase, batch_size):
    """Train the shared model for one phase, plot its curves and evaluate it.

    Args:
        phase_name: Name stored in the results table and used in figure tags.
        shown_as: How the training data is described in the console banner.
        X_phase: Padded input sequences to train on.
        y_phase: Labels aligned with ``X_phase``.
        batch_size: Batch size passed to ``model.fit``.

    Returns:
        The history object returned by ``model.fit``.
    """
    rule = "─" * 50
    print(f"\n{rule}")
    print(f"Phase: training on {shown_as}  ({X_phase.shape[0]} samples, {EPOCHS} epochs)")
    print(rule)

    phase_history = model.fit(
        X_phase, y_phase,
        validation_data=(X_val_seq, y_val_arr),
        epochs=EPOCHS,
        batch_size=batch_size,
        verbose=1,
    )

    plot_training_curves(phase_history, f"Phase_{phase_name}", fig_dir)
    _evaluate_phase(phase_name)
    return phase_history


for phase_lang in STRATEGY:
    idx    = lang_train.values == phase_lang
    X_lang = X_train_seq[idx]
    y_lang = y_train_arr[idx]

    if X_lang.shape[0] == 0:
        # Fail with a message that names the cause instead of an opaque
        # error from inside model.fit.
        raise ValueError(
            f"No training samples found for language '{phase_lang}'; "
            "check the values of the 'language' column.")

    history = _run_phase(phase_lang, f"'{phase_lang}'", X_lang, y_lang, BATCH_LANG)

# ── Full dataset phase ───────────────────────────────────────────────────────
history_full = _run_phase("Full", "Full dataset",
                          X_full_shuffled, y_full_shuffled, BATCH_FULL)

# ── Save results ─────────────────────────────────────────────────────────────
# Output names are derived from STRATEGY so that a run with a different phase
# order cannot overwrite, or be mislabelled as, the results of this one.
run_tag = "_".join(STRATEGY) + "_full"

results_df = pd.DataFrame(all_rows, columns=cols)
results_df.to_csv(os.path.join(base_path, "results_tables", f"{run_tag}_results.csv"), index=False)

print("\n" + "=" * 60)
print("FINAL RESULTS TABLE")
print("=" * 60)
print(results_df.to_string(index=False))

# The model is saved once, after the final (full-data) phase.
model.save(os.path.join(base_path, "trained_models", f"{run_tag}.h5"))
print("\nModel saved.")
print("Done.")