# -*- coding: utf-8 -*-
"""
Strategy: Hinglish -> Hindi -> English -> Full
- 50 epochs per phase (200 total)
- Evaluate on each individual language + full after every phase
- All figures: figsize=(8,6), dpi=300
- Output dir: /root/output_v2 (old output_v1 untouched)
"""
import os

import numpy as np
import pandas as pd
import matplotlib
matplotlib.use("Agg")  # headless backend — must be set before importing pyplot
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
                             precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, roc_curve,
                             precision_recall_curve)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
# NOTE(review): EarlyStopping/ModelCheckpoint are imported but never used below —
# each phase runs the full EPOCHS budget by design. Kept for import parity.
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# ── Paths ────────────────────────────────────────────────────────────────────
base_path = "/root/output_v2"
data_path = "/root/dataset.csv"
glove_path = "/root/glove.6B.300d.txt"

for sub in ["dataset_splits", "figures", "results_tables", "trained_models"]:
    os.makedirs(os.path.join(base_path, sub), exist_ok=True)

# ── Load data ────────────────────────────────────────────────────────────────
# Expected columns: clean_text (str), hate_label (0/1), language
# (english / hindi / hinglish).
df = pd.read_csv(data_path)

# Language distribution pie
plt.figure(figsize=(8, 6))
df['language'].value_counts().plot.pie(autopct='%1.1f%%')
plt.title("Dataset Language Distribution")
plt.ylabel("")
plt.savefig(os.path.join(base_path, "figures", "language_distribution.png"),
            dpi=300, bbox_inches="tight")
plt.close()

X = df["clean_text"]
y = df["hate_label"]
lang = df["language"]

# ── Splits ───────────────────────────────────────────────────────────────────
# 70/30 train+val vs test, then 0.1428 of the 70% (~10% of the total) as
# validation. Stratified on the label only, not on language.
X_temp, X_test, y_temp, y_test, lang_temp, lang_test = train_test_split(
    X, y, lang, test_size=0.30, stratify=y, random_state=42)
X_train, X_val, y_train, y_val, lang_train, lang_val = train_test_split(
    X_temp, y_temp, lang_temp, test_size=0.1428, stratify=y_temp,
    random_state=42)

pd.DataFrame({"text": X_train, "label": y_train, "lang": lang_train}).to_csv(
    os.path.join(base_path, "dataset_splits", "train.csv"), index=False)
pd.DataFrame({"text": X_val, "label": y_val, "lang": lang_val}).to_csv(
    os.path.join(base_path, "dataset_splits", "val.csv"), index=False)
pd.DataFrame({"text": X_test, "label": y_test, "lang": lang_test}).to_csv(
    os.path.join(base_path, "dataset_splits", "test.csv"), index=False)

# ── Tokenise & pad ───────────────────────────────────────────────────────────
MAX_LEN = 100
VOCAB = 50000

# Tokenizer is fitted on the training split only to avoid test leakage.
tokenizer = Tokenizer(num_words=VOCAB)
tokenizer.fit_on_texts(X_train)

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_LEN)
X_val_seq = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=MAX_LEN)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=MAX_LEN)

# ── GloVe embeddings ─────────────────────────────────────────────────────────
EMBEDDING_DIM = 300
print("Loading GloVe …")
embeddings_index = {}
with open(glove_path, encoding="utf8") as f:
    for line in f:
        values = line.split()
        # glove.6B tokens never contain spaces, so values[0] is the word and
        # the remaining 300 fields are the vector.
        embeddings_index[values[0]] = np.asarray(values[1:], dtype="float32")
print(f"Loaded {len(embeddings_index):,} word vectors.")

# Rows for words absent from GloVe stay all-zero.
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    vec = embeddings_index.get(word)
    if vec is not None:
        embedding_matrix[i] = vec

# ── Per-language test subsets ────────────────────────────────────────────────
languages = ["english", "hindi", "hinglish"]
lang_test_X = {la: X_test_seq[lang_test.values == la] for la in languages}
lang_test_y = {la: y_test.values[lang_test.values == la] for la in languages}


# ── Helpers ──────────────────────────────────────────────────────────────────
def build_model():
    """Build and compile the BiLSTM binary classifier with frozen GloVe embeddings."""
    m = Sequential([
        Embedding(len(word_index) + 1, EMBEDDING_DIM,
                  weights=[embedding_matrix], input_length=MAX_LEN,
                  trainable=False),
        Bidirectional(LSTM(128)),
        Dropout(0.5),
        Dense(64, activation="relu"),
        Dense(1, activation="sigmoid"),
    ])
    m.compile(optimizer="adam", loss="binary_crossentropy",
              metrics=["accuracy"])
    return m


def evaluate_metrics(y_true, y_pred_prob):
    """Return (acc, balanced_acc, precision, recall, specificity, f1, roc_auc).

    y_pred_prob are sigmoid probabilities; they are thresholded at 0.5 for all
    metrics except ROC-AUC, which uses the raw probabilities.
    """
    y_pred = (y_pred_prob > 0.5).astype(int)
    acc = accuracy_score(y_true, y_pred)
    bal = balanced_accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    auc = roc_auc_score(y_true, y_pred_prob)
    # labels=[0, 1] forces a 2x2 matrix even if a (small per-language) subset
    # happens to contain only one class; otherwise .ravel() would not unpack.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    # Guard the no-negatives case instead of emitting a NaN/zero-division.
    spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    return acc, bal, prec, rec, spec, f1, auc


def safe_tag(s):
    """Make a phase/eval tag safe for use in file names."""
    return s.replace(" -> ", "_to_").replace(" ", "_")


def plot_training_curves(history, tag, fig_dir):
    """Save side-by-side accuracy and loss curves for one training phase."""
    fig, axes = plt.subplots(1, 2, figsize=(8, 6))
    axes[0].plot(history.history['accuracy'], label="Train Acc")
    axes[0].plot(history.history['val_accuracy'], label="Val Acc")
    axes[0].set_title(f"{tag} — Accuracy")
    axes[0].set_xlabel("Epoch"); axes[0].set_ylabel("Accuracy")
    axes[0].legend(); axes[0].grid(True)
    axes[1].plot(history.history['loss'], label="Train Loss")
    axes[1].plot(history.history['val_loss'], label="Val Loss")
    axes[1].set_title(f"{tag} — Loss")
    axes[1].set_xlabel("Epoch"); axes[1].set_ylabel("Loss")
    axes[1].legend(); axes[1].grid(True)
    plt.tight_layout()
    plt.savefig(os.path.join(fig_dir, f"{safe_tag(tag)}_curves.png"),
                dpi=300, bbox_inches="tight")
    plt.close()


def plot_eval_charts(y_true, preds, tag, fig_dir):
    """Save confusion matrix, ROC, PR curve and F1-vs-threshold figures."""
    ftag = safe_tag(tag)
    # Confusion matrix
    cm = confusion_matrix(y_true, (preds > 0.5).astype(int))
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=["Non-Hate", "Hate"],
                yticklabels=["Non-Hate", "Hate"])
    plt.title(f"{tag} — Confusion Matrix")
    plt.xlabel("Predicted"); plt.ylabel("Actual")
    plt.savefig(os.path.join(fig_dir, f"{ftag}_cm.png"), dpi=300,
                bbox_inches="tight")
    plt.close()
    # ROC
    fpr, tpr, _ = roc_curve(y_true, preds)
    auc_val = roc_auc_score(y_true, preds)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f"AUC={auc_val:.4f}")
    plt.plot([0, 1], [0, 1], '--')
    plt.title(f"{tag} — ROC Curve")
    plt.xlabel("FPR"); plt.ylabel("TPR")
    plt.legend(); plt.grid(True)
    plt.savefig(os.path.join(fig_dir, f"{ftag}_roc.png"), dpi=300,
                bbox_inches="tight")
    plt.close()
    # Precision-Recall
    precision, recall, thresholds = precision_recall_curve(y_true, preds)
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision)
    plt.title(f"{tag} — Precision-Recall Curve")
    plt.xlabel("Recall"); plt.ylabel("Precision")
    plt.grid(True)
    plt.savefig(os.path.join(fig_dir, f"{ftag}_pr.png"), dpi=300,
                bbox_inches="tight")
    plt.close()
    # F1 vs Threshold — precision/recall have len(thresholds)+1 entries, so the
    # last f1 value (at the implicit threshold=1 point) is dropped.
    f1_scores = (2 * precision * recall) / (precision + recall + 1e-8)
    plt.figure(figsize=(8, 6))
    plt.plot(thresholds, f1_scores[:-1])
    plt.title(f"{tag} — F1 Score vs Threshold")
    plt.xlabel("Threshold"); plt.ylabel("F1 Score")
    plt.grid(True)
    plt.savefig(os.path.join(fig_dir, f"{ftag}_f1.png"), dpi=300,
                bbox_inches="tight")
    plt.close()


# ── Strategy ─────────────────────────────────────────────────────────────────
STRATEGY = ("hinglish", "hindi", "english")
EPOCHS = 50
BATCH_LANG = 32
BATCH_FULL = 64

strategy_name = " -> ".join(STRATEGY) + " -> Full"
print("\n" + "=" * 60)
print(f"Strategy: {strategy_name}")
print(f"Epochs per phase: {EPOCHS} (Total: {EPOCHS * 4})")
print("=" * 60)

fig_dir = os.path.join(base_path, "figures", safe_tag(" -> ".join(STRATEGY)))
os.makedirs(fig_dir, exist_ok=True)

# Full training data (pre-shuffled, used in final phase)
np.random.seed(42)
shuffle_idx = np.random.permutation(len(X_train_seq))
X_full_shuffled = np.ascontiguousarray(X_train_seq[shuffle_idx], dtype=np.int32)
y_full_shuffled = np.ascontiguousarray(y_train.values[shuffle_idx],
                                       dtype=np.float32)

cols = ["Phase", "Eval_On", "Accuracy", "Balanced_Acc", "Precision", "Recall",
        "Specificity", "F1", "ROC_AUC"]
all_rows = []

model = build_model()
model.summary()


def _evaluate_after_phase(phase_label, phase_tag):
    """Evaluate `model` on every per-language test subset plus the full test set.

    Appends one result row per evaluation to `all_rows` (Phase = phase_label)
    and saves the standard evaluation figures under `phase_tag`.
    """
    for eval_lang in languages:
        preds = model.predict(lang_test_X[eval_lang]).flatten()
        metrics = evaluate_metrics(lang_test_y[eval_lang], preds)
        all_rows.append([phase_label, eval_lang] + list(metrics))
        plot_eval_charts(lang_test_y[eval_lang], preds,
                         f"{phase_tag}_eval_{eval_lang}", fig_dir)
        print(f" eval on {eval_lang:10s} | Acc={metrics[0]:.4f} F1={metrics[5]:.4f} AUC={metrics[6]:.4f}")
    preds_full = model.predict(X_test_seq).flatten()
    metrics_full = evaluate_metrics(y_test.values, preds_full)
    all_rows.append([phase_label, "full"] + list(metrics_full))
    plot_eval_charts(y_test.values, preds_full, f"{phase_tag}_eval_full",
                     fig_dir)
    print(f" eval on {'full':10s} | Acc={metrics_full[0]:.4f} F1={metrics_full[5]:.4f} AUC={metrics_full[6]:.4f}")


# ── Language phases ──────────────────────────────────────────────────────────
# Sequential fine-tuning: the same model keeps training on each language's
# subset in STRATEGY order, evaluated after every phase.
for phase_lang in STRATEGY:
    idx = (lang_train == phase_lang)
    X_lang = X_train_seq[idx]
    y_lang = y_train[idx]
    print(f"\n{'─'*50}")
    print(f"Phase: training on '{phase_lang}' ({X_lang.shape[0]} samples, {EPOCHS} epochs)")
    print(f"{'─'*50}")
    history = model.fit(
        X_lang, y_lang,
        validation_data=(X_val_seq, y_val),
        epochs=EPOCHS,
        batch_size=BATCH_LANG,
        verbose=1,
    )
    plot_training_curves(history, f"Phase_{phase_lang}", fig_dir)
    _evaluate_after_phase(phase_lang, f"Phase_{phase_lang}")

# ── Full dataset phase ───────────────────────────────────────────────────────
print(f"\n{'─'*50}")
print(f"Phase: training on Full dataset ({X_full_shuffled.shape[0]} samples, {EPOCHS} epochs)")
print(f"{'─'*50}")
history_full = model.fit(
    X_full_shuffled, y_full_shuffled,
    validation_data=(X_val_seq, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_FULL,
    verbose=1,
)
plot_training_curves(history_full, "Phase_Full", fig_dir)
_evaluate_after_phase("Full", "Phase_Full")

# ── Save results ─────────────────────────────────────────────────────────────
results_df = pd.DataFrame(all_rows, columns=cols)
results_df.to_csv(os.path.join(base_path, "results_tables",
                               "hinglish_hindi_english_full_results.csv"),
                  index=False)

print("\n" + "=" * 60)
print("FINAL RESULTS TABLE")
print("=" * 60)
print(results_df.to_string(index=False))

model.save(os.path.join(base_path, "trained_models",
                        "hinglish_hindi_english_full.h5"))
print("\nModel saved.")
print("Done.")