| |
| """ |
| Strategy: Hinglish -> Hindi -> English -> Full |
| - 50 epochs per phase (200 total) |
| - Evaluate on each individual language + full after every phase |
| - All figures: figsize=(8,6), dpi=300 |
| - Output dir: /root/output_v2 (old output_v1 untouched) |
| """ |
|
|
| import os |
| import numpy as np |
| import pandas as pd |
| import matplotlib |
| matplotlib.use("Agg") |
| import matplotlib.pyplot as plt |
| import seaborn as sns |
|
|
| from sklearn.model_selection import train_test_split |
| from sklearn.metrics import (accuracy_score, balanced_accuracy_score, |
| precision_score, recall_score, f1_score, |
| roc_auc_score, confusion_matrix, |
| roc_curve, precision_recall_curve) |
|
|
| from tensorflow.keras.preprocessing.text import Tokenizer |
| from tensorflow.keras.preprocessing.sequence import pad_sequences |
| from tensorflow.keras.models import Sequential |
| from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout |
| from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint |
|
|
| |
# --- Paths and output layout -------------------------------------------------
# NOTE(review): paths are hard-coded to /root — confirm they match the target
# environment before running.
base_path = "/root/output_v2"           # all artifacts for this run (v1 untouched)
data_path = "/root/dataset.csv"         # expects columns: clean_text, hate_label, language
glove_path = "/root/glove.6B.300d.txt"  # pretrained 300-d GloVe vectors

# Create the output sub-directories up front (idempotent via exist_ok).
for sub in ["dataset_splits", "figures", "results_tables", "trained_models"]:
    os.makedirs(os.path.join(base_path, sub), exist_ok=True)
|
|
| |
# --- Load dataset and plot its language composition --------------------------
df = pd.read_csv(data_path)

# Pie chart of how many samples each language contributes.
plt.figure(figsize=(8, 6))
df['language'].value_counts().plot.pie(autopct='%1.1f%%')
plt.title("Dataset Language Distribution")
plt.ylabel("")  # suppress pandas' default "language" axis label
plt.savefig(os.path.join(base_path, "figures", "language_distribution.png"), dpi=300, bbox_inches="tight")
plt.close()

# Features, binary target, and per-row language tag (drives phase training).
X = df["clean_text"]
y = df["hate_label"]
lang = df["language"]
|
|
| |
# --- Train/val/test split (~60/10/30), stratified on the hate label ----------
# First carve out a 30% held-out test set.
X_temp, X_test, y_temp, y_test, lang_temp, lang_test = train_test_split(
    X, y, lang, test_size=0.30, stratify=y, random_state=42)

# Then take 0.1428 of the remaining 70% (~10% of the full dataset) as
# validation, leaving ~60% for training.
X_train, X_val, y_train, y_val, lang_train, lang_val = train_test_split(
    X_temp, y_temp, lang_temp,
    test_size=0.1428, stratify=y_temp, random_state=42)

# Persist all three splits so the run is reproducible and inspectable.
pd.DataFrame({"text": X_train, "label": y_train, "lang": lang_train}).to_csv(
    os.path.join(base_path, "dataset_splits", "train.csv"), index=False)
pd.DataFrame({"text": X_val, "label": y_val, "lang": lang_val}).to_csv(
    os.path.join(base_path, "dataset_splits", "val.csv"), index=False)
pd.DataFrame({"text": X_test, "label": y_test, "lang": lang_test}).to_csv(
    os.path.join(base_path, "dataset_splits", "test.csv"), index=False)
|
|
| |
# --- Tokenization ------------------------------------------------------------
MAX_LEN = 100   # sequences padded/truncated to 100 tokens
VOCAB = 50000   # cap on the tokenizer vocabulary (most frequent words kept)

# Fit on the training split only, so val/test vocabulary cannot leak in.
tokenizer = Tokenizer(num_words=VOCAB)
tokenizer.fit_on_texts(X_train)

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_LEN)
X_val_seq = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=MAX_LEN)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=MAX_LEN)
|
|
| |
# --- Pretrained GloVe embedding matrix ---------------------------------------
EMBEDDING_DIM = 300  # must match the dimensionality of the GloVe file
print("Loading GloVe …")
embeddings_index = {}
with open(glove_path, encoding="utf8") as f:
    for line in f:
        # Each line is "<word> <floats…>". NOTE(review): assumes tokens
        # contain no whitespace — true for glove.6B, not for glove.840B.
        values = line.split()
        embeddings_index[values[0]] = np.asarray(values[1:], dtype="float32")
print(f"Loaded {len(embeddings_index):,} word vectors.")

# Row i holds the GloVe vector for tokenizer word-id i; out-of-vocabulary
# words keep the all-zeros row. Row 0 is reserved for padding.
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    vec = embeddings_index.get(word)
    if vec is not None:
        embedding_matrix[i] = vec
|
|
| |
# --- Per-language slices of the test set, for phase-wise evaluation ----------
languages = ["english", "hindi", "hinglish"]
lang_test_X = {la: X_test_seq[lang_test.values == la] for la in languages}
lang_test_y = {la: y_test.values[lang_test.values == la] for la in languages}
|
|
| |
def build_model():
    """Build the frozen-GloVe BiLSTM binary classifier.

    Reads module-level globals: word_index, embedding_matrix,
    EMBEDDING_DIM, MAX_LEN. Returns a compiled Keras model with a
    single sigmoid output (hate vs non-hate probability).
    """
    net = Sequential()
    net.add(Embedding(
        len(word_index) + 1,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_LEN,
        trainable=False,  # keep pretrained GloVe vectors fixed
    ))
    net.add(Bidirectional(LSTM(128)))
    net.add(Dropout(0.5))
    net.add(Dense(64, activation="relu"))
    net.add(Dense(1, activation="sigmoid"))  # binary output
    net.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return net
|
|
|
|
def evaluate_metrics(y_true, y_pred_prob):
    """Compute binary classification metrics at the 0.5 probability threshold.

    Parameters
    ----------
    y_true : array-like of {0, 1} ground-truth labels
    y_pred_prob : array-like of float predicted positive-class probabilities

    Returns
    -------
    tuple : (accuracy, balanced_accuracy, precision, recall,
             specificity, f1, roc_auc)
    """
    y_pred = (y_pred_prob > 0.5).astype(int)
    acc = accuracy_score(y_true, y_pred)
    bal = balanced_accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    auc = roc_auc_score(y_true, y_pred_prob)
    # Pin labels so ravel() always yields exactly 4 values, even when a
    # class is absent from a small per-language test slice (otherwise the
    # matrix collapses to 1x1 and this unpacking raises ValueError).
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    # Guard the division: with no actual negatives, specificity is undefined;
    # report 0.0 instead of raising ZeroDivisionError.
    spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    return acc, bal, prec, rec, spec, f1, auc
|
|
|
|
def safe_tag(s):
    """Turn a display tag into a filesystem-safe name.

    " -> " separators become "_to_" and remaining spaces become "_".
    """
    # Handle the arrow first so its surrounding spaces are not
    # individually underscored.
    arrow_fixed = "_to_".join(s.split(" -> "))
    return "_".join(arrow_fixed.split(" "))
|
|
|
|
def plot_training_curves(history, tag, fig_dir):
    """Save side-by-side train/val accuracy and loss curves for one phase.

    history : Keras History object returned by model.fit
    tag     : label used in panel titles and the output filename
    fig_dir : directory the PNG is written into
    """
    hist = history.history
    fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(8, 6))

    # Left panel: accuracy per epoch.
    ax_acc.plot(hist['accuracy'], label="Train Acc")
    ax_acc.plot(hist['val_accuracy'], label="Val Acc")
    ax_acc.set_title(f"{tag} — Accuracy")
    ax_acc.set_xlabel("Epoch")
    ax_acc.set_ylabel("Accuracy")
    ax_acc.legend()
    ax_acc.grid(True)

    # Right panel: loss per epoch.
    ax_loss.plot(hist['loss'], label="Train Loss")
    ax_loss.plot(hist['val_loss'], label="Val Loss")
    ax_loss.set_title(f"{tag} — Loss")
    ax_loss.set_xlabel("Epoch")
    ax_loss.set_ylabel("Loss")
    ax_loss.legend()
    ax_loss.grid(True)

    plt.tight_layout()
    plt.savefig(os.path.join(fig_dir, f"{safe_tag(tag)}_curves.png"),
                dpi=300, bbox_inches="tight")
    plt.close()
|
|
|
|
def plot_eval_charts(y_true, preds, tag, fig_dir):
    """Save confusion-matrix, ROC, PR and F1-vs-threshold figures.

    y_true  : ground-truth binary labels
    preds   : predicted positive-class probabilities
    tag     : label used in titles; sanitized via safe_tag for filenames
    fig_dir : directory the four PNGs are written into
    """
    ftag = safe_tag(tag)

    def _save(suffix):
        # Write the current figure to disk and release its memory.
        plt.savefig(os.path.join(fig_dir, f"{ftag}_{suffix}.png"),
                    dpi=300, bbox_inches="tight")
        plt.close()

    # Confusion matrix at the 0.5 threshold.
    hard_preds = (preds > 0.5).astype(int)
    cm = confusion_matrix(y_true, hard_preds)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=["Non-Hate", "Hate"],
                yticklabels=["Non-Hate", "Hate"])
    plt.title(f"{tag} — Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    _save("cm")

    # ROC curve with AUC shown in the legend.
    fpr, tpr, _ = roc_curve(y_true, preds)
    auc_val = roc_auc_score(y_true, preds)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f"AUC={auc_val:.4f}")
    plt.plot([0, 1], [0, 1], '--')  # chance diagonal
    plt.title(f"{tag} — ROC Curve")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend()
    plt.grid(True)
    _save("roc")

    # Precision-recall curve.
    precision, recall, thresholds = precision_recall_curve(y_true, preds)
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision)
    plt.title(f"{tag} — Precision-Recall Curve")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.grid(True)
    _save("pr")

    # F1 over decision thresholds (epsilon avoids 0/0 at the extremes;
    # thresholds has one fewer entry than precision/recall, hence [:-1]).
    f1_scores = (2 * precision * recall) / (precision + recall + 1e-8)
    plt.figure(figsize=(8, 6))
    plt.plot(thresholds, f1_scores[:-1])
    plt.title(f"{tag} — F1 Score vs Threshold")
    plt.xlabel("Threshold")
    plt.ylabel("F1 Score")
    plt.grid(True)
    _save("f1")
|
|
|
|
| |
# --- Curriculum configuration ------------------------------------------------
STRATEGY = ("hinglish", "hindi", "english")  # per-language phase order
EPOCHS = 50       # epochs per phase (4 phases -> 200 total)
BATCH_LANG = 32   # batch size for the single-language phases
BATCH_FULL = 64   # batch size for the final full-data phase

strategy_name = " -> ".join(STRATEGY) + " -> Full"
print("\n" + "=" * 60)
print(f"Strategy: {strategy_name}")
print(f"Epochs per phase: {EPOCHS} (Total: {EPOCHS * 4})")
print("=" * 60)

# Per-strategy figure directory, e.g. figures/hinglish_to_hindi_to_english/.
fig_dir = os.path.join(base_path, "figures", safe_tag(" -> ".join(STRATEGY)))
os.makedirs(fig_dir, exist_ok=True)

# Pre-shuffle the full training set once (fixed seed) for the final phase;
# contiguous typed arrays keep Keras from re-copying per epoch.
np.random.seed(42)
shuffle_idx = np.random.permutation(len(X_train_seq))
X_full_shuffled = np.ascontiguousarray(X_train_seq[shuffle_idx], dtype=np.int32)
y_full_shuffled = np.ascontiguousarray(y_train.values[shuffle_idx], dtype=np.float32)

# Results accumulator: one row per (training phase, evaluation set).
cols = ["Phase", "Eval_On", "Accuracy", "Balanced_Acc",
        "Precision", "Recall", "Specificity", "F1", "ROC_AUC"]
all_rows = []

model = build_model()
model.summary()
|
|
| |
# --- Curriculum phases: train on one language at a time ----------------------
for phase_lang in STRATEGY:
    # Boolean mask selecting this phase's language from the training split.
    idx = (lang_train == phase_lang)
    X_lang = X_train_seq[idx]
    y_lang = y_train[idx]

    print(f"\n{'─'*50}")
    print(f"Phase: training on '{phase_lang}' ({X_lang.shape[0]} samples, {EPOCHS} epochs)")
    print(f"{'─'*50}")

    # Continue training the SAME model: weights carry over between phases,
    # which is what makes this a curriculum rather than independent runs.
    history = model.fit(
        X_lang, y_lang,
        validation_data=(X_val_seq, y_val),
        epochs=EPOCHS,
        batch_size=BATCH_LANG,
        verbose=1,
    )

    plot_training_curves(history, f"Phase_{phase_lang}", fig_dir)

    # After each phase, evaluate on every individual language...
    for eval_lang in languages:
        preds = model.predict(lang_test_X[eval_lang]).flatten()
        metrics = evaluate_metrics(lang_test_y[eval_lang], preds)
        all_rows.append([phase_lang, eval_lang] + list(metrics))
        plot_eval_charts(lang_test_y[eval_lang], preds,
                         f"Phase_{phase_lang}_eval_{eval_lang}", fig_dir)
        print(f" eval on {eval_lang:10s} | Acc={metrics[0]:.4f} F1={metrics[5]:.4f} AUC={metrics[6]:.4f}")

    # ...and on the combined (full) test set.
    preds_full = model.predict(X_test_seq).flatten()
    metrics_full = evaluate_metrics(y_test.values, preds_full)
    all_rows.append([phase_lang, "full"] + list(metrics_full))
    plot_eval_charts(y_test.values, preds_full,
                     f"Phase_{phase_lang}_eval_full", fig_dir)
    print(f" eval on {'full':10s} | Acc={metrics_full[0]:.4f} F1={metrics_full[5]:.4f} AUC={metrics_full[6]:.4f}")
|
|
| |
# --- Final phase: continue training on the full (pre-shuffled) train set -----
print(f"\n{'─'*50}")
print(f"Phase: training on Full dataset ({X_full_shuffled.shape[0]} samples, {EPOCHS} epochs)")
print(f"{'─'*50}")

history_full = model.fit(
    X_full_shuffled, y_full_shuffled,
    validation_data=(X_val_seq, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_FULL,
    verbose=1,
)

plot_training_curves(history_full, "Phase_Full", fig_dir)

# Final evaluation: each individual language, then the combined test set.
for eval_lang in languages:
    preds = model.predict(lang_test_X[eval_lang]).flatten()
    metrics = evaluate_metrics(lang_test_y[eval_lang], preds)
    all_rows.append(["Full", eval_lang] + list(metrics))
    plot_eval_charts(lang_test_y[eval_lang], preds,
                     f"Phase_Full_eval_{eval_lang}", fig_dir)
    print(f" eval on {eval_lang:10s} | Acc={metrics[0]:.4f} F1={metrics[5]:.4f} AUC={metrics[6]:.4f}")

preds_full = model.predict(X_test_seq).flatten()
metrics_full = evaluate_metrics(y_test.values, preds_full)
all_rows.append(["Full", "full"] + list(metrics_full))
plot_eval_charts(y_test.values, preds_full, "Phase_Full_eval_full", fig_dir)
print(f" eval on {'full':10s} | Acc={metrics_full[0]:.4f} F1={metrics_full[5]:.4f} AUC={metrics_full[6]:.4f}")
|
|
| |
# --- Persist the results table and the trained model -------------------------
results_df = pd.DataFrame(all_rows, columns=cols)
results_df.to_csv(os.path.join(base_path, "results_tables", "hinglish_hindi_english_full_results.csv"), index=False)

print("\n" + "=" * 60)
print("FINAL RESULTS TABLE")
print("=" * 60)
print(results_df.to_string(index=False))

# Saved in legacy HDF5 format (.h5). NOTE(review): recent Keras versions
# recommend the native .keras format — confirm downstream loaders first.
model.save(os.path.join(base_path, "trained_models", "hinglish_hindi_english_full.h5"))
print("\nModel saved.")
print("Done.")
|
|