# SASCv2 / main_v2.py
# (Hugging Face page-header artifacts removed from the scraped copy:
#  uploader "tuklu", commit message "Add scripts", commit 7e5f759 verified)
# -*- coding: utf-8 -*-
"""
Strategy: Hinglish -> Hindi -> English -> Full
- 50 epochs per phase (200 total)
- Evaluate on each individual language + full after every phase
- All figures: figsize=(8,6), dpi=300
- Output dir: /root/output_v2 (old output_v1 untouched)
"""
import os
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
precision_score, recall_score, f1_score,
roc_auc_score, confusion_matrix,
roc_curve, precision_recall_curve)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
# ── Paths ────────────────────────────────────────────────────────────────────
base_path = "/root/output_v2"
data_path = "/root/dataset.csv"
glove_path = "/root/glove.6B.300d.txt"

# Create every output sub-directory up front so later saves never fail.
_subdirs = ("dataset_splits", "figures", "results_tables", "trained_models")
for _sub in _subdirs:
    os.makedirs(os.path.join(base_path, _sub), exist_ok=True)

# ── Load data ────────────────────────────────────────────────────────────────
df = pd.read_csv(data_path)

# Language distribution pie chart for the raw dataset.
plt.figure(figsize=(8, 6))
df['language'].value_counts().plot.pie(autopct='%1.1f%%')
plt.title("Dataset Language Distribution")
plt.ylabel("")
plt.savefig(os.path.join(base_path, "figures", "language_distribution.png"),
            dpi=300, bbox_inches="tight")
plt.close()

# Features, binary labels and per-row language tags used throughout.
X = df["clean_text"]
y = df["hate_label"]
lang = df["language"]
# ── Splits ───────────────────────────────────────────────────────────────────
# 70/30 first, then carve a validation slice out of the 70% (0.1428 of it,
# i.e. roughly 10% of the full dataset).  Both splits stratify on the label.
X_temp, X_test, y_temp, y_test, lang_temp, lang_test = train_test_split(
    X, y, lang, test_size=0.30, stratify=y, random_state=42)
X_train, X_val, y_train, y_val, lang_train, lang_val = train_test_split(
    X_temp, y_temp, lang_temp,
    test_size=0.1428, stratify=y_temp, random_state=42)

# Persist each split for reproducibility.
_split_dir = os.path.join(base_path, "dataset_splits")
for _name, _x, _y, _l in (("train.csv", X_train, y_train, lang_train),
                          ("val.csv", X_val, y_val, lang_val),
                          ("test.csv", X_test, y_test, lang_test)):
    pd.DataFrame({"text": _x, "label": _y, "lang": _l}).to_csv(
        os.path.join(_split_dir, _name), index=False)
# ── Tokenise & pad ───────────────────────────────────────────────────────────
MAX_LEN = 100
VOCAB = 50000

# Fit the vocabulary on the training split only, then encode every split to
# fixed-length integer sequences.
tokenizer = Tokenizer(num_words=VOCAB)
tokenizer.fit_on_texts(X_train)


def _encode(texts):
    # Integer-encode then left-pad/truncate to MAX_LEN tokens.
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_LEN)


X_train_seq = _encode(X_train)
X_val_seq = _encode(X_val)
X_test_seq = _encode(X_test)
# ── GloVe embeddings ─────────────────────────────────────────────────────────
EMBEDDING_DIM = 300
print("Loading GloVe …")
embeddings_index = {}
with open(glove_path, encoding="utf8") as f:
    for line in f:
        values = line.rstrip().split(" ")
        # Guard: a valid entry is exactly one token followed by
        # EMBEDDING_DIM floats.  The original bare split() would mis-assign
        # fields (or raise in np.asarray) on malformed lines or on GloVe
        # variants whose tokens contain spaces — skip such lines instead.
        if len(values) != EMBEDDING_DIM + 1:
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype="float32")
print(f"Loaded {len(embeddings_index):,} word vectors.")

# Build the weight matrix aligned with the tokenizer's word indices.
# Row 0 is reserved for padding and stays all-zero; words without a GloVe
# vector also stay all-zero.
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    vec = embeddings_index.get(word)
    if vec is not None:
        embedding_matrix[i] = vec
# ── Per-language test subsets ────────────────────────────────────────────────
# Index the padded test sequences / labels by the language tag of each row.
languages = ["english", "hindi", "hinglish"]
lang_test_X, lang_test_y = {}, {}
for _la in languages:
    _mask = lang_test.values == _la
    lang_test_X[_la] = X_test_seq[_mask]
    lang_test_y[_la] = y_test.values[_mask]
# ── Helpers ──────────────────────────────────────────────────────────────────
def build_model():
    """Create and compile the BiLSTM binary classifier.

    Frozen GloVe embeddings (module-level ``embedding_matrix``) feed one
    bidirectional LSTM, followed by dropout, a ReLU dense layer and a
    sigmoid output for hate / non-hate prediction.
    """
    net = Sequential()
    net.add(Embedding(len(word_index) + 1, EMBEDDING_DIM,
                      weights=[embedding_matrix],
                      input_length=MAX_LEN,
                      trainable=False))
    net.add(Bidirectional(LSTM(128)))
    net.add(Dropout(0.5))
    net.add(Dense(64, activation="relu"))
    net.add(Dense(1, activation="sigmoid"))
    net.compile(optimizer="adam", loss="binary_crossentropy",
                metrics=["accuracy"])
    return net
def evaluate_metrics(y_true, y_pred_prob):
    """Compute binary-classification metrics from positive-class probabilities.

    Parameters
    ----------
    y_true : array-like of 0/1 ground-truth labels.
    y_pred_prob : array-like of predicted probabilities; thresholded at 0.5.

    Returns
    -------
    tuple of (accuracy, balanced_accuracy, precision, recall,
              specificity, f1, roc_auc)
    """
    y_pred = (y_pred_prob > 0.5).astype(int)
    acc = accuracy_score(y_true, y_pred)
    bal = balanced_accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    auc = roc_auc_score(y_true, y_pred_prob)
    # labels=[0, 1] forces a 2x2 matrix even when only one class appears in
    # y_true/y_pred; without it the 4-way unpack raises ValueError on the
    # 1x1 matrix sklearn would return.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    # Specificity is undefined with no actual negatives; report 0.0 rather
    # than raising ZeroDivisionError (mirrors zero_division=0 above).
    spec = tn / (tn + fp) if (tn + fp) else 0.0
    return acc, bal, prec, rec, spec, f1, auc
def safe_tag(s):
    """Return *s* as a filesystem-friendly tag.

    Arrow separators (" -> ") become "_to_" first, then any remaining
    spaces become underscores.
    """
    tag = s.replace(" -> ", "_to_")
    return tag.replace(" ", "_")
def plot_training_curves(history, tag, fig_dir):
    """Save side-by-side accuracy and loss curves for one training phase.

    The figure is written to ``<fig_dir>/<safe_tag(tag)>_curves.png`` at
    300 dpi and closed afterwards.
    """
    hist = history.history
    fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(8, 6))

    ax_acc.plot(hist['accuracy'], label="Train Acc")
    ax_acc.plot(hist['val_accuracy'], label="Val Acc")
    ax_acc.set_title(f"{tag} — Accuracy")
    ax_acc.set_xlabel("Epoch")
    ax_acc.set_ylabel("Accuracy")
    ax_acc.legend()
    ax_acc.grid(True)

    ax_loss.plot(hist['loss'], label="Train Loss")
    ax_loss.plot(hist['val_loss'], label="Val Loss")
    ax_loss.set_title(f"{tag} — Loss")
    ax_loss.set_xlabel("Epoch")
    ax_loss.set_ylabel("Loss")
    ax_loss.legend()
    ax_loss.grid(True)

    plt.tight_layout()
    plt.savefig(os.path.join(fig_dir, f"{safe_tag(tag)}_curves.png"),
                dpi=300, bbox_inches="tight")
    plt.close()
def plot_eval_charts(y_true, preds, tag, fig_dir):
    """Save four evaluation figures for one (phase, eval-set) pair.

    Produces confusion-matrix, ROC, precision-recall and F1-vs-threshold
    plots named ``<safe_tag(tag)>_{cm,roc,pr,f1}.png`` in *fig_dir*.
    """
    ftag = safe_tag(tag)

    def _save(suffix):
        # Write the current figure at 300 dpi and release it.
        plt.savefig(os.path.join(fig_dir, f"{ftag}_{suffix}.png"),
                    dpi=300, bbox_inches="tight")
        plt.close()

    # Confusion matrix (probabilities thresholded at 0.5).
    y_hat = (preds > 0.5).astype(int)
    cm = confusion_matrix(y_true, y_hat)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=["Non-Hate", "Hate"],
                yticklabels=["Non-Hate", "Hate"])
    plt.title(f"{tag} — Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    _save("cm")

    # ROC curve with AUC in the legend.
    fpr, tpr, _ = roc_curve(y_true, preds)
    auc_val = roc_auc_score(y_true, preds)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f"AUC={auc_val:.4f}")
    plt.plot([0, 1], [0, 1], '--')
    plt.title(f"{tag} — ROC Curve")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend()
    plt.grid(True)
    _save("roc")

    # Precision-recall curve.
    precision, recall, thresholds = precision_recall_curve(y_true, preds)
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision)
    plt.title(f"{tag} — Precision-Recall Curve")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.grid(True)
    _save("pr")

    # F1 as a function of the decision threshold (epsilon avoids 0/0);
    # precision/recall have one more entry than thresholds, hence [:-1].
    f1_scores = (2 * precision * recall) / (precision + recall + 1e-8)
    plt.figure(figsize=(8, 6))
    plt.plot(thresholds, f1_scores[:-1])
    plt.title(f"{tag} — F1 Score vs Threshold")
    plt.xlabel("Threshold")
    plt.ylabel("F1 Score")
    plt.grid(True)
    _save("f1")
# ── Strategy ─────────────────────────────────────────────────────────────────
STRATEGY = ("hinglish", "hindi", "english")
EPOCHS = 50
BATCH_LANG = 32   # batch size during single-language phases
BATCH_FULL = 64   # batch size during the final full-data phase

strategy_name = " -> ".join(STRATEGY) + " -> Full"
_banner = "=" * 60
print("\n" + _banner)
print(f"Strategy: {strategy_name}")
print(f"Epochs per phase: {EPOCHS} (Total: {EPOCHS * 4})")
print(_banner)

fig_dir = os.path.join(base_path, "figures", safe_tag(" -> ".join(STRATEGY)))
os.makedirs(fig_dir, exist_ok=True)

# Full training data, pre-shuffled once with a fixed seed so the final
# phase always sees the same ordering.
np.random.seed(42)
shuffle_idx = np.random.permutation(len(X_train_seq))
X_full_shuffled = np.ascontiguousarray(X_train_seq[shuffle_idx], dtype=np.int32)
y_full_shuffled = np.ascontiguousarray(y_train.values[shuffle_idx], dtype=np.float32)

# Results accumulator: one row per (training phase, evaluation set).
cols = ["Phase", "Eval_On", "Accuracy", "Balanced_Acc",
        "Precision", "Recall", "Specificity", "F1", "ROC_AUC"]
all_rows = []

model = build_model()
model.summary()
# ── Language phases ──────────────────────────────────────────────────────────
# Train sequentially on each language (curriculum order = STRATEGY), and
# after every phase evaluate on each language's test subset plus the full
# test set.  The same model instance is carried across phases.
for phase_lang in STRATEGY:
    phase_mask = (lang_train == phase_lang)
    X_lang = X_train_seq[phase_mask]
    y_lang = y_train[phase_mask]

    _rule = '─' * 50
    print("\n" + _rule)
    print(f"Phase: training on '{phase_lang}' ({X_lang.shape[0]} samples, {EPOCHS} epochs)")
    print(_rule)

    history = model.fit(X_lang, y_lang,
                        validation_data=(X_val_seq, y_val),
                        epochs=EPOCHS,
                        batch_size=BATCH_LANG,
                        verbose=1)
    plot_training_curves(history, f"Phase_{phase_lang}", fig_dir)

    # Evaluate on every individual language subset.
    for eval_lang in languages:
        preds = model.predict(lang_test_X[eval_lang]).flatten()
        metrics = evaluate_metrics(lang_test_y[eval_lang], preds)
        all_rows.append([phase_lang, eval_lang] + list(metrics))
        plot_eval_charts(lang_test_y[eval_lang], preds,
                         f"Phase_{phase_lang}_eval_{eval_lang}", fig_dir)
        print(f" eval on {eval_lang:10s} | Acc={metrics[0]:.4f} F1={metrics[5]:.4f} AUC={metrics[6]:.4f}")

    # Evaluate on the full test set.
    preds_full = model.predict(X_test_seq).flatten()
    metrics_full = evaluate_metrics(y_test.values, preds_full)
    all_rows.append([phase_lang, "full"] + list(metrics_full))
    plot_eval_charts(y_test.values, preds_full,
                     f"Phase_{phase_lang}_eval_full", fig_dir)
    print(f" eval on {'full':10s} | Acc={metrics_full[0]:.4f} F1={metrics_full[5]:.4f} AUC={metrics_full[6]:.4f}")
# ── Full dataset phase ───────────────────────────────────────────────────────
# Final phase: continue training on the whole (pre-shuffled) training set,
# then run the same per-language + full evaluations.
_rule = '─' * 50
print("\n" + _rule)
print(f"Phase: training on Full dataset ({X_full_shuffled.shape[0]} samples, {EPOCHS} epochs)")
print(_rule)

history_full = model.fit(X_full_shuffled, y_full_shuffled,
                         validation_data=(X_val_seq, y_val),
                         epochs=EPOCHS,
                         batch_size=BATCH_FULL,
                         verbose=1)
plot_training_curves(history_full, "Phase_Full", fig_dir)

for eval_lang in languages:
    preds = model.predict(lang_test_X[eval_lang]).flatten()
    metrics = evaluate_metrics(lang_test_y[eval_lang], preds)
    all_rows.append(["Full", eval_lang] + list(metrics))
    plot_eval_charts(lang_test_y[eval_lang], preds,
                     f"Phase_Full_eval_{eval_lang}", fig_dir)
    print(f" eval on {eval_lang:10s} | Acc={metrics[0]:.4f} F1={metrics[5]:.4f} AUC={metrics[6]:.4f}")

preds_full = model.predict(X_test_seq).flatten()
metrics_full = evaluate_metrics(y_test.values, preds_full)
all_rows.append(["Full", "full"] + list(metrics_full))
plot_eval_charts(y_test.values, preds_full, "Phase_Full_eval_full", fig_dir)
print(f" eval on {'full':10s} | Acc={metrics_full[0]:.4f} F1={metrics_full[5]:.4f} AUC={metrics_full[6]:.4f}")
# ── Save results ─────────────────────────────────────────────────────────────
# Persist the metrics table, print it, and save the trained model.
results_df = pd.DataFrame(all_rows, columns=cols)
_results_path = os.path.join(base_path, "results_tables",
                             "hinglish_hindi_english_full_results.csv")
results_df.to_csv(_results_path, index=False)

_banner = "=" * 60
print("\n" + _banner)
print("FINAL RESULTS TABLE")
print(_banner)
print(results_df.to_string(index=False))

model.save(os.path.join(base_path, "trained_models",
                        "hinglish_hindi_english_full.h5"))
print("\nModel saved.")
print("Done.")