# SASC / main.py
# (HuggingFace page residue, commented out so the file parses)
# tuklu's picture
# Add README, tokenizer, results
# 47bafb1 verified
# -*- coding: utf-8 -*-
"""glove+bilstm.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/10fLw7V6G3vV_STF7KcWe8qcTvyLQq0NT
"""
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import permutations
# For train-test split and evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.metrics import roc_curve, precision_recall_curve
# Deep learning libraries
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM
from tensorflow.keras.layers import Dense, Dropout
# Output directory layout: splits, figures, metric tables, saved models.
base_path = "/root/output"
for sub in ("dataset_splits", "figures", "results_tables", "trained_models"):
    os.makedirs(os.path.join(base_path, sub), exist_ok=True)

# Load the preprocessed dataset (expects clean_text / hate_label / language columns).
data_path = "/root/dataset.csv"
df = pd.read_csv(data_path)
df.head()
# Visualize the language mix as a pie chart and persist it to the figures dir.
plt.figure(figsize=(6, 4))
df["language"].value_counts().plot.pie(autopct="%1.1f%%")
plt.title("Dataset Language Distribution")
plt.ylabel("")
plt.savefig(base_path + "/figures/language_distribution.png", dpi=300)
plt.show()

# Features, binary targets, and per-row language tags used for the splits below.
X = df["clean_text"]
y = df["hate_label"]
lang = df["language"]
# Stratified 70/30 split into (train+val) and test, carrying language tags along.
X_temp, X_test, y_temp, y_test, lang_temp, lang_test = train_test_split(
    X, y, lang, test_size=0.30, stratify=y, random_state=42)

# 0.1428 of the remaining 70% is ~10% of the full data -> roughly 60/10/30 overall.
X_train, X_val, y_train, y_val, lang_train, lang_val = train_test_split(
    X_temp, y_temp, lang_temp,
    test_size=0.1428,
    stratify=y_temp,
    random_state=42,
)

# Persist each split as a text/label/lang CSV for reproducibility.
_splits = {
    "train": (X_train, y_train, lang_train),
    "val": (X_val, y_val, lang_val),
    "test": (X_test, y_test, lang_test),
}
for _name, (_txt, _lab, _lng) in _splits.items():
    pd.DataFrame({"text": _txt, "label": _lab, "lang": _lng}).to_csv(
        base_path + f"/dataset_splits/{_name}.csv", index=False)
# Fit the tokenizer on training text only (no leakage), then convert every
# split to fixed-length integer id sequences (pre-padded/truncated to MAX_LEN,
# the Keras default).
MAX_LEN = 100
VOCAB = 50000
tokenizer = Tokenizer(num_words=VOCAB)
tokenizer.fit_on_texts(X_train)

def _to_padded(texts):
    # Helper: raw texts -> padded id sequences via the shared tokenizer.
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_LEN)

X_train_seq = _to_padded(X_train)
X_val_seq = _to_padded(X_val)
X_test_seq = _to_padded(X_test)
EMBEDDING_DIM = 300
glove_path = "/root/glove.6B.300d.txt"

# Parse the GloVe text file: one token followed by 300 floats per line.
embeddings_index = {}
with open(glove_path, encoding="utf8") as f:
    for line in f:
        token, *coeffs = line.split()
        embeddings_index[token] = np.asarray(coeffs, dtype="float32")
print("Loaded %s word vectors." % len(embeddings_index))

# Row i holds the pretrained vector for tokenizer word id i; words without a
# GloVe vector keep an all-zero row.
word_index = tokenizer.word_index
embedding_dim = 300
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for token, row in word_index.items():
    pretrained = embeddings_index.get(token)
    if pretrained is not None:
        embedding_matrix[row] = pretrained
# ============================================================
# Helper: build a fresh model (called once per permutation)
# ============================================================
def build_model():
    """Create and compile a fresh BiLSTM binary classifier on frozen GloVe embeddings.

    Returns a compiled ``Sequential`` model; called once per strategy so each
    permutation starts from freshly initialized LSTM/Dense weights.
    """
    model = Sequential([
        Embedding(
            input_dim=len(word_index) + 1,
            output_dim=embedding_dim,
            weights=[embedding_matrix],  # pretrained GloVe rows
            input_length=MAX_LEN,
            trainable=False,             # keep embeddings frozen
        ),
        Bidirectional(LSTM(128)),
        Dropout(0.5),
        Dense(64, activation="relu"),
        Dense(1, activation="sigmoid"),  # binary hate / non-hate output
    ])
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model
def evaluate_metrics(y_true, y_pred_prob):
    """Compute binary-classification metrics at a 0.5 decision threshold.

    Parameters
    ----------
    y_true : array-like of {0, 1}
        Ground-truth labels.
    y_pred_prob : array-like of float
        Predicted positive-class probabilities.

    Returns
    -------
    tuple
        (accuracy, balanced accuracy, precision, recall, specificity, F1, ROC-AUC).
    """
    y_pred = (y_pred_prob > 0.5).astype(int)
    acc = accuracy_score(y_true, y_pred)
    bal = balanced_accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred_prob)
    # labels=[0, 1] guarantees a full 2x2 matrix even when one class is absent
    # from y_true/y_pred, so ravel() always yields four values (the original
    # unpacking crashed in that case).
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    # Guard the specificity denominator: no actual negatives -> report 0.0
    # instead of raising ZeroDivisionError.
    spec = tn / (tn + fp) if (tn + fp) else 0.0
    return acc, bal, prec, rec, spec, f1, auc
def plot_training_curves(history, tag, base_path):
    """Plot and save side-by-side train/val accuracy and loss curves.

    The figure is written as ``<sanitized tag>_curves.png`` under *base_path*
    and then displayed.
    """
    fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(14, 5))
    panels = [
        (acc_ax, "accuracy", "val_accuracy", "Accuracy",
         "Train Accuracy", "Val Accuracy"),
        (loss_ax, "loss", "val_loss", "Loss",
         "Train Loss", "Val Loss"),
    ]
    for ax, train_key, val_key, metric, train_label, val_label in panels:
        ax.plot(history.history[train_key], label=train_label)
        ax.plot(history.history[val_key], label=val_label)
        ax.set_title(f"{tag} - {metric} Curve")
        ax.set_xlabel("Epoch")
        ax.set_ylabel(metric)
        ax.legend()
        ax.grid(True)
    plt.tight_layout()
    fname = tag.replace(" -> ", "_to_").replace(" ", "_")
    plt.savefig(os.path.join(base_path, f"{fname}_curves.png"), dpi=300)
    plt.show()
def plot_eval_charts(y_test, preds, tag, base_path):
    """Save confusion-matrix, ROC, precision-recall and F1-vs-threshold plots.

    Each chart is written as ``<sanitized tag>_<suffix>.png`` under
    *base_path* and displayed after saving.
    """
    fname = tag.replace(" -> ", "_to_").replace(" ", "_")

    def _save_and_show(suffix):
        # Persist the current figure next to the strategy's other charts.
        plt.savefig(os.path.join(base_path, f"{fname}_{suffix}.png"), dpi=300)
        plt.show()

    # Confusion matrix at the 0.5 decision threshold.
    cm = confusion_matrix(y_test, (preds > 0.5).astype(int))
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=["Non-Hate", "Hate"],
                yticklabels=["Non-Hate", "Hate"])
    plt.title(f"{tag} - Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    _save_and_show("cm")

    # ROC curve with the AUC shown in the legend.
    fpr, tpr, _ = roc_curve(y_test, preds)
    auc_val = roc_auc_score(y_test, preds)
    plt.figure(figsize=(6, 4))
    plt.plot(fpr, tpr, label=f"AUC={auc_val:.4f}")
    plt.plot([0, 1], [0, 1], '--')
    plt.title(f"{tag} - ROC Curve")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend()
    plt.grid(True)
    _save_and_show("roc")

    # Precision-recall trade-off across all thresholds.
    precision, recall, thresholds = precision_recall_curve(y_test, preds)
    plt.figure(figsize=(6, 4))
    plt.plot(recall, precision)
    plt.title(f"{tag} - Precision-Recall Curve")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.grid(True)
    _save_and_show("pr")

    # F1 as a function of the threshold (epsilon avoids 0/0; thresholds has
    # one fewer entry than precision/recall, hence the [:-1] slice).
    f1_scores = (2 * precision * recall) / (precision + recall + 1e-8)
    plt.figure(figsize=(6, 4))
    plt.plot(thresholds, f1_scores[:-1])
    plt.title(f"{tag} - F1 Score vs Threshold")
    plt.xlabel("Threshold")
    plt.ylabel("F1 Score")
    plt.grid(True)
    _save_and_show("f1")
# ============================================================
# PLAN B: All 6 permutations + final Full (Shuffled) fine-tune
# After each training phase β†’ evaluate on that language's test set
# After Full phase β†’ evaluate on full test set
# ============================================================
banner = "=" * 60
print("\n" + banner)
print("PLAN B: Sequential Transfer Learning + Full Dataset Fine-tune")
print(banner)

languages = ["english", "hindi", "hinglish"]

# One fixed shuffle of the full training data, computed once so the final
# "Full" fine-tune phase is identical across all permutations.
np.random.seed(42)
shuffle_idx = np.random.permutation(len(X_train_seq))
X_full_shuffled = np.ascontiguousarray(X_train_seq[shuffle_idx], dtype=np.int32)
y_full_shuffled = np.ascontiguousarray(y_train.values[shuffle_idx], dtype=np.float32)

# Per-language boolean masks over the test set plus the matching
# feature/label subsets, all built once up front.
lang_test_idx = {}
lang_test_X = {}
lang_test_y = {}
for language in languages:
    mask = (lang_test.values == language)
    lang_test_idx[language] = mask
    lang_test_X[language] = X_test_seq[mask]
    lang_test_y[language] = y_test.values[mask]

# Column layout shared by the per-strategy and combined results tables.
cols = ["Strategy", "Phase", "Accuracy", "Balanced Acc",
        "Precision", "Recall", "Specificity", "F1", "ROC-AUC"]
# One strategy per ordering of the three languages: train sequentially on each
# language's subset, then fine-tune on the full shuffled training set.  The
# same model instance is trained across all phases, so weights carry forward.
for perm in permutations(languages):
    perm_name = " -> ".join(perm)
    strategy_name = perm_name + " -> Full"
    strategy_results = []  # one row per phase: 3 language phases + 1 Full phase
    print(f"\n{'='*50}")
    print(f"Strategy: {strategy_name}")
    print(f"{'='*50}")
    # Make a clean folder per strategy for figures
    strat_tag = perm_name.replace(" -> ", "_to_")
    strat_fig_path = base_path + f"/figures/{strat_tag}"
    os.makedirs(strat_fig_path, exist_ok=True)
    # Model built ONCE per strategy -- weights carry forward across all phases
    model = build_model()
    # ---- Language phases -------------------------------------------------
    # NOTE(review): this loop variable shadows the module-level `lang` Series
    # defined earlier; harmless here since `lang` is not read again afterwards.
    for lang in perm:
        idx = (lang_train == lang)  # boolean mask selecting this language's rows
        X_lang = X_train_seq[idx]
        y_lang = y_train[idx]
        print(f" Training on: {lang} ({X_lang.shape[0]} samples)")
        history = model.fit(
            X_lang, y_lang,
            validation_data=(X_val_seq, y_val),  # validated on the mixed val set
            epochs=8,
            batch_size=32,
            verbose=1
        )
        # Train/Val accuracy + loss curves
        plot_training_curves(history, f"{strat_tag} [{lang}]", strat_fig_path)
        # Evaluate on this language's test subset
        preds = model.predict(lang_test_X[lang]).flatten()
        acc, bal, prec, rec, spec, f1, auc = evaluate_metrics(lang_test_y[lang], preds)
        strategy_results.append([strategy_name, lang, acc, bal, prec, rec, spec, f1, auc])
        # Eval plots for this language
        plot_eval_charts(lang_test_y[lang], preds, f"{strat_tag} [{lang}]", strat_fig_path)
        print(f" Acc={acc:.4f} F1={f1:.4f} AUC={auc:.4f}")
    # ---- Full phase ------------------------------------------------------
    # Final fine-tune on the whole (pre-shuffled) training set; note the
    # larger batch size (64) than in the language phases (32).
    print(f" Training on: Full Dataset ({X_full_shuffled.shape[0]} samples, shuffled)")
    history_full = model.fit(
        X_full_shuffled, y_full_shuffled,
        validation_data=(X_val_seq, y_val),
        epochs=8,
        batch_size=64,
        verbose=1
    )
    # Train/Val accuracy + loss curves for full phase
    plot_training_curves(history_full, f"{strat_tag} [Full]", strat_fig_path)
    # Evaluate on full test set
    preds_full = model.predict(X_test_seq).flatten()
    acc, bal, prec, rec, spec, f1, auc = evaluate_metrics(y_test.values, preds_full)
    strategy_results.append([strategy_name, "Full", acc, bal, prec, rec, spec, f1, auc])
    # Eval plots for full phase
    plot_eval_charts(y_test.values, preds_full, f"{strat_tag} [Full]", strat_fig_path)
    print(f" Acc={acc:.4f} F1={f1:.4f} AUC={auc:.4f}")
    # Save per-strategy results table (4 rows: 3 langs + Full)
    strat_df = pd.DataFrame(strategy_results, columns=cols)
    strat_df.to_csv(
        base_path + f"/results_tables/{strat_tag}_results.csv",
        index=False
    )
    print(f"\n Results for strategy: {strategy_name}")
    print(strat_df.to_string(index=False))
    # Persist the fully fine-tuned model (HDF5 format).
    model.save(base_path + f"/trained_models/planB_{strat_tag}_Full.h5")
    print(f" Saved model: planB_{strat_tag}_Full.h5")
# ============================================================
# COMBINED RESULTS TABLE (all 6 strategies x 4 phases = 24 rows)
# ============================================================
# Re-read each strategy's per-phase CSV (written inside the training loop)
# and stack them into a single combined table.
combined_frames = []
for perm in permutations(languages):
    strat_csv = base_path + f"/results_tables/{'_to_'.join(perm)}_results.csv"
    combined_frames.append(pd.read_csv(strat_csv))
combined_df = pd.concat(combined_frames, ignore_index=True)
combined_df.to_csv(base_path + "/results_tables/all_strategies_results.csv", index=False)

print("\n" + "=" * 60)
# Fixed mojibake in the banner: the original printed the UTF-8 em dash
# mis-decoded as "β€”".
print("ALL STRATEGIES — COMBINED RESULTS")
print("=" * 60)
print(combined_df.to_string(index=False))