# SASC / main.py
# (HuggingFace page residue, commented out so the file parses)
# tuklu's picture
# Add README, tokenizer, results
# 47bafb1 verified
# -*- coding: utf-8 -*-
"""glove+bilstm.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/10fLw7V6G3vV_STF7KcWe8qcTvyLQq0NT
"""
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import permutations
# For train-test split and evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.metrics import roc_curve, precision_recall_curve
# Deep learning libraries
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM
from tensorflow.keras.layers import Dense, Dropout
# Output directory layout: splits, figures, metric tables, saved models.
base_path = "/root/output"
for sub in ("dataset_splits", "figures", "results_tables", "trained_models"):
    os.makedirs(os.path.join(base_path, sub), exist_ok=True)

# Load the preprocessed dataset (expects clean_text / hate_label / language columns).
data_path = "/root/dataset.csv"
df = pd.read_csv(data_path)
df.head()
# Visualize the language mix as a pie chart and persist it to the figures dir.
plt.figure(figsize=(6, 4))
df["language"].value_counts().plot.pie(autopct="%1.1f%%")
plt.title("Dataset Language Distribution")
plt.ylabel("")
plt.savefig(base_path + "/figures/language_distribution.png", dpi=300)
plt.show()

# Features, binary targets, and per-row language tags used for the splits below.
X = df["clean_text"]
y = df["hate_label"]
lang = df["language"]
# Stratified 70/30 split into (train+val) and test, carrying language tags along.
X_temp, X_test, y_temp, y_test, lang_temp, lang_test = train_test_split(
    X, y, lang, test_size=0.30, stratify=y, random_state=42)

# 0.1428 of the remaining 70% is ~10% of the full data -> roughly 60/10/30 overall.
X_train, X_val, y_train, y_val, lang_train, lang_val = train_test_split(
    X_temp, y_temp, lang_temp,
    test_size=0.1428,
    stratify=y_temp,
    random_state=42,
)

# Persist each split as a text/label/lang CSV for reproducibility.
_splits = {
    "train": (X_train, y_train, lang_train),
    "val": (X_val, y_val, lang_val),
    "test": (X_test, y_test, lang_test),
}
for _name, (_txt, _lab, _lng) in _splits.items():
    pd.DataFrame({"text": _txt, "label": _lab, "lang": _lng}).to_csv(
        base_path + f"/dataset_splits/{_name}.csv", index=False)
# Fit the tokenizer on training text only (no leakage), then convert every
# split to fixed-length integer id sequences (pre-padded/truncated to MAX_LEN,
# the Keras default).
MAX_LEN = 100
VOCAB = 50000
tokenizer = Tokenizer(num_words=VOCAB)
tokenizer.fit_on_texts(X_train)

def _to_padded(texts):
    # Helper: raw texts -> padded id sequences via the shared tokenizer.
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_LEN)

X_train_seq = _to_padded(X_train)
X_val_seq = _to_padded(X_val)
X_test_seq = _to_padded(X_test)
EMBEDDING_DIM = 300
glove_path = "/root/glove.6B.300d.txt"

# Parse the GloVe text file: one token followed by 300 floats per line.
embeddings_index = {}
with open(glove_path, encoding="utf8") as f:
    for line in f:
        token, *coeffs = line.split()
        embeddings_index[token] = np.asarray(coeffs, dtype="float32")
print("Loaded %s word vectors." % len(embeddings_index))

# Row i holds the pretrained vector for tokenizer word id i; words without a
# GloVe vector keep an all-zero row.
word_index = tokenizer.word_index
embedding_dim = 300
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for token, row in word_index.items():
    pretrained = embeddings_index.get(token)
    if pretrained is not None:
        embedding_matrix[row] = pretrained
# ============================================================
# Helper: build a fresh model (called once per permutation)
# ============================================================
def build_model():
    """Create and compile a fresh BiLSTM binary classifier on frozen GloVe embeddings.

    Returns a compiled ``Sequential`` model; called once per strategy so each
    permutation starts from freshly initialized LSTM/Dense weights.
    """
    model = Sequential([
        Embedding(
            input_dim=len(word_index) + 1,
            output_dim=embedding_dim,
            weights=[embedding_matrix],  # pretrained GloVe rows
            input_length=MAX_LEN,
            trainable=False,             # keep embeddings frozen
        ),
        Bidirectional(LSTM(128)),
        Dropout(0.5),
        Dense(64, activation="relu"),
        Dense(1, activation="sigmoid"),  # binary hate / non-hate output
    ])
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model
def evaluate_metrics(y_true, y_pred_prob):
    """Compute binary-classification metrics at a 0.5 decision threshold.

    Parameters
    ----------
    y_true : array-like of {0, 1}
        Ground-truth labels.
    y_pred_prob : array-like of float
        Predicted positive-class probabilities.

    Returns
    -------
    tuple
        (accuracy, balanced accuracy, precision, recall, specificity, F1, ROC-AUC).
    """
    y_pred = (y_pred_prob > 0.5).astype(int)
    acc = accuracy_score(y_true, y_pred)
    bal = balanced_accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred_prob)
    # labels=[0, 1] guarantees a full 2x2 matrix even when one class is absent
    # from y_true/y_pred, so ravel() always yields four values (the original
    # unpacking crashed in that case).
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    # Guard the specificity denominator: no actual negatives -> report 0.0
    # instead of raising ZeroDivisionError.
    spec = tn / (tn + fp) if (tn + fp) else 0.0
    return acc, bal, prec, rec, spec, f1, auc
def plot_training_curves(history, tag, base_path):
    """Plot and save side-by-side train/val accuracy and loss curves.

    The figure is written as ``<sanitized tag>_curves.png`` under *base_path*
    and then displayed.
    """
    fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(14, 5))
    panels = [
        (acc_ax, "accuracy", "val_accuracy", "Accuracy",
         "Train Accuracy", "Val Accuracy"),
        (loss_ax, "loss", "val_loss", "Loss",
         "Train Loss", "Val Loss"),
    ]
    for ax, train_key, val_key, metric, train_label, val_label in panels:
        ax.plot(history.history[train_key], label=train_label)
        ax.plot(history.history[val_key], label=val_label)
        ax.set_title(f"{tag} - {metric} Curve")
        ax.set_xlabel("Epoch")
        ax.set_ylabel(metric)
        ax.legend()
        ax.grid(True)
    plt.tight_layout()
    fname = tag.replace(" -> ", "_to_").replace(" ", "_")
    plt.savefig(os.path.join(base_path, f"{fname}_curves.png"), dpi=300)
    plt.show()
def plot_eval_charts(y_test, preds, tag, base_path):
    """Save confusion-matrix, ROC, precision-recall and F1-vs-threshold plots.

    Each chart is written as ``<sanitized tag>_<suffix>.png`` under
    *base_path* and displayed after saving.
    """
    fname = tag.replace(" -> ", "_to_").replace(" ", "_")

    def _save_and_show(suffix):
        # Persist the current figure next to the strategy's other charts.
        plt.savefig(os.path.join(base_path, f"{fname}_{suffix}.png"), dpi=300)
        plt.show()

    # Confusion matrix at the 0.5 decision threshold.
    cm = confusion_matrix(y_test, (preds > 0.5).astype(int))
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=["Non-Hate", "Hate"],
                yticklabels=["Non-Hate", "Hate"])
    plt.title(f"{tag} - Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    _save_and_show("cm")

    # ROC curve with the AUC shown in the legend.
    fpr, tpr, _ = roc_curve(y_test, preds)
    auc_val = roc_auc_score(y_test, preds)
    plt.figure(figsize=(6, 4))
    plt.plot(fpr, tpr, label=f"AUC={auc_val:.4f}")
    plt.plot([0, 1], [0, 1], '--')
    plt.title(f"{tag} - ROC Curve")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend()
    plt.grid(True)
    _save_and_show("roc")

    # Precision-recall trade-off across all thresholds.
    precision, recall, thresholds = precision_recall_curve(y_test, preds)
    plt.figure(figsize=(6, 4))
    plt.plot(recall, precision)
    plt.title(f"{tag} - Precision-Recall Curve")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.grid(True)
    _save_and_show("pr")

    # F1 as a function of the threshold (epsilon avoids 0/0; thresholds has
    # one fewer entry than precision/recall, hence the [:-1] slice).
    f1_scores = (2 * precision * recall) / (precision + recall + 1e-8)
    plt.figure(figsize=(6, 4))
    plt.plot(thresholds, f1_scores[:-1])
    plt.title(f"{tag} - F1 Score vs Threshold")
    plt.xlabel("Threshold")
    plt.ylabel("F1 Score")
    plt.grid(True)
    _save_and_show("f1")
# ============================================================
# PLAN B: All 6 permutations + final Full (Shuffled) fine-tune
# After each training phase β†’ evaluate on that language's test set
# After Full phase β†’ evaluate on full test set
# ============================================================
banner = "=" * 60
print("\n" + banner)
print("PLAN B: Sequential Transfer Learning + Full Dataset Fine-tune")
print(banner)

languages = ["english", "hindi", "hinglish"]

# One fixed shuffle of the full training data, computed once so the final
# "Full" fine-tune phase is identical across all permutations.
np.random.seed(42)
shuffle_idx = np.random.permutation(len(X_train_seq))
X_full_shuffled = np.ascontiguousarray(X_train_seq[shuffle_idx], dtype=np.int32)
y_full_shuffled = np.ascontiguousarray(y_train.values[shuffle_idx], dtype=np.float32)

# Per-language boolean masks over the test set plus the matching
# feature/label subsets, all built once up front.
lang_test_idx = {}
lang_test_X = {}
lang_test_y = {}
for language in languages:
    mask = (lang_test.values == language)
    lang_test_idx[language] = mask
    lang_test_X[language] = X_test_seq[mask]
    lang_test_y[language] = y_test.values[mask]

# Column layout shared by the per-strategy and combined results tables.
cols = ["Strategy", "Phase", "Accuracy", "Balanced Acc",
        "Precision", "Recall", "Specificity", "F1", "ROC-AUC"]
# One strategy per ordering of the three languages: train sequentially on each
# language's subset, then fine-tune on the full shuffled training set.  The
# same model instance is trained across all phases, so weights carry forward.
for perm in permutations(languages):
    perm_name = " -> ".join(perm)
    strategy_name = perm_name + " -> Full"
    strategy_results = []  # one row per phase: 3 language phases + 1 Full phase
    print(f"\n{'='*50}")
    print(f"Strategy: {strategy_name}")
    print(f"{'='*50}")
    # Make a clean folder per strategy for figures
    strat_tag = perm_name.replace(" -> ", "_to_")
    strat_fig_path = base_path + f"/figures/{strat_tag}"
    os.makedirs(strat_fig_path, exist_ok=True)
    # Model built ONCE per strategy -- weights carry forward across all phases
    model = build_model()
    # ---- Language phases -------------------------------------------------
    # NOTE(review): this loop variable shadows the module-level `lang` Series
    # defined earlier; harmless here since `lang` is not read again afterwards.
    for lang in perm:
        idx = (lang_train == lang)  # boolean mask selecting this language's rows
        X_lang = X_train_seq[idx]
        y_lang = y_train[idx]
        print(f" Training on: {lang} ({X_lang.shape[0]} samples)")
        history = model.fit(
            X_lang, y_lang,
            validation_data=(X_val_seq, y_val),  # validated on the mixed val set
            epochs=8,
            batch_size=32,
            verbose=1
        )
        # Train/Val accuracy + loss curves
        plot_training_curves(history, f"{strat_tag} [{lang}]", strat_fig_path)
        # Evaluate on this language's test subset
        preds = model.predict(lang_test_X[lang]).flatten()
        acc, bal, prec, rec, spec, f1, auc = evaluate_metrics(lang_test_y[lang], preds)
        strategy_results.append([strategy_name, lang, acc, bal, prec, rec, spec, f1, auc])
        # Eval plots for this language
        plot_eval_charts(lang_test_y[lang], preds, f"{strat_tag} [{lang}]", strat_fig_path)
        print(f" Acc={acc:.4f} F1={f1:.4f} AUC={auc:.4f}")
    # ---- Full phase ------------------------------------------------------
    # Final fine-tune on the whole (pre-shuffled) training set; note the
    # larger batch size (64) than in the language phases (32).
    print(f" Training on: Full Dataset ({X_full_shuffled.shape[0]} samples, shuffled)")
    history_full = model.fit(
        X_full_shuffled, y_full_shuffled,
        validation_data=(X_val_seq, y_val),
        epochs=8,
        batch_size=64,
        verbose=1
    )
    # Train/Val accuracy + loss curves for full phase
    plot_training_curves(history_full, f"{strat_tag} [Full]", strat_fig_path)
    # Evaluate on full test set
    preds_full = model.predict(X_test_seq).flatten()
    acc, bal, prec, rec, spec, f1, auc = evaluate_metrics(y_test.values, preds_full)
    strategy_results.append([strategy_name, "Full", acc, bal, prec, rec, spec, f1, auc])
    # Eval plots for full phase
    plot_eval_charts(y_test.values, preds_full, f"{strat_tag} [Full]", strat_fig_path)
    print(f" Acc={acc:.4f} F1={f1:.4f} AUC={auc:.4f}")
    # Save per-strategy results table (4 rows: 3 langs + Full)
    strat_df = pd.DataFrame(strategy_results, columns=cols)
    strat_df.to_csv(
        base_path + f"/results_tables/{strat_tag}_results.csv",
        index=False
    )
    print(f"\n Results for strategy: {strategy_name}")
    print(strat_df.to_string(index=False))
    # Persist the fully fine-tuned model (HDF5 format).
    model.save(base_path + f"/trained_models/planB_{strat_tag}_Full.h5")
    print(f" Saved model: planB_{strat_tag}_Full.h5")
# ============================================================
# COMBINED RESULTS TABLE (all 6 strategies x 4 phases = 24 rows)
# ============================================================
# Re-read each strategy's per-phase CSV (written inside the training loop)
# and stack them into a single combined table.
combined_frames = []
for perm in permutations(languages):
    strat_csv = base_path + f"/results_tables/{'_to_'.join(perm)}_results.csv"
    combined_frames.append(pd.read_csv(strat_csv))
combined_df = pd.concat(combined_frames, ignore_index=True)
combined_df.to_csv(base_path + "/results_tables/all_strategies_results.csv", index=False)

print("\n" + "=" * 60)
# Fixed mojibake in the banner: the original printed the UTF-8 em dash
# mis-decoded as "β€”".
print("ALL STRATEGIES — COMBINED RESULTS")
print("=" * 60)
print(combined_df.to_string(index=False))