# SASCv2 / main_v2.py
# (Hugging Face page-header artifacts removed from the scraped copy:
#  uploader "tuklu", commit message "Add scripts", commit 7e5f759 verified)
# -*- coding: utf-8 -*-
"""
Strategy: Hinglish -> Hindi -> English -> Full
- 50 epochs per phase (200 total)
- Evaluate on each individual language + full after every phase
- All figures: figsize=(8,6), dpi=300
- Output dir: /root/output_v2 (old output_v1 untouched)
"""
import os
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
precision_score, recall_score, f1_score,
roc_auc_score, confusion_matrix,
roc_curve, precision_recall_curve)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
# ── Paths ────────────────────────────────────────────────────────────────────
base_path = "/root/output_v2"
data_path = "/root/dataset.csv"
glove_path = "/root/glove.6B.300d.txt"

# Create every output sub-directory up front so later saves never fail.
_subdirs = ("dataset_splits", "figures", "results_tables", "trained_models")
for _sub in _subdirs:
    os.makedirs(os.path.join(base_path, _sub), exist_ok=True)

# ── Load data ────────────────────────────────────────────────────────────────
df = pd.read_csv(data_path)

# Language distribution pie chart for the raw dataset.
plt.figure(figsize=(8, 6))
df['language'].value_counts().plot.pie(autopct='%1.1f%%')
plt.title("Dataset Language Distribution")
plt.ylabel("")
plt.savefig(os.path.join(base_path, "figures", "language_distribution.png"),
            dpi=300, bbox_inches="tight")
plt.close()

# Features, binary labels and per-row language tags used throughout.
X = df["clean_text"]
y = df["hate_label"]
lang = df["language"]
# ── Splits ───────────────────────────────────────────────────────────────────
# 70/30 first, then carve a validation slice out of the 70% (0.1428 of it,
# i.e. roughly 10% of the full dataset).  Both splits stratify on the label.
X_temp, X_test, y_temp, y_test, lang_temp, lang_test = train_test_split(
    X, y, lang, test_size=0.30, stratify=y, random_state=42)
X_train, X_val, y_train, y_val, lang_train, lang_val = train_test_split(
    X_temp, y_temp, lang_temp,
    test_size=0.1428, stratify=y_temp, random_state=42)

# Persist each split for reproducibility.
_split_dir = os.path.join(base_path, "dataset_splits")
for _name, _x, _y, _l in (("train.csv", X_train, y_train, lang_train),
                          ("val.csv", X_val, y_val, lang_val),
                          ("test.csv", X_test, y_test, lang_test)):
    pd.DataFrame({"text": _x, "label": _y, "lang": _l}).to_csv(
        os.path.join(_split_dir, _name), index=False)
# ── Tokenise & pad ───────────────────────────────────────────────────────────
MAX_LEN = 100
VOCAB = 50000

# Fit the vocabulary on the training split only, then encode every split to
# fixed-length integer sequences.
tokenizer = Tokenizer(num_words=VOCAB)
tokenizer.fit_on_texts(X_train)


def _encode(texts):
    # Integer-encode then left-pad/truncate to MAX_LEN tokens.
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_LEN)


X_train_seq = _encode(X_train)
X_val_seq = _encode(X_val)
X_test_seq = _encode(X_test)
# ── GloVe embeddings ─────────────────────────────────────────────────────────
EMBEDDING_DIM = 300
print("Loading GloVe …")
embeddings_index = {}
with open(glove_path, encoding="utf8") as f:
    for line in f:
        values = line.rstrip().split(" ")
        # Guard: a valid entry is exactly one token followed by
        # EMBEDDING_DIM floats.  The original bare split() would mis-assign
        # fields (or raise in np.asarray) on malformed lines or on GloVe
        # variants whose tokens contain spaces — skip such lines instead.
        if len(values) != EMBEDDING_DIM + 1:
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype="float32")
print(f"Loaded {len(embeddings_index):,} word vectors.")

# Build the weight matrix aligned with the tokenizer's word indices.
# Row 0 is reserved for padding and stays all-zero; words without a GloVe
# vector also stay all-zero.
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    vec = embeddings_index.get(word)
    if vec is not None:
        embedding_matrix[i] = vec
# ── Per-language test subsets ────────────────────────────────────────────────
# Index the padded test sequences / labels by the language tag of each row.
languages = ["english", "hindi", "hinglish"]
lang_test_X, lang_test_y = {}, {}
for _la in languages:
    _mask = lang_test.values == _la
    lang_test_X[_la] = X_test_seq[_mask]
    lang_test_y[_la] = y_test.values[_mask]
# ── Helpers ──────────────────────────────────────────────────────────────────
def build_model():
    """Create and compile the BiLSTM binary classifier.

    Frozen GloVe embeddings (module-level ``embedding_matrix``) feed one
    bidirectional LSTM, followed by dropout, a ReLU dense layer and a
    sigmoid output for hate / non-hate prediction.
    """
    net = Sequential()
    net.add(Embedding(len(word_index) + 1, EMBEDDING_DIM,
                      weights=[embedding_matrix],
                      input_length=MAX_LEN,
                      trainable=False))
    net.add(Bidirectional(LSTM(128)))
    net.add(Dropout(0.5))
    net.add(Dense(64, activation="relu"))
    net.add(Dense(1, activation="sigmoid"))
    net.compile(optimizer="adam", loss="binary_crossentropy",
                metrics=["accuracy"])
    return net
def evaluate_metrics(y_true, y_pred_prob):
    """Compute binary-classification metrics from positive-class probabilities.

    Parameters
    ----------
    y_true : array-like of 0/1 ground-truth labels.
    y_pred_prob : array-like of predicted probabilities; thresholded at 0.5.

    Returns
    -------
    tuple of (accuracy, balanced_accuracy, precision, recall,
              specificity, f1, roc_auc)
    """
    y_pred = (y_pred_prob > 0.5).astype(int)
    acc = accuracy_score(y_true, y_pred)
    bal = balanced_accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    auc = roc_auc_score(y_true, y_pred_prob)
    # labels=[0, 1] forces a 2x2 matrix even when only one class appears in
    # y_true/y_pred; without it the 4-way unpack raises ValueError on the
    # 1x1 matrix sklearn would return.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    # Specificity is undefined with no actual negatives; report 0.0 rather
    # than raising ZeroDivisionError (mirrors zero_division=0 above).
    spec = tn / (tn + fp) if (tn + fp) else 0.0
    return acc, bal, prec, rec, spec, f1, auc
def safe_tag(s):
    """Return *s* as a filesystem-friendly tag.

    Arrow separators (" -> ") become "_to_" first, then any remaining
    spaces become underscores.
    """
    tag = s.replace(" -> ", "_to_")
    return tag.replace(" ", "_")
def plot_training_curves(history, tag, fig_dir):
    """Save side-by-side accuracy and loss curves for one training phase.

    The figure is written to ``<fig_dir>/<safe_tag(tag)>_curves.png`` at
    300 dpi and closed afterwards.
    """
    hist = history.history
    fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(8, 6))

    ax_acc.plot(hist['accuracy'], label="Train Acc")
    ax_acc.plot(hist['val_accuracy'], label="Val Acc")
    ax_acc.set_title(f"{tag} — Accuracy")
    ax_acc.set_xlabel("Epoch")
    ax_acc.set_ylabel("Accuracy")
    ax_acc.legend()
    ax_acc.grid(True)

    ax_loss.plot(hist['loss'], label="Train Loss")
    ax_loss.plot(hist['val_loss'], label="Val Loss")
    ax_loss.set_title(f"{tag} — Loss")
    ax_loss.set_xlabel("Epoch")
    ax_loss.set_ylabel("Loss")
    ax_loss.legend()
    ax_loss.grid(True)

    plt.tight_layout()
    plt.savefig(os.path.join(fig_dir, f"{safe_tag(tag)}_curves.png"),
                dpi=300, bbox_inches="tight")
    plt.close()
def plot_eval_charts(y_true, preds, tag, fig_dir):
    """Save four evaluation figures for one (phase, eval-set) pair.

    Produces confusion-matrix, ROC, precision-recall and F1-vs-threshold
    plots named ``<safe_tag(tag)>_{cm,roc,pr,f1}.png`` in *fig_dir*.
    """
    ftag = safe_tag(tag)

    def _save(suffix):
        # Write the current figure at 300 dpi and release it.
        plt.savefig(os.path.join(fig_dir, f"{ftag}_{suffix}.png"),
                    dpi=300, bbox_inches="tight")
        plt.close()

    # Confusion matrix (probabilities thresholded at 0.5).
    y_hat = (preds > 0.5).astype(int)
    cm = confusion_matrix(y_true, y_hat)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=["Non-Hate", "Hate"],
                yticklabels=["Non-Hate", "Hate"])
    plt.title(f"{tag} — Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    _save("cm")

    # ROC curve with AUC in the legend.
    fpr, tpr, _ = roc_curve(y_true, preds)
    auc_val = roc_auc_score(y_true, preds)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f"AUC={auc_val:.4f}")
    plt.plot([0, 1], [0, 1], '--')
    plt.title(f"{tag} — ROC Curve")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend()
    plt.grid(True)
    _save("roc")

    # Precision-recall curve.
    precision, recall, thresholds = precision_recall_curve(y_true, preds)
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision)
    plt.title(f"{tag} — Precision-Recall Curve")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.grid(True)
    _save("pr")

    # F1 as a function of the decision threshold (epsilon avoids 0/0);
    # precision/recall have one more entry than thresholds, hence [:-1].
    f1_scores = (2 * precision * recall) / (precision + recall + 1e-8)
    plt.figure(figsize=(8, 6))
    plt.plot(thresholds, f1_scores[:-1])
    plt.title(f"{tag} — F1 Score vs Threshold")
    plt.xlabel("Threshold")
    plt.ylabel("F1 Score")
    plt.grid(True)
    _save("f1")
# ── Strategy ─────────────────────────────────────────────────────────────────
STRATEGY = ("hinglish", "hindi", "english")
EPOCHS = 50
BATCH_LANG = 32   # batch size during single-language phases
BATCH_FULL = 64   # batch size during the final full-data phase

strategy_name = " -> ".join(STRATEGY) + " -> Full"
_banner = "=" * 60
print("\n" + _banner)
print(f"Strategy: {strategy_name}")
print(f"Epochs per phase: {EPOCHS} (Total: {EPOCHS * 4})")
print(_banner)

fig_dir = os.path.join(base_path, "figures", safe_tag(" -> ".join(STRATEGY)))
os.makedirs(fig_dir, exist_ok=True)

# Full training data, pre-shuffled once with a fixed seed so the final
# phase always sees the same ordering.
np.random.seed(42)
shuffle_idx = np.random.permutation(len(X_train_seq))
X_full_shuffled = np.ascontiguousarray(X_train_seq[shuffle_idx], dtype=np.int32)
y_full_shuffled = np.ascontiguousarray(y_train.values[shuffle_idx], dtype=np.float32)

# Results accumulator: one row per (training phase, evaluation set).
cols = ["Phase", "Eval_On", "Accuracy", "Balanced_Acc",
        "Precision", "Recall", "Specificity", "F1", "ROC_AUC"]
all_rows = []

model = build_model()
model.summary()
# ── Language phases ──────────────────────────────────────────────────────────
# Train sequentially on each language (curriculum order = STRATEGY), and
# after every phase evaluate on each language's test subset plus the full
# test set.  The same model instance is carried across phases.
for phase_lang in STRATEGY:
    phase_mask = (lang_train == phase_lang)
    X_lang = X_train_seq[phase_mask]
    y_lang = y_train[phase_mask]

    _rule = '─' * 50
    print("\n" + _rule)
    print(f"Phase: training on '{phase_lang}' ({X_lang.shape[0]} samples, {EPOCHS} epochs)")
    print(_rule)

    history = model.fit(X_lang, y_lang,
                        validation_data=(X_val_seq, y_val),
                        epochs=EPOCHS,
                        batch_size=BATCH_LANG,
                        verbose=1)
    plot_training_curves(history, f"Phase_{phase_lang}", fig_dir)

    # Evaluate on every individual language subset.
    for eval_lang in languages:
        preds = model.predict(lang_test_X[eval_lang]).flatten()
        metrics = evaluate_metrics(lang_test_y[eval_lang], preds)
        all_rows.append([phase_lang, eval_lang] + list(metrics))
        plot_eval_charts(lang_test_y[eval_lang], preds,
                         f"Phase_{phase_lang}_eval_{eval_lang}", fig_dir)
        print(f" eval on {eval_lang:10s} | Acc={metrics[0]:.4f} F1={metrics[5]:.4f} AUC={metrics[6]:.4f}")

    # Evaluate on the full test set.
    preds_full = model.predict(X_test_seq).flatten()
    metrics_full = evaluate_metrics(y_test.values, preds_full)
    all_rows.append([phase_lang, "full"] + list(metrics_full))
    plot_eval_charts(y_test.values, preds_full,
                     f"Phase_{phase_lang}_eval_full", fig_dir)
    print(f" eval on {'full':10s} | Acc={metrics_full[0]:.4f} F1={metrics_full[5]:.4f} AUC={metrics_full[6]:.4f}")
# ── Full dataset phase ───────────────────────────────────────────────────────
# Final phase: continue training on the whole (pre-shuffled) training set,
# then run the same per-language + full evaluations.
_rule = '─' * 50
print("\n" + _rule)
print(f"Phase: training on Full dataset ({X_full_shuffled.shape[0]} samples, {EPOCHS} epochs)")
print(_rule)

history_full = model.fit(X_full_shuffled, y_full_shuffled,
                         validation_data=(X_val_seq, y_val),
                         epochs=EPOCHS,
                         batch_size=BATCH_FULL,
                         verbose=1)
plot_training_curves(history_full, "Phase_Full", fig_dir)

for eval_lang in languages:
    preds = model.predict(lang_test_X[eval_lang]).flatten()
    metrics = evaluate_metrics(lang_test_y[eval_lang], preds)
    all_rows.append(["Full", eval_lang] + list(metrics))
    plot_eval_charts(lang_test_y[eval_lang], preds,
                     f"Phase_Full_eval_{eval_lang}", fig_dir)
    print(f" eval on {eval_lang:10s} | Acc={metrics[0]:.4f} F1={metrics[5]:.4f} AUC={metrics[6]:.4f}")

preds_full = model.predict(X_test_seq).flatten()
metrics_full = evaluate_metrics(y_test.values, preds_full)
all_rows.append(["Full", "full"] + list(metrics_full))
plot_eval_charts(y_test.values, preds_full, "Phase_Full_eval_full", fig_dir)
print(f" eval on {'full':10s} | Acc={metrics_full[0]:.4f} F1={metrics_full[5]:.4f} AUC={metrics_full[6]:.4f}")
# ── Save results ─────────────────────────────────────────────────────────────
# Persist the metrics table, print it, and save the trained model.
results_df = pd.DataFrame(all_rows, columns=cols)
_results_path = os.path.join(base_path, "results_tables",
                             "hinglish_hindi_english_full_results.csv")
results_df.to_csv(_results_path, index=False)

_banner = "=" * 60
print("\n" + _banner)
print("FINAL RESULTS TABLE")
print(_banner)
print(results_df.to_string(index=False))

model.save(os.path.join(base_path, "trained_models",
                        "hinglish_hindi_english_full.h5"))
print("\nModel saved.")
print("Done.")