Add README, tokenizer, results
Browse files- main.py +354 -0
- predict.py +185 -0
main.py
ADDED
|
@@ -0,0 +1,354 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# -*- coding: utf-8 -*-
"""glove+bilstm.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/10fLw7V6G3vV_STF7KcWe8qcTvyLQq0NT
"""

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import permutations

# For train-test split and evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.metrics import roc_curve, precision_recall_curve

# Deep learning libraries
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM
from tensorflow.keras.layers import Dense, Dropout

base_path = "/root/output"

# Output folders: dataset splits, figures, metric tables, saved models.
os.makedirs(base_path + "/dataset_splits", exist_ok=True)
os.makedirs(base_path + "/figures", exist_ok=True)
os.makedirs(base_path + "/results_tables", exist_ok=True)
os.makedirs(base_path + "/trained_models", exist_ok=True)

data_path = "/root/dataset.csv"

df = pd.read_csv(data_path)

df.head()

# Language distribution of the dataset as a pie chart.
plt.figure(figsize=(6, 4))
df['language'].value_counts().plot.pie(autopct='%1.1f%%')
plt.title("Dataset Language Distribution")
plt.ylabel("")
plt.savefig(base_path + "/figures/language_distribution.png", dpi=300)
plt.show()

X = df["clean_text"]
y = df["hate_label"]
lang = df["language"]

# 30% held out for test, then 0.1428 of the remaining 70% (~10% of the
# whole) for validation -> roughly 60/10/30 train/val/test, label-stratified.
X_temp, X_test, y_temp, y_test, lang_temp, lang_test = train_test_split(
    X, y, lang, test_size=0.30, stratify=y, random_state=42)

X_train, X_val, y_train, y_val, lang_train, lang_val = train_test_split(
    X_temp, y_temp, lang_temp,
    test_size=0.1428,
    stratify=y_temp,
    random_state=42
)

pd.DataFrame({"text": X_train, "label": y_train, "lang": lang_train}).to_csv(
    base_path + "/dataset_splits/train.csv", index=False)

pd.DataFrame({"text": X_val, "label": y_val, "lang": lang_val}).to_csv(
    base_path + "/dataset_splits/val.csv", index=False)

pd.DataFrame({"text": X_test, "label": y_test, "lang": lang_test}).to_csv(
    base_path + "/dataset_splits/test.csv", index=False)

MAX_LEN = 100
VOCAB = 50000

# Fit the vocabulary on the training split only (no test leakage).
tokenizer = Tokenizer(num_words=VOCAB)
tokenizer.fit_on_texts(X_train)

# FIX: persist the fitted tokenizer. predict.py loads "tokenizer.json" from
# the directory next to the saved model; without this save the inference
# script cannot reproduce the training-time vocabulary.
with open(base_path + "/trained_models/tokenizer.json", "w", encoding="utf8") as tok_f:
    tok_f.write(tokenizer.to_json())

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_LEN)
X_val_seq = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=MAX_LEN)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=MAX_LEN)

EMBEDDING_DIM = 300
glove_path = "/root/glove.6B.300d.txt"

embeddings_index = {}

# Parse the GloVe text format: "<word> v1 v2 ... v300" per line.
with open(glove_path, encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = vector

print("Loaded %s word vectors." % len(embeddings_index))

word_index = tokenizer.word_index
embedding_dim = 300

# Row i holds the GloVe vector for tokenizer index i; index 0 (padding)
# and out-of-vocabulary words stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

for word, i in word_index.items():
    vector = embeddings_index.get(word)
    if vector is not None:
        embedding_matrix[i] = vector
+
# ============================================================
# Helper: build a fresh model (called once per permutation)
# ============================================================
def build_model():
    """Create and compile a new BiLSTM classifier on frozen GloVe embeddings.

    Reads the module-level ``word_index``, ``embedding_matrix``,
    ``embedding_dim`` and ``MAX_LEN``; returns a compiled ``Sequential``
    model with a single sigmoid output for binary classification.
    """
    network = Sequential([
        Embedding(
            input_dim=len(word_index) + 1,
            output_dim=embedding_dim,
            weights=[embedding_matrix],
            input_length=MAX_LEN,
            trainable=False,  # keep the pretrained GloVe vectors fixed
        ),
        Bidirectional(LSTM(128)),
        Dropout(0.5),
        Dense(64, activation="relu"),
        Dense(1, activation="sigmoid"),
    ])
    network.compile(optimizer="adam", loss="binary_crossentropy",
                    metrics=["accuracy"])
    return network
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def evaluate_metrics(y_true, y_pred_prob):
    """Compute binary-classification metrics from predicted probabilities.

    Args:
        y_true: array-like of 0/1 ground-truth labels.
        y_pred_prob: numpy array of predicted positive-class probabilities
            (thresholded at 0.5 for the label-based metrics).

    Returns:
        Tuple ``(accuracy, balanced_accuracy, precision, recall,
        specificity, f1, roc_auc)``.
    """
    y_pred = (y_pred_prob > 0.5).astype(int)
    acc = accuracy_score(y_true, y_pred)
    bal = balanced_accuracy_score(y_true, y_pred)
    # zero_division=0 keeps these well-defined (and warning-free) when the
    # model never predicts one of the classes on a small per-language split.
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    auc = roc_auc_score(y_true, y_pred_prob)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    # Guard the manual specificity against tn + fp == 0 (no actual negatives
    # in the split) instead of raising ZeroDivisionError.
    spec = tn / (tn + fp) if (tn + fp) else 0.0
    return acc, bal, prec, rec, spec, f1, auc
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def plot_training_curves(history, tag, base_path):
    """Save accuracy and loss curves for one training phase."""
    fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: train vs. validation accuracy per epoch.
    acc_ax.plot(history.history['accuracy'], label="Train Accuracy")
    acc_ax.plot(history.history['val_accuracy'], label="Val Accuracy")
    acc_ax.set_title(f"{tag} - Accuracy Curve")
    acc_ax.set_xlabel("Epoch")
    acc_ax.set_ylabel("Accuracy")
    acc_ax.legend()
    acc_ax.grid(True)

    # Right panel: train vs. validation loss per epoch.
    loss_ax.plot(history.history['loss'], label="Train Loss")
    loss_ax.plot(history.history['val_loss'], label="Val Loss")
    loss_ax.set_title(f"{tag} - Loss Curve")
    loss_ax.set_xlabel("Epoch")
    loss_ax.set_ylabel("Loss")
    loss_ax.legend()
    loss_ax.grid(True)

    plt.tight_layout()
    # Filesystem-safe name derived from the phase tag.
    fname = tag.replace(" -> ", "_to_").replace(" ", "_")
    plt.savefig(os.path.join(base_path, f"{fname}_curves.png"), dpi=300)
    plt.show()
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def plot_eval_charts(y_test, preds, tag, base_path):
    """Save confusion matrix, ROC, PR, and F1 curves after evaluation.

    Args:
        y_test: ground-truth 0/1 labels for the evaluated split.
        preds: predicted positive-class probabilities, same length as y_test.
        tag: human-readable phase label used in titles and file names.
        base_path: directory the four PNG figures are written into.
    """
    # Filesystem-safe figure-name prefix derived from the phase tag.
    fname = tag.replace(" -> ", "_to_").replace(" ", "_")

    # Confusion Matrix (probabilities hard-thresholded at 0.5)
    cm = confusion_matrix(y_test, (preds > 0.5).astype(int))
    plt.figure(figsize=(6,4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=["Non-Hate","Hate"],
                yticklabels=["Non-Hate","Hate"])
    plt.title(f"{tag} - Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.savefig(os.path.join(base_path, f"{fname}_cm.png"), dpi=300)
    plt.show()

    # ROC Curve
    fpr, tpr, _ = roc_curve(y_test, preds)
    auc_val = roc_auc_score(y_test, preds)
    plt.figure(figsize=(6,4))
    plt.plot(fpr, tpr, label=f"AUC={auc_val:.4f}")
    plt.plot([0,1],[0,1],'--')  # chance diagonal for reference
    plt.title(f"{tag} - ROC Curve")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(base_path, f"{fname}_roc.png"), dpi=300)
    plt.show()

    # Precision-Recall Curve
    precision, recall, thresholds = precision_recall_curve(y_test, preds)
    plt.figure(figsize=(6,4))
    plt.plot(recall, precision)
    plt.title(f"{tag} - Precision-Recall Curve")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.grid(True)
    plt.savefig(os.path.join(base_path, f"{fname}_pr.png"), dpi=300)
    plt.show()

    # F1 Curve. The 1e-8 epsilon avoids 0/0 where precision and recall are
    # both zero; precision/recall arrays have one more entry than thresholds
    # (sklearn appends the (1, 0) endpoint), hence the [:-1] slice.
    f1_scores = (2 * precision * recall) / (precision + recall + 1e-8)
    plt.figure(figsize=(6,4))
    plt.plot(thresholds, f1_scores[:-1])
    plt.title(f"{tag} - F1 Score vs Threshold")
    plt.xlabel("Threshold")
    plt.ylabel("F1 Score")
    plt.grid(True)
    plt.savefig(os.path.join(base_path, f"{fname}_f1.png"), dpi=300)
    plt.show()
| 220 |
+
|
| 221 |
+
|
| 222 |
+
# ============================================================
# PLAN B: All 6 permutations + final Full (Shuffled) fine-tune
# After each training phase -> evaluate on that language's test set
# After Full phase -> evaluate on full test set
# ============================================================
print("\n" + "="*60)
print("PLAN B: Sequential Transfer Learning + Full Dataset Fine-tune")
print("="*60)

languages = ["english", "hindi", "hinglish"]

# Pre-shuffle full training data once (same shuffle for all permutations)
# so every strategy's final fine-tune phase sees identical batch order.
np.random.seed(42)
shuffle_idx = np.random.permutation(len(X_train_seq))
X_full_shuffled = np.ascontiguousarray(X_train_seq[shuffle_idx], dtype=np.int32)
y_full_shuffled = np.ascontiguousarray(y_train.values[shuffle_idx], dtype=np.float32)

# Pre-build per-language test splits. Boolean masks are positional over
# X_test_seq / y_test.values, which share row order with lang_test.
lang_test_idx = {
    lang: (lang_test.values == lang)
    for lang in languages
}
lang_test_X = {
    lang: X_test_seq[lang_test_idx[lang]]
    for lang in languages
}
lang_test_y = {
    lang: y_test.values[lang_test_idx[lang]]
    for lang in languages
}

cols = ["Strategy", "Phase", "Accuracy", "Balanced Acc",
        "Precision", "Recall", "Specificity", "F1", "ROC-AUC"]

for perm in permutations(languages):
    perm_name = " -> ".join(perm)
    strategy_name = perm_name + " -> Full"
    strategy_results = []  # one row per phase for this strategy's table

    print(f"\n{'='*50}")
    print(f"Strategy: {strategy_name}")
    print(f"{'='*50}")

    # Make a clean folder per strategy for figures
    strat_tag = perm_name.replace(" -> ", "_to_")
    strat_fig_path = base_path + f"/figures/{strat_tag}"
    os.makedirs(strat_fig_path, exist_ok=True)

    # Model built ONCE -> weights carry forward across all phases
    model = build_model()

    # -- Language phases ---------------------------------------------------
    # NOTE(review): this loop variable rebinds the module-level name `lang`
    # (df["language"] defined near the top of the file); harmless as long as
    # nothing below this loop reads the original Series -- confirm.
    for lang in perm:
        # Boolean mask over the training rows for this language; applied
        # positionally to the padded sequence array.
        idx = (lang_train == lang)
        X_lang = X_train_seq[idx]
        y_lang = y_train[idx]

        print(f" Training on: {lang} ({X_lang.shape[0]} samples)")

        history = model.fit(
            X_lang, y_lang,
            validation_data=(X_val_seq, y_val),
            epochs=8,
            batch_size=32,
            verbose=1
        )

        # Train/Val accuracy + loss curves
        plot_training_curves(history, f"{strat_tag} [{lang}]", strat_fig_path)

        # Evaluate on this language's test subset
        preds = model.predict(lang_test_X[lang]).flatten()
        acc, bal, prec, rec, spec, f1, auc = evaluate_metrics(lang_test_y[lang], preds)
        strategy_results.append([strategy_name, lang, acc, bal, prec, rec, spec, f1, auc])

        # Eval plots for this language
        plot_eval_charts(lang_test_y[lang], preds, f"{strat_tag} [{lang}]", strat_fig_path)

        print(f" Acc={acc:.4f} F1={f1:.4f} AUC={auc:.4f}")

    # -- Full phase: fine-tune on the whole (pre-shuffled) training set ----
    print(f" Training on: Full Dataset ({X_full_shuffled.shape[0]} samples, shuffled)")

    history_full = model.fit(
        X_full_shuffled, y_full_shuffled,
        validation_data=(X_val_seq, y_val),
        epochs=8,
        batch_size=64,
        verbose=1
    )

    # Train/Val accuracy + loss curves for full phase
    plot_training_curves(history_full, f"{strat_tag} [Full]", strat_fig_path)

    # Evaluate on full test set
    preds_full = model.predict(X_test_seq).flatten()
    acc, bal, prec, rec, spec, f1, auc = evaluate_metrics(y_test.values, preds_full)
    strategy_results.append([strategy_name, "Full", acc, bal, prec, rec, spec, f1, auc])

    # Eval plots for full phase
    plot_eval_charts(y_test.values, preds_full, f"{strat_tag} [Full]", strat_fig_path)

    print(f" Acc={acc:.4f} F1={f1:.4f} AUC={auc:.4f}")

    # Save per-strategy results table (4 rows: 3 langs + Full)
    strat_df = pd.DataFrame(strategy_results, columns=cols)
    strat_df.to_csv(
        base_path + f"/results_tables/{strat_tag}_results.csv",
        index=False
    )

    print(f"\n Results for strategy: {strategy_name}")
    print(strat_df.to_string(index=False))

    model.save(base_path + f"/trained_models/planB_{strat_tag}_Full.h5")
    print(f" Saved model: planB_{strat_tag}_Full.h5")


# ============================================================
# COMBINED RESULTS TABLE (all 6 strategies x 4 phases = 24 rows)
# ============================================================
# Re-derive each per-strategy CSV path written inside the loop above.
all_csv = [
    base_path + f"/results_tables/{('_to_'.join(perm))}_results.csv"
    for perm in permutations(languages)
]

combined_df = pd.concat([pd.read_csv(f) for f in all_csv], ignore_index=True)
combined_df.to_csv(base_path + "/results_tables/all_strategies_results.csv", index=False)

print("\n" + "="*60)
print("ALL STRATEGIES β COMBINED RESULTS")
print("="*60)
print(combined_df.to_string(index=False))
predict.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""
predict.py -- Interactive inference script for the SASC hate speech detection model.

Usage:
    python predict.py                           # fully interactive
    python predict.py --model model.h5          # specify model path
    python predict.py --input texts.csv         # specify input CSV
    python predict.py --text "some text here"   # single text prediction
"""

import os
import sys
import argparse
import json  # NOTE(review): not used anywhere in this script -- confirm before removing


# -- Argument parsing: every flag is optional; anything missing is asked for
# -- interactively further down.
parser = argparse.ArgumentParser(description="SASC Hate Speech Detector")
parser.add_argument("--model", type=str, help="Path to .h5 model file")
parser.add_argument("--tokenizer", type=str, help="Path to tokenizer.json")
parser.add_argument("--input", type=str, help="Path to input CSV file")
parser.add_argument("--text", type=str, help="Single text to classify")
parser.add_argument("--output", type=str, help="Path to save results CSV")
parser.add_argument("--threshold", type=float, default=0.5, help="Decision threshold (default: 0.5)")
parser.add_argument("--col", type=str, default="text", help="Column name in CSV containing text (default: text)")
args = parser.parse_args()
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# ββ Interactive prompts if args not provided βββββββββββββββββββββββββββββββββ
|
| 31 |
+
def ask(prompt, default=None):
    """Prompt on stdin; return the stripped reply, or *default* when empty."""
    hint = f" [{default}]" if default else ""
    reply = input(f"{prompt}{hint}: ").strip()
    if reply:
        return reply
    return default
| 35 |
+
|
| 36 |
+
|
| 37 |
+
print("\n=== SASC Hate Speech Detector ===\n")

# Model path: CLI flag wins; otherwise prompt (default "model.h5").
model_path = args.model
if not model_path:
    model_path = ask("Model path (.h5)", "model.h5")

if not os.path.exists(model_path):
    print(f"Model not found: {model_path}")
    sys.exit(1)

# Tokenizer path: CLI flag wins; otherwise prefer a tokenizer.json that
# sits next to the model file before falling back to the cwd default.
tokenizer_path = args.tokenizer
if not tokenizer_path:
    # look next to model file first
    candidate = os.path.join(os.path.dirname(model_path), "tokenizer.json")
    tokenizer_path = ask("Tokenizer path", candidate if os.path.exists(candidate) else "tokenizer.json")

if not os.path.exists(tokenizer_path):
    print(f"Tokenizer not found: {tokenizer_path}")
    sys.exit(1)

# Threshold
threshold = args.threshold
# NOTE(review): args.threshold defaults to 0.5 (truthy), so this falsy check
# only fires when the user passes --threshold 0 explicitly -- the interactive
# prompt is effectively dead code, and an explicit 0.0 gets re-prompted.
# Fixing this properly requires default=None on the argparse flag; confirm
# intended behavior before changing.
if not args.threshold and not args.text and not args.input:
    t = ask("Decision threshold (0.0-1.0)", "0.5")
    try:
        threshold = float(t)
    except ValueError:
        threshold = 0.5

print(f"\nLoading model from {model_path}...")
# TensorFlow import is deferred until after the path checks so that bad
# paths fail fast without paying the heavy import cost.
import tensorflow as tf
model = tf.keras.models.load_model(model_path)

print(f"Loading tokenizer from {tokenizer_path}...")
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
with open(tokenizer_path) as f:
    tokenizer = tokenizer_from_json(f.read())

# Must match the MAX_LEN the model was trained with (see main.py).
MAX_LEN = 100
| 79 |
+
|
| 80 |
+
def predict(texts):
    """Tokenize, pad and score *texts*.

    Returns a pair ``(probs, labels)``: the flattened model probabilities
    and the corresponding "Hate Speech" / "Non-Hate" strings, using the
    module-level ``tokenizer``, ``model``, ``MAX_LEN`` and ``threshold``.
    """
    padded = pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_LEN)
    probs = model.predict(padded, verbose=0).flatten()
    labels = []
    for p in probs:
        labels.append("Hate Speech" if p > threshold else "Non-Hate")
    return probs, labels
| 86 |
+
|
| 87 |
+
|
| 88 |
+
# ββ Single text mode ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 89 |
+
# -- Single text mode: classify one string and exit ---------------------------
if args.text:
    probs, labels = predict([args.text])
    print(f"\nText : {args.text}")
    print(f"Label : {labels[0]}")
    print(f"Confidence: {probs[0]:.4f}")
    sys.exit(0)


# -- CSV mode (or interactive manual entry) -----------------------------------
import pandas as pd

input_path = args.input
if not input_path:
    mode = ask("Input mode β (1) CSV file (2) Type text manually", "1")

    if mode == "2":
        # manual text entry loop; 'done' (any case) terminates input
        print("\nEnter texts one per line. Type 'done' when finished.\n")
        texts = []
        while True:
            t = input(" Text: ").strip()
            if t.lower() == "done":
                break
            if t:
                texts.append(t)

        if not texts:
            print("No texts entered.")
            sys.exit(0)

        probs, labels = predict(texts)
        import pandas as pd  # NOTE(review): redundant -- pandas is already imported above
        results = pd.DataFrame({
            "text": texts,
            "label": labels,
            "confidence": [round(float(p), 4) for p in probs]
        })

        print("\n" + "="*60)
        print(results.to_string(index=False))
        print("="*60)

        # Optional save; an empty reply skips it.
        out = args.output or ask("Save results to CSV? (leave blank to skip)", "")
        if out:
            results.to_csv(out, index=False)
            print(f"Saved to {out}")
        sys.exit(0)

    else:
        input_path = ask("CSV file path")

if not os.path.exists(input_path):
    print(f"File not found: {input_path}")
    sys.exit(1)

df = pd.read_csv(input_path)
print(f"\nLoaded {len(df)} rows from {input_path}")
print(f"Columns: {list(df.columns)}")

# Resolve the text column: use --col if present in the CSV, otherwise ask
# (offering the first column as the default).
text_col = args.col
if text_col not in df.columns:
    print(f"\nColumn '{text_col}' not found.")
    text_col = ask(f"Which column contains the text?", df.columns[0])

print(f"\nRunning inference on column '{text_col}' with threshold={threshold}...")

# NaN cells become empty strings so every row gets a prediction.
texts = df[text_col].fillna("").astype(str).tolist()
probs, labels = predict(texts)

df["predicted_label"] = labels
df["confidence"] = [round(float(p), 4) for p in probs]

# Summary
hate_count = labels.count("Hate Speech")
nonhate_count = labels.count("Non-Hate")
print(f"\n{'='*60}")
print(f"Results Summary")
print(f"{'='*60}")
print(f" Total samples : {len(texts)}")
print(f" Hate Speech : {hate_count} ({hate_count/len(texts)*100:.1f}%)")
print(f" Non-Hate : {nonhate_count} ({nonhate_count/len(texts)*100:.1f}%)")
print(f" Threshold : {threshold}")
print(f"{'='*60}")

# Show sample
print(f"\nSample predictions (first 10):")
print(df[[text_col, "predicted_label", "confidence"]].head(10).to_string(index=False))

# Save: default output name is "<input>_predictions.csv".
# NOTE(review): str.replace silently leaves the name unchanged if the input
# path does not end in ".csv" -- confirm whether that matters to callers.
output_path = args.output
if not output_path:
    default_out = input_path.replace(".csv", "_predictions.csv")
    output_path = ask(f"\nSave full results to CSV", default_out)

if output_path:
    df.to_csv(output_path, index=False)
    print(f"\nSaved {len(df)} predictions to {output_path}")