Add scripts
Browse files- main_v2.py +303 -0
- predict.py +206 -0
- pyproject.toml +16 -0
main_v2.py
ADDED
|
@@ -0,0 +1,303 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Strategy: Hinglish -> Hindi -> English -> Full
|
| 4 |
+
- 50 epochs per phase (200 total)
|
| 5 |
+
- Evaluate on each individual language + full after every phase
|
| 6 |
+
- All figures: figsize=(8,6), dpi=300
|
| 7 |
+
- Output dir: /root/output_v2 (old output_v1 untouched)
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
import numpy as np
|
| 12 |
+
import pandas as pd
|
| 13 |
+
import matplotlib
|
| 14 |
+
matplotlib.use("Agg")
|
| 15 |
+
import matplotlib.pyplot as plt
|
| 16 |
+
import seaborn as sns
|
| 17 |
+
|
| 18 |
+
from sklearn.model_selection import train_test_split
|
| 19 |
+
from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
|
| 20 |
+
precision_score, recall_score, f1_score,
|
| 21 |
+
roc_auc_score, confusion_matrix,
|
| 22 |
+
roc_curve, precision_recall_curve)
|
| 23 |
+
|
| 24 |
+
from tensorflow.keras.preprocessing.text import Tokenizer
|
| 25 |
+
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
| 26 |
+
from tensorflow.keras.models import Sequential
|
| 27 |
+
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
|
| 28 |
+
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
|
| 29 |
+
|
| 30 |
+
# ── Paths ────────────────────────────────────────────────────────────────────
base_path = "/root/output_v2"
data_path = "/root/dataset.csv"
glove_path = "/root/glove.6B.300d.txt"

# Create the whole output tree up front (idempotent thanks to exist_ok).
_subdirs = ("dataset_splits", "figures", "results_tables", "trained_models")
for _sub in _subdirs:
    os.makedirs(os.path.join(base_path, _sub), exist_ok=True)
| 37 |
+
|
| 38 |
+
# ── Load data ────────────────────────────────────────────────────────────────
df = pd.read_csv(data_path)

# Language distribution pie chart (written to disk; Agg backend, never shown).
plt.figure(figsize=(8, 6))
lang_counts = df['language'].value_counts()
lang_counts.plot.pie(autopct='%1.1f%%')
plt.title("Dataset Language Distribution")
plt.ylabel("")  # suppress the default "language" y-label
pie_path = os.path.join(base_path, "figures", "language_distribution.png")
plt.savefig(pie_path, dpi=300, bbox_inches="tight")
plt.close()
| 48 |
+
|
| 49 |
+
X = df["clean_text"]
y = df["hate_label"]
lang = df["language"]

# ── Splits ───────────────────────────────────────────────────────────────────
# 70/30 train+val vs test, then carve ~10% of the total out of the 70%
# for validation (0.1428 * 0.70 ≈ 0.10). Both splits stratify on the label.
X_temp, X_test, y_temp, y_test, lang_temp, lang_test = train_test_split(
    X, y, lang, test_size=0.30, stratify=y, random_state=42)

X_train, X_val, y_train, y_val, lang_train, lang_val = train_test_split(
    X_temp, y_temp, lang_temp,
    test_size=0.1428, stratify=y_temp, random_state=42)

# Persist every split as a text/label/lang CSV for reproducibility.
for _name, _texts, _labels, _langs in (
        ("train", X_train, y_train, lang_train),
        ("val", X_val, y_val, lang_val),
        ("test", X_test, y_test, lang_test)):
    pd.DataFrame({"text": _texts, "label": _labels, "lang": _langs}).to_csv(
        os.path.join(base_path, "dataset_splits", f"{_name}.csv"), index=False)
| 67 |
+
|
| 68 |
+
# ── Tokenise & pad ───────────────────────────────────────────────────────────
MAX_LEN = 100    # sequence length after padding/truncation
VOCAB = 50000    # cap on tokenizer vocabulary size

tokenizer = Tokenizer(num_words=VOCAB)
tokenizer.fit_on_texts(X_train)  # fit on training text only — no leakage


def _to_padded(texts):
    """Tokenise *texts* and pad/truncate each sequence to MAX_LEN."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_LEN)


X_train_seq = _to_padded(X_train)
X_val_seq = _to_padded(X_val)
X_test_seq = _to_padded(X_test)
| 78 |
+
|
| 79 |
+
# ── GloVe embeddings ─────────────────────────────────────────────────────────
EMBEDDING_DIM = 300
print("Loading GloVe …")
embeddings_index = {}
with open(glove_path, encoding="utf8") as f:
    for line in f:
        token, *coeffs = line.split()
        embeddings_index[token] = np.asarray(coeffs, dtype="float32")
print(f"Loaded {len(embeddings_index):,} word vectors.")

word_index = tokenizer.word_index
# Row 0 (padding) and any out-of-vocabulary word keep an all-zero vector.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    vec = embeddings_index.get(word)
    if vec is not None:
        embedding_matrix[i] = vec
| 95 |
+
|
| 96 |
+
# ── Per-language test subsets ────────────────────────────────────────────────
# Index the padded test matrix / label vector by language so each phase can
# be evaluated per language as well as on the full test set.
languages = ["english", "hindi", "hinglish"]
lang_test_X = {}
lang_test_y = {}
for la in languages:
    mask = lang_test.values == la
    lang_test_X[la] = X_test_seq[mask]
    lang_test_y[la] = y_test.values[mask]
| 100 |
+
|
| 101 |
+
# ── Helpers ──────────────────────────────────────────────────────────────────
def build_model():
    """Build and compile the frozen-GloVe BiLSTM binary classifier.

    Uses the module-level `word_index`, `embedding_matrix`, `EMBEDDING_DIM`
    and `MAX_LEN`. The embedding layer is frozen; the head outputs a single
    sigmoid probability for the hate/non-hate decision.
    """
    model = Sequential()
    model.add(Embedding(len(word_index) + 1, EMBEDDING_DIM,
                        weights=[embedding_matrix], input_length=MAX_LEN,
                        trainable=False))
    model.add(Bidirectional(LSTM(128)))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation="relu"))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(optimizer="adam", loss="binary_crossentropy",
                  metrics=["accuracy"])
    return model
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def evaluate_metrics(y_true, y_pred_prob):
    """Compute the scalar metrics reported in the results table.

    Parameters
    ----------
    y_true : array-like of 0/1 ground-truth labels.
    y_pred_prob : array-like of predicted probabilities in [0, 1]
        (thresholded at 0.5 for the label-based metrics).

    Returns
    -------
    tuple
        (accuracy, balanced_accuracy, precision, recall, specificity,
        f1, roc_auc).
    """
    y_pred = (y_pred_prob > 0.5).astype(int)
    acc = accuracy_score(y_true, y_pred)
    bal = balanced_accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    auc = roc_auc_score(y_true, y_pred_prob)
    # labels=[0, 1] forces a full 2x2 matrix even if a per-language test
    # subset (or its predictions) contains only one class; without it,
    # .ravel() can yield fewer than four values and the unpack crashes.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    # Guard the specificity division: with no actual negatives the
    # original produced nan (0/0) instead of a usable number.
    spec = tn / (tn + fp) if (tn + fp) else 0.0
    return acc, bal, prec, rec, spec, f1, auc
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def safe_tag(s):
    """Return *s* made safe for use in file names.

    Arrow separators become ``_to_`` and remaining spaces become
    underscores.
    """
    out = s.replace(" -> ", "_to_")
    return out.replace(" ", "_")
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def plot_training_curves(history, tag, fig_dir):
    """Save a side-by-side accuracy/loss figure for one training phase.

    Writes ``<safe_tag(tag)>_curves.png`` (8x6 in, 300 dpi) to *fig_dir*.
    """
    panels = (
        ((("accuracy", "Train Acc"), ("val_accuracy", "Val Acc")), "Accuracy"),
        ((("loss", "Train Loss"), ("val_loss", "Val Loss")), "Loss"),
    )
    fig, axes = plt.subplots(1, 2, figsize=(8, 6))
    for ax, (series, metric) in zip(axes, panels):
        for key, label in series:
            ax.plot(history.history[key], label=label)
        ax.set_title(f"{tag} — {metric}")
        ax.set_xlabel("Epoch")
        ax.set_ylabel(metric)
        ax.legend()
        ax.grid(True)

    plt.tight_layout()
    plt.savefig(os.path.join(fig_dir, f"{safe_tag(tag)}_curves.png"),
                dpi=300, bbox_inches="tight")
    plt.close()
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def plot_eval_charts(y_true, preds, tag, fig_dir):
    """Save four evaluation figures for one (phase, eval-subset) pair.

    Writes confusion-matrix, ROC, precision-recall and F1-vs-threshold
    PNGs into *fig_dir*, each 8x6 in at 300 dpi, named from the
    sanitised *tag*.

    Parameters
    ----------
    y_true : array-like of 0/1 ground-truth labels.
    preds : array-like of predicted probabilities in [0, 1].
    tag : str used in figure titles and file names.
    fig_dir : destination directory (must already exist).
    """
    ftag = safe_tag(tag)

    # Confusion matrix (fixed 0.5 decision threshold)
    cm = confusion_matrix(y_true, (preds > 0.5).astype(int))
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=["Non-Hate", "Hate"],
                yticklabels=["Non-Hate", "Hate"])
    plt.title(f"{tag} — Confusion Matrix")
    plt.xlabel("Predicted"); plt.ylabel("Actual")
    plt.savefig(os.path.join(fig_dir, f"{ftag}_cm.png"), dpi=300, bbox_inches="tight")
    plt.close()

    # ROC
    fpr, tpr, _ = roc_curve(y_true, preds)
    auc_val = roc_auc_score(y_true, preds)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f"AUC={auc_val:.4f}")
    plt.plot([0, 1], [0, 1], '--')  # chance diagonal
    plt.title(f"{tag} — ROC Curve")
    plt.xlabel("FPR"); plt.ylabel("TPR")
    plt.legend(); plt.grid(True)
    plt.savefig(os.path.join(fig_dir, f"{ftag}_roc.png"), dpi=300, bbox_inches="tight")
    plt.close()

    # Precision-Recall
    precision, recall, thresholds = precision_recall_curve(y_true, preds)
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision)
    plt.title(f"{tag} — Precision-Recall Curve")
    plt.xlabel("Recall"); plt.ylabel("Precision")
    plt.grid(True)
    plt.savefig(os.path.join(fig_dir, f"{ftag}_pr.png"), dpi=300, bbox_inches="tight")
    plt.close()

    # F1 vs Threshold
    # 1e-8 avoids 0/0 where precision and recall are both zero.
    f1_scores = (2 * precision * recall) / (precision + recall + 1e-8)
    plt.figure(figsize=(8, 6))
    # precision/recall carry one more point than thresholds, so the final
    # F1 value is dropped to align the arrays.
    plt.plot(thresholds, f1_scores[:-1])
    plt.title(f"{tag} — F1 Score vs Threshold")
    plt.xlabel("Threshold"); plt.ylabel("F1 Score")
    plt.grid(True)
    plt.savefig(os.path.join(fig_dir, f"{ftag}_f1.png"), dpi=300, bbox_inches="tight")
    plt.close()
| 196 |
+
|
| 197 |
+
|
| 198 |
+
# ── Strategy ─────────────────────────────────────────────────────────────────
# Sequential transfer learning: one model instance is trained on each
# language in turn, then on the full training set (four phases total).
STRATEGY = ("hinglish", "hindi", "english")
EPOCHS = 50        # epochs per phase
BATCH_LANG = 32    # batch size for the per-language phases
BATCH_FULL = 64    # batch size for the final full-data phase

strategy_name = " -> ".join(STRATEGY) + " -> Full"
print("\n" + "=" * 60)
print(f"Strategy: {strategy_name}")
print(f"Epochs per phase: {EPOCHS} (Total: {EPOCHS * 4})")
print("=" * 60)

# One figure directory per strategy, named from the sanitised phase order.
fig_dir = os.path.join(base_path, "figures", safe_tag(" -> ".join(STRATEGY)))
os.makedirs(fig_dir, exist_ok=True)

# Full training data (pre-shuffled, used in final phase)
# Fixed seed so the shuffle order is reproducible across runs.
np.random.seed(42)
shuffle_idx = np.random.permutation(len(X_train_seq))
X_full_shuffled = np.ascontiguousarray(X_train_seq[shuffle_idx], dtype=np.int32)
y_full_shuffled = np.ascontiguousarray(y_train.values[shuffle_idx], dtype=np.float32)

# Results accumulator: one row per (training phase, evaluation subset).
cols = ["Phase", "Eval_On", "Accuracy", "Balanced_Acc",
        "Precision", "Recall", "Specificity", "F1", "ROC_AUC"]
all_rows = []

# The same model object carries its weights through every phase below.
model = build_model()
model.summary()
| 225 |
+
|
| 226 |
+
# ── Language phases ──────────────────────────────────────────────────────────
# Train the SAME model sequentially on each language's slice of the
# training split; after every phase, evaluate on each per-language test
# subset and on the full test set, logging metrics and figures.
for phase_lang in STRATEGY:
    # Boolean mask selecting this phase's language from the training split.
    idx = (lang_train == phase_lang)
    X_lang = X_train_seq[idx]
    y_lang = y_train[idx]

    print(f"\n{'─'*50}")
    print(f"Phase: training on '{phase_lang}' ({X_lang.shape[0]} samples, {EPOCHS} epochs)")
    print(f"{'─'*50}")

    # NOTE(review): no EarlyStopping/ModelCheckpoint callbacks are used here
    # although both are imported at the top of the file — confirm running the
    # full 50 epochs per phase is intentional.
    history = model.fit(
        X_lang, y_lang,
        validation_data=(X_val_seq, y_val),
        epochs=EPOCHS,
        batch_size=BATCH_LANG,
        verbose=1,
    )

    plot_training_curves(history, f"Phase_{phase_lang}", fig_dir)

    # Evaluate on every individual language + full
    for eval_lang in languages:
        preds = model.predict(lang_test_X[eval_lang]).flatten()
        metrics = evaluate_metrics(lang_test_y[eval_lang], preds)
        all_rows.append([phase_lang, eval_lang] + list(metrics))
        plot_eval_charts(lang_test_y[eval_lang], preds,
                         f"Phase_{phase_lang}_eval_{eval_lang}", fig_dir)
        print(f" eval on {eval_lang:10s} | Acc={metrics[0]:.4f} F1={metrics[5]:.4f} AUC={metrics[6]:.4f}")

    # Full test set
    preds_full = model.predict(X_test_seq).flatten()
    metrics_full = evaluate_metrics(y_test.values, preds_full)
    all_rows.append([phase_lang, "full"] + list(metrics_full))
    plot_eval_charts(y_test.values, preds_full,
                     f"Phase_{phase_lang}_eval_full", fig_dir)
    print(f" eval on {'full':10s} | Acc={metrics_full[0]:.4f} F1={metrics_full[5]:.4f} AUC={metrics_full[6]:.4f}")
| 262 |
+
|
| 263 |
+
# ── Full dataset phase ───────────────────────────────────────────────────────
# Final phase: continue training the same model on the entire pre-shuffled
# training set, then run the identical per-language + full evaluation.
print(f"\n{'─'*50}")
print(f"Phase: training on Full dataset ({X_full_shuffled.shape[0]} samples, {EPOCHS} epochs)")
print(f"{'─'*50}")

history_full = model.fit(
    X_full_shuffled, y_full_shuffled,
    validation_data=(X_val_seq, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_FULL,
    verbose=1,
)

plot_training_curves(history_full, "Phase_Full", fig_dir)

# Same evaluation sweep as the language phases, tagged "Full".
for eval_lang in languages:
    preds = model.predict(lang_test_X[eval_lang]).flatten()
    metrics = evaluate_metrics(lang_test_y[eval_lang], preds)
    all_rows.append(["Full", eval_lang] + list(metrics))
    plot_eval_charts(lang_test_y[eval_lang], preds,
                     f"Phase_Full_eval_{eval_lang}", fig_dir)
    print(f" eval on {eval_lang:10s} | Acc={metrics[0]:.4f} F1={metrics[5]:.4f} AUC={metrics[6]:.4f}")

preds_full = model.predict(X_test_seq).flatten()
metrics_full = evaluate_metrics(y_test.values, preds_full)
all_rows.append(["Full", "full"] + list(metrics_full))
plot_eval_charts(y_test.values, preds_full, "Phase_Full_eval_full", fig_dir)
print(f" eval on {'full':10s} | Acc={metrics_full[0]:.4f} F1={metrics_full[5]:.4f} AUC={metrics_full[6]:.4f}")
| 291 |
+
|
| 292 |
+
# ── Save results ─────────────────────────────────────────────────────────────
results_df = pd.DataFrame(all_rows, columns=cols)
results_path = os.path.join(base_path, "results_tables",
                            "hinglish_hindi_english_full_results.csv")
results_df.to_csv(results_path, index=False)

banner = "=" * 60
print("\n" + banner)
print("FINAL RESULTS TABLE")
print(banner)
print(results_df.to_string(index=False))

model.save(os.path.join(base_path, "trained_models",
                        "hinglish_hindi_english_full.h5"))
print("\nModel saved.")
print("Done.")
|
predict.py
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
predict.py — Interactive inference script for the SASC hate speech detection model.
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
python predict.py # fully interactive
|
| 7 |
+
python predict.py --model model.h5 # specify model path
|
| 8 |
+
python predict.py --input texts.csv # specify input CSV
|
| 9 |
+
python predict.py --text "some text here" # single text prediction
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import os
|
| 13 |
+
import sys
|
| 14 |
+
import argparse
|
| 15 |
+
import json
|
| 16 |
+
|
| 17 |
+
# suppress TF logs
|
| 18 |
+
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
|
| 19 |
+
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
|
| 20 |
+
|
| 21 |
+
from prompt_toolkit import prompt
|
| 22 |
+
from prompt_toolkit.completion import PathCompleter
|
| 23 |
+
from prompt_toolkit.shortcuts import prompt as pt_prompt
|
| 24 |
+
|
| 25 |
+
path_completer = PathCompleter(expanduser=True)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# ── Argument parsing ────────────────────────────────────────────────────────
parser = argparse.ArgumentParser(description="SASC Hate Speech Detector")
for _flag, _kwargs in (
    ("--model", dict(type=str, help="Path to .h5 model file")),
    ("--tokenizer", dict(type=str, help="Path to tokenizer.json")),
    ("--input", dict(type=str, help="Path to input CSV file")),
    ("--text", dict(type=str, help="Single text to classify")),
    ("--output", dict(type=str, help="Path to save results CSV")),
    ("--threshold", dict(type=float, default=0.5,
                         help="Decision threshold (default: 0.5)")),
    ("--col", dict(type=str, default="text",
                   help="Column name in CSV containing text (default: text)")),
):
    parser.add_argument(_flag, **_kwargs)
args = parser.parse_args()
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# ── Interactive prompts if args not provided ─────────────────────────────────
def ask(message, default=None, is_path=False):
    """Prompt for a value, falling back to *default* on empty input.

    Path prompts get tab completion via prompt_toolkit; any returned
    value is ~-expanded. Returns *default* (possibly None) when the
    user enters nothing.
    """
    hint = f" [{default}]" if default else ""
    if is_path:
        answer = pt_prompt(f"{message}{hint}: ", completer=path_completer)
    else:
        answer = input(f"{message}{hint}: ")
    answer = answer.strip() or default
    return os.path.expanduser(answer) if answer else answer
| 49 |
+
|
| 50 |
+
|
| 51 |
+
print("\n=== SASC Hate Speech Detector ===\n")
|
| 52 |
+
|
| 53 |
+
# Model path
# CLI flag wins; otherwise prompt interactively (with path completion).
# Bail out before loading anything heavy if the file does not exist.
model_path = args.model
if not model_path:
    model_path = ask("Model path (.h5)", "model.h5", is_path=True)

if not os.path.exists(model_path):
    print(f"Model not found: {model_path}")
    sys.exit(1)
| 61 |
+
|
| 62 |
+
# Tokenizer path
# CLI flag wins; otherwise suggest a tokenizer.json sitting next to the
# model file when one exists, and prompt with that as the default.
tokenizer_path = args.tokenizer
if not tokenizer_path:
    # look next to model file first
    candidate = os.path.join(os.path.dirname(model_path), "tokenizer.json")
    tokenizer_path = ask("Tokenizer path", candidate if os.path.exists(candidate) else "tokenizer.json", is_path=True)

if not os.path.exists(tokenizer_path):
    print(f"Tokenizer not found: {tokenizer_path}")
    sys.exit(1)
| 72 |
+
|
| 73 |
+
# Threshold
# BUG FIX: the original gated the prompt on `not args.threshold`, which is
# only true when the user explicitly passed --threshold 0 — exactly the
# value that should be honoured — and never fired in the intended
# fully-interactive flow (the 0.5 default is truthy). Prompt instead when
# the flag was left at its parser default and we are interactive
# (no --text / --input given); an explicit 0.0 is now kept as-is.
threshold = args.threshold
if (args.threshold == parser.get_default("threshold")
        and not args.text and not args.input):
    t = ask("Decision threshold (0.0-1.0)", "0.5")
    try:
        threshold = float(t)
    except ValueError:
        # Fall back to the documented default on unparsable input.
        threshold = 0.5
| 81 |
+
|
| 82 |
+
print(f"\nLoading model from {model_path}")
print(f"Loading tokenizer from {tokenizer_path}")
# Heavy imports are deferred until after the interactive prompts so the
# CLI feels responsive; warnings and TF logging are silenced for clean output.
import warnings
warnings.filterwarnings("ignore")
import tensorflow as tf
import logging
tf.get_logger().setLevel(logging.ERROR)

# compile=False: inference only, no optimizer state needed.
model = tf.keras.models.load_model(model_path, compile=False)

from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
with open(tokenizer_path) as f:
    tokenizer = tokenizer_from_json(f.read())

print(f"Model loaded — vocab size: {len(tokenizer.word_index)}")

# Must match the sequence length used at training time
# (main_v2.py pads to 100).
MAX_LEN = 100
| 100 |
+
|
| 101 |
+
def predict(texts):
    """Classify a list of raw strings.

    Tokenises and pads with the loaded tokenizer, runs the model, and
    thresholds each sigmoid probability against the global `threshold`.
    Returns (probabilities, labels).
    """
    padded = pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_LEN)
    probs = model.predict(padded, verbose=0).flatten()
    labels = []
    for p in probs:
        labels.append("Hate Speech" if p > threshold else "Non-Hate")
    return probs, labels
| 107 |
+
|
| 108 |
+
|
| 109 |
+
# ── Single text mode ──────────────────────────────────────────────────────────
# --text short-circuits everything else: classify one string and exit.
if args.text:
    probs, labels = predict([args.text])
    print(f"\nText : {args.text}")
    print(f"Label : {labels[0]}")
    # "Confidence" here is the raw sigmoid probability of the Hate class,
    # not the confidence in the printed label.
    print(f"Confidence: {probs[0]:.4f}")
    sys.exit(0)
| 116 |
+
|
| 117 |
+
|
| 118 |
+
# ── CSV mode ──────────────────────────────────────────────────────────────────
import pandas as pd

# Resolve the input: CLI --input wins; otherwise offer CSV vs manual entry.
input_path = args.input
if not input_path:
    mode = ask("Input mode — (1) CSV file (2) Type text manually", "1")

    if mode == "2":
        # manual text entry loop — collect lines until the sentinel 'done'.
        print("\nEnter texts one per line. Type 'done' when finished.\n")
        texts = []
        while True:
            t = input(" Text: ").strip()
            if t.lower() == "done":
                break
            if t:
                texts.append(t)

        if not texts:
            print("No texts entered.")
            sys.exit(0)

        probs, labels = predict(texts)
        # NOTE(review): redundant — pandas is already imported above.
        import pandas as pd
        results = pd.DataFrame({
            "text": texts,
            "label": labels,
            "confidence": [round(float(p), 4) for p in probs]
        })

        print("\n" + "="*60)
        print(results.to_string(index=False))
        print("="*60)

        # An empty answer from ask() is falsy, so leaving the prompt blank
        # skips saving.
        out = args.output or ask("Save results to CSV? (leave blank to skip)", "", is_path=True)
        if out:
            results.to_csv(out, index=False)
            print(f"Saved to {out}")
        sys.exit(0)

    else:
        input_path = ask("CSV file path", is_path=True)

if not os.path.exists(input_path):
    print(f"File not found: {input_path}")
    sys.exit(1)
| 164 |
+
|
| 165 |
+
# Batch inference over the chosen CSV column, with a console summary and
# an optional predictions CSV written next to the input by default.
df = pd.read_csv(input_path)
print(f"\nLoaded {len(df)} rows from {input_path}")
print(f"Columns: {list(df.columns)}")

# Resolve the text column: fall back to asking when --col is not present.
text_col = args.col
if text_col not in df.columns:
    print(f"\nColumn '{text_col}' not found.")
    text_col = ask(f"Which column contains the text?", df.columns[0])

print(f"\nRunning inference on column '{text_col}' with threshold={threshold}...")

texts = df[text_col].fillna("").astype(str).tolist()
# Guard: an empty CSV previously crashed with ZeroDivisionError in the
# percentage summary below.
if not texts:
    print("No rows to classify.")
    sys.exit(0)
probs, labels = predict(texts)

df["predicted_label"] = labels
df["confidence"] = [round(float(p), 4) for p in probs]

# Summary
hate_count = labels.count("Hate Speech")
nonhate_count = labels.count("Non-Hate")
print(f"\n{'='*60}")
print(f"Results Summary")
print(f"{'='*60}")
print(f" Total samples : {len(texts)}")
print(f" Hate Speech : {hate_count} ({hate_count/len(texts)*100:.1f}%)")
print(f" Non-Hate : {nonhate_count} ({nonhate_count/len(texts)*100:.1f}%)")
print(f" Threshold : {threshold}")
print(f"{'='*60}")

# Show sample
print(f"\nSample predictions (first 10):")
print(df[[text_col, "predicted_label", "confidence"]].head(10).to_string(index=False))

# Save
output_path = args.output
if not output_path:
    # BUG FIX: str.replace rewrote the FIRST ".csv" anywhere in the path
    # (e.g. "/data.csv.d/file.csv" -> "/data_predictions.csv.d/file.csv");
    # build the default from the actual extension instead.
    root, ext = os.path.splitext(input_path)
    default_out = f"{root}_predictions{ext or '.csv'}"
    output_path = ask(f"\nSave full results to CSV", default_out, is_path=True)

if output_path:
    df.to_csv(output_path, index=False)
    print(f"\nSaved {len(df)} predictions to {output_path}")
|
pyproject.toml
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "sasc"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Multilingual Hate Speech Detection — GloVe + BiLSTM with Sequential Transfer Learning"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.11,<3.14"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"tensorflow>=2.13.0",
|
| 9 |
+
"numpy>=1.24.0",
|
| 10 |
+
"pandas>=2.0.0",
|
| 11 |
+
"scikit-learn>=1.3.0",
|
| 12 |
+
"matplotlib>=3.7.0",
|
| 13 |
+
"seaborn>=0.12.0",
|
| 14 |
+
"huggingface-hub>=0.20.0",
|
| 15 |
+
"prompt-toolkit>=3.0.52",
|
| 16 |
+
]
|