NLP-Hub / app.py
Manas
Recommit files with PNGs tracked by LFS
6124cbc
"""
NLP Hub - Flask backend serving HuggingFace transformer models, PEFT/LoRA
adapters, and scikit-learn baselines.
Models are lazy-loaded on first request to keep startup fast.
"""
from flask import Flask, render_template, request, jsonify
import os
import threading
# Flask application object; the @app.route handlers in this module register on it.
app = Flask(__name__)
# ---------------------------------------------------------------------------
# Model identifiers and registry - per-dialect checkpoint maps first, then
# MODEL_REGISTRY, where each entry describes one selectable model
# ---------------------------------------------------------------------------
# Dialect code -> HuggingFace model id for the "language_roberta" entry.
# The keys of this dict are also what _run_inference() and the /api/infer
# route treat as the set of valid dialect codes.
BESSTIE_MODELS = {
    "en-IN": "vyshnav112233/BESSTIE-RoBERTa-en-IN-seed7",
    "en-UK": "vyshnav112233/BESSTIE-RoBERTa-en-UK-seed7",
    "en-AU": "vyshnav112233/BESSTIE-RoBERTa-en-AU-seed42",
}
# Dialect code -> HuggingFace model id for the "language_roberta_sarcasm" entry.
SARCASM_LANG_MODELS = {
    "en-IN": "vyshnav112233/roberta-sarcasm-en-IN-seed42",
    "en-UK": "vyshnav112233/roberta-sarcasm-en-UK-seed42",
    "en-AU": "vyshnav112233/roberta-sarcasm-en-AU-seed7",
}
# Dialect code -> HuggingFace id of a LoRA adapter. get_pipeline() loads these
# with peft.AutoPeftModelForCausalLM rather than transformers.pipeline.
# Insertion order matters: MODEL_REGISTRY joins .values() into a display string.
TINYLLAMA_LORA_SARCASM_MODELS = {
    "en-IN": "vyshnav112233/task2-3-lora-sarcasm-en-IN-seed-123",
    "en-AU": "vyshnav112233/task2-3-lora-sarcasm-en-AU-seed-2024",
    "en-UK": "vyshnav112233/task2-3-lora-sarcasm-en-UK-seed-42",
}
# Registry key -> dialect map, for models whose checkpoint depends on the
# requested dialect. get_pipeline() picks the checkpoint by dialect code
# (falling back to "en-UK"), and /api/infer requires a dialect for these keys.
LANG_AWARE_MODELS = {
    "language_roberta": BESSTIE_MODELS,
    "language_roberta_sarcasm": SARCASM_LANG_MODELS,
    "tinyllama_lora_sarcasm": TINYLLAMA_LORA_SARCASM_MODELS,
}
# MODEL_REGISTRY keys passed to the home page template, in this order.
# Every key listed here must exist in MODEL_REGISTRY (home() indexes it directly).
HOME_MODEL_KEYS = (
    "roberta_sentiment",
    "language_roberta",
    "logreg_sarcasm",
    "language_roberta_sarcasm",
    "tinyllama_lora_sarcasm",
)
# Registry of every selectable model, keyed by the id the API accepts.
#
# Fields read by the code in this module:
#   name         - returned to the client as result["model"].
#   hf_task      - task string; passed to transformers.pipeline() for plain HF
#                  models and used by _run_inference() to shape the result.
#   hf_model     - HuggingFace id loaded for plain HF models and for entries
#                  with lora_adapter=True. For keys in LANG_AWARE_MODELS the
#                  checkpoint comes from the dialect map instead, and for keys
#                  in SKLEARN_ARTIFACTS the file path comes from that dict, so
#                  this value is not loaded for those entries.
#   task_group   - "home" / "task1" / "task2" / "task3"; filters the task pages
#                  and is validated by the /api/taskN/infer routes.
#   taskN_task   - "sentiment" or "sarcasm"; validated by /api/taskN/infer.
#   family       - returned to the client as result["family"] ("" if absent).
#   trained_on   - echoed back by the Task 2 / Task 3 inference routes.
#   lora_adapter - when true, the model is loaded as a PEFT causal-LM adapter.
# description, display_badge and icon are not read in this module; presumably
# they are consumed by the templates (confirm against templates/*.html).
MODEL_REGISTRY = {
    # --- Entries listed in HOME_MODEL_KEYS -------------------------------
    "roberta_sentiment": {
        "name": "Task 1: RoBERTa",
        "description": "Classifies text sentiment (positive / negative) using a fine-tuned RoBERTa model.",
        "hf_task": "sentiment-analysis",
        "hf_model": "vyshnav112233/roberta-base-sentiment",
        "display_badge": "Simanta/roberta-base",
        "icon": "fa-face-smile",
        "task_group": "task1",
        "task1_task": "sentiment",
        "family": "ptlm",
    },
    "language_roberta": {
        "name": "Task 2: RoBERTa-dialect",
        "description": "Dialect-aware sentiment analysis fine-tuned on the BESSTIE dataset for Indian, British, and Australian English.",
        "hf_task": "sentiment-analysis",
        "hf_model": "vyshnav112233/BESSTIE-RoBERTa-en-IN-seed7; vyshnav112233/BESSTIE-RoBERTa-en-UK-seed7; vyshnav112233/BESSTIE-RoBERTa-en-AU-seed42",
        "display_badge": "Ryan/RoBERTa-en-IN-seed7; Ryan/RoBERTa-en-UK-seed7; Ryan/RoBERTa-en-AU-seed42",
        "icon": "fa-brain",
        "task_group": "home",
    },
    "logreg_sarcasm": {
        "name": "Task 1: Logistic Regression",
        "description": "TF-IDF (1-2 grams) + Logistic Regression baseline for sarcasm detection. Best macro-F1 among baselines in our benchmark.",
        "hf_task": "sarcasm-detection",
        "hf_model": "sklearn/logreg_sarcasm.joblib",
        "display_badge": "Simanta/sklearn/logreg_sarcasm.joblib",
        "icon": "fa-face-grin-squint-tears",
        "task_group": "home",
    },
    "language_roberta_sarcasm": {
        "name": "Task 2: RoBERTa-dialect",
        "description": "Dialect-aware sarcasm detection fine-tuned on Indian, British, and Australian English RoBERTa checkpoints.",
        "hf_task": "sentiment-analysis",
        "hf_model": "vyshnav112233/roberta-sarcasm-en-IN-seed42; vyshnav112233/roberta-sarcasm-en-UK-seed42; vyshnav112233/roberta-sarcasm-en-AU-seed7",
        "display_badge": "Ryan/RoBERTa-en-IN-seed42; Ryan/RoBERTa-en-UK-seed42; Ryan/RoBERTa-en-AU-seed7",
        "icon": "fa-brain",
        "task_group": "home",
    },
    "tinyllama_lora_sarcasm": {
        "name": "Task 3: TinyLlama-1.1B Adapter",
        "description": "Dialect-aware sarcasm detection using TinyLlama-1.1B with LoRA adapters fine-tuned on Indian, British, and Australian English.",
        "hf_task": "sentiment-analysis",
        "hf_model": "; ".join(TINYLLAMA_LORA_SARCASM_MODELS.values()),
        "display_badge": "Omkar/LORA-en-IN-seed123; Omkar/LORA-en-AU-seed2024; Omkar/LORA-en-UK-seed42",
        "icon": "fa-microchip",
        "task_group": "home",
    },
    # --- Task 1: sarcasm -------------------------------------------------
    "lr_sarcasm": {
        "name": "Logistic Regression",
        "description": "TF-IDF + Logistic Regression baseline trained on Simanta's sarcasm labels.",
        "hf_task": "sarcasm-detection",
        "hf_model": "sklearn/lr_sarcasm.joblib",
        "icon": "fa-chart-line",
        "task_group": "task1",
        "task1_task": "sarcasm",
        "family": "classical",
    },
    "svm_sarcasm": {
        "name": "Linear SVM",
        "description": "TF-IDF + calibrated Linear SVM baseline for sarcasm detection.",
        "hf_task": "sarcasm-detection",
        "hf_model": "sklearn/svm_sarcasm.joblib",
        "icon": "fa-vector-square",
        "task_group": "task1",
        "task1_task": "sarcasm",
        "family": "classical",
    },
    "rf_sarcasm": {
        "name": "Random Forest",
        "description": "TF-IDF + Random Forest baseline for sarcasm detection.",
        "hf_task": "sarcasm-detection",
        "hf_model": "sklearn/rf_sarcasm.joblib",
        "icon": "fa-tree",
        "task_group": "task1",
        "task1_task": "sarcasm",
        "family": "classical",
    },
    "albert_sarcasm": {
        "name": "ALBERT",
        "description": "Fine-tuned ALBERT model for sarcasm detection.",
        "hf_task": "sentiment-analysis",
        "hf_model": "vyshnav112233/albert-base-v2-sarcasm",
        "icon": "fa-layer-group",
        "task_group": "task1",
        "task1_task": "sarcasm",
        "family": "ptlm",
    },
    "roberta_sarcasm": {
        "name": "RoBERTa",
        "description": "Fine-tuned RoBERTa model for sarcasm detection.",
        "hf_task": "sentiment-analysis",
        "hf_model": "vyshnav112233/roberta-base-sarcasm",
        "icon": "fa-brain",
        "task_group": "task1",
        "task1_task": "sarcasm",
        "family": "ptlm",
    },
    "distilbert_sarcasm": {
        "name": "DistilBERT",
        "description": "Fine-tuned DistilBERT model for sarcasm detection.",
        "hf_task": "sentiment-analysis",
        "hf_model": "vyshnav112233/distilbert-base-sarcasm",
        "icon": "fa-bolt",
        "task_group": "task1",
        "task1_task": "sarcasm",
        "family": "ptlm",
    },
    # --- Task 1: sentiment -----------------------------------------------
    "lr_sentiment": {
        "name": "Logistic Regression",
        "description": "TF-IDF + Logistic Regression baseline trained on Simanta's sentiment labels.",
        "hf_task": "sentiment-analysis",
        "hf_model": "sklearn/lr_sentiment.joblib",
        "icon": "fa-chart-line",
        "task_group": "task1",
        "task1_task": "sentiment",
        "family": "classical",
    },
    "svm_sentiment": {
        "name": "Linear SVM",
        "description": "TF-IDF + calibrated Linear SVM baseline for sentiment analysis.",
        "hf_task": "sentiment-analysis",
        "hf_model": "sklearn/svm_sentiment.joblib",
        "icon": "fa-vector-square",
        "task_group": "task1",
        "task1_task": "sentiment",
        "family": "classical",
    },
    "rf_sentiment": {
        "name": "Random Forest",
        "description": "TF-IDF + Random Forest baseline for sentiment analysis.",
        "hf_task": "sentiment-analysis",
        "hf_model": "sklearn/rf_sentiment.joblib",
        "icon": "fa-tree",
        "task_group": "task1",
        "task1_task": "sentiment",
        "family": "classical",
    },
    "albert_sentiment": {
        "name": "ALBERT",
        "description": "Fine-tuned ALBERT model for sentiment analysis.",
        "hf_task": "sentiment-analysis",
        "hf_model": "vyshnav112233/albert-base-v2-sentiment",
        "icon": "fa-layer-group",
        "task_group": "task1",
        "task1_task": "sentiment",
        "family": "ptlm",
    },
    "distilbert_sentiment": {
        "name": "DistilBERT",
        "description": "Fine-tuned DistilBERT model for sentiment analysis.",
        "hf_task": "sentiment-analysis",
        "hf_model": "vyshnav112233/distilbert-base-sentiment",
        "icon": "fa-bolt",
        "task_group": "task1",
        "task1_task": "sentiment",
        "family": "ptlm",
    },
    # --- Task 2: sentiment, one checkpoint per training dialect ----------
    "t2_sentiment_au": {
        "name": "RoBERTa · Trained on en-AU",
        "description": "RoBERTa-base fine-tuned only on Australian English BESSTIE sentiment data.",
        "hf_task": "sentiment-analysis",
        "hf_model": "vyshnav112233/BESSTIE-RoBERTa-en-AU-seed42",
        "icon": "fa-brain",
        "task_group": "task2",
        "task2_task": "sentiment",
        "trained_on": "en-AU",
    },
    "t2_sentiment_in": {
        "name": "RoBERTa · Trained on en-IN",
        "description": "RoBERTa-base fine-tuned only on Indian English BESSTIE sentiment data.",
        "hf_task": "sentiment-analysis",
        "hf_model": "vyshnav112233/BESSTIE-RoBERTa-en-IN-seed7",
        "icon": "fa-brain",
        "task_group": "task2",
        "task2_task": "sentiment",
        "trained_on": "en-IN",
    },
    "t2_sentiment_uk": {
        "name": "RoBERTa · Trained on en-UK",
        "description": "RoBERTa-base fine-tuned only on British English BESSTIE sentiment data.",
        "hf_task": "sentiment-analysis",
        "hf_model": "vyshnav112233/BESSTIE-RoBERTa-en-UK-seed7",
        "icon": "fa-brain",
        "task_group": "task2",
        "task2_task": "sentiment",
        "trained_on": "en-UK",
    },
    "t2_sentiment_mixed": {
        "name": "RoBERTa · Trained on Mixed (all 3)",
        "description": "RoBERTa-base fine-tuned on en-AU, en-IN, and en-UK BESSTIE sentiment data combined.",
        "hf_task": "sentiment-analysis",
        "hf_model": "vyshnav112233/BESSTIE-RoBERTa-mixed-seed123",
        "icon": "fa-brain",
        "task_group": "task2",
        "task2_task": "sentiment",
        "trained_on": "mixed",
    },
    # --- Task 2: sarcasm, one checkpoint per training dialect ------------
    "t2_sarcasm_au": {
        "name": "RoBERTa · Trained on en-AU",
        "description": "RoBERTa-base fine-tuned only on Australian English BESSTIE sarcasm data.",
        "hf_task": "sentiment-analysis",
        "hf_model": "vyshnav112233/roberta-sarcasm-en-AU-seed7",
        "icon": "fa-brain",
        "task_group": "task2",
        "task2_task": "sarcasm",
        "trained_on": "en-AU",
    },
    "t2_sarcasm_in": {
        "name": "RoBERTa · Trained on en-IN",
        "description": "RoBERTa-base fine-tuned only on Indian English BESSTIE sarcasm data.",
        "hf_task": "sentiment-analysis",
        "hf_model": "vyshnav112233/roberta-sarcasm-en-IN-seed42",
        "icon": "fa-brain",
        "task_group": "task2",
        "task2_task": "sarcasm",
        "trained_on": "en-IN",
    },
    "t2_sarcasm_uk": {
        "name": "RoBERTa · Trained on en-UK",
        "description": "RoBERTa-base fine-tuned only on British English BESSTIE sarcasm data.",
        "hf_task": "sentiment-analysis",
        "hf_model": "vyshnav112233/roberta-sarcasm-en-UK-seed42",
        "icon": "fa-brain",
        "task_group": "task2",
        "task2_task": "sarcasm",
        "trained_on": "en-UK",
    },
    # --- Task 3: sarcasm, LoRA adapters (loaded via PEFT) ----------------
    "t3_sarcasm_au": {
        "name": "TinyLlama-LoRA · Trained on en-AU",
        "description": "TinyLlama-1.1B-Chat with a LoRA adapter fine-tuned only on Australian English sarcasm data (seed 2024).",
        "hf_task": "sentiment-analysis",
        "hf_model": TINYLLAMA_LORA_SARCASM_MODELS["en-AU"],
        "icon": "fa-microchip",
        "task_group": "task3",
        "task3_task": "sarcasm",
        "trained_on": "en-AU",
        "lora_adapter": True,
    },
    "t3_sarcasm_in": {
        "name": "TinyLlama-LoRA · Trained on en-IN",
        "description": "TinyLlama-1.1B-Chat with a LoRA adapter fine-tuned only on Indian English sarcasm data (seed 123).",
        "hf_task": "sentiment-analysis",
        "hf_model": TINYLLAMA_LORA_SARCASM_MODELS["en-IN"],
        "icon": "fa-microchip",
        "task_group": "task3",
        "task3_task": "sarcasm",
        "trained_on": "en-IN",
        "lora_adapter": True,
    },
    "t3_sarcasm_uk": {
        "name": "TinyLlama-LoRA · Trained on en-UK",
        "description": "TinyLlama-1.1B-Chat with a LoRA adapter fine-tuned only on British English sarcasm data (seed 42).",
        "hf_task": "sentiment-analysis",
        "hf_model": TINYLLAMA_LORA_SARCASM_MODELS["en-UK"],
        "icon": "fa-microchip",
        "task_group": "task3",
        "task3_task": "sarcasm",
        "trained_on": "en-UK",
        "lora_adapter": True,
    },
}
# Registry key -> joblib file for the scikit-learn baselines. Membership in
# this dict is what routes a model down the sklearn load/predict path.
# The paths are relative ("models/..."), so they resolve against the process's
# working directory.
SKLEARN_ARTIFACTS = {
    "logreg_sarcasm": os.path.join("models", "logreg_sarcasm.joblib"),
    "lr_sarcasm": os.path.join("models", "lr_sarcasm.joblib"),
    "svm_sarcasm": os.path.join("models", "svm_sarcasm.joblib"),
    "rf_sarcasm": os.path.join("models", "rf_sarcasm.joblib"),
    "lr_sentiment": os.path.join("models", "lr_sentiment.joblib"),
    "svm_sentiment": os.path.join("models", "svm_sentiment.joblib"),
    "rf_sentiment": os.path.join("models", "rf_sentiment.joblib"),
}
# Static evaluation table handed to task1.html as `eval_tables`; one list of
# per-model rows for each Task 1 sub-task.
# NOTE(review): accuracy, precision, recall and macro_f1 are identical within
# every row - confirm these are the real per-metric figures and not one value
# copied into all four columns.
TASK1_EVAL = {
    "sentiment": [
        {"model": "SVM", "family": "Classical", "accuracy": 0.838, "precision": 0.838, "recall": 0.838, "macro_f1": 0.838},
        {"model": "Logistic Regression", "family": "Classical", "accuracy": 0.830, "precision": 0.830, "recall": 0.830, "macro_f1": 0.830},
        {"model": "Random Forest", "family": "Classical", "accuracy": 0.795, "precision": 0.795, "recall": 0.795, "macro_f1": 0.795},
        {"model": "RoBERTa", "family": "PTLM", "accuracy": 0.896, "precision": 0.896, "recall": 0.896, "macro_f1": 0.896},
        {"model": "DistilBERT", "family": "PTLM", "accuracy": 0.868, "precision": 0.868, "recall": 0.868, "macro_f1": 0.868},
        {"model": "ALBERT", "family": "PTLM", "accuracy": 0.865, "precision": 0.865, "recall": 0.865, "macro_f1": 0.865},
    ],
    "sarcasm": [
        {"model": "Random Forest", "family": "Classical", "accuracy": 0.502, "precision": 0.502, "recall": 0.502, "macro_f1": 0.502},
        {"model": "SVM", "family": "Classical", "accuracy": 0.585, "precision": 0.585, "recall": 0.585, "macro_f1": 0.585},
        {"model": "Logistic Regression", "family": "Classical", "accuracy": 0.677, "precision": 0.677, "recall": 0.677, "macro_f1": 0.677},
        {"model": "RoBERTa", "family": "PTLM", "accuracy": 0.462, "precision": 0.462, "recall": 0.462, "macro_f1": 0.462},
        {"model": "DistilBERT", "family": "PTLM", "accuracy": 0.611, "precision": 0.611, "recall": 0.611, "macro_f1": 0.611},
        {"model": "ALBERT", "family": "PTLM", "accuracy": 0.462, "precision": 0.462, "recall": 0.462, "macro_f1": 0.462},
    ],
}
# Static cross-dialect evaluation table handed to task2.html as `eval_tables`.
# Each row is one (trained_on, tested_on) pair with macro F1, its standard
# deviation, macro precision and macro recall (presumably aggregated over
# several seeds - confirm with the training reports).
# Only "sentiment" has rows for the "mixed" training set.
TASK2_EVAL = {
    "sentiment": [
        {"trained_on": "en-AU", "tested_on": "en-AU", "macro_f1": 0.8941, "macro_f1_std": 0.0053, "macro_p": 0.8965, "macro_r": 0.8935},
        {"trained_on": "en-AU", "tested_on": "en-IN", "macro_f1": 0.8196, "macro_f1_std": 0.0203, "macro_p": 0.8250, "macro_r": 0.8228},
        {"trained_on": "en-AU", "tested_on": "en-UK", "macro_f1": 0.9437, "macro_f1_std": 0.0007, "macro_p": 0.9443, "macro_r": 0.9436},
        {"trained_on": "en-IN", "tested_on": "en-AU", "macro_f1": 0.8659, "macro_f1_std": 0.0047, "macro_p": 0.8737, "macro_r": 0.8654},
        {"trained_on": "en-IN", "tested_on": "en-IN", "macro_f1": 0.8409, "macro_f1_std": 0.0056, "macro_p": 0.8444, "macro_r": 0.8417},
        {"trained_on": "en-IN", "tested_on": "en-UK", "macro_f1": 0.9323, "macro_f1_std": 0.0040, "macro_p": 0.9328, "macro_r": 0.9325},
        {"trained_on": "en-UK", "tested_on": "en-AU", "macro_f1": 0.8761, "macro_f1_std": 0.0168, "macro_p": 0.8855, "macro_r": 0.8748},
        {"trained_on": "en-UK", "tested_on": "en-IN", "macro_f1": 0.8463, "macro_f1_std": 0.0008, "macro_p": 0.8467, "macro_r": 0.8463},
        {"trained_on": "en-UK", "tested_on": "en-UK", "macro_f1": 0.9471, "macro_f1_std": 0.0000, "macro_p": 0.9471, "macro_r": 0.9472},
        {"trained_on": "mixed", "tested_on": "en-AU", "macro_f1": 0.8910, "macro_f1_std": 0.0070, "macro_p": 0.8942, "macro_r": 0.8902},
        {"trained_on": "mixed", "tested_on": "en-IN", "macro_f1": 0.8449, "macro_f1_std": 0.0082, "macro_p": 0.8449, "macro_r": 0.8453},
        {"trained_on": "mixed", "tested_on": "en-UK", "macro_f1": 0.9523, "macro_f1_std": 0.0078, "macro_p": 0.9535, "macro_r": 0.9521},
    ],
    "sarcasm": [
        {"trained_on": "en-AU", "tested_on": "en-AU", "macro_f1": 0.7570, "macro_f1_std": 0.0078, "macro_p": 0.7542, "macro_r": 0.7892},
        {"trained_on": "en-AU", "tested_on": "en-IN", "macro_f1": 0.4896, "macro_f1_std": 0.0170, "macro_p": 0.5531, "macro_r": 0.6991},
        {"trained_on": "en-AU", "tested_on": "en-UK", "macro_f1": 0.5822, "macro_f1_std": 0.0227, "macro_p": 0.6008, "macro_r": 0.8212},
        {"trained_on": "en-IN", "tested_on": "en-AU", "macro_f1": 0.4628, "macro_f1_std": 0.0617, "macro_p": 0.6261, "macro_r": 0.5228},
        {"trained_on": "en-IN", "tested_on": "en-IN", "macro_f1": 0.6044, "macro_f1_std": 0.0447, "macro_p": 0.6131, "macro_r": 0.6320},
        {"trained_on": "en-IN", "tested_on": "en-UK", "macro_f1": 0.5533, "macro_f1_std": 0.0902, "macro_p": 0.7034, "macro_r": 0.5617},
        {"trained_on": "en-UK", "tested_on": "en-AU", "macro_f1": 0.6058, "macro_f1_std": 0.0484, "macro_p": 0.6722, "macro_r": 0.6062},
        {"trained_on": "en-UK", "tested_on": "en-IN", "macro_f1": 0.5747, "macro_f1_std": 0.0166, "macro_p": 0.5772, "macro_r": 0.7117},
        {"trained_on": "en-UK", "tested_on": "en-UK", "macro_f1": 0.7025, "macro_f1_std": 0.0215, "macro_p": 0.7029, "macro_r": 0.7291},
    ],
}
# Static cross-dialect evaluation table handed to task3.html as `eval_tables`.
# Same row layout as TASK2_EVAL; only the "sarcasm" sub-task is present.
TASK3_EVAL = {
    "sarcasm": [
        {"trained_on": "en-AU", "tested_on": "en-AU", "macro_f1": 0.7603, "macro_f1_std": 0.0291, "macro_p": 0.7588, "macro_r": 0.7902},
        {"trained_on": "en-AU", "tested_on": "en-IN", "macro_f1": 0.5005, "macro_f1_std": 0.0240, "macro_p": 0.5531, "macro_r": 0.6915},
        {"trained_on": "en-AU", "tested_on": "en-UK", "macro_f1": 0.5805, "macro_f1_std": 0.0413, "macro_p": 0.6031, "macro_r": 0.8315},
        {"trained_on": "en-IN", "tested_on": "en-AU", "macro_f1": 0.5394, "macro_f1_std": 0.1054, "macro_p": 0.7578, "macro_r": 0.5669},
        {"trained_on": "en-IN", "tested_on": "en-IN", "macro_f1": 0.5964, "macro_f1_std": 0.0817, "macro_p": 0.7411, "macro_r": 0.6797},
        {"trained_on": "en-IN", "tested_on": "en-UK", "macro_f1": 0.6661, "macro_f1_std": 0.1357, "macro_p": 0.7979, "macro_r": 0.7001},
        {"trained_on": "en-UK", "tested_on": "en-AU", "macro_f1": 0.6003, "macro_f1_std": 0.0171, "macro_p": 0.6388, "macro_r": 0.5951},
        {"trained_on": "en-UK", "tested_on": "en-IN", "macro_f1": 0.6331, "macro_f1_std": 0.0253, "macro_p": 0.6132, "macro_r": 0.7935},
        {"trained_on": "en-UK", "tested_on": "en-UK", "macro_f1": 0.7724, "macro_f1_std": 0.0088, "macro_p": 0.7418, "macro_r": 0.8183},
    ],
}
# Lazy-loaded model cache, filled by get_pipeline(). Keys are the registry key,
# or "<registry key>:<lang_code>" for entries of LANG_AWARE_MODELS.
_pipelines = {}
# Held by get_pipeline() while it loads a model and stores it in _pipelines.
_lock = threading.Lock()
def _load_yes_no_lora(hf_model: str) -> dict:
    """Load a PEFT causal-LM adapter plus tokenizer for Yes/No scoring.

    Returns a dict ``{"_yes_no_lora": True, "model": ..., "tokenizer": ...}``;
    the ``_yes_no_lora`` marker is how ``_run_inference`` recognises this kind
    of cached object.
    """
    # Heavy third-party imports stay local so they are only paid for on first use.
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer

    print(f"[INFO] Loading PEFT causal-LM adapter: {hf_model} ...")
    model = AutoPeftModelForCausalLM.from_pretrained(hf_model)
    model.eval()
    try:
        tokenizer = AutoTokenizer.from_pretrained(hf_model)
    except Exception:
        # Best-effort fallback: if no tokenizer can be loaded from the adapter
        # id, use the base model recorded in the adapter's PEFT config.
        base_id = model.peft_config["default"].base_model_name_or_path
        tokenizer = AutoTokenizer.from_pretrained(base_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return {"_yes_no_lora": True, "model": model, "tokenizer": tokenizer}


def get_pipeline(model_key: str, lang_code: str = None):
    """Return the cached model object for ``model_key``, loading it on first use.

    Args:
        model_key: Key into ``MODEL_REGISTRY``.
        lang_code: Dialect code; only used when ``model_key`` is in
            ``LANG_AWARE_MODELS``. A missing or unknown code selects "en-UK".

    Returns:
        A ``transformers`` pipeline, a scikit-learn object loaded with joblib
        (for keys in ``SKLEARN_ARTIFACTS``), or the dict produced by
        ``_load_yes_no_lora`` for LoRA adapters.

    Raises:
        KeyError: If ``model_key`` is not in ``MODEL_REGISTRY``.
    """
    dialect_map = LANG_AWARE_MODELS.get(model_key)
    if dialect_map is not None:
        # Resolve the fallback *before* building the cache key, so a missing or
        # unknown dialect shares the en-UK cache slot instead of loading the
        # same checkpoint again under a different key.
        if lang_code not in dialect_map:
            lang_code = "en-UK"
        cache_key = f"{model_key}:{lang_code}"
    else:
        cache_key = model_key

    # Fast path without taking the lock.
    if cache_key in _pipelines:
        return _pipelines[cache_key]

    with _lock:
        # Re-check: another thread may have finished loading while we waited.
        if cache_key in _pipelines:
            return _pipelines[cache_key]

        cfg = MODEL_REGISTRY[model_key]
        if dialect_map is not None:
            hf_model = dialect_map[lang_code]
            if model_key == "tinyllama_lora_sarcasm":
                pipe = _load_yes_no_lora(hf_model)
            else:
                from transformers import pipeline

                print(f"[INFO] Loading dialect-aware model: {hf_model} ...")
                pipe = pipeline("sentiment-analysis", model=hf_model)
        elif cfg.get("lora_adapter"):
            pipe = _load_yes_no_lora(cfg["hf_model"])
        elif model_key in SKLEARN_ARTIFACTS:
            import joblib

            artifact_path = SKLEARN_ARTIFACTS[model_key]
            print(f"[INFO] Loading sklearn artifact: {artifact_path} ...")
            # SECURITY: joblib.load unpickles the file, which can execute
            # arbitrary code. Only ship trusted artifacts in models/.
            pipe = joblib.load(artifact_path)
            if hasattr(pipe, "named_steps"):
                # Force the "clf" step (if it exposes n_jobs) to one worker.
                clf = pipe.named_steps.get("clf")
                if hasattr(clf, "n_jobs"):
                    clf.n_jobs = 1
        else:
            from transformers import pipeline

            print(f"[INFO] Loading model: {cfg['hf_model']} ...")
            pipe = pipeline(cfg["hf_task"], model=cfg["hf_model"])

        _pipelines[cache_key] = pipe
    return pipe
def _run_inference(model_key: str, text: str, language: str = "UK English"):
lang_code = language if language in BESSTIE_MODELS else None
pipe = get_pipeline(model_key, lang_code)
cfg = MODEL_REGISTRY[model_key]
task = cfg["hf_task"]
if isinstance(pipe, dict) and pipe.get("_yes_no_lora"):
import torch
lora_model = pipe["model"]
tokenizer = pipe["tokenizer"]
messages = [
{"role": "user", "content": f'Is the following text sarcastic? Answer with only "Yes" or "No".\n\nText: "{text}"'}
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(lora_model.device)
with torch.no_grad():
logits = lora_model(**inputs).logits[0, -1]
yes_id = tokenizer.encode("Yes", add_special_tokens=False)[0]
no_id = tokenizer.encode("No", add_special_tokens=False)[0]
pair = torch.tensor([logits[yes_id].item(), logits[no_id].item()])
probs = torch.softmax(pair, dim=0)
prob_yes = float(probs[0])
is_sarcastic = prob_yes > 0.5
result = {
"label": "SARCASTIC" if is_sarcastic else "NOT_SARCASTIC",
"score": round(prob_yes if is_sarcastic else 1.0 - prob_yes, 4),
}
elif model_key in SKLEARN_ARTIFACTS:
pred = int(pipe.predict([text])[0])
proba = pipe.predict_proba([text])[0]
if task == "sarcasm-detection":
label = "SARCASTIC" if pred == 1 else "NOT_SARCASTIC"
elif cfg.get("task1_task") == "sentiment":
label = "POSITIVE" if pred == 1 else "NEGATIVE"
else:
label = str(pred)
result = {"label": label, "score": round(float(max(proba)), 4)}
elif task == "question-answering":
parts = text.split("[SEP]")
if len(parts) < 2:
raise ValueError("For QA, separate context and question with [SEP].")
context = parts[0].strip()
question = parts[1].strip()
raw = pipe(question=question, context=context)
result = {"answer": raw["answer"], "score": round(raw["score"], 4)}
elif task == "zero-shot-classification":
candidate_labels = ["politics", "technology", "sports", "health", "finance", "entertainment", "science"]
raw = pipe(text, candidate_labels=candidate_labels)
result = {"labels": raw["labels"][:5], "scores": [round(s, 4) for s in raw["scores"][:5]]}
elif task == "ner":
raw = pipe(text)
result = {
"entities": [
{"word": ent["word"], "entity": ent["entity"], "score": round(ent["score"], 4)}
for ent in raw
]
}
elif task == "summarization":
max_len = min(130, max(30, len(text.split()) // 2))
raw = pipe(text, max_length=max_len, min_length=15, do_sample=False)
result = {"summary": raw[0]["summary_text"]}
elif task == "text-generation":
raw = pipe(text, max_new_tokens=120, num_return_sequences=1, do_sample=True, temperature=0.8)
result = {"generated": raw[0]["generated_text"]}
elif task == "fill-mask":
result_raw = pipe(text)
result = {
"predictions": [
{"token": pred["token_str"], "score": round(pred["score"], 4), "sequence": pred["sequence"]}
for pred in result_raw[:5]
]
}
else:
raw = pipe(text)
if isinstance(raw, list) and isinstance(raw[0], list):
raw = raw[0]
top = raw[0]
result = {"label": top["label"], "score": round(top["score"], 4)}
result["model"] = cfg["name"]
result["language"] = language
result["family"] = cfg.get("family", "")
return result
# ---------------------------------------------------------------------------
# Routes - pages
# ---------------------------------------------------------------------------
@app.route("/")
def home():
    """Render the landing page with the models named in HOME_MODEL_KEYS."""
    featured = {}
    for key in HOME_MODEL_KEYS:
        featured[key] = MODEL_REGISTRY[key]
    return render_template("home.html", models=featured)
@app.route("/task1")
def task1():
    """Render the Task 1 page with its models and evaluation table."""
    selected = {}
    for key, model in MODEL_REGISTRY.items():
        # Keep only registry entries tagged for this page.
        if model.get("task_group") == "task1":
            selected[key] = model
    return render_template("task1.html", models=selected, eval_tables=TASK1_EVAL)
@app.route("/task2")
def task2():
    """Render the Task 2 page with its models and evaluation table."""
    selected = dict(
        (key, model)
        for key, model in MODEL_REGISTRY.items()
        if model.get("task_group") == "task2"
    )
    return render_template("task2.html", models=selected, eval_tables=TASK2_EVAL)
@app.route("/task3")
def task3():
    """Render the Task 3 page with its models and evaluation table."""
    selected = {}
    for key, model in MODEL_REGISTRY.items():
        if model.get("task_group") != "task3":
            continue
        selected[key] = model
    return render_template("task3.html", models=selected, eval_tables=TASK3_EVAL)
# ---------------------------------------------------------------------------
# API - model inference
# ---------------------------------------------------------------------------
@app.route("/api/infer", methods=["POST"])
def infer():
    """Run inference on the selected model and return JSON results.

    Expects a JSON object with ``text``, ``model`` (a ``MODEL_REGISTRY`` key)
    and optionally ``language`` (a dialect code; required for models in
    ``LANG_AWARE_MODELS``). Responds with the inference result, or with
    ``{"error": ...}`` and status 400 (bad input) / 500 (inference failure).
    """
    # silent=True: malformed JSON gives None, so the client gets a JSON error
    # body below instead of Werkzeug's HTML 400 page.
    data = request.get_json(force=True, silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    text = data.get("text", "")
    model_key = data.get("model", "")
    language = data.get("language", "UK English")

    # Type-check before calling str methods or hashing: JSON values may be
    # numbers, lists, objects or null.
    if not isinstance(text, str) or not text.strip():
        return jsonify({"error": "Please enter some text."}), 400
    text = text.strip()
    if not isinstance(model_key, str) or model_key not in MODEL_REGISTRY:
        return jsonify({"error": f"Unknown model: {model_key}"}), 400
    if not isinstance(language, str):
        language = "UK English"

    lang_code = language if language in BESSTIE_MODELS else None
    if model_key in LANG_AWARE_MODELS and not lang_code:
        return jsonify({"error": "Please select a language dialect for this model."}), 400

    try:
        return jsonify(_run_inference(model_key, text, language))
    except ValueError as exc:
        return jsonify({"error": str(exc)}), 400
    except Exception as exc:
        # Top-level boundary: record the traceback, then report the failure.
        app.logger.exception("Inference failed for model %s", model_key)
        return jsonify({"error": str(exc)}), 500
@app.route("/api/task1/infer", methods=["POST"])
def task1_infer():
    """Run one Task 1 model so the frontend can fan out parallel requests.

    Expects a JSON object with ``text``, ``task`` ("sentiment" or "sarcasm")
    and ``model_key`` (a Task 1 ``MODEL_REGISTRY`` key for that task).
    Responds with the inference result, or ``{"error": ...}`` with status
    400 (bad input) / 500 (inference failure).
    """
    # silent=True: malformed JSON gives None and is reported as a JSON error.
    data = request.get_json(force=True, silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    text = data.get("text", "")
    task = data.get("task", "")
    model_key = data.get("model_key", "")

    # JSON values may be non-strings; check types before .strip() or hashing.
    if not isinstance(text, str) or not text.strip():
        return jsonify({"error": "Please enter some text."}), 400
    text = text.strip()
    # Tuple membership compares by equality, so unhashable values are safe.
    if task not in ("sentiment", "sarcasm"):
        return jsonify({"error": "Task must be sentiment or sarcasm."}), 400
    if not isinstance(model_key, str) or model_key not in MODEL_REGISTRY:
        return jsonify({"error": f"Unknown model: {model_key}"}), 400

    cfg = MODEL_REGISTRY[model_key]
    if cfg.get("task_group") != "task1":
        return jsonify({"error": f"{model_key} is not a Task 1 model."}), 400
    if cfg.get("task1_task") != task:
        return jsonify({"error": f"{model_key} does not belong to the {task} task."}), 400

    try:
        return jsonify(_run_inference(model_key, text))
    except Exception as exc:
        # Top-level boundary: record the traceback, then report the failure.
        app.logger.exception("Task 1 inference failed for model %s", model_key)
        return jsonify({"error": str(exc), "model": cfg["name"]}), 500
@app.route("/api/task2/infer", methods=["POST"])
def task2_infer():
    """Run one Task 2 model so the frontend can fan out parallel requests.

    Expects a JSON object with ``text``, ``task`` ("sentiment" or "sarcasm")
    and ``model_key`` (a Task 2 ``MODEL_REGISTRY`` key for that task).
    Responds with the inference result plus the model's ``trained_on`` value,
    or ``{"error": ...}`` with status 400 (bad input) / 500 (inference failure).
    """
    # silent=True: malformed JSON gives None and is reported as a JSON error.
    data = request.get_json(force=True, silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    text = data.get("text", "")
    task = data.get("task", "")
    model_key = data.get("model_key", "")

    # JSON values may be non-strings; check types before .strip() or hashing.
    if not isinstance(text, str) or not text.strip():
        return jsonify({"error": "Please enter some text."}), 400
    text = text.strip()
    # Tuple membership compares by equality, so unhashable values are safe.
    if task not in ("sentiment", "sarcasm"):
        return jsonify({"error": "Task must be sentiment or sarcasm."}), 400
    if not isinstance(model_key, str) or model_key not in MODEL_REGISTRY:
        return jsonify({"error": f"Unknown model: {model_key}"}), 400

    cfg = MODEL_REGISTRY[model_key]
    if cfg.get("task_group") != "task2":
        return jsonify({"error": f"{model_key} is not a Task 2 model."}), 400
    if cfg.get("task2_task") != task:
        return jsonify({"error": f"{model_key} does not belong to the {task} task."}), 400

    try:
        result = _run_inference(model_key, text)
        result["trained_on"] = cfg.get("trained_on", "")
        return jsonify(result)
    except Exception as exc:
        # Top-level boundary: record the traceback, then report the failure.
        app.logger.exception("Task 2 inference failed for model %s", model_key)
        return jsonify({"error": str(exc), "model": cfg["name"]}), 500
@app.route("/api/task3/infer", methods=["POST"])
def task3_infer():
    """Run one Task 3 model so the frontend can fan out parallel requests.

    Expects a JSON object with ``text``, ``task`` (must be "sarcasm") and
    ``model_key`` (a Task 3 ``MODEL_REGISTRY`` key). Responds with the
    inference result plus the model's ``trained_on`` value, or
    ``{"error": ...}`` with status 400 (bad input) / 500 (inference failure).
    """
    # silent=True: malformed JSON gives None and is reported as a JSON error.
    data = request.get_json(force=True, silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    text = data.get("text", "")
    task = data.get("task", "")
    model_key = data.get("model_key", "")

    # JSON values may be non-strings; check types before .strip() or hashing.
    if not isinstance(text, str) or not text.strip():
        return jsonify({"error": "Please enter some text."}), 400
    text = text.strip()
    # Equality comparison, so unhashable values are rejected without raising.
    if task != "sarcasm":
        return jsonify({"error": "Task must be sarcasm."}), 400
    if not isinstance(model_key, str) or model_key not in MODEL_REGISTRY:
        return jsonify({"error": f"Unknown model: {model_key}"}), 400

    cfg = MODEL_REGISTRY[model_key]
    if cfg.get("task_group") != "task3":
        return jsonify({"error": f"{model_key} is not a Task 3 model."}), 400
    if cfg.get("task3_task") != task:
        return jsonify({"error": f"{model_key} does not belong to the {task} task."}), 400

    try:
        result = _run_inference(model_key, text)
        result["trained_on"] = cfg.get("trained_on", "")
        return jsonify(result)
    except Exception as exc:
        # Top-level boundary: record the traceback, then report the failure.
        app.logger.exception("Task 3 inference failed for model %s", model_key)
        return jsonify({"error": str(exc), "model": cfg["name"]}), 500
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Listen on $PORT when set, otherwise on 7860.
    port = int(os.environ.get("PORT", 7860))
    # SECURITY: the Werkzeug debugger lets anyone who can reach the server run
    # arbitrary Python, and this server binds to all interfaces. Debug mode is
    # therefore off unless explicitly requested with FLASK_DEBUG=1.
    debug = os.environ.get("FLASK_DEBUG", "").strip().lower() in ("1", "true", "yes", "on")
    app.run(host="0.0.0.0", debug=debug, port=port)