Spaces:
Sleeping
Sleeping
| """ | |
| NLP Hub - Flask backend serving HuggingFace transformer models. | |
| Models are lazy-loaded on first request to keep startup fast. | |
| """ | |
| from flask import Flask, render_template, request, jsonify | |
| import os | |
| import threading | |
# Flask application object; started by the __main__ guard at the end of this file.
app = Flask(__name__)
| # --------------------------------------------------------------------------- | |
| # Model registry - each entry describes a HuggingFace pipeline config | |
| # --------------------------------------------------------------------------- | |
# Hugging Face account prefix shared by every checkpoint id below.
_HF_OWNER = "vyshnav112233"


def _dialect_checkpoints(template, seeds):
    """Map each dialect code to its checkpoint id, preserving the given order.

    ``template`` is a ``str.format`` pattern with ``owner``, ``dialect`` and
    ``seed`` fields; ``seeds`` is an iterable of ``(dialect, seed)`` pairs.
    """
    return {
        dialect: template.format(owner=_HF_OWNER, dialect=dialect, seed=seed)
        for dialect, seed in seeds
    }


# Dialect code -> RoBERTa sentiment checkpoint (BESSTIE).
BESSTIE_MODELS = _dialect_checkpoints(
    "{owner}/BESSTIE-RoBERTa-{dialect}-seed{seed}",
    (("en-IN", 7), ("en-UK", 7), ("en-AU", 42)),
)

# Dialect code -> RoBERTa sarcasm checkpoint.
SARCASM_LANG_MODELS = _dialect_checkpoints(
    "{owner}/roberta-sarcasm-{dialect}-seed{seed}",
    (("en-IN", 42), ("en-UK", 42), ("en-AU", 7)),
)

# Dialect code -> LoRA sarcasm adapter id.
TINYLLAMA_LORA_SARCASM_MODELS = _dialect_checkpoints(
    "{owner}/task2-3-lora-sarcasm-{dialect}-seed-{seed}",
    (("en-IN", 123), ("en-AU", 2024), ("en-UK", 42)),
)

# Registry keys whose checkpoint depends on the requested dialect.
LANG_AWARE_MODELS = dict(
    language_roberta=BESSTIE_MODELS,
    language_roberta_sarcasm=SARCASM_LANG_MODELS,
    tinyllama_lora_sarcasm=TINYLLAMA_LORA_SARCASM_MODELS,
)

# Registry keys shown on the home page, in display order.
HOME_MODEL_KEYS = (
    "roberta_sentiment",
    "language_roberta",
    "logreg_sarcasm",
    "language_roberta_sarcasm",
    "tinyllama_lora_sarcasm",
)
def _home_model(*, name, description, hf_task, hf_model, display_badge, icon):
    """Build the registry entry for a model in the ``home`` task group."""
    return {
        "name": name,
        "description": description,
        "hf_task": hf_task,
        "hf_model": hf_model,
        "display_badge": display_badge,
        "icon": icon,
        "task_group": "home",
    }


def _task1_model(*, name, description, hf_model, icon, task, family, hf_task="sentiment-analysis"):
    """Build the registry entry for a Task 1 model (classical or PTLM)."""
    return {
        "name": name,
        "description": description,
        "hf_task": hf_task,
        "hf_model": hf_model,
        "icon": icon,
        "task_group": "task1",
        "task1_task": task,
        "family": family,
    }


def _dialect_model(*, group, name, description, hf_model, icon, task, trained_on, lora_adapter=False):
    """Build the registry entry for a per-dialect Task 2 / Task 3 model.

    ``group`` is ``"task2"`` or ``"task3"``; it is stored as ``task_group`` and
    also names the ``<group>_task`` key. ``lora_adapter`` is only written to
    the entry when it is true.
    """
    entry = {
        "name": name,
        "description": description,
        "hf_task": "sentiment-analysis",
        "hf_model": hf_model,
        "icon": icon,
        "task_group": group,
        f"{group}_task": task,
        "trained_on": trained_on,
    }
    if lora_adapter:
        entry["lora_adapter"] = True
    return entry


# Registry key -> display metadata and loading configuration for every model.
# Insertion order is kept exactly as before: home models, Task 1 sarcasm,
# Task 1 sentiment, Task 2, Task 3.
MODEL_REGISTRY = {
    # ----- Home page models ------------------------------------------------
    # Also a Task 1 sentiment model, hence task_group "task1" plus a badge.
    "roberta_sentiment": dict(
        name="Task 1: RoBERTa",
        description="Classifies text sentiment (positive / negative) using a fine-tuned RoBERTa model.",
        hf_task="sentiment-analysis",
        hf_model="vyshnav112233/roberta-base-sentiment",
        display_badge="Simanta/roberta-base",
        icon="fa-face-smile",
        task_group="task1",
        task1_task="sentiment",
        family="ptlm",
    ),
    "language_roberta": _home_model(
        name="Task 2: RoBERTa-dialect",
        description="Dialect-aware sentiment analysis fine-tuned on the BESSTIE dataset for Indian, British, and Australian English.",
        hf_task="sentiment-analysis",
        hf_model="; ".join(BESSTIE_MODELS.values()),
        display_badge="Ryan/RoBERTa-en-IN-seed7; Ryan/RoBERTa-en-UK-seed7; Ryan/RoBERTa-en-AU-seed42",
        icon="fa-brain",
    ),
    "logreg_sarcasm": _home_model(
        name="Task 1: Logistic Regression",
        description="TF-IDF (1-2 grams) + Logistic Regression baseline for sarcasm detection. Best macro-F1 among baselines in our benchmark.",
        hf_task="sarcasm-detection",
        hf_model="sklearn/logreg_sarcasm.joblib",
        display_badge="Simanta/sklearn/logreg_sarcasm.joblib",
        icon="fa-face-grin-squint-tears",
    ),
    "language_roberta_sarcasm": _home_model(
        name="Task 2: RoBERTa-dialect",
        description="Dialect-aware sarcasm detection fine-tuned on Indian, British, and Australian English RoBERTa checkpoints.",
        hf_task="sentiment-analysis",
        hf_model="; ".join(SARCASM_LANG_MODELS.values()),
        display_badge="Ryan/RoBERTa-en-IN-seed42; Ryan/RoBERTa-en-UK-seed42; Ryan/RoBERTa-en-AU-seed7",
        icon="fa-brain",
    ),
    "tinyllama_lora_sarcasm": _home_model(
        name="Task 3: TinyLlama-1.1B Adapter",
        description="Dialect-aware sarcasm detection using TinyLlama-1.1B with LoRA adapters fine-tuned on Indian, British, and Australian English.",
        hf_task="sentiment-analysis",
        hf_model="; ".join(TINYLLAMA_LORA_SARCASM_MODELS.values()),
        display_badge="Omkar/LORA-en-IN-seed123; Omkar/LORA-en-AU-seed2024; Omkar/LORA-en-UK-seed42",
        icon="fa-microchip",
    ),
    # ----- Task 1: sarcasm -------------------------------------------------
    "lr_sarcasm": _task1_model(
        name="Logistic Regression",
        description="TF-IDF + Logistic Regression baseline trained on Simanta's sarcasm labels.",
        hf_task="sarcasm-detection",
        hf_model="sklearn/lr_sarcasm.joblib",
        icon="fa-chart-line",
        task="sarcasm",
        family="classical",
    ),
    "svm_sarcasm": _task1_model(
        name="Linear SVM",
        description="TF-IDF + calibrated Linear SVM baseline for sarcasm detection.",
        hf_task="sarcasm-detection",
        hf_model="sklearn/svm_sarcasm.joblib",
        icon="fa-vector-square",
        task="sarcasm",
        family="classical",
    ),
    "rf_sarcasm": _task1_model(
        name="Random Forest",
        description="TF-IDF + Random Forest baseline for sarcasm detection.",
        hf_task="sarcasm-detection",
        hf_model="sklearn/rf_sarcasm.joblib",
        icon="fa-tree",
        task="sarcasm",
        family="classical",
    ),
    "albert_sarcasm": _task1_model(
        name="ALBERT",
        description="Fine-tuned ALBERT model for sarcasm detection.",
        hf_model="vyshnav112233/albert-base-v2-sarcasm",
        icon="fa-layer-group",
        task="sarcasm",
        family="ptlm",
    ),
    "roberta_sarcasm": _task1_model(
        name="RoBERTa",
        description="Fine-tuned RoBERTa model for sarcasm detection.",
        hf_model="vyshnav112233/roberta-base-sarcasm",
        icon="fa-brain",
        task="sarcasm",
        family="ptlm",
    ),
    "distilbert_sarcasm": _task1_model(
        name="DistilBERT",
        description="Fine-tuned DistilBERT model for sarcasm detection.",
        hf_model="vyshnav112233/distilbert-base-sarcasm",
        icon="fa-bolt",
        task="sarcasm",
        family="ptlm",
    ),
    # ----- Task 1: sentiment -----------------------------------------------
    "lr_sentiment": _task1_model(
        name="Logistic Regression",
        description="TF-IDF + Logistic Regression baseline trained on Simanta's sentiment labels.",
        hf_model="sklearn/lr_sentiment.joblib",
        icon="fa-chart-line",
        task="sentiment",
        family="classical",
    ),
    "svm_sentiment": _task1_model(
        name="Linear SVM",
        description="TF-IDF + calibrated Linear SVM baseline for sentiment analysis.",
        hf_model="sklearn/svm_sentiment.joblib",
        icon="fa-vector-square",
        task="sentiment",
        family="classical",
    ),
    "rf_sentiment": _task1_model(
        name="Random Forest",
        description="TF-IDF + Random Forest baseline for sentiment analysis.",
        hf_model="sklearn/rf_sentiment.joblib",
        icon="fa-tree",
        task="sentiment",
        family="classical",
    ),
    "albert_sentiment": _task1_model(
        name="ALBERT",
        description="Fine-tuned ALBERT model for sentiment analysis.",
        hf_model="vyshnav112233/albert-base-v2-sentiment",
        icon="fa-layer-group",
        task="sentiment",
        family="ptlm",
    ),
    "distilbert_sentiment": _task1_model(
        name="DistilBERT",
        description="Fine-tuned DistilBERT model for sentiment analysis.",
        hf_model="vyshnav112233/distilbert-base-sentiment",
        icon="fa-bolt",
        task="sentiment",
        family="ptlm",
    ),
    # ----- Task 2: per-dialect RoBERTa, sentiment --------------------------
    "t2_sentiment_au": _dialect_model(
        group="task2",
        name="RoBERTa · Trained on en-AU",
        description="RoBERTa-base fine-tuned only on Australian English BESSTIE sentiment data.",
        hf_model=BESSTIE_MODELS["en-AU"],
        icon="fa-brain",
        task="sentiment",
        trained_on="en-AU",
    ),
    "t2_sentiment_in": _dialect_model(
        group="task2",
        name="RoBERTa · Trained on en-IN",
        description="RoBERTa-base fine-tuned only on Indian English BESSTIE sentiment data.",
        hf_model=BESSTIE_MODELS["en-IN"],
        icon="fa-brain",
        task="sentiment",
        trained_on="en-IN",
    ),
    "t2_sentiment_uk": _dialect_model(
        group="task2",
        name="RoBERTa · Trained on en-UK",
        description="RoBERTa-base fine-tuned only on British English BESSTIE sentiment data.",
        hf_model=BESSTIE_MODELS["en-UK"],
        icon="fa-brain",
        task="sentiment",
        trained_on="en-UK",
    ),
    "t2_sentiment_mixed": _dialect_model(
        group="task2",
        name="RoBERTa · Trained on Mixed (all 3)",
        description="RoBERTa-base fine-tuned on en-AU, en-IN, and en-UK BESSTIE sentiment data combined.",
        hf_model="vyshnav112233/BESSTIE-RoBERTa-mixed-seed123",
        icon="fa-brain",
        task="sentiment",
        trained_on="mixed",
    ),
    # ----- Task 2: per-dialect RoBERTa, sarcasm ----------------------------
    "t2_sarcasm_au": _dialect_model(
        group="task2",
        name="RoBERTa · Trained on en-AU",
        description="RoBERTa-base fine-tuned only on Australian English BESSTIE sarcasm data.",
        hf_model=SARCASM_LANG_MODELS["en-AU"],
        icon="fa-brain",
        task="sarcasm",
        trained_on="en-AU",
    ),
    "t2_sarcasm_in": _dialect_model(
        group="task2",
        name="RoBERTa · Trained on en-IN",
        description="RoBERTa-base fine-tuned only on Indian English BESSTIE sarcasm data.",
        hf_model=SARCASM_LANG_MODELS["en-IN"],
        icon="fa-brain",
        task="sarcasm",
        trained_on="en-IN",
    ),
    "t2_sarcasm_uk": _dialect_model(
        group="task2",
        name="RoBERTa · Trained on en-UK",
        description="RoBERTa-base fine-tuned only on British English BESSTIE sarcasm data.",
        hf_model=SARCASM_LANG_MODELS["en-UK"],
        icon="fa-brain",
        task="sarcasm",
        trained_on="en-UK",
    ),
    # ----- Task 3: per-dialect TinyLlama LoRA adapters, sarcasm ------------
    "t3_sarcasm_au": _dialect_model(
        group="task3",
        name="TinyLlama-LoRA · Trained on en-AU",
        description="TinyLlama-1.1B-Chat with a LoRA adapter fine-tuned only on Australian English sarcasm data (seed 2024).",
        hf_model=TINYLLAMA_LORA_SARCASM_MODELS["en-AU"],
        icon="fa-microchip",
        task="sarcasm",
        trained_on="en-AU",
        lora_adapter=True,
    ),
    "t3_sarcasm_in": _dialect_model(
        group="task3",
        name="TinyLlama-LoRA · Trained on en-IN",
        description="TinyLlama-1.1B-Chat with a LoRA adapter fine-tuned only on Indian English sarcasm data (seed 123).",
        hf_model=TINYLLAMA_LORA_SARCASM_MODELS["en-IN"],
        icon="fa-microchip",
        task="sarcasm",
        trained_on="en-IN",
        lora_adapter=True,
    ),
    "t3_sarcasm_uk": _dialect_model(
        group="task3",
        name="TinyLlama-LoRA · Trained on en-UK",
        description="TinyLlama-1.1B-Chat with a LoRA adapter fine-tuned only on British English sarcasm data (seed 42).",
        hf_model=TINYLLAMA_LORA_SARCASM_MODELS["en-UK"],
        icon="fa-microchip",
        task="sarcasm",
        trained_on="en-UK",
        lora_adapter=True,
    ),
}
# Registry key -> path of the joblib artifact, relative to the working directory.
# Every artifact is stored as models/<registry key>.joblib.
SKLEARN_ARTIFACTS = {
    model_key: os.path.join("models", f"{model_key}.joblib")
    for model_key in (
        "logreg_sarcasm",
        "lr_sarcasm",
        "svm_sarcasm",
        "rf_sarcasm",
        "lr_sentiment",
        "svm_sentiment",
        "rf_sentiment",
    )
}
def _task1_eval_rows(*rows):
    """Expand ``(model, family, accuracy, precision, recall, macro_f1)`` tuples into row dicts."""
    fields = ("model", "family", "accuracy", "precision", "recall", "macro_f1")
    return [dict(zip(fields, row)) for row in rows]


# Task 1 evaluation tables passed to the task1 template, keyed by task.
# NOTE(review): accuracy, precision, recall and macro_f1 are identical within
# every row below - confirm these are real results and not one copied metric.
TASK1_EVAL = {
    "sentiment": _task1_eval_rows(
        ("SVM", "Classical", 0.838, 0.838, 0.838, 0.838),
        ("Logistic Regression", "Classical", 0.830, 0.830, 0.830, 0.830),
        ("Random Forest", "Classical", 0.795, 0.795, 0.795, 0.795),
        ("RoBERTa", "PTLM", 0.896, 0.896, 0.896, 0.896),
        ("DistilBERT", "PTLM", 0.868, 0.868, 0.868, 0.868),
        ("ALBERT", "PTLM", 0.865, 0.865, 0.865, 0.865),
    ),
    "sarcasm": _task1_eval_rows(
        ("Random Forest", "Classical", 0.502, 0.502, 0.502, 0.502),
        ("SVM", "Classical", 0.585, 0.585, 0.585, 0.585),
        ("Logistic Regression", "Classical", 0.677, 0.677, 0.677, 0.677),
        ("RoBERTa", "PTLM", 0.462, 0.462, 0.462, 0.462),
        ("DistilBERT", "PTLM", 0.611, 0.611, 0.611, 0.611),
        ("ALBERT", "PTLM", 0.462, 0.462, 0.462, 0.462),
    ),
}
def _task2_eval_rows(*rows):
    """Expand ``(trained_on, tested_on, macro_f1, macro_f1_std, macro_p, macro_r)`` tuples into row dicts."""
    fields = ("trained_on", "tested_on", "macro_f1", "macro_f1_std", "macro_p", "macro_r")
    return [dict(zip(fields, row)) for row in rows]


# Task 2 cross-dialect evaluation tables passed to the task2 template.
# Each row is one (training dialect, test dialect) pair.
TASK2_EVAL = {
    "sentiment": _task2_eval_rows(
        ("en-AU", "en-AU", 0.8941, 0.0053, 0.8965, 0.8935),
        ("en-AU", "en-IN", 0.8196, 0.0203, 0.8250, 0.8228),
        ("en-AU", "en-UK", 0.9437, 0.0007, 0.9443, 0.9436),
        ("en-IN", "en-AU", 0.8659, 0.0047, 0.8737, 0.8654),
        ("en-IN", "en-IN", 0.8409, 0.0056, 0.8444, 0.8417),
        ("en-IN", "en-UK", 0.9323, 0.0040, 0.9328, 0.9325),
        ("en-UK", "en-AU", 0.8761, 0.0168, 0.8855, 0.8748),
        ("en-UK", "en-IN", 0.8463, 0.0008, 0.8467, 0.8463),
        ("en-UK", "en-UK", 0.9471, 0.0000, 0.9471, 0.9472),
        ("mixed", "en-AU", 0.8910, 0.0070, 0.8942, 0.8902),
        ("mixed", "en-IN", 0.8449, 0.0082, 0.8449, 0.8453),
        ("mixed", "en-UK", 0.9523, 0.0078, 0.9535, 0.9521),
    ),
    "sarcasm": _task2_eval_rows(
        ("en-AU", "en-AU", 0.7570, 0.0078, 0.7542, 0.7892),
        ("en-AU", "en-IN", 0.4896, 0.0170, 0.5531, 0.6991),
        ("en-AU", "en-UK", 0.5822, 0.0227, 0.6008, 0.8212),
        ("en-IN", "en-AU", 0.4628, 0.0617, 0.6261, 0.5228),
        ("en-IN", "en-IN", 0.6044, 0.0447, 0.6131, 0.6320),
        ("en-IN", "en-UK", 0.5533, 0.0902, 0.7034, 0.5617),
        ("en-UK", "en-AU", 0.6058, 0.0484, 0.6722, 0.6062),
        ("en-UK", "en-IN", 0.5747, 0.0166, 0.5772, 0.7117),
        ("en-UK", "en-UK", 0.7025, 0.0215, 0.7029, 0.7291),
    ),
}
def _task3_eval_rows(*rows):
    """Expand ``(trained_on, tested_on, macro_f1, macro_f1_std, macro_p, macro_r)`` tuples into row dicts."""
    fields = ("trained_on", "tested_on", "macro_f1", "macro_f1_std", "macro_p", "macro_r")
    return [dict(zip(fields, row)) for row in rows]


# Task 3 cross-dialect evaluation table passed to the task3 template.
# Each row is one (training dialect, test dialect) pair.
TASK3_EVAL = {
    "sarcasm": _task3_eval_rows(
        ("en-AU", "en-AU", 0.7603, 0.0291, 0.7588, 0.7902),
        ("en-AU", "en-IN", 0.5005, 0.0240, 0.5531, 0.6915),
        ("en-AU", "en-UK", 0.5805, 0.0413, 0.6031, 0.8315),
        ("en-IN", "en-AU", 0.5394, 0.1054, 0.7578, 0.5669),
        ("en-IN", "en-IN", 0.5964, 0.0817, 0.7411, 0.6797),
        ("en-IN", "en-UK", 0.6661, 0.1357, 0.7979, 0.7001),
        ("en-UK", "en-AU", 0.6003, 0.0171, 0.6388, 0.5951),
        ("en-UK", "en-IN", 0.6331, 0.0253, 0.6132, 0.7935),
        ("en-UK", "en-UK", 0.7724, 0.0088, 0.7418, 0.8183),
    ),
}
# Lazy-loaded pipeline cache: cache key -> loaded model object.
_pipelines = {}
# Guards model loading so a model is loaded and cached by one thread only.
_lock = threading.Lock()


def _load_lora_adapter(adapter_id):
    """Load a PEFT causal-LM adapter and its tokenizer.

    Returns a dict with the marker ``"_yes_no_lora": True`` plus the ``model``
    and ``tokenizer`` objects.
    """
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer

    print(f"[INFO] Loading PEFT causal-LM adapter: {adapter_id} ...")
    model = AutoPeftModelForCausalLM.from_pretrained(adapter_id)
    model.eval()
    try:
        tokenizer = AutoTokenizer.from_pretrained(adapter_id)
    except Exception:
        # Best-effort fallback: if no tokenizer can be loaded from the adapter
        # id, use the tokenizer of the base model named in the PEFT config.
        base_id = model.peft_config["default"].base_model_name_or_path
        tokenizer = AutoTokenizer.from_pretrained(base_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return {"_yes_no_lora": True, "model": model, "tokenizer": tokenizer}


def get_pipeline(model_key: str, lang_code: str = None):
    """Return the cached model object for ``model_key``, loading it on first use.

    Args:
        model_key: Key into ``MODEL_REGISTRY``.
        lang_code: Dialect code, used only for keys in ``LANG_AWARE_MODELS``.
            ``None`` or a code missing from that model's dialect map falls
            back to ``"en-UK"``.

    Returns:
        A transformers pipeline, a scikit-learn estimator loaded with joblib,
        or (for LoRA adapters) the dict built by ``_load_lora_adapter``.

    Raises:
        KeyError: If ``model_key`` is not in ``MODEL_REGISTRY`` and nothing is
            cached for it.
    """
    dialect_map = LANG_AWARE_MODELS.get(model_key)
    if dialect_map is not None:
        # Resolve the fallback BEFORE building the cache key, so that a
        # missing or unknown dialect shares the "en-UK" cache entry instead of
        # loading the same checkpoint a second time under another key.
        dialect = lang_code if lang_code in dialect_map else "en-UK"
        cache_key = f"{model_key}:{dialect}"
    else:
        dialect = None
        cache_key = model_key

    # Unlocked fast path for models that are already loaded.
    if cache_key in _pipelines:
        return _pipelines[cache_key]

    with _lock:
        # Re-check under the lock: another thread may have finished loading.
        if cache_key in _pipelines:
            return _pipelines[cache_key]

        cfg = MODEL_REGISTRY[model_key]
        if dialect_map is not None:
            hf_model = dialect_map[dialect]
            if model_key == "tinyllama_lora_sarcasm":
                pipe = _load_lora_adapter(hf_model)
            else:
                from transformers import pipeline

                print(f"[INFO] Loading dialect-aware model: {hf_model} ...")
                pipe = pipeline("sentiment-analysis", model=hf_model)
        elif cfg.get("lora_adapter"):
            pipe = _load_lora_adapter(cfg["hf_model"])
        elif model_key in SKLEARN_ARTIFACTS:
            import joblib

            artifact_path = SKLEARN_ARTIFACTS[model_key]
            print(f"[INFO] Loading sklearn artifact: {artifact_path} ...")
            # joblib files are pickles: only load artifacts shipped with the app.
            pipe = joblib.load(artifact_path)
            if hasattr(pipe, "named_steps"):
                # Pin the "clf" step, when it exposes n_jobs, to a single job.
                clf = pipe.named_steps.get("clf")
                if hasattr(clf, "n_jobs"):
                    clf.n_jobs = 1
        else:
            from transformers import pipeline

            print(f"[INFO] Loading model: {cfg['hf_model']} ...")
            pipe = pipeline(cfg["hf_task"], model=cfg["hf_model"])

        _pipelines[cache_key] = pipe
        return pipe
def _score_yes_no_lora(bundle, text):
    """Classify ``text`` by comparing the LoRA model's "Yes" and "No" next-token logits."""
    import torch

    lora_model = bundle["model"]
    tokenizer = bundle["tokenizer"]
    chat = [
        {"role": "user", "content": f'Is the following text sarcastic? Answer with only "Yes" or "No".\n\nText: "{text}"'}
    ]
    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    encoded = tokenizer(prompt, return_tensors="pt").to(lora_model.device)
    with torch.no_grad():
        # Logits for the token that would follow the prompt.
        next_token_logits = lora_model(**encoded).logits[0, -1]
    yes_id = tokenizer.encode("Yes", add_special_tokens=False)[0]
    no_id = tokenizer.encode("No", add_special_tokens=False)[0]
    # Softmax over just the two candidate answers.
    yes_no = torch.tensor([next_token_logits[yes_id].item(), next_token_logits[no_id].item()])
    prob_yes = float(torch.softmax(yes_no, dim=0)[0])
    if prob_yes > 0.5:
        return {"label": "SARCASTIC", "score": round(prob_yes, 4)}
    return {"label": "NOT_SARCASTIC", "score": round(1.0 - prob_yes, 4)}


def _score_sklearn(estimator, text, entry):
    """Classify ``text`` with a scikit-learn estimator and map the class to a label."""
    predicted = int(estimator.predict([text])[0])
    class_probs = estimator.predict_proba([text])[0]
    if entry["hf_task"] == "sarcasm-detection":
        label = "SARCASTIC" if predicted == 1 else "NOT_SARCASTIC"
    elif entry.get("task1_task") == "sentiment":
        label = "POSITIVE" if predicted == 1 else "NEGATIVE"
    else:
        label = str(predicted)
    return {"label": label, "score": round(float(max(class_probs)), 4)}


def _run_inference(model_key: str, text: str, language: str = "UK English"):
    """Run ``text`` through the model registered under ``model_key``.

    Args:
        model_key: Key into ``MODEL_REGISTRY``.
        text: Input text. For question answering it must contain the context
            and the question separated by ``[SEP]``.
        language: Dialect code; values that are not keys of ``BESSTIE_MODELS``
            are passed to ``get_pipeline`` as ``None``.

    Returns:
        A dict with task-specific fields plus ``model`` (display name),
        ``language`` (echoed as given) and ``family`` ("" when unset).

    Raises:
        ValueError: For question answering input without ``[SEP]``.
    """
    dialect = language if language in BESSTIE_MODELS else None
    loaded = get_pipeline(model_key, dialect)
    entry = MODEL_REGISTRY[model_key]
    hf_task = entry["hf_task"]

    if isinstance(loaded, dict) and loaded.get("_yes_no_lora"):
        outcome = _score_yes_no_lora(loaded, text)
    elif model_key in SKLEARN_ARTIFACTS:
        outcome = _score_sklearn(loaded, text, entry)
    elif hf_task == "question-answering":
        pieces = text.split("[SEP]")
        if len(pieces) < 2:
            raise ValueError("For QA, separate context and question with [SEP].")
        answer = loaded(question=pieces[1].strip(), context=pieces[0].strip())
        outcome = {"answer": answer["answer"], "score": round(answer["score"], 4)}
    elif hf_task == "zero-shot-classification":
        topics = ["politics", "technology", "sports", "health", "finance", "entertainment", "science"]
        ranked = loaded(text, candidate_labels=topics)
        outcome = {
            "labels": ranked["labels"][:5],
            "scores": [round(value, 4) for value in ranked["scores"][:5]],
        }
    elif hf_task == "ner":
        outcome = {
            "entities": [
                {"word": item["word"], "entity": item["entity"], "score": round(item["score"], 4)}
                for item in loaded(text)
            ]
        }
    elif hf_task == "summarization":
        # Cap the summary at half the input's word count, clamped to 30..130.
        length_cap = min(130, max(30, len(text.split()) // 2))
        summaries = loaded(text, max_length=length_cap, min_length=15, do_sample=False)
        outcome = {"summary": summaries[0]["summary_text"]}
    elif hf_task == "text-generation":
        generations = loaded(text, max_new_tokens=120, num_return_sequences=1, do_sample=True, temperature=0.8)
        outcome = {"generated": generations[0]["generated_text"]}
    elif hf_task == "fill-mask":
        candidates = loaded(text)
        outcome = {
            "predictions": [
                {"token": cand["token_str"], "score": round(cand["score"], 4), "sequence": cand["sequence"]}
                for cand in candidates[:5]
            ]
        }
    else:
        # Default: classification-style output, either flat or nested one level.
        scored = loaded(text)
        if isinstance(scored, list) and isinstance(scored[0], list):
            scored = scored[0]
        best = scored[0]
        outcome = {"label": best["label"], "score": round(best["score"], 4)}

    outcome["model"] = entry["name"]
    outcome["language"] = language
    outcome["family"] = entry.get("family", "")
    return outcome
| # --------------------------------------------------------------------------- | |
| # Routes - pages | |
| # --------------------------------------------------------------------------- | |
def home():
    """Render ``home.html`` with the registry entries named in ``HOME_MODEL_KEYS``."""
    # NOTE(review): no @app.route decorator is visible on this view in this
    # copy of the file - confirm it is registered with `app`.
    featured = {}
    for key in HOME_MODEL_KEYS:
        featured[key] = MODEL_REGISTRY[key]
    return render_template("home.html", models=featured)
def task1():
    """Render ``task1.html`` with every ``task1`` registry entry and ``TASK1_EVAL``."""
    # NOTE(review): no @app.route decorator is visible on this view in this
    # copy of the file - confirm it is registered with `app`.
    selected = dict(
        (key, entry)
        for key, entry in MODEL_REGISTRY.items()
        if entry.get("task_group") == "task1"
    )
    return render_template("task1.html", models=selected, eval_tables=TASK1_EVAL)
def task2():
    """Render ``task2.html`` with every ``task2`` registry entry and ``TASK2_EVAL``."""
    # NOTE(review): no @app.route decorator is visible on this view in this
    # copy of the file - confirm it is registered with `app`.
    selected = dict(
        (key, entry)
        for key, entry in MODEL_REGISTRY.items()
        if entry.get("task_group") == "task2"
    )
    return render_template("task2.html", models=selected, eval_tables=TASK2_EVAL)
def task3():
    """Render ``task3.html`` with every ``task3`` registry entry and ``TASK3_EVAL``."""
    # NOTE(review): no @app.route decorator is visible on this view in this
    # copy of the file - confirm it is registered with `app`.
    selected = dict(
        (key, entry)
        for key, entry in MODEL_REGISTRY.items()
        if entry.get("task_group") == "task3"
    )
    return render_template("task3.html", models=selected, eval_tables=TASK3_EVAL)
| # --------------------------------------------------------------------------- | |
| # API - model inference | |
| # --------------------------------------------------------------------------- | |
def infer():
    """Run inference on the selected model and return JSON results.

    Expects a JSON object with ``text``, ``model`` and an optional
    ``language`` (default ``"UK English"``).

    Returns:
        The ``_run_inference`` result as JSON, or ``{"error": ...}`` with
        status 400 for invalid input (including ``ValueError`` raised during
        inference) and 500 for any other inference failure.
    """
    # NOTE(review): no @app.route decorator is visible on this view in this
    # copy of the file - confirm it is registered with `app`.
    # silent=True turns malformed JSON into None so it gets the same JSON
    # error shape as every other validation failure.
    data = request.get_json(force=True, silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    text = data.get("text", "")
    model_key = data.get("model", "")
    language = data.get("language", "UK English")

    # Type checks come first: a non-string text has no .strip(), and an
    # unhashable model/language would raise TypeError in the lookups below.
    if not isinstance(text, str) or not text.strip():
        return jsonify({"error": "Please enter some text."}), 400
    text = text.strip()
    if not isinstance(model_key, str) or model_key not in MODEL_REGISTRY:
        return jsonify({"error": f"Unknown model: {model_key}"}), 400
    if not isinstance(language, str):
        return jsonify({"error": "Language must be a string."}), 400

    lang_code = language if language in BESSTIE_MODELS else None
    if model_key in LANG_AWARE_MODELS and not lang_code:
        return jsonify({"error": "Please select a language dialect for this model."}), 400

    try:
        return jsonify(_run_inference(model_key, text, language))
    except ValueError as exc:
        return jsonify({"error": str(exc)}), 400
    except Exception as exc:
        return jsonify({"error": str(exc)}), 500
def _task_model_infer(task_number, allowed_tasks, with_trained_on):
    """Validate a per-task inference request and run the requested model.

    Args:
        task_number: 1, 2 or 3; selects the ``task<N>`` registry group and the
            ``task<N>_task`` registry key.
        allowed_tasks: Tuple of task names accepted in the request's ``task``.
        with_trained_on: When true, the entry's ``trained_on`` value ("" if
            unset) is added to the result.

    Returns:
        The inference result as JSON, ``{"error": ...}`` with status 400 for
        invalid input, or ``{"error": ..., "model": ...}`` with status 500
        when inference fails.
    """
    # silent=True turns malformed JSON into None so it gets the same JSON
    # error shape as every other validation failure.
    data = request.get_json(force=True, silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    text = data.get("text", "")
    task = data.get("task", "")
    model_key = data.get("model_key", "")

    # Type checks come first: a non-string text has no .strip(), and an
    # unhashable model_key would raise TypeError in the registry lookup.
    if not isinstance(text, str) or not text.strip():
        return jsonify({"error": "Please enter some text."}), 400
    text = text.strip()
    if not isinstance(task, str) or task not in allowed_tasks:
        return jsonify({"error": f"Task must be {' or '.join(allowed_tasks)}."}), 400
    if not isinstance(model_key, str) or model_key not in MODEL_REGISTRY:
        return jsonify({"error": f"Unknown model: {model_key}"}), 400

    group = f"task{task_number}"
    cfg = MODEL_REGISTRY[model_key]
    if cfg.get("task_group") != group:
        return jsonify({"error": f"{model_key} is not a Task {task_number} model."}), 400
    if cfg.get(f"{group}_task") != task:
        return jsonify({"error": f"{model_key} does not belong to the {task} task."}), 400

    try:
        result = _run_inference(model_key, text)
        if with_trained_on:
            result["trained_on"] = cfg.get("trained_on", "")
        return jsonify(result)
    except Exception as exc:
        return jsonify({"error": str(exc), "model": cfg["name"]}), 500


def task1_infer():
    """Run one Task 1 model so the frontend can fan out parallel requests."""
    # NOTE(review): no @app.route decorator is visible on this view in this
    # copy of the file - confirm it is registered with `app`.
    return _task_model_infer(1, ("sentiment", "sarcasm"), with_trained_on=False)


def task2_infer():
    """Run one Task 2 model so the frontend can fan out parallel requests."""
    # NOTE(review): no @app.route decorator is visible on this view in this
    # copy of the file - confirm it is registered with `app`.
    return _task_model_infer(2, ("sentiment", "sarcasm"), with_trained_on=True)


def task3_infer():
    """Run one Task 3 model so the frontend can fan out parallel requests."""
    # NOTE(review): no @app.route decorator is visible on this view in this
    # copy of the file - confirm it is registered with `app`.
    return _task_model_infer(3, ("sarcasm",), with_trained_on=True)
| # --------------------------------------------------------------------------- | |
def _env_flag(name, default=False):
    """Return True when environment variable ``name`` holds a truthy value.

    "1", "true", "yes" and "on" (case-insensitive, surrounding whitespace
    ignored) count as true; any other value is false. ``default`` is returned
    when the variable is unset.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    return raw.strip().lower() in {"1", "true", "yes", "on"}


if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))
    # The server binds to every interface, so debug mode must not be on by
    # default: the Werkzeug debugger lets a client execute arbitrary Python.
    # Set FLASK_DEBUG=1 to opt in during local development.
    app.run(host="0.0.0.0", debug=_env_flag("FLASK_DEBUG"), port=port)