Spaces:
Sleeping
Sleeping
| from flask import Flask, request, render_template | |
| from transformers import pipeline | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.svm import LinearSVC | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.tree import DecisionTreeClassifier | |
| import polars as pl | |
| import joblib | |
| from pathlib import Path | |
| import logging | |
| import os | |
| from time import perf_counter | |
| from typing import Optional, Tuple | |
| logging.basicConfig( | |
| level=logging.INFO, | |
| format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' | |
| ) | |
| logger = logging.getLogger(__name__) | |
| app = Flask(__name__) | |
| CLASS_ID_TO_SENTIMENT = { | |
| "0": "negative", | |
| "1": "neutral", | |
| "2": "positive" | |
| } | |
| def categorize_probability(probability: Optional[float]) -> Tuple[str, str, str]: | |
| """ | |
| Map a probability (0-1) to a qualitative label and associated CSS modifier. | |
| Returns (label, css_class, display_value). | |
| """ | |
| if probability is None: | |
| return ("Unknown", "probability-unknown", "N/A") | |
| percent = max(0.0, min(probability * 100.0, 100.0)) | |
| if percent >= 80: | |
| return ("Definitely", "probability-definitely", f"{percent:.0f}%") | |
| if percent >= 60: | |
| return ("Probably", "probability-probably", f"{percent:.0f}%") | |
| return ("Maybe", "probability-maybe", f"{percent:.0f}%") | |
| PRESET_TEXTS = [ | |
| "flower isn't beautiful", | |
| "there is no more love. only pain.", | |
| "one isn't a beauty, but two is a wondrous wonder", | |
| "hvl is a fake university #uibforever" | |
| ] | |
| # Use HF Spaces persistent storage if available, otherwise local cache | |
| CACHE_DIR = Path(os.getenv("HF_HOME", ".")) / ".model_cache" | |
| CACHE_DIR.mkdir(exist_ok=True) | |
| logger.info("Loading BERTweet from HuggingFace Hub...") | |
| bertweet_pipeline = pipeline("sentiment-analysis", model="kluvin/bertweet-tweet-sentiment") | |
| logger.info("BERTweet loaded successfully") | |
| # Define model configurations | |
| model_configs = { | |
| "Decision Tree": Pipeline([ | |
| ("tfidf", TfidfVectorizer(max_features=2000, stop_words="english")), | |
| ("clf", DecisionTreeClassifier(max_depth=10, random_state=42)) | |
| ]), | |
| "Random Forest": Pipeline([ | |
| ("tfidf", TfidfVectorizer(max_features=500, stop_words="english")), | |
| ("clf", RandomForestClassifier(n_estimators=100, random_state=42)) | |
| ]), | |
| "Logistic Regression": Pipeline([ | |
| ("tfidf", TfidfVectorizer(max_features=2000, stop_words="english")), | |
| ("clf", LogisticRegression(max_iter=1000, random_state=42)) | |
| ]), | |
| "Linear SVM": Pipeline([ | |
| ("tfidf", TfidfVectorizer(max_features=2000, stop_words="english")), | |
| ("clf", LinearSVC(random_state=42)) | |
| ]) | |
| } | |
| sklearn_pipelines = {} | |
| cache_file = CACHE_DIR / "ml_models.joblib" | |
| if cache_file.exists(): | |
| logger.info("Loading cached ML models...") | |
| try: | |
| sklearn_pipelines = joblib.load(cache_file) | |
| logger.info("✓ Cached models loaded successfully!") | |
| except Exception as e: | |
| logger.error(f"Failed to load cache: {e}") | |
| logger.info("Will retrain models...") | |
| if not sklearn_pipelines: | |
| logger.info("Loading training data and training ML models...") | |
| splits = {'train': 'train.jsonl'} | |
| df = pl.read_ndjson('hf://datasets/SetFit/tweet_sentiment_extraction/' + splits['train']) | |
| X_train = df['text'].to_list() | |
| y_train = df['label'].to_list() | |
| logger.info("Training models...") | |
| for model_name, sklearn_pipeline in model_configs.items(): | |
| logger.info(f" Training {model_name}...") | |
| sklearn_pipeline.fit(X_train, y_train) | |
| sklearn_pipelines[model_name] = sklearn_pipeline | |
| logger.info("Saving models to cache...") | |
| joblib.dump(sklearn_pipelines, cache_file) | |
| logger.info(f"✓ Models cached at {cache_file}") | |
| logger.info("All models loaded and ready!") | |
| def render_model_result(model_name: str, sentiment_name: str, probability: float | None) -> str: | |
| probability_label, probability_css, probability_value = categorize_probability(probability) | |
| return f''' | |
| <div class="model-result {sentiment_name}"> | |
| <h3>{model_name}</h3> | |
| <p class="sentiment">{sentiment_name.capitalize()}</p> | |
| <p class="confidence"> | |
| <span class="probability-badge {probability_css}"> | |
| <span class="probability-label">{probability_label}</span> | |
| <span class="probability-value">{probability_value}</span> | |
| </span> | |
| </p> | |
| </div> | |
| ''' | |
| def build_results_markup(text_input: str) -> str: | |
| inference_start = perf_counter() | |
| results_html = "" | |
| pipeline_output = bertweet_pipeline(text_input)[0] | |
| predicted_class_id = pipeline_output['label'] | |
| probability = pipeline_output['score'] | |
| sentiment_name = CLASS_ID_TO_SENTIMENT[predicted_class_id] | |
| logger.info(f"BERTweet prediction: {text_input} -> {sentiment_name} ({probability:.4f})") | |
| results_html += render_model_result("BERTweet (Transformer)", sentiment_name, probability) | |
| for model_name, sklearn_pipeline in sklearn_pipelines.items(): | |
| inputs = [text_input] | |
| predicted_class = sklearn_pipeline.predict(inputs)[0] | |
| classifier = sklearn_pipeline.named_steps['clf'] | |
| if hasattr(classifier, 'predict_proba'): | |
| class_probabilities = sklearn_pipeline.predict_proba(inputs)[0] | |
| probability = class_probabilities.max() | |
| elif hasattr(classifier, 'decision_function'): | |
| decision_scores = sklearn_pipeline.decision_function(inputs)[0] | |
| probability = 1.0 / (1.0 + abs(decision_scores.min())) | |
| else: | |
| probability = None | |
| sentiment_name = CLASS_ID_TO_SENTIMENT[str(predicted_class)] | |
| results_html += render_model_result(model_name, sentiment_name, probability) | |
| elapsed_ms = (perf_counter() - inference_start) * 1000 | |
| return ( | |
| f'<aside class="inference-meta">Inference time: {elapsed_ms:.0f} ms</aside>' | |
| f'<div class="results-grid">{results_html}</div>' | |
| ) | |
| def home(): | |
| default_text = PRESET_TEXTS[0] | |
| initial_results_html = "" | |
| try: | |
| logger.info("Precomputing initial classification for default preset...") | |
| initial_results_html = build_results_markup(default_text) | |
| except Exception as e: | |
| logger.error(f"Failed to precompute initial results: {e}", exc_info=True) | |
| return render_template( | |
| 'index.html', | |
| presets=PRESET_TEXTS, | |
| default_preset=default_text, | |
| initial_results=initial_results_html | |
| ) | |
| def classify(): | |
| try: | |
| text_input = request.form['text'] | |
| cleaned_text = text_input.strip() | |
| if not cleaned_text: | |
| return ''' | |
| <div class="result error"> | |
| <h2>Error: Please enter some text</h2> | |
| </div> | |
| ''' | |
| logger.info(f"Classifying: {cleaned_text[:50]}...") | |
| return build_results_markup(cleaned_text) | |
| except Exception as e: | |
| logger.error(f"Classification error: {e}", exc_info=True) | |
| return f''' | |
| <div class="result error"> | |
| <h2>Error: {e}</h2> | |
| </div> | |
| ''' | |
| if __name__ == "__main__": | |
| if app.debug: | |
| logger.setLevel(logging.DEBUG) | |
| app.run(debug=True) | |