"""Local Vietnamese sentiment analysis helpers for a Streamlit app.

Provides a loader for a Hugging Face text-classification pipeline and a
prediction helper that turns the pipeline output into a display label
plus a confidence score.
"""

import logging
from typing import Tuple

import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from underthesea import word_tokenize
# from src.text_preprocessor import preprocess_text

_logger = logging.getLogger(__name__)

MODEL_NAME = "ndyah2020/phobert-base-v2-vsmec-finetuned"
# MODEL_NAME = "wonrax/phobert-base-vietnamese-sentiment"

# Result returned whenever a prediction cannot be produced.
_ERROR_RESULT = ("Lỗi", 0.0)

# Maps the pipeline's short labels to the labels shown to the user.
# NOTE(review): only NEG/POS/NEU are mapped; any other label emitted by
# MODEL_NAME falls back to _UNKNOWN_LABEL -- confirm the model's label set.
_LABEL_MAP = {
    "NEG": "NEGATIVE",
    "POS": "POSITIVE",
    "NEU": "NEUTRAL",
}
_UNKNOWN_LABEL = "Không xác định"

# Below this score the prediction is reported as an unclear NEUTRAL.
_CONFIDENCE_THRESHOLD = 0.5
_VERY_CLEAR_THRESHOLD = 0.85
_FAIRLY_CLEAR_THRESHOLD = 0.7


# NOTE(review): caching is currently disabled, so every call reloads the
# tokenizer and the model -- confirm whether this is intentional.
# @st.cache_resource
def load_sentiment_pipeline():
    """Load the tokenizer and model and build the sentiment pipeline.

    Reports the outcome in the Streamlit UI via ``st.success`` or
    ``st.error``.

    Returns:
        The "sentiment-analysis" pipeline (truncating inputs to 256
        tokens), or ``None`` if loading failed for any reason.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
        sentiment_pipeline = pipeline(
            "sentiment-analysis",
            model=model,
            tokenizer=tokenizer,
            truncation=True,
            max_length=256,
        )
    except Exception as e:
        # UI boundary: show the failure to the user instead of crashing.
        st.error(f"❌ Lỗi khi tải mô hình local: {e}")
        return None
    st.success("✅ Mô hình (local) đã sẵn sàng!")
    return sentiment_pipeline


def _strength_suffix(confidence: float) -> str:
    """Return the suffix describing how clear-cut a confident prediction is."""
    if confidence >= _VERY_CLEAR_THRESHOLD:
        return " (rất rõ)"
    if confidence >= _FAIRLY_CLEAR_THRESHOLD:
        return " (khá rõ)"
    return " (hơi nhẹ)"


def predict_sentiment(text: str, sentiment_pipeline) -> Tuple[str, float]:
    """Predict the sentiment of ``text`` with ``sentiment_pipeline``.

    Args:
        text: Raw input text.
        sentiment_pipeline: Pipeline returned by
            ``load_sentiment_pipeline``, or ``None`` if loading failed.

    Returns:
        A ``(label, confidence)`` tuple. ``label`` is the mapped label
        followed by a strength suffix. When the confidence is below 0.5
        the label is forced to NEUTRAL with an "unclear" suffix, while the
        returned confidence is still the score of the original prediction.
        ``("Lỗi", 0.0)`` is returned when the input is empty or blank, the
        pipeline is missing, or prediction raises.
    """
    if sentiment_pipeline is None or not isinstance(text, str) or not text.strip():
        return _ERROR_RESULT

    try:
        # underthesea returns multi-syllable words as single tokens that
        # contain spaces; join their syllables with "_" so the word
        # boundaries survive the space-join below (same result as
        # word_tokenize(text, format="text")).
        segmented_text = " ".join(
            token.replace(" ", "_") for token in word_tokenize(text)
        )
        result = sentiment_pipeline(segmented_text)[0]
        raw_label = result["label"].upper()
        confidence = float(result["score"])
    except Exception as e:
        # Best-effort: record the failure and return the error result.
        _logger.exception("Lỗi khi dự đoán: %s", e)
        return _ERROR_RESULT

    if confidence < _CONFIDENCE_THRESHOLD:
        # Confidence too low: force the label to neutral.
        return _LABEL_MAP["NEU"] + " (không rõ)", confidence

    # Confidence high enough: use the predicted label.
    label = _LABEL_MAP.get(raw_label, _UNKNOWN_LABEL)
    return label + _strength_suffix(confidence), confidence