Spaces:
Sleeping
Sleeping
File size: 2,232 Bytes
e6e7585 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from underthesea import word_tokenize
# from src.text_preprocessor import preprocess_text
# Model identifier passed to the transformers `from_pretrained` loaders in
# load_sentiment_pipeline().
MODEL_NAME = "ndyah2020/phobert-base-v2-vsmec-finetuned"
# Alternative checkpoint, kept for reference:
# MODEL_NAME = "wonrax/phobert-base-vietnamese-sentiment"
@st.cache_resource
def _build_sentiment_pipeline():
    """Build the text-classification pipeline for MODEL_NAME.

    Deliberately has no error handling: any loading error propagates to the
    caller, so only a successfully built pipeline is ever returned (and
    therefore cached) by this function.
    """
    # use_fast=False requests the slow (pure-Python) tokenizer implementation.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
    return pipeline(
        "sentiment-analysis",
        model=model,
        tokenizer=tokenizer,
        # Inputs longer than 256 tokens are truncated instead of raising.
        truncation=True,
        max_length=256,
    )


def load_sentiment_pipeline():
    """Return the shared sentiment pipeline, or ``None`` if loading failed.

    The pipeline itself is cached by ``_build_sentiment_pipeline``. The
    try/except lives in this uncached wrapper on purpose: when it sat inside
    the cached function, a failed load returned ``None`` and that ``None``
    became the cached value, so one transient error disabled the model until
    the cache was cleared. Now a failure is reported and the next call
    simply retries the load.

    Returns:
        The transformers pipeline on success. On any exception an
        ``st.error`` message is shown and ``None`` is returned.
    """
    try:
        sentiment_pipeline = _build_sentiment_pipeline()
    except Exception as e:
        st.error(f"❌ Lỗi khi tải mô hình local: {e}")
        return None
    else:
        st.success("✅ Mô hình (local) đã sẵn sàng!")
        return sentiment_pipeline


# Callers could previously invoke `load_sentiment_pipeline.clear()` because the
# public function was the cached object; keep that handle working.
load_sentiment_pipeline.clear = _build_sentiment_pipeline.clear
# Display names for the raw labels this module knows how to present.
# NOTE(review): only NEG/POS/NEU are mapped; any other label emitted by the
# model is shown as "Không xác định". Confirm that the label set of the
# configured checkpoint actually matches these keys.
_LABEL_MAP = {
    "NEG": "NEGATIVE",
    "POS": "POSITIVE",
    "NEU": "NEUTRAL",
}

# Below this score the prediction is not trusted and is reported as neutral.
_CONFIDENCE_THRESHOLD = 0.5
# Score cut-offs for the "very clear" / "fairly clear" suffixes.
_VERY_CLEAR_THRESHOLD = 0.85
_FAIRLY_CLEAR_THRESHOLD = 0.7


def _describe_prediction(raw_label, confidence):
    """Turn an upper-cased model label and its score into the display label."""
    if confidence < _CONFIDENCE_THRESHOLD:
        # Confidence too low: force the result to neutral and mark it unclear.
        return _LABEL_MAP["NEU"] + " (không rõ)"

    # Confidence is high enough: use the predicted label.
    label = _LABEL_MAP.get(raw_label, "Không xác định")
    if confidence >= _VERY_CLEAR_THRESHOLD:
        return label + " (rất rõ)"
    if confidence >= _FAIRLY_CLEAR_THRESHOLD:
        return label + " (khá rõ)"
    return label + " (hơi nhẹ)"


def predict_sentiment(text: str, sentiment_pipeline):
    """Classify ``text`` and return ``(display_label, confidence)``.

    Args:
        text: The sentence to classify.
        sentiment_pipeline: A callable that takes the segmented text and
            returns a sequence whose first item is a mapping with ``"label"``
            and ``"score"`` keys (as produced by ``load_sentiment_pipeline``),
            or ``None`` if the model is unavailable.

    Returns:
        A ``(label, confidence)`` tuple. ``("Lỗi", 0.0)`` is returned when
        the pipeline is ``None``, when ``text`` is not a string or contains
        only whitespace, or when prediction raises.
    """
    # Reject missing pipeline, non-string input, and blank/whitespace-only
    # text up front. Previously only falsy text was rejected, so a string of
    # spaces was segmented into an empty string and still sent to the model.
    if (
        sentiment_pipeline is None
        or not isinstance(text, str)
        or not text.strip()
    ):
        return "Lỗi", 0.0

    try:
        # Re-join the tokens returned by word_tokenize with single spaces
        # before handing the text to the model.
        segmented_text = " ".join(word_tokenize(text))
        result = sentiment_pipeline(segmented_text)[0]
        raw_label = result["label"].upper()
        confidence = float(result["score"])
        return _describe_prediction(raw_label, confidence), confidence
    except Exception as e:
        # Best-effort boundary: report the problem and return the error tuple
        # instead of propagating the exception to the caller.
        print(f"Lỗi khi dự đoán: {e}")
        return "Lỗi", 0.0
|