final_sentiment / app.py
tunglee7it's picture
Update app.py
11dde42 verified
import re
import unicodedata

import joblib
import torch
from fastapi import Body, FastAPI, HTTPException
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# =========================
# Text preprocessing (same as at training time)
# =========================
def preprocess_text(text):
    """Normalize a raw comment before it is tokenized.

    The steps run in a fixed order: Unicode NFC normalization, lowercasing,
    removal of links, removal of @mentions and #hashtags, removal of every
    character that is not a word character, whitespace or one of ``,.!?``,
    and finally collapsing whitespace runs to a single space.

    Args:
        text: The raw comment. Any falsy value (``None``, ``""``) is accepted.

    Returns:
        The cleaned string, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""

    cleaned = unicodedata.normalize("NFC", text).lower()

    # (pattern, replacement, flags) applied top to bottom; order matters
    # because later rules see the output of earlier ones.
    substitutions = (
        (r"http\S+|www\S+|https\S+", "", re.MULTILINE),  # links
        (r"@\w+|#\w+", "", 0),                           # mentions + hashtags
        (r"[^\w\s,.!?]", "", 0),                         # keep basic punctuation only
        (r"\s+", " ", 0),                                # collapse whitespace
    )
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    return cleaned.strip()
# =========================
# Load model
# =========================
# Directory the tokenizer and model are loaded from (presumably the fine-tuned
# PhoBERT sentiment checkpoint). The path is relative, so it depends on the
# working directory the process is started from.
MODEL_PATH = "./phobert_sentiment_model_final"
# Both objects are loaded once, at import time, from the same directory.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
# Put the model in evaluation mode so training-only behaviour (e.g. dropout)
# is disabled during inference.
model.eval()
# Used by predict_comments to map predicted class ids back to labels through
# inverse_transform().
# NOTE(review): joblib.load unpickles the file, so this path must only ever
# point at a trusted artifact.
label_encoder = joblib.load("./label_encoder.pkl")
# =========================
# FastAPI
# =========================
# Application instance that the route decorators in this module attach to.
app = FastAPI(title="PhoBERT Sentiment API", version="1.0")
# =========================
# Health
# =========================
@app.get("/")
def root():
    """Return a short banner confirming that the API is running."""
    banner = {"message": "PhoBERT Sentiment API running"}
    return banner
@app.get("/health")
def health():
    """Return a fixed status payload for health checks."""
    status_payload = {"status": "ok"}
    return status_payload
# =========================
# Predict comments
# =========================
@app.post("/predict_comments")
def predict_comments(data=Body(...)):
    """Classify every comment in the request and annotate it in place.

    Args:
        data: The parsed JSON body. Either an object with an
            ``"info_comment"`` list, or a list whose first element is such an
            object (n8n often sends a list). Only the first element of a list
            is processed. Each entry of ``"info_comment"`` is an object whose
            ``"comment"`` value is the text to classify.

    Returns:
        The same object that was received, with ``"label"`` and
        ``"confidence"`` (softmax probability of the predicted class, rounded
        to 4 decimals) added to every comment. The body is returned unchanged
        when it is an empty list or has no comments.

    Raises:
        HTTPException: 422 when the body is not a JSON object (or a list
            starting with one), or when ``"info_comment"`` is not a list of
            objects.
    """
    # Accept both a list and a dict (n8n often sends a list).
    if isinstance(data, list):
        if not data:
            # Nothing to classify; echo the empty payload instead of failing
            # on data[0].
            return data
        data = data[0]

    if not isinstance(data, dict):
        raise HTTPException(
            status_code=422,
            detail="Request body must be a JSON object or a list containing one.",
        )

    comments = data.get("info_comment", [])
    if not comments:
        return data
    if not isinstance(comments, list) or not all(
        isinstance(entry, dict) for entry in comments
    ):
        raise HTTPException(
            status_code=422,
            detail="'info_comment' must be a list of objects.",
        )

    # Original comment texts.
    original_texts = [entry.get("comment", "") for entry in comments]

    # Same preprocessing as at training time. Falsy values are passed through
    # (preprocess_text maps them to ""); any other non-string value is
    # stringified so Unicode normalization does not raise on it.
    clean_texts = [
        preprocess_text(raw if isinstance(raw, str) or not raw else str(raw))
        for raw in original_texts
    ]

    # Tokenize the whole request as one padded batch.
    inputs = tokenizer(
        clean_texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )

    # Inference without gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        # max over the class dimension yields the winning probability and its
        # class index in one pass.
        top_probs, top_ids = probs.max(dim=1)

    pred_ids = top_ids.tolist()
    confidences = top_probs.tolist()
    labels = label_encoder.inverse_transform(pred_ids)

    # Attach label + confidence to the JSON payload.
    for entry, label, confidence in zip(comments, labels, confidences):
        # inverse_transform may yield NumPy scalars; convert them to native
        # Python values so the response is always JSON-serializable.
        entry["label"] = label.item() if hasattr(label, "item") else label
        entry["confidence"] = round(confidence, 4)

    return data