import re
import unicodedata

import joblib
import torch
from fastapi import Body, FastAPI
from transformers import AutoModelForSequenceClassification, AutoTokenizer


# =========================
# Text preprocessing (must match the pipeline used at training time)
# =========================
def preprocess_text(text):
    """Normalize raw comment text exactly as done during training.

    Steps: NFC unicode normalization, lowercasing, URL removal,
    mention/hashtag removal, special-character removal (basic
    punctuation kept), and whitespace collapsing.

    Returns "" for falsy input (None, empty string).
    """
    if not text:
        return ""
    # Normalize unicode (NFC composes Vietnamese diacritics consistently)
    text = unicodedata.normalize("NFC", text)
    text = text.lower()
    # Remove links
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    # Remove mentions + hashtags
    text = re.sub(r"@\w+|#\w+", "", text)
    # Remove special characters (keep basic punctuation)
    text = re.sub(r"[^\w\s,.!?]", "", text)
    # Collapse extra whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text


# =========================
# Load model (once, at startup)
# =========================
MODEL_PATH = "./phobert_sentiment_model_final"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
model.eval()  # inference mode: disables dropout etc.

# Maps predicted class ids back to string labels (fitted at training time)
label_encoder = joblib.load("./label_encoder.pkl")


# =========================
# FastAPI
# =========================
app = FastAPI(
    title="PhoBERT Sentiment API",
    version="1.0"
)


# =========================
# Health
# =========================
@app.get("/")
def root():
    """Liveness message for the service root."""
    return {"message": "PhoBERT Sentiment API running"}


@app.get("/health")
def health():
    """Simple health-check endpoint."""
    return {"status": "ok"}


# =========================
# Predict comments
# =========================
@app.post("/predict_comments")
def predict_comments(data=Body(...)):
    """Classify every comment in the payload and attach label + confidence.

    Accepts either a dict or a list wrapping dicts (n8n often sends a
    one-element list); in the list case only the FIRST element is used.
    Expected payload shape: {"info_comment": [{"comment": "..."}, ...]}.
    The payload is mutated in place (each comment gains "label" and
    "confidence" keys) and returned. Returns the payload unchanged when
    "info_comment" is missing or empty.
    """
    # Support both list and dict payloads (n8n tends to send a list)
    if isinstance(data, list):
        data = data[0]

    comments = data.get("info_comment", [])
    if not comments:
        return data

    # Original comment strings (missing key -> empty string)
    original_texts = [c.get("comment", "") for c in comments]
    # Same preprocessing as at training time
    clean_texts = [preprocess_text(t) for t in original_texts]

    inputs = tokenizer(
        clean_texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )

    # Inference without gradient tracking
    with torch.no_grad():
        outputs = model(**inputs)

    probs = torch.softmax(outputs.logits, dim=1)
    # One pass over the class axis yields both confidence and predicted id
    confidences, pred_ids = torch.max(probs, dim=1)
    labels = label_encoder.inverse_transform(pred_ids.tolist())

    # Attach results; cast labels to plain str so JSON serialization is
    # safe even when inverse_transform returns numpy scalar types.
    for comment, label, conf in zip(comments, labels, confidences.tolist()):
        comment["label"] = str(label)
        comment["confidence"] = round(conf, 4)

    return data