# NOTE(review): the three lines below were scraped Hugging Face Spaces page
# chrome ("Spaces: Sleeping / Sleeping"), not source code; kept here as a
# comment so the file parses.
| import torch | |
| import joblib | |
| import re | |
| import unicodedata | |
| from fastapi import FastAPI, Body | |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
| # ========================= | |
| # Text preprocessing (giống lúc train) | |
| # ========================= | |
| def preprocess_text(text): | |
| if not text: | |
| return "" | |
| # Normalize unicode | |
| text = unicodedata.normalize("NFC", text) | |
| # lowercase | |
| text = text.lower() | |
| # remove links | |
| text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE) | |
| # remove mention + hashtag | |
| text = re.sub(r"@\w+|#\w+", "", text) | |
| # remove special characters (giữ punctuation cơ bản) | |
| text = re.sub(r"[^\w\s,.!?]", "", text) | |
| # remove extra whitespace | |
| text = re.sub(r"\s+", " ", text).strip() | |
| return text | |
# =========================
# Load model
# =========================
# Directory containing the fine-tuned PhoBERT checkpoint (tokenizer files
# plus classification-head weights).
MODEL_PATH = "./phobert_sentiment_model_final"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
# Inference only: switch off dropout / other training-mode behavior.
model.eval()
# Label encoder fitted at training time; maps predicted class ids back to
# their string labels via inverse_transform.
label_encoder = joblib.load("./label_encoder.pkl")
# =========================
# FastAPI
# =========================
app = FastAPI(
    title="PhoBERT Sentiment API",
    version="1.0",
)


# =========================
# Health
# =========================
# FIX: these handlers were plain functions with no route decorators, so
# FastAPI never registered them; register them explicitly.
@app.get("/")
def root():
    """Root endpoint: confirms the service is running."""
    return {"message": "PhoBERT Sentiment API running"}


@app.get("/health")
def health():
    """Liveness probe for orchestrators / load balancers."""
    return {"status": "ok"}
# =========================
# Predict comments
# =========================
# FIX: the handler had no route decorator, so it was never exposed.
@app.post("/predict")
def predict_comments(data=Body(...)):
    """Classify sentiment for each comment in the request payload.

    Accepts either a dict or a list of dicts (n8n often posts a
    single-element list); only the first element of a list is used.
    Expects ``data["info_comment"]`` to be a list of dicts, each carrying
    a ``"comment"`` string. Each comment dict is annotated in place with
    ``"label"`` and ``"confidence"``, and the (mutated) payload is
    returned unchanged otherwise.
    """
    # Support both list and dict payloads (n8n tends to send a list).
    if isinstance(data, list):
        if not data:
            # FIX: an empty list previously raised IndexError on data[0].
            return data
        data = data[0]
    comments = data.get("info_comment", [])
    if not comments:
        return data
    # Raw comment strings, in request order.
    original_texts = [c.get("comment", "") for c in comments]
    # Apply the same preprocessing used at training time.
    clean_texts = [preprocess_text(t) for t in original_texts]
    inputs = tokenizer(
        clean_texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    # Inference without gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
    pred_ids = torch.argmax(probs, dim=1).tolist()
    labels = label_encoder.inverse_transform(pred_ids)
    # Attach label + confidence to each comment dict.
    for i, comment in enumerate(comments):
        # FIX: inverse_transform yields numpy.str_; cast to plain str so
        # the JSON response carries native types.
        comment["label"] = str(labels[i])
        comment["confidence"] = round(probs[i][pred_ids[i]].item(), 4)
    return data