import re
import unicodedata

import joblib
import torch
from fastapi import Body, FastAPI
from transformers import AutoModelForSequenceClassification, AutoTokenizer


# =========================
# Text preprocessing (must match the pipeline used at training time)
# =========================
def preprocess_text(text):
    """Normalize raw comment text exactly as done during training.

    Steps: NFC unicode normalization, lowercasing, URL removal,
    mention/hashtag removal, special-character removal (basic
    punctuation kept), and whitespace collapsing.

    Returns "" for falsy input (None, empty string).
    """
    if not text:
        return ""
    # Normalize unicode (NFC composes Vietnamese diacritics consistently)
    text = unicodedata.normalize("NFC", text)
    text = text.lower()
    # Remove links
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    # Remove mentions + hashtags
    text = re.sub(r"@\w+|#\w+", "", text)
    # Remove special characters (keep basic punctuation)
    text = re.sub(r"[^\w\s,.!?]", "", text)
    # Collapse extra whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text


# =========================
# Load model (once, at startup)
# =========================
MODEL_PATH = "./phobert_sentiment_model_final"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
model.eval()  # inference mode: disables dropout etc.

# Maps predicted class ids back to string labels (fitted at training time)
label_encoder = joblib.load("./label_encoder.pkl")


# =========================
# FastAPI
# =========================
app = FastAPI(
    title="PhoBERT Sentiment API",
    version="1.0"
)


# =========================
# Health
# =========================
@app.get("/")
def root():
    """Liveness message for the service root."""
    return {"message": "PhoBERT Sentiment API running"}


@app.get("/health")
def health():
    """Simple health-check endpoint."""
    return {"status": "ok"}


# =========================
# Predict comments
# =========================
@app.post("/predict_comments")
def predict_comments(data=Body(...)):
    """Classify every comment in the payload and attach label + confidence.

    Accepts either a dict or a list wrapping dicts (n8n often sends a
    one-element list); in the list case only the FIRST element is used.
    Expected payload shape: {"info_comment": [{"comment": "..."}, ...]}.
    The payload is mutated in place (each comment gains "label" and
    "confidence" keys) and returned. Returns the payload unchanged when
    "info_comment" is missing or empty.
    """
    # Support both list and dict payloads (n8n tends to send a list)
    if isinstance(data, list):
        data = data[0]

    comments = data.get("info_comment", [])
    if not comments:
        return data

    # Original comment strings (missing key -> empty string)
    original_texts = [c.get("comment", "") for c in comments]
    # Same preprocessing as at training time
    clean_texts = [preprocess_text(t) for t in original_texts]

    inputs = tokenizer(
        clean_texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )

    # Inference without gradient tracking
    with torch.no_grad():
        outputs = model(**inputs)

    probs = torch.softmax(outputs.logits, dim=1)
    # One pass over the class axis yields both confidence and predicted id
    confidences, pred_ids = torch.max(probs, dim=1)
    labels = label_encoder.inverse_transform(pred_ids.tolist())

    # Attach results; cast labels to plain str so JSON serialization is
    # safe even when inverse_transform returns numpy scalar types.
    for comment, label, conf in zip(comments, labels, confidences.tolist()):
        comment["label"] = str(label)
        comment["confidence"] = round(conf, 4)

    return data