Spaces:

tunglee7it
/

final_sentiment

Sleeping

App Files Files Community

final_sentiment / app.py

tunglee7it

Update app.py

11dde42 verified 23 days ago

raw

history blame contribute delete

2.63 kB

	import torch
	import joblib
	import re
	import unicodedata

	from fastapi import FastAPI, Body
	from transformers import AutoTokenizer, AutoModelForSequenceClassification


	# =========================
	# Text preprocessing (giống lúc train)
	# =========================

	def preprocess_text(text):

	if not text:
	return ""

	# Normalize unicode
	text = unicodedata.normalize("NFC", text)

	# lowercase
	text = text.lower()

	# remove links
	text = re.sub(r"http\S+\|www\S+\|https\S+", "", text, flags=re.MULTILINE)

	# remove mention + hashtag
	text = re.sub(r"@\w+\|#\w+", "", text)

	# remove special characters (giữ punctuation cơ bản)
	text = re.sub(r"[^\w\s,.!?]", "", text)

	# remove extra whitespace
	text = re.sub(r"\s+", " ", text).strip()

	return text


	# =========================
	# Load model
	# =========================

	MODEL_PATH = "./phobert_sentiment_model_final"

	tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
	model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
	model.eval()

	label_encoder = joblib.load("./label_encoder.pkl")


	# =========================
	# FastAPI
	# =========================

	app = FastAPI(
	title="PhoBERT Sentiment API",
	version="1.0"
	)


	# =========================
	# Health
	# =========================

	@app.get("/")
	def root():
	return {"message": "PhoBERT Sentiment API running"}

	@app.get("/health")
	def health():
	return {"status": "ok"}


	# =========================
	# Predict comments
	# =========================

	@app.post("/predict_comments")
	def predict_comments(data = Body(...)):

	# hỗ trợ cả list và dict (n8n hay gửi list)
	if isinstance(data, list):
	data = data[0]

	comments = data.get("info_comment", [])

	if not comments:
	return data

	# lấy comment gốc
	original_texts = [c.get("comment", "") for c in comments]

	# preprocessing giống lúc train
	clean_texts = [preprocess_text(t) for t in original_texts]

	# tokenize
	inputs = tokenizer(
	clean_texts,
	padding=True,
	truncation=True,
	max_length=128,
	return_tensors="pt"
	)

	# inference
	with torch.no_grad():
	outputs = model(**inputs)
	probs = torch.softmax(outputs.logits, dim=1)
	pred_ids = torch.argmax(probs, dim=1).tolist()

	labels = label_encoder.inverse_transform(pred_ids)

	# gắn label + confidence vào JSON
	for i, comment in enumerate(comments):
	comment["label"] = labels[i]
	comment["confidence"] = round(probs[i][pred_ids[i]].item(), 4)

	return data