Spaces:

ndyah2020
/

VietSent_Pipeline

Sleeping

App Files Files Community

VietSent_Pipeline / src /train_phobert_sentiment.py

ndyah2020

Upload 8 files

e6e7585 verified 5 months ago

raw

history blame contribute delete

3.76 kB

	import os
	from multiprocessing import freeze_support # <-- THÊM DÒNG NÀY
	from datasets import load_dataset, DatasetDict, concatenate_datasets
	from transformers import (
	AutoTokenizer,
	AutoModelForSequenceClassification,
	Trainer,
	TrainingArguments,
	)
	from underthesea import word_tokenize
	import evaluate

	# === CONSTANTS (kept at module level) ===
	# Hugging Face checkpoint to fine-tune and the directory that receives the result.
	MODEL_NAME = "vinai/phobert-base-v2"
	SAVE_DIR = "./phobert-base-v2-finetuned-sentiment"

	# Three-way sentiment label scheme and its inverse mapping.
	LABEL2ID = {"NEG": 0, "NEU": 1, "POS": 2}
	ID2LABEL = dict(zip(LABEL2ID.values(), LABEL2ID))
	NUM_LABELS = 3

	# Create the output directory up front; an existing directory is not an error.
	os.makedirs(SAVE_DIR, exist_ok=True)

	# === FUNCTIONS (kept at module level) ===
	def normalize_label(example):
	    """Collapse the fine-grained "Emotion" field into a 3-class sentiment id.

	    "Enjoyment" maps to POS; "Anger", "Disgust", "Sadness" and "Fear" map to
	    NEG; every other value (including "Surprise" and "Other") maps to NEU.

	    Args:
	        example: A single dataset row (mapping) that has an "Emotion" key.

	    Returns:
	        The same ``example`` object, with an integer "label" key added.
	    """
	    negative_emotions = ("Anger", "Disgust", "Sadness", "Fear")
	    emotion = example["Emotion"]

	    # Start from the neutral fallback and override for the known polar emotions.
	    sentiment = "NEU"
	    if emotion == "Enjoyment":
	        sentiment = "POS"
	    elif emotion in negative_emotions:
	        sentiment = "NEG"

	    example["label"] = LABEL2ID[sentiment]
	    return example


	def preprocess_function(examples):
	    """Word-segment and tokenize a batch of sentences for PhoBERT.

	    Args:
	        examples: A batch (mapping of column name -> list) that provides the
	            "Sentence" and "label" columns.

	    Returns:
	        The tokenizer output for the batch (truncated / padded to 128 tokens)
	        with the batch labels attached under the "labels" key.
	    """
	    # format="text" makes underthesea join the syllables of each multi-syllable
	    # word with underscores (e.g. "Quảng_Trị"), which is the word-segmented form
	    # PhoBERT expects. Joining the default list output with spaces would throw
	    # the word boundaries away again and feed effectively unsegmented text.
	    # NOTE: inference-time preprocessing must apply the same segmentation.
	    segmented = [
	        word_tokenize(sentence, format="text") for sentence in examples["Sentence"]
	    ]
	    # Uses the module-level `tokenizer`.
	    encoding = tokenizer(
	        segmented, truncation=True, padding="max_length", max_length=128
	    )
	    encoding["labels"] = examples["label"]
	    return encoding

	def compute_metrics(eval_pred):
	    """Score a batch of predictions with the module-level ``metric``.

	    Args:
	        eval_pred: A pair ``(logits, labels)``; the predicted class is the
	            argmax of the logits over the last axis.

	    Returns:
	        Whatever ``metric.compute`` returns for these predictions/references.
	    """
	    scores, references = eval_pred
	    return metric.compute(
	        references=references,
	        predictions=scores.argmax(axis=-1),
	    )

	# Module-level objects: `tokenizer` is read as a global by preprocess_function
	# and `metric` by compute_metrics, so both must exist before those run.
	# NOTE(review): presumably kept outside the __main__ guard so that worker
	# processes that re-import this module also get them — confirm.
	# use_fast=False requests the slow (pure-Python) tokenizer implementation.
	tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
	# Accuracy metric loaded through the `evaluate` library.
	metric = evaluate.load("accuracy")


	if __name__ == "__main__":
	    # Script entry point: load data, preprocess, fine-tune, evaluate, save.
	    # freeze_support() only matters for frozen Windows executables that use
	    # multiprocessing; it is a no-op otherwise.
	    freeze_support()

	    # --- Load the train and test splits ---
	    print("🔹 Tải dữ liệu...")
	    dataset = DatasetDict(
	        {
	            split_name: load_dataset("tridm/UIT-VSMEC", split=split_name)
	            for split_name in ("train", "test")
	        }
	    )

	    # --- Preprocess ---
	    print("🔹 Bắt đầu tiền xử lý...")
	    # Number of worker processes for dataset.map; per the original comment this
	    # was tuned for an i7-13620H CPU.
	    worker_count = 8

	    print(" > Ánh xạ nhãn...")
	    dataset = dataset.map(normalize_label, num_proc=worker_count)

	    print(" > Tokenize văn bản...")
	    dataset = dataset.map(preprocess_function, batched=True, num_proc=worker_count)

	    print(" > Xóa cột không cần thiết...")
	    dataset = dataset.remove_columns(["Sentence", "Emotion"])

	    # --- Load the model with a 3-class classification head ---
	    print("🔹 Tải mô hình...")
	    model = AutoModelForSequenceClassification.from_pretrained(
	        MODEL_NAME, num_labels=NUM_LABELS, id2label=ID2LABEL, label2id=LABEL2ID
	    )

	    # --- Training configuration ---
	    # Evaluation and checkpointing both happen once per epoch; the checkpoint
	    # with the best "accuracy" is reloaded when training ends.
	    training_config = dict(
	        output_dir=SAVE_DIR,
	        eval_strategy="epoch",
	        save_strategy="epoch",
	        learning_rate=2e-5,
	        per_device_train_batch_size=32,
	        per_device_eval_batch_size=32,
	        num_train_epochs=4,
	        weight_decay=0.01,
	        load_best_model_at_end=True,
	        logging_dir="./logs",
	        logging_strategy="steps",
	        logging_steps=100,
	        save_total_limit=2,
	        metric_for_best_model="accuracy",
	        dataloader_num_workers=4,  # per the original comment: tuned for an i7
	    )
	    training_args = TrainingArguments(**training_config)

	    # --- Train ---
	    # The test split is used as the evaluation set during training.
	    trainer = Trainer(
	        model=model,
	        args=training_args,
	        train_dataset=dataset["train"],
	        eval_dataset=dataset["test"],
	        tokenizer=tokenizer,
	        compute_metrics=compute_metrics,
	    )

	    print("🚀 Bắt đầu huấn luyện...")
	    trainer.train()

	    print("🏁 Huấn luyện hoàn tất. Đang đánh giá...")
	    trainer.evaluate()

	    # --- Save the fine-tuned model and its tokenizer ---
	    trainer.save_model(SAVE_DIR)
	    tokenizer.save_pretrained(SAVE_DIR)

	    print(f"✅ Mô hình đã lưu tại: {SAVE_DIR}")