# Sentimen-Analysis / train_indobert.py
# (HuggingFace Hub residue preserved as comments: uploaded by noranisa,
#  commit "Update train_indobert.py", 496de3b verified)
"""
train_indobert.py β€” Fine-tuning IndoBERT untuk klasifikasi sentimen.
Jalankan secara lokal dengan GPU, BUKAN saat startup app.
"""
if __name__ == "__main__":
import pandas as pd
from datasets import Dataset
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
Trainer,
TrainingArguments,
EarlyStoppingCallback,
)
MODEL_NAME = "indobenchmark/indobert-base-p1"
OUTPUT_DIR = "model/final_model"
DATA_PATH = "data/train.csv" # kolom: text, label (0/1/2)
# ── LOAD DATA ──
df = pd.read_csv(DATA_PATH)
dataset = Dataset.from_pandas(df)
# ── TOKENIZER ──
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
def tokenize(batch):
return tokenizer(batch["text"], truncation=True, padding=True, max_length=128)
dataset = dataset.map(tokenize, batched=True)
dataset = dataset.rename_column("label", "labels")
dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
# ── SPLIT ──
split = dataset.train_test_split(test_size=0.1, seed=42)
train_d = split["train"]
eval_d = split["test"]
# ── MODEL ──
num_labels = df["label"].nunique()
model = AutoModelForSequenceClassification.from_pretrained(
MODEL_NAME, num_labels=num_labels
)
# ── TRAINING ARGS ──
args = TrainingArguments(
output_dir=OUTPUT_DIR,
per_device_train_batch_size=16,
per_device_eval_batch_size=16,
num_train_epochs=5,
evaluation_strategy="epoch",
save_strategy="epoch",
load_best_model_at_end=True,
metric_for_best_model="eval_loss",
logging_steps=20,
fp16=True, # matikan jika CPU-only
report_to="none",
)
trainer = Trainer(
model=model,
args=args,
train_dataset=train_d,
eval_dataset=eval_d,
callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)
trainer.train()
# ── SAVE ──
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"βœ… Model saved: {OUTPUT_DIR}")