""" train_indobert.py — Fine-tuning IndoBERT untuk klasifikasi sentimen. Jalankan secara lokal dengan GPU, BUKAN saat startup app. """ if __name__ == "__main__": import pandas as pd from datasets import Dataset from transformers import ( AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, ) MODEL_NAME = "indobenchmark/indobert-base-p1" OUTPUT_DIR = "model/final_model" DATA_PATH = "data/train.csv" # kolom: text, label (0/1/2) # ── LOAD DATA ── df = pd.read_csv(DATA_PATH) dataset = Dataset.from_pandas(df) # ── TOKENIZER ── tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) def tokenize(batch): return tokenizer(batch["text"], truncation=True, padding=True, max_length=128) dataset = dataset.map(tokenize, batched=True) dataset = dataset.rename_column("label", "labels") dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"]) # ── SPLIT ── split = dataset.train_test_split(test_size=0.1, seed=42) train_d = split["train"] eval_d = split["test"] # ── MODEL ── num_labels = df["label"].nunique() model = AutoModelForSequenceClassification.from_pretrained( MODEL_NAME, num_labels=num_labels ) # ── TRAINING ARGS ── args = TrainingArguments( output_dir=OUTPUT_DIR, per_device_train_batch_size=16, per_device_eval_batch_size=16, num_train_epochs=5, evaluation_strategy="epoch", save_strategy="epoch", load_best_model_at_end=True, metric_for_best_model="eval_loss", logging_steps=20, fp16=True, # matikan jika CPU-only report_to="none", ) trainer = Trainer( model=model, args=args, train_dataset=train_d, eval_dataset=eval_d, callbacks=[EarlyStoppingCallback(early_stopping_patience=2)], ) trainer.train() # ── SAVE ── model.save_pretrained(OUTPUT_DIR) tokenizer.save_pretrained(OUTPUT_DIR) print(f"✅ Model saved: {OUTPUT_DIR}")