# NOTE(review): the lines "Spaces: / Sleeping / Sleeping" were Hugging Face
# Spaces page chrome captured during extraction, not source code; kept here
# only as a comment so the file remains valid Python.
"""
train_indobert.py — Fine-tuning IndoBERT for sentiment classification.

Run locally with a GPU, NOT at app startup: the heavy third-party imports
are kept inside the __main__ guard below so that merely importing this
module (e.g. from a serving app) stays cheap.
"""

if __name__ == "__main__":
    # Deferred imports: transformers/datasets are only loaded when the
    # script is executed directly, never on plain import.
    import pandas as pd
    from datasets import Dataset
    from transformers import (
        AutoTokenizer,
        AutoModelForSequenceClassification,
        Trainer,
        TrainingArguments,
        EarlyStoppingCallback,
    )

    MODEL_NAME = "indobenchmark/indobert-base-p1"
    OUTPUT_DIR = "model/final_model"
    DATA_PATH = "data/train.csv"  # columns: text, label (0/1/2)

    # ── LOAD DATA ──
    df = pd.read_csv(DATA_PATH)
    dataset = Dataset.from_pandas(df)

    # ── TOKENIZER ──
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    def tokenize(batch):
        # Pads each mapped batch to its longest member, truncated at 128 tokens.
        return tokenizer(batch["text"], truncation=True, padding=True, max_length=128)

    dataset = dataset.map(tokenize, batched=True)
    # Trainer expects the target column to be named "labels".
    dataset = dataset.rename_column("label", "labels")
    dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

    # ── SPLIT ──
    # Fixed seed so the 90/10 train/eval split is reproducible across runs.
    split = dataset.train_test_split(test_size=0.1, seed=42)
    train_d = split["train"]
    eval_d = split["test"]

    # ── MODEL ──
    # Infer the number of classes from the data rather than hard-coding it.
    num_labels = df["label"].nunique()
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=num_labels
    )

    # ── TRAINING ARGS ──
    args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        evaluation_strategy="epoch",
        save_strategy="epoch",   # must match evaluation_strategy for best-model tracking
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        logging_steps=20,
        fp16=True,  # disable if CPU-only
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_d,
        eval_dataset=eval_d,
        # Stop after 2 consecutive epochs without eval_loss improvement;
        # requires load_best_model_at_end=True + metric_for_best_model above.
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )
    trainer.train()

    # ── SAVE ──
    # load_best_model_at_end=True means `model` now holds the best checkpoint,
    # so saving it here exports the best epoch, not necessarily the last one.
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    print(f"✅ Model saved: {OUTPUT_DIR}")