# unesco-training-scripts / train_unesco_tagger.py
# Uploaded by unesco-data-ai: "Upload train_unesco_tagger.py with huggingface_hub"
# Commit 8797ed2 (verified)
# /// script
# dependencies = [
# "trl>=0.12.0",
# "transformers>=4.36.0",
# "accelerate>=0.24.0",
# "trackio",
# ]
# ///
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
# Hub identifiers are named once so the training configuration and the final
# status message cannot drift apart.
DATASET_ID = "unesco-data-ai/unesco-thesaurus-sft"
BASE_MODEL_ID = "LiquidAI/LFM2.5-1.2B-Instruct"
HUB_MODEL_ID = "unesco-data-ai/lfm2.5-1.2b-unesco-tagger-v1"


def main() -> None:
    """Fine-tune the base model on the SFT dataset and push it to the Hub.

    Steps, in order:
      1. Load the ``train`` and ``validation`` splits of ``DATASET_ID``.
      2. Build the ``SFTConfig`` (optimisation, checkpointing, evaluation,
         logging and Hub upload settings).
      3. Create an ``SFTTrainer`` for ``BASE_MODEL_ID`` and run training.
      4. Push the final model to ``HUB_MODEL_ID``.

    Progress is reported with ``print`` so it shows up in plain job logs.
    """
    print("Loading dataset...")
    dataset = load_dataset(DATASET_ID)
    train_dataset = dataset["train"]
    eval_dataset = dataset["validation"]
    print(f"Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")

    config = SFTConfig(
        # --- Output / Hub upload ---
        output_dir="lfm2.5-1.2b-unesco-tagger",
        push_to_hub=True,
        hub_model_id=HUB_MODEL_ID,
        hub_strategy="every_save",
        # --- Optimisation ---
        num_train_epochs=3,
        # 4 samples x 4 accumulation steps = 16 samples per optimizer step
        # on each device.
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        learning_rate=2e-5,
        max_length=1024,
        # --- Logging / checkpointing / evaluation ---
        logging_steps=10,
        # Checkpoints and evaluations share the same 200-step interval.
        save_strategy="steps",
        save_steps=200,
        save_total_limit=2,
        eval_strategy="steps",
        eval_steps=200,
        # --- Schedule / precision ---
        warmup_ratio=0.1,
        lr_scheduler_type="cosine",
        bf16=True,  # NOTE(review): presumably needs bfloat16-capable hardware
        # --- Experiment tracking ---
        report_to="trackio",
        project="unesco-keyword-extraction",
        run_name="lfm2.5-1.2b-sft-v1",
    )

    print("Initializing trainer...")
    trainer = SFTTrainer(
        model=BASE_MODEL_ID,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        args=config,
    )

    print("Starting training...")
    trainer.train()

    # Explicit final push, in addition to the uploads configured through
    # hub_strategy above.
    print("Pushing to Hub...")
    trainer.push_to_hub()
    print(f"Complete! Model at: https://huggingface.co/{HUB_MODEL_ID}")


if __name__ == "__main__":
    main()