# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "transformers>=4.45.0",
#     "datasets>=2.14.0",
#     "trl>=0.12.0",
#     "peft>=0.13.0",
#     "accelerate>=0.34.0",
#     "bitsandbytes>=0.44.0",
#     "trackio>=0.1.0",
#     "huggingface_hub>=0.25.0",
# ]
# ///
"""Supervised fine-tuning of SmolLM2-135M-Instruct with TRL's SFTTrainer.

Loads the ``train`` split of DATASET_ID, fine-tunes MODEL_ID on it (the model
is handed to the trainer directly; no ``peft_config`` is supplied), reports
metrics through trackio, and pushes the result to OUTPUT_REPO on the Hub.
"""

import os

import trackio
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTConfig, SFTTrainer

# Config
MODEL_ID = "HuggingFaceTB/SmolLM2-135M-Instruct"
DATASET_ID = "ssdavid/obsidian-bases-query-v2-compact"
OUTPUT_REPO = "ssdavid/obsidian-bases-slm-compact"

# Initialize tracking
trackio.init(project="obsidian-bases-slm-compact")

# Load dataset
print(f"Loading dataset: {DATASET_ID}")
dataset = load_dataset(DATASET_ID, split="train")
num_examples = len(dataset)
print(f"Dataset size: {num_examples}")

# Load model and tokenizer
print(f"Loading model: {MODEL_ID}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Training config
# The token is None when HF_TOKEN is not set in the environment.
hub_token = os.environ.get("HF_TOKEN")

training_args = SFTConfig(
    output_dir="./output",
    # Schedule / optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    # 8 samples x 2 accumulation steps = 16 samples per device per optimizer step
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    # Logging and checkpointing
    logging_steps=10,
    save_strategy="epoch",
    report_to=["trackio"],
    # Hub upload
    push_to_hub=True,
    hub_model_id=OUTPUT_REPO,
    hub_token=hub_token,
)

# Trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
)

# Train
print("Starting training...")
trainer.train()

# Push final model
print("Pushing to Hub...")
trainer.push_to_hub()
print(f"✓ Model pushed to {OUTPUT_REPO}")