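The training script: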
```python
import os

import trackio
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTConfig, SFTTrainer

# Initialize experiment tracking
trackio.init(project="obsidian-bases-slm-compact")

# Model, dataset, and output repository identifiers
MODEL_ID = "HuggingFaceTB/SmolLM2-135M-Instruct"
DATASET_ID = "ssdavid/obsidian-bases-query-v2-compact"
OUTPUT_REPO = "ssdavid/obsidian-bases-slm-compact"

# Load the training dataset
print(f"Loading dataset: {DATASET_ID}")
dataset = load_dataset(DATASET_ID, split="train")
print(f"Dataset size: {len(dataset)}")

# Load the base model and tokenizer
print(f"Loading model: {MODEL_ID}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

# Ensure a padding token is set for batched training
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Training configuration
training_args = SFTConfig(
    output_dir="./output",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    logging_steps=10,
    save_strategy="epoch",
    push_to_hub=True,
    hub_model_id=OUTPUT_REPO,
    hub_token=os.environ.get("HF_TOKEN"),
    report_to=["trackio"],
)

# Build the supervised fine-tuning trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
)

# Train
print("Starting training...")
trainer.train()

# Push the final model to the Hub
print("Pushing to Hub...")
trainer.push_to_hub()
print(f"✓ Model pushed to {OUTPUT_REPO}")
```
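Once training finishes and the weights are on the Hub, the model can be loaded back for a quick smoke test. A minimal sketch, assuming the repository name from the script above and using the standard `transformers` text-generation pipeline; the prompt is illustrative only, and the real input format depends on how the dataset's chat messages were structured during fine-tuning:

```python
from transformers import pipeline

# Load the fine-tuned model from the Hub (repo id from the training script)
generator = pipeline(
    "text-generation",
    model="ssdavid/obsidian-bases-slm-compact",
)

# Illustrative prompt, not taken from the dataset
messages = [
    {
        "role": "user",
        "content": "Create a Bases query for notes tagged #project modified in the last 7 days.",
    }
]

# The pipeline applies the model's chat template to the message list
result = generator(messages, max_new_tokens=256)

# For chat-format input, generated_text is the conversation including the new assistant turn
print(result[0]["generated_text"][-1]["content"])
```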