| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | """ |
| | LoRA Fine-tuning Script: Add Tool Calling to Synthia-S1-27b |
| | |
| | This script fine-tunes Tesslate/Synthia-S1-27b with LoRA using the |
| | nvidia/Nemotron-Agentic-v1 tool_calling dataset. |
| | |
| | Usage: |
| | # With uv (recommended) |
| | uv run train_tool_calling.py |
| | |
| | # Or with pip |
| | pip install torch transformers datasets trl peft accelerate bitsandbytes trackio |
| | python train_tool_calling.py |
| | |
| | Hardware Requirements: |
| | - Minimum: 1x A100 80GB, or 2x A10G 24GB with USE_4BIT = True (27B bf16 weights alone need ~54 GB) |
| | - Recommended: 1x A100 80GB for fastest training |
| | """ |
| |
|
| | import os |
| | import json |
| | from datasets import load_dataset, Dataset |
| | from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, DataCollatorForLanguageModeling |
| | from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training |
| | from trl import SFTTrainer, SFTConfig |
| | import torch |
| | import trackio |
| | from huggingface_hub import hf_hub_download, HfApi, create_repo |
| |
|
| | |
| | |
| | |
| |
|
| | |
# --- Model -------------------------------------------------------------------
# Hub id of the base checkpoint that is loaded and adapted with LoRA.
BASE_MODEL = "Tesslate/Synthia-S1-27b"
# Local output directory name and, when logged in, the repo name under the user's namespace.
OUTPUT_MODEL = "Synthia-S1-27b-tool-calling"

# --- Dataset -----------------------------------------------------------------
DATASET_NAME = "nvidia/Nemotron-Agentic-v1"
# Name of the JSONL file (data/<split>.jsonl) downloaded from the dataset repo.
DATASET_SPLIT = "tool_calling"
# None keeps every example; an int caps the dataset to that many shuffled examples.
MAX_SAMPLES = None

# --- Training hyperparameters ------------------------------------------------
NUM_EPOCHS = 1
# Used both as the tokenizer truncation length and as SFTConfig.max_length.
MAX_SEQ_LENGTH = 4096
# Per-device batch size for both training and evaluation.
BATCH_SIZE = 1
GRADIENT_ACCUMULATION = 16
LEARNING_RATE = 2e-4
WARMUP_RATIO = 0.03

# --- LoRA --------------------------------------------------------------------
LORA_R = 64
LORA_ALPHA = 128
LORA_DROPOUT = 0.05

# --- Quantization ------------------------------------------------------------
# True loads the base model in 4-bit NF4 (QLoRA); False loads it in bfloat16.
USE_4BIT = False

# --- Tokenized dataset cache on the Hub --------------------------------------
TOKENIZED_DATASET_REPO = "Codyfederer/synthia-tool-calling-tokenized"
# Push the freshly tokenized train/test splits to TOKENIZED_DATASET_REPO.
SAVE_TOKENIZED = True
TOKENIZED_DATASET_PRIVATE = True
# Try to reuse TOKENIZED_DATASET_REPO before tokenizing from scratch.
LOAD_TOKENIZED_IF_EXISTS = True

# --- Model upload ------------------------------------------------------------
# main() switches this off at runtime when no Hub login is available.
PUSH_TO_HUB = True
HUB_PRIVATE = False
| |
|
| | |
| | |
| | |
| |
|
def tokenize_conversation(example, tokenizer, max_length):
    """
    Tokenize a conversation using the model's chat template.

    Args:
        example: Mapping with a "messages" key holding the list of
            {"role": ..., "content": ...} dicts to render.
        tokenizer: Tokenizer providing ``apply_chat_template`` and ``__call__``.
        max_length: Maximum number of tokens kept; longer sequences are truncated.

    Returns:
        The tokenizer output (input_ids, attention_mask, ...) with an added
        "labels" entry holding a copy of "input_ids" for causal LM training.
    """
    messages = example["messages"]

    # Render the conversation to plain text first; tokenization happens below.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False,
    )

    # Many chat templates already emit the BOS token. Tokenizing that text with
    # special tokens enabled would prepend a second BOS, so special tokens are
    # only added when the rendered text does not start with one.
    bos_token = getattr(tokenizer, "bos_token", None)
    template_has_bos = bool(bos_token) and text.startswith(bos_token)

    tokenized = tokenizer(
        text,
        add_special_tokens=not template_has_bos,
        truncation=True,
        max_length=max_length,
        padding=False,
        return_tensors=None,
    )

    # Causal LM objective: labels mirror the inputs (independent list, not an alias).
    tokenized["labels"] = list(tokenized["input_ids"])

    return tokenized
| |
|
| |
|
def _stringify_content(content):
    """Flatten a message ``content`` value (str, list, dict or None) into plain text."""
    if isinstance(content, list):
        parts = []
        for item in content:
            if isinstance(item, dict):
                # Parts that carry a "text" key contribute that text; other dicts are kept as JSON.
                parts.append(item["text"] if "text" in item else json.dumps(item))
            else:
                parts.append(str(item))
        return "\n".join(parts) if parts else ""
    if isinstance(content, dict):
        return json.dumps(content)
    if content is None:
        return ""
    return str(content)


def _prepare_messages(messages):
    """Normalize raw messages into a list of {"role", "content"} dicts for the chat template.

    Tool messages are rewritten as user turns prefixed with "[Tool Result]",
    consecutive turns with the same role are merged, and a placeholder user
    turn is inserted when the conversation does not start with a user message.
    """
    merged = []
    for msg in messages:
        role = msg.get("role", "user")
        content = _stringify_content(msg.get("content", ""))

        if role == "tool":
            role = "user"
            content = f"[Tool Result]\n{content}"

        if merged and merged[-1]["role"] == role:
            merged[-1]["content"] += f"\n\n{content}"
        else:
            merged.append({"role": role, "content": content})

    # NOTE(review): a leading "system" turn also gets the "[Start]" user turn
    # placed in front of it - confirm this is what the chat template expects.
    if merged and merged[0]["role"] != "user":
        merged.insert(0, {"role": "user", "content": "[Start]"})
    return merged


def _load_conversations(jsonl_path):
    """Read the JSONL file and return ``(examples, skipped)``.

    Each example is ``{"messages": [...]}``. Records that fail to parse or
    normalize are counted in ``skipped``; blank lines are ignored.
    """
    processed_examples = []
    skipped = 0

    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f):
            if line_num % 50000 == 0:
                print(f" Processed {line_num:,} lines...")

            line = line.strip()
            if not line:
                # A blank line is not a malformed record; do not count it as skipped.
                continue

            # Best effort per record: one bad line must not abort the whole load.
            try:
                example = json.loads(line)
                merged_messages = _prepare_messages(example.get("messages", []))
            except Exception as e:
                skipped += 1
                if skipped < 5:
                    print(f" Warning: Skipped line {line_num}: {e}")
                continue

            if merged_messages:
                processed_examples.append({"messages": merged_messages})

    return processed_examples, skipped


def _load_pretokenized_splits():
    """Try to load already-tokenized splits from the Hub.

    Returns ``(train, eval)``; ``eval`` may be None when the repo has no
    "test"/"validation" split. Returns ``(None, None)`` when the repo cannot
    be loaded or has no "input_ids" column.
    """
    print(f"\nChecking for pre-tokenized dataset: {TOKENIZED_DATASET_REPO}")
    try:
        tokenized_ds = load_dataset(TOKENIZED_DATASET_REPO)

        if "train" in tokenized_ds and "input_ids" in tokenized_ds["train"].column_names:
            print(" Found pre-tokenized dataset with input_ids!")
            train_dataset = tokenized_ds["train"]
            eval_dataset = tokenized_ds.get("test", tokenized_ds.get("validation"))
            print(f" Train samples: {len(train_dataset):,}")
            if eval_dataset is not None:
                print(f" Eval samples: {len(eval_dataset):,}")
            return train_dataset, eval_dataset

        print(" Dataset exists but is not tokenized (no input_ids column)")
        print(" Will re-tokenize and save...")
    except Exception as e:
        print(f" Could not load pre-tokenized dataset: {e}")
        print(" Will tokenize from scratch...")
    return None, None


def _push_tokenized_to_hub(train_dataset, eval_dataset):
    """Upload the tokenized splits to TOKENIZED_DATASET_REPO; failures are reported, not raised."""
    print(f"\nSaving TOKENIZED dataset to Hub: {TOKENIZED_DATASET_REPO}")
    try:
        api = HfApi()
        try:
            create_repo(
                TOKENIZED_DATASET_REPO,
                repo_type="dataset",
                private=TOKENIZED_DATASET_PRIVATE,
                exist_ok=True,
            )
            print(f" Created/verified repo (private={TOKENIZED_DATASET_PRIVATE})")

            # exist_ok=True does not change the visibility of an existing repo, so enforce it.
            if TOKENIZED_DATASET_PRIVATE:
                try:
                    api.update_repo_visibility(
                        TOKENIZED_DATASET_REPO,
                        repo_type="dataset",
                        private=True,
                    )
                    print(f" Ensured repo is private")
                except Exception as e:
                    # Still best effort, but no longer silent: a public repo would otherwise go unnoticed.
                    print(f" Could not enforce private visibility: {e}")
        except Exception as e:
            print(f" Repo creation note: {e}")

        train_dataset.reset_format()
        eval_dataset.reset_format()

        print(f" Verifying tokenized data...")
        print(f" Train columns: {train_dataset.column_names}")
        print(f" Sample input_ids type: {type(train_dataset[0]['input_ids'])}")
        print(f" Sample input_ids length: {len(train_dataset[0]['input_ids'])}")
        print(f" First 10 tokens: {train_dataset[0]['input_ids'][:10]}")

        print(f" Pushing train split ({len(train_dataset):,} examples)...")
        train_dataset.push_to_hub(
            TOKENIZED_DATASET_REPO,
            split="train",
        )
        print(f" Pushing test split ({len(eval_dataset):,} examples)...")
        eval_dataset.push_to_hub(
            TOKENIZED_DATASET_REPO,
            split="test",
        )
        print(f" SUCCESS! Saved TOKENIZED data to: https://huggingface.co/datasets/{TOKENIZED_DATASET_REPO}")
        print(f" Columns saved: {train_dataset.column_names}")
        print(f" Dataset is private: {TOKENIZED_DATASET_PRIVATE}")

        # Read one streamed sample back to confirm the upload contains token ids.
        print(f" Verifying upload...")
        try:
            verify_ds = load_dataset(TOKENIZED_DATASET_REPO, split="train", streaming=True)
            sample = next(iter(verify_ds))
            if "input_ids" in sample:
                print(f" VERIFIED: Dataset contains input_ids with {len(sample['input_ids'])} tokens")
            else:
                print(f" WARNING: Dataset uploaded but input_ids not found in columns: {list(sample.keys())}")
        except Exception as ve:
            print(f" Could not verify upload: {ve}")

    except Exception as e:
        print(f" ERROR saving to Hub: {e}")
        import traceback
        traceback.print_exc()
        print(" Continuing with training anyway...")


def _tokenize_from_source(tokenizer):
    """Download the raw JSONL split, build train/eval datasets and tokenize them.

    Returns ``(train_dataset, eval_dataset)``. When SAVE_TOKENIZED is set the
    tokenized splits are also pushed to the Hub.
    """
    print(f"\nLoading dataset: {DATASET_NAME} ({DATASET_SPLIT} split)...")

    jsonl_file = f"data/{DATASET_SPLIT}.jsonl"
    print(f"Downloading {jsonl_file}...")

    local_path = hf_hub_download(
        repo_id=DATASET_NAME,
        filename=jsonl_file,
        repo_type="dataset",
    )
    print(f"Downloaded to: {local_path}")

    print("Loading and processing JSONL file...")
    processed_examples, skipped = _load_conversations(local_path)
    print(f"Loaded {len(processed_examples):,} examples (skipped {skipped})")

    dataset = Dataset.from_list(processed_examples)
    print(f"Dataset size: {len(dataset):,} examples")

    if MAX_SAMPLES and len(dataset) > MAX_SAMPLES:
        dataset = dataset.shuffle(seed=42).select(range(MAX_SAMPLES))
        print(f"Limited to {MAX_SAMPLES:,} samples for training")

    # Hold out 2% of the conversations for evaluation.
    split_dataset = dataset.train_test_split(test_size=0.02, seed=42)
    train_dataset = split_dataset["train"]
    eval_dataset = split_dataset["test"]

    print(f"Train samples: {len(train_dataset):,}")
    print(f"Eval samples: {len(eval_dataset):,}")

    print(f"\nTokenizing dataset with max_length={MAX_SEQ_LENGTH}...")
    print("This may take a while for large datasets...")

    train_dataset = train_dataset.map(
        lambda x: tokenize_conversation(x, tokenizer, MAX_SEQ_LENGTH),
        remove_columns=["messages"],
        num_proc=4,
        desc="Tokenizing train",
    )

    eval_dataset = eval_dataset.map(
        lambda x: tokenize_conversation(x, tokenizer, MAX_SEQ_LENGTH),
        remove_columns=["messages"],
        num_proc=4,
        desc="Tokenizing eval",
    )

    print(f"Tokenization complete!")
    print(f"Train dataset columns: {train_dataset.column_names}")
    print(f"Sample input_ids length: {len(train_dataset[0]['input_ids'])}")

    if SAVE_TOKENIZED:
        _push_tokenized_to_hub(train_dataset, eval_dataset)

    return train_dataset, eval_dataset


def _load_lora_model():
    """Load BASE_MODEL (optionally 4-bit quantized) and wrap it with LoRA adapters."""
    print(f"\nLoading model: {BASE_MODEL}...")

    if USE_4BIT:
        print("Using 4-bit quantization (QLoRA)")
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        )
    else:
        bnb_config = None

    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        attn_implementation="sdpa",
    )

    if USE_4BIT:
        model = prepare_model_for_kbit_training(model)

    print(f"Model loaded. Parameters: {model.num_parameters():,}")

    print(f"\nConfiguring LoRA (r={LORA_R}, alpha={LORA_ALPHA})...")

    # Adapt the attention and MLP projection layers.
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ]

    lora_config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        target_modules=target_modules,
        bias="none",
        task_type="CAUSAL_LM",
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
    return model


def _build_training_args(hub_model_id, push_to_hub, has_eval):
    """Build the SFTConfig; periodic evaluation is only enabled when an eval split exists."""
    return SFTConfig(
        output_dir=f"./{OUTPUT_MODEL}",

        # Batching
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION,

        # Optimizer and schedule
        learning_rate=LEARNING_RATE,
        lr_scheduler_type="cosine",
        warmup_ratio=WARMUP_RATIO,
        weight_decay=0.01,
        optim="adamw_torch",

        # Memory
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": False},
        max_grad_norm=1.0,

        # Sequence handling
        max_length=MAX_SEQ_LENGTH,
        packing=False,

        # Evaluation: requesting step-wise eval without an eval dataset makes
        # the Trainer raise, so fall back to "no" in that case.
        eval_strategy="steps" if has_eval else "no",
        eval_steps=500,

        # Checkpointing
        save_strategy="steps",
        save_steps=500,
        save_total_limit=3,

        # Hub upload
        push_to_hub=push_to_hub,
        hub_model_id=hub_model_id if push_to_hub else None,
        hub_strategy="checkpoint",
        hub_private_repo=HUB_PRIVATE,

        # Logging
        logging_steps=10,
        report_to="trackio",
        run_name=f"lora-r{LORA_R}-lr{LEARNING_RATE}",

        # Precision and data loading
        bf16=True,
        dataloader_num_workers=4,
        dataloader_pin_memory=True,

        seed=42,
    )


def main():
    """Run the full pipeline: load/tokenize data, attach LoRA, train, save and optionally push.

    Reads all settings from the module-level constants. When no Hub login is
    available the module-level PUSH_TO_HUB flag is set to False and the model
    is only saved locally.
    """
    # Declared up front because the login fallback below rebinds the module-level flag.
    global PUSH_TO_HUB

    print("=" * 60)
    print("Tool Calling Fine-tuning for Synthia-S1-27b")
    print("=" * 60)

    trackio.init(project="synthia-tool-calling")

    # Resolve the target repo from the logged-in user; without a login, train locally only.
    from huggingface_hub import whoami
    try:
        username = whoami()["name"]
        hub_model_id = f"{username}/{OUTPUT_MODEL}"
        print(f"Will push to: {hub_model_id}")
    except Exception as e:
        print(f"Warning: Not logged in to HF Hub ({e})")
        print("Model will be saved locally only. Run 'huggingface-cli login' to enable Hub push.")
        hub_model_id = OUTPUT_MODEL
        PUSH_TO_HUB = False

    # ------------------------------------------------------------------
    # Tokenizer
    # ------------------------------------------------------------------
    print(f"\nLoading tokenizer from {BASE_MODEL}...")

    tokenizer = AutoTokenizer.from_pretrained(
        BASE_MODEL,
        trust_remote_code=True,
        padding_side="right",
    )

    # Fall back to EOS for padding when the tokenizer defines no pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    print(f"Vocab size: {len(tokenizer):,}")

    # ------------------------------------------------------------------
    # Data: reuse the tokenized cache when possible, otherwise build it
    # ------------------------------------------------------------------
    train_dataset = None
    eval_dataset = None

    if LOAD_TOKENIZED_IF_EXISTS:
        train_dataset, eval_dataset = _load_pretokenized_splits()

    if train_dataset is None:
        train_dataset, eval_dataset = _tokenize_from_source(tokenizer)

    # ------------------------------------------------------------------
    # Model + LoRA
    # ------------------------------------------------------------------
    model = _load_lora_model()

    # ------------------------------------------------------------------
    # Trainer
    # ------------------------------------------------------------------
    print("\nConfiguring training...")
    training_args = _build_training_args(
        hub_model_id,
        push_to_hub=PUSH_TO_HUB,
        has_eval=eval_dataset is not None,
    )

    print("\nInitializing trainer...")

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    is_pretokenized = "input_ids" in train_dataset.column_names
    print(f"Dataset is pre-tokenized: {is_pretokenized}")
    print(f"Dataset columns: {train_dataset.column_names}")

    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        processing_class=tokenizer,
        data_collator=data_collator,
    )

    # ------------------------------------------------------------------
    # Train
    # ------------------------------------------------------------------
    print("\n" + "=" * 60)
    print("Starting training...")
    print("=" * 60 + "\n")

    trainer.train()

    # ------------------------------------------------------------------
    # Save / upload
    # ------------------------------------------------------------------
    print("\nSaving final model...")
    trainer.save_model()

    if PUSH_TO_HUB:
        print(f"Pushing to Hub: {hub_model_id}")
        trainer.push_to_hub()
        print(f"\n✅ Model available at: https://huggingface.co/{hub_model_id}")
    else:
        print(f"Model saved locally to: ./{OUTPUT_MODEL}")

    print("\n" + "=" * 60)
    print("Training complete!")
    print("=" * 60)
|
| |
|
# Run the training pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
| |
|