RayMelius Claude Sonnet 4.6 committed on
Commit
ebf88a6
·
1 Parent(s): e1870c9

Fix SFTConfig: move max_seq_length to tokenizer.model_max_length

Browse files

Newer TRL removed max_seq_length from SFTConfig.__init__ (the argument was renamed to max_length).
Set tokenizer.model_max_length = MAX_SEQ_LEN instead.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. notebooks/ch_trader_finetune.ipynb +2 -48
notebooks/ch_trader_finetune.ipynb CHANGED
@@ -285,13 +285,7 @@
285
  "id": "load-tokenizer",
286
  "metadata": {},
287
  "outputs": [],
288
- "source": [
289
- "print(f\"Loading tokenizer: {BASE_MODEL}\")\n",
290
- "tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)\n",
291
- "tokenizer.pad_token = tokenizer.eos_token\n",
292
- "tokenizer.padding_side = \"right\"\n",
293
- "print(\"Tokenizer loaded\")"
294
- ]
295
  },
296
  {
297
  "cell_type": "code",
@@ -404,47 +398,7 @@
404
  "id": "train",
405
  "metadata": {},
406
  "outputs": [],
407
- "source": [
408
- "sft_config = SFTConfig(\n",
409
- " output_dir=OUTPUT_DIR,\n",
410
- " num_train_epochs=NUM_EPOCHS,\n",
411
- " per_device_train_batch_size=BATCH_SIZE,\n",
412
- " per_device_eval_batch_size=BATCH_SIZE,\n",
413
- " gradient_accumulation_steps=GRAD_ACCUM,\n",
414
- " gradient_checkpointing=True,\n",
415
- " optim=\"paged_adamw_32bit\",\n",
416
- " learning_rate=LR,\n",
417
- " lr_scheduler_type=\"cosine\",\n",
418
- " warmup_ratio=0.05,\n",
419
- " max_seq_length=MAX_SEQ_LEN,\n",
420
- " fp16=not torch.cuda.is_bf16_supported(),\n",
421
- " bf16=torch.cuda.is_bf16_supported(),\n",
422
- " logging_steps=25,\n",
423
- " eval_strategy=\"steps\",\n",
424
- " eval_steps=100,\n",
425
- " save_strategy=\"steps\",\n",
426
- " save_steps=100,\n",
427
- " load_best_model_at_end=True,\n",
428
- " metric_for_best_model=\"eval_loss\",\n",
429
- " greater_is_better=False,\n",
430
- " report_to=\"none\",\n",
431
- " dataset_text_field=\"text\",\n",
432
- " packing=False,\n",
433
- ")\n",
434
- "\n",
435
- "trainer = SFTTrainer(\n",
436
- " model=model,\n",
437
- " args=sft_config,\n",
438
- " train_dataset=train_dataset,\n",
439
- " eval_dataset=val_dataset,\n",
440
- " peft_config=lora_config,\n",
441
- " processing_class=tokenizer,\n",
442
- ")\n",
443
- "\n",
444
- "print(\"Starting training...\")\n",
445
- "trainer.train()\n",
446
- "print(\"Training complete.\")"
447
- ]
448
  },
449
  {
450
  "cell_type": "markdown",
 
285
  "id": "load-tokenizer",
286
  "metadata": {},
287
  "outputs": [],
288
+ "source": "print(f\"Loading tokenizer: {BASE_MODEL}\")\ntokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)\ntokenizer.pad_token = tokenizer.eos_token\ntokenizer.padding_side = \"right\"\ntokenizer.model_max_length = MAX_SEQ_LEN # replaces max_seq_length in SFTConfig\nprint(\"Tokenizer loaded\")"
 
 
 
 
 
 
289
  },
290
  {
291
  "cell_type": "code",
 
398
  "id": "train",
399
  "metadata": {},
400
  "outputs": [],
401
+ "source": "sft_config = SFTConfig(\n output_dir=OUTPUT_DIR,\n num_train_epochs=NUM_EPOCHS,\n per_device_train_batch_size=BATCH_SIZE,\n per_device_eval_batch_size=BATCH_SIZE,\n gradient_accumulation_steps=GRAD_ACCUM,\n gradient_checkpointing=True,\n optim=\"paged_adamw_32bit\",\n learning_rate=LR,\n lr_scheduler_type=\"cosine\",\n warmup_ratio=0.05,\n fp16=not torch.cuda.is_bf16_supported(),\n bf16=torch.cuda.is_bf16_supported(),\n logging_steps=25,\n eval_strategy=\"steps\",\n eval_steps=100,\n save_strategy=\"steps\",\n save_steps=100,\n load_best_model_at_end=True,\n metric_for_best_model=\"eval_loss\",\n greater_is_better=False,\n report_to=\"none\",\n dataset_text_field=\"text\",\n packing=False,\n)\n\ntrainer = SFTTrainer(\n model=model,\n args=sft_config,\n train_dataset=train_dataset,\n eval_dataset=val_dataset,\n peft_config=lora_config,\n processing_class=tokenizer,\n)\n\nprint(\"Starting training...\")\ntrainer.train()\nprint(\"Training complete.\")"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
  },
403
  {
404
  "cell_type": "markdown",