ajkndfjsdfasdf committed on
Commit
07d2e4c
·
verified ·
1 Parent(s): c1a453b

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +24 -14
train.py CHANGED
@@ -1,38 +1,48 @@
1
- from transformers import T5ForConditionalGeneration, ByT5Tokenizer, Trainer, TrainingArguments
2
  from datasets import load_dataset
3
  import os
4
  import wandb
5
  import torch
6
 
7
  # 🔧 Название модели и путь
8
- model_name = "google/byt5-small"
9
- run_id = "byt5-autobatch"
10
  output_dir = f"./{run_id}"
11
- start_batch_size = 300
12
- step_batch_size = 5
13
 
14
  # 📦 Загружаем модель и токенизатор
15
  model = T5ForConditionalGeneration.from_pretrained(model_name)
16
- tokenizer = ByT5Tokenizer.from_pretrained(model_name)
17
 
18
  # 📂 Загружаем датасет
19
  data_files = {
20
- "train": "mt5_training_data-1.jsonl",
21
- "validation": "mt5_validation_data-1.jsonl"
22
  }
23
  dataset = load_dataset("json", data_files=data_files)
24
 
25
  # 🔠 Токенизация
26
  def tokenize_function(examples):
27
- model_inputs = tokenizer(examples["text"], max_length=256, truncation=True, padding="max_length")
28
- labels = tokenizer(examples["target"], max_length=64, truncation=True, padding="max_length")
 
 
 
 
 
 
 
 
 
 
29
  model_inputs["labels"] = labels["input_ids"]
30
  return model_inputs
31
 
32
  tokenized_datasets = dataset.map(tokenize_function, batched=True)
33
 
34
  # 🔑 Авторизация W&B
35
- wandb.login(key="5f028bc0142fb7fa45bdacdde3c00dbbaf8bf98e")
36
 
37
  # 🚀 Функция автоподбора batch size
38
  def try_training_with_batch_size(batch_size_start):
@@ -44,11 +54,11 @@ def try_training_with_batch_size(batch_size_start):
44
  output_dir=output_dir,
45
  evaluation_strategy="steps",
46
  eval_steps=100,
47
- learning_rate=5e-5,
48
  per_device_train_batch_size=batch_size,
49
  per_device_eval_batch_size=batch_size,
50
- #fp16=True,
51
- num_train_epochs=3,
52
  logging_steps=100,
53
  warmup_ratio=0.06,
54
  logging_first_step=True,
 
1
+ from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
2
  from datasets import load_dataset
3
  import os
4
  import wandb
5
  import torch
6
 
7
  # 🔧 Название модели и путь
8
+ model_name = "google/flan-t5-large"
9
+ run_id = "flan-t5-large-ru-autobatch"
10
  output_dir = f"./{run_id}"
11
+ start_batch_size = 10 # ⚠️ Начинаем с небольшого batch, чтобы избежать OOM
12
+ step_batch_size = 1
13
 
14
  # 📦 Загружаем модель и токенизатор
15
  model = T5ForConditionalGeneration.from_pretrained(model_name)
16
+ tokenizer = T5Tokenizer.from_pretrained(model_name)
17
 
18
  # 📂 Загружаем датасет
19
  data_files = {
20
+ "train": "mt5_ru_gen_async.jsonl",
21
+ "validation": "mt5_ru_gen_eval.jsonl"
22
  }
23
  dataset = load_dataset("json", data_files=data_files)
24
 
25
  # 🔠 Токенизация
26
  def tokenize_function(examples):
27
+ model_inputs = tokenizer(
28
+ examples["text"], max_length=256, truncation=True, padding="max_length"
29
+ )
30
+ with tokenizer.as_target_tokenizer():
31
+ labels = tokenizer(
32
+ examples["target"], max_length=256, truncation=True, padding="max_length"
33
+ )
34
+ # Заменяем PAD-токены на -100, чтобы не учитывать их в подсчёте loss
35
+ labels["input_ids"] = [
36
+ [(token if token != tokenizer.pad_token_id else -100) for token in label]
37
+ for label in labels["input_ids"]
38
+ ]
39
  model_inputs["labels"] = labels["input_ids"]
40
  return model_inputs
41
 
42
  tokenized_datasets = dataset.map(tokenize_function, batched=True)
43
 
44
  # 🔑 Авторизация W&B
45
+ wandb.login(key="ВАШ_WANDB_КЛЮЧ_ЗДЕСЬ")
46
 
47
  # 🚀 Функция автоподбора batch size
48
  def try_training_with_batch_size(batch_size_start):
 
54
  output_dir=output_dir,
55
  evaluation_strategy="steps",
56
  eval_steps=100,
57
+ learning_rate=3e-5,
58
  per_device_train_batch_size=batch_size,
59
  per_device_eval_batch_size=batch_size,
60
+ #fp16=True, # Включайте при наличии подходящего GPU (A100 / V100 / T4)
61
+ num_train_epochs=10,
62
  logging_steps=100,
63
  warmup_ratio=0.06,
64
  logging_first_step=True,