Spaces: Runtime error

Commit: "Fix bug when TrainingArguments and Trainer"
Browse files — spanish_medica_llm.py (+6 −7)

spanish_medica_llm.py — CHANGED
@@ -16,10 +16,10 @@ from datasets import load_dataset, concatenate_datasets
 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
-    BitsAndBytesConfig,
+    BitsAndBytesConfig,
+    DataCollatorForLanguageModeling,
     TrainingArguments,
-    Trainer
-    DataCollatorForLanguageModeling
+    Trainer
 )

 from accelerate import FullyShardedDataParallelPlugin, Accelerator
@@ -399,7 +399,7 @@ def getTokenizedDataset(dataset, tokenizer):
         return dataset

     return dataset.map(
-        lambda element : tokenize(element, tokenizer)
+        lambda element : tokenize(element, tokenizer),
         batched = True,
         remove_columns = dataset["train"].column_names
     )
@@ -497,8 +497,7 @@ def configAndRunTraining(basemodel, dataset, eval_dataset, tokenizer):
     else:
         tokenizer.pad_token = tokenizer.eos_token
     data_collator_pretrain = DataCollatorForLanguageModeling(tokenizer, mlm = False)
-
-    training_args = transformers.TrainingArguments(
+    training_args = TrainingArguments(
         output_dir=output_dir,
         push_to_hub = True,
         hub_private_repo = False,
@@ -524,7 +523,7 @@ def configAndRunTraining(basemodel, dataset, eval_dataset, tokenizer):
         bf16=False
     )

-    trainer =
+    trainer = Trainer(
         model= basemodel,
         train_dataset = dataset['train'],
         eval_dataset = eval_dataset,