flpelerin committed on
Commit ·
29a6938
1
Parent(s): 073d8c3
fix
Browse files
train.py
CHANGED
|
@@ -40,7 +40,12 @@ print(f"Tokenzier has {vocab_size} unique tokens")
|
|
| 40 |
dataset = load_dataset(dataset_path)
|
| 41 |
|
| 42 |
def process_function(examples):
|
| 43 |
-
return tokenizer(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
tokenized_datasets = dataset.map(process_function, batched=True)
|
| 46 |
print(f"Dataset has {tokenized_datasets['train'].num_rows} rows of {batch_size} times {seq_length} tokens")
|
|
|
|
| 40 |
dataset = load_dataset(dataset_path)
|
| 41 |
|
| 42 |
def process_function(examples):
|
| 43 |
+
return tokenizer(
|
| 44 |
+
examples['text'],
|
| 45 |
+
padding='max_length', # Fixed padding
|
| 46 |
+
truncation=True,
|
| 47 |
+
max_length=seq_length # Fixed max length
|
| 48 |
+
)
|
| 49 |
|
| 50 |
tokenized_datasets = dataset.map(process_function, batched=True)
|
| 51 |
print(f"Dataset has {tokenized_datasets['train'].num_rows} rows of {batch_size} times {seq_length} tokens")
|