flpelerin committed on
Commit
29a6938
·
1 Parent(s): 073d8c3
Files changed (1) hide show
  1. train.py +6 -1
train.py CHANGED
@@ -40,7 +40,12 @@ print(f"Tokenzier has {vocab_size} unique tokens")
40
  dataset = load_dataset(dataset_path)
41
 
42
  def process_function(examples):
43
- return tokenizer(examples['text'], padding='longest', truncation=True)
 
 
 
 
 
44
 
45
  tokenized_datasets = dataset.map(process_function, batched=True)
46
  print(f"Dataset has {tokenized_datasets['train'].num_rows} rows of {batch_size} times {seq_length} tokens")
 
40
  dataset = load_dataset(dataset_path)
41
 
42
  def process_function(examples):
43
+ return tokenizer(
44
+ examples['text'],
45
+ padding='max_length', # Fixed padding
46
+ truncation=True,
47
+ max_length=seq_length # Fixed max length
48
+ )
49
 
50
  tokenized_datasets = dataset.map(process_function, batched=True)
51
  print(f"Dataset has {tokenized_datasets['train'].num_rows} rows of {batch_size} times {seq_length} tokens")