flpelerin committed on
Commit ·
29a6938
1
Parent(s): 073d8c3
fix
Browse files
train.py
CHANGED
|
@@ -40,7 +40,12 @@ print(f"Tokenzier has {vocab_size} unique tokens")
|
|
| 40 |
dataset = load_dataset(dataset_path)
|
| 41 |
|
| 42 |
def process_function(examples):
|
| 43 |
-
return tokenizer(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
tokenized_datasets = dataset.map(process_function, batched=True)
|
| 46 |
print(f"Dataset has {tokenized_datasets['train'].num_rows} rows of {batch_size} times {seq_length} tokens")
|
|
|
|
| 40 |
dataset = load_dataset(dataset_path)
|
| 41 |
|
| 42 |
def process_function(examples):
|
| 43 |
+
return tokenizer(
|
| 44 |
+
examples['text'],
|
| 45 |
+
padding='max_length', # Fixed padding
|
| 46 |
+
truncation=True,
|
| 47 |
+
max_length=seq_length # Fixed max length
|
| 48 |
+
)
|
| 49 |
|
| 50 |
tokenized_datasets = dataset.map(process_function, batched=True)
|
| 51 |
print(f"Dataset has {tokenized_datasets['train'].num_rows} rows of {batch_size} times {seq_length} tokens")
|