Fix some bugs
Browse files
- src/run.sh (+3 −3)
- src/run_clm_flax.py (+2 −0)
src/run.sh
CHANGED
|
@@ -9,9 +9,9 @@ export OUTPUT_DIR=/home/m3hrdadfi/code/gpt2-medium-persian
|
|
| 9 |
# export CONFIG_NAME=/home/m3hrdadfi/code/gpt2-medium-persian
|
| 10 |
# export TOKENIZER_NAME=/home/m3hrdadfi/code/gpt2-medium-persian
|
| 11 |
|
| 12 |
-
export TRAIN_FILE=/home/m3hrdadfi/data/train.csv
|
| 13 |
-
export VALIDATION_FILE=/home/m3hrdadfi/data/test.csv
|
| 14 |
-
export TEST_FILE=/home/m3hrdadfi/code/data/test.csv
|
| 15 |
# export DATASET_NAME=oscar
|
| 16 |
# export DATASET_CONFIG_NAME=unshuffled_deduplicated_fa
|
| 17 |
export MAX_SEQUENCE_LENGTH=512
|
|
|
|
| 9 |
# export CONFIG_NAME=/home/m3hrdadfi/code/gpt2-medium-persian
|
| 10 |
# export TOKENIZER_NAME=/home/m3hrdadfi/code/gpt2-medium-persian
|
| 11 |
|
| 12 |
+
export TRAIN_FILE=/home/m3hrdadfi/data/train-fixed.csv
|
| 13 |
+
export VALIDATION_FILE=/home/m3hrdadfi/data/test-fixed.csv
|
| 14 |
+
export TEST_FILE=/home/m3hrdadfi/code/data/test-fixed.csv
|
| 15 |
# export DATASET_NAME=oscar
|
| 16 |
# export DATASET_CONFIG_NAME=unshuffled_deduplicated_fa
|
| 17 |
export MAX_SEQUENCE_LENGTH=512
|
src/run_clm_flax.py
CHANGED
|
@@ -368,6 +368,7 @@ def main():
|
|
| 368 |
# dataset = dataset.map(normalizer)
|
| 369 |
# logger.info(f"Preprocessed dataset kept {len(dataset)} out of {len(raw_dataset)}")
|
| 370 |
dataset = raw_dataset
|
|
|
|
| 371 |
|
| 372 |
# Load pretrained model and tokenizer
|
| 373 |
|
|
@@ -421,6 +422,7 @@ def main():
|
|
| 421 |
else:
|
| 422 |
column_names = dataset["validation"].column_names
|
| 423 |
text_column_name = "text" if "text" in column_names else column_names[0]
|
|
|
|
| 424 |
|
| 425 |
# since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
|
| 426 |
tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
|
|
|
|
| 368 |
# dataset = dataset.map(normalizer)
|
| 369 |
# logger.info(f"Preprocessed dataset kept {len(dataset)} out of {len(raw_dataset)}")
|
| 370 |
dataset = raw_dataset
|
| 371 |
+
logger.info(f"dataset: {dataset}")
|
| 372 |
|
| 373 |
# Load pretrained model and tokenizer
|
| 374 |
|
|
|
|
| 422 |
else:
|
| 423 |
column_names = dataset["validation"].column_names
|
| 424 |
text_column_name = "text" if "text" in column_names else column_names[0]
|
| 425 |
+
logger.info(f"text_column_name: {text_column_name}")
|
| 426 |
|
| 427 |
# since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
|
| 428 |
tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
|