| import functools |
| import seqio |
| import tensorflow as tf |
| import t5.data |
| from datasets import load_dataset |
| from t5.data import postprocessors |
| from t5.data import preprocessors |
| from t5.evaluation import metrics |
| from seqio import FunctionDataSource, utils |
|
|
| from ul2_objective import ul2_objective |
| |
# UL2 mixture-of-denoisers hyperparameters (R/X/S denoiser setup).
# R-denoiser: regular span corruption; X-denoiser: "extreme" corruption.
# Position i of each SPAN_LENGTHS list pairs with position i of the
# matching CORRUPT_RATES list to define one denoiser configuration.
R_DENOISER_SPAN_LENGTHS = [3.0, 8.0]
X_DENOISER_SPAN_LENGTHS = [3.0, 8.0, 64.0, 64.0]
R_DENOISER_CORRUPT_RATES = [0.15, 0.15]
X_DENOISER_CORRUPT_RATES = [0.5, 0.5, 0.15, 0.5]


# Mode prefix tokens prepended to examples so the model can distinguish
# which denoising objective generated them (S = sequential / prefix-LM).
R_DENOISER_TOKEN_PREFIX = '[NLU]'
X_DENOISER_TOKEN_PREFIX = '[NLG]'
S_DENOISER_TOKEN_PREFIX = '[S2S]'
|
|
|
|
TaskRegistry = seqio.TaskRegistry
# SentencePiece vocabularies loaded from GCS.
# scand: 32k unigram model over no/da/en/sv/nn/is with 100 sentinel ids added.
# eng / mt5: standard T5 and mT5 vocabs; the "100extra" model files presumably
# already contain the sentinel tokens, hence extra_ids=0 — TODO confirm.
scand_vocabulary=seqio.SentencePieceVocabulary('gs://nb-t5/t5/vocabs/wikipedia/no-da-en-sv-nn-is_32000_unigram.sp.model', extra_ids=100)
eng_vocabulary=seqio.SentencePieceVocabulary('gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model', extra_ids=0)
mt5_vocabulary=seqio.SentencePieceVocabulary('gs://t5-data/vocabs/mc4.250000.100extra/sentencepiece.model', extra_ids=0)
|
|
# seqio output-feature specs, one dict per vocabulary. "inputs" is marked
# required=False because the pretraining preprocessors generate it from the
# raw "targets" text.
SCAND_OUTPUT_FEATURES = {
    "inputs": seqio.Feature(
        vocabulary=scand_vocabulary, add_eos=True,
        required=False),
    "targets": seqio.Feature(
        vocabulary=scand_vocabulary, add_eos=True)
}


ENG_OUTPUT_FEATURES = {
    "inputs": seqio.Feature(
        vocabulary=eng_vocabulary, add_eos=True,
        required=False),
    "targets": seqio.Feature(
        vocabulary=eng_vocabulary, add_eos=True)
}
MT5_OUTPUT_FEATURES = {
    "inputs": seqio.Feature(
        vocabulary=mt5_vocabulary, add_eos=True,
        required=False),
    "targets": seqio.Feature(
        vocabulary=mt5_vocabulary, add_eos=True)
}
|
|
|
|
def gen_dataset(split, shuffle=False, seed=None, column="text", dataset_params=None):
    """Yield raw text examples from a HuggingFace dataset, repeating forever.

    Args:
      split: dataset split to iterate (e.g. "train", "validation").
      shuffle: whether to shuffle the dataset before iterating.
      seed: optional shuffle seed. Fixed: the original ``if seed:`` check
        silently ignored ``seed=0``; ``shuffle(seed=None)`` is equivalent to
        ``shuffle()``, so a single call handles both cases correctly.
      column: name of the column whose value is yielded.
      dataset_params: kwargs forwarded verbatim to ``datasets.load_dataset``.

    Yields:
      Strings from ``column``, cycling over the split endlessly so a
      training job never exhausts the source.
    """
    dataset = load_dataset(**dataset_params)
    if shuffle:
        dataset = dataset.shuffle(seed=seed)
    # Infinite repeat: restart the split each time it is exhausted.
    while True:
        for item in dataset[str(split)]:
            yield item[column]
|
|
|
|
def dataset_fn(split, shuffle_files, seed=None, dataset_params=None):
    # Wrap the HF streaming generator into a tf.data pipeline of scalar
    # string tensors. `shuffle_files` is forwarded as gen_dataset's
    # `shuffle` argument.
    # NOTE(review): `dataset_name` here is a module-level global assigned
    # further down in this file; at call time it holds whichever assignment
    # ran last. All assignments currently use the same value, but this is
    # fragile — confirm before reusing this module with multiple datasets.
    return tf.data.Dataset.from_generator(
        functools.partial(gen_dataset, split, shuffle_files, seed, dataset_params=dataset_params),
        output_signature=tf.TensorSpec(shape=(), dtype=tf.string, name=dataset_name)
    )
|
|
|
|
|
|
@utils.map_over_dataset
def target_to_key(x, key_map, target_key):
    """Copy key_map and store the dataset element x under target_key."""
    result = dict(key_map)
    result[target_key] = x
    return result
|
|
|
|
|
|
| |
# Task: T5-style span corruption over the streamed Scandinavian corpus,
# tokenized with the English (cc_all) vocabulary.
dataset_name = 'NbAiLab/scandinavian'
dataset_params = {"path": dataset_name, "use_auth_token": True, "streaming": True}
dataset_shapes = None  # example counts unknown for a streaming dataset
TaskRegistry.add(
    "scandinavian_span_engvoc",
    source=seqio.FunctionDataSource(
        dataset_fn=functools.partial(dataset_fn, dataset_params=dataset_params),
        splits=("train", "validation"),
        caching_permitted=False,
        num_input_examples=dataset_shapes,
    ),
    preprocessors=[
        # Wrap each raw string into {"inputs": None, "targets": text}.
        functools.partial(
            target_to_key, key_map={
                "inputs": None,
                "targets": None,
            }, target_key="targets"),
        seqio.preprocessors.tokenize,

        preprocessors.span_corruption,
        seqio.preprocessors.append_eos_after_trim,
    ],
    # NOTE(review): only "targets" is declared even though span_corruption
    # also emits "inputs" — confirm this is intentional for this pipeline.
    output_features={"targets": ENG_OUTPUT_FEATURES["targets"]},
    metric_fns=[]
)
|
|
|
|
| |
# Task: T5-style span corruption over the streamed Scandinavian corpus,
# tokenized with the Scandinavian 32k vocabulary.
dataset_name = 'NbAiLab/scandinavian'
dataset_params = {"path": dataset_name, "use_auth_token": True, "streaming": True}
dataset_shapes = None  # example counts unknown for a streaming dataset
TaskRegistry.add(
    "scandinavian_span_scandvoc",
    source=seqio.FunctionDataSource(
        dataset_fn=functools.partial(dataset_fn, dataset_params=dataset_params),
        splits=("train", "validation"),
        caching_permitted=False,
        num_input_examples=dataset_shapes,
    ),
    preprocessors=[
        # Wrap each raw string into {"inputs": None, "targets": text}.
        functools.partial(
            target_to_key, key_map={
                "inputs": None,
                "targets": None,
            }, target_key="targets"),
        seqio.preprocessors.tokenize,

        preprocessors.span_corruption,
        seqio.preprocessors.append_eos_after_trim,
    ],
    # NOTE(review): only "targets" is declared even though span_corruption
    # also emits "inputs" — confirm this is intentional for this pipeline.
    output_features={"targets": SCAND_OUTPUT_FEATURES["targets"]},
    metric_fns=[]
)
|
|
|
|
| |
# Task: UL2 mixture-of-denoisers pretraining over the streamed Scandinavian
# corpus, tokenized with the English (cc_all) vocabulary.
dataset_name = 'NbAiLab/scandinavian'
dataset_params = {"path": dataset_name, "use_auth_token": True, "streaming": True}
dataset_shapes = None  # example counts unknown for a streaming dataset
TaskRegistry.add(
    "scandinavian_ul2_engvoc",
    source=seqio.FunctionDataSource(
        dataset_fn=functools.partial(dataset_fn, dataset_params=dataset_params),
        splits=("train", "validation"),
        caching_permitted=False,
        num_input_examples=dataset_shapes,
    ),
    preprocessors=[
        # Wrap each raw string into {"inputs": None, "targets": text}.
        functools.partial(
            target_to_key, key_map={
                "inputs": None,
                "targets": None,
            }, target_key="targets"),
        seqio.preprocessors.tokenize,
        functools.partial(
            ul2_objective,
            shard_ds=False,
            use_prefix_lm_task=True,
            # Sampling rates: 0.4 total across R-denoisers, 0.4 total across
            # X-denoisers, and 0.2 for the S (prefix-LM) task appended last.
            rates=[0.4 / len(R_DENOISER_SPAN_LENGTHS)]*len(R_DENOISER_SPAN_LENGTHS) + [
                0.4 / len(X_DENOISER_SPAN_LENGTHS)]*len(X_DENOISER_SPAN_LENGTHS) + [0.2],
            mean_noise_span_lengths=R_DENOISER_SPAN_LENGTHS + X_DENOISER_SPAN_LENGTHS,
            noise_densities=R_DENOISER_CORRUPT_RATES + X_DENOISER_CORRUPT_RATES,
            # One mode-prefix token per denoiser config, plus [S2S] for the
            # prefix-LM task.
            optional_task_prefixes=[R_DENOISER_TOKEN_PREFIX]*len(R_DENOISER_SPAN_LENGTHS) + [
                X_DENOISER_TOKEN_PREFIX]*len(X_DENOISER_SPAN_LENGTHS) + [S_DENOISER_TOKEN_PREFIX],
            reserved_for_packing=5,
        ),
        seqio.preprocessors.append_eos_after_trim,
    ],
    # NOTE(review): only "targets" is declared even though the objective also
    # emits "inputs" — confirm this is intentional for this pipeline.
    output_features={"targets": ENG_OUTPUT_FEATURES["targets"]},
    metric_fns=[]
)
|
|
|
|
| |
# Task: UL2 mixture-of-denoisers pretraining over the streamed Scandinavian
# corpus, tokenized with the Scandinavian 32k vocabulary.
dataset_name = 'NbAiLab/scandinavian'
dataset_params = {"path": dataset_name, "use_auth_token": True, "streaming": True}
dataset_shapes = None  # example counts unknown for a streaming dataset
TaskRegistry.add(
    "scandinavian_ul2_scandvoc",
    source=seqio.FunctionDataSource(
        dataset_fn=functools.partial(dataset_fn, dataset_params=dataset_params),
        splits=("train", "validation"),
        caching_permitted=False,
        num_input_examples=dataset_shapes,
    ),
    preprocessors=[
        # Wrap each raw string into {"inputs": None, "targets": text}.
        functools.partial(
            target_to_key, key_map={
                "inputs": None,
                "targets": None,
            }, target_key="targets"),
        seqio.preprocessors.tokenize,
        functools.partial(
            ul2_objective,
            shard_ds=False,
            use_prefix_lm_task=True,
            # Sampling rates: 0.4 total across R-denoisers, 0.4 total across
            # X-denoisers, and 0.2 for the S (prefix-LM) task appended last.
            rates=[0.4 / len(R_DENOISER_SPAN_LENGTHS)]*len(R_DENOISER_SPAN_LENGTHS) + [
                0.4 / len(X_DENOISER_SPAN_LENGTHS)]*len(X_DENOISER_SPAN_LENGTHS) + [0.2],
            mean_noise_span_lengths=R_DENOISER_SPAN_LENGTHS + X_DENOISER_SPAN_LENGTHS,
            noise_densities=R_DENOISER_CORRUPT_RATES + X_DENOISER_CORRUPT_RATES,
            # One mode-prefix token per denoiser config, plus [S2S] for the
            # prefix-LM task.
            optional_task_prefixes=[R_DENOISER_TOKEN_PREFIX]*len(R_DENOISER_SPAN_LENGTHS) + [
                X_DENOISER_TOKEN_PREFIX]*len(X_DENOISER_SPAN_LENGTHS) + [S_DENOISER_TOKEN_PREFIX],
            reserved_for_packing=5,
        ),
        seqio.preprocessors.append_eos_after_trim,
    ],
    # NOTE(review): only "targets" is declared even though the objective also
    # emits "inputs" — confirm this is intentional for this pipeline.
    output_features={"targets": SCAND_OUTPUT_FEATURES["targets"]},
    metric_fns=[]
)
|
|
|
|
| |
# Task: UL2 mixture-of-denoisers pretraining over the streamed Scandinavian
# corpus, tokenized with the mT5 (mc4) vocabulary.
dataset_name = 'NbAiLab/scandinavian'
dataset_params = {"path": dataset_name, "use_auth_token": True, "streaming": True}
dataset_shapes = None  # example counts unknown for a streaming dataset
TaskRegistry.add(
    "scandinavian_ul2_mt5voc",
    source=seqio.FunctionDataSource(
        dataset_fn=functools.partial(dataset_fn, dataset_params=dataset_params),
        splits=("train", "validation"),
        caching_permitted=False,
        num_input_examples=dataset_shapes,
    ),
    preprocessors=[
        # Wrap each raw string into {"inputs": None, "targets": text}.
        functools.partial(
            target_to_key, key_map={
                "inputs": None,
                "targets": None,
            }, target_key="targets"),
        seqio.preprocessors.tokenize,
        functools.partial(
            ul2_objective,
            shard_ds=False,
            use_prefix_lm_task=True,
            # Sampling rates: 0.4 total across R-denoisers, 0.4 total across
            # X-denoisers, and 0.2 for the S (prefix-LM) task appended last.
            rates=[0.4 / len(R_DENOISER_SPAN_LENGTHS)]*len(R_DENOISER_SPAN_LENGTHS) + [
                0.4 / len(X_DENOISER_SPAN_LENGTHS)]*len(X_DENOISER_SPAN_LENGTHS) + [0.2],
            mean_noise_span_lengths=R_DENOISER_SPAN_LENGTHS + X_DENOISER_SPAN_LENGTHS,
            noise_densities=R_DENOISER_CORRUPT_RATES + X_DENOISER_CORRUPT_RATES,
            # One mode-prefix token per denoiser config, plus [S2S] for the
            # prefix-LM task.
            optional_task_prefixes=[R_DENOISER_TOKEN_PREFIX]*len(R_DENOISER_SPAN_LENGTHS) + [
                X_DENOISER_TOKEN_PREFIX]*len(X_DENOISER_SPAN_LENGTHS) + [S_DENOISER_TOKEN_PREFIX],
            reserved_for_packing=5,
        ),
        seqio.preprocessors.append_eos_after_trim,
    ],
    # NOTE(review): only "targets" is declared even though the objective also
    # emits "inputs" — confirm this is intentional for this pipeline.
    output_features={"targets": MT5_OUTPUT_FEATURES["targets"]},
    metric_fns=[]
)
|
|
|
|
|
|
| |
# Task: T5-style span corruption over the streamed Scandinavian corpus,
# tokenized with the mT5 (mc4) vocabulary.
dataset_name = 'NbAiLab/scandinavian'
dataset_params = {"path": dataset_name, "use_auth_token": True, "streaming": True}
dataset_shapes = None  # example counts unknown for a streaming dataset
TaskRegistry.add(
    "scandinavian_span_mt5voc",
    source=seqio.FunctionDataSource(
        dataset_fn=functools.partial(dataset_fn, dataset_params=dataset_params),
        splits=("train", "validation"),
        caching_permitted=False,
        num_input_examples=dataset_shapes,
    ),
    preprocessors=[
        # Wrap each raw string into {"inputs": None, "targets": text}.
        functools.partial(
            target_to_key, key_map={
                "inputs": None,
                "targets": None,
            }, target_key="targets"),
        seqio.preprocessors.tokenize,

        preprocessors.span_corruption,
        seqio.preprocessors.append_eos_after_trim,
    ],
    # NOTE(review): only "targets" is declared even though span_corruption
    # also emits "inputs" — confirm this is intentional for this pipeline.
    output_features={"targets": MT5_OUTPUT_FEATURES["targets"]},
    metric_fns=[]
)
|
|
|
|
|
|
|
|
|
|