Commit
·
2fa335f
1
Parent(s):
7674270
Update ul2_tasks.py
Browse files- ul2_tasks.py +24 -12
ul2_tasks.py
CHANGED
|
@@ -69,34 +69,46 @@ dataset_shapes = {"train": dataset["train"].num_rows,
|
|
| 69 |
TaskRegistry.add(
|
| 70 |
"pretrain_biological_ul2",
|
| 71 |
source=seqio.FunctionDataSource(
|
| 72 |
-
dataset_fn=functools.partial(
|
|
|
|
|
|
|
| 73 |
splits=("train", "validation"),
|
| 74 |
caching_permitted=False,
|
| 75 |
-
num_input_examples=dataset_shapes,
|
| 76 |
),
|
| 77 |
preprocessors=[
|
| 78 |
functools.partial(
|
| 79 |
-
target_to_key,
|
| 80 |
-
|
| 81 |
-
"
|
| 82 |
-
|
|
|
|
|
|
|
|
|
|
| 83 |
seqio.preprocessors.tokenize,
|
| 84 |
functools.partial(
|
| 85 |
ul2_objective,
|
| 86 |
shard_ds=False,
|
| 87 |
use_prefix_lm_task=True, # use S-denoising
|
| 88 |
-
rates=[0.4 / len(R_DENOISER_SPAN_LENGTHS)]*len(R_DENOISER_SPAN_LENGTHS)
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
| 90 |
mean_noise_span_lengths=R_DENOISER_SPAN_LENGTHS + X_DENOISER_SPAN_LENGTHS,
|
| 91 |
noise_densities=R_DENOISER_CORRUPT_RATES + X_DENOISER_CORRUPT_RATES,
|
| 92 |
-
optional_task_prefixes=[R_DENOISER_TOKEN_PREFIX]
|
| 93 |
-
|
|
|
|
|
|
|
| 94 |
reserved_for_packing=1, # make room for task prefix token
|
| 95 |
),
|
| 96 |
seqio.preprocessors.append_eos_after_trim,
|
| 97 |
],
|
| 98 |
-
output_features={
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
| 100 |
)
|
| 101 |
|
| 102 |
|
|
|
|
| 69 |
# Register the UL2 pretraining task over the biological corpus.
# NOTE(review): the rates / task-prefix lists below appear to line up
# positionally — R-denoisers first, then X-denoisers, then the single
# S-denoiser — confirm against `ul2_objective`'s expected ordering.
TaskRegistry.add(
    "pretrain_biological_ul2",
    source=seqio.FunctionDataSource(
        # dataset_fn is partially applied with the dataset path; the
        # remaining arguments are supplied by seqio at load time.
        dataset_fn=functools.partial(
            dataset_fn, path="Siddharth63/biological_dataset",
        ),
        splits=("train", "validation"),
        caching_permitted=False,
    ),
    preprocessors=[
        # Expose the raw "text" column under both "inputs" and "targets"
        # so the downstream denoising objective can corrupt/split it.
        functools.partial(
            target_to_key,
            key_map={
                "inputs": "text",
                "targets": "text",
            },
            target_key="targets",
        ),
        seqio.preprocessors.tokenize,
        functools.partial(
            ul2_objective,
            shard_ds=False,
            use_prefix_lm_task=True,  # use S-denoising
            # Per-denoiser sampling rates: 40% split evenly across the
            # R-denoisers, 40% evenly across the X-denoisers, and a flat
            # 20% for the S-denoiser appended last.
            rates=[0.4 / len(R_DENOISER_SPAN_LENGTHS)] * len(R_DENOISER_SPAN_LENGTHS)
            + [0.4 / len(X_DENOISER_SPAN_LENGTHS)] * len(X_DENOISER_SPAN_LENGTHS)
            + [
                0.2
            ],  # equal total 40% rate for both R- and X-denoisers + 20% for S-denoising (suggested at the paper chapter 4.5)
            mean_noise_span_lengths=R_DENOISER_SPAN_LENGTHS + X_DENOISER_SPAN_LENGTHS,
            noise_densities=R_DENOISER_CORRUPT_RATES + X_DENOISER_CORRUPT_RATES,
            # One task-prefix token per denoiser configuration, in the
            # same order as `rates` above.
            optional_task_prefixes=[R_DENOISER_TOKEN_PREFIX]
            * len(R_DENOISER_SPAN_LENGTHS)
            + [X_DENOISER_TOKEN_PREFIX] * len(X_DENOISER_SPAN_LENGTHS)
            + [S_DENOISER_TOKEN_PREFIX],
            reserved_for_packing=1,  # make room for task prefix token
        ),
        seqio.preprocessors.append_eos_after_trim,
    ],
    output_features={
        # "targets" reuses the shared default feature; "inputs" gets its
        # own Feature so EOS handling is explicit for the encoder side.
        "targets": DEFAULT_OUTPUT_FEATURES["targets"],
        "inputs": seqio.Feature(vocabulary=vocabulary, add_eos=True),
    },
    metric_fns=[metrics.accuracy],
)
|
| 113 |
|
| 114 |
|