Commit ·
3114e37
1
Parent(s): 608ff23
Update ul2_tasks.py
Browse files- ul2_tasks.py +0 -47
ul2_tasks.py
CHANGED
|
@@ -62,9 +62,6 @@ def target_to_key(x, key_map, target_key):
|
|
| 62 |
return {**key_map, target_key: x}
|
| 63 |
|
| 64 |
|
| 65 |
-
dataset_shapes = {"train": dataset["train"].num_rows,
|
| 66 |
-
"validation": dataset["validation"].num_rows}
|
| 67 |
-
|
| 68 |
TaskRegistry.add(
|
| 69 |
"pretrain_medical_ul2",
|
| 70 |
source=seqio.FunctionDataSource(
|
|
@@ -109,47 +106,3 @@ TaskRegistry.add(
|
|
| 109 |
},
|
| 110 |
metric_fns=[metrics.accuracy],
|
| 111 |
)
|
| 112 |
-
|
| 113 |
-
# dataset_name = "gs://medical-siddharth/medical_data"
|
| 114 |
-
# dataset_params = {"from_disk_path": dataset_name}
|
| 115 |
-
|
| 116 |
-
# if "from_disk_path" in dataset_params:
|
| 117 |
-
# dataset = load_from_disk(dataset_params.get("from_disk_path"))
|
| 118 |
-
# else:
|
| 119 |
-
# dataset = load_dataset(**dataset_params)
|
| 120 |
-
|
| 121 |
-
# dataset_shapes = {"train": dataset["train"].num_rows,
|
| 122 |
-
# "validation": dataset["validation"].num_rows}
|
| 123 |
-
|
| 124 |
-
# TaskRegistry.add(
|
| 125 |
-
# "pretrain_medical_ul2",
|
| 126 |
-
# source=seqio.FunctionDataSource(
|
| 127 |
-
# dataset_fn=functools.partial(dataset_fn, dataset=dataset),
|
| 128 |
-
# splits=("train", "validation"),
|
| 129 |
-
# caching_permitted=False,
|
| 130 |
-
# num_input_examples=dataset_shapes,
|
| 131 |
-
# ),
|
| 132 |
-
# preprocessors=[
|
| 133 |
-
# functools.partial(
|
| 134 |
-
# target_to_key, key_map={
|
| 135 |
-
# "inputs": None,
|
| 136 |
-
# "targets": None,
|
| 137 |
-
# }, target_key="targets"),
|
| 138 |
-
# seqio.preprocessors.tokenize,
|
| 139 |
-
# functools.partial(
|
| 140 |
-
# ul2_objective,
|
| 141 |
-
# shard_ds=False,
|
| 142 |
-
# use_prefix_lm_task=True, # use S-denoising
|
| 143 |
-
# rates=[0.4 / len(R_DENOISER_SPAN_LENGTHS)]*len(R_DENOISER_SPAN_LENGTHS) + [
|
| 144 |
-
# 0.4 / len(X_DENOISER_SPAN_LENGTHS)]*len(X_DENOISER_SPAN_LENGTHS) + [0.2], # equal total 40% rate for both R- and X-denoisers + 20% for S-denoising (suggested at the paper chapter 4.5)
|
| 145 |
-
# mean_noise_span_lengths=R_DENOISER_SPAN_LENGTHS + X_DENOISER_SPAN_LENGTHS,
|
| 146 |
-
# noise_densities=R_DENOISER_CORRUPT_RATES + X_DENOISER_CORRUPT_RATES,
|
| 147 |
-
# optional_task_prefixes=[R_DENOISER_TOKEN_PREFIX]*len(R_DENOISER_SPAN_LENGTHS) + [
|
| 148 |
-
# X_DENOISER_TOKEN_PREFIX]*len(X_DENOISER_SPAN_LENGTHS) + [S_DENOISER_TOKEN_PREFIX],
|
| 149 |
-
# reserved_for_packing=1, # make room for task prefix token
|
| 150 |
-
# ),
|
| 151 |
-
# seqio.preprocessors.append_eos_after_trim,
|
| 152 |
-
# ],
|
| 153 |
-
# output_features={"targets": DEFAULT_OUTPUT_FEATURES["targets"]},
|
| 154 |
-
# metric_fns=[metrics.accuracy]
|
| 155 |
-
# )
|
|
|
|
| 62 |
return {**key_map, target_key: x}
|
| 63 |
|
| 64 |
|
|
|
|
|
|
|
|
|
|
| 65 |
TaskRegistry.add(
|
| 66 |
"pretrain_medical_ul2",
|
| 67 |
source=seqio.FunctionDataSource(
|
|
|
|
| 106 |
},
|
| 107 |
metric_fns=[metrics.accuracy],
|
| 108 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|