Fraser committed on
Commit
2df9eb0
·
1 Parent(s): 7633929

rearrange files

Browse files
ag_news_clm.sh → _ag_news_clm.sh RENAMED
File without changes
ag_news_load.sh → _ag_news_load.sh RENAMED
File without changes
ag_news.sh CHANGED
@@ -1,4 +1,4 @@
1
- export RUN_NAME=test
2
 
3
  ./venv/bin/python train.py \
4
  --t5_model_name_or_path="t5-base" \
@@ -16,5 +16,5 @@ export RUN_NAME=test
16
  --learning_rate="5e-3" --warmup_steps="1000" \
17
  --adam_beta1="0.9" --adam_beta2="0.98" --weight_decay="0.01" \
18
  --overwrite_output_dir \
19
- --num_train_epochs="20" \
20
  --push_to_hub \
 
1
+ export RUN_NAME=ag_news
2
 
3
  ./venv/bin/python train.py \
4
  --t5_model_name_or_path="t5-base" \
 
16
  --learning_rate="5e-3" --warmup_steps="1000" \
17
  --adam_beta1="0.9" --adam_beta2="0.98" --weight_decay="0.01" \
18
  --overwrite_output_dir \
19
+ --num_train_epochs="3" \
20
  --push_to_hub \
datasets/dataset.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # unused
2
+ """Wikipedia Sentences"""
3
+
4
+ from __future__ import absolute_import, division, print_function
5
+
6
+ import os
7
+ import json
8
+
9
+ import datasets
10
+
11
+
12
+ _DESCRIPTION = """\
13
+ Dataset of sentences from Wikipedia (from the [Optimus paper](https://arxiv.org/abs/2004.04092)).
14
+ Each is of mex 64 words & <=256 GPT2 tokens.
15
+ Each row is a tokenised sentence.
16
+ {'token_ids': '{gpt2 token ids}'}
17
+ This is to test the semantics of a Transformer-VAEs latent space by interpolating on sentences.
18
+ """
19
+
20
+ NUM_SEGMENTS = 5
21
+ _TRAIN_DOWNLOAD_URL = r"https://storage.googleapis.com/t-vae/wikipedia_json_64_filtered_segment_{0}.zip"
22
+
23
+
24
+ class WikiSentencesConfig(datasets.BuilderConfig):
25
+ """BuilderConfig for WikiSentences."""
26
+
27
+ def __init__(self, segment=None, max_num_samples=None, **kwargs):
28
+ """BuilderConfig for WikiSentences.
29
+ Args:
30
+ segment_num: keyword argument to specify the segment of the dataset to load
31
+ **kwargs: keyword arguments forwarded to super.
32
+ """
33
+ self.segment = segment
34
+ self.max_num_samples = max_num_samples
35
+ super(WikiSentencesConfig, self).__init__(**kwargs)
36
+
37
+
38
+ class WikiSentences(datasets.GeneratorBasedBuilder):
39
+ """Sentences from Wikipedia."""
40
+
41
+ BUILDER_CONFIGS = [
42
+ WikiSentencesConfig(
43
+ name=f"segment_{i}",
44
+ description=f"Segment {i+1}/{NUM_SEGMENTS} of WikiSentences Dataset for interpolating on natural language.",
45
+ segment=i
46
+ ) for i in range(NUM_SEGMENTS)
47
+ ] + [
48
+ WikiSentencesConfig(
49
+ name=f"1M_segment_{i}",
50
+ description=f"Segment {i+1}/{NUM_SEGMENTS} of WikiSentences Dataset for interpolating on natural language.",
51
+ segment=i, max_num_samples=1_000_000
52
+ ) for i in range(NUM_SEGMENTS)
53
+ ]
54
+
55
+ def _info(self):
56
+ return datasets.DatasetInfo(
57
+ description=_DESCRIPTION,
58
+ features=datasets.Features(
59
+ {
60
+ 'token_ids': [datasets.Value("int32")],
61
+ }
62
+ ),
63
+ homepage="https://github.com/Fraser-Greenlee/transformer-vae",
64
+ )
65
+
66
+ def _split_generators(self, dl_manager):
67
+ assert(self.config.segment < NUM_SEGMENTS), f'Segment does not exist, requested segment {self.config.segment}, but max segment num is {NUM_SEGMENTS - 1}'
68
+ folder_path = dl_manager.download_and_extract(_TRAIN_DOWNLOAD_URL.format(self.config.segment))
69
+ return [
70
+ datasets.SplitGenerator(
71
+ name=datasets.Split.TRAIN, gen_kwargs={"filepath": os.path.join(folder_path, 'segment_output.jsonl')}
72
+ ),
73
+ ]
74
+
75
+ def _generate_examples(self, filepath):
76
+ """Generate examples."""
77
+ with open(filepath, encoding="utf-8") as json_lines_file:
78
+ for id_, line in enumerate(json_lines_file):
79
+ yield id_, json.loads(line)
80
+ if id_ >= self.config.max_num_samples:
81
+ break