import json
import logging

from datasets import load_dataset
from tqdm import tqdm

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    level="INFO",
    datefmt="[%X]",
)

logger = logging.getLogger(__name__)
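
# Download the Spanish KenLM model from CC-Net, used here for perplexity scoring;
# the "!" shell escape assumes this runs in an IPython/Jupyter environment.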
!wget http://dl.fbaipublicfiles.com/cc_net/lm/es.arpa.bin
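
# Train split: "steps" perplexity sampling with factor 1.5e5 over the mC4-es
# perplexity distribution; the boundaries are presumably its quartile cut-offs.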
mc4 = load_dataset(
    "./mc4",
    "es",
    split="train",
    sampling_method="steps",
    perplexity_model="./es.arpa.bin",
    sampling_factor=1.5e5,
    boundaries=[536394.99320948, 662247.50212365, 919250.87225178],
    streaming=True,
).shuffle(buffer_size=10000, seed=2021)
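
# Stream the shuffled dataset and write the first 50M sampled documents to JSONL.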
total = 0
with open("mc4-es-train-50M-steps.jsonl", "w") as f:
    for sample in tqdm(mc4, total=50_000_000):
        f.write(json.dumps(sample) + "\n")
        total += 1
        if total >= 50_000_000:
            break
|
|
# Validation split: "steps" sampling with factor 5e5; keep the first 5M documents.
mc4val = load_dataset(
    "./mc4",
    "es",
    split="validation",
    sampling_method="steps",
    perplexity_model="./es.arpa.bin",
    sampling_factor=5e5,
    boundaries=[536394.99320948, 662247.50212365, 919250.87225178],
    streaming=True,
).shuffle(buffer_size=10000, seed=2021)
total = 0
with open("mc4-es-validation-5M-steps.jsonl", "w") as f:
    for sample in tqdm(mc4val, total=5_000_000):
        f.write(json.dumps(sample) + "\n")
        total += 1
        if total >= 5_000_000:
            break
|
|
|
|
|
|
import json
import logging

from datasets import load_dataset
from tqdm import tqdm

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    level="INFO",
    datefmt="[%X]",
)

logger = logging.getLogger(__name__)
|
|
|
|
# Train split: "gaussian" perplexity sampling with factor 0.78; keep the first 50M documents.
mc4 = load_dataset(
    "./mc4",
    "es",
    split="train",
    sampling_method="gaussian",
    perplexity_model="../es.arpa.bin",
    sampling_factor=0.78,
    boundaries=[536394.99320948, 662247.50212365, 919250.87225178],
    streaming=True,
).shuffle(buffer_size=10000, seed=2021)
total = 0
with open("mc4-es-train-50M-gaussian.jsonl", "w") as f:
    for sample in tqdm(mc4, total=50_000_000):
        f.write(json.dumps(sample) + "\n")
        total += 1
        if total >= 50_000_000:
            break
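
# Validation split: "gaussian" sampling with factor 1; keep the first 5M documents.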
mc4val = load_dataset(
    "./mc4",
    "es",
    split="validation",
    sampling_method="gaussian",
    perplexity_model="../es.arpa.bin",
    sampling_factor=1,
    boundaries=[536394.99320948, 662247.50212365, 919250.87225178],
    streaming=True,
).shuffle(buffer_size=10000, seed=2021)
total = 0
with open("mc4-es-validation-5M-gaussian.jsonl", "w") as f:
    for sample in tqdm(mc4val, total=5_000_000):
        f.write(json.dumps(sample) + "\n")
        total += 1
        if total >= 5_000_000:
            break
|
|
|
|
|
|
import json
import logging

from datasets import load_dataset
from tqdm import tqdm

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    level="INFO",
    datefmt="[%X]",
)

logger = logging.getLogger(__name__)
|
|
|
|
# Train split: "random" sampling with factor 0.5; keep the first 50M documents.
mc4 = load_dataset(
    "./mc4",
    "es",
    split="train",
    sampling_method="random",
    perplexity_model="../es.arpa.bin",
    sampling_factor=0.5,
    boundaries=[536394.99320948, 662247.50212365, 919250.87225178],
    streaming=True,
).shuffle(buffer_size=10000, seed=2021)
total = 0
with open("mc4-es-train-50M-random.jsonl", "w") as f:
    for sample in tqdm(mc4, total=50_000_000):
        f.write(json.dumps(sample) + "\n")
        total += 1
        if total >= 50_000_000:
            break
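
# Validation split: "random" sampling with factor 0.5; keep the first 5M documents.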
mc4val = load_dataset(
    "./mc4",
    "es",
    split="validation",
    sampling_method="random",
    perplexity_model="../es.arpa.bin",
    sampling_factor=0.5,
    boundaries=[536394.99320948, 662247.50212365, 919250.87225178],
    streaming=True,
).shuffle(buffer_size=10000, seed=2021)
total = 0
with open("mc4-es-validation-5M-random.jsonl", "w") as f:
    for sample in tqdm(mc4val, total=5_000_000):
        f.write(json.dumps(sample) + "\n")
        total += 1
        if total >= 5_000_000:
            break
|
|
|
|
|
|
| ------------ |
|
|
|
|