| import os | |
| import time | |
| from multiprocessing import Pool | |
| from tqdm import tqdm | |
| from huggingface_hub import Repository | |
def save_shard(shard_tuple):
    """Write a single shard to disk as a parquet file.

    Args:
        shard_tuple (tuple): a ``(filename, shard)`` pair; ``shard`` must
            expose a ``to_parquet(path)`` method and ``filename`` is the
            destination path handed to it.
    """
    destination, piece = shard_tuple
    # call piece.to_json(destination) instead to save as a json file
    piece.to_parquet(destination)
| # def save_manual_shards(ds, user="loubnabnl", remote_dataset_repo="bigcode-pii-pjj"): | |
| # """Save sharded data | |
| # Args: | |
| # ds (Dataset): dataset to be saved | |
| # user (str): user name | |
| # remote_dataset_repo (str): remote dataset repository | |
| # out_path (str): path to save the shards""" | |
| # # this will create a folder OUT_PATH that is a clone of REMOTE_DATASET_REPO | |
| # # you can save the shards inside it and do git add/commit/push to push data to the hub | |
| # out_path = remote_dataset_repo | |
| # # if out path doesn't already exist | |
| # if not os.path.exists(out_path): | |
| # repo = Repository( | |
| # local_dir=out_path, | |
| # clone_from=user + "/" + remote_dataset_repo, | |
| # repo_type="dataset", | |
| # private=True, | |
| # use_auth_token=True, | |
| # git_user=user | |
| # ) | |
| # # files will be numerous we save them in a folder called data inside out_path | |
| # os.mkdir(out_path + "/data") | |
| # SHARD_SIZE = 1000 << 20 | |
| # if ds._indices is not None: | |
| # dataset_nbytes = ds.data.nbytes * len(ds._indices) / len(ds.data) | |
| # else: | |
| # dataset_nbytes = ds.data.nbytes | |
| # num_shards = int(dataset_nbytes / SHARD_SIZE) + 1 | |
| # print(f"Number of shards: {num_shards}") | |
| # print("sharding the dataset") | |
| # t_start = time.time() | |
| # shards = (ds.shard(num_shards=num_shards, index=i, contiguous=True) for i in range(num_shards)) | |
| # # use f"{OUT_PATH}/data/train-{index:05d}-of-{num_shards:05d}.json" instead for json files | |
| # filenames = (f"{out_path}/data/train-{index:05d}-of-{num_shards:05d}.parquet" for index in range(num_shards)) | |
| # with Pool(16) as p: | |
| # list(tqdm(p.imap_unordered(save_shard, zip(filenames, shards), chunksize=4), total=num_shards)) | |
| # print(f"Time to save dataset: {time.time()-t_start:.2f}") | |
| # # to push dataset to hub do: git add/commit/push inside OUT_PATH | |
def save_manual_shards(
    ds,
    user="loubnabnl",
    remote_dataset_repo="bigcode-pii-pjj",
    shard_size=1000 << 20,
    num_workers=16,
):
    """Save a dataset as parquet shards inside a local clone of a hub dataset repo.

    The repository ``{user}/{remote_dataset_repo}`` is cloned into a local
    folder named ``remote_dataset_repo`` (skipped when that folder already
    exists) and the shards are written into its ``data`` sub-folder.
    Nothing is pushed by this function: run git add/commit/push inside the
    local folder to upload the data to the hub.

    Args:
        ds (Dataset): dataset to be saved
        user (str): user name owning the remote repository
        remote_dataset_repo (str): remote dataset repository name; also used
            as the path of the local output folder
        shard_size (int): target size of a single shard, in bytes
        num_workers (int): number of worker processes writing shards
    """
    # Local import keeps this block self-contained; huggingface_hub is
    # already a dependency of this file (Repository comes from it).
    from huggingface_hub import create_repo

    # out_path is a local clone of the remote dataset repository
    out_path = remote_dataset_repo
    if not os.path.exists(out_path):
        repo_id = f"{user}/{remote_dataset_repo}"
        # Repository objects have no create_repo() method, so the private
        # remote is created with the module-level helper before cloning.
        # exist_ok=True makes this a no-op when the remote already exists.
        create_repo(repo_id, repo_type="dataset", private=True, exist_ok=True)
        # Instantiating Repository with clone_from performs the clone.
        Repository(
            local_dir=out_path,
            clone_from=repo_id,
            repo_type="dataset",
            git_user=user,
        )

    # files will be numerous, so they go in a folder called data inside
    # out_path; exist_ok lets the function be re-run on an existing clone
    data_dir = f"{out_path}/data"
    os.makedirs(data_dir, exist_ok=True)

    # Estimate the dataset size: when an indices mapping is present, scale
    # the size of the underlying table by the fraction of rows selected.
    if ds._indices is not None:
        dataset_nbytes = ds.data.nbytes * len(ds._indices) / len(ds.data)
    else:
        dataset_nbytes = ds.data.nbytes
    num_shards = int(dataset_nbytes / shard_size) + 1
    print(f"Number of shards: {num_shards}")

    print("sharding the dataset")
    t_start = time.time()
    # Both sequences are lazy generators, consumed in lockstep by zip below.
    shards = (
        ds.shard(num_shards=num_shards, index=i, contiguous=True)
        for i in range(num_shards)
    )
    # use f"{data_dir}/train-{index:05d}-of-{num_shards:05d}.json" instead for json files
    filenames = (
        f"{data_dir}/train-{index:05d}-of-{num_shards:05d}.parquet"
        for index in range(num_shards)
    )

    with Pool(num_workers) as p:
        # list() drains the iterator so every shard is written before the
        # pool is torn down; tqdm only reports progress.
        list(
            tqdm(
                p.imap_unordered(save_shard, zip(filenames, shards), chunksize=4),
                total=num_shards,
            )
        )
    print(f"Time to save dataset: {time.time()-t_start:.2f}")
    # to push dataset to hub do: git add/commit/push inside out_path