lhmd commited on
Commit
dfdf337
·
verified ·
1 Parent(s): 640bd70

Upload download.py

Browse files
Files changed (1) hide show
  1. download.py +285 -0
download.py ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ This script is used to download the DL3DV-10 dataset for all resolution levels from the huggingface repo.
2
+ As the whole dataset is too large for most users, we provide this script so that you can download the dataset efficiently based on your needs.
3
+ We provide several options to download the dataset (image frames with poses):
4
+ - [X] Resolution level: 4K, 2K, 960P, 480P
5
+ - [X] Subset of the 10K, e.g. 1K(0~1K), 2K(1K~2K), 3K(2K~3K), etc
6
+ - [X] specific hash
7
+ - [X] file_type: raw video | images+poses | colmap cache
8
+
9
+ Notes:
10
+ - file_type + resolution will decide which dataset repo to download the files
11
+ - subset will decide which subdir will be used
12
+ - if hash is set, only the specific hash will be downloaded
13
+
14
+ """
15
+
16
+ import os
17
+ from os.path import join
18
+ import pandas as pd
19
+ from tqdm import tqdm
20
+ from huggingface_hub import HfApi
21
+ import argparse
22
+ import traceback
23
+ import shutil
24
+ import urllib.request
25
+ import zipfile
26
+ from huggingface_hub import HfFileSystem
27
+ from multiprocessing import Process
28
+
29
# Shared huggingface API client used by all download helpers below.
api = HfApi()
# Maps a resolution option (CLI --resolution choice) to the huggingface
# dataset repo that hosts the image frames + poses at that resolution.
resolution2repo = {
    '480P': 'DL3DV/DL3DV-ALL-480P',
    '960P': 'DL3DV/DL3DV-ALL-960P',
    '2K': 'DL3DV/DL3DV-ALL-2K',
    '4K': 'DL3DV/DL3DV-ALL-4K'
}
36
+
37
def verify_access(repo: str) -> bool:
    """ Verify whether the current user has access to the given dataset repo.

    :param repo: the repo name, e.g. 'DL3DV/DL3DV-ALL-480P'
    :return: True if the user has access, False otherwise
    """
    fs = HfFileSystem()
    try:
        # Listing the repo root fails if access to the gated dataset
        # has not been granted for the logged-in user.
        fs.ls(f'datasets/{repo}')
        return True
    except Exception:
        # Narrowed from BaseException: a broad BaseException handler would
        # also swallow KeyboardInterrupt / SystemExit and misreport them
        # as "no access".
        return False
49
+
50
+
51
def hf_download_path(repo: str, rel_path: str, odir: str, max_try: int = 5):
    """ Download one file from a huggingface dataset repo, retrying on failure.

    The hf api is not reliable, so the download is attempted up to
    ``max_try`` times before reporting failure.

    :param repo: The huggingface dataset repo
    :param rel_path: The relative path in the repo
    :param odir: output path
    :param max_try: As the downloading is not a reliable process, we will retry for max_try times
    :return: True on success, False if every attempt failed
    """
    for _ in range(max_try):
        try:
            api.hf_hub_download(repo_id=repo,
                                filename=rel_path,
                                repo_type='dataset',
                                local_dir=odir,
                                cache_dir=join(odir, '.cache'))
            return True
        except KeyboardInterrupt:
            print('Keyboard Interrupt. Exit.')
            exit()
        except BaseException:
            # Log the failure and fall through to the next attempt.
            traceback.print_exc()

    print(f"ERROR: Download {repo}/{rel_path} failed.")
    return False
79
+
80
+
81
def download_from_url(url: str, ofile: str):
    """ Download a file from the url to ofile

    :param url: The url link
    :param ofile: The output path
    :return: True if download success, False otherwise
    """
    try:
        # urlretrieve fetches the resource at `url` and stores it at `ofile`.
        urllib.request.urlretrieve(url, ofile)
    except Exception as e:
        print(f"An error occurred while downloading the file: {e}")
        return False
    return True
95
+
96
+
97
def clean_huggingface_cache(output_dir: str, repo: str):
    """ Huggingface cache may take too much space, we clean the cache to save space if necessary

    Current huggingface hub does not provide good practice to clean the space.
    We manually clean the cache directory if necessary.

    :param output_dir: the current output directory
    :param repo: the huggingface repo the cached files came from (kept for
        interface compatibility; the whole ``.cache`` dir under output_dir
        is removed regardless of repo)
    """
    # NOTE: the original computed a per-repo cache dir name but never used
    # it; the entire .cache directory is deleted.
    cur_cache_dir = join(output_dir, '.cache')

    if os.path.exists(cur_cache_dir):
        shutil.rmtree(cur_cache_dir)
112
+
113
+
114
def get_download_list(subset_opt: str, hash_name: str, reso_opt: str, file_type: str, output_dir: str):
    """ Get the download list based on the subset and hash name

    1. Get the meta file
    2. Select the subset. Based on reso_opt, get the downloading list prepared.
    3. Return the download list.

    :param subset_opt: Subset of the 10K, e.g. 1K(0~1K), 2K(1K~2K), 3K(2K~3K), etc
    :param hash_name: If provided a non-empty string, ignore the subset_opt and only download the specific hash
    :param reso_opt: The resolution to download.
    :param file_type: The file type to download: video | images+poses | colmap_cache
    :param output_dir: The output directory (the meta file is cached under its .cache subdir).
    :return: list of download items, each {'repo': ..., 'rel_path': ...}
    """
    def to_download_item(scene_hash, reso, batch, ftype):
        # Map (file type, resolution) to the hosting repo and the file's
        # relative path inside that repo.
        if ftype == 'images+poses':
            repo = resolution2repo[reso]
            rel_path = f'{batch}/{scene_hash}.zip'
        elif ftype == 'video':
            repo = 'DL3DV/DL3DV-ALL-video'
            rel_path = f'{batch}/{scene_hash}/video.mp4'
        elif ftype == 'colmap_cache':
            repo = 'DL3DV/DL3DV-ALL-ColmapCache'
            rel_path = f'{batch}/{scene_hash}.zip'
        else:
            # Previously an unknown file_type crashed with UnboundLocalError;
            # fail with a clear message instead.
            raise ValueError(f'Unknown file_type: {ftype}')
        return { 'repo': repo, 'rel_path': rel_path }

    # 1. Download (and cache) the meta file listing all valid scenes.
    meta_link = 'https://raw.githubusercontent.com/DL3DV-10K/Dataset/main/cache/DL3DV-valid.csv'
    cache_folder = join(output_dir, '.cache')
    meta_file = join(cache_folder, 'DL3DV-valid.csv')
    os.makedirs(cache_folder, exist_ok=True)
    if not os.path.exists(meta_file):
        assert download_from_url(meta_link, meta_file), 'Download meta file failed.'

    df = pd.read_csv(meta_file)

    # 2a. if hash is set, ignore the subset_opt and return that single scene
    if hash_name != '':
        assert hash_name in df['hash'].values, f'Hash {hash_name} not found in the meta file.'
        batch = df[df['hash'] == hash_name]['batch'].values[0]
        return [to_download_item(hash_name, reso_opt, batch, file_type)]

    # 2b. if hash not set, we download the whole subset.
    # (Loop variable renamed so it no longer shadows the hash_name parameter.)
    subdf = df[df['batch'] == subset_opt]
    return [to_download_item(row['hash'], reso_opt, subset_opt, file_type)
            for _, row in subdf.iterrows()]
168
+
169
+
170
def download(download_list: list, output_dir: str, is_clean_cache: bool):
    """ Download the dataset based on the download_list and user options.

    Files already present locally are skipped; downloaded zip archives are
    extracted next to their download location and then removed.

    :param download_list: the list of files to download, [{'repo', 'rel_path'}]
    :param output_dir: the output directory
    :param is_clean_cache: if set, will clean the huggingface cache to save space
    :return: True if every file was downloaded (or already present), False otherwise
    """
    # NOTE: the original docstring documented a nonexistent `reso_opt` param.
    succ_count = 0

    for item in tqdm(download_list, desc='Downloading'):
        repo = item['repo']
        rel_path = item['rel_path']

        # skip if already exists locally; zips get extracted to a path
        # without the .zip suffix, so test that path
        output_path = os.path.join(output_dir, rel_path).replace('.zip', '')
        if os.path.exists(output_path):
            succ_count += 1
            continue

        succ = hf_download_path(repo, rel_path, output_dir)
        if not succ:
            print(f'Download {rel_path} failed')
            continue

        succ_count += 1
        if is_clean_cache:
            clean_huggingface_cache(output_dir, repo)

        # unzip the downloaded archive in place and delete it to save space
        if rel_path.endswith('.zip'):
            zip_file = join(output_dir, rel_path)
            with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                ofile = join(output_dir, os.path.dirname(rel_path))
                zip_ref.extractall(ofile)
            os.remove(zip_file)

    print(f'Summary: {succ_count}/{len(download_list)} files downloaded successfully')
    return succ_count == len(download_list)
210
+
211
+
212
def download_dataset(args):
    """ Download the dataset based on the user inputs.

    :param args: argparse args. Used to decide the subset.
    :return: download success or not
    """
    os.makedirs(args.odir, exist_ok=True)

    # Build the list of files to fetch, then hand it to the downloader.
    items = get_download_list(args.subset,
                              args.hash,
                              args.resolution,
                              args.file_type,
                              args.odir)
    return download(items, args.odir, args.clean_cache)
229
+
230
+
231
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--odir', type=str, help='output directory', required=True)
    parser.add_argument('--subset', choices=['1K', '2K', '3K', '4K', '5K', '6K', '7K', '8K', '9K', '10K', '11K', 'all'], help='The subset of the benchmark to download', required=True)
    parser.add_argument('--resolution', choices=['4K', '2K', '960P', '480P'], help='The resolution to download', required=True)
    parser.add_argument('--file_type', choices=['images+poses', 'video', 'colmap_cache'], help='The file type to download', required=True, default='images+poses')
    parser.add_argument('--hash', type=str, help='If set subset=hash, this is the hash code of the scene to download', default='')
    parser.add_argument('--clean_cache', action='store_true', help='If set, will clean the huggingface cache to save space')
    params = parser.parse_args()

    # argparse `choices` already constrains file_type; kept as a defensive check.
    assert params.file_type in ['images+poses', 'video', 'colmap_cache'], 'Check the file_type input.'

    # Resolve which gated repo we need access to for the requested file type.
    if params.file_type == 'images+poses':
        repo = resolution2repo[params.resolution]
    elif params.file_type == 'video':
        repo = 'DL3DV/DL3DV-ALL-video'
    elif params.file_type == 'colmap_cache':
        repo = 'DL3DV/DL3DV-ALL-ColmapCache'

    if not verify_access(repo):
        print(f'You have not been granted access yet. Go to the relevant huggingface repo (https://huggingface.co/datasets/{repo}) and apply for access.')
        exit(1)

    # Handle 'all' subset option: download all 11 subsets in parallel
    if params.subset == 'all':
        subsets = ['1K', '2K', '3K', '4K', '5K', '6K', '7K', '8K', '9K', '10K', '11K']
        processes = []

        print('Downloading all 11 subsets in parallel...')
        for subset in subsets:
            # Create a copy of params for each subprocess
            subset_params = argparse.Namespace(
                odir=params.odir,
                subset=subset,
                resolution=params.resolution,
                file_type=params.file_type,
                hash=params.hash,
                clean_cache=params.clean_cache
            )
            p = Process(target=download_dataset, args=(subset_params,))
            p.start()
            processes.append(p)
            print(f'Started process for subset {subset}')

        # Wait for all processes to complete
        for p in processes:
            p.join()

        print('All downloads completed. Refer to', params.odir)
    else:
        # Single subset download
        if download_dataset(params):
            print('Download Done. Refer to', params.odir)
        else:
            print(f'Download to {params.odir} failed. See error message.')