File size: 11,829 Bytes
dfdf337
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0299d25
dfdf337
 
0299d25
 
 
 
 
dfdf337
 
0299d25
 
 
 
 
 
 
 
 
 
 
 
dfdf337
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
""" This script is used to download the DL3DV-10 dataset for all resolution levels from the huggingface repo.
    As the whole dataset is too large for most users, we provide this script so that you can download the dataset efficiently based on your needs.
    We provide several options to download the dataset (image frames with poses):
        - [X] Resolution level: 4K, 2K, 960P, 480P  
        - [X] Subset of the 10K, e.g. 1K(0~1K), 2K(1K~2K), 3K(2K~3K), etc
        - [X] specific hash 
        - [X] file_type: raw video | images+poses | colmap cache 

    Notes:
        - file_type + resolution will decide which dataset repo to download the files 
        - subset will decide which subdir will be used 
        - if hash is set, only the specific hash will be downloaded

"""

import os
from os.path import join
import pandas as pd
from tqdm import tqdm
from huggingface_hub import HfApi
import argparse
import traceback
import shutil
import urllib.request
import zipfile
from huggingface_hub import HfFileSystem
from multiprocessing import Process

# Shared huggingface API client used by all download helpers in this script.
api = HfApi()
# Maps a --resolution option to the huggingface dataset repo that hosts the
# image frames + poses at that resolution (used for file_type 'images+poses').
resolution2repo = {
    '480P': 'DL3DV/DL3DV-ALL-480P',
    '960P': 'DL3DV/DL3DV-ALL-960P',
    '2K': 'DL3DV/DL3DV-ALL-2K',
    '4K': 'DL3DV/DL3DV-ALL-4K'
}

def verify_access(repo: str) -> bool:
    """ Verify that the current user has access to the (gated) huggingface repo.

    :param repo: the repo name, e.g. 'DL3DV/DL3DV-ALL-480P'
    :return: True if the user has access, False otherwise
    """
    fs = HfFileSystem()
    try:
        # Listing the dataset root fails if access has not been granted.
        fs.ls(f'datasets/{repo}')
        return True
    except KeyboardInterrupt:
        # BUG FIX: the original `except BaseException` swallowed Ctrl+C and
        # reported it as "no access"; let the interrupt propagate instead.
        raise
    except Exception:
        # Any other failure (401/403, missing repo, network error) means
        # the repo is not usable by this user.
        return False


def hf_download_path(repo: str, rel_path: str, odir: str, max_try: int = 5):
    """ Download one file from a huggingface dataset repo, retrying on failure.

        The hf api is not fully reliable, so failed attempts are retried up
        to max_try times before giving up.

    :param repo: The huggingface dataset repo
    :param rel_path: The relative path of the file inside the repo
    :param odir: The local output directory
    :param max_try: Maximum number of attempts before reporting failure
    :return: True on success, False after max_try failed attempts
    """
    for _ in range(max_try):
        try:
            api.hf_hub_download(repo_id=repo,
                                filename=rel_path,
                                repo_type='dataset',
                                local_dir=odir,
                                cache_dir=join(odir, '.cache'))
            return True
        except KeyboardInterrupt:
            print('Keyboard Interrupt. Exit.')
            exit()
        except BaseException:
            # Log the failure and fall through to the next attempt.
            traceback.print_exc()

    print(f"ERROR: Download {repo}/{rel_path} failed.")
    return False
    

def download_from_url(url: str, ofile: str):
    """ Fetch a single file from a url and store it locally at ofile.

    :param url: The url link
    :param ofile: The output path
    :return: True if download success, False otherwise
    """
    try:
        # urlretrieve fetches the resource at `url` and writes it to `ofile`.
        urllib.request.urlretrieve(url, ofile)
    except Exception as e:
        print(f"An error occurred while downloading the file: {e}")
        return False
    return True


def clean_huggingface_cache(output_dir: str, repo: str):
    """ Remove the huggingface cache directory under output_dir to save space.

        Current huggingface hub does not provide a good practice to reclaim
        cached space, so we manually delete the cache directory.

    :param output_dir: the current output directory; its '.cache' subdirectory
        is deleted if present
    :param repo: the huggingface repo; kept for interface compatibility but
        unused, since the whole '.cache' dir is removed rather than the
        per-repo 'datasets--<org>--<name>' subdirectory
    """
    # BUG FIX: the original computed a per-repo cache dir name
    # (repo.replace('/', '--')) but never used it — dead local removed.
    cur_cache_dir = join(output_dir, '.cache')
    if os.path.exists(cur_cache_dir):
        shutil.rmtree(cur_cache_dir)
    

def get_download_list(subset_opt: str, hash_name: str, reso_opt: str, file_type: str, output_dir: str):
    """ Build the list of files to download.

        1. Fetch (or reuse) the meta csv that maps scene hash -> batch.
        2. If hash_name is given, resolve just that scene; otherwise collect
           every scene whose batch equals subset_opt.
        3. Translate each scene into a {'repo', 'rel_path'} download item.

    :param subset_opt: Subset of the 10K, e.g. 1K(0~1K), 2K(1K~2K), 3K(2K~3K), etc
    :param hash_name: If provided a non-empty string, ignore the subset_opt and only download the specific hash
    :param reso_opt: The resolution to download.
    :param file_type: The file type to download: video | images+poses | colmap_cache
    :param output_dir: The output directory.
    :return: list of {'repo': str, 'rel_path': str} items
    """
    def make_item(scene_hash, batch):
        # file_type (+ resolution) decides which dataset repo hosts the file
        if file_type == 'images+poses':
            repo, rel_path = resolution2repo[reso_opt], f'{batch}/{scene_hash}.zip'
        elif file_type == 'video':
            repo, rel_path = 'DL3DV/DL3DV-ALL-video', f'{batch}/{scene_hash}/video.mp4'
        elif file_type == 'colmap_cache':
            repo, rel_path = 'DL3DV/DL3DV-ALL-ColmapCache', f'{batch}/{scene_hash}.zip'
        return { 'repo': repo, 'rel_path': rel_path }

    cache_folder = join(output_dir, '.cache')
    meta_file = join(cache_folder, 'DL3DV-valid.csv')
    os.makedirs(cache_folder, exist_ok=True)
    if not os.path.exists(meta_file):
        meta_link = 'https://raw.githubusercontent.com/DL3DV-10K/Dataset/main/cache/DL3DV-valid.csv'
        assert download_from_url(meta_link, meta_file), 'Download meta file failed.'

    df = pd.read_csv(meta_file)

    # A specific hash overrides the subset option.
    if hash_name != '':
        assert hash_name in df['hash'].values, f'Hash {hash_name} not found in the meta file.'
        batch = df[df['hash'] == hash_name]['batch'].values[0]
        return [make_item(hash_name, batch)]

    # Otherwise download every scene recorded for the requested subset.
    subdf = df[df['batch'] == subset_opt]
    return [make_item(row['hash'], subset_opt) for _, row in subdf.iterrows()]


def download(download_list: list, output_dir: str, is_clean_cache: bool):
    """ Download the dataset based on the download_list and user options.

    :param download_list: the list of files to download, [{'repo', 'rel_path'}]
    :param output_dir: the output directory
    :param is_clean_cache: if set, will clean the huggingface cache to save space
    :return: True if every item was downloaded (or already present) successfully
    """
    succ_count = 0

    for item in tqdm(download_list, desc='Downloading'):
        repo = item['repo']
        rel_path = item['rel_path']

        # A zip is extracted to a directory of the same name, so the presence
        # of the stripped path means the scene is already downloaded.
        output_path = os.path.join(output_dir, rel_path)
        output_path = output_path.replace('.zip', '')
        if os.path.exists(output_path):
            succ_count += 1
            continue

        succ = hf_download_path(repo, rel_path, output_dir)
        if not succ:
            print(f'Download {rel_path} failed')
            continue

        succ_count += 1
        if is_clean_cache:
            clean_huggingface_cache(output_dir, repo)

        # unzip the file
        if rel_path.endswith('.zip'):
            zip_file = join(output_dir, rel_path)
            hash_name = os.path.basename(rel_path).replace('.zip', '')
            # Create target directory: output_dir/batch/hash_name
            ofile = join(output_dir, os.path.dirname(rel_path), hash_name)
            os.makedirs(ofile, exist_ok=True)

            with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                zip_ref.extractall(ofile)

            # Flatten a nested hash/hash/ structure if the zip contains one.
            # BUG FIX: the inner loop previously reused the name `item`,
            # shadowing the outer loop's download dict; renamed to `entry`.
            inner_hash_dir = join(ofile, hash_name)
            if os.path.isdir(inner_hash_dir):
                for entry in os.listdir(inner_hash_dir):
                    shutil.move(join(inner_hash_dir, entry), join(ofile, entry))
                # Remove the now-empty inner hash directory
                os.rmdir(inner_hash_dir)

            os.remove(zip_file)

    print(f'Summary: {succ_count}/{len(download_list)} files downloaded successfully')
    return succ_count == len(download_list)


def download_dataset(args):
    """ Download the dataset described by the parsed command-line arguments.

    :param args: argparse args. Used to decide the subset.
    :return: download success or not
    """
    os.makedirs(args.odir, exist_ok=True)

    # Resolve the user's options into a concrete list of files, then fetch them.
    todo = get_download_list(args.subset, args.hash, args.resolution,
                             args.file_type, args.odir)
    return download(todo, args.odir, args.clean_cache)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--odir', type=str, help='output directory', required=True)
    parser.add_argument('--subset', choices=['1K', '2K', '3K', '4K', '5K', '6K', '7K', '8K', '9K', '10K', '11K', 'all'], help='The subset of the benchmark to download', required=True)
    # typo fix: 'donwnload' -> 'download'
    parser.add_argument('--resolution', choices=['4K', '2K', '960P', '480P'], help='The resolution to download', required=True)
    parser.add_argument('--file_type', choices=['images+poses', 'video', 'colmap_cache'], help='The file type to download', required=True, default='images+poses')
    parser.add_argument('--hash', type=str, help='If set subset=hash, this is the hash code of the scene to download', default='')
    parser.add_argument('--clean_cache', action='store_true', help='If set, will clean the huggingface cache to save space')
    params = parser.parse_args()

    assert params.file_type in ['images+poses', 'video', 'colmap_cache'], 'Check the file_type input.'

    # file_type (+ resolution) decides which repo we need access to.
    if params.file_type == 'images+poses':
        repo = resolution2repo[params.resolution]
    elif params.file_type == 'video':
        repo = 'DL3DV/DL3DV-ALL-video'
    elif params.file_type == 'colmap_cache':
        repo = 'DL3DV/DL3DV-ALL-ColmapCache'

    if not verify_access(repo):
        print(f'You have not been granted access yet. Go to the relevant huggingface repo (https://huggingface.co/datasets/{repo}) and apply for access.')
        exit(1)

    # Handle 'all' subset option: download all 11 subsets in parallel.
    # BUG FIX: when --hash is set, get_download_list resolves only that one
    # scene regardless of subset, so spawning 11 processes would race to
    # download the same file — a set hash always takes the single path below.
    if params.subset == 'all' and params.hash == '':
        subsets = ['1K', '2K', '3K', '4K', '5K', '6K', '7K', '8K', '9K', '10K', '11K']
        processes = []

        print(f'Downloading all 11 subsets in parallel...')
        for subset in subsets:
            # Create a copy of params for each subprocess
            subset_params = argparse.Namespace(
                odir=params.odir,
                subset=subset,
                resolution=params.resolution,
                file_type=params.file_type,
                hash=params.hash,
                clean_cache=params.clean_cache
            )
            p = Process(target=download_dataset, args=(subset_params,))
            p.start()
            processes.append(p)
            print(f'Started process for subset {subset}')

        # Wait for all processes to complete
        for p in processes:
            p.join()

        print('All downloads completed. Refer to', params.odir)
    else:
        # Single subset (or single hash) download
        if download_dataset(params):
            print('Download Done. Refer to', params.odir)
        else:
            # typo fix: 'messsage' -> 'message'
            print(f'Download to {params.odir} failed. See error message.')