English
File size: 8,866 Bytes
26225c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
import os
import h5py
import torch
import socket
import numpy as np
from time import time
from datetime import datetime
from src.utils.tensor import tensor_idx, cast_numpyfy
from src.utils.sparse import dense_to_csr, csr_to_dense


# Public API of this module, i.e. the names exported by a star-import
__all__ = [
    'date_time_string', 'dated_dir', 'host_data_root', 'save_tensor',
    'load_tensor', 'save_tensor_dict', 'load_tensor_dict', 'save_dense_to_csr',
    'load_csr_to_dense']


def date_time_string():
    """Return a string holding the current local date and time. Useful
    for creating an output file or directory.

    The string has the form
    '<year>-<month>-<day>_<hour>-<minute>-<second>', without any
    zero-padding of the fields (e.g. '2023-4-9_7-5-30').

    :return: str
    """
    # Sample the clock only once, so that all fields describe the same
    # instant. Querying `datetime.now()` separately for each field may
    # produce an inconsistent string if a second (or a day) rolls over
    # between two calls
    now = datetime.now()
    date = f'{now.year}-{now.month}-{now.day}'
    clock = f'{now.hour}-{now.minute}-{now.second}'
    return f'{date}_{clock}'


def dated_dir(root, create=False):
    """Build the path of a sub-directory of `root` named after the
    current date and time (see `date_time_string`).

    :param root: str
        Parent directory of the dated directory
    :param create: bool
        If True, the directory is created on disk when it does not
        already exist
    :return: str
        Path to the dated directory
    """
    path = os.path.join(root, date_time_string())

    # Nothing to touch on disk unless creation was requested
    if not create:
        return path

    if not os.path.exists(path):
        os.makedirs(path, exist_ok=True)

    return path


# TODO: remove this for deployment!
def host_data_root():
    """Look up the dataset root directory associated with the machine
    this code is running on, based on its host name.

    :return: str
        Path to the data root directory known for the current host
    :raises NotImplementedError: if the host name is not one of the
        known machines
    """
    # Machines identified by their exact host name
    known_roots = {
        'DEL2001W017': '/media/drobert-admin/DATA2/datasets',
        'HP-2010S002': '/var/data/drobert/datasets',
        '9c81b1a54ad8': '/raid/dataset/pointcloud/data'}

    hostname = socket.gethostname()

    if hostname in known_roots:
        return known_roots[hostname]

    # Machines identified by the suffix of their host name
    if hostname.endswith('sis.cnes.fr'):
        return '/home/qt/robertda/scratch/datasets'

    raise NotImplementedError(
        f"Unknown host '{hostname}', cannot set DATA_ROOT")


def save_tensor(x, f, key, fp_dtype=torch.float):
    """Write a torch.Tensor as a dataset of an HDF5 file.

    :param x: torch.Tensor
    :param f: h5 file path or h5py.File or h5py.Group
        If a path is given, the file is opened in 'w' mode, which
        overwrites any existing file at that location
    :param key: str
        h5py.Dataset key under which to save the tensor
    :param fp_dtype: torch dtype
        Data type to which floating point tensors should be cast before
        saving
    :return:
    """
    if isinstance(f, (h5py.File, h5py.Group)):
        assert isinstance(x, torch.Tensor)

        # Convert to a numpy array (handled by `cast_numpyfy`) and store
        # it with its own dtype
        array = cast_numpyfy(x, fp_dtype=fp_dtype)
        f.create_dataset(key, data=array, dtype=array.dtype)
        return

    # `f` is a path: open the file and recurse on the h5py.File handle
    with h5py.File(f, 'w') as file:
        save_tensor(x, file, key, fp_dtype=fp_dtype)


def load_tensor(f, key=None, idx=None):
    """Read a torch.Tensor from an HDF5 file written with `save_tensor`.
    An optional index selects only some of the rows.

    :param f: h5 file path or h5py.File or h5py.Group or h5py.Dataset
    :param key: str
        h5py.Dataset key under which the tensor was saved. Must be
        provided if f is not already a h5py.Dataset object
    :param idx: int, list, numpy.ndarray, torch.Tensor
        Used to select only some rows of the dense tensor. Supports
        fancy indexing. If None or empty, all rows are returned. Note
        the whole dataset is read from disk before being indexed
    :return: torch.Tensor
        Non-floating point tensors are converted to int64
    """
    if isinstance(f, (h5py.File, h5py.Group, h5py.Dataset)):
        dataset = f if isinstance(f, h5py.Dataset) else f[key]

        idx = tensor_idx(idx)

        # Read the full dataset, then keep only the requested rows
        x = torch.from_numpy(dataset[:])
        if idx is not None and idx.shape[0] != 0:
            x = x[idx]

        # By default, convert int16 and int32 to int64, might cause
        # issues for tensor indexing otherwise
        if not x.is_floating_point():
            x = x.long()

        return x

    # `f` is a path: open the file for reading and recurse on the handle
    with h5py.File(f, 'r') as file:
        return load_tensor(file, key=key, idx=idx)


def save_tensor_dict(d, f, key, fp_dtype=torch.float):
    """Write a dictionary of torch.Tensors as a group of an HDF5 file.
    Each tensor is stored as a dataset of the group, named after its
    dictionary key. Values which are not torch.Tensors are ignored.

    :param d: dictionary of torch.Tensors
    :param f: h5 file path or h5py.File or h5py.Group
        If a path is given, the file is opened in 'w' mode, which
        overwrites any existing file at that location
    :param key: str
        h5py.Group key under which to save the tensor dictionary
    :param fp_dtype: torch dtype
        Data type to which floating point tensors should be cast before
        saving
    :return:
    """
    if not isinstance(f, (h5py.File, h5py.Group)):
        # `f` is a path: open the file and recurse on the handle
        with h5py.File(f, 'w') as file:
            save_tensor_dict(d, file, key, fp_dtype=fp_dtype)
        return

    group = f.create_group(key)

    # Only the tensor-valued items are written to the group
    tensor_items = (
        (name, value) for name, value in d.items()
        if isinstance(value, torch.Tensor))
    for name, tensor in tensor_items:
        save_tensor(tensor, group, name, fp_dtype=fp_dtype)


def load_tensor_dict(f, idx=None):
    """Load a dictionary of torch.Tensor from an HDF5 file. Each
    dataset found directly under `f` is read with `load_tensor` and
    stored in the output dictionary under its dataset key.

    :param f: h5 file path or h5py.File or h5py.Group
    :param idx: int, list, numpy.ndarray, torch.Tensor
        Used to select and read only some rows of each tensor.
        Supports fancy indexing
    :return: dict
        Dictionary of torch.Tensors, indexed by dataset key
    """
    if not isinstance(f, (h5py.File, h5py.Group)):
        # `f` is a path. The file must be opened in read-only mode:
        # 'w' would truncate the very file we are trying to read. The
        # loaded dictionary must also be returned to the caller, and
        # `idx` forwarded to the recursive call
        with h5py.File(f, 'r') as file:
            return load_tensor_dict(file, idx=idx)

    return {k: load_tensor(f[k], key=None, idx=idx) for k in f.keys()}


def save_dense_to_csr(x, f, fp_dtype=torch.float):
    """Compress a 2D tensor to CSR format and write it to an HDF5 file.
    The 'pointers', 'columns', 'values' and 'shape' datasets are
    created directly under `f`.

    :param x: 2D torch.Tensor
    :param f: h5 file path or h5py.File or h5py.Group
        If a path is given, the file is opened in 'w' mode, which
        overwrites any existing file at that location
    :param fp_dtype: torch dtype
        Data type to which floating point tensors should be cast before
        saving
    :return:
    """
    if isinstance(f, (h5py.File, h5py.Group)):
        assert isinstance(x, torch.Tensor) and x.dim() == 2

        pointers, columns, values = dense_to_csr(x)
        csr_parts = (
            ('pointers', pointers),
            ('columns', columns),
            ('values', values))
        for name, tensor in csr_parts:
            save_tensor(tensor, f, name, fp_dtype=fp_dtype)

        # The dense shape is needed to rebuild the tensor at loading time
        f.create_dataset('shape', data=np.array(x.shape))
        return

    # `f` is a path: open the file and recurse on the h5py.File handle
    with h5py.File(f, 'w') as file:
        save_dense_to_csr(x, file, fp_dtype=fp_dtype)


def load_csr_to_dense(f, idx=None, verbose=False):
    """Read an HDF5 file or group produced using `save_dense_to_csr` and
    return the dense tensor. An optional idx can be passed to only
    recover the corresponding rows of the dense tensor.

    :param f: h5 file path or h5py.File or h5py.Group
        Must hold the 'pointers', 'columns', 'values' and 'shape'
        datasets
    :param idx: int, list, numpy.ndarray, torch.Tensor
        Used to select and read only some rows of the dense tensor.
        Supports fancy indexing. If None or empty, the whole tensor is
        returned
    :param verbose: bool
        If True, the time spent in each step is printed
    :return: torch.Tensor
        Dense tensor built by `csr_to_dense`
    """
    KEYS = ['pointers', 'columns', 'values', 'shape']

    if not isinstance(f, (h5py.File, h5py.Group)):
        with h5py.File(f, 'r') as file:
            out = load_csr_to_dense(file, idx=idx, verbose=verbose)
        return out

    assert all(k in f.keys() for k in KEYS)

    idx = tensor_idx(idx)

    if idx is None or idx.shape[0] == 0:
        start = time()
        pointers = load_tensor(f['pointers'])
        columns = load_tensor(f['columns'])
        values = load_tensor(f['values'])
        shape = load_tensor(f['shape'])
        if verbose:
            print(f'load_csr_to_dense read all      : {time() - start:0.5f}s')
        start = time()
        out = csr_to_dense(pointers, columns, values, shape=shape)
        if verbose:
            print(f'load_csr_to_dense csr_to_dense  : {time() - start:0.5f}s')
        return out

    # Read only pointers start and end indices based on idx
    start = time()
    ptr_start = load_tensor(f['pointers'], idx=idx)
    ptr_end = load_tensor(f['pointers'], idx=idx + 1)
    if verbose:
        print(f'load_csr_to_dense read ptr      : {time() - start:0.5f}s')

    # Create the new pointers, expressed relatively to the selected rows
    start = time()
    pointers = torch.cat([
        torch.zeros(1, dtype=ptr_start.dtype),
        torch.cumsum(ptr_end - ptr_start, 0)])
    if verbose:
        print(f'load_csr_to_dense pointers      : {time() - start:0.5f}s')

    # Create the indexing tensor to select and order values.
    # Simply, we could have used a list of slices but we want to
    # avoid for loops and list concatenations to benefit from torch
    # capabilities. For each selected row, this produces the range
    # [ptr_start, ptr_end) of positions in the stored columns and values
    start = time()
    sizes = pointers[1:] - pointers[:-1]
    val_idx = torch.arange(pointers[-1])
    val_idx -= pointers[:-1].repeat_interleave(sizes)
    val_idx += ptr_start.repeat_interleave(sizes)
    if verbose:
        print(f'load_csr_to_dense val_idx       : {time() - start:0.5f}s')

    # Read the columns and values, now we have computed the val_idx.
    # Make sure to update the output shape too, since the rows have been
    # indexed
    start = time()
    if val_idx.shape[0] == 0:
        # All selected rows are empty. `load_tensor` interprets an empty
        # index as "read everything", which would return the columns
        # and values of the whole tensor. Explicitly build empty
        # tensors instead, slicing preserves the stored dtype
        columns = load_tensor(f['columns'])[:0]
        values = load_tensor(f['values'])[:0]
    else:
        columns = load_tensor(f['columns'], idx=val_idx)
        values = load_tensor(f['values'], idx=val_idx)
    shape = load_tensor(f['shape'])
    shape[0] = idx.shape[0]
    if verbose:
        print(f'load_csr_to_dense read values   : {time() - start:0.5f}s')

    start = time()
    out = csr_to_dense(pointers, columns, values, shape=shape)
    if verbose:
        print(f'load_csr_to_dense csr_to_dense  : {time() - start:0.5f}s')

    return out