"Utility functions for memory management"

from ..imports.torch import *
from ..core import *
from ..script import *
import functools, threading, time
from .pynvml_gate import *
from collections import namedtuple

#is_osx = platform.system() == "Darwin"
use_gpu = torch.cuda.is_available()

GPUMemory = namedtuple('GPUMemory', ['total', 'free', 'used'])

if use_gpu:
    pynvml = load_pynvml_env()

def preload_pytorch():
    torch.ones((1, 1)).cuda()

def b2mb(num):
    """ convert Bs to MBs and round down """
    return int(num/2**20)

def gpu_mem_get(id=None):
    "get total, used and free memory (in MBs) for gpu `id`. if `id` is not passed, currently selected torch device is used"
    if not use_gpu: return GPUMemory(0, 0, 0)
    if id is None: id = torch.cuda.current_device()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(id)
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        return GPUMemory(*(map(b2mb, [info.total, info.free, info.used])))
    except Exception:
        return GPUMemory(0, 0, 0)
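
# A usage sketch (hedged; assumes a CUDA-capable machine, numbers are illustrative):
#
#   mem = gpu_mem_get()                       # e.g. GPUMemory(total=8119, free=7304, used=815)
#   print(f"{mem.free}MB free of {mem.total}MB")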

def gpu_mem_get_all():
    "get total, used and free memory (in MBs) for each available gpu"
    if not use_gpu: return []
    return list(map(gpu_mem_get, range(pynvml.nvmlDeviceGetCount())))

def gpu_mem_get_free():
    "get free memory (in MBs) for the currently selected gpu id, w/o emptying the cache"
    return gpu_mem_get().free

def gpu_mem_get_free_no_cache():
    "get free memory (in MBs) for the currently selected gpu id, after emptying the cache"
    torch.cuda.empty_cache()
    return gpu_mem_get().free

def gpu_mem_get_used():
    "get used memory (in MBs) for the currently selected gpu id, w/o emptying the cache"
    return gpu_mem_get().used

def gpu_mem_get_used_fast(gpu_handle):
    "get used memory (in MBs) for the currently selected gpu id, w/o emptying the cache, and needing the `gpu_handle` arg"
    info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
    return b2mb(info.used)

def gpu_mem_get_used_no_cache():
    "get used memory (in MBs) for the currently selected gpu id, after emptying the cache"
    torch.cuda.empty_cache()
    return gpu_mem_get().used

def gpu_with_max_free_mem():
    "get [gpu_id, its_free_ram] for the first gpu with highest available RAM"
    mem_all = gpu_mem_get_all()
    if not len(mem_all): return None, 0
    free_all = np.array([x.free for x in mem_all])
    id = np.argmax(free_all)
    return id, free_all[id]
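
# A usage sketch (hedged): select the GPU with the most free memory before
# allocating on it; `torch.cuda.set_device` is standard pytorch.
#
#   gpu_id, free_mb = gpu_with_max_free_mem()
#   if gpu_id is not None: torch.cuda.set_device(int(gpu_id))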

class GPUMemTrace():
    "Trace allocated and peaked GPU memory usage (deltas)."
    def __init__(self, silent=False, ctx=None, on_exit_report=True):
        assert torch.cuda.is_available(), "pytorch CUDA is required"
        self.silent = silent # shortcut to turn off all reports from constructor
        self.ctx    = ctx    # default context note in report
        self.on_exit_report = on_exit_report # auto-report on ctx manager exit (default: True)
        self.start()

    def reset(self):
        self.used_start = gpu_mem_get_used_no_cache()
        self.used_peak  = self.used_start

    def data_set(self):
        # delta_used is the difference between current used mem and used mem at the start
        self.delta_used = gpu_mem_get_used_no_cache() - self.used_start

        # delta_peaked is the transient overhead, if any. It is calculated as follows:
        #
        # 1. measure the difference between the peak memory and the used memory
        #    at the start
        # 2a. if it's negative, delta_peaked is 0
        # 2b. otherwise, if delta_used is positive, it gets subtracted from delta_peaked
        # XXX: 2a shouldn't be needed once we have a reliable peak counter
        self.delta_peaked = self.used_peak - self.used_start
        if self.delta_peaked < 0: self.delta_peaked = 0
        elif self.delta_used > 0: self.delta_peaked -= self.delta_used
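        # worked example (illustrative numbers, all in MBs): used_start=1000,
        # used_peak=1500, current used=1200 -> delta_used=200 and
        # delta_peaked=500-200=300, i.e. 300MB of temporary memory was needed
        # beyond what stayed allocated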

    def data(self):
        if self.is_running: self.data_set()
        return self.delta_used, self.delta_peaked

    def start(self):
        self.is_running = True
        self.reset()
        self.peak_monitor_start()

    def stop(self):
        self.peak_monitor_stop()
        self.data_set()
        self.is_running = False

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *exc):
        self.stop()
        if self.on_exit_report: self.report('exit')

    def __del__(self):
        self.stop()

    def __repr__(self):
        delta_used, delta_peaked = self.data()
        return f"△Used Peaked MB: {delta_used:6,.0f} {delta_peaked:6,.0f}"

    def _get_ctx(self, subctx=None):
        "Return ' (ctx: subctx)' or ' (ctx)' or ' (subctx)' or '' depending on this and constructor arguments"
        l = []
        if self.ctx is not None:      l.append(self.ctx)
        if subctx is not None:        l.append(subctx)
        return '' if len(l) == 0 else f" ({': '.join(l)})"

    def silent(self, silent=True):
        # note: `__init__` sets an instance attribute with the same name, which
        # shadows this method; setting `self.silent` directly has the same effect
        self.silent = silent

    def report(self, subctx=None):
        "Print delta used+peaked, and an optional context note, which can also be preset in constructor"
        if self.silent: return
        print(f"{ self.__repr__() }{ self._get_ctx(subctx) }")

    def report_n_reset(self, subctx=None):
        "Print delta used+peaked, and an optional context note. Then reset counters"
        self.report(subctx)
        self.reset()

    def peak_monitor_start(self):
        self.peak_monitoring = True

        # continually sample GPU RAM usage
        peak_monitor_thread = threading.Thread(target=self.peak_monitor_func)
        peak_monitor_thread.daemon = True
        peak_monitor_thread.start()

    def peak_monitor_stop(self):
        self.peak_monitoring = False

    # XXX: this is unreliable, since there is no thread priority control and the
    # monitor may not get scheduled often enough, or may not run at all
    def peak_monitor_func(self):
        gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(torch.cuda.current_device())
        while True:
            self.used_peak = max(gpu_mem_get_used_fast(gpu_handle), self.used_peak)
            if not self.peak_monitoring: break
            time.sleep(0.001) # 1msec
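
# A usage sketch (hedged; assumes a CUDA-capable machine, the workload functions
# are hypothetical and the reported numbers illustrative):
#
#   with GPUMemTrace(ctx='fit') as mtrace:
#       run_training_epoch()
#   # on exit prints e.g.: △Used Peaked MB:    210    120 (fit: exit)
#
#   mtrace = GPUMemTrace(on_exit_report=False)
#   run_inference_batch()
#   delta_used, delta_peaked = mtrace.data()
#   mtrace.report('after inference')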

def gpu_mem_trace(func):
    "A decorator that runs `GPUMemTrace` w/ report on func"
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        with GPUMemTrace(ctx=func.__qualname__, on_exit_report=True):
            return func(*args, **kwargs)
    return wrapper
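
# A usage sketch (hedged; `train_one_epoch` is a hypothetical function, numbers
# illustrative):
#
#   @gpu_mem_trace
#   def train_one_epoch():
#       ...
#   train_one_epoch()   # prints e.g.: △Used Peaked MB:    180     90 (train_one_epoch: exit)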

def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # note: `.drop('index')` was removed from the line below during debugging
    columns = df.columns

    for col in columns:
        col_type = df[col].dtype
        if str(col_type) != 'category' and col_type != 'datetime64[ns]' and col_type != bool:
            if col_type != object:
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)
                else:
                    # float16 downcasting was removed, as it sometimes caused an error:
                    #if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    #    df[col] = df[col].astype(np.float16)
                    if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        print('Column ' + col + ' does not fit in float32; leaving it as float64.')
            else:
                df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df
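
# A usage sketch (hedged; illustrative data). `pd`/`np` are assumed to be
# provided by the wildcard imports above (pandas/numpy):
#
#   df = pd.DataFrame({'a': np.arange(1000), 'b': np.random.rand(1000)})
#   df = reduce_mem_usage(df)    # 'a' -> int16, 'b' -> float32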