File size: 9,337 Bytes
e9f9fd3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
"Data loading pipeline for structured data support. Loads from pandas DataFrame"
from ..torch_core import *
from .transform import *
from ..basic_data import *
from ..data_block import *
from ..basic_train import *
from .models import *
from pandas.api.types import is_numeric_dtype, is_categorical_dtype

__all__ = ['TabularDataBunch', 'TabularLine', 'TabularList', 'TabularProcessor', 'tabular_learner']

OptTabTfms = Optional[Collection[TabularProc]]

#def emb_sz_rule(n_cat:int)->int: return min(50, (n_cat//2)+1)
def emb_sz_rule(n_cat:int)->int: return min(600, round(1.6 * n_cat**0.56))

def def_emb_sz(classes, n, sz_dict=None):
    "Pick an embedding size for `n` depending on `classes` if not given in `sz_dict`."
    sz_dict = ifnone(sz_dict, {})
    n_cat = len(classes[n])
    sz = sz_dict.get(n, int(emb_sz_rule(n_cat)))  # rule of thumb
    return n_cat,sz

class TabularLine(ItemBase):
    "Basic item for tabular data."
    def __init__(self, cats, conts, classes, names):
        self.cats,self.conts,self.classes,self.names = cats,conts,classes,names
        self.data = [tensor(cats), tensor(conts)]

    def __str__(self):
        res = ''
        for c, n in zip(self.cats, self.names[:len(self.cats)]):
            res += f"{n} {(self.classes[n][c])}; "
        for c,n in zip(self.conts, self.names[len(self.cats):]):
            res += f'{n} {c:.4f}; '
        return res

class TabularProcessor(PreProcessor):
    "Regroup the `procs` in one `PreProcessor`."
    def __init__(self, ds:ItemBase=None, procs=None):
        procs = ifnone(procs, ds.procs if ds is not None else None)
        self.procs = listify(procs)

    def process_one(self, item):
        df = pd.DataFrame([item,item])
        for proc in self.procs: proc(df, test=True)
        if len(self.cat_names) != 0:
            codes = np.stack([c.cat.codes.values for n,c in df[self.cat_names].items()], 1).astype(np.int64) + 1
        else: codes = [[]]
        if len(self.cont_names) != 0:
            conts = np.stack([c.astype('float32').values for n,c in df[self.cont_names].items()], 1)
        else: conts = [[]]
        classes = None
        col_names = list(df[self.cat_names].columns.values) + list(df[self.cont_names].columns.values)
        return TabularLine(codes[0], conts[0], classes, col_names)

    def process(self, ds):
        if ds.inner_df is None:
            ds.classes,ds.cat_names,ds.cont_names = self.classes,self.cat_names,self.cont_names
            ds.col_names = self.cat_names + self.cont_names
            ds.preprocessed = True
            return
        for i,proc in enumerate(self.procs):
            if isinstance(proc, TabularProc): proc(ds.inner_df, test=True)
            else:
                #cat and cont names may have been changed by transform (like Fill_NA)
                proc = proc(ds.cat_names, ds.cont_names)
                proc(ds.inner_df)
                ds.cat_names,ds.cont_names = proc.cat_names,proc.cont_names
                self.procs[i] = proc
        self.cat_names,self.cont_names = ds.cat_names,ds.cont_names
        if len(ds.cat_names) != 0:
            ds.codes = np.stack([c.cat.codes.values for n,c in ds.inner_df[ds.cat_names].items()], 1).astype(np.int64) + 1
            self.classes = ds.classes = OrderedDict({n:np.concatenate([['#na#'],c.cat.categories.values])
                                      for n,c in ds.inner_df[ds.cat_names].items()})
            cat_cols = list(ds.inner_df[ds.cat_names].columns.values)
        else: ds.codes,ds.classes,self.classes,cat_cols = None,None,None,[]
        if len(ds.cont_names) != 0:
            ds.conts = np.stack([c.astype('float32').values for n,c in ds.inner_df[ds.cont_names].items()], 1)
            cont_cols = list(ds.inner_df[ds.cont_names].columns.values)
        else: ds.conts,cont_cols = None,[]
        ds.col_names = cat_cols + cont_cols
        ds.preprocessed = True

class TabularDataBunch(DataBunch):
    "Create a `DataBunch` suitable for tabular data."
    @classmethod
    def from_df(cls, path, df:DataFrame, dep_var:str, valid_idx:Collection[int], procs:OptTabTfms=None,
                cat_names:OptStrList=None, cont_names:OptStrList=None, classes:Collection=None, 
                test_df=None, bs:int=64, val_bs:int=None, num_workers:int=defaults.cpus, dl_tfms:Optional[Collection[Callable]]=None, 
                device:torch.device=None, collate_fn:Callable=data_collate, no_check:bool=False)->DataBunch:
        "Create a `DataBunch` from `df` and `valid_idx` with `dep_var`. `kwargs` are passed to `DataBunch.create`."
        cat_names = ifnone(cat_names, []).copy()
        cont_names = ifnone(cont_names, list(set(df)-set(cat_names)-{dep_var}))
        procs = listify(procs)
        src = (TabularList.from_df(df, path=path, cat_names=cat_names, cont_names=cont_names, procs=procs)
                           .split_by_idx(valid_idx))
        src = src.label_from_df(cols=dep_var) if classes is None else src.label_from_df(cols=dep_var, classes=classes)
        if test_df is not None: src.add_test(TabularList.from_df(test_df, cat_names=cat_names, cont_names=cont_names,
                                                                 processor = src.train.x.processor))
        return src.databunch(path=path, bs=bs, val_bs=val_bs, num_workers=num_workers, device=device, 
                             collate_fn=collate_fn, no_check=no_check)

class TabularList(ItemList):
    "Basic `ItemList` for tabular data."
    _item_cls=TabularLine
    _processor=TabularProcessor
    _bunch=TabularDataBunch
    def __init__(self, items:Iterator, cat_names:OptStrList=None, cont_names:OptStrList=None,
                 procs=None, **kwargs)->'TabularList':
        super().__init__(range_of(items), **kwargs)
        #dataframe is in inner_df, items is just a range of index
        if cat_names is None:  cat_names = []
        if cont_names is None: cont_names = []
        self.cat_names,self.cont_names,self.procs = cat_names,cont_names,procs
        self.copy_new += ['cat_names', 'cont_names', 'procs']
        self.preprocessed = False

    @classmethod
    def from_df(cls, df:DataFrame, cat_names:OptStrList=None, cont_names:OptStrList=None, procs=None, **kwargs)->'ItemList':
        "Get the list of inputs in the `col` of `path/csv_name`."
        return cls(items=range(len(df)), cat_names=cat_names, cont_names=cont_names, procs=procs, inner_df=df.copy(), **kwargs)

    def get(self, o):
        if not self.preprocessed: return self.inner_df.iloc[o] if hasattr(self, 'inner_df') else self.items[o]
        codes = [] if self.codes is None else self.codes[o]
        conts = [] if self.conts is None else self.conts[o]
        return self._item_cls(codes, conts, self.classes, self.col_names)

    def get_emb_szs(self, sz_dict=None):
        "Return the default embedding sizes suitable for this data or takes the ones in `sz_dict`."
        return [def_emb_sz(self.classes, n, sz_dict) for n in self.cat_names]

    def reconstruct(self, t:Tensor):
        return self._item_cls(t[0], t[1], self.classes, self.col_names)

    def show_xys(self, xs, ys)->None:
        "Show the `xs` (inputs) and `ys` (targets)."
        from IPython.display import display, HTML
        items,names = [], xs[0].names + ['target']
        for i, (x,y) in enumerate(zip(xs,ys)):
            res = []
            cats = x.cats if len(x.cats.size()) > 0 else []
            conts = x.conts if len(x.conts.size()) > 0 else []
            for c, n in zip(cats, x.names[:len(cats)]):
                res.append(x.classes[n][c])
            res += [f'{c:.4f}' for c in conts] + [y]
            items.append(res)
        items = np.array(items)
        df = pd.DataFrame({n:items[:,i] for i,n in enumerate(names)}, columns=names)
        with pd.option_context('display.max_colwidth', -1):
            display(HTML(df.to_html(index=False)))

    def show_xyzs(self, xs, ys, zs):
        "Show `xs` (inputs), `ys` (targets) and `zs` (predictions)."
        from IPython.display import display, HTML
        items,names = [], xs[0].names + ['target', 'prediction']
        for i, (x,y,z) in enumerate(zip(xs,ys,zs)):
            res = []
            cats = x.cats if len(x.cats.size()) > 0 else []
            conts = x.conts if len(x.conts.size()) > 0 else []
            for c, n in zip(cats, x.names[:len(cats)]):
                res.append(str(x.classes[n][c]))
            res += [f'{c:.4f}' for c in conts] + [y, z]
            items.append(res)
        items = np.array(items)
        df = pd.DataFrame({n:items[:,i] for i,n in enumerate(names)}, columns=names)
        with pd.option_context('display.max_colwidth', -1):
            display(HTML(df.to_html(index=False)))

def tabular_learner(data:DataBunch, layers:Collection[int], emb_szs:Dict[str,int]=None, metrics=None,
        ps:Collection[float]=None, emb_drop:float=0., y_range:OptRange=None, use_bn:bool=True, **learn_kwargs):
    "Get a `Learner` using `data`, with `metrics`, including a `TabularModel` created using the remaining params."
    emb_szs = data.get_emb_szs(ifnone(emb_szs, {}))
    model = TabularModel(emb_szs, len(data.cont_names), out_sz=data.c, layers=layers, ps=ps, emb_drop=emb_drop,
                         y_range=y_range, use_bn=use_bn)
    return Learner(data, model, metrics=metrics, **learn_kwargs)