File size: 2,652 Bytes
c4ac745
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from typing import Optional

import pandas as pd
from tqdm import tqdm


class DataTransformer:
    """Learn per-column preprocessing rules from a DataFrame and re-apply them.

    ``fit`` inspects every column and stores a small description dict for it in
    ``self.columns``; ``transform`` uses those descriptions to convert a frame
    with the same columns. The learned state is made of plain dicts/lists and
    can be exported with ``to_dict`` and restored with ``from_dict``.

    Description dict keys used by ``transform``:

    * ``"type"``: ``"id"`` or ``"datetime"``; absent for ordinary columns.
    * ``"is_int"`` (id columns only): cast the column to nullable ``Int64``.
    * ``"min"`` (datetime columns only): string form of the smallest timestamp
      seen in ``fit``; values become seconds elapsed since it.
    * ``"fillna"``: replacement for missing values, present only when the
      fitted column contained missing values.
    """

    def __init__(self):
        # Column name -> description dict (see class docstring).
        self.columns = {}
        # Column names in the order they appeared in the fitted frame.
        self.col_order = []

    @staticmethod
    def _is_integer_like(col: pd.Series) -> bool:
        """Return True if *col* is an integer dtype or numeric and near-integral."""
        if pd.api.types.is_integer_dtype(col.dtype):
            return True
        if not pd.api.types.is_numeric_dtype(col.dtype):
            return False
        deviation = (col - col.round()).abs().mean()
        # mean() is NaN/NA when the column has no values; treat that as
        # "not integer" and always hand back a plain Python bool.
        return bool(pd.notna(deviation) and deviation < 1e-6)

    def fit(self, data: pd.DataFrame, id_cols: Optional[list] = None, ref_cols: Optional[dict] = None,
            name: Optional[str] = None):
        """Learn a description for every column of *data*.

        Args:
            data: Frame to inspect. It is not modified.
            id_cols: Names of columns to treat as identifiers.
            ref_cols: Mapping of column name to a ready-made description dict
                that is stored as-is; takes precedence over ``id_cols``.
            name: Optional table name, only shown in the progress-bar label.
        """
        id_cols = [] if id_cols is None else id_cols
        ref_cols = {} if ref_cols is None else ref_cols

        # Start from a clean slate so a refit does not keep stale columns and
        # does not mutate a dict that was handed in through from_dict().
        self.columns = {}
        self.col_order = data.columns.tolist()
        for c in tqdm(data.columns, desc=f"Table {name if name is not None else ''} columns:"):
            col = data[c]
            if c in ref_cols:
                self.columns[c] = ref_cols[c]
            elif c in id_cols:
                self.columns[c] = {
                    "type": "id",
                    "is_int": self._is_integer_like(col),
                }
            else:
                col_descr = {}
                # Datetime heuristic: more than 5 distinct values, no non-null
                # value parses as a number, and every non-null value parses as
                # a datetime.
                if (col.nunique() > 5 and pd.to_numeric(col.dropna(), errors="coerce").isna().all()
                        and not pd.to_datetime(col.dropna(), errors="coerce").isna().any()):
                    col = pd.to_datetime(col, errors="coerce")
                    col_descr["type"] = "datetime"
                    min_col = col.min()
                    col_descr["min"] = str(min_col)
                    # From here on work with seconds since the earliest value,
                    # so the missing-value handling below sees a numeric column.
                    col = (col - min_col).dt.total_seconds()
                if col.isna().any():
                    if pd.api.types.is_numeric_dtype(col.dtype):
                        # One below the observed minimum marks "missing".
                        fill_value = col.min() - 1
                        # Store a plain Python scalar rather than a numpy one
                        # so the exported description stays serialisable.
                        if hasattr(fill_value, "item"):
                            fill_value = fill_value.item()
                        col_descr["fillna"] = fill_value
                    else:
                        col_descr["fillna"] = "<special-for-null>"
                self.columns[c] = col_descr

    def transform(self, data: pd.DataFrame) -> pd.DataFrame:
        """Apply the learned descriptions to *data* and return a new frame.

        The input is copied, not modified. The result contains exactly the
        fitted columns, in fitted order.

        Raises:
            KeyError: If a fitted column is missing from *data*.
        """
        data = data.copy()
        for c in self.col_order:
            col_descr = self.columns[c]
            col_type = col_descr.get("type", "normal")
            col = data[c]
            if col_type == "id":
                if col_descr["is_int"]:
                    # fit() accepts floats that are integral only up to a small
                    # tolerance, but the Int64 cast requires exact integers;
                    # round first so fit() and transform() agree.
                    if pd.api.types.is_float_dtype(col.dtype):
                        col = col.round()
                    col = col.astype("Int64")
            elif col_type == "datetime":
                col = pd.to_datetime(col, errors="coerce")
                col = (col - pd.to_datetime(col_descr["min"])).dt.total_seconds()
            if "fillna" in col_descr:
                col = col.fillna(col_descr["fillna"])
            data[c] = col
        return data[self.col_order]

    def to_dict(self):
        """Return the learned state as ``{"columns": ..., "order": ...}``."""
        return {
            "columns": self.columns,
            "order": self.col_order
        }

    @classmethod
    def from_dict(cls, data):
        """Build a transformer from a dict produced by ``to_dict``."""
        transformer = cls()
        transformer.columns = data["columns"]
        transformer.col_order = data["order"]
        return transformer