Spaces:
Running
Running
| import pandas as pd | |
| from tqdm import tqdm | |
| class DataTransformer: | |
| def __init__(self): | |
| self.columns = {} | |
| self.col_order = [] | |
| def fit(self, data: pd.DataFrame, id_cols: list = [], ref_cols: dict = {}, name: str = None): | |
| self.col_order = data.columns.tolist() | |
| for c in tqdm(data.columns, desc=f"Table {name if name is not None else ''} columns:"): | |
| col = data[c] | |
| if c in ref_cols: | |
| self.columns[c] = ref_cols[c] | |
| elif c in id_cols: | |
| self.columns[c] = { | |
| "type": "id", | |
| "is_int": pd.api.types.is_integer_dtype(col.dtype) or (pd.api.types.is_numeric_dtype(col.dtype) | |
| and (col - col.round()).abs().mean() < 1e-6)} | |
| else: | |
| col_descr = {} | |
| if (col.nunique() > 5 and pd.to_numeric(col.dropna(), errors="coerce").isna().all() | |
| and not pd.to_datetime(col.dropna(), errors="coerce").isna().any()): | |
| col = pd.to_datetime(col, errors="coerce") | |
| col_descr["type"] = "datetime" | |
| min_col = col.min() | |
| col_descr["min"] = str(min_col) | |
| col = (col - min_col).dt.total_seconds() | |
| if col.isna().any(): | |
| if pd.api.types.is_numeric_dtype(col.dtype): | |
| col_descr["fillna"] = col.min() - 1 | |
| else: | |
| col_descr["fillna"] = "<special-for-null>" | |
| self.columns[c] = col_descr | |
| def transform(self, data: pd.DataFrame) -> pd.DataFrame: | |
| data = data.copy() | |
| for c in self.col_order: | |
| col_descr = self.columns[c] | |
| col_type = col_descr.get("type", "normal") | |
| col = data[c] | |
| if col_type == "id": | |
| if col_descr["is_int"]: | |
| col = col.astype("Int64") | |
| elif col_type == "datetime": | |
| col = pd.to_datetime(col, errors="coerce") | |
| col = (col - pd.to_datetime(col_descr["min"])).dt.total_seconds() | |
| if "fillna" in col_descr: | |
| col = col.fillna(col_descr["fillna"]) | |
| data[c] = col | |
| return data[self.col_order] | |
| def to_dict(self): | |
| return { | |
| "columns": self.columns, | |
| "order": self.col_order | |
| } | |
| def from_dict(cls, data): | |
| transformer = cls() | |
| transformer.columns = data["columns"] | |
| transformer.col_order = data["order"] | |
| return transformer | |