| | from dataclasses import dataclass |
| | from typing import Optional |
| |
|
| | import pandas as pd |
| | from datasets import Dataset |
| | from sklearn.model_selection import train_test_split |
| |
|
| |
|
# Column names that prepare_columns() creates on the output frames; __post_init__
# rejects input data that already uses either of them.
RESERVED_COLUMNS = ["autotrain_id", "autotrain_label"]
| |
|
| |
|
@dataclass
class TabularBinaryClassificationPreprocessor:
    """Validate, split and upload tabular data for a classification project.

    Attributes:
        train_data: Training frame; must contain ``label_column`` (and
            ``id_column`` when one is given).
        label_column: Name of the target column.
        username: Hub namespace used to build the dataset repo id.
        project_name: Project name used to build the dataset repo id.
        id_column: Optional name of a row-identifier column. When ``None``,
            sequential integer ids are generated in ``prepare_columns``.
        valid_data: Optional validation frame. When ``None``, a validation
            set is carved out of ``train_data`` by ``split``.
        test_size: Passed to ``train_test_split`` as ``test_size``.
        seed: Passed to ``train_test_split`` as ``random_state``.

    Raises:
        ValueError: From ``__post_init__`` when a required column is missing
            or a reserved column name is already present in the data.
    """

    train_data: pd.DataFrame
    label_column: str
    username: str
    project_name: str
    id_column: Optional[str] = None
    valid_data: Optional[pd.DataFrame] = None
    test_size: Optional[float] = 0.2
    seed: Optional[int] = 42

    def __post_init__(self):
        """Check that the required columns exist and no reserved name is used."""
        # Train data is checked first, then valid data, so the error raised
        # for a given bad input is the same as before the refactor.
        self._require_columns(self.train_data, "train")
        if self.valid_data is not None:
            self._require_columns(self.valid_data, "valid")

        # Reserved names are written by prepare_columns(); a clash would
        # silently overwrite user data, so refuse it up front.
        for column in RESERVED_COLUMNS:
            if column in self.train_data.columns:
                raise ValueError(f"{column} is a reserved column name")
            if self.valid_data is not None and column in self.valid_data.columns:
                raise ValueError(f"{column} is a reserved column name")

    def _require_columns(self, df, where):
        """Raise ``ValueError`` if the id (when set) or label column is missing from ``df``."""
        if self.id_column is not None and self.id_column not in df.columns:
            raise ValueError(f"{self.id_column} not in {where} data")
        if self.label_column not in df.columns:
            raise ValueError(f"{self.label_column} not in {where} data")

    def _with_reserved_columns(self, df):
        """Return a new frame with the reserved columns added and their sources dropped."""
        if self.id_column is None:
            # No id column configured: fall back to positional row ids.
            ids = list(range(len(df)))
            source_columns = [self.label_column]
        else:
            ids = df[self.id_column]
            source_columns = [self.id_column, self.label_column]
        # assign() builds a new frame, so the caller's data is left untouched.
        renamed = df.assign(autotrain_id=ids, autotrain_label=df[self.label_column])
        return renamed.drop(columns=source_columns)

    def split(self):
        """Return ``(train_df, valid_df)``.

        When ``valid_data`` was supplied, both frames are returned as given.
        Otherwise ``train_data`` is split with ``train_test_split``,
        stratified on the label column, and both parts get a fresh index.
        """
        if self.valid_data is not None:
            return self.train_data, self.valid_data

        train_df, valid_df = train_test_split(
            self.train_data,
            test_size=self.test_size,
            random_state=self.seed,
            stratify=self.train_data[self.label_column],
        )
        return train_df.reset_index(drop=True), valid_df.reset_index(drop=True)

    def prepare_columns(self, train_df, valid_df):
        """Map the id and label columns to ``autotrain_id`` / ``autotrain_label``.

        The original id column (if any) and label column are dropped from the
        result. The input frames are not modified.

        Args:
            train_df: Training frame containing the label (and id) column.
            valid_df: Validation frame containing the label (and id) column.

        Returns:
            A ``(train_df, valid_df)`` tuple of new frames.
        """
        return self._with_reserved_columns(train_df), self._with_reserved_columns(valid_df)

    def prepare(self):
        """Split, rename columns, convert to ``Dataset`` and push both splits to the hub.

        Both splits are pushed privately to
        ``{username}/autotrain-data-{project_name}`` as ``train`` and
        ``validation``.

        Returns:
            A ``(train, validation)`` tuple of ``Dataset`` objects.
        """
        train_df, valid_df = self.split()
        train_df, valid_df = self.prepare_columns(train_df, valid_df)
        train_ds = Dataset.from_pandas(train_df)
        valid_ds = Dataset.from_pandas(valid_df)
        repo_id = f"{self.username}/autotrain-data-{self.project_name}"
        train_ds.push_to_hub(repo_id, split="train", private=True)
        valid_ds.push_to_hub(repo_id, split="validation", private=True)
        return train_ds, valid_ds
| |
|
| |
|
class TabularMultiClassClassificationPreprocessor(TabularBinaryClassificationPreprocessor):
    """Multi-class preprocessor; inherits every field and method of the binary one unchanged."""
| |
|
| |
|
class TabularSingleColumnRegressionPreprocessor(TabularBinaryClassificationPreprocessor):
    """Regression preprocessor: same pipeline as the parent, with a non-stratified split."""

    def split(self):
        """Return ``(train_df, valid_df)``.

        A supplied ``valid_data`` is returned together with ``train_data`` as
        given. Otherwise ``train_data`` is split with ``train_test_split``
        (no ``stratify`` argument) and both parts get a fresh index.
        """
        if self.valid_data is None:
            parts = train_test_split(
                self.train_data,
                test_size=self.test_size,
                random_state=self.seed,
            )
            fit_part, holdout_part = (part.reset_index(drop=True) for part in parts)
            return fit_part, holdout_part
        return self.train_data, self.valid_data
| |
|