File size: 9,231 Bytes
c0551d3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 | from dataclasses import dataclass
from typing import Optional
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
# Column names the preprocessors write their output into; incoming data frames
# are rejected if they already contain any of them.
RESERVED_COLUMNS = [
    "autotrain_text",
    "autotrain_label",
]
# Additional output column names checked (together with RESERVED_COLUMNS) by
# LLMPreprocessor.
LLM_RESERVED_COLUMNS = [
    "autotrain_prompt",
    "autotrain_context",
    "autotrain_response",
    "autotrain_prompt_start",
]
@dataclass
class TextBinaryClassificationPreprocessor:
    """Validate, split and upload a text/label data frame.

    The configured text and label columns are copied to the reserved names
    ``autotrain_text`` and ``autotrain_label`` (the originals are dropped),
    the frames are converted with ``Dataset.from_pandas`` and pushed as the
    ``train`` and ``validation`` splits of
    ``{username}/autotrain-data-{project_name}``.
    """

    train_data: pd.DataFrame
    text_column: str
    label_column: str
    username: str
    project_name: str
    token: str
    # When given, it is used as the validation split as-is and no random
    # split of train_data is performed.
    valid_data: Optional[pd.DataFrame] = None
    # Forwarded to train_test_split(test_size=...) when valid_data is None.
    test_size: Optional[float] = 0.2
    # Forwarded to train_test_split(random_state=...).
    seed: Optional[int] = 42

    def __post_init__(self):
        """Check that the required columns exist and no reserved name is used.

        Raises:
            ValueError: if ``text_column`` or ``label_column`` is missing from
                ``train_data`` (or from ``valid_data`` when provided), or if
                either frame already contains a name from ``RESERVED_COLUMNS``.
        """
        frames = [("train", self.train_data)]
        if self.valid_data is not None:
            frames.append(("valid", self.valid_data))

        # Required columns: all train checks first, then all valid checks.
        for split_name, frame in frames:
            for column in (self.text_column, self.label_column):
                if column not in frame.columns:
                    raise ValueError(f"{column} not in {split_name} data")

        # Reserved names would be overwritten by prepare_columns, so reject them.
        for column in RESERVED_COLUMNS:
            for _, frame in frames:
                if column in frame.columns:
                    raise ValueError(f"{column} is a reserved column name")

    def split(self):
        """Return ``(train_df, valid_df)``.

        If ``valid_data`` was supplied the two original frames are returned
        unchanged. Otherwise ``train_data`` is split with ``train_test_split``,
        stratified on the label column, and both parts get a fresh index.
        """
        if self.valid_data is not None:
            return self.train_data, self.valid_data

        train_df, valid_df = train_test_split(
            self.train_data,
            test_size=self.test_size,
            random_state=self.seed,
            stratify=self.train_data[self.label_column],
        )
        train_df = train_df.reset_index(drop=True)
        valid_df = valid_df.reset_index(drop=True)
        return train_df, valid_df

    def prepare_columns(self, train_df, valid_df):
        """Move the text/label columns to their reserved names.

        New frames are returned; the frames passed in are left untouched.
        This matters because ``split`` hands back ``self.train_data`` /
        ``self.valid_data`` themselves when a validation frame was supplied.

        Returns:
            ``(train_df, valid_df)`` where the original text and label columns
            are removed and ``autotrain_text`` / ``autotrain_label`` are
            appended after the remaining columns.
        """
        prepared = []
        for frame in (train_df, valid_df):
            # assign() returns a copy with the new columns appended.
            frame = frame.assign(
                autotrain_text=frame[self.text_column],
                autotrain_label=frame[self.label_column],
            )
            prepared.append(frame.drop(columns=[self.text_column, self.label_column]))
        train_df, valid_df = prepared
        return train_df, valid_df

    def prepare(self):
        """Split, rename, convert and upload the data.

        Returns:
            The two objects produced by ``Dataset.from_pandas`` for the train
            and validation frames, after each has been pushed with
            ``push_to_hub`` (``private=True``) as ``train`` / ``validation``.
        """
        train_df, valid_df = self.split()
        train_df, valid_df = self.prepare_columns(train_df, valid_df)
        train_df = Dataset.from_pandas(train_df)
        valid_df = Dataset.from_pandas(valid_df)

        repo_id = f"{self.username}/autotrain-data-{self.project_name}"
        train_df.push_to_hub(
            repo_id,
            split="train",
            private=True,
            token=self.token,
        )
        valid_df.push_to_hub(
            repo_id,
            split="validation",
            private=True,
            token=self.token,
        )
        return train_df, valid_df
class TextMultiClassClassificationPreprocessor(TextBinaryClassificationPreprocessor):
    """Preprocessor variant that adds nothing to its parent.

    Every field and method is inherited unchanged from
    TextBinaryClassificationPreprocessor.
    """
class TextSingleColumnRegressionPreprocessor(TextBinaryClassificationPreprocessor):
    """Variant of the parent preprocessor whose random split is not stratified."""

    def split(self):
        """Return ``(train_df, valid_df)``.

        A user-supplied ``valid_data`` is returned together with ``train_data``
        unchanged. Otherwise ``train_data`` is split with ``train_test_split``
        (no ``stratify`` argument is passed) and both parts are re-indexed
        from zero.
        """
        if self.valid_data is None:
            parts = train_test_split(
                self.train_data,
                test_size=self.test_size,
                random_state=self.seed,
            )
            train_part, valid_part = (part.reset_index(drop=True) for part in parts)
            return train_part, valid_part
        return self.train_data, self.valid_data
@dataclass
class LLMPreprocessor:
    """Validate, split and upload a data frame of LLM training text.

    Two input layouts are accepted: a single ``text_column``, or a
    ``prompt_column`` plus ``response_column`` (optionally with
    ``context_column`` and ``prompt_start_column``). The selected columns are
    moved to reserved ``autotrain_*`` names and the frames are pushed as the
    ``train`` and ``validation`` splits of
    ``{username}/autotrain-data-{project_name}``.
    """

    train_data: pd.DataFrame
    username: str
    project_name: str
    token: str
    # When given, it is used as the validation split as-is and no random
    # split of train_data is performed.
    valid_data: Optional[pd.DataFrame] = None
    # Forwarded to train_test_split(test_size=...) when valid_data is None.
    test_size: Optional[float] = 0.2
    # Forwarded to train_test_split(random_state=...).
    seed: Optional[int] = 42
    # Only used in prompt/response mode; ignored when text_column is set.
    context_column: Optional[str] = None
    prompt_start_column: Optional[str] = None
    text_column: Optional[str] = None
    prompt_column: Optional[str] = None
    response_column: Optional[str] = None

    def __post_init__(self):
        """Resolve the column mode and validate the data frames.

        When ``text_column`` is given, ``prompt_column`` and
        ``response_column`` are both set to it.

        Raises:
            ValueError: if ``text_column`` is combined with a prompt/response
                column, if neither ``text_column`` nor both of
                ``prompt_column`` and ``response_column`` are given, if a
                configured column is missing from ``train_data`` (or from
                ``valid_data`` when provided), or if a frame already contains
                a reserved column name.
        """
        either_or = "Please provide either text_column or prompt_column and response_column"
        # user can either provide text_column or prompt_column and response_column
        if self.text_column is not None and (self.prompt_column is not None or self.response_column is not None):
            raise ValueError(either_or)

        if self.text_column is not None:
            # if text_column is provided, use it for prompt_column and response_column
            self.prompt_column = self.text_column
            self.response_column = self.text_column
            # prepare_columns ignores the optional columns in text mode, so
            # they are not required to exist.
            optional_columns = []
        else:
            # Without this check the lookup below would report the unhelpful
            # "None not in train data".
            if self.prompt_column is None or self.response_column is None:
                raise ValueError(either_or)
            optional_columns = [
                column for column in (self.context_column, self.prompt_start_column) if column is not None
            ]

        frames = [("train", self.train_data)]
        if self.valid_data is not None:
            frames.append(("valid", self.valid_data))

        # Fail early on missing columns instead of a KeyError in prepare_columns.
        required_columns = [self.prompt_column, self.response_column, *optional_columns]
        for split_name, frame in frames:
            for column in required_columns:
                if column not in frame.columns:
                    raise ValueError(f"{column} not in {split_name} data")

        # make sure no reserved columns are in train_data or valid_data
        for column in RESERVED_COLUMNS + LLM_RESERVED_COLUMNS:
            for _, frame in frames:
                if column in frame.columns:
                    raise ValueError(f"{column} is a reserved column name")

    def split(self):
        """Return ``(train_df, valid_df)``.

        If ``valid_data`` was supplied the two original frames are returned
        unchanged. Otherwise ``train_data`` is split with ``train_test_split``
        (no stratification) and both parts get a fresh index.
        """
        if self.valid_data is not None:
            return self.train_data, self.valid_data

        train_df, valid_df = train_test_split(
            self.train_data,
            test_size=self.test_size,
            random_state=self.seed,
        )
        train_df = train_df.reset_index(drop=True)
        valid_df = valid_df.reset_index(drop=True)
        return train_df, valid_df

    @staticmethod
    def _move_columns(frame, mapping):
        """Return a copy of ``frame`` with each source column moved to a new name.

        ``mapping`` maps new column name -> existing column name. New columns
        are appended after the existing ones, then the sources are dropped.
        """
        frame = frame.assign(**{new: frame[old] for new, old in mapping.items()})
        return frame.drop(columns=list(mapping.values()))

    def prepare_columns(self, train_df, valid_df):
        """Move the configured columns to their reserved ``autotrain_*`` names.

        New frames are returned; the frames passed in are left untouched.
        This matters because ``split`` hands back ``self.train_data`` /
        ``self.valid_data`` themselves when a validation frame was supplied.

        In text mode only ``autotrain_text`` is produced. Otherwise
        ``autotrain_prompt`` and ``autotrain_response`` are produced, followed
        by ``autotrain_context`` and ``autotrain_prompt_start`` when the
        corresponding column is configured.

        Returns:
            ``(train_df, valid_df)`` with the source columns removed.
        """
        if self.text_column is not None:
            steps = [{"autotrain_text": self.text_column}]
        else:
            steps = [
                {
                    "autotrain_prompt": self.prompt_column,
                    "autotrain_response": self.response_column,
                }
            ]
            if self.context_column is not None:
                steps.append({"autotrain_context": self.context_column})
            if self.prompt_start_column is not None:
                steps.append({"autotrain_prompt_start": self.prompt_start_column})

        # Steps are applied one after another so the resulting column order is
        # stable: untouched columns first, then new columns in step order.
        for mapping in steps:
            train_df = self._move_columns(train_df, mapping)
            valid_df = self._move_columns(valid_df, mapping)
        return train_df, valid_df

    def prepare(self):
        """Split, rename, convert and upload the data.

        Returns:
            The two objects produced by ``Dataset.from_pandas`` for the train
            and validation frames, after each has been pushed with
            ``push_to_hub`` (``private=True``) as ``train`` / ``validation``.
        """
        train_df, valid_df = self.split()
        train_df, valid_df = self.prepare_columns(train_df, valid_df)
        train_df = Dataset.from_pandas(train_df)
        valid_df = Dataset.from_pandas(valid_df)

        repo_id = f"{self.username}/autotrain-data-{self.project_name}"
        train_df.push_to_hub(
            repo_id,
            split="train",
            private=True,
            token=self.token,
        )
        valid_df.push_to_hub(
            repo_id,
            split="validation",
            private=True,
            token=self.token,
        )
        return train_df, valid_df
|