# Hemanth-thunder's picture
# End of training
# c0551d3
from dataclasses import dataclass
from typing import Optional
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
# Output column names written by the preprocessors in this module. Input
# frames are rejected if they already contain any of them.
RESERVED_COLUMNS = [
    "autotrain_text",
    "autotrain_label",
]

# Extra output column names that only the LLM preprocessor checks for and
# writes.
LLM_RESERVED_COLUMNS = [
    "autotrain_prompt",
    "autotrain_context",
    "autotrain_response",
    "autotrain_prompt_start",
]
@dataclass
class TextBinaryClassificationPreprocessor:
    """Split a text/label DataFrame and push both splits to the Hub.

    The configured text and label columns are copied into the reserved
    ``autotrain_text`` / ``autotrain_label`` columns and the original columns
    are dropped. If no validation frame is supplied, one is carved out of the
    training data with a stratified ``train_test_split``.

    Raises:
        ValueError: at construction time, if the text or label column is
            missing from the training (or supplied validation) data, or if
            either frame already contains a reserved column name.
    """

    # Training data; must contain ``text_column`` and ``label_column``.
    train_data: pd.DataFrame
    # Name of the column that is copied into ``autotrain_text``.
    text_column: str
    # Name of the column that is copied into ``autotrain_label``.
    label_column: str
    # ``username`` and ``project_name`` form the dataset repo id
    # ``{username}/autotrain-data-{project_name}``.
    username: str
    project_name: str
    # Passed as ``token`` to ``Dataset.push_to_hub``.
    token: str
    # Optional validation data; when given, no split is performed.
    valid_data: Optional[pd.DataFrame] = None
    # Passed to ``train_test_split`` as ``test_size`` / ``random_state``.
    test_size: Optional[float] = 0.2
    seed: Optional[int] = 42

    def __post_init__(self):
        """Validate that the required columns exist and no reserved name is used."""
        # check if text_column and label_column are in train_data
        if self.text_column not in self.train_data.columns:
            raise ValueError(f"{self.text_column} not in train data")
        if self.label_column not in self.train_data.columns:
            raise ValueError(f"{self.label_column} not in train data")

        # check if text_column and label_column are in valid_data
        if self.valid_data is not None:
            if self.text_column not in self.valid_data.columns:
                raise ValueError(f"{self.text_column} not in valid data")
            if self.label_column not in self.valid_data.columns:
                raise ValueError(f"{self.label_column} not in valid data")

        # make sure no reserved columns are in train_data or valid_data
        for column in RESERVED_COLUMNS:
            if column in self.train_data.columns:
                raise ValueError(f"{column} is a reserved column name")
            if self.valid_data is not None and column in self.valid_data.columns:
                raise ValueError(f"{column} is a reserved column name")

    def split(self):
        """Return ``(train_df, valid_df)``.

        When ``valid_data`` was supplied, the stored frames are returned
        as-is. Otherwise ``train_data`` is split with ``train_test_split``,
        stratified on the label column, and both parts get a fresh index.
        """
        if self.valid_data is not None:
            return self.train_data, self.valid_data

        train_df, valid_df = train_test_split(
            self.train_data,
            test_size=self.test_size,
            random_state=self.seed,
            stratify=self.train_data[self.label_column],
        )
        train_df = train_df.reset_index(drop=True)
        valid_df = valid_df.reset_index(drop=True)
        return train_df, valid_df

    def _to_reserved_columns(self, df):
        """Return a new frame with the text/label columns moved to reserved names."""
        # ``assign`` works on a copy, so the frame handed in (which may be the
        # caller's own ``train_data`` / ``valid_data``) is left untouched.
        # The new columns are appended at the end, in this order.
        with_reserved = df.assign(
            autotrain_text=df[self.text_column],
            autotrain_label=df[self.label_column],
        )
        # drop text_column and label_column
        return with_reserved.drop(columns=[self.text_column, self.label_column])

    def prepare_columns(self, train_df, valid_df):
        """Copy text/label into the reserved columns and drop the originals.

        Args:
            train_df: training frame containing the text and label columns.
            valid_df: validation frame containing the text and label columns.

        Returns:
            A ``(train_df, valid_df)`` tuple of new DataFrames; the inputs are
            not modified.
        """
        return self._to_reserved_columns(train_df), self._to_reserved_columns(valid_df)

    def prepare(self):
        """Split, normalise the columns and push both splits to the Hub.

        Returns:
            A ``(train, validation)`` tuple of ``Dataset`` objects, after each
            has been pushed as a private split of
            ``{username}/autotrain-data-{project_name}``.
        """
        train_df, valid_df = self.split()
        train_df, valid_df = self.prepare_columns(train_df, valid_df)
        train_df = Dataset.from_pandas(train_df)
        valid_df = Dataset.from_pandas(valid_df)
        train_df.push_to_hub(
            f"{self.username}/autotrain-data-{self.project_name}",
            split="train",
            private=True,
            token=self.token,
        )
        valid_df.push_to_hub(
            f"{self.username}/autotrain-data-{self.project_name}",
            split="validation",
            private=True,
            token=self.token,
        )
        return train_df, valid_df
class TextMultiClassClassificationPreprocessor(TextBinaryClassificationPreprocessor):
    """Multi-class preprocessor; inherits everything from the binary one unchanged."""
class TextSingleColumnRegressionPreprocessor(TextBinaryClassificationPreprocessor):
    """Regression preprocessor: same pipeline as the parent, but an unstratified split."""

    def split(self):
        """Return ``(train_df, valid_df)`` without stratifying on the label column.

        A supplied ``valid_data`` frame is used as-is together with
        ``train_data``; otherwise ``train_data`` is split with
        ``train_test_split`` and both parts get a fresh index.
        """
        if self.valid_data is not None:
            return self.train_data, self.valid_data

        fit_part, held_out_part = train_test_split(
            self.train_data,
            test_size=self.test_size,
            random_state=self.seed,
        )
        return fit_part.reset_index(drop=True), held_out_part.reset_index(drop=True)
@dataclass
class LLMPreprocessor:
    """Split an LLM training DataFrame and push both splits to the Hub.

    The caller provides either ``text_column`` alone, or ``prompt_column``
    together with ``response_column``. The chosen columns are copied into
    reserved ``autotrain_*`` columns and the original columns are dropped.
    ``context_column`` and ``prompt_start_column`` are only used in
    prompt/response mode.

    Raises:
        ValueError: at construction time, if the column options are
            inconsistent or incomplete, a required column is missing from the
            training (or supplied validation) data, or either frame already
            contains a reserved column name.
    """

    # Training data.
    train_data: pd.DataFrame
    # ``username`` and ``project_name`` form the dataset repo id
    # ``{username}/autotrain-data-{project_name}``.
    username: str
    project_name: str
    # Passed as ``token`` to ``Dataset.push_to_hub``.
    token: str
    # Optional validation data; when given, no split is performed.
    valid_data: Optional[pd.DataFrame] = None
    # Passed to ``train_test_split`` as ``test_size`` / ``random_state``.
    test_size: Optional[float] = 0.2
    seed: Optional[int] = 42
    # Optional columns copied to ``autotrain_context`` /
    # ``autotrain_prompt_start`` (prompt/response mode only).
    context_column: Optional[str] = None
    prompt_start_column: Optional[str] = None
    # Single-column mode: copied to ``autotrain_text``.
    text_column: Optional[str] = None
    # Two-column mode: copied to ``autotrain_prompt`` / ``autotrain_response``.
    prompt_column: Optional[str] = None
    response_column: Optional[str] = None

    def __post_init__(self):
        """Validate the column configuration against the supplied frames."""
        # user can either provide text_column or prompt_column and response_column
        has_pair_option = self.prompt_column is not None or self.response_column is not None
        if self.text_column is not None and has_pair_option:
            raise ValueError("Please provide either text_column or prompt_column and response_column")

        if self.text_column is not None:
            # if text_column is provided, use it for prompt_column and response_column
            self.prompt_column = self.text_column
            self.response_column = self.text_column

        # Without this check a missing option surfaces further down as the
        # misleading message "None not in train data".
        if self.prompt_column is None or self.response_column is None:
            raise ValueError("Please provide either text_column or prompt_column and response_column")

        # check if prompt_column and response_column are in train_data
        if self.prompt_column not in self.train_data.columns:
            raise ValueError(f"{self.prompt_column} not in train data")
        if self.response_column not in self.train_data.columns:
            raise ValueError(f"{self.response_column} not in train data")

        # check if prompt_column and response_column are in valid_data
        if self.valid_data is not None:
            if self.prompt_column not in self.valid_data.columns:
                raise ValueError(f"{self.prompt_column} not in valid data")
            if self.response_column not in self.valid_data.columns:
                raise ValueError(f"{self.response_column} not in valid data")

        # make sure no reserved columns are in train_data or valid_data
        for column in RESERVED_COLUMNS + LLM_RESERVED_COLUMNS:
            if column in self.train_data.columns:
                raise ValueError(f"{column} is a reserved column name")
            if self.valid_data is not None and column in self.valid_data.columns:
                raise ValueError(f"{column} is a reserved column name")

    def split(self):
        """Return ``(train_df, valid_df)``.

        When ``valid_data`` was supplied, the stored frames are returned
        as-is. Otherwise ``train_data`` is split with ``train_test_split``
        (no stratification) and both parts get a fresh index.
        """
        if self.valid_data is not None:
            return self.train_data, self.valid_data

        train_df, valid_df = train_test_split(
            self.train_data,
            test_size=self.test_size,
            random_state=self.seed,
        )
        train_df = train_df.reset_index(drop=True)
        valid_df = valid_df.reset_index(drop=True)
        return train_df, valid_df

    def _reserved_column_sources(self):
        """Map each reserved output column to the source column feeding it."""
        if self.text_column is not None:
            # Single-column mode: context / prompt-start columns are not used.
            return {"autotrain_text": self.text_column}

        sources = {
            "autotrain_prompt": self.prompt_column,
            "autotrain_response": self.response_column,
        }
        if self.context_column is not None:
            sources["autotrain_context"] = self.context_column
        if self.prompt_start_column is not None:
            sources["autotrain_prompt_start"] = self.prompt_start_column
        return sources

    @staticmethod
    def _to_reserved_columns(df, sources):
        """Return a new frame with ``sources`` copied to reserved names and dropped."""
        # ``assign`` works on a copy, so the frame handed in (which may be the
        # caller's own ``train_data`` / ``valid_data``) is left untouched.
        # Every reserved column is filled before any source column is
        # dropped, so one source column may feed several reserved columns.
        with_reserved = df.assign(**{reserved: df[source] for reserved, source in sources.items()})
        return with_reserved.drop(columns=list(dict.fromkeys(sources.values())))

    def prepare_columns(self, train_df, valid_df):
        """Copy the configured columns into reserved columns and drop the originals.

        Args:
            train_df: training frame containing the configured columns.
            valid_df: validation frame containing the configured columns.

        Returns:
            A ``(train_df, valid_df)`` tuple of new DataFrames; the inputs are
            not modified.
        """
        sources = self._reserved_column_sources()
        return (
            self._to_reserved_columns(train_df, sources),
            self._to_reserved_columns(valid_df, sources),
        )

    def prepare(self):
        """Split, normalise the columns and push both splits to the Hub.

        Returns:
            A ``(train, validation)`` tuple of ``Dataset`` objects, after each
            has been pushed as a private split of
            ``{username}/autotrain-data-{project_name}``.
        """
        train_df, valid_df = self.split()
        train_df, valid_df = self.prepare_columns(train_df, valid_df)
        train_df = Dataset.from_pandas(train_df)
        valid_df = Dataset.from_pandas(valid_df)
        train_df.push_to_hub(
            f"{self.username}/autotrain-data-{self.project_name}",
            split="train",
            private=True,
            token=self.token,
        )
        valid_df.push_to_hub(
            f"{self.username}/autotrain-data-{self.project_name}",
            split="validation",
            private=True,
            token=self.token,
        )
        return train_df, valid_df