TabGAN / tabgan /sampler.py
InsafQ's picture
Add BayesianGenerator (Gaussian Copula)
4925509 verified
# -*- coding: utf-8 -*-
import logging
import numpy as np
import warnings
from typing import Tuple
import pandas as pd
import torch
try:
from be_great import GReaT
except ImportError:
GReaT = None
try:
from _ForestDiffusion import ForestDiffusionModel
except ImportError:
ForestDiffusionModel = None
from _ctgan.synthesizer import _CTGANSynthesizer as CTGAN
from tabgan.abc_sampler import Sampler, SampleData
from tabgan.adversarial_model import AdversarialModel
from tabgan.utils import setup_logging, _drop_col_if_exist, \
get_columns_if_exists, _sampler, get_year_mnth_dt_from_date, collect_dates
from tabgan.llm_config import LLMAPIConfig
from tabgan.llm_api_client import LLMAPIClient
# Silence third-party warnings emitted while fitting the generators.
warnings.filterwarnings("ignore")
__author__ = "Insaf Ashrapov"
__copyright__ = "Insaf Ashrapov"
__license__ = "Apache 2.0"
# Public factory classes exported by ``from tabgan.sampler import *``.
# ``BayesianGenerator`` is defined and wired up in this module, so it is
# exported alongside the other generators.
__all__ = [
    "OriginalGenerator",
    "GANGenerator",
    "ForestDiffusionGenerator",
    "LLMGenerator",
    "BayesianGenerator",
]
class _BaseGenerator(SampleData):
    """Factory base that remembers constructor arguments for a concrete sampler.

    Subclasses only need to point ``_sampler_class`` at the sampler type that
    should be instantiated.
    """

    # Concrete sampler type to build; ``None`` until a subclass is wired up.
    _sampler_class = None

    def __init__(self, *args, **kwargs):
        """Keep positional and keyword arguments for later sampler construction."""
        self.args, self.kwargs = args, kwargs

    def get_object_generator(self) -> Sampler:
        """Build the configured sampler from the stored constructor arguments."""
        sampler_cls = self._sampler_class
        return sampler_cls(*self.args, **self.kwargs)
class OriginalGenerator(_BaseGenerator):
    """Factory for the resampling-based sampler (``SamplerOriginal``)."""
    _sampler_class = None  # set after SamplerOriginal is defined
class GANGenerator(_BaseGenerator):
    """Factory for the CTGAN-based sampler (``SamplerGAN``)."""
    _sampler_class = None  # assigned at the bottom of this module
class ForestDiffusionGenerator(_BaseGenerator):
    """Factory for the ForestDiffusion-based sampler (``SamplerDiffusion``)."""
    _sampler_class = None  # assigned at the bottom of this module
class LLMGenerator(_BaseGenerator):
    """Factory for the GReaT/LLM-based sampler (``SamplerLLM``)."""
    _sampler_class = None  # assigned at the bottom of this module
class BayesianGenerator(_BaseGenerator):
    """Factory for the Gaussian-copula sampler (``SamplerBayesian``)."""
    _sampler_class = None  # assigned at the bottom of this module
class SamplerOriginal(Sampler):
    """Baseline sampler that produces rows by resampling the training data.

    It also provides the shared input validation, quantile/category
    post-processing, adversarial filtering and schema alignment used by the
    model-based samplers defined later in this module.
    """

    def __init__(
            self,
            gen_x_times: float = 1.1,
            cat_cols: list = None,
            bot_filter_quantile: float = 0.001,
            top_filter_quantile: float = 0.999,
            is_post_process: bool = True,
            adversarial_model_params: dict = None,
            pregeneration_frac: float = 2,
            only_generated_data: bool = False,
            gen_params: dict = None,
            text_generating_columns: list = None,
            conditional_columns: list = None,
            llm_api_config: LLMAPIConfig = None,
    ):
        """
        Initialize an original sampler configuration.
        Args:
            gen_x_times (float): Factor controlling how many synthetic samples
                to generate relative to the training size. The final amount
                can be smaller after post-processing and adversarial filtering.
            cat_cols (list | None): Names of categorical columns in the
                training data.
            bot_filter_quantile (float): Lower quantile used for numeric
                post-processing filters.
            top_filter_quantile (float): Upper quantile used for numeric
                post-processing filters.
            is_post_process (bool): Whether to apply post-processing filters
                based on the distribution of `test_df`. If False, the
                quantile-based filters are skipped.
            adversarial_model_params (dict): Parameters for the adversarial
                filtering model used to keep generated samples close to the
                test distribution.
            pregeneration_frac (float): Oversampling factor applied before
                post-processing. The final number of rows is derived from
                `gen_x_times`.
            only_generated_data (bool): If True, return only synthetic rows.
                If False, append generated rows to the original training data.
            gen_params (dict): Model-specific generation parameters shared by
                subclasses (GAN, ForestDiffusion, LLM).
            text_generating_columns (list | None): Column names for which new
                text values should be generated (used by `SamplerLLM`).
            conditional_columns (list | None): Column names that condition
                text generation for `text_generating_columns`.
            llm_api_config (LLMAPIConfig | None): Configuration for external LLM
                API-based text generation. When provided, text generation will use
                the API instead of the local model. Useful for LM Studio, Ollama,
                OpenAI, etc.
        """
        if adversarial_model_params is None:
            adversarial_model_params = {
                "metrics": "AUC",
                "max_depth": 2,
                "max_bin": 100,
                "n_estimators": 150,
                "learning_rate": 0.02,
                "random_state": 42,
            }
        if gen_params is None:
            gen_params = {"batch_size": 45, "patience": 25, "epochs": 50, "llm": "distilgpt2"}
        super().__init__(
            gen_x_times=gen_x_times,
            cat_cols=cat_cols,
            bot_filter_quantile=bot_filter_quantile,
            top_filter_quantile=top_filter_quantile,
            is_post_process=is_post_process,
            adversarial_model_params=adversarial_model_params,
            pregeneration_frac=pregeneration_frac,
            only_generated_data=only_generated_data,
            gen_params=gen_params,
        )
        self.text_generating_columns = text_generating_columns
        self.conditional_columns = conditional_columns
        self.llm_api_config = llm_api_config
        # Keep a value the base class may already have set; otherwise fall
        # back to a placeholder name for the temporary target column.
        if not hasattr(self, "TEMP_TARGET"):
            self.TEMP_TARGET = "TEMP_TARGET"

    @staticmethod
    def preprocess_data_df(df) -> pd.DataFrame:
        """Check that ``df`` is a pandas DataFrame and return it unchanged.

        Raises:
            ValueError: If ``df`` is not a ``pd.DataFrame``.
        """
        # Validate first: logging ``df.shape`` on a non-DataFrame (e.g. None
        # or a list) would raise AttributeError and hide the intended error.
        if not isinstance(df, pd.DataFrame):
            raise ValueError(
                f"Input dataframe is not a pandas DataFrame: got {type(df)}"
            )
        logging.info(f"Input shape: {df.shape}")
        return df

    def preprocess_data(
            self, train, target, test_df
    ) -> Tuple[pd.DataFrame, pd.DataFrame | None, pd.DataFrame | None]:
        """Validate the pipeline inputs and record the target column name.

        ``target`` and ``test_df`` may be ``None``; the later pipeline steps
        in this class already treat both as optional. When a target is given,
        its first column name becomes ``self.TEMP_TARGET``.

        Returns:
            The ``(train, target, test_df)`` triple, unchanged.

        Raises:
            ValueError: If a provided input is not a DataFrame, or if ``train``
                already contains the target column or a ``test_similarity``
                column (both names are used internally).
        """
        train = self.preprocess_data_df(train)
        if target is not None:
            target = self.preprocess_data_df(target)
            self.TEMP_TARGET = target.columns[0]
        if test_df is not None:
            test_df = self.preprocess_data_df(test_df)
        if self.TEMP_TARGET in train.columns:
            raise ValueError(
                f"Input train dataframe already has '{self.TEMP_TARGET}' column, consider removing it"
            )
        if "test_similarity" in train.columns:
            raise ValueError(
                "Input train dataframe already have test_similarity, consider removing it"
            )
        return train, target, test_df

    def generate_data(
            self, train_df, target, test_df, only_generated_data
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Oversample the training rows with replacement.

        Draws ``1 + pregeneration_frac`` times the training size using a fixed
        random seed. ``only_generated_data`` has no effect here because every
        returned row comes from the training data. Note that the target is
        attached to ``train_df`` in place.

        Returns:
            Tuple of the resampled features and the matching target column.
        """
        if only_generated_data:
            warnings.warn(
                "For SamplerOriginal setting only_generated_data doesn't change anything, "
                "because generated data sampled from the train!"
            )
        self._validate_data(train_df, target, test_df)
        train_df[self.TEMP_TARGET] = target
        generated_df = train_df.sample(
            frac=(1 + self.pregeneration_frac), replace=True, random_state=42
        )
        generated_df = generated_df.reset_index(drop=True)
        logging.info(
            f"Generated shape: {generated_df.drop(self.TEMP_TARGET, axis=1).shape} "
            f"and {generated_df[self.TEMP_TARGET].shape}"
        )
        return (
            generated_df.drop(self.TEMP_TARGET, axis=1),
            generated_df[self.TEMP_TARGET],
        )

    def postprocess_data(self, train_df, target, test_df):
        """Drop generated rows that fall outside the value ranges seen in ``test_df``.

        Non-categorical columns are kept when missing or within the
        ``[bot_filter_quantile, top_filter_quantile]`` quantile range of the
        test column; categorical columns must take a value present in the
        test column. The step is skipped when post-processing is disabled or
        no test frame is given.

        Raises:
            ValueError: If fewer than 10 rows survive any single filter.
        """
        if not self.is_post_process or test_df is None:
            logging.info("Skipping postprocessing")
            return train_df, target
        self._validate_data(train_df, target, test_df)
        train_df[self.TEMP_TARGET] = target
        # Filter numerical columns
        for col in test_df.columns:
            if self.cat_cols is None or col not in self.cat_cols:
                min_val = test_df[col].quantile(self.bot_filter_quantile)
                max_val = test_df[col].quantile(self.top_filter_quantile)
                train_df = train_df[(train_df[col].isna()) | ((train_df[col] >= min_val) & (train_df[col] <= max_val))]
                if train_df.shape[0] < 10:
                    raise ValueError(f"Too few samples (<10) after filtering column {col}. "
                                     f"Test data may be skewed. Filter range: [{min_val}, {max_val}]")
        # Filter categorical columns
        if self.cat_cols:
            for col in self.cat_cols:
                train_df = train_df[train_df[col].isin(test_df[col].unique())]
                if train_df.shape[0] < 10:
                    raise ValueError(f"Too few samples (<10) after filtering categorical column {col}")
        logging.info(
            f"Generated shapes after postprocessing: {train_df.drop(self.TEMP_TARGET, axis=1).shape} plus target")
        result_df = train_df.reset_index(drop=True)
        return (
            result_df.drop(self.TEMP_TARGET, axis=1),
            result_df[self.TEMP_TARGET]
        )

    def adversarial_filtering(self, train_df, target, test_df):
        """Rank generated rows by similarity to ``test_df`` and keep the top ones.

        An ``AdversarialModel`` is fitted on ``test_df`` versus the generated
        features; its predictions are stored as ``test_similarity`` and used
        to sort the rows in descending order. The step is skipped when
        ``test_df`` is ``None``.

        Returns:
            Tuple of filtered features and target, both with a fresh index.
        """
        if test_df is None:
            logging.info("Skipping adversarial filtering, because test_df is None.")
            return train_df, target
        ad_model = AdversarialModel(
            cat_cols=self.cat_cols, model_params=self.adversarial_model_params
        )
        self._validate_data(train_df, target, test_df)
        train_df[self.TEMP_TARGET] = target
        ad_model.adversarial_test(test_df, train_df.drop(self.TEMP_TARGET, axis=1))
        train_df["test_similarity"] = ad_model.trained_model.predict(
            train_df.drop(self.TEMP_TARGET, axis=1)
        )
        train_df.sort_values("test_similarity", ascending=False, inplace=True)
        # NOTE(review): the row budget is get_generated_shape(train_df) times
        # the current row count; confirm against Sampler.get_generated_shape
        # that this product (rather than the shape alone) is intended.
        train_df = train_df.head(self.get_generated_shape(train_df) * train_df.shape[0])
        del ad_model
        return (
            train_df.drop(["test_similarity", self.TEMP_TARGET], axis=1).reset_index(
                drop=True
            ),
            train_df[self.TEMP_TARGET].reset_index(drop=True),
        )

    @staticmethod
    def _validate_data(train_df, target, test_df):
        """Check minimum sizes and train/target row agreement.

        Raises:
            ValueError: If a test frame is given and either frame has fewer
                than 10 rows, or if ``target`` is given and its row count
                differs from ``train_df``.
        """
        if test_df is not None:
            if train_df.shape[0] < 10 or test_df.shape[0] < 10:
                raise ValueError(
                    f"Shape of train is {train_df.shape[0]} and test is {test_df.shape[0]}. "
                    f"Both should be at least 10! Consider disabling adversarial filtering"
                )
        if target is not None:
            if train_df.shape[0] != target.shape[0]:
                raise ValueError(
                    f"Shape mismatch: train_df has {train_df.shape[0]} rows "
                    f"but target has {target.shape[0]} rows"
                )

    def handle_generated_data(self, train_df, generated_df, only_generated_data):
        """
        Align and optionally merge generated rows with the original training data.
        The generated data is cast to the dtypes and column order of `train_df`
        so that downstream models receive data with a consistent schema.
        Args:
            train_df (pd.DataFrame): Original training data used to infer the
                schema and target column.
            generated_df (pd.DataFrame or array-like): Newly generated
                samples to be aligned with `train_df`.
            only_generated_data (bool): If True, return only synthetic rows;
                otherwise, append them to `train_df` before returning.
        Returns:
            Tuple[pd.DataFrame, pd.Series | pd.DataFrame]: Features and
                corresponding target values.
        """
        generated_df = pd.DataFrame(generated_df)
        generated_df.columns = train_df.columns
        # Cast column by position so the generated frame mirrors train_df's dtypes.
        for column_name, column_dtype in zip(generated_df.columns, train_df.dtypes.values):
            generated_df[column_name] = generated_df[column_name].astype(column_dtype)
        if only_generated_data:
            result_df = generated_df
        else:
            result_df = pd.concat([train_df, generated_df]).reset_index(drop=True)
        logging.info(
            f"Generated shapes: {_drop_col_if_exist(result_df, self.TEMP_TARGET).shape} plus target"
        )
        return (
            _drop_col_if_exist(result_df, self.TEMP_TARGET),
            get_columns_if_exists(result_df, self.TEMP_TARGET),
        )
class SamplerGAN(SamplerOriginal):
    """Sampler that fits a CTGAN synthesizer on the training frame and samples from it."""

    def check_params(self):
        """Normalise ``gen_params`` for CTGAN training.

        Missing ``batch_size``, ``patience`` and ``epochs`` entries receive
        defaults, and ``batch_size`` is rounded up to the next multiple of 10.
        The adjustments are made on a private copy so the dictionary passed in
        by the caller is left untouched.
        """
        self.gen_params = dict(self.gen_params)
        if "batch_size" not in self.gen_params:
            logging.warning("batch_size param is not set for GAN params, setting default to 50")
            self.gen_params["batch_size"] = 50
        if self.gen_params["batch_size"] % 10 != 0:
            logging.warning(
                f"Batch size should be divisible by 10, but got {self.gen_params['batch_size']}. Fixing it")
            self.gen_params["batch_size"] += 10 - (self.gen_params["batch_size"] % 10)
        if "patience" not in self.gen_params:
            logging.warning("patience param is not set for GAN params, setting default to 25")
            self.gen_params["patience"] = 25
        if "epochs" not in self.gen_params:
            logging.warning("epochs param is not set for GAN params, setting default to 50")
            self.gen_params["epochs"] = 50

    def generate_data(
            self, train_df, target, test_df, only_generated_data: bool
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Train CTGAN on ``train_df`` (plus target, if any) and sample new rows.

        The number of sampled rows is ``pregeneration_frac`` times
        ``get_generated_shape(train_df)``. When a target is given it is
        attached to ``train_df`` in place.

        Returns:
            Features and target as produced by ``handle_generated_data``.
        """
        self.check_params()
        self._validate_data(train_df, target, test_df)
        if target is not None:
            train_df[self.TEMP_TARGET] = target
        ctgan = CTGAN(batch_size=self.gen_params["batch_size"], patience=self.gen_params["patience"])
        logging.info("training GAN")
        discrete_columns = [] if self.cat_cols is None else self.cat_cols
        ctgan.fit(train_df, discrete_columns, epochs=self.gen_params["epochs"])
        logging.info("Finished training GAN")
        # pregeneration_frac is declared as a float, so the product can be a
        # float; the sample count handed to the synthesizer must be an integer.
        n_rows_to_sample = int(self.pregeneration_frac * self.get_generated_shape(train_df))
        generated_df = ctgan.sample(n_rows_to_sample)
        return self.handle_generated_data(train_df, generated_df, only_generated_data)
class SamplerDiffusion(SamplerOriginal):
    """Sampler that fits a ForestDiffusion model on the training frame."""

    def generate_data(
            self, train_df, target, test_df, only_generated_data: bool
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Fit ForestDiffusion on ``train_df`` (plus target, if any) and generate rows.

        ``int(gen_x_times * n_rows)`` rows are generated. When a target is
        given it is attached to ``train_df`` in place.

        Raises:
            ImportError: If the optional ForestDiffusion package is missing.
        """
        self._validate_data(train_df, target, test_df)
        # Fail before touching the caller's frame when the optional
        # dependency is unavailable.
        if ForestDiffusionModel is None:
            raise ImportError(
                "ForestDiffusion is not installed. "
                "Please install it: pip install ForestDiffusion"
            )
        if target is not None:
            train_df[self.TEMP_TARGET] = target
        logging.info("Fitting ForestDiffusion model")
        if self.cat_cols is not None:
            # TODO: fix bug with cat cols and pass
            # cat_indexes=self.get_column_indexes(train_df, self.cat_cols);
            # until then every column is modelled as numerical.
            logging.warning("cat_cols are currently ignored by ForestDiffusion sampler")
        training_matrix = train_df.to_numpy()
        forest_model = ForestDiffusionModel(training_matrix, label_y=None, n_t=50,
                                            duplicate_K=100,
                                            diffusion_type='flow', n_jobs=-1)
        logging.info("Finished training ForestDiffusionModel")
        generated_df = forest_model.generate(batch_size=int(self.gen_x_times * training_matrix.shape[0]))
        return self.handle_generated_data(train_df, generated_df, only_generated_data)

    @staticmethod
    def get_column_indexes(df, column_names):
        """Return the positional index in ``df.columns`` of each name in ``column_names``."""
        return [df.columns.get_loc(col) for col in column_names]
class SamplerLLM(SamplerOriginal):
    """Sampler that fine-tunes a GReaT language model on the training frame.

    With both ``text_generating_columns`` and ``conditional_columns`` set,
    rows are built one at a time: conditional attributes are drawn from their
    empirical distribution, remaining columns are imputed by the model and
    text columns are produced by prompting the underlying LLM. Otherwise the
    model's own sampling routine is used.
    """

    def check_params(self):
        """Fill in missing LLM parameters and enforce a minimum of 3 epochs.

        Works on a private copy of ``gen_params`` so the dictionary supplied
        by the caller is not modified.
        """
        self.gen_params = dict(self.gen_params)
        if "llm" not in self.gen_params:
            logging.warning("llm param is not set for LLM params, setting default to 'distilgpt2'")
            self.gen_params["llm"] = "distilgpt2"
        if "max_length" not in self.gen_params:
            logging.warning("max_length param is not set for LLM params, setting default to 500")
            self.gen_params["max_length"] = 500
        if "batch_size" not in self.gen_params:
            logging.warning("batch_size param is not set for LLM params, setting default to 32")
            self.gen_params["batch_size"] = 32
        if "epochs" not in self.gen_params:
            logging.warning("epochs param is not set for LLM params, setting default to 3")
            self.gen_params["epochs"] = 3
        elif self.gen_params["epochs"] < 3:
            logging.warning(
                f"Current epoch={self.gen_params['epochs']} for LLM training is too low, setting to 3")
            self.gen_params["epochs"] = 3

    def _build_training_frame(self, train_df: pd.DataFrame, target: pd.DataFrame | None) -> pd.DataFrame:
        """
        Return a copy of the training frame with TEMP_TARGET attached when a target is provided.
        """
        current_train_df = train_df.copy()
        if target is not None:
            current_train_df[self.TEMP_TARGET] = target
        return current_train_df

    def _fit_great_model(self, current_train_df: pd.DataFrame):
        """
        Fit a GReaT model on the provided training frame and return the instance and inference device.

        Raises:
            ImportError: If the ``be_great`` package cannot be imported.
        """
        logging.info("Fitting LLM model")
        try:
            from be_great import GReaT
        except ImportError as err:
            # Chain the cause so the underlying import failure stays visible.
            raise ImportError(
                "be_great library is not installed. Please install it to use LLMGenerator."
            ) from err
        # fp16 training and the inference device both follow CUDA availability.
        has_cuda = torch.cuda.is_available()
        great_model_instance = GReaT(
            llm=self.gen_params["llm"],
            batch_size=self.gen_params["batch_size"],
            epochs=self.gen_params["epochs"],
            fp16=has_cuda,
        )
        great_model_instance.fit(current_train_df)
        logging.info("Finished training LLM model")
        device = "cuda" if has_cuda else "cpu"
        return great_model_instance, device

    def _conditional_text_generation(
            self,
            great_model_instance,
            current_train_df: pd.DataFrame,
            train_df: pd.DataFrame,
            target: pd.DataFrame | None,
            device: str,
    ) -> pd.DataFrame:
        """
        Generate rows when text and conditional columns are specified.

        Produces ``int(gen_x_times * len(train_df))`` rows with the columns of
        ``current_train_df``.

        Raises:
            ValueError: If a text-generating or conditional column is missing
                from ``current_train_df``.
        """
        logging.info("Starting conditional generation of text columns.")
        num_samples_to_generate = int(self.gen_x_times * train_df.shape[0])

        # Text values already present in training; used to prefer novel text.
        original_unique_text_values: dict[str, set] = {}
        for col in self.text_generating_columns:
            if col not in current_train_df.columns:
                raise ValueError(f"Text generating column '{col}' not found in training data.")
            original_unique_text_values[col] = set(current_train_df[col].unique())

        # Empirical distribution of every conditional attribute.
        attribute_distributions: dict[str, pd.Series] = {}
        for col in self.conditional_columns:
            if col not in current_train_df.columns:
                raise ValueError(f"Conditional column '{col}' not found in training data.")
            attribute_distributions[col] = current_train_df[col].value_counts(normalize=True)

        generated_rows: list[dict] = []
        all_train_columns = current_train_df.columns.tolist()
        max_retries = 10
        for _ in range(num_samples_to_generate):
            # 1) Draw the conditional attributes for this row.
            current_row_data: dict = {}
            for attr_col in self.conditional_columns:
                dist = attribute_distributions[attr_col]
                current_row_data[attr_col] = np.random.choice(dist.index, p=dist.values)

            # 2) Let the model impute the remaining (non-text) columns from a
            #    one-row template that only carries the conditional values.
            row_template_for_impute = pd.DataFrame(columns=all_train_columns, index=[0])
            for col in all_train_columns:
                if col in current_row_data:
                    row_template_for_impute.loc[0, col] = current_row_data[col]
                elif col not in self.text_generating_columns:
                    row_template_for_impute.loc[0, col] = np.nan
            imputed_full_row_df = great_model_instance.impute(
                row_template_for_impute.copy(),
                max_length=self.gen_params.get("max_length", 500),
            )
            for col in all_train_columns:
                if col not in self.text_generating_columns and col not in current_row_data:
                    current_row_data[col] = imputed_full_row_df.loc[0, col]

            # 3) Prompt the LLM for each text column, retrying while the
            #    candidate duplicates a value seen in training.
            for text_col in self.text_generating_columns:
                prompt_parts: list[str] = []
                for cond_col in self.conditional_columns:
                    prompt_parts.append(f"{cond_col}: {current_row_data[cond_col]}")
                for other_col in all_train_columns:
                    if (
                            other_col not in self.text_generating_columns
                            and other_col not in self.conditional_columns
                            and other_col in current_row_data
                    ):
                        val_str = str(current_row_data[other_col])
                        # Keep long values from dominating the prompt.
                        if len(val_str) > 30:
                            val_str = val_str[:27] + "..."
                        prompt_parts.append(f"{other_col}: {val_str}")
                prompt = ", ".join(prompt_parts) + f", Generate {text_col}: "
                generated_text_candidate = None
                for _retry_attempt in range(max_retries):
                    generated_text_candidate = self._generate_via_prompt(
                        prompt,
                        great_model_instance,
                        device=device,
                    )
                    if generated_text_candidate not in original_unique_text_values[text_col]:
                        break
                else:
                    # Loop exhausted without a novel candidate.
                    logging.warning(
                        f"Max retries reached for generating novel text for {text_col}. Using last candidate."
                    )
                current_row_data[text_col] = generated_text_candidate

            ordered_row = {col: current_row_data.get(col) for col in train_df.columns}
            if target is not None and self.TEMP_TARGET in current_row_data:
                ordered_row[self.TEMP_TARGET] = current_row_data[self.TEMP_TARGET]
            generated_rows.append(ordered_row)

        generated_df = pd.DataFrame(generated_rows)
        return generated_df.reindex(columns=current_train_df.columns)

    def _standard_llm_sampling(
            self,
            great_model_instance,
            current_train_df: pd.DataFrame,
            device: str,
    ) -> pd.DataFrame:
        """
        Fallback sampling when no explicit text/conditional columns are provided.
        """
        logging.info("Starting standard LLM sampling.")
        return great_model_instance.sample(
            int(self.gen_x_times * current_train_df.shape[0]),
            device=device,
            max_length=self.gen_params["max_length"],
        )

    def generate_data(
            self, train_df, target, test_df, only_generated_data: bool
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Fit the LLM on the training frame and generate synthetic rows.

        Uses conditional text generation when both ``text_generating_columns``
        and ``conditional_columns`` are set, and standard sampling otherwise.

        Returns:
            Features and target as produced by ``handle_generated_data``.
        """
        self._validate_data(train_df, target, test_df)
        self.check_params()
        current_train_df = self._build_training_frame(train_df, target)
        great_model_instance, device = self._fit_great_model(current_train_df)
        if self.text_generating_columns and self.conditional_columns:
            generated_df = self._conditional_text_generation(
                great_model_instance,
                current_train_df=current_train_df,
                train_df=train_df,
                target=target,
                device=device,
            )
        else:
            generated_df = self._standard_llm_sampling(
                great_model_instance,
                current_train_df=current_train_df,
                device=device,
            )
        # When a target is provided, ``current_train_df`` already includes the
        # TEMP_TARGET column and represents the true training frame used for
        # generation. Passing it to ``handle_generated_data`` keeps feature and
        # target alignment consistent for both conditional and standard LLM
        # sampling paths.
        base_train_for_handling = current_train_df if target is not None else train_df
        return self.handle_generated_data(base_train_for_handling, generated_df, only_generated_data)

    def _generate_via_prompt(self, prompt: str, great_model_instance, device: str, max_tokens_to_generate=50) -> str:
        """
        Generate a short text completion from the underlying GReaT LLM.
        Args:
            prompt (str): Serialized row description used as generation context.
            great_model_instance: Fitted GReaT instance providing `model` and
                `tokenizer` attributes.
            device (str): Target device for inference (for example, ``"cpu"``
                or ``"cuda"``).
            max_tokens_to_generate (int): Maximum number of new tokens to
                sample from the model.
        Returns:
            str: Post-processed generated text. Returns an empty string if
                generation fails.
        """
        llm_model = great_model_instance.model
        tokenizer = great_model_instance.tokenizer
        if llm_model is None or tokenizer is None:
            logging.error("LLM model or tokenizer not available in GReaT instance.")
            return ""  # Or raise an error
        llm_model.to(device)
        # Truncate the prompt so that prompt + new tokens fit the model window.
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True,
                           max_length=tokenizer.model_max_length - max_tokens_to_generate)
        input_ids = inputs.input_ids.to(device)
        attention_mask = inputs.attention_mask.to(device)
        try:
            outputs = llm_model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_tokens_to_generate,
                pad_token_id=tokenizer.eos_token_id,
                do_sample=True,  # Enable sampling for more diverse outputs
                temperature=0.7,  # Default temperature, can be tuned
                top_k=50,  # Default top_k, can be tuned
                top_p=0.95  # Default top_p, can be tuned
            )
            # Decode only the newly generated tokens, then keep the first
            # line / first '|'-separated field as the value.
            generated_text = tokenizer.decode(outputs[0, input_ids.shape[1]:], skip_special_tokens=True)
            generated_text = generated_text.split('\n')[0].split('|')[0].strip()
            return generated_text
        except Exception as e:
            # Deliberate best-effort: a failed generation yields an empty string.
            logging.error(f"Error during text generation via prompt: {e}")
            return ""  # Fallback or re-raise
class SamplerBayesian(SamplerOriginal):
    """Synthetic data generator based on a Gaussian copula.

    Numerical columns are converted to normal scores through their empirical
    ranks, a multivariate normal is fitted to those scores, and new rows are
    drawn from it and mapped back through each column's empirical quantiles.
    Categorical columns (non-numeric columns and any column listed in
    ``cat_cols``) are sampled independently from their empirical marginal
    frequencies.
    """

    def generate_data(
            self, train_df, target, test_df, only_generated_data: bool
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Fit the copula on ``train_df`` (plus target, if any) and sample rows.

        ``int(gen_x_times * len(train_df))`` rows are generated.

        Returns:
            Features and target as produced by ``handle_generated_data``.
        """
        from scipy.stats import norm

        self._validate_data(train_df, target, test_df)
        if target is not None:
            train_df = train_df.copy()
            train_df[self.TEMP_TARGET] = target
        n_samples = int(self.gen_x_times * len(train_df))

        # Columns the user declared categorical are never jittered or
        # interpolated, even when they are stored as numbers.
        declared_categorical = set(self.cat_cols or [])
        num_cols = [
            c for c in train_df.columns
            if c not in declared_categorical
            and pd.api.types.is_numeric_dtype(train_df[c])
        ]
        numeric_names = set(num_cols)
        cat_cols_here = [c for c in train_df.columns if c not in numeric_names]
        generated_parts = {}

        # --- Numerical columns: Gaussian copula ---
        if num_cols:
            normal_scores = {}
            for col in num_cols:
                as_float = train_df[col].astype(float)
                # pandas ranks ignore missing values (they stay NaN), so one
                # NaN no longer wipes out the ranks of the whole column.
                ranks = as_float.rank(method="average")
                # Map to the open interval (0, 1), then to standard normal.
                uniform = (ranks / (as_float.count() + 1)).to_numpy()
                normal_scores[col] = norm.ppf(uniform)
            normal_data = pd.DataFrame(normal_scores, columns=num_cols)
            # Missing inputs (and any infinities) contribute a neutral score.
            normal_data = normal_data.replace([np.inf, -np.inf], np.nan).fillna(0.0)

            # Fit mean and covariance of the normal scores.
            mean = normal_data.mean().values.copy()
            cov = normal_data.cov().values.copy()
            # Regularize covariance to ensure positive-definiteness
            cov += np.eye(len(num_cols)) * 1e-6
            z_samples = np.random.multivariate_normal(mean, cov, size=n_samples)

            # Transform back through the empirical quantiles of each column.
            for i, col in enumerate(num_cols):
                sorted_vals = np.sort(train_df[col].dropna().to_numpy())
                n_orig = len(sorted_vals)
                if n_orig == 0:
                    # Nothing observed for this column: keep it missing.
                    generated_parts[col] = np.full(n_samples, np.nan)
                    continue
                u = norm.cdf(z_samples[:, i])
                indices = np.clip((u * n_orig).astype(int), 0, n_orig - 1)
                base = sorted_vals[indices]
                # Jitter only floating-point columns. Integer/bool columns are
                # later cast back to their dtype, which truncates toward zero:
                # any negative noise would shift a value down by one (e.g. a
                # binary target of 1 would become 0).
                if n_orig > 1 and pd.api.types.is_float_dtype(train_df[col]):
                    scale = np.std(sorted_vals) * 0.01
                    base = base + np.random.normal(0, scale, size=len(base))
                generated_parts[col] = base

        # --- Categorical columns: empirical frequency sampling ---
        for col in cat_cols_here:
            freq = train_df[col].value_counts(normalize=True)
            generated_parts[col] = np.random.choice(
                freq.index, size=n_samples, p=freq.values
            )

        generated_df = pd.DataFrame(generated_parts, columns=train_df.columns)
        return self.handle_generated_data(train_df, generated_df.values,
                                          only_generated_data)
# Wire up factory classes to their concrete sampler implementations.
# This happens here, after all Sampler* classes exist, because the factory
# classes are declared earlier in the module with ``_sampler_class = None``.
OriginalGenerator._sampler_class = SamplerOriginal
GANGenerator._sampler_class = SamplerGAN
ForestDiffusionGenerator._sampler_class = SamplerDiffusion
LLMGenerator._sampler_class = SamplerLLM
BayesianGenerator._sampler_class = SamplerBayesian
if __name__ == "__main__":
    # Manual smoke run: push random integer frames through several generators.
    setup_logging(logging.DEBUG)
    train_size = 75
    n_rows_by_4 = (train_size, 4)
    train = pd.DataFrame(np.random.randint(-10, 150, size=n_rows_by_4), columns=list("ABCD"))
    target = pd.DataFrame(np.random.randint(0, 2, size=(train_size, 1)), columns=list("Y"))
    test = pd.DataFrame(np.random.randint(0, 100, size=n_rows_by_4), columns=list("ABCD"))
    logging.info(train)

    large_gan_params = {"batch_size": 500, "patience": 25, "epochs": 500}
    llm_params = {"batch_size": 32, "epochs": 4, "llm": "distilgpt2", "max_length": 500}
    generators = [
        OriginalGenerator(gen_x_times=15),
        GANGenerator(gen_x_times=10, only_generated_data=False, gen_params=large_gan_params),
        LLMGenerator(gen_params=llm_params),
        OriginalGenerator(gen_x_times=15),
        GANGenerator(cat_cols=["A"], gen_x_times=20, only_generated_data=True),
        ForestDiffusionGenerator(cat_cols=["A"], gen_x_times=10, only_generated_data=True),
        ForestDiffusionGenerator(
            gen_x_times=15,
            only_generated_data=False,
            gen_params={"batch_size": 500, "patience": 25, "epochs": 500},
        ),
    ]
    for gen in generators:
        # The LLM generator is exercised without a target.
        runs_without_target = 'LLMGenerator' in str(type(gen))
        target_for_run = None if runs_without_target else target
        _sampler(gen, train, target_for_run, test)

    # Date round-trip: split a random date column into parts, generate with
    # the GAN sampler, then rebuild dates on the generated frame.
    min_date = pd.to_datetime('2019-01-01')
    max_date = pd.to_datetime('2021-12-31')
    day_span = (max_date - min_date).days + 1
    random_offsets = np.random.randint(day_span, size=train_size)
    train['Date'] = min_date + pd.to_timedelta(random_offsets, unit='d')
    train = get_year_mnth_dt_from_date(train, 'Date')
    date_aware_generator = GANGenerator(
        gen_x_times=1.1,
        cat_cols=['year'],
        bot_filter_quantile=0.001,
        top_filter_quantile=0.999,
        is_post_process=True,
        pregeneration_frac=2,
        only_generated_data=False,
    )
    new_train, new_target = date_aware_generator.generate_data_pipe(
        train.drop('Date', axis=1), None, train.drop('Date', axis=1)
    )
    new_train = collect_dates(new_train)