Spaces:

InsafQ
/

TabGAN

Running

App Files Files Community

TabGAN / tabgan /sampler.py

InsafQ

Add BayesianGenerator (Gaussian Copula)

4925509 verified 16 days ago

raw

history blame contribute delete

32.2 kB

	# -- coding: utf-8 --

	import logging
	import numpy as np
	import warnings
	from typing import Tuple

	import pandas as pd
	import torch

	try:
	from be_great import GReaT
	except ImportError:
	GReaT = None

	try:
	from _ForestDiffusion import ForestDiffusionModel
	except ImportError:
	ForestDiffusionModel = None

	from _ctgan.synthesizer import _CTGANSynthesizer as CTGAN
	from tabgan.abc_sampler import Sampler, SampleData
	from tabgan.adversarial_model import AdversarialModel
	from tabgan.utils import setup_logging, _drop_col_if_exist, \
	get_columns_if_exists, _sampler, get_year_mnth_dt_from_date, collect_dates
	from tabgan.llm_config import LLMAPIConfig
	from tabgan.llm_api_client import LLMAPIClient

	# Silence every warning process-wide for anything importing this module.
	warnings.filterwarnings("ignore")

	__author__ = "Insaf Ashrapov"
	__copyright__ = "Insaf Ashrapov"
	__license__ = "Apache 2.0"

	# Public factory classes. ``BayesianGenerator`` is defined and wired to
	# ``SamplerBayesian`` further down, so it is exported with the others.
	__all__ = [
	    "OriginalGenerator",
	    "GANGenerator",
	    "ForestDiffusionGenerator",
	    "LLMGenerator",
	    "BayesianGenerator",
	]


	class _BaseGenerator(SampleData):
	    """Base factory that stores constructor arguments for the concrete sampler.

	    Subclasses only set ``_sampler_class``; the positional and keyword
	    arguments captured here are forwarded unchanged when the sampler is built.
	    """

	    # Concrete ``Sampler`` subclass to instantiate; assigned per subclass.
	    _sampler_class = None

	    def __init__(self, *args, **kwargs):
	        """Remember the arguments that will later configure the sampler."""
	        self.args = args
	        self.kwargs = kwargs

	    def get_object_generator(self) -> Sampler:
	        """Instantiate ``_sampler_class`` with the stored arguments."""
	        return self._sampler_class(*self.args, **self.kwargs)


	class OriginalGenerator(_BaseGenerator):
	    """Factory whose sampler class is assigned to ``SamplerOriginal`` at module end."""

	    # Placeholder: the concrete sampler is defined later in this module.
	    _sampler_class = None


	class GANGenerator(_BaseGenerator):
	    """Factory whose sampler class is assigned to ``SamplerGAN`` at module end."""

	    # Placeholder: the concrete sampler is defined later in this module.
	    _sampler_class = None


	class ForestDiffusionGenerator(_BaseGenerator):
	    """Factory whose sampler class is assigned to ``SamplerDiffusion`` at module end."""

	    # Placeholder: the concrete sampler is defined later in this module.
	    _sampler_class = None


	class LLMGenerator(_BaseGenerator):
	    """Factory whose sampler class is assigned to ``SamplerLLM`` at module end."""

	    # Placeholder: the concrete sampler is defined later in this module.
	    _sampler_class = None


	class BayesianGenerator(_BaseGenerator):
	    """Factory whose sampler class is assigned to ``SamplerBayesian`` at module end."""

	    # Placeholder: the concrete sampler is defined later in this module.
	    _sampler_class = None


	class SamplerOriginal(Sampler):
	    """Baseline sampler that "generates" rows by resampling the training data.

	    The class also provides the shared input checks, quantile/category
	    post-processing, adversarial filtering and schema alignment used by the
	    other samplers in this module, which subclass it.
	    """

	    def __init__(
	        self,
	        gen_x_times: float = 1.1,
	        cat_cols: list = None,
	        bot_filter_quantile: float = 0.001,
	        top_filter_quantile: float = 0.999,
	        is_post_process: bool = True,
	        adversarial_model_params: dict = None,
	        pregeneration_frac: float = 2,
	        only_generated_data: bool = False,
	        gen_params: dict = None,
	        text_generating_columns: list = None,
	        conditional_columns: list = None,
	        llm_api_config: LLMAPIConfig = None,
	    ):
	        """
	        Initialize an original sampler configuration.

	        Args:
	            gen_x_times (float): Factor controlling how many synthetic samples
	                to generate relative to the training size. The final amount
	                can be smaller after post-processing and adversarial filtering.
	            cat_cols (list | None): Names of categorical columns in the
	                training data.
	            bot_filter_quantile (float): Lower quantile used for numeric
	                post-processing filters.
	            top_filter_quantile (float): Upper quantile used for numeric
	                post-processing filters.
	            is_post_process (bool): Whether to apply post-processing filters
	                based on the distribution of `test_df`. If False, the
	                quantile-based filters are skipped.
	            adversarial_model_params (dict): Parameters for the adversarial
	                filtering model used to keep generated samples close to the
	                test distribution.
	            pregeneration_frac (float): Oversampling factor applied before
	                post-processing. The final number of rows is derived from
	                `gen_x_times`.
	            only_generated_data (bool): If True, return only synthetic rows.
	                If False, append generated rows to the original training data.
	            gen_params (dict): Model-specific generation parameters shared by
	                subclasses (GAN, ForestDiffusion, LLM).
	            text_generating_columns (list | None): Column names for which new
	                text values should be generated (used by `SamplerLLM`).
	            conditional_columns (list | None): Column names that condition
	                text generation for `text_generating_columns`.
	            llm_api_config (LLMAPIConfig | None): Configuration for external LLM
	                API-based text generation. When provided, text generation will use
	                the API instead of the local model. Useful for LM Studio, Ollama,
	                OpenAI, etc.
	        """
	        if adversarial_model_params is None:
	            adversarial_model_params = {
	                "metrics": "AUC",
	                "max_depth": 2,
	                "max_bin": 100,
	                "n_estimators": 150,
	                "learning_rate": 0.02,
	                "random_state": 42,
	            }
	        if gen_params is None:
	            gen_params = {"batch_size": 45, "patience": 25, "epochs": 50, "llm": "distilgpt2"}
	        super().__init__(
	            gen_x_times=gen_x_times,
	            cat_cols=cat_cols,
	            bot_filter_quantile=bot_filter_quantile,
	            top_filter_quantile=top_filter_quantile,
	            is_post_process=is_post_process,
	            adversarial_model_params=adversarial_model_params,
	            pregeneration_frac=pregeneration_frac,
	            only_generated_data=only_generated_data,
	            gen_params=gen_params,
	        )
	        self.text_generating_columns = text_generating_columns
	        self.conditional_columns = conditional_columns
	        self.llm_api_config = llm_api_config
	        # Fallback name for the temporary target column, used until
	        # ``preprocess_data`` sees a real target frame.
	        if not hasattr(self, "TEMP_TARGET"):
	            self.TEMP_TARGET = "TEMP_TARGET"

	    @staticmethod
	    def preprocess_data_df(df) -> pd.DataFrame:
	        """Return ``df`` unchanged after checking that it is a DataFrame.

	        Raises:
	            ValueError: If ``df`` is not a ``pandas.DataFrame``.
	        """
	        # Check the type first: logging ``df.shape`` on an arbitrary object
	        # would raise AttributeError and hide the intended ValueError.
	        if not isinstance(df, pd.DataFrame):
	            raise ValueError(
	                f"Input dataframe is not a pandas DataFrame: got {type(df)}"
	            )
	        logging.info(f"Input shape: {df.shape}")
	        return df

	    def preprocess_data(
	        self, train, target, test_df
	    ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
	        """Validate the input frames and record the target column name.

	        ``target`` and ``test_df`` may be None; the other methods of this
	        class already treat None as "not provided". When ``target`` is None
	        the current ``TEMP_TARGET`` name is kept.

	        Raises:
	            ValueError: If a provided input is not a DataFrame, or if ``train``
	                already contains the target column or a ``test_similarity``
	                column (both names are used internally).
	        """
	        train = self.preprocess_data_df(train)
	        if target is not None:
	            target = self.preprocess_data_df(target)
	        if test_df is not None:
	            test_df = self.preprocess_data_df(test_df)
	        if target is not None:
	            self.TEMP_TARGET = target.columns[0]
	        if self.TEMP_TARGET in train.columns:
	            raise ValueError(
	                f"Input train dataframe already has '{self.TEMP_TARGET}' column, consider removing it"
	            )
	        if "test_similarity" in train.columns:
	            raise ValueError(
	                "Input train dataframe already have test_similarity, consider removing it"
	            )

	        return train, target, test_df

	    def generate_data(
	        self, train_df, target, test_df, only_generated_data
	    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
	        """Resample ``train_df`` (with its target) with replacement.

	        The sample fraction is ``1 + pregeneration_frac`` and the random state
	        is fixed to 42. Note that ``train_df`` gains the temporary target
	        column in place.

	        Returns:
	            Tuple of the resampled features and the matching target column.
	        """
	        if only_generated_data:
	            warnings.warn(
	                "For SamplerOriginal setting only_generated_data doesn't change anything, "
	                "because generated data sampled from the train!"
	            )
	        self._validate_data(train_df, target, test_df)
	        train_df[self.TEMP_TARGET] = target
	        generated_df = train_df.sample(
	            frac=(1 + self.pregeneration_frac), replace=True, random_state=42
	        )
	        generated_df = generated_df.reset_index(drop=True)

	        logging.info(
	            f"Generated shape: {generated_df.drop(self.TEMP_TARGET, axis=1).shape} "
	            f"and {generated_df[self.TEMP_TARGET].shape}"
	        )
	        return (
	            generated_df.drop(self.TEMP_TARGET, axis=1),
	            generated_df[self.TEMP_TARGET],
	        )

	    def postprocess_data(self, train_df, target, test_df):
	        """Drop generated rows that fall outside the value ranges seen in ``test_df``.

	        Non-categorical columns are limited to the
	        ``[bot_filter_quantile, top_filter_quantile]`` range of ``test_df``
	        (NaN values are kept); categorical columns are limited to the
	        categories present in ``test_df``. The step is skipped when
	        post-processing is disabled or ``test_df`` is None.

	        Raises:
	            ValueError: If fewer than 10 rows survive a filter.
	        """
	        if not self.is_post_process or test_df is None:
	            logging.info("Skipping postprocessing")
	            return train_df, target

	        self._validate_data(train_df, target, test_df)
	        train_df[self.TEMP_TARGET] = target

	        # Filter numerical columns
	        for col in test_df.columns:
	            if self.cat_cols is None or col not in self.cat_cols:
	                min_val = test_df[col].quantile(self.bot_filter_quantile)
	                max_val = test_df[col].quantile(self.top_filter_quantile)
	                in_range = (train_df[col] >= min_val) & (train_df[col] <= max_val)
	                train_df = train_df[train_df[col].isna() | in_range]

	                if train_df.shape[0] < 10:
	                    raise ValueError(f"Too few samples (<10) after filtering column {col}. "
	                                     f"Test data may be skewed. Filter range: [{min_val}, {max_val}]")

	        # Filter categorical columns
	        if self.cat_cols:
	            for col in self.cat_cols:
	                train_df = train_df[train_df[col].isin(test_df[col].unique())]
	                if train_df.shape[0] < 10:
	                    raise ValueError(f"Too few samples (<10) after filtering categorical column {col}")

	        logging.info(
	            f"Generated shapes after postprocessing: {train_df.drop(self.TEMP_TARGET, axis=1).shape} plus target")

	        result_df = train_df.reset_index(drop=True)
	        return (
	            result_df.drop(self.TEMP_TARGET, axis=1),
	            result_df[self.TEMP_TARGET]
	        )

	    def adversarial_filtering(self, train_df, target, test_df):
	        """Rank rows by the adversarial model's prediction and keep the top ones.

	        Rows are sorted by the ``test_similarity`` score in descending order.
	        The step is skipped when ``test_df`` is None.
	        """
	        if test_df is None:
	            logging.info("Skipping adversarial filtering, because test_df is None.")
	            return train_df, target
	        ad_model = AdversarialModel(
	            cat_cols=self.cat_cols, model_params=self.adversarial_model_params
	        )
	        self._validate_data(train_df, target, test_df)
	        train_df[self.TEMP_TARGET] = target
	        ad_model.adversarial_test(test_df, train_df.drop(self.TEMP_TARGET, axis=1))

	        train_df["test_similarity"] = ad_model.trained_model.predict(
	            train_df.drop(self.TEMP_TARGET, axis=1)
	        )
	        train_df.sort_values("test_similarity", ascending=False, inplace=True)
	        # NOTE(review): the row cap multiplies get_generated_shape(train_df) by
	        # the row count again; get_generated_shape is defined outside this
	        # file - confirm that this product is the intended cap.
	        train_df = train_df.head(self.get_generated_shape(train_df) * train_df.shape[0])
	        del ad_model

	        return (
	            train_df.drop(["test_similarity", self.TEMP_TARGET], axis=1).reset_index(
	                drop=True
	            ),
	            train_df[self.TEMP_TARGET].reset_index(drop=True),
	        )

	    @staticmethod
	    def _validate_data(train_df, target, test_df):
	        """Check minimum row counts and train/target row alignment.

	        Raises:
	            ValueError: If ``test_df`` is given and either frame has fewer
	                than 10 rows, or if ``target`` is given and its row count
	                differs from ``train_df``.
	        """
	        if test_df is not None:
	            if train_df.shape[0] < 10 or test_df.shape[0] < 10:
	                raise ValueError(
	                    f"Shape of train is {train_df.shape[0]} and test is {test_df.shape[0]}. "
	                    f"Both should be at least 10! Consider disabling adversarial filtering"
	                )
	        if target is not None:
	            if train_df.shape[0] != target.shape[0]:
	                raise ValueError(
	                    f"Shape mismatch: train_df has {train_df.shape[0]} rows "
	                    f"but target has {target.shape[0]} rows"
	                )

	    def handle_generated_data(self, train_df, generated_df, only_generated_data):
	        """
	        Align and optionally merge generated rows with the original training data.

	        The generated data is cast to the dtypes and column order of `train_df`
	        so that downstream models receive data with a consistent schema.

	        Args:
	            train_df (pd.DataFrame): Original training data used to infer the
	                schema and target column.
	            generated_df (pd.DataFrame or array-like): Newly generated
	                samples to be aligned with `train_df`.
	            only_generated_data (bool): If True, return only synthetic rows;
	                otherwise, append them to `train_df` before returning.

	        Returns:
	            Tuple[pd.DataFrame, pd.Series | pd.DataFrame]: Features and
	                corresponding target values.
	        """
	        generated_df = pd.DataFrame(generated_df)
	        generated_df.columns = train_df.columns

	        # Cast column by column so each generated column takes the dtype of
	        # the train column in the same position.
	        for column_name, column_dtype in zip(generated_df.columns, train_df.dtypes.values):
	            generated_df[column_name] = generated_df[column_name].astype(column_dtype)

	        if not only_generated_data:
	            train_df = pd.concat([train_df, generated_df]).reset_index(drop=True)
	            logging.info(
	                f"Generated shapes: {_drop_col_if_exist(train_df, self.TEMP_TARGET).shape} plus target"
	            )
	            return (
	                _drop_col_if_exist(train_df, self.TEMP_TARGET),
	                get_columns_if_exists(train_df, self.TEMP_TARGET),
	            )

	        logging.info(
	            f"Generated shapes: {_drop_col_if_exist(generated_df, self.TEMP_TARGET).shape} plus target"
	        )
	        return (
	            _drop_col_if_exist(generated_df, self.TEMP_TARGET),
	            get_columns_if_exists(generated_df, self.TEMP_TARGET),
	        )


	class SamplerGAN(SamplerOriginal):
	    """Sampler that fits a CTGAN synthesizer on the training data and samples from it."""

	    def check_params(self):
	        """Normalise ``gen_params`` in place before training.

	        The batch size is rounded up to the next multiple of 10, and missing
	        ``patience`` / ``epochs`` entries receive defaults of 25 and 50.
	        """
	        remainder = self.gen_params["batch_size"] % 10
	        if remainder != 0:
	            logging.warning(
	                f"Batch size should be divisible by 10, but got {self.gen_params['batch_size']}. Fixing it")
	            self.gen_params["batch_size"] += 10 - remainder

	        if "patience" not in self.gen_params:
	            logging.warning("patience param is not set for GAN params, setting default to 25")
	            self.gen_params["patience"] = 25

	        if "epochs" not in self.gen_params:
	            logging.warning("epochs param is not set for GAN params, setting default to 50")
	            self.gen_params["epochs"] = 50

	    def generate_data(
	        self, train_df, target, test_df, only_generated_data: bool
	    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
	        """Train CTGAN on ``train_df`` (plus target, if given) and sample new rows.

	        Returns:
	            Features and target as produced by ``handle_generated_data``.
	        """
	        self.check_params()
	        self._validate_data(train_df, target, test_df)
	        if target is not None:
	            train_df[self.TEMP_TARGET] = target
	        ctgan = CTGAN(batch_size=self.gen_params["batch_size"], patience=self.gen_params["patience"])
	        logging.info("training GAN")
	        discrete_columns = [] if self.cat_cols is None else self.cat_cols
	        ctgan.fit(train_df, discrete_columns, epochs=self.gen_params["epochs"])
	        logging.info("Finished training GAN")
	        # ``pregeneration_frac`` is documented as a float, so the product may
	        # be fractional; cast to int like the other samplers in this module.
	        n_rows = int(self.pregeneration_frac * self.get_generated_shape(train_df))
	        generated_df = ctgan.sample(n_rows)
	        return self.handle_generated_data(train_df, generated_df, only_generated_data)


	class SamplerDiffusion(SamplerOriginal):
	    """Sampler backed by a ForestDiffusion model fitted on the training data."""

	    def generate_data(
	        self, train_df, target, test_df, only_generated_data: bool
	    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
	        """Fit ForestDiffusion on ``train_df`` (plus target, if given) and generate rows.

	        Raises:
	            ImportError: If the optional ForestDiffusion package is missing.
	        """
	        self._validate_data(train_df, target, test_df)
	        if target is not None:
	            train_df[self.TEMP_TARGET] = target
	        logging.info("Fitting ForestDiffusion model")
	        if ForestDiffusionModel is None:
	            raise ImportError(
	                "ForestDiffusion is not installed. "
	                "Please install it: pip install ForestDiffusion"
	            )

	        training_matrix = train_df.to_numpy()
	        # The model is currently built the same way whether or not cat_cols
	        # is set.
	        # todo fix bug with cat cols
	        # cat_indexes=self.get_column_indexes(train_df, self.cat_cols),
	        forest_model = ForestDiffusionModel(
	            training_matrix,
	            label_y=None,
	            n_t=50,
	            duplicate_K=100,
	            diffusion_type='flow',
	            n_jobs=-1,
	        )
	        logging.info("Finished training ForestDiffusionModel")

	        rows_to_generate = int(self.gen_x_times * training_matrix.shape[0])
	        generated_df = forest_model.generate(batch_size=rows_to_generate)

	        return self.handle_generated_data(train_df, generated_df, only_generated_data)

	    @staticmethod
	    def get_column_indexes(df, column_names):
	        """Return the positional index of each named column in ``df``."""
	        positions = []
	        for name in column_names:
	            positions.append(df.columns.get_loc(name))
	        return positions


	class SamplerLLM(SamplerOriginal):
	    """Sampler that fine-tunes a GReaT language model on the table and samples from it.

	    When both ``text_generating_columns`` and ``conditional_columns`` are set,
	    rows are built one at a time with prompted text generation; otherwise the
	    fitted model's own ``sample`` method is used.
	    """

	    def check_params(self):
	        """Fill in missing LLM parameters and enforce at least 3 epochs."""
	        if "llm" not in self.gen_params:
	            logging.warning("llm param is not set for LLM params, setting default to 'distilgpt2'")
	            self.gen_params["llm"] = "distilgpt2"
	        if "max_length" not in self.gen_params:
	            logging.warning("max_length param is not set for LLM params, setting default to 500")
	            self.gen_params["max_length"] = 500

	        # A missing ``epochs`` key is treated like a too-small value instead
	        # of raising KeyError, matching how the other keys get defaults.
	        epochs = self.gen_params.get("epochs")
	        if epochs is None:
	            logging.warning("epochs param is not set for LLM params, setting default to 3")
	            self.gen_params["epochs"] = 3
	        elif epochs < 3:
	            logging.warning(
	                f"Current epoch={epochs} for LLM training is too low, setting to 3")
	            self.gen_params["epochs"] = 3

	    def _build_training_frame(self, train_df: pd.DataFrame, target: pd.DataFrame | None) -> pd.DataFrame:
	        """
	        Return a copy of the training frame with TEMP_TARGET attached when a target is provided.
	        """
	        current_train_df = train_df.copy()
	        if target is not None:
	            current_train_df[self.TEMP_TARGET] = target
	        return current_train_df

	    def _fit_great_model(self, current_train_df: pd.DataFrame):
	        """
	        Fit a GReaT model on the provided training frame and return the instance and inference device.

	        Raises:
	            ImportError: If the ``be_great`` package cannot be imported.
	        """
	        logging.info("Fitting LLM model")
	        is_fp16 = torch.cuda.is_available()
	        try:
	            from be_great import GReaT
	        except ImportError as exc:
	            raise ImportError(
	                "be_great library is not installed. Please install it to use LLMGenerator."
	            ) from exc

	        great_model_instance = GReaT(
	            llm=self.gen_params["llm"],
	            batch_size=self.gen_params["batch_size"],
	            epochs=self.gen_params["epochs"],
	            fp16=is_fp16,
	        )
	        great_model_instance.fit(current_train_df)
	        logging.info("Finished training LLM model")

	        device = "cuda" if torch.cuda.is_available() else "cpu"
	        return great_model_instance, device

	    def _conditional_text_generation(
	        self,
	        great_model_instance,
	        current_train_df: pd.DataFrame,
	        train_df: pd.DataFrame,
	        target: pd.DataFrame | None,
	        device: str,
	    ) -> pd.DataFrame:
	        """
	        Generate rows when text and conditional columns are specified.

	        For each row: conditional columns are drawn from their empirical
	        frequencies, the remaining non-text columns are imputed by the model,
	        and each text column is generated from a prompt, retrying up to 10
	        times to get a value not present in the training data.

	        Raises:
	            ValueError: If a text or conditional column is missing from the
	                training frame.
	        """
	        logging.info("Starting conditional generation of text columns.")
	        num_samples_to_generate = int(self.gen_x_times * train_df.shape[0])

	        original_unique_text_values: dict[str, set] = {}
	        for col in self.text_generating_columns:
	            if col not in current_train_df.columns:
	                raise ValueError(f"Text generating column '{col}' not found in training data.")
	            original_unique_text_values[col] = set(current_train_df[col].unique())

	        attribute_distributions: dict[str, pd.Series] = {}
	        for col in self.conditional_columns:
	            if col not in current_train_df.columns:
	                raise ValueError(f"Conditional column '{col}' not found in training data.")
	            attribute_distributions[col] = current_train_df[col].value_counts(normalize=True)

	        generated_rows: list[dict] = []
	        all_train_columns = current_train_df.columns.tolist()

	        for _ in range(num_samples_to_generate):
	            current_row_data: dict = {}

	            # 1) Draw the conditioning attributes.
	            for attr_col in self.conditional_columns:
	                dist = attribute_distributions[attr_col]
	                current_row_data[attr_col] = np.random.choice(dist.index, p=dist.values)

	            # 2) Let the model impute the remaining non-text columns.
	            row_template_for_impute = pd.DataFrame(columns=all_train_columns, index=[0])
	            for col in all_train_columns:
	                if col in current_row_data:
	                    row_template_for_impute.loc[0, col] = current_row_data[col]
	                elif col not in self.text_generating_columns:
	                    row_template_for_impute.loc[0, col] = np.nan

	            imputed_full_row_df = great_model_instance.impute(
	                row_template_for_impute.copy(),
	                max_length=self.gen_params.get("max_length", 500),
	            )

	            for col in all_train_columns:
	                if col not in self.text_generating_columns and col not in current_row_data:
	                    current_row_data[col] = imputed_full_row_df.loc[0, col]

	            # 3) Generate each text column from a prompt describing the row.
	            for text_col in self.text_generating_columns:
	                prompt_parts: list[str] = [
	                    f"{cond_col}: {current_row_data[cond_col]}"
	                    for cond_col in self.conditional_columns
	                ]
	                for other_col in all_train_columns:
	                    if (
	                        other_col not in self.text_generating_columns
	                        and other_col not in self.conditional_columns
	                        and other_col in current_row_data
	                    ):
	                        val_str = str(current_row_data[other_col])
	                        # Keep the prompt short: long values are truncated.
	                        if len(val_str) > 30:
	                            val_str = val_str[:27] + "..."
	                        prompt_parts.append(f"{other_col}: {val_str}")

	                prompt = ", ".join(prompt_parts) + f", Generate {text_col}: "

	                generated_text_candidate = None
	                max_retries = 10
	                for _retry_attempt in range(max_retries):
	                    generated_text_candidate = self._generate_via_prompt(
	                        prompt,
	                        great_model_instance,
	                        device=device,
	                    )
	                    if generated_text_candidate not in original_unique_text_values[text_col]:
	                        break
	                else:
	                    # Reached only when every attempt reproduced a known value.
	                    logging.warning(
	                        f"Max retries reached for generating novel text for {text_col}. Using last candidate."
	                    )
	                current_row_data[text_col] = generated_text_candidate

	            ordered_row = {col: current_row_data.get(col) for col in train_df.columns}
	            if target is not None and self.TEMP_TARGET in current_row_data:
	                ordered_row[self.TEMP_TARGET] = current_row_data[self.TEMP_TARGET]

	            generated_rows.append(ordered_row)

	        generated_df = pd.DataFrame(generated_rows)
	        return generated_df.reindex(columns=current_train_df.columns)

	    def _standard_llm_sampling(
	        self,
	        great_model_instance,
	        current_train_df: pd.DataFrame,
	        device: str,
	    ) -> pd.DataFrame:
	        """
	        Fallback sampling when no explicit text/conditional columns are provided.
	        """
	        logging.info("Starting standard LLM sampling.")
	        return great_model_instance.sample(
	            int(self.gen_x_times * current_train_df.shape[0]),
	            device=device,
	            max_length=self.gen_params["max_length"],
	        )

	    def generate_data(
	        self, train_df, target, test_df, only_generated_data: bool
	    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
	        """Fit the LLM on the training frame and generate synthetic rows.

	        Returns:
	            Features and target as produced by ``handle_generated_data``.
	        """
	        self._validate_data(train_df, target, test_df)
	        self.check_params()

	        current_train_df = self._build_training_frame(train_df, target)
	        great_model_instance, device = self._fit_great_model(current_train_df)

	        if self.text_generating_columns and self.conditional_columns:
	            generated_df = self._conditional_text_generation(
	                great_model_instance,
	                current_train_df=current_train_df,
	                train_df=train_df,
	                target=target,
	                device=device,
	            )
	        else:
	            generated_df = self._standard_llm_sampling(
	                great_model_instance,
	                current_train_df=current_train_df,
	                device=device,
	            )

	        # When a target is provided, ``current_train_df`` already includes the
	        # TEMP_TARGET column and represents the true training frame used for
	        # generation. Passing it to ``handle_generated_data`` keeps feature and
	        # target alignment consistent for both conditional and standard LLM
	        # sampling paths.
	        base_train_for_handling = current_train_df if target is not None else train_df
	        return self.handle_generated_data(base_train_for_handling, generated_df, only_generated_data)

	    def _generate_via_prompt(self, prompt: str, great_model_instance, device: str, max_tokens_to_generate=50) -> str:
	        """
	        Generate a short text completion from the underlying GReaT LLM.

	        Args:
	            prompt (str): Serialized row description used as generation context.
	            great_model_instance: Fitted GReaT instance providing `model` and
	                `tokenizer` attributes.
	            device (str): Target device for inference (for example, ``"cpu"``
	                or ``"cuda"``).
	            max_tokens_to_generate (int): Maximum number of new tokens to
	                sample from the model.

	        Returns:
	            str: Post-processed generated text. Returns an empty string if
	                generation fails.
	        """
	        llm_model = great_model_instance.model
	        tokenizer = great_model_instance.tokenizer

	        if llm_model is None or tokenizer is None:
	            logging.error("LLM model or tokenizer not available in GReaT instance.")
	            return ""  # Or raise an error

	        llm_model.to(device)

	        # Leave room for the new tokens inside the model's context window.
	        inputs = tokenizer(prompt, return_tensors="pt", truncation=True,
	                           max_length=tokenizer.model_max_length - max_tokens_to_generate)
	        input_ids = inputs.input_ids.to(device)
	        attention_mask = inputs.attention_mask.to(device)

	        try:
	            outputs = llm_model.generate(
	                input_ids,
	                attention_mask=attention_mask,
	                max_new_tokens=max_tokens_to_generate,
	                pad_token_id=tokenizer.eos_token_id,
	                do_sample=True,  # Enable sampling for more diverse outputs
	                temperature=0.7,  # Default temperature, can be tuned
	                top_k=50,  # Default top_k, can be tuned
	                top_p=0.95  # Default top_p, can be tuned
	            )
	            # Decode only the newly generated tokens, not the prompt.
	            generated_text = tokenizer.decode(outputs[0, input_ids.shape[1]:], skip_special_tokens=True)

	            # Keep the first line, cut at the first pipe character.
	            generated_text = generated_text.split('\n')[0].split('|')[0].strip()

	            return generated_text

	        except Exception as e:
	            # Deliberate best-effort: a failed generation yields an empty string.
	            logging.error(f"Error during text generation via prompt: {e}")
	            return ""  # Fallback or re-raise


	class SamplerBayesian(SamplerOriginal):
	    """Bayesian synthetic data generator using Gaussian Copula.

	    Numeric (non-boolean) columns are mapped to normal scores through their
	    rank-based empirical CDF, their covariance is estimated, and new rows are
	    drawn from the resulting multivariate normal and mapped back to observed
	    quantiles. All other columns are sampled independently from their
	    empirical frequencies.
	    """

	    def generate_data(
	        self, train_df, target, test_df, only_generated_data: bool
	    ):
	        """Fit the copula on ``train_df`` (plus target, if given) and sample rows.

	        Returns:
	            Features and target as produced by ``handle_generated_data``.
	        """
	        from scipy.stats import norm, rankdata

	        self._validate_data(train_df, target, test_df)

	        if target is not None:
	            # Work on a copy so the caller's frame does not gain the target.
	            train_df = train_df.copy()
	            train_df[self.TEMP_TARGET] = target

	        n_samples = int(self.gen_x_times * len(train_df))
	        # Booleans count as numeric in pandas, but adding noise would make
	        # nearly every value truthy after the cast back to bool, so they are
	        # sampled with the categorical columns instead.
	        num_cols = [
	            c for c in train_df.columns
	            if pd.api.types.is_numeric_dtype(train_df[c])
	            and not pd.api.types.is_bool_dtype(train_df[c])
	        ]
	        cat_cols_here = [c for c in train_df.columns if c not in num_cols]

	        generated_parts = {}

	        # --- Numerical columns: Gaussian copula ---
	        if num_cols:
	            num_data = train_df[num_cols].copy()
	            # Store marginals (empirical CDF via ranks)
	            uniform = pd.DataFrame(index=num_data.index, columns=num_cols,
	                                   dtype=float)
	            for col in num_cols:
	                vals = num_data[col].values.astype(float)
	                ranks = rankdata(vals, method="average")
	                # Map to (0, 1) open interval
	                uniform[col] = ranks / (len(ranks) + 1)

	            # Transform to standard normal
	            normal_data = uniform.apply(norm.ppf)
	            # Replace any inf/-inf caused by extreme ranks
	            normal_data.replace([np.inf, -np.inf], np.nan, inplace=True)
	            normal_data.fillna(0.0, inplace=True)

	            # Fit covariance
	            mean = normal_data.mean().values.copy()
	            cov = normal_data.cov().values.copy()
	            # Regularize covariance to ensure positive-definiteness
	            cov += np.eye(len(num_cols)) * 1e-6

	            # Sample from multivariate normal
	            z_samples = np.random.multivariate_normal(mean, cov, size=n_samples)

	            # Transform back through inverse CDF (quantile mapping)
	            for i, col in enumerate(num_cols):
	                sorted_vals = np.sort(num_data[col].dropna().values)
	                n_orig = len(sorted_vals)
	                if n_orig == 0:
	                    # Nothing observed to map onto: emit an all-NaN column
	                    # instead of indexing into an empty array.
	                    generated_parts[col] = np.full(n_samples, np.nan)
	                    continue
	                u = norm.cdf(z_samples[:, i])
	                # Map uniform samples to original quantiles
	                indices = np.clip(
	                    (u * n_orig).astype(int), 0, n_orig - 1
	                )
	                # Add small noise to avoid exact duplicates
	                base = sorted_vals[indices]
	                if n_orig > 1:
	                    scale = np.std(sorted_vals) * 0.01
	                    base = base + np.random.normal(0, scale, size=len(base))
	                if pd.api.types.is_integer_dtype(train_df[col]):
	                    # The later cast to an integer dtype truncates toward
	                    # zero (0.997 -> 0); round first so noise cannot shift
	                    # values systematically downward.
	                    base = np.rint(base)
	                generated_parts[col] = base

	        # --- Categorical columns: empirical frequency sampling ---
	        for col in cat_cols_here:
	            freq = train_df[col].value_counts(normalize=True)
	            if freq.empty:
	                # Column holds only missing values; there is nothing to sample.
	                generated_parts[col] = np.full(n_samples, np.nan, dtype=object)
	                continue
	            generated_parts[col] = np.random.choice(
	                freq.index, size=n_samples, p=freq.values
	            )

	        generated_df = pd.DataFrame(generated_parts, columns=train_df.columns)
	        return self.handle_generated_data(train_df, generated_df.values,
	                                          only_generated_data)


	# Wire up factory classes to their concrete sampler implementations.
	# The factories are declared above with ``_sampler_class = None``; the
	# attribute is filled in here, after every sampler class has been defined.
	OriginalGenerator._sampler_class = SamplerOriginal
	GANGenerator._sampler_class = SamplerGAN
	ForestDiffusionGenerator._sampler_class = SamplerDiffusion
	LLMGenerator._sampler_class = SamplerLLM
	BayesianGenerator._sampler_class = SamplerBayesian


	if __name__ == "__main__":
	    # Demo: run several generators on random integer data.
	    setup_logging(logging.DEBUG)
	    train_size = 75
	    feature_names = list("ABCD")
	    train = pd.DataFrame(np.random.randint(-10, 150, size=(train_size, 4)), columns=feature_names)
	    target = pd.DataFrame(np.random.randint(0, 2, size=(train_size, 1)), columns=list("Y"))
	    test = pd.DataFrame(np.random.randint(0, 100, size=(train_size, 4)), columns=feature_names)
	    logging.info(train)

	    heavy_gen_params = {"batch_size": 500, "patience": 25, "epochs": 500}
	    generators = [
	        OriginalGenerator(gen_x_times=15),
	        GANGenerator(gen_x_times=10, only_generated_data=False,
	                     gen_params=dict(heavy_gen_params)),
	        LLMGenerator(gen_params={"batch_size": 32, "epochs": 4, "llm": "distilgpt2", "max_length": 500}),
	        OriginalGenerator(gen_x_times=15),
	        GANGenerator(cat_cols=["A"], gen_x_times=20, only_generated_data=True),
	        ForestDiffusionGenerator(cat_cols=["A"], gen_x_times=10, only_generated_data=True),
	        ForestDiffusionGenerator(gen_x_times=15, only_generated_data=False,
	                                 gen_params=dict(heavy_gen_params)),
	    ]

	    for gen in generators:
	        # The LLM generator is exercised without a target column.
	        gen_target = None if isinstance(gen, LLMGenerator) else target
	        _sampler(gen, train, gen_target, test)

	    # Second demo: add a random date column, split it into date parts and
	    # run the GAN pipeline without a target.
	    min_date, max_date = pd.to_datetime('2019-01-01'), pd.to_datetime('2021-12-31')
	    span_in_days = (max_date - min_date).days + 1
	    day_offsets = np.random.randint(span_in_days, size=train_size)
	    train['Date'] = min_date + pd.to_timedelta(day_offsets, unit='d')
	    train = get_year_mnth_dt_from_date(train, 'Date')

	    date_generator = GANGenerator(
	        gen_x_times=1.1, cat_cols=['year'], bot_filter_quantile=0.001, top_filter_quantile=0.999,
	        is_post_process=True, pregeneration_frac=2, only_generated_data=False
	    )
	    new_train, new_target = date_generator.generate_data_pipe(
	        train.drop('Date', axis=1), None, train.drop('Date', axis=1)
	    )
	    new_train = collect_dates(new_train)