from datetime import datetime, timedelta
import random

import numpy as np
import pandas as pd


class DataGenerator:
    """Generate synthetic tabular sample data from a list of variable names.

    The kind of data produced for each column is chosen by substring
    matching on the lower-cased variable name (see ``_sample_column``).
    The most recently generated DataFrame is kept on ``full_dataset``.
    """

    def __init__(self):
        # Last DataFrame produced by generate(); None until generate() runs.
        self.full_dataset = None

    @staticmethod
    def _sample_column(variable, sample_size):
        """Draw ``sample_size`` values for one column from the global NumPy RNG.

        The branches are tested in order and the first substring match on
        the lower-cased name wins, so a name such as ``"average_amount"``
        is treated as an age column because it contains ``"age"``.
        """
        name = variable.lower()

        if 'age' in name:
            # astype(int) truncates toward zero before clamping to 18..80.
            ages = np.random.normal(35, 12, sample_size).astype(int)
            return np.clip(ages, 18, 80)
        if 'amount' in name or 'price' in name:
            return np.round(np.random.lognormal(4, 1, sample_size), 2)
        if 'category' in name:
            categories = ['Electronics', 'Clothing', 'Home & Garden', 'Sports', 'Books']
            return np.random.choice(categories, sample_size)
        if 'channel' in name:
            channels = ['Email', 'Social Media', 'TV', 'Print', 'Online', 'Direct']
            return np.random.choice(channels, sample_size)
        if 'location' in name:
            locations = ['Urban', 'Suburban', 'Rural']
            return np.random.choice(locations, sample_size)
        if 'frequency' in name:
            # +1 shifts the Poisson draw so the minimum value is 1.
            return np.random.poisson(3, sample_size) + 1
        if 'satisfaction' in name or 'score' in name:
            return np.random.choice(
                [1, 2, 3, 4, 5], sample_size, p=[0.05, 0.1, 0.2, 0.4, 0.25]
            )
        if 'time' in name:
            # +1 shifts the truncated exponential draw so the minimum is 1.
            return np.random.exponential(7, sample_size).astype(int) + 1

        # Default to numeric data.
        return np.random.normal(50, 15, sample_size)

    def generate(self, variables, sample_size):
        """Generate sample data based on variables.

        Args:
            variables: Iterable of column names; each name selects its
                distribution by substring match (see ``_sample_column``).
            sample_size: Number of rows to generate.

        Returns:
            A DataFrame with one column per variable plus a trailing
            ``'ID'`` column numbered 1..sample_size. A variable literally
            named ``'ID'`` is overwritten by that column. The same object
            is stored on ``self.full_dataset``.

        Note:
            The global NumPy RNG is re-seeded with 42 on every call, so
            identical arguments produce identical data.
        """
        np.random.seed(42)  # For reproducibility

        data = {}
        for variable in variables:
            data[variable] = self._sample_column(variable, sample_size)

        # Add ID column
        data['ID'] = range(1, sample_size + 1)

        df = pd.DataFrame(data)
        self.full_dataset = df
        return df

    def get_full_dataset(self):
        """Return the full generated dataset (None before generate() runs)."""
        return self.full_dataset

    def add_missing_values(self, df, missing_rate=0.05):
        """Add missing values to simulate real data.

        For every column except ``'ID'``, ``int(len(df) * missing_rate)``
        distinct index labels are drawn from the global NumPy RNG and the
        corresponding cells are set to NaN in a copy of ``df``.

        Args:
            df: Source DataFrame; it is not modified.
            missing_rate: Fraction of rows to blank per column, in [0, 1].

        Returns:
            A copy of ``df`` with NaN values inserted.

        Raises:
            ValueError: If ``missing_rate`` is outside [0, 1].
        """
        if not 0 <= missing_rate <= 1:
            raise ValueError(
                f"missing_rate must be between 0 and 1, got {missing_rate!r}"
            )

        df_with_missing = df.copy()
        n_missing = int(len(df) * missing_rate)

        for column in df.columns:
            if column == 'ID':
                continue

            missing_indices = np.random.choice(
                df.index, size=n_missing, replace=False
            )

            if n_missing:
                # NaN cannot be stored in NumPy int/bool columns. Cast
                # explicitly instead of relying on pandas' deprecated
                # silent upcast during assignment; the resulting dtypes
                # (float64 / object) are the ones that upcast produced.
                col_dtype = df_with_missing[column].dtype
                if isinstance(col_dtype, np.dtype):
                    if col_dtype.kind in 'iu':
                        df_with_missing[column] = (
                            df_with_missing[column].astype('float64')
                        )
                    elif col_dtype.kind == 'b':
                        df_with_missing[column] = (
                            df_with_missing[column].astype(object)
                        )

            df_with_missing.loc[missing_indices, column] = np.nan

        return df_with_missing