Spaces:
Configuration error
Configuration error
| import pandas as pd | |
| import numpy as np | |
| from datetime import datetime, timedelta | |
| import random | |
class DataGenerator:
    """Generate reproducible synthetic tabular data from a list of variable names.

    Each requested variable becomes one column. The distribution used for a
    column is picked by case-insensitive *substring* matching on the variable
    name (see ``generate``). The most recently generated frame is kept on
    ``full_dataset``.
    """

    # Fixed vocabularies for the categorical column kinds.
    _CATEGORIES = ['Electronics', 'Clothing', 'Home & Garden', 'Sports', 'Books']
    _CHANNELS = ['Email', 'Social Media', 'TV', 'Print', 'Online', 'Direct']
    _LOCATIONS = ['Urban', 'Suburban', 'Rural']

    def __init__(self):
        # Last DataFrame returned by generate(); None until generate() is called.
        self.full_dataset = None

    def _sample_column(self, variable, sample_size):
        """Draw ``sample_size`` values for one column, chosen by the variable name.

        Rules are tried in order and the first match wins. Matching is a plain
        substring test on the lower-cased name, so e.g. "usage" matches the
        'age' rule.
        """
        name = variable.lower()
        if 'age' in name:
            # Truncate to whole years, then clamp into the 18-80 range.
            ages = np.random.normal(35, 12, sample_size).astype(int)
            return np.clip(ages, 18, 80)
        if 'amount' in name or 'price' in name:
            return np.round(np.random.lognormal(4, 1, sample_size), 2)
        if 'category' in name:
            return np.random.choice(self._CATEGORIES, sample_size)
        if 'channel' in name:
            return np.random.choice(self._CHANNELS, sample_size)
        if 'location' in name:
            return np.random.choice(self._LOCATIONS, sample_size)
        if 'frequency' in name:
            # Shift by one so the smallest possible value is 1.
            return np.random.poisson(3, sample_size) + 1
        if 'satisfaction' in name or 'score' in name:
            return np.random.choice([1, 2, 3, 4, 5], sample_size,
                                    p=[0.05, 0.1, 0.2, 0.4, 0.25])
        if 'time' in name:
            # Truncate to an integer, then shift so the smallest value is 1.
            return np.random.exponential(7, sample_size).astype(int) + 1
        # Default to numeric data.
        return np.random.normal(50, 15, sample_size)

    def generate(self, variables, sample_size):
        """Generate sample data based on variables.

        Args:
            variables: Iterable of column names (strings). Each name selects a
                distribution via ``_sample_column``.
            sample_size: Number of rows to generate.

        Returns:
            A DataFrame with one column per variable plus an ``ID`` column
            holding 1..sample_size. A variable literally named ``'ID'`` is
            overwritten by that counter. The same frame is stored on
            ``self.full_dataset``.

        Note:
            Reseeds NumPy's global random state with 42 on every call, so
            repeated calls with the same arguments return identical data.
        """
        np.random.seed(42)  # For reproducibility

        # One RNG draw per variable, in the order given.
        data = {
            variable: self._sample_column(variable, sample_size)
            for variable in variables
        }

        # Add ID column
        data['ID'] = range(1, sample_size + 1)

        df = pd.DataFrame(data)
        self.full_dataset = df
        return df

    def get_full_dataset(self):
        """Return the full generated dataset (None before ``generate`` is called)."""
        return self.full_dataset

    def add_missing_values(self, df, missing_rate=0.05):
        """Return a copy of ``df`` with random cells replaced by NaN.

        For every column except ``'ID'``, ``int(len(df) * missing_rate)`` index
        labels are drawn without replacement (using NumPy's global random
        state) and set to NaN. ``df`` itself is not modified.

        Args:
            df: Source DataFrame.
            missing_rate: Fraction of rows to blank out per column, in [0, 1].

        Returns:
            A new DataFrame containing the injected missing values. Integer
            columns that receive NaN are returned as float64.

        Raises:
            ValueError: If ``missing_rate`` is outside [0, 1].
        """
        if not 0 <= missing_rate <= 1:
            raise ValueError(
                f"missing_rate must be between 0 and 1, got {missing_rate!r}"
            )

        df_with_missing = df.copy()
        n_missing = int(len(df) * missing_rate)

        for column in df.columns:
            if column == 'ID':
                continue

            missing_indices = np.random.choice(
                df.index,
                size=n_missing,
                replace=False
            )

            # NaN cannot be stored in an integer/bool column. Promote the
            # column explicitly instead of relying on pandas' deprecated
            # silent upcast inside .loc assignment. The isinstance guards skip
            # duplicated column labels (a DataFrame) and extension dtypes.
            values = df_with_missing[column]
            if (n_missing and isinstance(values, pd.Series)
                    and isinstance(values.dtype, np.dtype)):
                if values.dtype.kind in 'iu':
                    df_with_missing[column] = values.astype('float64')
                elif values.dtype.kind == 'b':
                    df_with_missing[column] = values.astype(object)

            df_with_missing.loc[missing_indices, column] = np.nan

        return df_with_missing