# BI_ANALYTICS / data_generator.py
# Uploaded by ratulsur ("Upload 13 files", commit 98bc1c2, verified)
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
class DataGenerator:
    """Generate synthetic tabular sample data driven by variable names.

    Each requested variable becomes one column; the kind of random data
    placed in it is chosen from keywords found in the variable's name
    (see ``_generate_column``). The most recently generated frame is kept
    on ``full_dataset``.
    """

    def __init__(self):
        # Last DataFrame returned by generate(); None until it is called.
        self.full_dataset = None

    def generate(self, variables, sample_size):
        """Generate sample data based on variables.

        Args:
            variables: Iterable of column names. Keywords inside each name
                (matched case-insensitively, as substrings) select the
                distribution used for that column.
            sample_size: Number of rows to generate.

        Returns:
            A DataFrame with one column per variable plus an ``ID`` column
            numbered ``1..sample_size``. A variable that is itself named
            ``'ID'`` is overwritten by that counter.

        Note:
            This reseeds NumPy's *global* random state with 42 on every
            call, so repeated calls with the same arguments return the same
            data, and later draws from ``np.random`` elsewhere in the
            process are affected as well.
        """
        np.random.seed(42)  # For reproducibility

        # Columns are drawn in the order given so the seeded random stream
        # is consumed deterministically.
        data = {}
        for variable in variables:
            data[variable] = self._generate_column(variable.lower(), sample_size)

        # Add ID column
        data['ID'] = range(1, sample_size + 1)

        df = pd.DataFrame(data)

        # Store full dataset (the same object that is returned, not a copy).
        self.full_dataset = df
        return df

    @staticmethod
    def _generate_column(name, sample_size):
        """Draw ``sample_size`` values for the column whose lowercased name is ``name``.

        The first matching keyword wins, checked in this order: age,
        amount/price, category, channel, location, frequency,
        satisfaction/score, time; anything else gets generic numeric data.
        """
        if 'age' in name:
            # Normal(35, 12), truncated to whole numbers, then limited to 18..80.
            ages = np.random.normal(35, 12, sample_size).astype(int)
            return np.clip(ages, 18, 80)
        if 'amount' in name or 'price' in name:
            # Log-normal values rounded to two decimals.
            return np.round(np.random.lognormal(4, 1, sample_size), 2)
        if 'category' in name:
            categories = ['Electronics', 'Clothing', 'Home & Garden', 'Sports', 'Books']
            return np.random.choice(categories, sample_size)
        if 'channel' in name:
            channels = ['Email', 'Social Media', 'TV', 'Print', 'Online', 'Direct']
            return np.random.choice(channels, sample_size)
        if 'location' in name:
            locations = ['Urban', 'Suburban', 'Rural']
            return np.random.choice(locations, sample_size)
        if 'frequency' in name:
            # Poisson(3) shifted by one so the minimum is 1.
            return np.random.poisson(3, sample_size) + 1
        if 'satisfaction' in name or 'score' in name:
            # 1-5 rating, weighted towards 4.
            return np.random.choice([1, 2, 3, 4, 5], sample_size,
                                    p=[0.05, 0.1, 0.2, 0.4, 0.25])
        if 'time' in name:
            # Exponential(scale=7) truncated to whole numbers, minimum 1.
            return np.random.exponential(7, sample_size).astype(int) + 1
        # Default to numeric data
        return np.random.normal(50, 15, sample_size)

    def get_full_dataset(self):
        """Return the full generated dataset (None if generate() was never called)."""
        return self.full_dataset

    def add_missing_values(self, df, missing_rate=0.05):
        """Add missing values to simulate real data.

        For every column except ``'ID'``, ``int(len(df) * missing_rate)``
        rows are picked at random (independently per column, without
        replacement) and set to NaN. Rows are drawn from NumPy's global
        random state.

        Args:
            df: Source DataFrame; it is not modified.
            missing_rate: Fraction of rows to blank per column, in [0, 1].

        Returns:
            A copy of ``df`` with the missing values inserted. Integer
            columns that receive NaN are returned as float columns.

        Raises:
            ValueError: If ``missing_rate`` is outside [0, 1].
        """
        if not 0 <= missing_rate <= 1:
            raise ValueError(
                f"missing_rate must be between 0 and 1, got {missing_rate!r}"
            )

        df_with_missing = df.copy()
        row_count = len(df)
        n_missing = int(row_count * missing_rate)

        for column in df.columns:
            if column == 'ID':
                continue
            # Choose row *positions* rather than index labels so a
            # non-unique index cannot blank more rows than requested.
            positions = np.random.choice(row_count, size=n_missing, replace=False)
            hide = np.zeros(row_count, dtype=bool)
            hide[positions] = True
            # mask() upcasts integer columns to float instead of assigning
            # NaN into an int column, which newer pandas deprecates.
            df_with_missing[column] = df_with_missing[column].mask(hide)

        return df_with_missing