Spaces:

ratulsur
/

BI_ANALYTICS

Configuration error

App Files Files Community

BI_ANALYTICS / data_generator.py

ratulsur

Upload 13 files

98bc1c2 verified 6 months ago

raw

history blame contribute delete

3.07 kB

	import pandas as pd
	import numpy as np
	from datetime import datetime, timedelta
	import random

	class DataGenerator:
	    """Build synthetic tabular datasets whose columns are inferred from names.

	    Each requested variable name is matched against a list of keywords
	    (``age``, ``amount``/``price``, ``category``, ...) to pick a random
	    distribution for that column. The most recently generated DataFrame is
	    kept on ``full_dataset``.
	    """

	    def __init__(self):
	        # Last DataFrame produced by generate(); None until it is first called.
	        self.full_dataset = None

	    def generate(self, variables, sample_size):
	        """Generate sample data based on variables.

	        NumPy's global random state is re-seeded with 42 on every call, so
	        the same ``variables`` and ``sample_size`` always give the same data.

	        Args:
	            variables: Iterable of column names. Each name selects a
	                distribution by keyword (see ``_generate_column``).
	            sample_size: Number of rows to generate.

	        Returns:
	            A DataFrame with one column per variable plus an ``ID`` column
	            numbered from 1 to ``sample_size``. The same object is stored
	            on ``self.full_dataset``.
	        """
	        np.random.seed(42)  # For reproducibility

	        data = {}
	        for variable in variables:
	            # Lower-case once instead of once per keyword test.
	            data[variable] = self._generate_column(variable.lower(), sample_size)

	        # Assigned after the loop, so a requested variable literally named
	        # 'ID' is replaced by the sequential identifiers.
	        data['ID'] = range(1, sample_size + 1)

	        df = pd.DataFrame(data)
	        self.full_dataset = df
	        return df

	    @staticmethod
	    def _generate_column(name, sample_size):
	        """Return ``sample_size`` random values for the lower-cased ``name``.

	        Keywords are matched as substrings and tested in the order below;
	        the first hit wins. NOTE: because matching is by substring, a name
	        such as ``average_spend`` contains ``age`` and is treated as an age.
	        """
	        if 'age' in name:
	            # Truncate to whole years, then force into the 18-80 range.
	            ages = np.random.normal(35, 12, sample_size).astype(int)
	            return np.clip(ages, 18, 80)

	        if 'amount' in name or 'price' in name:
	            return np.round(np.random.lognormal(4, 1, sample_size), 2)

	        if 'category' in name:
	            categories = ['Electronics', 'Clothing', 'Home & Garden', 'Sports', 'Books']
	            return np.random.choice(categories, sample_size)

	        if 'channel' in name:
	            channels = ['Email', 'Social Media', 'TV', 'Print', 'Online', 'Direct']
	            return np.random.choice(channels, sample_size)

	        if 'location' in name:
	            locations = ['Urban', 'Suburban', 'Rural']
	            return np.random.choice(locations, sample_size)

	        if 'frequency' in name:
	            # +1 keeps every count at 1 or more.
	            return np.random.poisson(3, sample_size) + 1

	        if 'satisfaction' in name or 'score' in name:
	            # 1-5 rating with the weights given in p.
	            return np.random.choice([1, 2, 3, 4, 5], sample_size,
	                                    p=[0.05, 0.1, 0.2, 0.4, 0.25])

	        if 'time' in name:
	            # +1 keeps every value at 1 or more.
	            return np.random.exponential(7, sample_size).astype(int) + 1

	        # Default to numeric data
	        return np.random.normal(50, 15, sample_size)

	    def get_full_dataset(self):
	        """Return the full generated dataset (None before generate() is called)."""
	        return self.full_dataset

	    def add_missing_values(self, df, missing_rate=0.05):
	        """Add missing values to simulate real data.

	        For every column except ``ID``, ``int(len(df) * missing_rate)``
	        distinct rows are drawn with NumPy's global random state and set
	        to missing. Integer columns become float so they can hold NaN.

	        Args:
	            df: DataFrame to copy; it is not modified.
	            missing_rate: Fraction of rows to blank per column, in [0, 1].

	        Returns:
	            A copy of ``df`` with missing values inserted.

	        Raises:
	            ValueError: If ``missing_rate`` is outside [0, 1].
	        """
	        if not 0 <= missing_rate <= 1:
	            raise ValueError(
	                "missing_rate must be between 0 and 1, got %r" % (missing_rate,)
	            )

	        df_with_missing = df.copy()
	        row_count = len(df)
	        missing_count = int(row_count * missing_rate)

	        for column in df.columns:
	            if column == 'ID':
	                continue

	            # Draw row positions rather than index labels so that exactly
	            # missing_count rows are blanked even when the index has
	            # duplicate labels. The draw happens even for a zero count so
	            # the random stream advances as it always did.
	            positions = np.random.choice(
	                row_count,
	                size=missing_count,
	                replace=False
	            )
	            if missing_count == 0:
	                continue

	            blank = np.zeros(row_count, dtype=bool)
	            blank[positions] = True
	            # Series.mask returns a new, upcast column instead of writing
	            # NaN into an integer column in place, which pandas deprecates.
	            df_with_missing[column] = df_with_missing[column].mask(blank)

	        return df_with_missing