# insightgenai / sample_data_generator.py
# mohsinbhatti's picture
# Initial commit - InsightGenAI files
# e478478
"""
Sample Data Generator - InsightGenAI
====================================
Generate sample datasets for testing the application.
Author: InsightGenAI Team
Version: 1.0.0
"""
import pandas as pd
import numpy as np
from typing import Optional
import argparse
def generate_classification_dataset(n_samples: int = 1000,
                                    n_features: int = 10,
                                    n_classes: int = 2,
                                    noise: float = 0.1,
                                    output_path: Optional[str] = None) -> pd.DataFrame:
    """
    Generate a synthetic classification dataset.

    The frame holds ``n_features`` numeric columns (``feature_0`` ...), an
    integer ``target`` column, two categorical columns (``category`` and
    ``region``) and a free-text ``feedback`` column. About 5% of the
    non-target cells are replaced with missing values; ``target`` is always
    complete so the labels stay usable.

    Note: this reseeds NumPy's global random generator with 42, so repeated
    calls with the same arguments return the same data.

    Args:
        n_samples: Number of samples
        n_features: Number of features (must be at least 1)
        n_classes: Number of classes
        noise: Noise level (scale of the Gaussian noise added to the logits)
        output_path: Path to save CSV (optional)

    Returns:
        pd.DataFrame: Generated dataset

    Raises:
        ValueError: If ``n_features`` is less than 1.
    """
    if n_features < 1:
        # The target is derived from the leading features, so at least one
        # feature column is required.
        raise ValueError("n_features must be at least 1")

    np.random.seed(42)

    # Generate features: the first half are standard-normal floats, the
    # rest are integers drawn from [0, 100).
    data = {}
    for i in range(n_features):
        if i < n_features // 2:
            data[f'feature_{i}'] = np.random.randn(n_samples)
        else:
            data[f'feature_{i}'] = np.random.randint(0, 100, n_samples)

    # Generate target with some relationship to features: a random linear
    # combination of (at most) the first three features plus Gaussian noise.
    X = np.column_stack([data[f'feature_{i}'] for i in range(min(3, n_features))])
    weights = np.random.randn(X.shape[1])
    logits = X @ weights + np.random.randn(n_samples) * noise
    if n_classes == 2:
        # Splitting at the median yields two balanced classes.
        target = (logits > np.median(logits)).astype(int)
    else:
        # Equal-width bins over the logit range, labelled 0..n_classes-1.
        target = pd.cut(logits, bins=n_classes, labels=range(n_classes)).astype(int)
    data['target'] = target

    # Add some categorical features
    data['category'] = np.random.choice(['A', 'B', 'C'], n_samples)
    data['region'] = np.random.choice(['North', 'South', 'East', 'West'], n_samples)

    # Add a text column
    text_templates = [
        "Customer reported satisfaction with the product.",
        "User experienced issues with the interface.",
        "Positive feedback received from client.",
        "Technical difficulties encountered during usage.",
        "Excellent performance and reliability noted."
    ]
    data['feedback'] = np.random.choice(text_templates, n_samples)

    # Add missing values (5% random) to every column except the target.
    # The mask is still drawn for the full frame so the random stream (and
    # therefore the missing-value pattern of the other columns) is unchanged.
    df = pd.DataFrame(data)
    mask = np.random.random(df.shape) < 0.05
    mask[:, df.columns.get_loc('target')] = False
    df = df.mask(mask)
    # Masking may upcast integer columns to float; restore the integer labels.
    df['target'] = target

    if output_path:
        df.to_csv(output_path, index=False)
        print(f"Classification dataset saved to {output_path}")
    return df
def generate_regression_dataset(n_samples: int = 1000,
                                n_features: int = 8,
                                noise: float = 0.5,
                                output_path: Optional[str] = None) -> pd.DataFrame:
    """
    Build a synthetic dataset for regression experiments.

    The result has ``n_features`` numeric columns (``feature_0`` ...), a
    ``target`` that is a random linear combination of all features plus
    Gaussian noise, a ``category`` column and a ``date`` column of
    consecutive days starting at 2020-01-01. NumPy's global random generator
    is reseeded with 42 on every call.

    Args:
        n_samples: Number of rows to generate
        n_features: Number of numeric feature columns
        noise: Multiplier applied to the Gaussian noise added to the target
        output_path: Where to write the CSV; nothing is written when empty

    Returns:
        pd.DataFrame: The generated dataset
    """
    np.random.seed(42)

    # First half of the features: standard-normal floats; second half:
    # integers drawn from [1, 100).
    n_gaussian = n_features // 2
    columns = {}
    for idx in range(n_features):
        if idx < n_gaussian:
            values = np.random.randn(n_samples)
        else:
            values = np.random.randint(1, 100, n_samples)
        columns[f'feature_{idx}'] = values

    # Linear target: design matrix times random coefficients, plus noise.
    design = np.column_stack(list(columns.values()))
    coefficients = np.random.randn(n_features)
    jitter = np.random.randn(n_samples) * noise
    columns['target'] = design @ coefficients + jitter

    # Non-numeric extras: a three-level category and a daily date index.
    columns['category'] = np.random.choice(['Type1', 'Type2', 'Type3'], n_samples)
    columns['date'] = pd.date_range(start='2020-01-01', periods=n_samples, freq='D')

    frame = pd.DataFrame(columns)
    if output_path:
        frame.to_csv(output_path, index=False)
        print(f"Regression dataset saved to {output_path}")
    return frame
def generate_customer_churn_dataset(n_samples: int = 2000,
                                    output_path: Optional[str] = None) -> pd.DataFrame:
    """
    Build a synthetic customer-churn dataset.

    Each row describes one customer (demographics, contract and service
    attributes) plus a binary ``churn`` label. The churn probability starts
    at 0.3 and is raised for short tenure, month-to-month contracts,
    electronic-check payment and missing tech support, then perturbed with
    Gaussian noise and clipped to [0, 1]. NumPy's global random generator is
    reseeded with 42 on every call.

    Args:
        n_samples: Number of customers to generate
        output_path: Where to write the CSV; nothing is written when empty

    Returns:
        pd.DataFrame: The generated dataset
    """
    np.random.seed(42)

    yes_no = ['Yes', 'No']

    # Columns are drawn one after another; the order matters because every
    # draw advances the shared random stream.
    records = {'customer_id': range(1, n_samples + 1)}
    records['age'] = np.random.randint(18, 80, n_samples)
    records['gender'] = np.random.choice(['Male', 'Female'], n_samples)
    records['tenure'] = np.random.randint(0, 72, n_samples)
    records['monthly_charges'] = np.random.uniform(20, 120, n_samples)
    records['total_charges'] = np.random.uniform(100, 8000, n_samples)
    records['contract'] = np.random.choice(
        ['Month-to-month', 'One year', 'Two year'], n_samples
    )
    records['payment_method'] = np.random.choice(
        ['Electronic check', 'Mailed check', 'Bank transfer', 'Credit card'],
        n_samples,
    )
    records['internet_service'] = np.random.choice(['DSL', 'Fiber optic', 'No'], n_samples)
    records['tech_support'] = np.random.choice(yes_no, n_samples)
    records['online_security'] = np.random.choice(yes_no, n_samples)
    records['paperless_billing'] = np.random.choice(yes_no, n_samples)

    # Risk factors that push the churn probability above the 0.3 baseline.
    short_tenure = records['tenure'] < 12
    monthly_contract = records['contract'] == 'Month-to-month'
    pays_by_echeck = records['payment_method'] == 'Electronic check'
    lacks_support = records['tech_support'] == 'No'
    jitter = np.random.randn(n_samples) * 0.1

    probability = 0.3 + short_tenure * 0.2 + monthly_contract * 0.15 + pays_by_echeck * 0.1 + lacks_support * 0.1 + jitter
    probability = np.clip(probability, 0, 1)

    # A customer churns when a uniform draw falls below their probability.
    draws = np.random.random(n_samples)
    records['churn'] = (draws < probability).astype(int)

    frame = pd.DataFrame(records)
    if output_path:
        frame.to_csv(output_path, index=False)
        print(f"Customer churn dataset saved to {output_path}")
    return frame
def generate_housing_dataset(n_samples: int = 1500,
                             output_path: Optional[str] = None) -> pd.DataFrame:
    """
    Generate a housing price prediction dataset.

    Each row describes one house (size, rooms, quality scores, build and
    renovation years, zipcode) plus an integer ``price``. The price is a
    linear function of several attributes with Gaussian noise, floored at
    50000. ``yr_renovated`` is either 0 or a year that is not earlier than
    ``yr_built``.

    Note: this reseeds NumPy's global random generator with 42, so repeated
    calls with the same arguments return the same data.

    Args:
        n_samples: Number of samples
        output_path: Path to save CSV (optional)

    Returns:
        pd.DataFrame: Generated dataset
    """
    np.random.seed(42)
    data = {
        'sqft_living': np.random.randint(500, 5000, n_samples),
        'sqft_lot': np.random.randint(1000, 50000, n_samples),
        'bedrooms': np.random.randint(1, 6, n_samples),
        'bathrooms': np.random.randint(1, 5, n_samples),
        'floors': np.random.randint(1, 4, n_samples),
        'waterfront': np.random.choice([0, 1], n_samples, p=[0.95, 0.05]),
        'view': np.random.randint(0, 5, n_samples),
        'condition': np.random.randint(1, 6, n_samples),
        'grade': np.random.randint(1, 14, n_samples),
        'yr_built': np.random.randint(1900, 2024, n_samples),
        'yr_renovated': np.random.choice([0] + list(range(1950, 2024)), n_samples),
        'zipcode': np.random.choice([98101, 98102, 98103, 98104, 98105,
                                     98106, 98107, 98108, 98109, 98112], n_samples),
    }

    # The renovation year is drawn independently of the build year, so it can
    # land before the house existed. Reset those rows to 0 (the non-year
    # value already present in the choice list) instead of redrawing, which
    # keeps the random stream and every other column unchanged.
    built = data['yr_built']
    renovated = data['yr_renovated']
    impossible = (renovated != 0) & (renovated < built)
    data['yr_renovated'] = np.where(impossible, 0, renovated)

    # Generate price based on features: a linear model with an age penalty
    # of 1000 per year (relative to 2024) and Gaussian noise.
    base_price = 50000
    price = (
        base_price +
        data['sqft_living'] * 200 +
        data['sqft_lot'] * 2 +
        data['bedrooms'] * 25000 +
        data['bathrooms'] * 35000 +
        data['waterfront'] * 500000 +
        data['view'] * 50000 +
        data['grade'] * 20000 +
        (2024 - built) * (-1000) +
        np.random.randn(n_samples) * 50000
    )
    # Floor the price so noise cannot produce implausibly cheap houses.
    data['price'] = np.maximum(price, 50000).astype(int)

    df = pd.DataFrame(data)
    if output_path:
        df.to_csv(output_path, index=False)
        print(f"Housing dataset saved to {output_path}")
    return df
def main():
    """
    Command-line entry point.

    Parses the dataset type, the sample count (``-n/--samples``) and the
    output path (``-o/--output``), then runs the matching generator. When no
    output path is given the CSV is written to
    ``sample_<dataset_type>_data.csv``.
    """
    cli = argparse.ArgumentParser(
        description='Generate sample datasets for InsightGenAI testing'
    )
    cli.add_argument(
        'dataset_type',
        choices=['classification', 'regression', 'churn', 'housing'],
        help='Type of dataset to generate',
    )
    cli.add_argument(
        '-n', '--samples',
        type=int,
        default=1000,
        help='Number of samples (default: 1000)',
    )
    cli.add_argument(
        '-o', '--output',
        type=str,
        default=None,
        help='Output file path',
    )
    options = cli.parse_args()

    # Fall back to a file name derived from the dataset type.
    destination = options.output
    if destination is None:
        destination = f'sample_{options.dataset_type}_data.csv'

    # Map each CLI choice to the generator that produces it.
    generators = {
        'classification': generate_classification_dataset,
        'regression': generate_regression_dataset,
        'churn': generate_customer_churn_dataset,
        'housing': generate_housing_dataset,
    }
    generators[options.dataset_type](n_samples=options.samples, output_path=destination)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()