""" Sample Data Generator - InsightGenAI ==================================== Generate sample datasets for testing the application. Author: InsightGenAI Team Version: 1.0.0 """ import pandas as pd import numpy as np from typing import Optional import argparse def generate_classification_dataset(n_samples: int = 1000, n_features: int = 10, n_classes: int = 2, noise: float = 0.1, output_path: Optional[str] = None) -> pd.DataFrame: """ Generate a synthetic classification dataset. Args: n_samples: Number of samples n_features: Number of features n_classes: Number of classes noise: Noise level output_path: Path to save CSV (optional) Returns: pd.DataFrame: Generated dataset """ np.random.seed(42) # Generate features data = {} for i in range(n_features): if i < n_features // 2: data[f'feature_{i}'] = np.random.randn(n_samples) else: data[f'feature_{i}'] = np.random.randint(0, 100, n_samples) # Generate target with some relationship to features X = np.column_stack([data[f'feature_{i}'] for i in range(min(3, n_features))]) weights = np.random.randn(X.shape[1]) logits = X @ weights + np.random.randn(n_samples) * noise if n_classes == 2: target = (logits > np.median(logits)).astype(int) else: target = pd.cut(logits, bins=n_classes, labels=range(n_classes)).astype(int) data['target'] = target # Add some categorical features data['category'] = np.random.choice(['A', 'B', 'C'], n_samples) data['region'] = np.random.choice(['North', 'South', 'East', 'West'], n_samples) # Add a text column text_templates = [ "Customer reported satisfaction with the product.", "User experienced issues with the interface.", "Positive feedback received from client.", "Technical difficulties encountered during usage.", "Excellent performance and reliability noted." ] data['feedback'] = np.random.choice(text_templates, n_samples) # Add missing values (5% random) df = pd.DataFrame(data) mask = np.random.random(df.shape) < 0.05 df = df.mask(mask) if output_path: df.to_csv(output_path, index=False) print(f"Classification dataset saved to {output_path}") return df def generate_regression_dataset(n_samples: int = 1000, n_features: int = 8, noise: float = 0.5, output_path: Optional[str] = None) -> pd.DataFrame: """ Generate a synthetic regression dataset. Args: n_samples: Number of samples n_features: Number of features noise: Noise level output_path: Path to save CSV (optional) Returns: pd.DataFrame: Generated dataset """ np.random.seed(42) # Generate features data = {} for i in range(n_features): if i < n_features // 2: data[f'feature_{i}'] = np.random.randn(n_samples) else: data[f'feature_{i}'] = np.random.randint(1, 100, n_samples) # Generate target with linear relationship X = np.column_stack([data[f'feature_{i}'] for i in range(n_features)]) weights = np.random.randn(n_features) target = X @ weights + np.random.randn(n_samples) * noise data['target'] = target # Add categorical features data['category'] = np.random.choice(['Type1', 'Type2', 'Type3'], n_samples) # Add datetime column data['date'] = pd.date_range(start='2020-01-01', periods=n_samples, freq='D') df = pd.DataFrame(data) if output_path: df.to_csv(output_path, index=False) print(f"Regression dataset saved to {output_path}") return df def generate_customer_churn_dataset(n_samples: int = 2000, output_path: Optional[str] = None) -> pd.DataFrame: """ Generate a customer churn prediction dataset. Args: n_samples: Number of samples output_path: Path to save CSV (optional) Returns: pd.DataFrame: Generated dataset """ np.random.seed(42) data = { 'customer_id': range(1, n_samples + 1), 'age': np.random.randint(18, 80, n_samples), 'gender': np.random.choice(['Male', 'Female'], n_samples), 'tenure': np.random.randint(0, 72, n_samples), 'monthly_charges': np.random.uniform(20, 120, n_samples), 'total_charges': np.random.uniform(100, 8000, n_samples), 'contract': np.random.choice(['Month-to-month', 'One year', 'Two year'], n_samples), 'payment_method': np.random.choice(['Electronic check', 'Mailed check', 'Bank transfer', 'Credit card'], n_samples), 'internet_service': np.random.choice(['DSL', 'Fiber optic', 'No'], n_samples), 'tech_support': np.random.choice(['Yes', 'No'], n_samples), 'online_security': np.random.choice(['Yes', 'No'], n_samples), 'paperless_billing': np.random.choice(['Yes', 'No'], n_samples), } # Generate churn based on features churn_prob = ( 0.3 + (data['tenure'] < 12) * 0.2 + (data['contract'] == 'Month-to-month') * 0.15 + (data['payment_method'] == 'Electronic check') * 0.1 + (data['tech_support'] == 'No') * 0.1 + np.random.randn(n_samples) * 0.1 ) churn_prob = np.clip(churn_prob, 0, 1) data['churn'] = (np.random.random(n_samples) < churn_prob).astype(int) df = pd.DataFrame(data) if output_path: df.to_csv(output_path, index=False) print(f"Customer churn dataset saved to {output_path}") return df def generate_housing_dataset(n_samples: int = 1500, output_path: Optional[str] = None) -> pd.DataFrame: """ Generate a housing price prediction dataset. Args: n_samples: Number of samples output_path: Path to save CSV (optional) Returns: pd.DataFrame: Generated dataset """ np.random.seed(42) data = { 'sqft_living': np.random.randint(500, 5000, n_samples), 'sqft_lot': np.random.randint(1000, 50000, n_samples), 'bedrooms': np.random.randint(1, 6, n_samples), 'bathrooms': np.random.randint(1, 5, n_samples), 'floors': np.random.randint(1, 4, n_samples), 'waterfront': np.random.choice([0, 1], n_samples, p=[0.95, 0.05]), 'view': np.random.randint(0, 5, n_samples), 'condition': np.random.randint(1, 6, n_samples), 'grade': np.random.randint(1, 14, n_samples), 'yr_built': np.random.randint(1900, 2024, n_samples), 'yr_renovated': np.random.choice([0] + list(range(1950, 2024)), n_samples), 'zipcode': np.random.choice([98101, 98102, 98103, 98104, 98105, 98106, 98107, 98108, 98109, 98112], n_samples), } # Generate price based on features base_price = 50000 price = ( base_price + data['sqft_living'] * 200 + data['sqft_lot'] * 2 + data['bedrooms'] * 25000 + data['bathrooms'] * 35000 + data['waterfront'] * 500000 + data['view'] * 50000 + data['grade'] * 20000 + (2024 - np.array(data['yr_built'])) * (-1000) + np.random.randn(n_samples) * 50000 ) data['price'] = np.maximum(price, 50000).astype(int) df = pd.DataFrame(data) if output_path: df.to_csv(output_path, index=False) print(f"Housing dataset saved to {output_path}") return df def main(): """Main function for CLI usage.""" parser = argparse.ArgumentParser( description='Generate sample datasets for InsightGenAI testing' ) parser.add_argument( 'dataset_type', choices=['classification', 'regression', 'churn', 'housing'], help='Type of dataset to generate' ) parser.add_argument( '-n', '--samples', type=int, default=1000, help='Number of samples (default: 1000)' ) parser.add_argument( '-o', '--output', type=str, default=None, help='Output file path' ) args = parser.parse_args() if args.output is None: args.output = f'sample_{args.dataset_type}_data.csv' if args.dataset_type == 'classification': generate_classification_dataset(n_samples=args.samples, output_path=args.output) elif args.dataset_type == 'regression': generate_regression_dataset(n_samples=args.samples, output_path=args.output) elif args.dataset_type == 'churn': generate_customer_churn_dataset(n_samples=args.samples, output_path=args.output) elif args.dataset_type == 'housing': generate_housing_dataset(n_samples=args.samples, output_path=args.output) if __name__ == "__main__": main()