"""
Sample Data Generator - InsightGenAI
====================================

Generate sample datasets for testing the application.

Author: InsightGenAI Team
Version: 1.0.0
"""
import argparse
from typing import Optional

import numpy as np
import pandas as pd
def generate_classification_dataset(n_samples: int = 1000,
                                    n_features: int = 10,
                                    n_classes: int = 2,
                                    noise: float = 0.1,
                                    output_path: Optional[str] = None) -> pd.DataFrame:
    """
    Generate a synthetic classification dataset.

    The first ``n_features // 2`` features are standard-normal floats and the
    remaining ones are integers drawn from [0, 100). The target is derived
    from a random linear combination of the first ``min(3, n_features)``
    features plus Gaussian noise: for two classes the combination is split at
    its median, otherwise it is cut into ``n_classes`` equal-width bins.
    Two categorical columns (``category``, ``region``) and a text column
    (``feedback``) are appended, and finally every cell of the frame --
    the ``target`` column included -- is blanked with probability 0.05.

    A private ``RandomState`` seeded with 42 is used, so identical arguments
    always yield identical data and the global NumPy random state is left
    untouched.

    Args:
        n_samples: Number of samples
        n_features: Number of features (at least 1)
        n_classes: Number of classes (at least 1)
        noise: Noise level (scale of the Gaussian noise added to the logits)
        output_path: Path to save CSV (optional)

    Returns:
        pd.DataFrame: Generated dataset

    Raises:
        ValueError: If ``n_features`` or ``n_classes`` is smaller than 1.
    """
    if n_features < 1:
        raise ValueError(f"n_features must be at least 1, got {n_features}")
    if n_classes < 1:
        raise ValueError(f"n_classes must be at least 1, got {n_classes}")

    # Local generator instead of np.random.seed(42): same MT19937 stream,
    # but callers' global RNG state is not clobbered. The order of the draws
    # below determines the generated values, so keep it stable.
    rng = np.random.RandomState(42)

    # Generate features
    n_continuous = n_features // 2
    data = {}
    for i in range(n_features):
        if i < n_continuous:
            data[f'feature_{i}'] = rng.randn(n_samples)
        else:
            data[f'feature_{i}'] = rng.randint(0, 100, n_samples)

    # Generate target with some relationship to features
    n_informative = min(3, n_features)
    X = np.column_stack([data[f'feature_{i}'] for i in range(n_informative)])
    weights = rng.randn(n_informative)
    logits = X @ weights + rng.randn(n_samples) * noise

    if n_classes == 2:
        target = (logits > np.median(logits)).astype(int)
    else:
        target = pd.cut(logits, bins=n_classes, labels=range(n_classes)).astype(int)
    data['target'] = target

    # Add some categorical features
    data['category'] = rng.choice(['A', 'B', 'C'], n_samples)
    data['region'] = rng.choice(['North', 'South', 'East', 'West'], n_samples)

    # Add a text column
    text_templates = [
        "Customer reported satisfaction with the product.",
        "User experienced issues with the interface.",
        "Positive feedback received from client.",
        "Technical difficulties encountered during usage.",
        "Excellent performance and reliability noted.",
    ]
    data['feedback'] = rng.choice(text_templates, n_samples)

    # Add missing values (5% random); the mask spans every column,
    # so the target column receives missing values as well.
    df = pd.DataFrame(data)
    missing = rng.random_sample(df.shape) < 0.05
    df = df.mask(missing)

    if output_path:
        df.to_csv(output_path, index=False)
        print(f"Classification dataset saved to {output_path}")

    return df
def generate_regression_dataset(n_samples: int = 1000,
                                n_features: int = 8,
                                noise: float = 0.5,
                                output_path: Optional[str] = None) -> pd.DataFrame:
    """
    Generate a synthetic regression dataset.

    The first ``n_features // 2`` features are standard-normal floats and the
    remaining ones are integers drawn from [1, 100). The target is a random
    linear combination of all features plus Gaussian noise scaled by
    ``noise``. A ``category`` column and a daily ``date`` column starting at
    2020-01-01 are appended.

    A private ``RandomState`` seeded with 42 is used, so identical arguments
    always yield identical data and the global NumPy random state is left
    untouched.

    Args:
        n_samples: Number of samples
        n_features: Number of features (at least 1)
        noise: Noise level
        output_path: Path to save CSV (optional)

    Returns:
        pd.DataFrame: Generated dataset

    Raises:
        ValueError: If ``n_features`` is smaller than 1.
    """
    if n_features < 1:
        raise ValueError(f"n_features must be at least 1, got {n_features}")

    # Local generator instead of np.random.seed(42): same MT19937 stream,
    # without reseeding the caller's global RNG. Draw order fixes the values.
    rng = np.random.RandomState(42)

    # Generate features
    n_continuous = n_features // 2
    data = {}
    for i in range(n_features):
        if i < n_continuous:
            data[f'feature_{i}'] = rng.randn(n_samples)
        else:
            data[f'feature_{i}'] = rng.randint(1, 100, n_samples)

    # Generate target with linear relationship
    X = np.column_stack([data[f'feature_{i}'] for i in range(n_features)])
    weights = rng.randn(n_features)
    data['target'] = X @ weights + rng.randn(n_samples) * noise

    # Add categorical features
    data['category'] = rng.choice(['Type1', 'Type2', 'Type3'], n_samples)

    # Add datetime column
    data['date'] = pd.date_range(start='2020-01-01', periods=n_samples, freq='D')

    df = pd.DataFrame(data)

    if output_path:
        df.to_csv(output_path, index=False)
        print(f"Regression dataset saved to {output_path}")

    return df
def generate_customer_churn_dataset(n_samples: int = 2000,
                                    output_path: Optional[str] = None) -> pd.DataFrame:
    """
    Generate a customer churn prediction dataset.

    Customer attributes are drawn independently of each other. The churn
    probability starts at 0.3 and is raised for tenure below 12 (+0.2),
    month-to-month contracts (+0.15), electronic-check payment (+0.1) and
    missing tech support (+0.1); Gaussian noise (std 0.1) is added and the
    result is clipped to [0, 1]. ``churn`` is then a 0/1 draw against that
    probability.

    A private ``RandomState`` seeded with 42 is used, so identical arguments
    always yield identical data and the global NumPy random state is left
    untouched.

    Args:
        n_samples: Number of samples
        output_path: Path to save CSV (optional)

    Returns:
        pd.DataFrame: Generated dataset
    """
    # Local generator instead of np.random.seed(42): same MT19937 stream,
    # without reseeding the caller's global RNG. The dict entries are
    # evaluated top to bottom, and that draw order fixes the values.
    rng = np.random.RandomState(42)
    yes_no = ['Yes', 'No']

    data = {
        'customer_id': range(1, n_samples + 1),
        'age': rng.randint(18, 80, n_samples),
        'gender': rng.choice(['Male', 'Female'], n_samples),
        'tenure': rng.randint(0, 72, n_samples),
        'monthly_charges': rng.uniform(20, 120, n_samples),
        'total_charges': rng.uniform(100, 8000, n_samples),
        'contract': rng.choice(['Month-to-month', 'One year', 'Two year'], n_samples),
        'payment_method': rng.choice(['Electronic check', 'Mailed check',
                                      'Bank transfer', 'Credit card'], n_samples),
        'internet_service': rng.choice(['DSL', 'Fiber optic', 'No'], n_samples),
        'tech_support': rng.choice(yes_no, n_samples),
        'online_security': rng.choice(yes_no, n_samples),
        'paperless_billing': rng.choice(yes_no, n_samples),
    }

    # Generate churn based on features
    churn_prob = (
        0.3 +
        (data['tenure'] < 12) * 0.2 +
        (data['contract'] == 'Month-to-month') * 0.15 +
        (data['payment_method'] == 'Electronic check') * 0.1 +
        (data['tech_support'] == 'No') * 0.1 +
        rng.randn(n_samples) * 0.1
    )
    # Noise can push the sum outside [0, 1]; clip before using it as a probability.
    churn_prob = np.clip(churn_prob, 0, 1)
    data['churn'] = (rng.random_sample(n_samples) < churn_prob).astype(int)

    df = pd.DataFrame(data)

    if output_path:
        df.to_csv(output_path, index=False)
        print(f"Customer churn dataset saved to {output_path}")

    return df
def generate_housing_dataset(n_samples: int = 1500,
                             output_path: Optional[str] = None) -> pd.DataFrame:
    """
    Generate a housing price prediction dataset.

    House attributes are drawn independently of each other (so
    ``yr_renovated`` may be 0 or any year in 1950-2023 regardless of
    ``yr_built``). The price is a fixed linear combination of living area,
    lot size, bedrooms, bathrooms, waterfront, view and grade, reduced by
    1000 per year of age relative to 2024, plus Gaussian noise (std 50000).
    Prices are floored at 50000 and truncated to integers.

    A private ``RandomState`` seeded with 42 is used, so identical arguments
    always yield identical data and the global NumPy random state is left
    untouched.

    Args:
        n_samples: Number of samples
        output_path: Path to save CSV (optional)

    Returns:
        pd.DataFrame: Generated dataset
    """
    # Local generator instead of np.random.seed(42): same MT19937 stream,
    # without reseeding the caller's global RNG. The dict entries are
    # evaluated top to bottom, and that draw order fixes the values.
    rng = np.random.RandomState(42)

    data = {
        'sqft_living': rng.randint(500, 5000, n_samples),
        'sqft_lot': rng.randint(1000, 50000, n_samples),
        'bedrooms': rng.randint(1, 6, n_samples),
        'bathrooms': rng.randint(1, 5, n_samples),
        'floors': rng.randint(1, 4, n_samples),
        'waterfront': rng.choice([0, 1], n_samples, p=[0.95, 0.05]),
        'view': rng.randint(0, 5, n_samples),
        'condition': rng.randint(1, 6, n_samples),
        'grade': rng.randint(1, 14, n_samples),
        'yr_built': rng.randint(1900, 2024, n_samples),
        'yr_renovated': rng.choice([0] + list(range(1950, 2024)), n_samples),
        'zipcode': rng.choice([98101, 98102, 98103, 98104, 98105,
                               98106, 98107, 98108, 98109, 98112], n_samples),
    }

    # Generate price based on features
    base_price = 50000
    price = (
        base_price +
        data['sqft_living'] * 200 +
        data['sqft_lot'] * 2 +
        data['bedrooms'] * 25000 +
        data['bathrooms'] * 35000 +
        data['waterfront'] * 500000 +
        data['view'] * 50000 +
        data['grade'] * 20000 +
        (2024 - data['yr_built']) * (-1000) +
        rng.randn(n_samples) * 50000
    )
    # Noise and the age penalty can drive the raw price very low; apply a floor.
    data['price'] = np.maximum(price, 50000).astype(int)

    df = pd.DataFrame(data)

    if output_path:
        df.to_csv(output_path, index=False)
        print(f"Housing dataset saved to {output_path}")

    return df
def main(argv: Optional[list] = None) -> None:
    """
    Main function for CLI usage.

    Parses the dataset type, sample count and output path, then runs the
    matching generator, which writes the CSV. When no output path is given
    the file is named ``sample_<dataset_type>_data.csv``.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]`` (optional);
            ``None`` keeps the normal command-line behaviour.
    """
    parser = argparse.ArgumentParser(
        description='Generate sample datasets for InsightGenAI testing'
    )
    parser.add_argument(
        'dataset_type',
        choices=['classification', 'regression', 'churn', 'housing'],
        help='Type of dataset to generate'
    )
    parser.add_argument(
        '-n', '--samples',
        type=int,
        default=1000,
        help='Number of samples (default: 1000)'
    )
    parser.add_argument(
        '-o', '--output',
        type=str,
        default=None,
        help='Output file path'
    )

    args = parser.parse_args(argv)

    # Reject negative counts here with a usage error rather than letting
    # NumPy fail later with a traceback.
    if args.samples < 0:
        parser.error('--samples must be a non-negative integer')

    if args.output is None:
        args.output = f'sample_{args.dataset_type}_data.csv'

    # Keys mirror the `choices` accepted for dataset_type above.
    generators = {
        'classification': generate_classification_dataset,
        'regression': generate_regression_dataset,
        'churn': generate_customer_churn_dataset,
        'housing': generate_housing_dataset,
    }
    generators[args.dataset_type](n_samples=args.samples, output_path=args.output)


if __name__ == "__main__":
    main()