Spaces:

mohsinbhatti
/

insightgenai

Sleeping

File size: 9,242 Bytes

e478478

"""
Sample Data Generator - InsightGenAI
====================================
Generate sample datasets for testing the application.

Author: InsightGenAI Team
Version: 1.0.0
"""

import pandas as pd
import numpy as np
from typing import Optional
import argparse


def generate_classification_dataset(n_samples: int = 1000, 
                                    n_features: int = 10,
                                    n_classes: int = 2,
                                    noise: float = 0.1,
                                    output_path: Optional[str] = None) -> pd.DataFrame:
    """
    Generate a synthetic classification dataset.
    
    The first half of the features (``n_features // 2``) are standard-normal
    floats, the rest are integers in ``[0, 100)``. The target is derived from
    a random linear combination of the first (up to) three features plus
    Gaussian noise. Two categorical columns and one free-text column are
    appended, and roughly 5% of the *non-target* cells are replaced by
    missing values.
    
    Args:
        n_samples: Number of samples
        n_features: Number of features (must be at least 1)
        n_classes: Number of classes. For 2 the logits are split at their
            median; otherwise they are cut into ``n_classes`` equal-width bins.
        noise: Standard deviation of the Gaussian noise added to the logits
        output_path: Path to save CSV (optional)
        
    Returns:
        pd.DataFrame: Generated dataset. The ``target`` column is always a
        complete integer column (no missing labels).
        
    Raises:
        ValueError: If ``n_features`` is smaller than 1.
    """
    if n_features < 1:
        # Without at least one feature there is nothing to derive the target from.
        raise ValueError("n_features must be at least 1")
    
    # Fixed seed so repeated calls produce the same dataset.
    # NOTE: this reseeds NumPy's global RNG as a side effect.
    np.random.seed(42)
    
    # Generate features
    data = {}
    for i in range(n_features):
        if i < n_features // 2:
            data[f'feature_{i}'] = np.random.randn(n_samples)
        else:
            data[f'feature_{i}'] = np.random.randint(0, 100, n_samples)
    
    # Generate target with some relationship to features
    X = np.column_stack([data[f'feature_{i}'] for i in range(min(3, n_features))])
    weights = np.random.randn(X.shape[1])
    logits = X @ weights + np.random.randn(n_samples) * noise
    
    if n_classes == 2:
        target = (logits > np.median(logits)).astype(int)
    else:
        target = pd.cut(logits, bins=n_classes, labels=range(n_classes)).astype(int)
    
    data['target'] = target
    
    # Add some categorical features
    data['category'] = np.random.choice(['A', 'B', 'C'], n_samples)
    data['region'] = np.random.choice(['North', 'South', 'East', 'West'], n_samples)
    
    # Add a text column
    text_templates = [
        "Customer reported satisfaction with the product.",
        "User experienced issues with the interface.",
        "Positive feedback received from client.",
        "Technical difficulties encountered during usage.",
        "Excellent performance and reliability noted."
    ]
    data['feedback'] = np.random.choice(text_templates, n_samples)
    
    # Add missing values (5% random) to every column except the label.
    df = pd.DataFrame(data)
    # The mask is still drawn for the full frame so the RNG stream, and
    # therefore the pattern of missing feature cells, is unchanged.
    mask = np.random.random(df.shape) < 0.05
    mask[:, df.columns.get_loc('target')] = False
    df = df.mask(mask)
    # Re-assign the labels so the column is guaranteed to keep its integer
    # dtype regardless of how DataFrame.mask handled the surrounding columns.
    df['target'] = target
    
    if output_path:
        df.to_csv(output_path, index=False)
        print(f"Classification dataset saved to {output_path}")
    
    return df


def generate_regression_dataset(n_samples: int = 1000,
                                n_features: int = 8,
                                noise: float = 0.5,
                                output_path: Optional[str] = None) -> pd.DataFrame:
    """
    Build a reproducible synthetic dataset for regression experiments.
    
    The first ``n_features // 2`` features are standard-normal floats and the
    remaining ones are integers in ``[1, 100)``. The target is a random linear
    combination of all features plus Gaussian noise. A categorical column and
    a daily date column starting at 2020-01-01 are appended.
    
    Args:
        n_samples: Number of rows to generate
        n_features: Number of numeric feature columns
        noise: Scale applied to the Gaussian noise added to the target
        output_path: If given, the frame is also written there as CSV
        
    Returns:
        pd.DataFrame: Generated dataset
    """
    # Fixed seed: every call yields the same data (reseeds the global RNG).
    np.random.seed(42)
    
    # Numeric features: continuous first, integer-valued afterwards.
    n_continuous = n_features // 2
    columns = {}
    for idx in range(n_features):
        name = f'feature_{idx}'
        if idx >= n_continuous:
            columns[name] = np.random.randint(1, 100, n_samples)
        else:
            columns[name] = np.random.randn(n_samples)
    
    # Linear target: coefficients are drawn before the noise term.
    design = np.column_stack(list(columns.values()))
    coefficients = np.random.randn(n_features)
    signal = design @ coefficients
    jitter = np.random.randn(n_samples) * noise
    columns['target'] = signal + jitter
    
    # Non-numeric extras: one categorical and one datetime column.
    columns['category'] = np.random.choice(['Type1', 'Type2', 'Type3'], n_samples)
    columns['date'] = pd.date_range(start='2020-01-01', periods=n_samples, freq='D')
    
    df = pd.DataFrame(columns)
    
    if not output_path:
        return df
    
    df.to_csv(output_path, index=False)
    print(f"Regression dataset saved to {output_path}")
    return df


def generate_customer_churn_dataset(n_samples: int = 2000,
                                     output_path: Optional[str] = None) -> pd.DataFrame:
    """
    Build a reproducible synthetic customer-churn dataset.
    
    Customer attributes are sampled independently. The binary ``churn`` label
    is then drawn from a per-customer probability that starts at 0.3 and is
    raised for short tenure (< 12), month-to-month contracts, electronic-check
    payment and missing tech support, with Gaussian jitter, clipped to [0, 1].
    
    Args:
        n_samples: Number of customers to generate
        output_path: If given, the frame is also written there as CSV
        
    Returns:
        pd.DataFrame: Generated dataset
    """
    # Fixed seed: every call yields the same data (reseeds the global RNG).
    np.random.seed(42)
    
    yes_no = ['Yes', 'No']
    
    # Columns are filled one by one; the order of the random draws matters
    # for reproducibility, so it follows the final column order.
    records = {}
    records['customer_id'] = range(1, n_samples + 1)
    records['age'] = np.random.randint(18, 80, n_samples)
    records['gender'] = np.random.choice(['Male', 'Female'], n_samples)
    records['tenure'] = np.random.randint(0, 72, n_samples)
    records['monthly_charges'] = np.random.uniform(20, 120, n_samples)
    records['total_charges'] = np.random.uniform(100, 8000, n_samples)
    records['contract'] = np.random.choice(
        ['Month-to-month', 'One year', 'Two year'], n_samples
    )
    records['payment_method'] = np.random.choice(
        ['Electronic check', 'Mailed check', 'Bank transfer', 'Credit card'],
        n_samples,
    )
    records['internet_service'] = np.random.choice(['DSL', 'Fiber optic', 'No'], n_samples)
    records['tech_support'] = np.random.choice(yes_no, n_samples)
    records['online_security'] = np.random.choice(yes_no, n_samples)
    records['paperless_billing'] = np.random.choice(yes_no, n_samples)
    
    # Accumulate the churn probability term by term (base rate first).
    short_tenure = records['tenure'] < 12
    monthly_contract = records['contract'] == 'Month-to-month'
    pays_by_echeck = records['payment_method'] == 'Electronic check'
    lacks_support = records['tech_support'] == 'No'
    
    risk = 0.3 + short_tenure * 0.2
    risk = risk + monthly_contract * 0.15
    risk = risk + pays_by_echeck * 0.1
    risk = risk + lacks_support * 0.1
    risk = risk + np.random.randn(n_samples) * 0.1
    risk = np.clip(risk, 0, 1)
    
    # Bernoulli draw per customer against its own probability.
    uniform_draw = np.random.random(n_samples)
    records['churn'] = (uniform_draw < risk).astype(int)
    
    df = pd.DataFrame(records)
    
    if not output_path:
        return df
    
    df.to_csv(output_path, index=False)
    print(f"Customer churn dataset saved to {output_path}")
    return df


def generate_housing_dataset(n_samples: int = 1500,
                              output_path: Optional[str] = None) -> pd.DataFrame:
    """
    Build a reproducible synthetic housing-price dataset.
    
    Property attributes are sampled independently. The price is a fixed
    linear formula over size, rooms, waterfront, view, grade and building age
    plus Gaussian noise, floored at 50000 and truncated to an integer.
    
    Args:
        n_samples: Number of properties to generate
        output_path: If given, the frame is also written there as CSV
        
    Returns:
        pd.DataFrame: Generated dataset
    """
    # Fixed seed: every call yields the same data (reseeds the global RNG).
    np.random.seed(42)
    
    renovation_years = [0, *range(1950, 2024)]  # 0 is one of the possible values
    zipcodes = [98101, 98102, 98103, 98104, 98105,
                98106, 98107, 98108, 98109, 98112]
    
    # Columns are filled one by one; the order of the random draws matters
    # for reproducibility, so it follows the final column order.
    homes = {}
    homes['sqft_living'] = np.random.randint(500, 5000, n_samples)
    homes['sqft_lot'] = np.random.randint(1000, 50000, n_samples)
    homes['bedrooms'] = np.random.randint(1, 6, n_samples)
    homes['bathrooms'] = np.random.randint(1, 5, n_samples)
    homes['floors'] = np.random.randint(1, 4, n_samples)
    homes['waterfront'] = np.random.choice([0, 1], n_samples, p=[0.95, 0.05])
    homes['view'] = np.random.randint(0, 5, n_samples)
    homes['condition'] = np.random.randint(1, 6, n_samples)
    homes['grade'] = np.random.randint(1, 14, n_samples)
    homes['yr_built'] = np.random.randint(1900, 2024, n_samples)
    homes['yr_renovated'] = np.random.choice(renovation_years, n_samples)
    homes['zipcode'] = np.random.choice(zipcodes, n_samples)
    
    # Deterministic (integer) part of the price: each term is attribute * rate.
    price_floor = 50000
    building_age = 2024 - homes['yr_built']
    systematic = (
        price_floor
        + 200 * homes['sqft_living']
        + 2 * homes['sqft_lot']
        + 25000 * homes['bedrooms']
        + 35000 * homes['bathrooms']
        + 500000 * homes['waterfront']
        + 50000 * homes['view']
        + 20000 * homes['grade']
        - 1000 * building_age  # older buildings are cheaper
    )
    
    # Random part, then floor and truncate to whole currency units.
    noisy_price = systematic + np.random.randn(n_samples) * 50000
    homes['price'] = np.maximum(noisy_price, price_floor).astype(int)
    
    df = pd.DataFrame(homes)
    
    if not output_path:
        return df
    
    df.to_csv(output_path, index=False)
    print(f"Housing dataset saved to {output_path}")
    return df


def main():
    """Parse command-line arguments and write the requested sample dataset.

    The positional ``dataset_type`` selects the generator; ``-n/--samples``
    sets the row count and ``-o/--output`` the CSV path. When no output path
    is given, ``sample_<dataset_type>_data.csv`` is used.
    """
    # Maps each CLI choice to the generator that produces it.
    generators = {
        'classification': generate_classification_dataset,
        'regression': generate_regression_dataset,
        'churn': generate_customer_churn_dataset,
        'housing': generate_housing_dataset,
    }

    parser = argparse.ArgumentParser(
        description='Generate sample datasets for InsightGenAI testing'
    )
    parser.add_argument('dataset_type',
                        choices=list(generators),
                        help='Type of dataset to generate')
    parser.add_argument('-n', '--samples',
                        type=int,
                        default=1000,
                        help='Number of samples (default: 1000)')
    parser.add_argument('-o', '--output',
                        type=str,
                        default=None,
                        help='Output file path')

    args = parser.parse_args()

    # Fall back to a file name derived from the dataset type.
    destination = args.output
    if destination is None:
        destination = f'sample_{args.dataset_type}_data.csv'

    build = generators[args.dataset_type]
    build(n_samples=args.samples, output_path=destination)


# Run the command-line interface only when this file is executed as a
# script; importing the module has no side effects.
if __name__ == "__main__":
    main()