File size: 3,072 Bytes
98bc1c2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

class DataGenerator:
    """Generate synthetic tabular sample data from a list of variable names.

    The kind of data produced for each column is chosen by case-insensitive
    substring matching on the variable name (see ``generate``). The most
    recently generated DataFrame is kept on ``full_dataset``.
    """

    # Value pools for the categorical column kinds.
    _CATEGORIES = ['Electronics', 'Clothing', 'Home & Garden', 'Sports', 'Books']
    _CHANNELS = ['Email', 'Social Media', 'TV', 'Print', 'Online', 'Direct']
    _LOCATIONS = ['Urban', 'Suburban', 'Rural']
    # Rating levels and their sampling probabilities (skewed towards 4).
    _SATISFACTION_LEVELS = [1, 2, 3, 4, 5]
    _SATISFACTION_WEIGHTS = [0.05, 0.1, 0.2, 0.4, 0.25]

    def __init__(self):
        # Last DataFrame returned by generate(); None until generate() runs.
        self.full_dataset = None

    @classmethod
    def _generate_column(cls, variable, sample_size):
        """Return ``sample_size`` random values for one variable name.

        Rules are tested in order and the first match wins, so a name that
        contains several keywords is handled by the earliest rule. Matching
        is by substring: e.g. a name containing "percentage" matches the
        'age' rule.
        """
        key = variable.lower()

        if 'age' in key:
            # Truncate to whole years, then bound to the 18-80 range.
            ages = np.random.normal(35, 12, sample_size).astype(int)
            return np.clip(ages, 18, 80)
        if 'amount' in key or 'price' in key:
            return np.round(np.random.lognormal(4, 1, sample_size), 2)
        if 'category' in key:
            return np.random.choice(cls._CATEGORIES, sample_size)
        if 'channel' in key:
            return np.random.choice(cls._CHANNELS, sample_size)
        if 'location' in key:
            return np.random.choice(cls._LOCATIONS, sample_size)
        if 'frequency' in key:
            # +1 so the minimum frequency is 1 rather than 0.
            return np.random.poisson(3, sample_size) + 1
        if 'satisfaction' in key or 'score' in key:
            return np.random.choice(
                cls._SATISFACTION_LEVELS,
                sample_size,
                p=cls._SATISFACTION_WEIGHTS,
            )
        if 'time' in key:
            # +1 so the minimum value is 1 rather than 0.
            return np.random.exponential(7, sample_size).astype(int) + 1

        # Default to numeric data
        return np.random.normal(50, 15, sample_size)

    def generate(self, variables, sample_size, seed=42):
        """Generate sample data based on variables.

        Args:
            variables: Iterable of column names (strings). Each name selects
                a generator by substring match; unmatched names get
                normally distributed floats (mean 50, std 15).
            sample_size: Number of rows to generate.
            seed: Seed passed to ``np.random.seed`` before generating, so
                repeated calls give identical data. Pass ``None`` to leave
                the RNG state untouched. Defaults to 42.

        Returns:
            A DataFrame with one column per variable plus a trailing ``ID``
            column numbered from 1. A variable literally named ``'ID'`` is
            overwritten by that column. The same DataFrame object is stored
            on ``self.full_dataset``.

        Note:
            Seeding resets NumPy's *global* legacy RNG, which affects any
            other code drawing from ``np.random`` afterwards.
        """
        if seed is not None:
            np.random.seed(seed)  # For reproducibility

        # Columns are drawn in input order so results are reproducible for
        # a given seed and variable list.
        data = {}
        for variable in variables:
            data[variable] = self._generate_column(variable, sample_size)

        # Add ID column
        data['ID'] = range(1, sample_size + 1)

        df = pd.DataFrame(data)

        # Store full dataset
        self.full_dataset = df

        return df

    def get_full_dataset(self):
        """Return the full generated dataset (``None`` before ``generate``)."""
        return self.full_dataset

    def add_missing_values(self, df, missing_rate=0.05):
        """Add missing values to simulate real data.

        For every column except ``'ID'``, ``int(len(df) * missing_rate)``
        distinct rows are picked at random (independently per column) and
        set to missing. Integer columns are upcast so they can hold NaN.

        Args:
            df: Source DataFrame; it is not modified.
            missing_rate: Fraction of rows to blank per column, in [0, 1].

        Returns:
            A copy of ``df`` with missing values inserted.

        Raises:
            ValueError: If ``missing_rate`` is outside [0, 1].
        """
        if not 0 <= missing_rate <= 1:
            raise ValueError(
                f"missing_rate must be between 0 and 1, got {missing_rate!r}"
            )

        df_with_missing = df.copy()
        n_rows = len(df)
        n_missing = int(n_rows * missing_rate)

        for column in df.columns:
            if column == 'ID':
                continue

            # Sample row *positions* rather than index labels so that a
            # non-unique index cannot blank more rows than requested.
            positions = np.random.choice(n_rows, size=n_missing, replace=False)
            to_blank = np.zeros(n_rows, dtype=bool)
            to_blank[positions] = True

            # Series.mask upcasts (e.g. int64 -> float64) as needed; writing
            # NaN into an integer column via .loc is a deprecated
            # incompatible-dtype assignment in recent pandas.
            df_with_missing[column] = df_with_missing[column].mask(to_blank)

        return df_with_missing