File size: 3,681 Bytes
723f9ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""
Data Visualization Helper
Generate chart data for frontend visualization
"""

from typing import Dict, List, Any
from collections import Counter

class DataVisualizer:
    """Helper to prepare data for charts and graphs.

    Every public method takes a list of row dicts and returns a plain dict
    whose ``'ok'`` key says whether the request succeeded. Failures are
    reported through an ``'error'`` message instead of an exception.
    """

    @staticmethod
    def _non_null_values(data: List[Dict], column: str) -> List[Any]:
        """Return the values of *column* across *data*, skipping missing/None."""
        return [v for v in (row.get(column) for row in data) if v is not None]

    @staticmethod
    def _all_numeric(values: List[Any]) -> bool:
        """Return True when every value is an int or float.

        Checking every value (not just the first) keeps mixed columns such as
        ``[1, 'a']`` from reaching ``min``/``sum`` and raising ``TypeError``.
        ``bool`` is a subclass of ``int``, so booleans count as numeric.
        """
        return all(isinstance(v, (int, float)) for v in values)

    def prepare_chart_data(self, data: List[Dict], x_column: str, y_column: str, chart_type: str = 'bar') -> Dict:
        """Prepare data for chart visualization.

        Args:
            data: Rows to plot, one dict per row.
            x_column: Key whose values become the (stringified) labels.
            y_column: Key whose values become the plotted values; rows
                lacking the key contribute ``0``.
            chart_type: Passed through unchanged in the result.

        Returns:
            ``{'ok': True, 'chart_type', 'labels', 'values', 'x_label',
            'y_label'}`` on success, otherwise ``{'ok': False, 'error': ...}``.
        """
        if not data:
            return {'ok': False, 'error': 'No data provided'}

        try:
            labels = [str(row.get(x_column, '')) for row in data]
            values = [row.get(y_column, 0) for row in data]
        except Exception as e:
            # Rows that are not mappings (no ``.get``) or whose values fail
            # ``str()`` are reported to the caller rather than raised.
            return {'ok': False, 'error': str(e)}

        return {
            'ok': True,
            'chart_type': chart_type,
            'labels': labels,
            'values': values,
            'x_label': x_column,
            'y_label': y_column
        }

    def analyze_column(self, data: List[Dict], column: str) -> Dict:
        """Analyze a column and return statistics.

        Args:
            data: Rows to inspect, one dict per row.
            column: Key to analyze; missing or ``None`` entries count as null.

        Returns:
            On success a dict with ``count``, ``null_count`` and
            ``unique_count`` (uniqueness is by ``str(value)``), plus either
            ``type='numeric'`` with ``min``/``max``/``avg``/``sum`` when every
            value is an int or float, or ``type='categorical'`` with the five
            ``most_common`` stringified values. ``{'ok': False, 'error': ...}``
            when there is no data or no non-null value.
        """
        if not data:
            return {'ok': False, 'error': 'No data'}

        values = self._non_null_values(data, column)

        if not values:
            return {'ok': False, 'error': 'No values in column'}

        result = {
            'ok': True,
            'column': column,
            'count': len(values),
            'null_count': len(data) - len(values),
            'unique_count': len(set(str(v) for v in values))
        }

        if self._all_numeric(values):
            total = sum(values)
            result.update({
                'type': 'numeric',
                'min': min(values),
                'max': max(values),
                'avg': total / len(values),
                'sum': total
            })
        else:
            # Get most common values
            counter = Counter(str(v) for v in values)
            result.update({
                'type': 'categorical',
                'most_common': counter.most_common(5)
            })

        return result

    def get_distribution(self, data: List[Dict], column: str, bins: int = 10) -> Dict:
        """Get distribution of values for histogram.

        Args:
            data: Rows to inspect, one dict per row.
            column: Key whose non-null values are binned.
            bins: Number of equal-width bins for numeric data, or the number
                of most frequent categories to keep for categorical data.

        Returns:
            For categorical data: ``{'ok': True, 'type': 'categorical',
            'distribution': {value: count}}``.
            For numeric data: ``{'ok': True, 'type': 'numeric',
            'distribution': {'start-end': count}, 'bins': n}`` where ``n`` is
            the number of bins actually returned. Every bin is half-open
            ``[start, end)`` except the last, which also includes the maximum,
            so the counts always add up to the number of non-null values.
            ``{'ok': False, 'error': ...}`` when there is no data, no non-null
            value, or ``bins`` is not a positive integer for numeric data.
        """
        if not data:
            return {'ok': False, 'error': 'No data'}

        values = self._non_null_values(data, column)

        if not values:
            return {'ok': False, 'error': 'No values'}

        if not self._all_numeric(values):
            # For categorical (or mixed) data, return frequency
            counter = Counter(str(v) for v in values)
            return {
                'ok': True,
                'type': 'categorical',
                'distribution': dict(counter.most_common(bins))
            }

        if not isinstance(bins, int) or bins < 1:
            return {'ok': False, 'error': 'bins must be a positive integer'}

        min_val = min(values)
        max_val = max(values)

        if min_val == max_val:
            # Zero-width range: equal-width bins are undefined, so report one
            # bin holding every value instead of empty, colliding bins.
            return {
                'ok': True,
                'type': 'numeric',
                'distribution': {f"{min_val:.2f}-{max_val:.2f}": len(values)},
                'bins': 1
            }

        bin_size = (max_val - min_val) / bins
        # Shared edges make adjacent bins meet exactly; the final edge is the
        # true maximum so float rounding cannot push it outside the last bin.
        edges = [min_val + i * bin_size for i in range(bins)]
        edges.append(max_val)

        distribution: Dict[str, int] = {}
        last = bins - 1
        for i in range(bins):
            lower, upper = edges[i], edges[i + 1]
            if i == last:
                count = sum(1 for v in values if lower <= v <= upper)
            else:
                count = sum(1 for v in values if lower <= v < upper)
            label = f"{lower:.2f}-{upper:.2f}"
            # Very narrow bins can round to the same label; accumulate so no
            # counts are overwritten.
            distribution[label] = distribution.get(label, 0) + count

        return {
            'ok': True,
            'type': 'numeric',
            'distribution': distribution,
            'bins': len(distribution)
        }

# Module-level DataVisualizer instance, created when this module is imported.
data_visualizer = DataVisualizer()