corpusdb / app /data_visualizer.py
mrsavage1's picture
Upload 52 files
723f9ab verified
"""
Data Visualization Helper
Generate chart data for frontend visualization
"""
from typing import Dict, List, Any
from collections import Counter
class DataVisualizer:
    """Helper to prepare data for charts and graphs.

    Every public method takes ``data`` as a list of dict rows and reports
    its outcome in-band: the returned dict always carries an ``'ok'`` flag,
    plus an ``'error'`` message when ``'ok'`` is False.
    """

    @staticmethod
    def _column_values(data: List[Dict], column: str) -> List[Any]:
        """Return the non-None values of ``column`` across ``data``, in row order."""
        candidates = (row.get(column) for row in data)
        return [value for value in candidates if value is not None]

    @staticmethod
    def _all_numeric(values: List[Any]) -> bool:
        """Return True when every value is an int or float.

        The whole column is inspected rather than just the first value, so a
        mixed column (e.g. ``[1, 'x']``) is treated as categorical instead of
        failing later inside ``min``/``max``/``sum``.
        """
        return all(isinstance(value, (int, float)) for value in values)

    def prepare_chart_data(self, data: List[Dict], x_column: str, y_column: str, chart_type: str = 'bar') -> Dict:
        """Prepare data for chart visualization.

        Args:
            data: Rows to plot; each row is read with ``row.get``.
            x_column: Key whose values become the labels (converted with
                ``str``; a missing key yields ``''``).
            y_column: Key whose values become the plotted values (a missing
                key yields ``0``).
            chart_type: Echoed back unchanged in the result.

        Returns:
            On success, a dict with ``ok``, ``chart_type``, ``labels``,
            ``values``, ``x_label`` and ``y_label``. On failure,
            ``{'ok': False, 'error': <message>}``.
        """
        if not data:
            return {'ok': False, 'error': 'No data provided'}

        try:
            labels = [str(row.get(x_column, '')) for row in data]
            values = [row.get(y_column, 0) for row in data]
        except Exception as e:
            # Deliberately broad: malformed rows are reported through the
            # 'ok'/'error' result convention instead of raising.
            return {'ok': False, 'error': str(e)}

        return {
            'ok': True,
            'chart_type': chart_type,
            'labels': labels,
            'values': values,
            'x_label': x_column,
            'y_label': y_column,
        }

    def analyze_column(self, data: List[Dict], column: str) -> Dict:
        """Analyze a column and return statistics.

        Args:
            data: Rows to inspect.
            column: Key to analyze; rows where it is missing or None are
                counted as nulls.

        Returns:
            ``{'ok': False, 'error': ...}`` when there is no data or no
            non-None value. Otherwise a dict with ``column``, ``count``,
            ``null_count`` and ``unique_count`` (uniqueness is judged on the
            ``str`` form of each value), plus either:

            * ``type='numeric'`` with ``min``, ``max``, ``avg`` and ``sum``
              when every value is an int or float, or
            * ``type='categorical'`` with ``most_common`` (up to five
              ``(value_as_str, count)`` pairs) otherwise.
        """
        if not data:
            return {'ok': False, 'error': 'No data'}

        values = self._column_values(data, column)
        if not values:
            return {'ok': False, 'error': 'No values in column'}

        result = {
            'ok': True,
            'column': column,
            'count': len(values),
            'null_count': len(data) - len(values),
            'unique_count': len({str(v) for v in values}),
        }

        if self._all_numeric(values):
            total = sum(values)
            result.update({
                'type': 'numeric',
                'min': min(values),
                'max': max(values),
                'avg': total / len(values),
                'sum': total,
            })
        else:
            # Get most common values
            counter = Counter(str(v) for v in values)
            result.update({
                'type': 'categorical',
                'most_common': counter.most_common(5),
            })

        return result

    def get_distribution(self, data: List[Dict], column: str, bins: int = 10) -> Dict:
        """Get distribution of values for histogram.

        Args:
            data: Rows to inspect.
            column: Key whose non-None values are distributed.
            bins: For numeric columns, the number of equal-width bins
                between the minimum and maximum (must be >= 1). For
                categorical columns, the maximum number of most frequent
                values to return.

        Returns:
            ``{'ok': False, 'error': ...}`` when there is no data, no
            non-None value, or a numeric column is requested with
            ``bins < 1``. Otherwise:

            * categorical: ``{'ok', 'type': 'categorical', 'distribution'}``
              mapping ``str(value)`` to its frequency;
            * numeric: ``{'ok', 'type': 'numeric', 'distribution', 'bins'}``
              mapping ``"start-end"`` labels (two decimals) to counts.
              Bins are half-open ``[start, end)`` except the last, which
              also includes the maximum, so every value is counted.
        """
        if not data:
            return {'ok': False, 'error': 'No data'}

        values = self._column_values(data, column)
        if not values:
            return {'ok': False, 'error': 'No values'}

        if not self._all_numeric(values):
            # For categorical, return frequency
            counter = Counter(str(v) for v in values)
            return {
                'ok': True,
                'type': 'categorical',
                'distribution': dict(counter.most_common(bins)),
            }

        if bins < 1:
            # A zero bin count would otherwise divide by zero below.
            return {'ok': False, 'error': 'bins must be a positive integer'}

        # For numeric, create bins
        min_val = min(values)
        max_val = max(values)
        bin_size = (max_val - min_val) / bins

        distribution: Dict[str, int] = {}
        last_bin = bins - 1
        for i in range(bins):
            bin_start = min_val + i * bin_size
            bin_end = bin_start + bin_size
            bin_label = f"{bin_start:.2f}-{bin_end:.2f}"
            if i == last_bin:
                # Close the final bin on the right so the maximum (and any
                # value stranded by float rounding of bin_end) is counted.
                count = sum(1 for v in values if bin_start <= v <= max_val)
            else:
                count = sum(1 for v in values if bin_start <= v < bin_end)
            # Accumulate rather than overwrite: very narrow bins (or a
            # zero-width range) can round to the same two-decimal label.
            distribution[bin_label] = distribution.get(bin_label, 0) + count

        return {
            'ok': True,
            'type': 'numeric',
            'distribution': distribution,
            'bins': bins,
        }
# Module-level DataVisualizer instance, created when this module is imported.
data_visualizer = DataVisualizer()