|
|
""" |
|
|
Visualization module for the Business Intelligence Dashboard. |
|
|
Contains functions for creating various charts and plots. |
|
|
""" |
|
|
|
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import matplotlib.pyplot as plt |
|
|
import matplotlib.dates as mdates |
|
|
import seaborn as sns |
|
|
from typing import Optional, List, Tuple, Any |
|
|
import warnings |
|
|
|
|
|
# NOTE(review): this silences *every* warning for the whole process as an
# import side effect (including pandas/matplotlib deprecation notices).
# Kept as-is because callers may rely on the quiet output.
warnings.filterwarnings('ignore')

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'.
# Older releases only know the un-prefixed name, so fall back to it instead
# of failing at import time.
try:
    plt.style.use('seaborn-v0_8-whitegrid')
except OSError:
    plt.style.use('seaborn-whitegrid')

sns.set_palette("husl")
|
|
|
|
|
|
|
|
# Shared colour scheme for every chart in this module.
# The named entries are single hex colours; 'palette' is the ordered list
# used when a chart needs one colour per category.
COLORS = {
    'primary': '#2E86AB',
    'secondary': '#A23B72',
    'success': '#28A745',
    'warning': '#F18F01',
    'danger': '#C73E1D',
    'info': '#17A2B8',
    'palette': [
        '#2E86AB',
        '#A23B72',
        '#F18F01',
        '#28A745',
        '#C73E1D',
        '#17A2B8',
        '#6C757D',
        '#563D7C',
    ],
}
|
|
|
|
|
|
|
|
def create_time_series_plot(
    df: pd.DataFrame,
    date_column: str,
    value_column: str,
    agg_method: str = 'sum',
    freq: str = 'D',
    title: Optional[str] = None
) -> Tuple[plt.Figure, Any]:
    """
    Create a time series plot showing trends over time.

    ``date_column`` is parsed with ``pd.to_datetime`` and used as the index,
    ``value_column`` is resampled at ``freq`` and aggregated, and the result
    is drawn as a marked line with a shaded area underneath.

    Args:
        df: pandas DataFrame
        date_column: Name of the date column
        value_column: Name of the value column to plot
        agg_method: Aggregation method ('sum', 'mean', 'count'). Any other
            value falls back to 'sum', and the labels then say 'sum'.
        freq: Frequency for resampling ('D'=daily, 'W'=weekly, 'M'=monthly)
        title: Plot title

    Returns:
        Tuple of (matplotlib Figure, axes). If ``df`` is None/empty or any
        step raises, the figure carries a centred text message instead of
        a chart; no exception is propagated.
    """
    # Exactly one figure per call: the placeholder and error paths reuse it,
    # so a failure never leaves an orphaned figure registered in pyplot.
    fig, ax = plt.subplots(figsize=(12, 6))

    if df is None or df.empty:
        ax.text(0.5, 0.5, 'No data available', ha='center', va='center', fontsize=14)
        return fig, ax

    try:
        # Normalise the method name up front so the axis label and title
        # describe the aggregation that is actually applied.
        if agg_method not in ('sum', 'mean', 'count'):
            agg_method = 'sum'

        # Work on a copy so the caller's frame is never modified.
        plot_df = df.copy()
        plot_df[date_column] = pd.to_datetime(plot_df[date_column])
        series = plot_df.set_index(date_column)[value_column]
        ts_data = series.resample(freq).agg(agg_method)

        ax.plot(ts_data.index, ts_data.values, color=COLORS['primary'],
                linewidth=2, marker='o', markersize=4)
        ax.fill_between(ts_data.index, ts_data.values, alpha=0.3, color=COLORS['primary'])

        ax.set_xlabel('Date', fontsize=12)
        ax.set_ylabel(f'{value_column} ({agg_method})', fontsize=12)
        ax.set_title(title or f'{value_column} Over Time ({agg_method.capitalize()})',
                     fontsize=14, fontweight='bold')

        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
        # Object-level calls instead of plt.xticks/plt.tight_layout so the
        # result does not depend on which figure pyplot considers current.
        ax.tick_params(axis='x', labelrotation=45)

        ax.grid(True, alpha=0.3)
        fig.tight_layout()

        return fig, ax

    except Exception as e:
        # Drop anything drawn so far and show the error on a clean axes.
        fig.clf()
        ax = fig.add_subplot(111)
        ax.text(0.5, 0.5, f'Error creating plot: {str(e)}', ha='center', va='center', fontsize=12)
        return fig, ax
|
|
|
|
|
|
|
|
def create_distribution_plot(
    df: pd.DataFrame,
    column: str,
    plot_type: str = 'histogram',
    bins: int = 30,
    title: Optional[str] = None
) -> Tuple[plt.Figure, Any]:
    """
    Create a distribution plot (histogram or box plot).

    Missing values in ``column`` are dropped before plotting. The histogram
    additionally marks the mean and the median with dashed vertical lines.

    Args:
        df: pandas DataFrame
        column: Column to visualize
        plot_type: 'histogram' or 'boxplot' (any value other than
            'histogram' produces the box plot)
        bins: Number of bins for histogram
        title: Plot title

    Returns:
        Tuple of (matplotlib Figure, axes). If ``df`` is None/empty, the
        column has no non-null values, or any step raises, the figure
        carries a centred text message instead of a chart.
    """
    # Exactly one figure per call; every exit path reuses it so an error
    # (e.g. an unknown column) does not leak a figure in pyplot.
    fig, ax = plt.subplots(figsize=(10, 6))

    if df is None or df.empty:
        ax.text(0.5, 0.5, 'No data available', ha='center', va='center', fontsize=14)
        return fig, ax

    try:
        data = df[column].dropna()

        # An all-NaN column would otherwise yield NaN mean/median markers.
        if data.empty:
            ax.text(0.5, 0.5, 'No data available', ha='center', va='center', fontsize=14)
            return fig, ax

        if plot_type == 'histogram':
            mean_value = data.mean()
            median_value = data.median()

            ax.hist(data, bins=bins, color=COLORS['primary'], edgecolor='white', alpha=0.7)
            ax.axvline(mean_value, color=COLORS['danger'], linestyle='--', linewidth=2,
                       label=f'Mean: {mean_value:.2f}')
            ax.axvline(median_value, color=COLORS['success'], linestyle='--', linewidth=2,
                       label=f'Median: {median_value:.2f}')
            ax.legend()
            ax.set_xlabel(column, fontsize=12)
            ax.set_ylabel('Frequency', fontsize=12)
        else:
            bp = ax.boxplot(data, patch_artist=True)
            box = bp['boxes'][0]
            box.set_facecolor(COLORS['primary'])
            box.set_alpha(0.7)
            ax.set_xlabel('', fontsize=12)
            ax.set_ylabel(column, fontsize=12)

        ax.set_title(title or f'Distribution of {column}', fontsize=14, fontweight='bold')
        ax.grid(True, alpha=0.3)

        fig.tight_layout()
        return fig, ax

    except Exception as e:
        # Drop anything drawn so far and show the error on a clean axes.
        fig.clf()
        ax = fig.add_subplot(111)
        ax.text(0.5, 0.5, f'Error creating plot: {str(e)}', ha='center', va='center', fontsize=12)
        return fig, ax
|
|
|
|
|
|
|
|
def create_category_bar_chart(
    df: pd.DataFrame,
    category_column: str,
    value_column: str,
    agg_method: str = 'sum',
    top_n: int = 10,
    title: Optional[str] = None,
    horizontal: bool = True
) -> Tuple[plt.Figure, Any]:
    """
    Create a bar chart for categorical analysis.

    ``value_column`` is aggregated per ``category_column`` value, sorted in
    descending order, and the first ``top_n`` categories are drawn.
    Horizontal bars are annotated with their value; vertical bars are not.

    Args:
        df: pandas DataFrame
        category_column: Column to group by
        value_column: Column to aggregate
        agg_method: Aggregation method (passed to pandas ``agg``)
        top_n: Number of top categories to show
        title: Plot title
        horizontal: Whether to create horizontal bars

    Returns:
        Tuple of (matplotlib Figure, axes). If ``df`` is None/empty, the
        aggregation yields no categories, or any step raises, the figure
        carries a centred text message instead of a chart.
    """
    # Exactly one figure per call; every exit path reuses it so a failure
    # does not leak a figure in pyplot.
    fig, ax = plt.subplots(figsize=(10, 8))

    if df is None or df.empty:
        ax.text(0.5, 0.5, 'No data available', ha='center', va='center', fontsize=14)
        return fig, ax

    try:
        # agg('count') is the same as .count(), so no special case is needed.
        grouped = df.groupby(category_column)[value_column].agg(agg_method)
        agg_data = grouped.sort_values(ascending=False).head(top_n)

        if agg_data.empty:
            ax.text(0.5, 0.5, 'No data available', ha='center', va='center', fontsize=14)
            return fig, ax

        shown = len(agg_data)
        positions = range(shown)
        palette = COLORS['palette']
        # Cycle through the palette when there are more bars than colours.
        colors = [palette[i % len(palette)] for i in positions]
        metric_label = f'{value_column} ({agg_method})'

        if horizontal:
            bars = ax.barh(positions, agg_data.values, color=colors, alpha=0.8)
            ax.set_yticks(positions)
            ax.set_yticklabels([str(x)[:30] for x in agg_data.index])
            ax.set_xlabel(metric_label, fontsize=12)
            # Largest value at the top.
            ax.invert_yaxis()

            # Print each bar's value just past its end.
            for bar in bars:
                width = bar.get_width()
                ax.text(width, bar.get_y() + bar.get_height() / 2, f'{width:,.0f}',
                        ha='left', va='center', fontsize=10, fontweight='bold')
        else:
            ax.bar(positions, agg_data.values, color=colors, alpha=0.8)
            ax.set_xticks(positions)
            ax.set_xticklabels([str(x)[:15] for x in agg_data.index], rotation=45, ha='right')
            ax.set_ylabel(metric_label, fontsize=12)

        # Report the number of bars actually drawn, which can be fewer than
        # top_n when the data has fewer categories.
        ax.set_title(title or f'Top {shown} {category_column} by {value_column} ({agg_method})',
                     fontsize=14, fontweight='bold')
        ax.grid(True, alpha=0.3, axis='x' if horizontal else 'y')

        fig.tight_layout()
        return fig, ax

    except Exception as e:
        # Drop anything drawn so far and show the error on a clean axes.
        fig.clf()
        ax = fig.add_subplot(111)
        ax.text(0.5, 0.5, f'Error creating plot: {str(e)}', ha='center', va='center', fontsize=12)
        return fig, ax
|
|
|
|
|
|
|
|
def create_pie_chart(
    df: pd.DataFrame,
    category_column: str,
    value_column: str,
    agg_method: str = 'sum',
    top_n: int = 8,
    title: Optional[str] = None
) -> Tuple[plt.Figure, Any]:
    """
    Create a pie chart for category distribution.

    ``value_column`` is aggregated per ``category_column`` value and the
    ``top_n`` largest categories each get a wedge. When further categories
    exist, their rows are aggregated together into one extra 'Others' wedge.

    Args:
        df: pandas DataFrame
        category_column: Column to group by
        value_column: Column to aggregate
        agg_method: Aggregation method (passed to pandas ``agg``)
        top_n: Number of top categories to show
        title: Plot title

    Returns:
        Tuple of (matplotlib Figure, axes). If ``df`` is None/empty, the
        aggregation yields no categories, or any step raises, the figure
        carries a centred text message instead of a chart.
    """
    # Exactly one figure per call; every exit path reuses it so a failure
    # does not leak a figure in pyplot.
    fig, ax = plt.subplots(figsize=(10, 8))

    if df is None or df.empty:
        ax.text(0.5, 0.5, 'No data available', ha='center', va='center', fontsize=14)
        return fig, ax

    try:
        # Aggregate once, then split the ranking into the wedges to show and
        # the remainder. agg('count') is the same as .count().
        ranked = (
            df.groupby(category_column)[value_column]
            .agg(agg_method)
            .sort_values(ascending=False)
        )
        agg_data = ranked.iloc[:top_n]
        remainder = ranked.iloc[top_n:]

        # Decide on 'Others' from the grouped result rather than unique():
        # unique() counts NaN as a category while groupby drops it, which
        # could add an empty 0% wedge.
        has_others = not remainder.empty
        if has_others:
            # Aggregate the remaining rows directly. For sum/count this
            # equals adding up the per-group results; for other methods
            # (e.g. mean) adding per-group results would not be meaningful.
            in_remainder = df[category_column].isin(remainder.index)
            others_value = df.loc[in_remainder, value_column].agg(agg_method)
            agg_data = pd.concat([agg_data, pd.Series({'Others': others_value})])

        if agg_data.empty:
            ax.text(0.5, 0.5, 'No data available', ha='center', va='center', fontsize=14)
            return fig, ax

        palette = COLORS['palette']
        colors = [palette[i % len(palette)] for i in range(len(agg_data))]
        if has_others:
            # Neutral grey so 'Others' never repeats the first wedge's
            # colour when the palette wraps around.
            colors[-1] = '#CED4DA'

        ax.pie(
            agg_data.values,
            labels=[str(x)[:20] for x in agg_data.index],
            autopct='%1.1f%%',
            colors=colors,
            explode=[0.02] * len(agg_data),
            shadow=True
        )

        ax.set_title(title or f'{category_column} Distribution by {value_column}',
                     fontsize=14, fontweight='bold')

        fig.tight_layout()
        return fig, ax

    except Exception as e:
        # Drop anything drawn so far and show the error on a clean axes.
        fig.clf()
        ax = fig.add_subplot(111)
        ax.text(0.5, 0.5, f'Error creating plot: {str(e)}', ha='center', va='center', fontsize=12)
        return fig, ax
|
|
|
|
|
|
|
|
def create_scatter_plot(
    df: pd.DataFrame,
    x_column: str,
    y_column: str,
    color_column: Optional[str] = None,
    title: Optional[str] = None,
    max_points: int = 1000,
    max_categories: int = 8
) -> Tuple[plt.Figure, Any]:
    """
    Create a scatter plot to show relationships between variables.

    Frames with more than ``max_points`` rows are randomly down-sampled
    (fixed seed, so repeated calls draw the same points). When
    ``color_column`` names an existing column, the first ``max_categories``
    distinct non-null values each get their own colour and legend entry;
    all remaining rows (further categories and missing values) are drawn
    together as a grey 'Other' group.

    Args:
        df: pandas DataFrame
        x_column: Column for x-axis
        y_column: Column for y-axis
        color_column: Optional column for color coding (ignored if it is
            not a column of ``df``)
        title: Plot title
        max_points: Maximum number of rows to draw
        max_categories: Maximum number of individually coloured categories

    Returns:
        Tuple of (matplotlib Figure, axes). If ``df`` is None/empty or any
        step raises, the figure carries a centred text message instead of
        a chart.
    """
    # Exactly one figure per call; every exit path reuses it so an error
    # (e.g. an unknown column) does not leak a figure in pyplot.
    fig, ax = plt.subplots(figsize=(10, 8))

    if df is None or df.empty:
        ax.text(0.5, 0.5, 'No data available', ha='center', va='center', fontsize=14)
        return fig, ax

    try:
        if len(df) > max_points:
            plot_df = df.sample(n=max_points, random_state=42)
        else:
            plot_df = df

        if color_column and color_column in plot_df.columns:
            categories = plot_df[color_column]
            # dropna(): comparing against NaN never matches, so a NaN
            # "category" would only produce an empty legend entry.
            shown = categories.dropna().unique()[:max_categories]
            palette = COLORS['palette']

            for i, cat in enumerate(shown):
                mask = categories == cat
                ax.scatter(
                    plot_df.loc[mask, x_column],
                    plot_df.loc[mask, y_column],
                    color=palette[i % len(palette)],
                    label=str(cat)[:20],
                    alpha=0.6,
                    s=50
                )

            # Rows outside the shown categories are still drawn, so the
            # chart never silently omits data.
            leftover = ~categories.isin(shown)
            if leftover.any():
                ax.scatter(
                    plot_df.loc[leftover, x_column],
                    plot_df.loc[leftover, y_column],
                    color='#CED4DA',
                    label='Other',
                    alpha=0.6,
                    s=50
                )

            ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        else:
            ax.scatter(plot_df[x_column], plot_df[y_column],
                       color=COLORS['primary'], alpha=0.6, s=50)

        ax.set_xlabel(x_column, fontsize=12)
        ax.set_ylabel(y_column, fontsize=12)
        ax.set_title(title or f'{x_column} vs {y_column}', fontsize=14, fontweight='bold')
        ax.grid(True, alpha=0.3)

        fig.tight_layout()
        return fig, ax

    except Exception as e:
        # Drop anything drawn so far and show the error on a clean axes.
        fig.clf()
        ax = fig.add_subplot(111)
        ax.text(0.5, 0.5, f'Error creating plot: {str(e)}', ha='center', va='center', fontsize=12)
        return fig, ax
|
|
|
|
|
|
|
|
def create_correlation_heatmap(
    df: pd.DataFrame,
    columns: Optional[List[str]] = None,
    title: Optional[str] = None
) -> Tuple[plt.Figure, Any]:
    """
    Create a correlation heatmap for numerical columns.

    Only numeric columns are used (non-numeric entries in ``columns`` are
    filtered out). The pairwise correlation matrix from ``DataFrame.corr``
    is drawn with the upper triangle and the diagonal hidden, on a fixed
    -1..1 colour scale centred at 0.

    Args:
        df: pandas DataFrame
        columns: List of columns to include (None or empty for all numeric)
        title: Plot title

    Returns:
        Tuple of (matplotlib Figure, axes). If ``df`` is None/empty, fewer
        than two numeric columns are available, or any step raises, the
        figure carries a centred text message instead of a chart.
    """
    # Exactly one figure per call; every exit path reuses it so a failure
    # does not leak a figure in pyplot.
    fig, ax = plt.subplots(figsize=(10, 8))

    if df is None or df.empty:
        ax.text(0.5, 0.5, 'No data available', ha='center', va='center', fontsize=14)
        return fig, ax

    try:
        candidates = df[columns] if columns else df
        numeric_df = candidates.select_dtypes(include=[np.number])

        if numeric_df.shape[1] < 2:
            ax.text(0.5, 0.5, 'Need at least 2 numeric columns for correlation',
                    ha='center', va='center', fontsize=14)
            return fig, ax

        corr_matrix = numeric_df.corr()

        # The matrix is symmetric: hide the upper triangle (diagonal
        # included) so each pair appears once.
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
        sns.heatmap(
            corr_matrix,
            mask=mask,
            annot=True,
            cmap='RdBu_r',
            center=0,
            fmt='.2f',
            square=True,
            linewidths=0.5,
            ax=ax,
            vmin=-1,
            vmax=1
        )

        ax.set_title(title or 'Correlation Heatmap', fontsize=14, fontweight='bold')

        fig.tight_layout()
        return fig, ax

    except Exception as e:
        # Clearing the whole figure also removes the colorbar axes that
        # seaborn may already have added.
        fig.clf()
        ax = fig.add_subplot(111)
        ax.text(0.5, 0.5, f'Error creating plot: {str(e)}', ha='center', va='center', fontsize=12)
        return fig, ax
|
|
|
|
|
|
|
|
def save_plot(fig: "plt.Figure", filename: str = "chart.png", dpi: int = 150) -> Optional[str]:
    """
    Save a matplotlib figure to a file.

    The figure is written with a tight bounding box and a white background.
    This is a best-effort helper: it never raises.

    Args:
        fig: matplotlib Figure to save
        filename: Output filename
        dpi: Resolution

    Returns:
        Path to saved file, or None if saving failed (the failure is
        logged with its traceback).
    """
    # Local import: this helper is the only place in the block that logs.
    import logging

    try:
        fig.savefig(filename, dpi=dpi, bbox_inches='tight', facecolor='white')
    except Exception:
        # Keep the "return None on failure" contract, but record why it
        # failed instead of discarding the error silently.
        logging.getLogger(__name__).exception("Failed to save plot to %s", filename)
        return None
    return filename