|
|
|
|
|
""" |
|
|
Visualization utilities for Gradio app. |
|
|
Creates Plotly visualizations for different tabs. |
|
|
""" |
|
|
|
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import plotly.graph_objects as go |
|
|
import plotly.express as px |
|
|
from plotly.subplots import make_subplots |
|
|
from functools import lru_cache |
|
|
import hashlib |
|
|
|
|
|
|
|
|
def _kpi_card(value_text, caption, accent_color):
    """Render one KPI tile: a large coloured value above a small grey caption."""
    return f"""
        <div style="text-align: center; padding: 20px; background: #f0f0f0; border-radius: 8px; flex: 1;">
            <div style="font-size: 28px; font-weight: bold; color: {accent_color};">
                {value_text}
            </div>
            <div style="font-size: 14px; color: #666; margin-top: 5px;">
                {caption}
            </div>
        </div>"""


def create_kpi_display(kpi_metrics):
    """
    Build the KPI summary row as an HTML fragment.

    Three equally sized tiles are laid out in a flex row, in this order:
    total customers, total transactions, average revenue per transaction.

    Args:
        kpi_metrics: Mapping that provides the keys ``total_customers`` and
            ``total_transactions`` (formatted with a thousands separator) and
            ``avg_revenue`` (formatted with two decimals and a ``£`` prefix).

    Returns:
        HTML string containing the flex container and its three tiles.

    Raises:
        KeyError: If any of the three keys is missing from ``kpi_metrics``.
    """
    # One row per tile: (already formatted value, caption, accent colour).
    # Keeping the tiles in a table means the markup lives in a single place.
    cards = [
        (f"{kpi_metrics['total_customers']:,}", "Tổng số khách hàng", "#2E86AB"),
        (f"{kpi_metrics['total_transactions']:,}", "Tổng số giao dịch", "#A23B72"),
        (f"£{kpi_metrics['avg_revenue']:.2f}", "Doanh thu trung bình/giao dịch", "#F18F01"),
    ]
    tiles = "".join(_kpi_card(*card) for card in cards)

    return f"""
    <div style="display: flex; justify-content: space-around; gap: 20px; margin: 20px 0;">{tiles}
    </div>
    """
|
|
|
|
|
|
|
|
def plot_revenue_over_time(df, date_start=None, date_end=None):
    """
    Plot total revenue per calendar day as a filled line chart.

    Args:
        df: DataFrame with a datetime ``InvoiceDate`` column (its ``.dt``
            accessor is used) and a ``TotalPrice`` column to sum.
        date_start: Optional lower bound (inclusive) on ``InvoiceDate``.
            Falsy values (``None``, ``""``) disable the filter.
        date_end: Optional upper bound (inclusive) on ``InvoiceDate``.
            Falsy values disable the filter. A bound with no time-of-day
            component covers that entire day.

    Returns:
        Plotly figure with one trace of daily revenue.
    """
    # Boolean-mask indexing already returns a new frame and nothing below
    # mutates the data, so the input is never copied up front.
    data = df

    if date_start:
        data = data[data["InvoiceDate"] >= date_start]

    if date_end:
        end_bound = pd.Timestamp(date_end)
        if end_bound == end_bound.normalize():
            # A date-only bound parses to midnight; a plain "<=" would drop
            # every transaction made later on that day, although the chart
            # aggregates whole days. Include the full day instead.
            data = data[data["InvoiceDate"] < end_bound + pd.Timedelta(days=1)]
        else:
            # An explicit time of day is honoured exactly as given.
            data = data[data["InvoiceDate"] <= date_end]

    # Collapse timestamps to calendar dates and sum revenue per day.
    daily_revenue = data.groupby(data["InvoiceDate"].dt.date)["TotalPrice"].sum()

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=daily_revenue.index,
        y=daily_revenue.values,
        mode='lines',
        line=dict(color='#2E86AB', width=2),
        fill='tozeroy',
        name='Doanh thu'
    ))

    fig.update_layout(
        title="Doanh thu theo ngày",
        xaxis_title="Ngày",
        yaxis_title="Doanh thu (GBP)",
        hovermode='x unified',
        height=400,
        template='plotly_white'
    )

    return fig
|
|
|
|
|
|
|
|
def plot_hourly_daily_heatmap(df):
    """
    Draw a heatmap of row counts per (day of week, hour of day).

    Args:
        df: DataFrame with ``DayOfWeek`` and ``HourOfDay`` columns.
            ``DayOfWeek`` values are used as positions into a seven-entry
            label list whose first entry is "Thứ 2", so they must be
            integers valid as indexes into that list.

    Returns:
        Plotly figure with hours on the x axis and weekday labels on the
        y axis. Only days and hours present in ``df`` appear.
    """
    weekday_labels = ["Thứ 2", "Thứ 3", "Thứ 4", "Thứ 5", "Thứ 6", "Thứ 7", "Chủ nhật"]

    # Count rows per (day, hour) pair, then pivot hours into columns;
    # combinations that never occur become 0.
    pair_counts = df.groupby(["DayOfWeek", "HourOfDay"]).size()
    grid = pair_counts.unstack(fill_value=0)

    row_labels = [weekday_labels[day] for day in grid.index]

    heatmap = go.Heatmap(
        z=grid.values,
        x=grid.columns,
        y=row_labels,
        colorscale='Viridis',
        name='Số giao dịch',
    )
    fig = go.Figure(data=heatmap)

    fig.update_layout(
        title="Heatmap thời gian mua hàng: Giờ trong ngày x Ngày trong tuần",
        xaxis_title="Giờ trong ngày",
        yaxis_title="Ngày trong tuần",
        height=400,
        template='plotly_white',
    )

    return fig
|
|
|
|
|
|
|
|
def plot_elbow_silhouette(inertias, silhouette_scores, k_range=range(2, 11)):
    """
    Show the elbow curve and the silhouette curve side by side.

    The K with the highest silhouette score is highlighted with an
    annotation on the right-hand panel.

    Args:
        inertias: Inertia values, one per K in ``k_range``.
        silhouette_scores: Silhouette scores, one per K in ``k_range``.
        k_range: Iterable of K values the two sequences correspond to
            (default: 2 through 10).

    Returns:
        Plotly figure with two subplots (inertia left, silhouette right).
    """
    ks = list(k_range)

    # Position of the best silhouette score, and the K it belongs to.
    top_idx = np.argmax(silhouette_scores)
    top_k = ks[top_idx]

    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=("Phương pháp Elbow", "Silhouette Score")
    )

    # Left panel: inertia versus K.
    inertia_trace = go.Scatter(
        x=ks, y=inertias,
        mode='lines+markers',
        name='Inertia',
        line=dict(color='#2E86AB', width=2),
        marker=dict(size=8),
    )
    fig.add_trace(inertia_trace, row=1, col=1)

    # Right panel: silhouette score versus K.
    silhouette_trace = go.Scatter(
        x=ks, y=silhouette_scores,
        mode='lines+markers',
        name='Silhouette Score',
        line=dict(color='#2ECC71', width=2),
        marker=dict(size=8),
    )
    fig.add_trace(silhouette_trace, row=1, col=2)

    # Call out the best K on the silhouette panel.
    fig.add_annotation(
        x=top_k, y=silhouette_scores[top_idx],
        text=f"Tốt nhất: K={top_k}",
        showarrow=True,
        arrowhead=2,
        arrowsize=1,
        arrowwidth=2,
        arrowcolor="red",
        bgcolor="yellow",
        bordercolor="red",
        borderwidth=2,
        row=1, col=2
    )

    # Both panels share the same x-axis caption; y captions differ.
    for panel_col, y_caption in ((1, "Inertia"), (2, "Silhouette Score")):
        fig.update_xaxes(title_text="Số lượng clusters (K)", row=1, col=panel_col)
        fig.update_yaxes(title_text=y_caption, row=1, col=panel_col)

    fig.update_layout(height=400, showlegend=False, template='plotly_white')

    return fig
|
|
|
|
|
|
|
|
def plot_clusters_pca_2d(pca_features, cluster_labels, k):
    """
    Scatter the samples in PC1/PC2 space, coloured by cluster label.

    Args:
        pca_features: DataFrame with at least ``PC1`` and ``PC2`` columns.
            It is copied, so the caller's frame is not modified.
        cluster_labels: Cluster label per row of ``pca_features``.
        k: Number of clusters; only used in the figure title.

    Returns:
        Plotly figure whose hover box shows the cluster label and the two
        PCA coordinates.
    """
    df_plot = pca_features.copy()
    df_plot['Cluster'] = cluster_labels

    fig = px.scatter(
        df_plot,
        x='PC1', y='PC2',
        color='Cluster',
        # The hovertemplate below reads %{customdata[0]}. Plotly Express only
        # fills customdata from custom_data, or from hover_data columns that
        # are not already x/y, so the cluster label must be passed explicitly
        # or the hover text has nothing to substitute.
        custom_data=['Cluster'],
        hover_data={'PC1': ':.2f', 'PC2': ':.2f'},
        color_continuous_scale='Viridis',
        title=f'Phân cụm K-Means (K={k}) - Không gian PCA',
        labels={'Cluster': 'Cluster'},
    )

    # Small, slightly transparent markers plus a compact custom hover box.
    fig.update_traces(
        marker=dict(size=4, opacity=0.7),
        hovertemplate='<b>Cluster %{customdata[0]}</b><br>PC1: %{x:.2f}<br>PC2: %{y:.2f}<extra></extra>'
    )

    fig.update_layout(
        height=500,
        template='plotly_white',
        hovermode='closest',
    )

    return fig
|
|
|
|
|
|
|
|
def plot_radar_chart(cluster_means, k, cluster_idx=0, features_to_plot=None):
    """
    Draw a radar chart of one cluster's feature profile.

    Each feature is min-max scaled across all clusters in ``cluster_means``,
    so the plotted values lie in [0, 1]. A feature with the same value in
    every cluster is plotted as 0.

    Args:
        cluster_means: DataFrame of per-cluster feature means, indexed by
            cluster id.
        k: Number of clusters; only used in the figure title.
        cluster_idx: Index label of the cluster to draw (default: 0). If it
            is not in the index, the figure is returned without a trace.
        features_to_plot: Column names to use as radar axes (default: the
            eight built-in features listed below).

    Returns:
        Plotly figure.
    """
    if features_to_plot is None:
        axes = [
            "Sum_Quantity",
            "Sum_TotalPrice",
            "Mean_UnitPrice",
            "Count_Invoice",
            "Count_Stock",
            "Mean_TotalPriceSumPerInvoice",
            "Mean_TotalPriceMeanPerStock",
            "Mean_StockCountPerInvoice",
        ]
    else:
        axes = features_to_plot

    subset = cluster_means[axes].copy()

    # Min-max scale every feature across clusters. A zero spread produces
    # NaN from the division, which is replaced by 0.
    lowest = subset.min()
    highest = subset.max()
    scaled = ((subset - lowest) / (highest - lowest)).fillna(0)

    # Display names for known features; unknown ones keep their column name.
    display_names = {
        "Sum_Quantity": "Khối lượng mua",
        "Sum_TotalPrice": "Tổng chi tiêu",
        "Mean_UnitPrice": "Mức giá ưa thích",
        "Count_Invoice": "Tần suất mua",
        "Count_Stock": "Đa dạng sản phẩm",
        "Mean_TotalPriceSumPerInvoice": "Giá trị/giao dịch",
        "Mean_TotalPriceMeanPerStock": "Chi tiêu/sản phẩm",
        "Mean_StockCountPerInvoice": "Sản phẩm/giao dịch",
    }
    axis_labels = [display_names.get(name, name) for name in axes]

    palette = ["#2E86AB", "#A23B72", "#F18F01", "#C73E1D"]

    fig = go.Figure()

    if cluster_idx in scaled.index:
        profile = scaled.loc[cluster_idx]
        # Cycle through the palette so any cluster id gets a colour.
        stroke = palette[cluster_idx % len(palette)]

        polygon = go.Scatterpolar(
            r=profile.tolist(),
            theta=axis_labels,
            fill='toself',
            name=f'Cluster {cluster_idx}',
            line=dict(color=stroke, width=2),
            marker=dict(size=8),
        )
        fig.add_trace(polygon)

    radial_axis = dict(visible=True, range=[0, 1], tickformat='.0%')
    fig.update_layout(
        polar=dict(radialaxis=radial_axis),
        title=f'Chi tiết Cluster {cluster_idx} - Biểu đồ Radar (K={k})',
        height=500,
        showlegend=True,
        template='plotly_white',
    )

    return fig
|
|
|
|
|
|
|
|
def create_cluster_stats_table(cluster_means, k):
    """
    Prepare the cluster statistics for tabular display.

    Args:
        cluster_means: DataFrame with cluster means.
        k: Number of clusters (not used by this function).

    Returns:
        New pandas DataFrame with values rounded to two decimals and the
        index named "Cluster". The input frame is left unchanged.
    """
    # round() and rename_axis() each return a new object, so neither the
    # caller's data nor its index name is touched.
    return cluster_means.round(2).rename_axis("Cluster")
|
|
|