Spaces:
Sleeping
Sleeping
| # tools/visuals.py — reusable Plotly helpers | |
| # ------------------------------------------------------------ | |
| import os | |
| import tempfile | |
| from typing import List, Tuple, Union | |
| import numpy as np | |
| import pandas as pd | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from scipy.cluster.hierarchy import linkage, leaves_list | |
| # ----------------------------------------------------------------- | |
| # Typing alias for plotly.graph_objects.Figure; each plotting helper returns (Plot, png_path) or an error string | |
| # ----------------------------------------------------------------- | |
| Plot = go.Figure | |
| # ----------------------------------------------------------------- | |
| # Utility: save figure to high‑res PNG under a writable dir (/tmp) | |
| # ----------------------------------------------------------------- | |
def _save_fig(fig: Plot, prefix: str, outdir: str = "/tmp") -> str:
    """Export *fig* as a PNG inside *outdir* and return the file path.

    Args:
        fig: Figure to export; its ``write_image`` method is called with
            ``scale=3``.
        prefix: Leading part of the generated file name. Path separators
            are replaced with ``_`` so the file always lands directly in
            *outdir*.
        outdir: Target directory; created if it does not exist.

    Returns:
        Path of the newly written ``.png`` file.

    Raises:
        Whatever ``fig.write_image`` raises; the placeholder file is
        removed before the exception propagates.
    """
    os.makedirs(outdir, exist_ok=True)

    # Callers build the prefix from column names; a separator in it would
    # make tempfile point at a sub-directory that does not exist.
    safe_prefix = prefix.replace(os.sep, "_")
    if os.altsep:
        safe_prefix = safe_prefix.replace(os.altsep, "_")

    # mkstemp reserves a unique name; close the descriptor right away so no
    # handle is leaked and the exporter can open the path itself.
    fd, path = tempfile.mkstemp(prefix=safe_prefix, suffix=".png", dir=outdir)
    os.close(fd)

    try:
        fig.write_image(path, scale=3)
    except Exception:
        # Do not leave an empty placeholder behind when the export fails.
        try:
            os.remove(path)
        except OSError:
            pass
        raise
    return path
| # ----------------------------------------------------------------- | |
| # 1) Histogram (+ optional KDE) | |
| # ----------------------------------------------------------------- | |
def histogram_tool(
    file_path: str,
    column: str,
    bins: int = 30,
    kde: bool = True,
    output_dir: str = "/tmp",
) -> Union[Tuple[Plot, str], str]:
    """Plot a histogram of one column, optionally overlaid with a KDE curve.

    Args:
        file_path: Table to load; ``.xls``/``.xlsx`` are read with
            ``pd.read_excel``, anything else with ``pd.read_csv``.
        column: Name of the column to plot. Values are coerced to numeric
            and non-convertible entries are dropped.
        bins: Number of histogram bins.
        kde: When true, draw the bars from ``np.histogram`` and add a
            Gaussian kernel density estimate scaled to the count axis.
            When false, use ``px.histogram``.
        output_dir: Directory the PNG is written to.

    Returns:
        ``(figure, png_path)`` on success, or an error message string when
        the column is missing or holds no numeric data.
    """
    ext = os.path.splitext(file_path)[1].lower()
    df = pd.read_excel(file_path) if ext in (".xls", ".xlsx") else pd.read_csv(file_path)
    if column not in df.columns:
        return f"❌ Column '{column}' not found."
    series = pd.to_numeric(df[column], errors="coerce").dropna()
    if series.empty:
        return f"❌ No numeric data in '{column}'."

    title = f"Histogram – {column}"
    if kde:
        # scipy is already a dependency of this module (hierarchy import).
        from scipy.stats import gaussian_kde

        values = series.to_numpy(dtype=float)
        counts, edges = np.histogram(values, bins=bins)
        widths = np.diff(edges)
        # Place each bar at its bin centre; using the left edge shifts the
        # whole histogram by half a bin.
        centers = edges[:-1] + widths / 2.0

        fig = go.Figure()
        fig.add_bar(x=centers, y=counts, width=widths, name="Histogram")

        # gaussian_kde needs at least two distinct values (non-singular
        # covariance); otherwise only the bars are drawn.
        if values.size > 1 and values.max() > values.min():
            grid = np.linspace(values.min(), values.max(), 500)
            density = gaussian_kde(values)(grid)
            # density * n * bin_width converts the density to expected
            # counts so the curve shares the bars' y-axis.
            fig.add_scatter(
                x=grid,
                y=density * values.size * widths.mean(),
                mode="lines",
                name="KDE (approx)",
            )
        fig.update_layout(title=title, bargap=0)
    else:
        fig = px.histogram(
            series, nbins=bins, title=title, template="plotly_dark"
        )
    fig.update_layout(template="plotly_dark")
    return fig, _save_fig(fig, f"hist_{column}_", output_dir)
| # ----------------------------------------------------------------- | |
| # 2) Box plot | |
| # ----------------------------------------------------------------- | |
def boxplot_tool(
    file_path: str,
    column: str,
    output_dir: str = "/tmp",
) -> Union[Tuple[Plot, str], str]:
    """Draw a box plot (outlier points shown) for one column of a table.

    Args:
        file_path: Table to load; ``.xls``/``.xlsx`` go through
            ``pd.read_excel``, every other extension through ``pd.read_csv``.
        column: Column to plot; values are coerced to numeric and entries
            that cannot be converted are dropped.
        output_dir: Directory the PNG is written to.

    Returns:
        ``(figure, png_path)`` on success, otherwise an error message string.
    """
    suffix = os.path.splitext(file_path)[1].lower()
    if suffix in (".xls", ".xlsx"):
        table = pd.read_excel(file_path)
    else:
        table = pd.read_csv(file_path)

    # Guard: the requested column must exist.
    if column not in table.columns:
        return f"❌ Column '{column}' not found."

    # Guard: something numeric must remain after coercion.
    values = pd.to_numeric(table[column], errors="coerce").dropna()
    if values.empty:
        return f"❌ No numeric data in '{column}'."

    figure = px.box(
        values,
        points="outliers",
        title=f"Boxplot – {column}",
        template="plotly_dark",
    )
    png_path = _save_fig(figure, f"box_{column}_", output_dir)
    return figure, png_path
| # ----------------------------------------------------------------- | |
| # 3) Violin plot | |
| # ----------------------------------------------------------------- | |
def violin_tool(
    file_path: str,
    column: str,
    output_dir: str = "/tmp",
) -> Union[Tuple[Plot, str], str]:
    """Draw a violin plot (inner box, all points shown) for one column.

    Args:
        file_path: Table to load; ``.xls``/``.xlsx`` go through
            ``pd.read_excel``, every other extension through ``pd.read_csv``.
        column: Column to plot; values are coerced to numeric and entries
            that cannot be converted are dropped.
        output_dir: Directory the PNG is written to.

    Returns:
        ``(figure, png_path)`` on success, otherwise an error message string.
    """
    suffix = os.path.splitext(file_path)[1].lower()
    if suffix in (".xls", ".xlsx"):
        table = pd.read_excel(file_path)
    else:
        table = pd.read_csv(file_path)

    # Guard: the requested column must exist.
    if column not in table.columns:
        return f"❌ Column '{column}' not found."

    # Guard: something numeric must remain after coercion.
    values = pd.to_numeric(table[column], errors="coerce").dropna()
    if values.empty:
        return f"❌ No numeric data in '{column}'."

    figure = px.violin(
        values,
        box=True,
        points="all",
        title=f"Violin – {column}",
        template="plotly_dark",
    )
    png_path = _save_fig(figure, f"violin_{column}_", output_dir)
    return figure, png_path
| # ----------------------------------------------------------------- | |
| # 4) Scatter‑matrix | |
| # ----------------------------------------------------------------- | |
def scatter_matrix_tool(
    file_path: str,
    columns: List[str],
    output_dir: str = "/tmp",
    size: int = 5,
) -> Union[Tuple[Plot, str], str]:
    """Draw a scatter-plot matrix over the requested columns.

    Args:
        file_path: Table to load; ``.xls``/``.xlsx`` go through
            ``pd.read_excel``, every other extension through ``pd.read_csv``.
        columns: Columns to use as matrix dimensions. Each is coerced to
            numeric, and rows with any non-convertible value are dropped.
        output_dir: Directory the PNG is written to.
        size: Marker size applied to every trace.

    Returns:
        ``(figure, png_path)`` on success, otherwise an error message string
        (unknown columns, or no rows left after coercion).
    """
    suffix = os.path.splitext(file_path)[1].lower()
    if suffix in (".xls", ".xlsx"):
        table = pd.read_excel(file_path)
    else:
        table = pd.read_csv(file_path)

    # Report every unknown column at once rather than failing on the first.
    absent = [name for name in columns if name not in table.columns]
    if absent:
        return f"❌ Missing columns: {', '.join(absent)}"

    selected = table[columns]
    numeric = selected.apply(pd.to_numeric, errors="coerce")
    numeric = numeric.dropna()
    if numeric.empty:
        return "❌ No valid numeric data."

    figure = px.scatter_matrix(
        numeric,
        dimensions=columns,
        title="Scatter Matrix",
        template="plotly_dark",
    )
    # Hide the diagonal panels and apply the requested marker size.
    figure.update_traces(diagonal_visible=False, marker=dict(size=size))
    png_path = _save_fig(figure, "scatter_matrix_", output_dir)
    return figure, png_path
| # ----------------------------------------------------------------- | |
| # 5) Correlation heat‑map (optional clustering) | |
| # ----------------------------------------------------------------- | |
def corr_heatmap_tool(
    file_path: str,
    columns: List[str] | None = None,
    output_dir: str = "/tmp",
    cluster: bool = True,
) -> Union[Tuple[Plot, str], str]:
    """Plot a correlation heat-map, optionally reordered by clustering.

    Args:
        file_path: Table to load; ``.xls``/``.xlsx`` are read with
            ``pd.read_excel``, anything else with ``pd.read_csv``.
        columns: Columns to correlate. ``None`` selects every column with a
            numeric dtype. Values are coerced to numeric and columns that
            end up entirely NaN are dropped.
        output_dir: Directory the PNG is written to.
        cluster: When true, rows and columns are reordered with
            average-linkage hierarchical clustering of the correlation
            matrix.

    Returns:
        ``(figure, png_path)`` on success, or an error message string when
        requested columns are missing or fewer than two numeric columns
        remain.
    """
    ext = os.path.splitext(file_path)[1].lower()
    df = pd.read_excel(file_path) if ext in (".xls", ".xlsx") else pd.read_csv(file_path)

    if columns is None:
        df_num = df.select_dtypes("number")
    else:
        # Same contract as scatter_matrix_tool: report unknown columns
        # with a message instead of letting df[columns] raise KeyError.
        missing = [c for c in columns if c not in df.columns]
        if missing:
            return f"❌ Missing columns: {', '.join(missing)}"
        df_num = df[columns]

    df_num = df_num.apply(pd.to_numeric, errors="coerce").dropna(axis=1, how="all")
    if df_num.shape[1] < 2:
        return "❌ Need ≥ 2 numeric columns."

    corr = df_num.corr()
    if cluster:
        # Constant columns give NaN correlations and linkage rejects
        # non-finite input. Cluster on a zero-filled copy; the plotted
        # matrix keeps its NaN cells.
        order = leaves_list(linkage(corr.fillna(0.0).to_numpy(), "average"))
        corr = corr.iloc[order, order]

    fig = px.imshow(
        corr,
        # Pin the range so the diverging scale is centred on zero.
        zmin=-1,
        zmax=1,
        color_continuous_scale="RdBu",
        title="Correlation Heat‑map",
        labels=dict(color="ρ"),
        template="plotly_dark",
    )
    return fig, _save_fig(fig, "corr_heatmap_", output_dir)