Spaces:

mgbam
/

CognitiveEDA

Sleeping

App Files Files Community

mgbam committed on Jun 18, 2025

Commit

7fa4e3f

verified ·

1 Parent(s): 640d10c

Update analysis_modules.py

Browse files

Files changed (1) hide show

analysis_modules.py +93 -37

analysis_modules.py CHANGED Viewed

@@ -1,56 +1,112 @@
 # analysis_modules.py
 import pandas as pd
 import plotly.express as px
 from statsmodels.tsa.seasonal import seasonal_decompose
 from statsmodels.tsa.stattools import adfuller
-from sklearn.cluster import KMeans
 from wordcloud import WordCloud
-import matplotlib.pyplot as plt
-import io
-import base64
 # --- Time-Series Module ---
 def analyze_time_series(df: pd.DataFrame, date_col: str, value_col: str):
     """Performs time-series decomposition and stationarity testing."""
-    df[date_col] = pd.to_datetime(df[date_col])  # NOTE: overwrites the column on the caller's DataFrame
-    ts_df = df.set_index(date_col)[value_col].dropna()
-    # Additive seasonal decomposition into trend, seasonal and residual components
-    decomposition = seasonal_decompose(ts_df, model='additive', period=12) # Assuming monthly data
-    fig_decomp = px.line(pd.DataFrame({'trend': decomposition.trend, 'seasonal': decomposition.seasonal, 'residual': decomposition.resid}),
-                         title=f"Time-Series Decomposition of {value_col}")
-    # Stationarity Test (ADF)
-    adf_result = adfuller(ts_df)
-    adf_md = f"""
-    ### Stationarity Analysis (ADF Test)
-    - **Test Statistic:** `{adf_result[0]:.4f}`
-    - **p-value:** `{adf_result[1]:.4f}`
-    - **Conclusion:** The series is likely **{'stationary' if adf_result[1] < 0.05 else 'non-stationary'}**.
-    """
-    return fig_decomp, adf_md
 # --- Text Analysis Module ---
 def generate_word_cloud(df: pd.DataFrame, text_col: str):
-    """Generates a word cloud from a text column and returns it as a base64 PNG data URI string."""
-    text = ' '.join(df[text_col].dropna().astype(str))
-    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
-    # Save the rendered word-cloud image as an in-memory PNG and encode it as a data URI for Gradio
-    buf = io.BytesIO()
-    wordcloud.to_image().save(buf, format='png')
-    img_str = "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode('utf-8')
-    return img_str
 # --- Clustering Module ---
 def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int = 4):
     """Performs K-Means clustering and returns a scatter plot."""
-    cluster_data = df[numeric_cols].dropna()  # rows with a missing value in any selected column are dropped
-    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto').fit(cluster_data)
-    cluster_data['Cluster'] = kmeans.labels_.astype(str)
-    # For visualization, we'll use the first two numeric columns (clustering itself uses all of numeric_cols)
-    fig_cluster = px.scatter(cluster_data, x=numeric_cols[0], y=numeric_cols[1], color='Cluster',
-                             title=f"K-Means Clustering (k={n_clusters})")
-    return fig_cluster

 # analysis_modules.py
+import base64
+import io
+import logging
 import pandas as pd
 import plotly.express as px
+import plotly.graph_objects as go
+from sklearn.cluster import KMeans
+from sklearn.preprocessing import StandardScaler
 from statsmodels.tsa.seasonal import seasonal_decompose
 from statsmodels.tsa.stattools import adfuller
 from wordcloud import WordCloud
 # --- Time-Series Module ---
 def analyze_time_series(df: pd.DataFrame, date_col: str, value_col: str):
     """Performs time-series decomposition and stationarity testing."""
+    if not date_col or not value_col:
+        return go.Figure(), "Please select both a date/time column and a value column."
+    try:
+        # Prepare data: work on a copy so the caller's DataFrame is not modified, then index by date in sorted order
+        ts_df = df.copy()
+        ts_df[date_col] = pd.to_datetime(ts_df[date_col])
+        ts_df = ts_df.set_index(date_col).sort_index()
+        ts_data = ts_df[value_col].dropna()
+        if len(ts_data) < 24: # Need at least 2 full cycles of period=12 for decomposition
+             return go.Figure(), "Not enough data points (< 24) for time-series decomposition."
+        # Additive decomposition; period=12 is hard-coded and assumes monthly data
+        result = seasonal_decompose(ts_data, model='additive', period=12)
+        fig_decomp = px.line(
+            pd.DataFrame({'Trend': result.trend, 'Seasonal': result.seasonal, 'Residual': result.resid}),
+            title=f"<b>Time-Series Decomposition of '{value_col}'</b>",
+            labels={'value': 'Value', 'index': 'Date'},
+            template="plotly_white",
+        )
+        fig_decomp.update_layout(legend_title_text='Components')
+        # Stationarity Test (ADF): adf_result[0] is reported as the statistic, adf_result[1] as the p-value
+        adf_result = adfuller(ts_data)
+        conclusion = 'likely **stationary** (p < 0.05)' if adf_result[1] < 0.05 else 'likely **non-stationary** (p >= 0.05)'
+        adf_md = f"""
+        ### Stationarity Analysis (ADF Test)
+        - **ADF Statistic:** `{adf_result[0]:.4f}`
+        - **p-value:** `{adf_result[1]:.4f}`
+        - **Conclusion:** The time-series is {conclusion}. A non-stationary series may require differencing for forecasting models.
+        """
+        return fig_decomp, adf_md
+    except Exception as e:  # any failure is logged and returned as an error message instead of being raised
+        logging.error(f"Time-series analysis failed: {e}", exc_info=True)
+        return go.Figure(), f"❌ **Error:** Could not perform time-series analysis. Reason: {e}"
 # --- Text Analysis Module ---
 def generate_word_cloud(df: pd.DataFrame, text_col: str):
+    """Generates a word cloud from a text column and returns an HTML snippet embedding it as a base64 PNG data URI."""
+    if not text_col:
+        return None # Return None to hide the HTML component
+    try:
+        text = ' '.join(df[text_col].dropna().astype(str))
+        if not text:
+            return "<p style='text-align:center;'>No text data available in this column to generate a cloud.</p>"
+        wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='viridis').generate(text)
+        # Save the rendered word-cloud image as an in-memory PNG and base64-encode it for the Gradio HTML component
+        buf = io.BytesIO()
+        wordcloud.to_image().save(buf, format='png')
+        img_str = base64.b64encode(buf.getvalue()).decode('utf-8')
+        html_content = f'<div style="text-align:center;"><img src="data:image/png;base64,{img_str}" alt="Word Cloud"></div>'
+        return html_content
+    except Exception as e:
+        logging.error(f"Word cloud generation failed: {e}", exc_info=True)
+        return f"❌ **Error:** Could not generate word cloud. Reason: {e}"  # NOTE(review): Markdown-style emphasis, while the other return paths are HTML; confirm how the consumer renders it
 # --- Clustering Module ---
 def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int = 4):
     """Performs K-Means clustering and returns a (scatter plot figure, Markdown summary) tuple."""
+    if len(numeric_cols) < 2:  # NOTE(review): runs outside the try block, so numeric_cols=None raises TypeError here
+        return go.Figure(), "Clustering requires at least 2 numeric features."
+    try:
+        cluster_data = df[numeric_cols].dropna()
+        if len(cluster_data) < n_clusters:
+            return go.Figure(), f"Not enough data points ({len(cluster_data)}) for {n_clusters} clusters."
+        # Standardize the features with StandardScaler before fitting K-Means
+        scaler = StandardScaler()
+        scaled_data = scaler.fit_transform(cluster_data)
+        kmeans = KMeans(n_clusters=int(n_clusters), random_state=42, n_init='auto').fit(scaled_data)  # int() cast: presumably n_clusters may arrive as a float from the UI; TODO confirm
+        cluster_data['Cluster'] = kmeans.labels_.astype(str)  # labels are row-aligned with cluster_data, which scaled_data was built from
+        # Plot only the first two selected features (no PCA is performed); clustering itself used all scaled features
+        fig_cluster = px.scatter(
+            cluster_data, x=numeric_cols[0], y=numeric_cols[1], color='Cluster',
+            title=f"<b>K-Means Clustering Result (k={int(n_clusters)})</b>",
+            template="plotly_white", color_discrete_sequence=px.colors.qualitative.Vivid
+        )
+        cluster_md = f"""
+        ### Clustering Summary
+        - **Features Used:** {', '.join(numeric_cols)}
+        - **Number of Clusters (K):** {int(n_clusters)}
+        - **Insight:** The plot shows the separation of data into {int(n_clusters)} distinct groups based on the selected features.
+        """
+        return fig_cluster, cluster_md
+    except Exception as e:  # any failure is logged and returned as an error message instead of being raised
+        logging.error(f"Clustering failed: {e}", exc_info=True)
+        return go.Figure(), f"❌ **Error:** Could not perform clustering. Reason: {e}"