Spaces:
Sleeping
Sleeping
Update analysis_modules.py
Browse files- analysis_modules.py +93 -37
analysis_modules.py
CHANGED
|
@@ -1,56 +1,112 @@
|
|
| 1 |
# analysis_modules.py
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
import pandas as pd
|
| 4 |
import plotly.express as px
|
|
|
|
|
|
|
|
|
|
| 5 |
from statsmodels.tsa.seasonal import seasonal_decompose
|
| 6 |
from statsmodels.tsa.stattools import adfuller
|
| 7 |
-
from sklearn.cluster import KMeans
|
| 8 |
from wordcloud import WordCloud
|
| 9 |
-
import matplotlib.pyplot as plt
|
| 10 |
-
import io
|
| 11 |
-
import base64
|
| 12 |
|
| 13 |
# --- Time-Series Module ---
|
| 14 |
def analyze_time_series(df: pd.DataFrame, date_col: str, value_col: str):
|
| 15 |
"""Performs time-series decomposition and stationarity testing."""
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
# --- Text Analysis Module ---
|
| 35 |
def generate_word_cloud(df: pd.DataFrame, text_col: str):
|
| 36 |
-
"""Generates a word cloud from a text column."""
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
# --- Clustering Module ---
|
| 47 |
def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int = 4):
|
| 48 |
"""Performs K-Means clustering and returns a scatter plot."""
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# analysis_modules.py
|
| 2 |
|
| 3 |
+
import base64
|
| 4 |
+
import io
|
| 5 |
+
import logging
|
| 6 |
+
|
| 7 |
import pandas as pd
|
| 8 |
import plotly.express as px
|
| 9 |
+
import plotly.graph_objects as go
|
| 10 |
+
from sklearn.cluster import KMeans
|
| 11 |
+
from sklearn.preprocessing import StandardScaler
|
| 12 |
from statsmodels.tsa.seasonal import seasonal_decompose
|
| 13 |
from statsmodels.tsa.stattools import adfuller
|
|
|
|
| 14 |
from wordcloud import WordCloud
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
# --- Time-Series Module ---
|
| 17 |
def analyze_time_series(df: pd.DataFrame, date_col: str, value_col: str, period: int = 12):
    """Decompose a time series additively and run an ADF stationarity test.

    Args:
        df: Source data containing ``date_col`` and ``value_col``.
        date_col: Column parsed with ``pd.to_datetime`` and used as the sorted index.
        value_col: Column holding the observations; missing values are dropped.
        period: Seasonal period handed to ``seasonal_decompose``. The default
            of 12 assumes monthly data with a yearly cycle.

    Returns:
        A ``(figure, markdown)`` tuple. On success the figure is a line chart
        of the trend, seasonal and residual components and the markdown is the
        ADF summary. When input is missing, too short, or any step raises, the
        figure is an empty ``go.Figure()`` and the markdown explains why.
    """
    if not date_col or not value_col:
        return go.Figure(), "Please select both a date/time column and a value column."

    try:
        # UI sliders may deliver floats; the decomposition needs an integer period.
        period = int(period)
        # At least two full seasonal cycles are required (24 points for period=12).
        min_points = 2 * period

        # Copy only the two columns that are used rather than the whole frame.
        ts_df = df[[date_col, value_col]].copy()
        ts_df[date_col] = pd.to_datetime(ts_df[date_col])
        ts_df = ts_df.set_index(date_col).sort_index()
        ts_data = ts_df[value_col].dropna()

        if len(ts_data) < min_points:
            return go.Figure(), f"Not enough data points (< {min_points}) for time-series decomposition."

        # Additive decomposition into trend / seasonal / residual components.
        result = seasonal_decompose(ts_data, model='additive', period=period)
        components = pd.DataFrame(
            {'Trend': result.trend, 'Seasonal': result.seasonal, 'Residual': result.resid}
        )
        # NOTE(review): the index carries the name of ``date_col`` after
        # set_index, so the x-axis is presumably keyed by that name rather than
        # 'index'; both keys are mapped so the axis reads "Date" either way.
        fig_decomp = px.line(
            components,
            title=f"<b>Time-Series Decomposition of '{value_col}'</b>",
            labels={date_col: 'Date', 'index': 'Date', 'value': 'Value'},
            template="plotly_white",
        )
        fig_decomp.update_layout(legend_title_text='Components')

        # Stationarity Test (ADF): element 0 is the statistic, element 1 the p-value.
        adf_result = adfuller(ts_data)
        adf_stat, p_value = adf_result[0], adf_result[1]
        if p_value < 0.05:
            conclusion = 'likely **stationary** (p < 0.05)'
        else:
            conclusion = 'likely **non-stationary** (p >= 0.05)'

        # Built line by line so the Markdown has no leading indentation.
        adf_md = (
            "\n"
            "### Stationarity Analysis (ADF Test)\n"
            f"- **ADF Statistic:** `{adf_stat:.4f}`\n"
            f"- **p-value:** `{p_value:.4f}`\n"
            f"- **Conclusion:** The time-series is {conclusion}. "
            "A non-stationary series may require differencing for forecasting models.\n"
        )
        return fig_decomp, adf_md
    except Exception as e:
        # Top-level UI boundary: log the traceback and report instead of crashing.
        logging.exception("Time-series analysis failed: %s", e)
        return go.Figure(), f"❌ **Error:** Could not perform time-series analysis. Reason: {e}"
|
| 55 |
|
| 56 |
# --- Text Analysis Module ---
|
| 57 |
def generate_word_cloud(df: pd.DataFrame, text_col: str):
    """Render a word cloud for a text column as an embeddable HTML snippet.

    Args:
        df: Source data containing ``text_col``.
        text_col: Name of the column whose non-null values are joined into the
            text fed to the word cloud.

    Returns:
        ``None`` when no column is selected (so the HTML component is hidden),
        otherwise an HTML string: an ``<img>`` with a base64 PNG data URI on
        success, a notice when the column holds no usable text, or an error
        paragraph when generation fails.
    """
    import html  # local import: only used to escape the error text below

    if not text_col:
        return None  # Return None to hide the HTML component

    try:
        text = ' '.join(df[text_col].dropna().astype(str))
        # Whitespace-only text contains no words, so treat it like empty text
        # instead of passing it on to the generator.
        if not text.strip():
            return "<p style='text-align:center;'>No text data available in this column to generate a cloud.</p>"

        wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='viridis').generate(text)

        # Save the rendered image as PNG into memory and embed it as a base64 data URI.
        buf = io.BytesIO()
        wordcloud.to_image().save(buf, format='png')
        img_str = base64.b64encode(buf.getvalue()).decode('utf-8')
        return f'<div style="text-align:center;"><img src="data:image/png;base64,{img_str}" alt="Word Cloud"></div>'
    except Exception as e:
        # Top-level UI boundary: log the traceback and report instead of crashing.
        logging.exception("Word cloud generation failed: %s", e)
        # The result is shown as HTML, so use HTML markup (not Markdown) and
        # escape the exception text before embedding it.
        reason = html.escape(str(e))
        return (
            "<p style='text-align:center;'>❌ <b>Error:</b> "
            f"Could not generate word cloud. Reason: {reason}</p>"
        )
|
| 78 |
|
| 79 |
# --- Clustering Module ---
|
| 80 |
def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int = 4):
    """Run K-Means on the selected columns and plot the resulting clusters.

    Args:
        df: Source data containing every column in ``numeric_cols``.
        numeric_cols: Feature columns to cluster on; at least two are needed.
            ``None`` or an empty selection is treated as "too few features".
        n_clusters: Number of clusters; converted with ``int()`` before use.

    Returns:
        A ``(figure, markdown)`` tuple. On success the figure is a scatter plot
        of the first two selected features coloured by cluster label and the
        markdown summarises the run. When the input is insufficient or any
        step raises, the figure is an empty ``go.Figure()`` and the markdown
        explains why.
    """
    # Guard against None as well as short lists: len(None) would raise here,
    # outside the try block.
    if not numeric_cols or len(numeric_cols) < 2:
        return go.Figure(), "Clustering requires at least 2 numeric features."

    try:
        k = int(n_clusters)
        features = list(numeric_cols)

        # Explicit copy so adding the 'Cluster' column never touches ``df``.
        cluster_data = df[features].dropna().copy()
        if len(cluster_data) < k:
            return go.Figure(), f"Not enough data points ({len(cluster_data)}) for {k} clusters."

        # Scale data for better clustering performance
        scaled_data = StandardScaler().fit_transform(cluster_data)

        kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto').fit(scaled_data)
        # String labels make the plot treat clusters as discrete categories.
        cluster_data['Cluster'] = kmeans.labels_.astype(str)

        # Plot the first two selected features; clustering uses all of them,
        # and no dimensionality reduction is applied.
        fig_cluster = px.scatter(
            cluster_data,
            x=features[0],
            y=features[1],
            color='Cluster',
            title=f"<b>K-Means Clustering Result (k={k})</b>",
            template="plotly_white",
            color_discrete_sequence=px.colors.qualitative.Vivid,
        )

        # Built line by line so the Markdown has no leading indentation.
        cluster_md = (
            "\n"
            "### Clustering Summary\n"
            f"- **Features Used:** {', '.join(features)}\n"
            f"- **Number of Clusters (K):** {k}\n"
            f"- **Insight:** The plot shows the separation of data into {k} "
            "distinct groups based on the selected features.\n"
        )
        return fig_cluster, cluster_md
    except Exception as e:
        # Top-level UI boundary: log the traceback and report instead of crashing.
        logging.exception("Clustering failed: %s", e)
        return go.Figure(), f"❌ **Error:** Could not perform clustering. Reason: {e}"
|