Spaces:
Sleeping
Sleeping
| # analysis_modules.py | |
| import base64 | |
| import io | |
| import logging | |
| import pandas as pd | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from sklearn.cluster import KMeans | |
| from sklearn.decomposition import PCA | |
| from sklearn.preprocessing import StandardScaler | |
| from statsmodels.tsa.seasonal import seasonal_decompose | |
| from statsmodels.tsa.stattools import adfuller | |
| from wordcloud import WordCloud | |
| # --- Time-Series Module --- | |
def analyze_time_series(df: pd.DataFrame, date_col: str, value_col: str, period: int = 12):
    """
    Decomposes a time series into trend/seasonal/residual parts and tests it for stationarity.

    The series is built from ``value_col`` indexed by ``date_col`` (parsed with
    ``pd.to_datetime`` and sorted). Rows with a missing date or a missing value
    are ignored. Any failure is logged and reported in the returned Markdown
    instead of being raised.

    Args:
        df (pd.DataFrame): The input DataFrame.
        date_col (str): The name of the column containing datetime information.
        value_col (str): The name of the numeric column to analyze.
        period (int): Number of observations per seasonal cycle used for the
            additive decomposition. Defaults to 12; at least ``2 * period``
            observations are required.

    Returns:
        tuple: A Plotly Figure and a Markdown string with analysis.
    """
    if not date_col or not value_col:
        return go.Figure(), "Please select both a date/time column and a value column to begin analysis."
    try:
        logging.info(f"Analyzing time-series for date='{date_col}' and value='{value_col}'")

        period = int(period)
        if period < 2:
            return go.Figure(), f"The seasonal period must be an integer of at least 2 (got {period})."

        # Copy only the two columns that are used rather than the whole frame.
        ts_df = df[[date_col, value_col]].copy()
        ts_df[date_col] = pd.to_datetime(ts_df[date_col])
        # Rows without a date would become NaT index entries and pollute the series.
        ts_df = ts_df.dropna(subset=[date_col])
        ts_data = ts_df.set_index(date_col).sort_index()[value_col].dropna()

        # The decomposition needs at least two full seasonal cycles.
        min_points = 2 * period
        if len(ts_data) < min_points:
            msg = f"Not enough data points ({len(ts_data)}) for a reliable time-series decomposition (requires at least {min_points})."
            logging.warning(msg)
            return go.Figure().update_layout(title=msg), ""

        # Decomposition
        result = seasonal_decompose(ts_data, model='additive', period=period)
        components = pd.DataFrame(
            {'Trend': result.trend, 'Seasonal': result.seasonal, 'Residual': result.resid}
        )
        fig_decomp = px.line(
            components,
            title=f"<b>Time-Series Decomposition of '{value_col}'</b>",
            labels={'value': 'Value', 'index': 'Date'},
            template="plotly_white",
        ).update_layout(legend_title_text='Components')

        # Stationarity Test (ADF): index 0 is the test statistic, index 1 the p-value.
        adf_result = adfuller(ts_data)
        adf_stat, p_value = adf_result[0], adf_result[1]
        if p_value < 0.05:
            conclusion = 'likely **stationary** (p < 0.05)'
        else:
            conclusion = 'likely **non-stationary** (p >= 0.05)'

        # Assembled line by line so the Markdown never picks up source indentation.
        adf_md = (
            "\n"
            "### Stationarity Analysis (Augmented Dickey-Fuller Test)\n"
            f"- **ADF Statistic:** `{adf_stat:.4f}`\n"
            f"- **p-value:** `{p_value:.4f}`\n"
            f"- **Conclusion:** The time-series is {conclusion}. Non-stationary series often require "
            "differencing before being used in forecasting models like ARIMA.\n"
        )
        return fig_decomp, adf_md
    except Exception as e:
        # UI boundary: report the failure to the user instead of propagating it.
        logging.error(f"Time-series analysis failed: {e}", exc_info=True)
        return go.Figure(), f"❌ **Error:** Could not perform analysis. Please ensure the date column is a valid time format and the value column is numeric. \n`{e}`"
| # --- Text Analysis Module --- | |
def generate_word_cloud(df: pd.DataFrame, text_col: str):
    """
    Generates a word cloud from a text column and returns it as an HTML snippet.

    All non-null values of ``text_col`` are converted to strings and joined
    with spaces; the rendered cloud is embedded as a base64 PNG ``<img>``.
    The column name and any error text are HTML-escaped before being placed
    in the returned markup, since both can originate from user-supplied data.

    Args:
        df (pd.DataFrame): The input DataFrame.
        text_col (str): The name of the column containing text data.

    Returns:
        str: An HTML string containing the word cloud image, a hint when no
        column is selected or the column has no text, or an error message.
    """
    # Local import keeps this block self-contained; the module header does not import html.
    from html import escape

    if not text_col:
        return "<p style='text-align:center; padding: 20px;'>Select a text column to generate a word cloud.</p>"
    try:
        logging.info(f"Generating word cloud for column '{text_col}'")
        text = ' '.join(df[text_col].dropna().astype(str))
        if not text.strip():
            return "<p style='text-align:center; padding: 20px;'>No text data available in this column to generate a cloud.</p>"

        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            colormap='viridis',
            max_words=150,
        ).generate(text)

        # Render to an in-memory PNG and release the buffer once it is encoded.
        with io.BytesIO() as buf:
            wordcloud.to_image().save(buf, format='png')
            img_str = base64.b64encode(buf.getvalue()).decode('utf-8')

        # The column name goes inside a double-quoted attribute, so quotes must be escaped too.
        alt_text = escape(f"Word Cloud for {text_col}", quote=True)
        return (
            '<div style="text-align:center;">'
            f'<img src="data:image/png;base64,{img_str}" alt="{alt_text}" style="border-radius: 8px;">'
            '</div>'
        )
    except Exception as e:
        # UI boundary: report the failure as HTML instead of propagating it.
        logging.error(f"Word cloud generation failed: {e}", exc_info=True)
        reason = escape(str(e))
        return f"<p style='text-align:center; color:red; padding: 20px;'>❌ <strong>Error:</strong> Could not generate word cloud. Reason: {reason}</p>"
| # --- Clustering Module --- | |
def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int = 4):
    """
    Performs K-Means clustering on scaled features and plots the result in 2D via PCA.

    Rows with a missing value in any selected column are dropped. The remaining
    data is standardized, clustered with K-Means, and projected onto its first
    two principal components for plotting. Any failure is logged and reported
    in the returned Markdown instead of being raised.

    Args:
        df (pd.DataFrame): The input DataFrame.
        numeric_cols (list): A list of numeric columns to use for clustering
            (at least 2 are required).
        n_clusters (int): The number of clusters (k) to create. Values are
            converted with ``int()``, so a float such as ``4.0`` is accepted.

    Returns:
        tuple: A Plotly Figure and a Markdown string with analysis.
    """
    # `None` is treated like an empty selection instead of raising on len().
    if numeric_cols is None or len(numeric_cols) < 2:
        return go.Figure(), "Clustering requires at least 2 numeric features. Please select a dataset with more numeric columns."
    try:
        # Normalize k once so logging, validation, the model and the report all agree.
        k = int(n_clusters)
        if k < 1:
            return go.Figure(), f"The number of clusters must be at least 1 (got {k})."

        logging.info(f"Performing K-Means clustering with k={k} on {len(numeric_cols)} features.")
        cluster_data = df[numeric_cols].dropna().copy()

        # K-Means needs at least k samples; the 2-component PCA needs at least 2.
        min_rows = max(k, 2)
        if len(cluster_data) < min_rows:
            return go.Figure(), (
                f"Not enough data points ({len(cluster_data)}) for {k} clusters "
                f"(at least {min_rows} complete rows are required)."
            )

        # Step 1: Scale data - Crucial for distance-based algorithms like K-Means
        scaled_data = StandardScaler().fit_transform(cluster_data)

        # Step 2: Perform K-Means clustering (fixed seed for reproducible labels)
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(scaled_data)
        # String labels make Plotly treat clusters as discrete categories.
        cluster_data['Cluster'] = kmeans.labels_.astype(str)

        # Step 3: Use PCA to reduce dimensionality for a meaningful 2D visualization
        pca = PCA(n_components=2)
        components = pca.fit_transform(scaled_data)
        cluster_data['PCA1'] = components[:, 0]
        cluster_data['PCA2'] = components[:, 1]

        # Step 4: Create the plot using the principal components
        fig_cluster = px.scatter(
            cluster_data,
            x='PCA1',
            y='PCA2',
            color='Cluster',
            title=f"<b>K-Means Clustering Visualization (k={k})</b>",
            labels={'PCA1': 'Principal Component 1', 'PCA2': 'Principal Component 2'},
            template="plotly_white",
            color_discrete_sequence=px.colors.qualitative.Vivid,
        )

        explained_variance = pca.explained_variance_ratio_.sum() * 100
        # Assembled line by line so the Markdown never picks up source indentation.
        cluster_md = (
            "\n"
            "### Clustering Summary & Methodology\n"
            f"- **Features Used:** `{len(numeric_cols)}` numeric features were scaled and used for clustering.\n"
            f"- **Number of Clusters (K):** `{k}`\n"
            "- **Visualization:** To visualize the high-dimensional clusters in 2D, "
            "Principal Component Analysis (PCA) was used.\n"
            f"- **Explained Variance:** The two components shown explain **{explained_variance:.2f}%** "
            "of the variance in the data.\n"
        )
        return fig_cluster, cluster_md
    except Exception as e:
        # UI boundary: report the failure to the user instead of propagating it.
        logging.error(f"Clustering failed: {e}", exc_info=True)
        return go.Figure(), f"❌ **Error:** Could not perform clustering. \n`{e}`"