Spaces:

mgbam
/

CognitiveEDA

Sleeping

App Files Files Community

CognitiveEDA / analysis_modules.py

mgbam

Update analysis_modules.py

cff0e3d verified 9 months ago

raw

history blame contribute delete

7.48 kB

	# analysis_modules.py

	import base64
	import io
	import logging

	import pandas as pd
	import plotly.express as px
	import plotly.graph_objects as go
	from sklearn.cluster import KMeans
	from sklearn.decomposition import PCA
	from sklearn.preprocessing import StandardScaler
	from statsmodels.tsa.seasonal import seasonal_decompose
	from statsmodels.tsa.stattools import adfuller
	from wordcloud import WordCloud

	# --- Time-Series Module ---
	def analyze_time_series(df: pd.DataFrame, date_col: str, value_col: str, period: int = 12):
	    """
	    Performs time-series decomposition and stationarity testing with robust error handling.

	    The date column is parsed with ``pd.to_datetime`` and used as a sorted
	    index; missing values in the value column are dropped. The series is
	    then split into trend / seasonal / residual components (additive model)
	    and an Augmented Dickey-Fuller test is run on it. No exception reaches
	    the caller: any failure is logged and reported in the returned text.

	    Args:
	        df (pd.DataFrame): The input DataFrame (it is not modified).
	        date_col (str): The name of the column containing datetime information.
	        value_col (str): The name of the numeric column to analyze.
	        period (int): Number of observations per seasonal cycle. Defaults to
	            12 (monthly data with yearly seasonality). At least two full
	            periods of data are required.

	    Returns:
	        tuple: A Plotly Figure and a Markdown string with analysis. When a
	        column is missing the figure is empty and the string is a prompt;
	        when there is too little data the message is the figure title and
	        the string is empty; on error the figure is empty and the string
	        describes the failure.
	    """
	    if not date_col or not value_col:
	        return go.Figure(), "Please select both a date/time column and a value column to begin analysis."

	    try:
	        logging.info(f"Analyzing time-series for date='{date_col}' and value='{value_col}'")
	        # UI widgets may deliver the period as a float; an invalid value
	        # raises here and is reported through the except branch below.
	        period = int(period)

	        ts_df = df.copy()
	        ts_df[date_col] = pd.to_datetime(ts_df[date_col])
	        ts_df = ts_df.set_index(date_col).sort_index()
	        ts_data = ts_df[value_col].dropna()

	        # Require at least 2 full seasonal periods for the decomposition.
	        min_points = 2 * period
	        if len(ts_data) < min_points:
	            msg = f"Not enough data points ({len(ts_data)}) for a reliable time-series decomposition (requires at least {min_points})."
	            logging.warning(msg)
	            return go.Figure().update_layout(title=msg), ""

	        # Decomposition
	        result = seasonal_decompose(ts_data, model='additive', period=period)
	        components = pd.DataFrame(
	            {'Trend': result.trend, 'Seasonal': result.seasonal, 'Residual': result.resid}
	        )
	        fig_decomp = px.line(
	            components,
	            title=f"<b>Time-Series Decomposition of '{value_col}'</b>",
	            labels={'value': 'Value', 'index': 'Date'}, template="plotly_white"
	        ).update_layout(legend_title_text='Components')

	        # Stationarity Test (ADF): element 0 is the statistic, element 1 the p-value.
	        adf_result = adfuller(ts_data)
	        adf_stat, p_value = adf_result[0], adf_result[1]
	        conclusion = 'likely stationary (p < 0.05)' if p_value < 0.05 else 'likely non-stationary (p >= 0.05)'

	        # Built line by line so the Markdown never inherits source
	        # indentation (indented lines would render as a code block).
	        adf_md = "\n".join([
	            "### Stationarity Analysis (Augmented Dickey-Fuller Test)",
	            f"- ADF Statistic: `{adf_stat:.4f}`",
	            f"- p-value: `{p_value:.4f}`",
	            f"- Conclusion: The time-series is {conclusion}. Non-stationary series often require differencing before being used in forecasting models like ARIMA.",
	        ])
	        return fig_decomp, adf_md
	    except Exception as e:
	        # Top-level boundary for the UI: log the traceback, return a readable message.
	        logging.error(f"Time-series analysis failed: {e}", exc_info=True)
	        return go.Figure(), f"❌ Error: Could not perform analysis. Please ensure the date column is a valid time format and the value column is numeric. \n`{e}`"

	# --- Text Analysis Module ---
	def generate_word_cloud(df: pd.DataFrame, text_col: str):
	    """
	    Generates a word cloud from a text column and returns it as an HTML object.

	    All non-null values of ``text_col`` are converted to strings, joined
	    with spaces, rendered with ``WordCloud`` and embedded as a base64 PNG
	    inside an ``<img>`` tag. No exception reaches the caller: failures are
	    logged and reported as an HTML error paragraph.

	    Args:
	        df (pd.DataFrame): The input DataFrame.
	        text_col (str): The name of the column containing text data.

	    Returns:
	        str: An HTML string containing the word cloud image, a notice when
	        no column is selected or the column has no text, or an error message.
	    """
	    # Function-scope import keeps this block self-contained (stdlib only).
	    from html import escape

	    if not text_col:
	        return "<p style='text-align:center; padding: 20px;'>Select a text column to generate a word cloud.</p>"

	    try:
	        logging.info(f"Generating word cloud for column '{text_col}'")
	        text = ' '.join(df[text_col].dropna().astype(str))
	        if not text.strip():
	            return "<p style='text-align:center; padding: 20px;'>No text data available in this column to generate a cloud.</p>"

	        wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='viridis', max_words=150).generate(text)

	        # Encode the rendered image in memory so it can be inlined as a data URI.
	        buf = io.BytesIO()
	        wordcloud.to_image().save(buf, format='png')
	        img_str = base64.b64encode(buf.getvalue()).decode('utf-8')

	        # The column name originates from the user's dataset; escape it
	        # (including quotes) before placing it inside an HTML attribute.
	        alt_text = escape(f"Word Cloud for {text_col}", quote=True)
	        html_content = f'<div style="text-align:center;"><img src="data:image/png;base64,{img_str}" alt="{alt_text}" style="border-radius: 8px;"></div>'
	        return html_content
	    except Exception as e:
	        # Top-level boundary for the UI: log the traceback, return a readable message.
	        logging.error(f"Word cloud generation failed: {e}", exc_info=True)
	        # Exception text can echo the column name or cell content, so escape it too.
	        return f"<p style='text-align:center; color:red; padding: 20px;'>❌ Error: Could not generate word cloud. Reason: {escape(str(e))}</p>"

	# --- Clustering Module ---
	def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int = 4):
	    """
	    Performs K-Means clustering using best practices (scaling and PCA for visualization).

	    Rows with a missing value in any selected column are dropped, the
	    remaining data is standardized, clustered with K-Means, and projected
	    onto two principal components for plotting. No exception reaches the
	    caller: any failure is logged and reported in the returned text.

	    Args:
	        df (pd.DataFrame): The input DataFrame (it is not modified).
	        numeric_cols (list): A list of numeric columns to use for clustering.
	            ``None`` or fewer than two columns yields an explanatory message.
	        n_clusters (int): The number of clusters (k) to create. Coerced with
	            ``int()`` so float values (e.g. from a slider) are accepted.

	    Returns:
	        tuple: A Plotly Figure and a Markdown string with analysis. The
	        figure is empty whenever the string is a notice or an error.
	    """
	    # This guard runs outside the try block, so None must be handled
	    # explicitly or len() would raise a TypeError to the caller.
	    if numeric_cols is None or len(numeric_cols) < 2:
	        return go.Figure(), "Clustering requires at least 2 numeric features. Please select a dataset with more numeric columns."

	    try:
	        # Normalise k once so the log line, the size check, the model and
	        # the report all agree on the same integer.
	        k = int(n_clusters)
	        logging.info(f"Performing K-Means clustering with k={k} on {len(numeric_cols)} features.")

	        cluster_data = df[list(numeric_cols)].dropna()
	        if len(cluster_data) < k:
	            return go.Figure(), f"Not enough data points ({len(cluster_data)}) for {k} clusters."

	        # Step 1: Scale data - Crucial for distance-based algorithms like K-Means
	        scaler = StandardScaler()
	        scaled_data = scaler.fit_transform(cluster_data)

	        # Step 2: Perform K-Means clustering (fixed seed for reproducible labels)
	        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(scaled_data)

	        # Step 3: Use PCA to reduce dimensionality for a meaningful 2D visualization
	        pca = PCA(n_components=2)
	        components = pca.fit_transform(scaled_data)

	        # Labels are stored as strings so Plotly colours them as discrete
	        # categories instead of a continuous scale.
	        cluster_data['Cluster'] = kmeans.labels_.astype(str)
	        cluster_data['PCA1'] = components[:, 0]
	        cluster_data['PCA2'] = components[:, 1]

	        # Step 4: Create the plot using the principal components
	        fig_cluster = px.scatter(
	            cluster_data, x='PCA1', y='PCA2', color='Cluster',
	            title=f"<b>K-Means Clustering Visualization (k={k})</b>",
	            labels={'PCA1': 'Principal Component 1', 'PCA2': 'Principal Component 2'},
	            template="plotly_white", color_discrete_sequence=px.colors.qualitative.Vivid
	        )

	        explained_variance = pca.explained_variance_ratio_.sum() * 100

	        # Built line by line so the Markdown never inherits source
	        # indentation (indented lines would render as a code block).
	        cluster_md = "\n".join([
	            "### Clustering Summary & Methodology",
	            f"- Features Used: `{len(numeric_cols)}` numeric features were scaled and used for clustering.",
	            f"- Number of Clusters (K): `{k}`",
	            "- Visualization: To visualize the high-dimensional clusters in 2D, Principal Component Analysis (PCA) was used.",
	            f"- Explained Variance: The two components shown explain {explained_variance:.2f}% of the variance in the data.",
	        ])
	        return fig_cluster, cluster_md
	    except Exception as e:
	        # Top-level boundary for the UI: log the traceback, return a readable message.
	        logging.error(f"Clustering failed: {e}", exc_info=True)
	        return go.Figure(), f"❌ Error: Could not perform clustering. \n`{e}`"