Spaces:
Sleeping
Sleeping
| """ | |
| Statistical analysis and insights generation | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
| from scipy import stats | |
| from typing import Dict, Tuple, Union | |
def calculate_descriptive_stats(df: pd.DataFrame, column: str) -> Dict:
    """
    Summarise a single DataFrame column with common descriptive statistics.

    Args:
        df: Input DataFrame
        column: Name of the column to summarise

    Returns:
        Dictionary keyed by "count", "mean", "median", "std", "min", "25%",
        "75%", "max", "skewness" and "kurtosis", each holding the value of
        the corresponding pandas Series method for that column
    """
    # Look the column up once and reuse it for every statistic.
    series = df[column]

    summary = {}
    summary["count"] = series.count()
    summary["mean"] = series.mean()
    summary["median"] = series.median()
    summary["std"] = series.std()
    summary["min"] = series.min()
    summary["25%"] = series.quantile(0.25)
    summary["75%"] = series.quantile(0.75)
    summary["max"] = series.max()
    summary["skewness"] = series.skew()
    summary["kurtosis"] = series.kurtosis()
    return summary
def correlation_analysis(df: pd.DataFrame, method: str = "pearson") -> pd.DataFrame:
    """
    Compute the pairwise correlation matrix of the numeric columns.

    Non-numeric columns are excluded before the correlation is computed.

    Args:
        df: Input DataFrame
        method: Correlation method passed to DataFrame.corr
            ('pearson', 'spearman', or 'kendall')

    Returns:
        Correlation matrix as a DataFrame
    """
    return df.select_dtypes(include=[np.number]).corr(method=method)
def hypothesis_testing(group1: pd.Series, group2: pd.Series,
                       test_type: str = "ttest") -> Dict:
    """
    Perform a two-sample hypothesis test between two groups.

    Missing values are dropped from each group independently before the
    test is run.

    Args:
        group1: First group data
        group2: Second group data
        test_type: 'ttest' (or its alias 't-test') for an independent
            two-sample t-test, or 'mannwhitneyu' for a two-sided
            Mann-Whitney U test

    Returns:
        Dictionary with keys "test", "statistic", "p_value" and
        "significant" (whether p_value < 0.05)

    Raises:
        ValueError: If test_type is not one of the supported names
    """
    if test_type in ("ttest", "t-test"):
        test_name = "Independent t-test"
    elif test_type == "mannwhitneyu":
        test_name = "Mann-Whitney U Test"
    else:
        # Previously an unknown name fell through and returned an empty
        # dict, which only surfaced later as a confusing KeyError.
        raise ValueError(
            f"Unsupported test_type {test_type!r}; expected 'ttest', "
            "'t-test' or 'mannwhitneyu'. For categorical data build a "
            "contingency table and use chi_square_test instead."
        )

    sample1 = group1.dropna()
    sample2 = group2.dropna()

    if test_name == "Independent t-test":
        statistic, p_value = stats.ttest_ind(sample1, sample2)
    else:
        # Request the two-sided alternative explicitly so the p-value does
        # not depend on the default of the installed SciPy version.
        statistic, p_value = stats.mannwhitneyu(
            sample1, sample2, alternative="two-sided"
        )

    return {
        "test": test_name,
        "statistic": statistic,
        "p_value": p_value,
        "significant": p_value < 0.05,
    }
def anova_test(groups: list) -> Dict:
    """
    Perform a one-way ANOVA test across several groups.

    Args:
        groups: List of group data. Each group may be a pandas Series or
            any array-like that pd.Series accepts (list, tuple, ndarray).
            Missing values are dropped from each group independently.

    Returns:
        Dictionary with keys "test", "f_statistic", "p_value" and
        "significant" (whether p_value < 0.05)
    """
    # Wrapping in pd.Series lets plain lists/arrays use dropna() as well;
    # for an existing Series this is a cheap no-op wrapper.
    clean_groups = [pd.Series(g).dropna() for g in groups]
    f_stat, p_value = stats.f_oneway(*clean_groups)
    return {
        "test": "ANOVA",
        "f_statistic": f_stat,
        "p_value": p_value,
        "significant": p_value < 0.05,
    }
def chi_square_test(contingency_table: pd.DataFrame) -> Dict:
    """
    Run a Chi-square test of independence on a contingency table.

    Args:
        contingency_table: Contingency table (DataFrame) of observed counts

    Returns:
        Dictionary with keys "test", "statistic", "p_value",
        "degrees_of_freedom" and "significant" (whether p_value < 0.05)
    """
    # The expected-frequency table is returned by SciPy but not reported.
    statistic, p_val, degrees, _expected = stats.chi2_contingency(contingency_table)

    outcome = {"test": "Chi-square"}
    outcome["statistic"] = statistic
    outcome["p_value"] = p_val
    outcome["degrees_of_freedom"] = degrees
    outcome["significant"] = p_val < 0.05
    return outcome
def trend_analysis(df: pd.DataFrame, time_col: str, value_col: str) -> Dict:
    """
    Perform simple linear trend analysis of a value over time.

    Rows are ordered by time_col and the values are regressed against
    their ordinal position (0, 1, 2, ...), so the slope is expressed per
    observation, not per unit of time.

    Args:
        df: Input DataFrame
        time_col: Column name for time/date (used for ordering only)
        value_col: Column name for values

    Returns:
        Dictionary with keys "slope", "intercept", "r_squared", "p_value",
        "trend" ("upward", "downward", or "flat" when the slope is not
        positive or negative) and "significant" (whether p_value < 0.05)

    Raises:
        ValueError: If fewer than two rows have both a time and a value
    """
    # Rows missing either field cannot be ordered or regressed; keeping
    # them would turn every regression output into NaN.
    usable = df.dropna(subset=[time_col, value_col]).sort_values(time_col)
    if len(usable) < 2:
        raise ValueError(
            "trend_analysis needs at least two rows with non-missing "
            f"{time_col!r} and {value_col!r} values; got {len(usable)}"
        )

    x = np.arange(len(usable))
    y = usable[value_col].values
    fit = stats.linregress(x, y)

    if fit.slope > 0:
        trend = "upward"
    elif fit.slope < 0:
        trend = "downward"
    else:
        # A zero slope is neither upward nor downward.
        trend = "flat"

    return {
        "slope": fit.slope,
        "intercept": fit.intercept,
        "r_squared": fit.rvalue ** 2,
        "p_value": fit.pvalue,
        "trend": trend,
        "significant": fit.pvalue < 0.05,
    }