"""
Statistical analysis and insights generation
"""
import pandas as pd
import numpy as np
from scipy import stats
from typing import Dict, Tuple, Union


def calculate_descriptive_stats(df: pd.DataFrame, column: str) -> Dict:
    """
    Calculate descriptive statistics for a column

    Args:
        df: Input DataFrame
        column: Column name

    Returns:
        Dictionary with the keys "count", "mean", "median", "std", "min",
        "25%", "75%", "max", "skewness" and "kurtosis"
    """
    # Look the column up once instead of once per statistic.
    series = df[column]
    return {
        "count": series.count(),
        "mean": series.mean(),
        "median": series.median(),
        "std": series.std(),
        "min": series.min(),
        "25%": series.quantile(0.25),
        "75%": series.quantile(0.75),
        "max": series.max(),
        "skewness": series.skew(),
        "kurtosis": series.kurtosis(),
    }


def correlation_analysis(df: pd.DataFrame, method: str = "pearson") -> pd.DataFrame:
    """
    Perform correlation analysis

    Non-numeric columns are excluded before the correlation is computed.

    Args:
        df: Input DataFrame
        method: 'pearson', 'spearman', or 'kendall'

    Returns:
        Correlation matrix of the numeric columns
    """
    numeric_df = df.select_dtypes(include=[np.number])
    return numeric_df.corr(method=method)


def hypothesis_testing(
    group1: pd.Series,
    group2: pd.Series,
    test_type: str = "ttest",
    *,
    alpha: float = 0.05,
) -> Dict:
    """
    Perform hypothesis testing between two groups

    Missing values are dropped from each group before testing.

    Args:
        group1: First group data
        group2: Second group data
        test_type: 'ttest' (or its alias 't-test') for an independent
            t-test, or 'mannwhitneyu' for a Mann-Whitney U test
        alpha: Significance level the p-value is compared against

    Returns:
        Dictionary with the keys "test", "statistic", "p_value" and
        "significant". An empty dictionary is returned when test_type is
        not one of the supported values.
    """
    if test_type in ("ttest", "t-test"):
        label, test_func = "Independent t-test", stats.ttest_ind
    elif test_type == "mannwhitneyu":
        label, test_func = "Mann-Whitney U Test", stats.mannwhitneyu
    else:
        # Unsupported test types have always produced an empty result;
        # keep that contract so existing callers are not broken.
        return {}

    statistic, p_value = test_func(group1.dropna(), group2.dropna())
    return {
        "test": label,
        "statistic": statistic,
        "p_value": p_value,
        "significant": p_value < alpha,
    }


def anova_test(groups: list, *, alpha: float = 0.05) -> Dict:
    """
    Perform ANOVA test

    Missing values are dropped from each group before testing.

    Args:
        groups: List of group data Series
        alpha: Significance level the p-value is compared against

    Returns:
        Dictionary with the keys "test", "f_statistic", "p_value" and
        "significant"
    """
    clean_groups = [g.dropna() for g in groups]
    f_stat, p_value = stats.f_oneway(*clean_groups)
    return {
        "test": "ANOVA",
        "f_statistic": f_stat,
        "p_value": p_value,
        "significant": p_value < alpha,
    }


def chi_square_test(contingency_table: pd.DataFrame, *, alpha: float = 0.05) -> Dict:
    """
    Perform Chi-square test for independence

    Args:
        contingency_table: Contingency table (DataFrame)
        alpha: Significance level the p-value is compared against

    Returns:
        Dictionary with the keys "test", "statistic", "p_value",
        "degrees_of_freedom" and "significant"
    """
    # The expected-frequency table is not part of the reported result.
    chi2, p_value, dof, _ = stats.chi2_contingency(contingency_table)
    return {
        "test": "Chi-square",
        "statistic": chi2,
        "p_value": p_value,
        "degrees_of_freedom": dof,
        "significant": p_value < alpha,
    }


def trend_analysis(
    df: pd.DataFrame,
    time_col: str,
    value_col: str,
    *,
    alpha: float = 0.05,
) -> Dict:
    """
    Perform simple trend analysis

    Rows are sorted by time_col and the values are regressed on their
    ordinal position (0, 1, 2, ...), not on the time values themselves, so
    the slope is expressed per row step. Rows whose value is NaN are left
    out of the fit but keep their position, so the remaining points are
    not shifted.

    Args:
        df: Input DataFrame
        time_col: Column name for time/date
        value_col: Column name for values
        alpha: Significance level the p-value is compared against

    Returns:
        Dictionary with the keys "slope", "intercept", "r_squared",
        "p_value", "trend" ("upward" when the slope is positive, otherwise
        "downward") and "significant"
    """
    df_sorted = df.sort_values(time_col)
    positions = np.arange(len(df_sorted))
    values = np.asarray(df_sorted[value_col], dtype=float)

    # A single NaN would turn every regression output into NaN (and the
    # trend into a spurious "downward"), so fit only the observed points.
    observed = ~np.isnan(values)
    slope, intercept, r_value, p_value, _ = stats.linregress(
        positions[observed], values[observed]
    )
    return {
        "slope": slope,
        "intercept": intercept,
        "r_squared": r_value**2,
        "p_value": p_value,
        "trend": "upward" if slope > 0 else "downward",
        "significant": p_value < alpha,
    }