#!/usr/bin/env python3
"""
Core correlation analysis functionality shared between Gradio app and CLI interface
"""
import pandas as pd
import numpy as np
from scipy import stats
import os
from datetime import datetime
# Import utility functions
from csv_utils import robust_csv_loader, find_email_column, find_ipm_columns
def convert_percentage_to_numeric(series):
    """Coerce a series of percentage strings (e.g. '45%') to fractional floats.

    Object-dtype series are treated as percentage text: a trailing '%' is
    stripped and the value divided by 100. Already-numeric series are coerced
    as-is, without rescaling. Unparseable entries become NaN.
    """
    # Numeric dtypes need no '%' handling — coerce and return directly.
    if series.dtype != 'object':
        return pd.to_numeric(series, errors='coerce')
    stripped = series.astype(str).str.rstrip('%')
    return pd.to_numeric(stripped, errors='coerce') / 100
def load_and_merge_data(kpi_file, scores_file):
    """
    Load KPI and scores files and merge them on email.

    Args:
        kpi_file: Path to KPI file (CSV or Excel)
        scores_file: Path to scores file (CSV)

    Returns:
        tuple: (kpi_df, scores_df, merged_df, kpi_email_col, fy2425_ipm_col, fy2324_ipm_col)

    Raises:
        ValueError: If no email column can be located in the KPI file.
    """
    # Load both sources through the shared robust loader.
    kpi_df = robust_csv_loader(kpi_file, required_columns=['Email'])
    scores_df = robust_csv_loader(scores_file, required_columns=['email', 'problem_score', 'ability_score'])

    # Auto-detect the email column in the KPI data.
    kpi_email_col = find_email_column(kpi_df)
    if not kpi_email_col:
        raise ValueError("Could not find email column in KPI file")

    # Auto-detect the two IPM columns.
    fy2425_ipm_col, fy2324_ipm_col = find_ipm_columns(kpi_df)

    # Normalize emails for matching: strip surrounding whitespace in addition
    # to lowercasing, so entries like " User@X.com " still join correctly.
    kpi_df['email_normalized'] = kpi_df[kpi_email_col].str.strip().str.lower()
    scores_df['email_normalized'] = scores_df['email'].str.strip().str.lower()

    # Inner join keeps only people present in both files.
    merged_df = pd.merge(scores_df, kpi_df, on='email_normalized', how='inner')
    return kpi_df, scores_df, merged_df, kpi_email_col, fy2425_ipm_col, fy2324_ipm_col
def analyze_data_quality(kpi_df, scores_df, merged_df, kpi_email_col='Email'):
    """
    Analyze and report data quality statistics.

    Args:
        kpi_df: Raw KPI dataframe.
        scores_df: Raw scores dataframe (must contain an 'email' column).
        merged_df: Result of merging the two sources on normalized email.
        kpi_email_col: Name of the email column in kpi_df.

    Returns:
        dict: Data quality statistics — record counts, email-set overlap,
        and match rates expressed as percentages.
    """
    # Named 'quality_stats' (not 'stats') to avoid shadowing the scipy
    # 'stats' module imported at module level.
    quality_stats = {
        'kpi_records': len(kpi_df),
        'scores_records': len(scores_df),
        'matched_emails': len(merged_df)
    }

    # Compare normalized (stripped, lowercased) email sets from both sources;
    # missing emails are dropped before the comparison.
    kpi_emails = set(kpi_df[kpi_email_col].str.strip().str.lower().dropna())
    scores_emails = set(scores_df['email'].str.strip().str.lower().dropna())
    common_emails = kpi_emails & scores_emails

    quality_stats.update({
        'common_emails': len(common_emails),
        'emails_only_in_kpi': len(kpi_emails - scores_emails),
        'emails_only_in_scores': len(scores_emails - kpi_emails),
        # Percentage of each source's emails found in the other source;
        # guarded against empty sets to avoid ZeroDivisionError.
        'match_rate_kpi': len(common_emails) / len(kpi_emails) * 100 if kpi_emails else 0,
        'match_rate_scores': len(common_emails) / len(scores_emails) * 100 if scores_emails else 0
    })
    return quality_stats
def calculate_correlations(merged_df, fy2324_ipm_col, fy2425_ipm_col):
    """
    Calculate Pearson and Spearman correlations for the four score/IPM pairs.

    Pairs: problem_score vs each IPM year ('AC', 'AD') and ability_score vs
    each IPM year ('BC', 'BD').

    Args:
        merged_df: Merged dataframe with both scores and KPI data
        fy2324_ipm_col: Column name for FY23/24 IPM
        fy2425_ipm_col: Column name for FY24/25 IPM

    Returns:
        tuple: (results_dict, pairs_plot_data)
    """
    # (pair key, score column, IPM column, x label, y label)
    pair_specs = [
        ('AC', 'problem_score', fy2324_ipm_col, 'problem_score', 'FY23/24 IPM'),
        ('AD', 'problem_score', fy2425_ipm_col, 'problem_score', 'FY24/25 IPM'),
        ('BC', 'ability_score', fy2324_ipm_col, 'ability_score', 'FY23/24 IPM'),
        ('BD', 'ability_score', fy2425_ipm_col, 'ability_score', 'FY24/25 IPM'),
    ]

    results = {}
    plot_data = {}

    for key, score_col, ipm_col, x_label, y_label in pair_specs:
        # Coerce the score to numeric; the IPM column may hold percentage
        # strings, so route it through the shared converter.
        score_vals = pd.to_numeric(merged_df[score_col], errors='coerce')
        ipm_vals = convert_percentage_to_numeric(merged_df[ipm_col])

        # Pair up the two series and keep only rows where both are present.
        paired = pd.DataFrame({'var1': score_vals, 'var2': ipm_vals})
        total = len(paired)
        paired = paired.dropna()
        n_valid = len(paired)

        completion = f"{n_valid / total * 100:.1f}%" if total > 0 else "0%"
        results[key] = {
            'pearson': {'correlation': None, 'p_value': None, 'n_samples': n_valid},
            'spearman': {'correlation': None, 'p_value': None, 'n_samples': n_valid},
            'data_quality': {
                'initial_records': total,
                'valid_records': n_valid,
                'completion_rate': completion
            }
        }

        plot_data[key] = {
            'x_data': paired['var1'].values if n_valid > 0 else [],
            'y_data': paired['var2'].values if n_valid > 0 else [],
            'x_label': x_label,
            'y_label': y_label,
            'pearson_corr': None,
            'spearman_corr': None,
            'n_samples': n_valid
        }

        # Correlations are only computed with at least 3 paired observations.
        if n_valid >= 3:
            pearson_corr, pearson_p = stats.pearsonr(paired['var1'], paired['var2'])
            spearman_corr, spearman_p = stats.spearmanr(paired['var1'], paired['var2'])
            results[key]['pearson']['correlation'] = float(pearson_corr)
            results[key]['pearson']['p_value'] = float(pearson_p)
            results[key]['spearman']['correlation'] = float(spearman_corr)
            results[key]['spearman']['p_value'] = float(spearman_p)
            plot_data[key]['pearson_corr'] = float(pearson_corr)
            plot_data[key]['spearman_corr'] = float(spearman_corr)

    return results, plot_data
def analyze_correlations_full(kpi_file, scores_file):
    """
    Run the complete correlation analysis pipeline.

    Args:
        kpi_file: Path to KPI file
        scores_file: Path to scores file

    Returns:
        tuple: (data_quality_stats, correlation_results, plot_data, column_info)
    """
    # Step 1: load both sources and join them on normalized email.
    (kpi_df, scores_df, merged_df,
     kpi_email_col, fy2425_ipm_col, fy2324_ipm_col) = load_and_merge_data(kpi_file, scores_file)

    # Step 2: summarize how well the two sources matched up.
    data_quality_stats = analyze_data_quality(kpi_df, scores_df, merged_df, kpi_email_col)

    # Step 3: compute the four score/IPM correlation pairs.
    correlation_results, plot_data = calculate_correlations(merged_df, fy2324_ipm_col, fy2425_ipm_col)

    # Record which columns were auto-detected, for downstream reporting.
    column_info = {
        'kpi_email_col': kpi_email_col,
        'fy2425_ipm_col': fy2425_ipm_col,
        'fy2324_ipm_col': fy2324_ipm_col
    }
    return data_quality_stats, correlation_results, plot_data, column_info