File size: 7,078 Bytes
4e67a93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/usr/bin/env python3
"""
Core correlation analysis functionality shared between Gradio app and CLI interface
"""

import pandas as pd
import numpy as np
from scipy import stats
import os
from datetime import datetime

# Import utility functions
from csv_utils import robust_csv_loader, find_email_column, find_ipm_columns


def convert_percentage_to_numeric(series):
    """Convert percentage strings to numeric values."""
    if series.dtype == 'object':
        return pd.to_numeric(series.astype(str).str.rstrip('%'), errors='coerce') / 100
    else:
        return pd.to_numeric(series, errors='coerce')


def load_and_merge_data(kpi_file, scores_file):
    """
    Load KPI and scores files and merge them on email.
    
    Args:
        kpi_file: Path to KPI file (CSV or Excel)
        scores_file: Path to scores file (CSV)
    
    Returns:
        tuple: (kpi_df, scores_df, merged_df, kpi_email_col, fy2425_ipm_col, fy2324_ipm_col)
    """
    # Load KPI file
    kpi_df = robust_csv_loader(kpi_file, required_columns=['Email'])
    
    # Load scores file
    scores_df = robust_csv_loader(scores_file, required_columns=['email', 'problem_score', 'ability_score'])
    
    # Find email column in KPI dataframe
    kpi_email_col = find_email_column(kpi_df)
    if not kpi_email_col:
        raise ValueError("Could not find email column in KPI file")
    
    # Find IPM columns
    fy2425_ipm_col, fy2324_ipm_col = find_ipm_columns(kpi_df)
    
    # Normalize email columns
    kpi_df['email_normalized'] = kpi_df[kpi_email_col].str.lower()
    scores_df['email_normalized'] = scores_df['email'].str.lower()
    
    # Merge datasets
    merged_df = pd.merge(scores_df, kpi_df, on='email_normalized', how='inner')
    
    return kpi_df, scores_df, merged_df, kpi_email_col, fy2425_ipm_col, fy2324_ipm_col


def analyze_data_quality(kpi_df, scores_df, merged_df, kpi_email_col='Email'):
    """
    Analyze and report data quality statistics.
    
    Returns:
        dict: Data quality statistics
    """
    stats = {
        'kpi_records': len(kpi_df),
        'scores_records': len(scores_df),
        'matched_emails': len(merged_df)
    }
    
    # Calculate match rate
    kpi_emails = set(kpi_df[kpi_email_col].str.lower().dropna())
    scores_emails = set(scores_df['email'].str.lower().dropna())
    
    common_emails = kpi_emails.intersection(scores_emails)
    kpi_only = kpi_emails - scores_emails
    scores_only = scores_emails - kpi_emails
    
    stats.update({
        'common_emails': len(common_emails),
        'emails_only_in_kpi': len(kpi_only),
        'emails_only_in_scores': len(scores_only),
        'match_rate_kpi': len(common_emails)/len(kpi_emails)*100 if len(kpi_emails) > 0 else 0,
        'match_rate_scores': len(common_emails)/len(scores_emails)*100 if len(scores_emails) > 0 else 0
    })
    
    return stats


def calculate_correlations(merged_df, fy2324_ipm_col, fy2425_ipm_col):
    """
    Calculate correlations for all pairs.
    
    Args:
        merged_df: Merged dataframe with both scores and KPI data
        fy2324_ipm_col: Column name for FY23/24 IPM
        fy2425_ipm_col: Column name for FY24/25 IPM
    
    Returns:
        tuple: (results_dict, pairs_plot_data)
    """
    # Extract columns
    A = merged_df['problem_score']
    B = merged_df['ability_score']
    C = merged_df[fy2324_ipm_col]
    D = merged_df[fy2425_ipm_col]
    
    # Define pairs
    pairs = [
        ('AC', A, C, 'problem_score', 'FY23/24 IPM'),
        ('AD', A, D, 'problem_score', 'FY24/25 IPM'),
        ('BC', B, C, 'ability_score', 'FY23/24 IPM'),
        ('BD', B, D, 'ability_score', 'FY24/25 IPM')
    ]
    
    results = {}
    plot_data = {}
    
    # Calculate correlations for each pair
    for pair_name, series1, series2, name1, name2 in pairs:
        # Clean data
        series1_clean = pd.to_numeric(series1, errors='coerce')
        series2_clean = convert_percentage_to_numeric(series2)
        
        # Create dataframe and drop NaN
        pair_df = pd.DataFrame({
            'var1': series1_clean,
            'var2': series2_clean
        })
        
        initial_count = len(pair_df)
        pair_df_clean = pair_df.dropna()
        valid_points = len(pair_df_clean)
        
        # Initialize result
        results[pair_name] = {
            'pearson': {
                'correlation': None,
                'p_value': None,
                'n_samples': valid_points
            },
            'spearman': {
                'correlation': None,
                'p_value': None,
                'n_samples': valid_points
            },
            'data_quality': {
                'initial_records': initial_count,
                'valid_records': valid_points,
                'completion_rate': f"{valid_points/initial_count*100:.1f}%" if initial_count > 0 else "0%"
            }
        }
        
        # Store plot data
        plot_data[pair_name] = {
            'x_data': pair_df_clean['var1'].values if valid_points > 0 else [],
            'y_data': pair_df_clean['var2'].values if valid_points > 0 else [],
            'x_label': name1,
            'y_label': name2,
            'pearson_corr': None,
            'spearman_corr': None,
            'n_samples': valid_points
        }
        
        # Calculate correlations if enough data
        if valid_points >= 3:
            pearson_corr, pearson_p = stats.pearsonr(pair_df_clean['var1'], pair_df_clean['var2'])
            spearman_corr, spearman_p = stats.spearmanr(pair_df_clean['var1'], pair_df_clean['var2'])
            
            results[pair_name]['pearson']['correlation'] = float(pearson_corr)
            results[pair_name]['pearson']['p_value'] = float(pearson_p)
            results[pair_name]['spearman']['correlation'] = float(spearman_corr)
            results[pair_name]['spearman']['p_value'] = float(spearman_p)
            
            plot_data[pair_name]['pearson_corr'] = float(pearson_corr)
            plot_data[pair_name]['spearman_corr'] = float(spearman_corr)
    
    return results, plot_data


def analyze_correlations_full(kpi_file, scores_file):
    """
    Complete correlation analysis pipeline.
    
    Args:
        kpi_file: Path to KPI file
        scores_file: Path to scores file
    
    Returns:
        tuple: (data_quality_stats, correlation_results, plot_data, column_info)
    """
    # Load and merge data
    kpi_df, scores_df, merged_df, kpi_email_col, fy2425_ipm_col, fy2324_ipm_col = load_and_merge_data(kpi_file, scores_file)
    
    # Get data quality stats
    data_quality_stats = analyze_data_quality(kpi_df, scores_df, merged_df, kpi_email_col)
    
    # Calculate correlations
    correlation_results, plot_data = calculate_correlations(merged_df, fy2324_ipm_col, fy2425_ipm_col)
    
    # Column info for reference
    column_info = {
        'kpi_email_col': kpi_email_col,
        'fy2425_ipm_col': fy2425_ipm_col,
        'fy2324_ipm_col': fy2324_ipm_col
    }
    
    return data_quality_stats, correlation_results, plot_data, column_info