#!/usr/bin/env python3
"""
KPI Correlation Analysis Script v2
Analyzes correlations between IPM scores and axiia scores with improved robustness
Usage:
python3 analyze_correlations_v2.py -k <kpi_file> -s <scores_file> [-o <output_file>]
Examples:
python3 analyze_correlations_v2.py -k ../../data/lenovo_kpi.csv -s ../../data/lenovo-scores-0603.csv -o score_corr.yaml
python3 analyze_correlations_v2.py -k kpi.csv -s score.csv -o scr.yaml
"""
import pandas as pd
import numpy as np
import yaml
import os
import argparse
import sys
import matplotlib.pyplot as plt
import seaborn as sns
# Import core analysis functions
from correlation_analysis_core import (
analyze_correlations_full,
convert_percentage_to_numeric
)
def print_data_quality_report(data_quality_stats):
    """Print record counts and email-matching statistics to stdout.

    Args:
        data_quality_stats: dict with keys 'kpi_records', 'scores_records',
            'matched_emails', 'common_emails', 'emails_only_in_kpi',
            'emails_only_in_scores', 'match_rate_kpi', 'match_rate_scores'.
    """
    stats = data_quality_stats
    report_lines = [
        "",
        "=== DATA QUALITY REPORT ===",
        f"KPI file: {stats['kpi_records']} records",
        f"Scores file: {stats['scores_records']} records",
        f"Matched emails: {stats['matched_emails']} records",
        "",
        "Email matching statistics:",
        f" - Common emails: {stats['common_emails']}",
        f" - Emails only in KPI file: {stats['emails_only_in_kpi']}",
        f" - Emails only in Scores file: {stats['emails_only_in_scores']}",
        f" - Match rate (KPI perspective): {stats['match_rate_kpi']:.1f}%",
        f" - Match rate (Scores perspective): {stats['match_rate_scores']:.1f}%",
    ]
    print("\n".join(report_lines))
def create_correlation_plots(pairs_data, output_dir=None):
    """Create scatter plots (with trend lines) for each correlation pair.

    Produces one combined 2x2 grid figure plus one larger standalone figure
    per pair, saves them all as PNGs, then shows the combined figure.

    Args:
        pairs_data: mapping of pair name -> dict with keys 'x_data',
            'y_data', 'x_label', 'y_label', 'pearson_corr',
            'spearman_corr', 'n_samples'.
        output_dir: directory for the PNG files; defaults to the current
            working directory when None.
    """
    # Apply the style BEFORE creating any axes; the original called
    # sns.set_style after plt.subplots, so the grid axes missed it.
    sns.set_style("whitegrid")

    def _out_path(filename):
        # Resolve a filename inside output_dir (or cwd when None).
        return os.path.join(output_dir, filename) if output_dir else filename

    # Combined 2x2 grid. The original indexed axes[idx] unguarded, which
    # raised IndexError for more than 4 pairs and left blank axes visible
    # for fewer than 4.
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    axes = axes.flatten()
    for idx, (pair_name, data_dict) in enumerate(pairs_data.items()):
        if idx >= len(axes):
            break  # more pairs than grid slots; overflow gets only an individual plot
        _plot_pair_on_axes(axes[idx], pair_name, data_dict, small=True)
    for unused_ax in axes[len(pairs_data):]:
        unused_ax.set_visible(False)
    plt.tight_layout()

    plot_filename = _out_path('correlation_plots.png')
    plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
    print(f"\nCorrelation plots saved to: {plot_filename}")

    # Also save individual (larger) plots, one file per pair.
    for pair_name, data_dict in pairs_data.items():
        plt.figure(figsize=(8, 6))
        _plot_pair_on_axes(plt.gca(), pair_name, data_dict, small=False)
        plt.savefig(_out_path(f'correlation_{pair_name}.png'),
                    dpi=300, bbox_inches='tight')
        plt.close()

    # Show the remaining open figure (the combined grid).
    plt.show()


def _plot_pair_on_axes(ax, pair_name, data_dict, small=True):
    """Draw one pair's scatter, trend line, and stats box onto *ax*.

    Args:
        ax: matplotlib Axes to draw on.
        pair_name: short pair identifier used in the title (e.g. 'AC').
        data_dict: per-pair data (see create_correlation_plots).
        small: use the compact font sizes of the combined 2x2 grid;
            False gives the larger sizes used by the standalone figures.
    """
    x_data = data_dict['x_data']
    y_data = data_dict['y_data']
    x_label = data_dict['x_label']
    y_label = data_dict['y_label']
    pearson_corr = data_dict['pearson_corr']
    spearman_corr = data_dict['spearman_corr']
    n_samples = data_dict['n_samples']

    ax.scatter(x_data, y_data, alpha=0.6, s=50)

    # Linear trend line. polyfit needs >= 2 points for a degree-1 fit;
    # the original guarded only against empty data, so a single point
    # produced an ill-conditioned fit.
    if len(x_data) >= 2:
        coeffs = np.polyfit(x_data, y_data, 1)
        trend = np.poly1d(coeffs)
        xs_sorted = sorted(x_data)
        ax.plot(xs_sorted, trend(xs_sorted), "r--", alpha=0.8, linewidth=2)

    label_size, title_size = (10, 12) if small else (12, 14)
    ax.set_xlabel(x_label, fontsize=label_size)
    ax.set_ylabel(y_label, fontsize=label_size)
    ax.set_title(f'{pair_name}: {x_label} vs {y_label}',
                 fontsize=title_size, fontweight='bold')

    # Correlation stats box in the upper-left corner.
    if pearson_corr is not None:
        corr_text = f'Pearson r = {pearson_corr:.3f}\nSpearman ρ = {spearman_corr:.3f}\nn = {n_samples}'
    else:
        corr_text = f'Insufficient data\nn = {n_samples}'
    ax.text(0.05, 0.95, corr_text, transform=ax.transAxes,
            verticalalignment='top',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    # NOTE(review): IPM values appear to be fractions (formatted with
    # '{:.0%}') — confirm against convert_percentage_to_numeric.
    if 'IPM' in y_label:
        ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: '{:.0%}'.format(y)))
def main():
    """CLI entry point for the KPI correlation analysis.

    Parses command-line arguments, validates the input CSV paths, delegates
    the analysis to correlation_analysis_core.analyze_correlations_full,
    prints data-quality and correlation reports, writes the results as YAML
    next to this script, and optionally generates scatter plots.

    Exits with status 1 on missing input files, analysis failure, or when
    no emails match between the two files.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Analyze correlations between KPI IPM scores and axiia scores',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''Examples:
python3 analyze_correlations_v2.py -k ../../data/lenovo_kpi.csv -s ../../data/lenovo-scores-0603.csv -o score_corr.yaml
python3 analyze_correlations_v2.py -k kpi.csv -s score.csv -o scr.yaml'''
    )
    parser.add_argument('-k', '--kpi', required=True, dest='kpi_file',
                        help='Path to the KPI CSV file')
    parser.add_argument('-s', '--scores', required=True, dest='scores_file',
                        help='Path to the scores CSV file')
    parser.add_argument('-o', '--output', default='score_corr.yaml',
                        help='Output YAML file name (default: score_corr.yaml)')
    parser.add_argument('-p', '--plot', action='store_true',
                        help='Generate correlation plots')
    args = parser.parse_args()
    # Validate input files exist before doing any work
    if not os.path.exists(args.kpi_file):
        print(f"Error: KPI file not found: {args.kpi_file}")
        sys.exit(1)
    if not os.path.exists(args.scores_file):
        print(f"Error: Scores file not found: {args.scores_file}")
        sys.exit(1)
    # Load and analyze data using core functions
    print("Loading data files...")
    try:
        # Use core analysis function; returns quality stats, per-pair
        # correlation results, per-pair plot data, and detected column names
        data_quality_stats, correlation_results, plot_data, column_info = analyze_correlations_full(
            args.kpi_file, args.scores_file
        )
    except Exception as e:
        # Broad catch is deliberate: any core-analysis failure should abort
        # the CLI with a readable message instead of a traceback
        print(f"Error during analysis: {str(e)}")
        sys.exit(1)
    # Print the auto-detected column names used for matching/analysis
    print(f"\nUsing columns:")
    print(f" Email column: {column_info['kpi_email_col']}")
    print(f" FY24/25 IPM: {column_info['fy2425_ipm_col']}")
    print(f" FY23/24 IPM: {column_info['fy2324_ipm_col']}")
    # Print data quality report
    print_data_quality_report(data_quality_stats)
    if data_quality_stats['matched_emails'] == 0:
        # Nothing to correlate without overlapping emails
        print("\nError: No matching emails found between the two files!")
        sys.exit(1)
    # Print correlation analysis results.
    # NOTE(review): pair names ('AC'..'BD') appear to index specific
    # IPM-vs-score column pairings — confirm in correlation_analysis_core.
    print("\n=== CORRELATION ANALYSIS ===")
    for pair_name in ['AC', 'AD', 'BC', 'BD']:
        if pair_name in correlation_results:
            corr_data = correlation_results[pair_name]
            pd_data = plot_data[pair_name]
            print(f"\nProcessing {pair_name}: {pd_data['x_label']} vs {pd_data['y_label']}")
            print(f" Initial records: {corr_data['data_quality']['initial_records']}")
            print(f" Valid data points: {corr_data['data_quality']['valid_records']}")
            print(f" Completion rate: {corr_data['data_quality']['completion_rate']}")
            if corr_data['pearson']['correlation'] is not None:
                print(f" Pearson correlation: {corr_data['pearson']['correlation']:.4f} (p={corr_data['pearson']['p_value']:.4f})")
                print(f" Spearman correlation: {corr_data['spearman']['correlation']:.4f} (p={corr_data['spearman']['p_value']:.4f})")
            else:
                # Core signals "too few points" with a None correlation
                print(f" Warning: Not enough valid data points for {pair_name}")
    # Create results dictionary for YAML output
    results = {
        'metadata': {
            'kpi_file': os.path.basename(args.kpi_file),
            'scores_file': os.path.basename(args.scores_file),
            'total_matched_emails': data_quality_stats['matched_emails'],
            'analysis_timestamp': pd.Timestamp.now().isoformat()
        },
        'correlations': correlation_results
    }
    # Save results to YAML file. The output path is joined onto the script's
    # directory, so a relative -o lands next to this script, not the cwd
    # (an absolute -o is kept as-is by os.path.join)
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_file = os.path.join(script_dir, args.output)
    with open(output_file, 'w') as f:
        yaml.dump(results, f, default_flow_style=False, sort_keys=False)
    print(f"\nResults saved to {output_file}")
    # Print summary (condensed restatement of the analysis above)
    print("\n=== CORRELATION SUMMARY ===")
    for pair_name in ['AC', 'AD', 'BC', 'BD']:
        if pair_name in correlation_results:
            corr_data = correlation_results[pair_name]
            print(f"\n{pair_name}:")
            print(f" Valid samples: {corr_data['data_quality']['valid_records']} / {corr_data['data_quality']['initial_records']} ({corr_data['data_quality']['completion_rate']})")
            if corr_data['pearson']['correlation'] is not None:
                print(f" Pearson: {corr_data['pearson']['correlation']:.4f}")
                print(f" Spearman: {corr_data['spearman']['correlation']:.4f}")
            else:
                print(f" Pearson: N/A")
                print(f" Spearman: N/A")
    # Create correlation plots (opt-in via -p/--plot); PNGs are saved
    # alongside this script
    if args.plot:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        create_correlation_plots(plot_data, output_dir=script_dir)
if __name__ == "__main__":
    # Run the CLI entry point only when executed as a script, not on import.
    main()