Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| KPI Correlation Analysis Script v2 | |
| Analyzes correlations between IPM scores and axiia scores with improved robustness | |
| Usage: | |
| python3 analyze_correlations_v2.py -k <kpi_file> -s <scores_file> [-o <output_file>] | |
| Examples: | |
| python3 analyze_correlations_v2.py -k ../../data/lenovo_kpi.csv -s ../../data/lenovo-scores-0603.csv -o score_corr.yaml | |
| python3 analyze_correlations_v2.py -k kpi.csv -s score.csv -o scr.yaml | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
| import yaml | |
| import os | |
| import argparse | |
| import sys | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| # Import core analysis functions | |
| from correlation_analysis_core import ( | |
| analyze_correlations_full, | |
| convert_percentage_to_numeric | |
| ) | |
def print_data_quality_report(data_quality_stats):
    """Write a human-readable data-quality summary to stdout.

    Args:
        data_quality_stats: dict produced by the core analysis with record
            counts and email match statistics (keys: 'kpi_records',
            'scores_records', 'matched_emails', 'common_emails',
            'emails_only_in_kpi', 'emails_only_in_scores',
            'match_rate_kpi', 'match_rate_scores').
    """
    stats = data_quality_stats  # short alias keeps the f-strings readable
    report_lines = [
        "\n=== DATA QUALITY REPORT ===",
        f"KPI file: {stats['kpi_records']} records",
        f"Scores file: {stats['scores_records']} records",
        f"Matched emails: {stats['matched_emails']} records",
        "\nEmail matching statistics:",
        f" - Common emails: {stats['common_emails']}",
        f" - Emails only in KPI file: {stats['emails_only_in_kpi']}",
        f" - Emails only in Scores file: {stats['emails_only_in_scores']}",
        f" - Match rate (KPI perspective): {stats['match_rate_kpi']:.1f}%",
        f" - Match rate (Scores perspective): {stats['match_rate_scores']:.1f}%",
    ]
    # One joined print emits the exact same bytes as line-by-line prints.
    print("\n".join(report_lines))
def _plot_pair(ax, pair_name, data_dict, label_fontsize=10, title_fontsize=12):
    """Draw one correlation scatter (trend line + stats box) onto ``ax``.

    Args:
        ax: matplotlib Axes to draw on.
        pair_name: short pair identifier used in the title (e.g. 'AC').
        data_dict: dict with 'x_data', 'y_data', 'x_label', 'y_label',
            'pearson_corr', 'spearman_corr', 'n_samples'.
        label_fontsize: font size for the axis labels.
        title_fontsize: font size for the subplot title.
    """
    x_data = data_dict['x_data']
    y_data = data_dict['y_data']
    x_label = data_dict['x_label']
    y_label = data_dict['y_label']
    pearson_corr = data_dict['pearson_corr']
    spearman_corr = data_dict['spearman_corr']
    n_samples = data_dict['n_samples']

    ax.scatter(x_data, y_data, alpha=0.6, s=50)

    # A degree-1 polyfit needs at least two points; the previous `> 0`
    # guard let np.polyfit run (ill-conditioned) on a single sample.
    if len(x_data) > 1:
        coeffs = np.polyfit(x_data, y_data, 1)
        trend = np.poly1d(coeffs)
        xs = sorted(x_data)  # sort once, reuse for both x and y of the line
        ax.plot(xs, trend(xs), "r--", alpha=0.8, linewidth=2)

    ax.set_xlabel(x_label, fontsize=label_fontsize)
    ax.set_ylabel(y_label, fontsize=label_fontsize)
    ax.set_title(f'{pair_name}: {x_label} vs {y_label}',
                 fontsize=title_fontsize, fontweight='bold')

    # Annotate with correlation stats (or a notice when there are too few points).
    if pearson_corr is not None:
        corr_text = f'Pearson r = {pearson_corr:.3f}\nSpearman ρ = {spearman_corr:.3f}\nn = {n_samples}'
    else:
        corr_text = f'Insufficient data\nn = {n_samples}'
    ax.text(0.05, 0.95, corr_text, transform=ax.transAxes,
            verticalalignment='top',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    # IPM columns hold fractional values; render that axis as percentages.
    if 'IPM' in y_label:
        ax.yaxis.set_major_formatter(
            plt.FuncFormatter(lambda y, _: '{:.0%}'.format(y)))


def create_correlation_plots(pairs_data, output_dir=None):
    """Create a combined 2x2 grid plot and one individual plot per pair.

    Saves 'correlation_plots.png' (grid) and 'correlation_<pair>.png'
    (one per pair) into ``output_dir`` (or the current directory when
    ``output_dir`` is None), then shows the combined figure.

    Args:
        pairs_data: mapping of pair name -> plot-data dict (see _plot_pair).
        output_dir: optional directory for the PNG files.
    """
    sns.set_style("whitegrid")

    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    axes = axes.flatten()

    for idx, (pair_name, data_dict) in enumerate(pairs_data.items()):
        # Previously axes[idx] would raise IndexError for a 5th pair;
        # the grid now shows at most 4 pairs (individual plots cover all).
        if idx >= len(axes):
            break
        _plot_pair(axes[idx], pair_name, data_dict,
                   label_fontsize=10, title_fontsize=12)

    # Hide grid cells left empty when there are fewer than 4 pairs.
    for ax in axes[len(pairs_data):]:
        ax.set_visible(False)

    plt.tight_layout()

    if output_dir:
        plot_filename = os.path.join(output_dir, 'correlation_plots.png')
    else:
        plot_filename = 'correlation_plots.png'
    plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
    print(f"\nCorrelation plots saved to: {plot_filename}")

    # Individual per-pair figures (closed after saving to free memory).
    for pair_name, data_dict in pairs_data.items():
        fig_individual = plt.figure(figsize=(8, 6))
        _plot_pair(fig_individual.gca(), pair_name, data_dict,
                   label_fontsize=12, title_fontsize=14)

        if output_dir:
            individual_filename = os.path.join(output_dir, f'correlation_{pair_name}.png')
        else:
            individual_filename = f'correlation_{pair_name}.png'
        plt.savefig(individual_filename, dpi=300, bbox_inches='tight')
        plt.close(fig_individual)

    # Show the combined grid figure (still open) in interactive sessions.
    plt.show()
# Fixed order in which correlation pairs are reported.
_PAIR_NAMES = ('AC', 'AD', 'BC', 'BD')


def _build_arg_parser():
    """Build the argument parser for the correlation-analysis CLI."""
    parser = argparse.ArgumentParser(
        description='Analyze correlations between KPI IPM scores and axiia scores',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''Examples:
python3 analyze_correlations_v2.py -k ../../data/lenovo_kpi.csv -s ../../data/lenovo-scores-0603.csv -o score_corr.yaml
python3 analyze_correlations_v2.py -k kpi.csv -s score.csv -o scr.yaml'''
    )
    parser.add_argument('-k', '--kpi', required=True, dest='kpi_file',
                        help='Path to the KPI CSV file')
    parser.add_argument('-s', '--scores', required=True, dest='scores_file',
                        help='Path to the scores CSV file')
    parser.add_argument('-o', '--output', default='score_corr.yaml',
                        help='Output YAML file name (default: score_corr.yaml)')
    parser.add_argument('-p', '--plot', action='store_true',
                        help='Generate correlation plots')
    return parser


def _print_correlation_details(correlation_results, plot_data):
    """Print per-pair processing details: sample counts and coefficients."""
    print("\n=== CORRELATION ANALYSIS ===")
    for pair_name in _PAIR_NAMES:
        if pair_name not in correlation_results:
            continue
        corr_data = correlation_results[pair_name]
        pd_data = plot_data[pair_name]
        print(f"\nProcessing {pair_name}: {pd_data['x_label']} vs {pd_data['y_label']}")
        print(f" Initial records: {corr_data['data_quality']['initial_records']}")
        print(f" Valid data points: {corr_data['data_quality']['valid_records']}")
        print(f" Completion rate: {corr_data['data_quality']['completion_rate']}")
        if corr_data['pearson']['correlation'] is not None:
            print(f" Pearson correlation: {corr_data['pearson']['correlation']:.4f} (p={corr_data['pearson']['p_value']:.4f})")
            print(f" Spearman correlation: {corr_data['spearman']['correlation']:.4f} (p={corr_data['spearman']['p_value']:.4f})")
        else:
            print(f" Warning: Not enough valid data points for {pair_name}")


def _print_summary(correlation_results):
    """Print the condensed per-pair correlation summary."""
    print("\n=== CORRELATION SUMMARY ===")
    for pair_name in _PAIR_NAMES:
        if pair_name not in correlation_results:
            continue
        corr_data = correlation_results[pair_name]
        print(f"\n{pair_name}:")
        print(f" Valid samples: {corr_data['data_quality']['valid_records']} / {corr_data['data_quality']['initial_records']} ({corr_data['data_quality']['completion_rate']})")
        if corr_data['pearson']['correlation'] is not None:
            print(f" Pearson: {corr_data['pearson']['correlation']:.4f}")
            print(f" Spearman: {corr_data['spearman']['correlation']:.4f}")
        else:
            print(" Pearson: N/A")
            print(" Spearman: N/A")


def main():
    """CLI entry point: validate inputs, run the analysis, report and save results.

    Exits with status 1 on missing input files, analysis failure, or when no
    emails match between the two input files.
    """
    args = _build_arg_parser().parse_args()

    # Fail fast on missing inputs; errors now go to stderr, not stdout.
    for label, path in (('KPI', args.kpi_file), ('Scores', args.scores_file)):
        if not os.path.exists(path):
            print(f"Error: {label} file not found: {path}", file=sys.stderr)
            sys.exit(1)

    print("Loading data files...")
    try:
        # Core analysis returns quality stats, per-pair correlations,
        # per-pair plot data, and the resolved column names.
        data_quality_stats, correlation_results, plot_data, column_info = analyze_correlations_full(
            args.kpi_file, args.scores_file
        )
    except Exception as e:
        print(f"Error during analysis: {str(e)}", file=sys.stderr)
        sys.exit(1)

    print("\nUsing columns:")
    print(f" Email column: {column_info['kpi_email_col']}")
    print(f" FY24/25 IPM: {column_info['fy2425_ipm_col']}")
    print(f" FY23/24 IPM: {column_info['fy2324_ipm_col']}")

    print_data_quality_report(data_quality_stats)

    if data_quality_stats['matched_emails'] == 0:
        print("\nError: No matching emails found between the two files!", file=sys.stderr)
        sys.exit(1)

    _print_correlation_details(correlation_results, plot_data)

    results = {
        'metadata': {
            'kpi_file': os.path.basename(args.kpi_file),
            'scores_file': os.path.basename(args.scores_file),
            'total_matched_emails': data_quality_stats['matched_emails'],
            'analysis_timestamp': pd.Timestamp.now().isoformat()
        },
        'correlations': correlation_results
    }

    # The output file is written next to this script, not the CWD (original
    # behavior, kept). Compute script_dir once — it was computed twice before.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_file = os.path.join(script_dir, args.output)
    with open(output_file, 'w') as f:
        yaml.dump(results, f, default_flow_style=False, sort_keys=False)
    print(f"\nResults saved to {output_file}")

    _print_summary(correlation_results)

    if args.plot:
        create_correlation_plots(plot_data, output_dir=script_dir)
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()