# kpi_analysis/scripts/test_step3.py
# Uploaded by zh3036
# Deploy KPI snapshot 2025-06-12
# Commit 4e67a93
#!/usr/bin/env python3
"""
Test script for Step 3: Testing correlation analysis robustness
This script tests if the analysis correctly handles:
- Partial email matches
- Empty values in KPI data
- Reporting of matched vs. calculated emails
Usage:
python3 test_step3.py [-k <kpi_file>]
Examples:
python3 test_step3.py # Uses default test_kpi.csv
python3 test_step3.py -k "../../data/Copy of 联想 kpi copy.xlsx"
"""
import argparse
import os
import subprocess
import sys

import pandas as pd
import yaml
def run_test(kpi_file=None):
    """Run the correlation analysis on a KPI file and report the results.

    Prints a summary of the KPI input (row count and empty IPM cells), runs
    ``analyze_correlations_v2.py`` as a subprocess, echoes its output, and
    then summarises the YAML report the analysis wrote.

    The analysis script, the scores file and the output file are all
    addressed relative to this script's directory, so the test behaves the
    same regardless of the caller's working directory.

    Args:
        kpi_file: Path to the KPI file to analyse. When ``None``, the
            ``test_kpi.csv`` file next to this script is used.

    Returns:
        bool: ``True`` when the analysis exited with status 0 and produced
        its output file; ``False`` otherwise, including when the KPI file
        does not exist.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    using_default_fixture = kpi_file is None

    # Use provided KPI file or default to test_kpi.csv
    if using_default_fixture:
        test_kpi_file = os.path.join(script_dir, "test_kpi.csv")
    else:
        test_kpi_file = kpi_file
    scores_file = os.path.join(script_dir, "../../data/lenovo-scores-0603.csv")
    output_file = os.path.join(script_dir, "test_step3_output.yaml")
    analysis_script = os.path.join(script_dir, "analyze_correlations_v2.py")

    print("=== STEP 3 TEST: Correlation Analysis Robustness ===")
    print(f"Test KPI file: {test_kpi_file}")
    print(f"Scores file: {scores_file}")
    print(f"Output file: {output_file}\n")

    # Bail out early: nothing below can work without the KPI input.
    if not os.path.exists(test_kpi_file):
        print(f"ERROR: KPI file not found: {test_kpi_file}")
        return False

    # Project-local helpers; imported lazily so the missing-file check above
    # does not depend on them.
    from csv_utils import robust_csv_loader, find_ipm_columns

    test_kpi_df = robust_csv_loader(test_kpi_file, required_columns=['Email'])
    print("Test KPI file summary:")
    print(f" - Total rows: {len(test_kpi_df)}")

    # Find IPM columns dynamically
    # NOTE(review): assumes find_ipm_columns always returns two existing
    # column names; confirm how it signals a missing column.
    fy2425_ipm_col, fy2324_ipm_col = find_ipm_columns(test_kpi_df)

    def _empty_mask(column):
        """Return a boolean mask of cells that are NaN or whitespace-only."""
        values = test_kpi_df[column]
        return values.isna() | (values.astype(str).str.strip() == '')

    empty_fy2324 = _empty_mask(fy2324_ipm_col)
    empty_fy2425 = _empty_mask(fy2425_ipm_col)
    both_empty = empty_fy2324 & empty_fy2425
    print(f" - Rows with empty {fy2324_ipm_col}: {empty_fy2324.sum()}")
    print(f" - Rows with empty {fy2425_ipm_col}: {empty_fy2425.sum()}")
    print(f" - Rows with both IPM columns empty: {both_empty.sum()}")

    # For the default test file, we know about nonexistent emails
    if using_default_fixture:
        print(" - Emails with 'nonexistent' (not in scores): 3")
    print()

    # Remove any report left by an earlier run so a failed analysis cannot be
    # mistaken for a fresh result.
    if os.path.exists(output_file):
        os.remove(output_file)

    # Run with the interpreter executing this test (it has the dependencies
    # installed) and with absolute paths, so the caller's working directory
    # does not matter.
    cmd = [
        sys.executable or "python3",
        analysis_script,
        "-k", test_kpi_file,
        "-s", scores_file,
        "-o", output_file,
    ]
    print("Running correlation analysis...")
    print(f"Command: {' '.join(cmd)}")
    print("\n" + "=" * 60 + "\n")

    result = subprocess.run(cmd, capture_output=True, text=True)

    print("SCRIPT OUTPUT:")
    print(result.stdout)
    if result.stderr:
        print("\nERRORS:")
        print(result.stderr)
    print("\n" + "=" * 60 + "\n")

    # Without a report there is nothing to analyse, and the run cannot count
    # as a success even if the exit status was 0.
    if not os.path.exists(output_file):
        print("ERROR: Output file was not created!")
        return False

    with open(output_file, 'r', encoding='utf-8') as f:
        results = yaml.safe_load(f)

    print("=== TEST RESULTS ANALYSIS ===")
    print(f"Total matched emails: {results['metadata']['total_matched_emails']}")
    # Only show expected count for default test file
    if using_default_fixture:
        print("Expected matched emails: ~16 (19 in test KPI - 3 nonexistent)")
    print()

    print("Correlation results by pair:")
    for pair_name, pair_data in results['correlations'].items():
        print(f"\n{pair_name}:")
        if 'data_quality' in pair_data:
            dq = pair_data['data_quality']
            print(f" - Initial records: {dq['initial_records']}")
            print(f" - Valid records: {dq['valid_records']}")
            print(f" - Completion rate: {dq['completion_rate']}")
        if pair_data['pearson']['correlation'] is not None:
            print(f" - Pearson r: {pair_data['pearson']['correlation']:.4f}")
            print(f" - Spearman ρ: {pair_data['spearman']['correlation']:.4f}")
        else:
            print(" - Correlations: Not computed (insufficient data)")

    succeeded = result.returncode == 0
    print("\n=== TEST CONCLUSION ===")
    if succeeded:
        print("The script correctly:")
        print("✓ Identified matched emails between the two files")
        print("✓ Reported how many emails were used for each correlation")
        print("✓ Handled empty values appropriately")
        print("✓ Showed completion rates for each correlation pair")
        # Additional note for Excel files (suffix compared case-insensitively)
        if kpi_file and kpi_file.lower().endswith(('.xls', '.xlsx', '.xlsm')):
            print("✓ Successfully processed Excel file format")
    else:
        # Do not claim success when the analysis reported a failure.
        print(f"ERROR: Analysis exited with status {result.returncode}")
    return succeeded
def main():
    """Parse command-line options, run the test, and exit with its status.

    Accepts an optional ``-k/--kpi`` path that is passed to ``run_test``;
    when omitted, ``run_test`` receives ``None``. The process exits with
    status 0 when ``run_test`` returns a truthy value and 1 otherwise.
    """
    parser = argparse.ArgumentParser(
        description='Test correlation analysis robustness with different KPI files',
        # Raw formatter keeps the line breaks of the examples in the epilog.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            'Examples:\n'
            'python3 test_step3.py # Use default test_kpi.csv\n'
            'python3 test_step3.py -k "../../data/Copy of 联想 kpi copy.xlsx" # Use specific Excel file\n'
            'python3 test_step3.py -k custom_kpi.csv # Use custom CSV file'
        ),
    )
    parser.add_argument('-k', '--kpi', dest='kpi_file',
                        help='Path to the KPI file to test (CSV or Excel format)')
    args = parser.parse_args()

    # Run the test with the specified KPI file
    success = run_test(args.kpi_file)

    # sys.exit instead of the bare exit() builtin: exit() is installed by the
    # site module and is not guaranteed to exist in every run mode.
    sys.exit(0 if success else 1)
# Run the test only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()