File size: 5,970 Bytes
4e67a93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/env python3
"""
Test script for Step 3: Testing correlation analysis robustness
This script tests if the analysis correctly handles:
- Partial email matches
- Empty values in KPI data
- Reporting of matched vs. calculated emails

Usage:
    python3 test_step3.py [-k <kpi_file>]
    
Examples:
    python3 test_step3.py  # Uses default test_kpi.csv
    python3 test_step3.py -k "../../data/Copy of 联想 kpi copy.xlsx"
"""

import subprocess
import os
import yaml
import pandas as pd
import argparse

def _print_kpi_summary(kpi_path, is_default_file):
    """Print the row count and empty-IPM-value counts for the KPI file.

    Args:
        kpi_path: Path of the KPI file to load.
        is_default_file: True when the bundled default test file is in use,
            in which case the known number of 'nonexistent' emails is shown.
    """
    # Project-local helpers, imported at call time as in the original script.
    from csv_utils import robust_csv_loader, find_ipm_columns

    test_kpi_df = robust_csv_loader(kpi_path, required_columns=['Email'])
    print("Test KPI file summary:")
    print(f"  - Total rows: {len(test_kpi_df)}")

    # Find IPM columns dynamically
    fy2425_ipm_col, fy2324_ipm_col = find_ipm_columns(test_kpi_df)

    def empty_mask(column):
        # A cell counts as empty when it is NaN or only whitespace.
        series = test_kpi_df[column]
        return series.isna() | (series.astype(str).str.strip() == '')

    empty_fy2324 = empty_mask(fy2324_ipm_col)
    empty_fy2425 = empty_mask(fy2425_ipm_col)
    both_empty = empty_fy2324 & empty_fy2425

    print(f"  - Rows with empty {fy2324_ipm_col}: {empty_fy2324.sum()}")
    print(f"  - Rows with empty {fy2425_ipm_col}: {empty_fy2425.sum()}")
    print(f"  - Rows with both IPM columns empty: {both_empty.sum()}")

    # For the default test file, we know about nonexistent emails
    if is_default_file:
        print("  - Emails with 'nonexistent' (not in scores): 3")
    print()


def _print_results_analysis(results, kpi_file):
    """Print matched-email totals and per-pair correlation details.

    Args:
        results: Mapping loaded from the analysis output YAML; read via its
            'metadata' and 'correlations' keys.
        kpi_file: The KPI path passed to ``run_test`` (None for the default).
    """
    print("=== TEST RESULTS ANALYSIS ===")
    print(f"Total matched emails: {results['metadata']['total_matched_emails']}")

    # Only show expected count for default test file
    if kpi_file is None:
        print("Expected matched emails: ~16 (19 in test KPI - 3 nonexistent)")
    print()

    print("Correlation results by pair:")
    for pair_name, pair_data in results['correlations'].items():
        print(f"\n{pair_name}:")
        if 'data_quality' in pair_data:
            dq = pair_data['data_quality']
            print(f"  - Initial records: {dq['initial_records']}")
            print(f"  - Valid records: {dq['valid_records']}")
            print(f"  - Completion rate: {dq['completion_rate']}")

        if pair_data['pearson']['correlation'] is not None:
            print(f"  - Pearson r: {pair_data['pearson']['correlation']:.4f}")
            print(f"  - Spearman ρ: {pair_data['spearman']['correlation']:.4f}")
        else:
            print("  - Correlations: Not computed (insufficient data)")

    print("\n=== TEST CONCLUSION ===")
    print("The script correctly:")
    print("✓ Identified matched emails between the two files")
    print("✓ Reported how many emails were used for each correlation")
    print("✓ Handled empty values appropriately")
    print("✓ Showed completion rates for each correlation pair")

    # Additional note for Excel files
    if kpi_file and kpi_file.endswith(('.xls', '.xlsx', '.xlsm')):
        print("✓ Successfully processed Excel file format")


def run_test(kpi_file=None):
    """Run the correlation analysis with test data and report results.

    Args:
        kpi_file: Path to the KPI file to analyse. When None, the
            ``test_kpi.csv`` located next to this script is used.

    Returns:
        True when the analysis subprocess exited with status 0 and produced
        an output file containing a YAML mapping; False otherwise, including
        when the KPI file does not exist.
    """
    import sys

    # Define file paths
    script_dir = os.path.dirname(os.path.abspath(__file__))

    # Use provided KPI file or default to test_kpi.csv
    if kpi_file is None:
        test_kpi_file = os.path.join(script_dir, "test_kpi.csv")
    else:
        test_kpi_file = kpi_file

    scores_file = os.path.join(script_dir, "../../data/lenovo-scores-0603.csv")
    output_file = os.path.join(script_dir, "test_step3_output.yaml")
    # NOTE(review): assumes the analysis script sits next to this test
    # script, like the other script_dir-relative paths above - confirm.
    analysis_script = os.path.join(script_dir, "analyze_correlations_v2.py")

    print("=== STEP 3 TEST: Correlation Analysis Robustness ===")
    print(f"Test KPI file: {test_kpi_file}")
    print(f"Scores file: {scores_file}")
    print(f"Output file: {output_file}\n")

    # Check if KPI file exists
    if not os.path.exists(test_kpi_file):
        print(f"ERROR: KPI file not found: {test_kpi_file}")
        return False

    # Load test KPI file to show summary
    _print_kpi_summary(test_kpi_file, kpi_file is None)

    # Remove any output left by a previous run so that a failed analysis
    # cannot be mistaken for a successful one by reading stale results.
    try:
        os.remove(output_file)
    except FileNotFoundError:
        pass

    # Run the correlation analysis. Absolute paths keep the command and the
    # output check below in agreement regardless of the current directory,
    # and sys.executable reuses the interpreter running this test.
    cmd = [
        sys.executable or "python3",
        analysis_script,
        "-k", test_kpi_file,
        "-s", scores_file,
        "-o", output_file,
    ]

    print("Running correlation analysis...")
    print(f"Command: {' '.join(cmd)}")
    print("\n" + "="*60 + "\n")

    # Execute the command
    result = subprocess.run(cmd, capture_output=True, text=True)

    # Print the output
    print("SCRIPT OUTPUT:")
    print(result.stdout)

    if result.stderr:
        print("\nERRORS:")
        print(result.stderr)

    print("\n" + "="*60 + "\n")

    # Load and analyze the results
    if not os.path.exists(output_file):
        print("ERROR: Output file was not created!")
        return False

    with open(output_file, 'r', encoding='utf-8') as f:
        results = yaml.safe_load(f)

    # safe_load returns None for an empty document; anything that is not a
    # mapping cannot be reported on.
    if not isinstance(results, dict):
        print("ERROR: Output file is empty or not a YAML mapping!")
        return False

    _print_results_analysis(results, kpi_file)

    return result.returncode == 0

def main():
    """Parse command-line arguments, run the test, and exit with its status.

    Accepts an optional ``-k/--kpi`` path. The process exits with status 0
    when ``run_test`` returns a truthy value and 1 otherwise.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Test correlation analysis robustness with different KPI files',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''Examples:
    python3 test_step3.py                                       # Use default test_kpi.csv
    python3 test_step3.py -k "../../data/Copy of 联想 kpi copy.xlsx"  # Use specific Excel file
    python3 test_step3.py -k custom_kpi.csv                    # Use custom CSV file'''
    )
    parser.add_argument('-k', '--kpi', dest='kpi_file',
                       help='Path to the KPI file to test (CSV or Excel format)')

    args = parser.parse_args()

    # Run the test with the specified KPI file
    success = run_test(args.kpi_file)

    # The bare ``exit`` name is injected by the ``site`` module for
    # interactive use and is not guaranteed to exist (e.g. ``python -S``);
    # raising SystemExit sets the same exit status without relying on it.
    raise SystemExit(0 if success else 1)

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()