Spaces:

zh3036
/

kpi_analysis

Sleeping

App Files Files Community

kpi_analysis / scripts /test_step3.py

zh3036

Deploy KPI snapshot 2025-06-12

4e67a93 10 months ago

raw

history blame contribute delete

5.97 kB

	#!/usr/bin/env python3
	"""
	Test script for Step 3: Testing correlation analysis robustness
	This script tests if the analysis correctly handles:
	- Partial email matches
	- Empty values in KPI data
	- Reporting of matched vs. calculated emails

	Usage:
	python3 test_step3.py [-k <kpi_file>]

	Examples:
	python3 test_step3.py # Uses default test_kpi.csv
	python3 test_step3.py -k "../../data/Copy of 联想 kpi copy.xlsx"
	"""

	import subprocess
	import os
	import yaml
	import pandas as pd
	import argparse

	def run_test(kpi_file=None):
	    """Run the correlation analysis on a KPI file and report the results.

	    Prints a short summary of the KPI file (row count and empty IPM
	    values), runs ``analyze_correlations_v2.py`` as a subprocess, echoes
	    its stdout/stderr, and then summarises the YAML results it wrote.

	    Args:
	        kpi_file: Path to the KPI file to test. When ``None``, the
	            ``test_kpi.csv`` located next to this script is used.

	    Returns:
	        ``True`` if the analysis exited with status 0 and produced a
	        non-empty results file; ``False`` otherwise (including when the
	        KPI file does not exist).
	    """
	    import sys  # local import: only needed to locate the running interpreter

	    # Define file paths
	    script_dir = os.path.dirname(os.path.abspath(__file__))

	    # Use provided KPI file or default to test_kpi.csv
	    if kpi_file is None:
	        test_kpi_file = os.path.join(script_dir, "test_kpi.csv")
	    else:
	        test_kpi_file = kpi_file

	    scores_file = os.path.join(script_dir, "../../data/lenovo-scores-0603.csv")
	    output_file = os.path.join(script_dir, "test_step3_output.yaml")
	    # Resolve the analysis script against this file's directory so the test
	    # behaves the same regardless of the current working directory.
	    analysis_script = os.path.join(script_dir, "analyze_correlations_v2.py")

	    print("=== STEP 3 TEST: Correlation Analysis Robustness ===")
	    print(f"Test KPI file: {test_kpi_file}")
	    print(f"Scores file: {scores_file}")
	    print(f"Output file: {output_file}\n")

	    # Check if KPI file exists
	    if not os.path.exists(test_kpi_file):
	        print(f"ERROR: KPI file not found: {test_kpi_file}")
	        return False

	    # Load test KPI file to show summary.  The import is deferred so a
	    # missing KPI file is reported before the project helper is required.
	    from csv_utils import robust_csv_loader, find_ipm_columns
	    test_kpi_df = robust_csv_loader(test_kpi_file, required_columns=['Email'])
	    print("Test KPI file summary:")
	    print(f" - Total rows: {len(test_kpi_df)}")

	    # Find IPM columns dynamically
	    fy2425_ipm_col, fy2324_ipm_col = find_ipm_columns(test_kpi_df)

	    # Count rows with empty IPM values: a cell counts as empty when it is
	    # NaN or contains only whitespace.
	    empty_fy2324 = test_kpi_df[fy2324_ipm_col].isna() | (
	        test_kpi_df[fy2324_ipm_col].astype(str).str.strip() == ''
	    )
	    empty_fy2425 = test_kpi_df[fy2425_ipm_col].isna() | (
	        test_kpi_df[fy2425_ipm_col].astype(str).str.strip() == ''
	    )
	    both_empty = empty_fy2324 & empty_fy2425

	    print(f" - Rows with empty {fy2324_ipm_col}: {empty_fy2324.sum()}")
	    print(f" - Rows with empty {fy2425_ipm_col}: {empty_fy2425.sum()}")
	    print(f" - Rows with both IPM columns empty: {both_empty.sum()}")

	    # For the default test file, we know about nonexistent emails
	    if kpi_file is None:
	        print(" - Emails with 'nonexistent' (not in scores): 3")
	    print()

	    # Remove any output left over from a previous run so that a failed
	    # analysis cannot be mistaken for a successful one.
	    try:
	        os.remove(output_file)
	    except FileNotFoundError:
	        pass

	    # Run the correlation analysis with the same interpreter that runs this
	    # test, writing to the exact path that is inspected afterwards.
	    cmd = [
	        sys.executable or "python3",
	        analysis_script,
	        "-k", test_kpi_file,
	        "-s", scores_file,
	        "-o", output_file,
	    ]

	    print("Running correlation analysis...")
	    print(f"Command: {' '.join(cmd)}")
	    print("\n" + "="*60 + "\n")

	    # Execute the command
	    result = subprocess.run(cmd, capture_output=True, text=True)

	    # Print the output
	    print("SCRIPT OUTPUT:")
	    print(result.stdout)

	    if result.stderr:
	        print("\nERRORS:")
	        print(result.stderr)

	    print("\n" + "="*60 + "\n")

	    # Load and analyze the results
	    if not os.path.exists(output_file):
	        print("ERROR: Output file was not created!")
	        return False

	    with open(output_file, 'r') as f:
	        results = yaml.safe_load(f)

	    # safe_load returns None for an empty document.
	    if not results:
	        print("ERROR: Output file is empty!")
	        return False

	    print("=== TEST RESULTS ANALYSIS ===")
	    print(f"Total matched emails: {results['metadata']['total_matched_emails']}")

	    # Only show expected count for default test file
	    if kpi_file is None:
	        print("Expected matched emails: ~16 (19 in test KPI - 3 nonexistent)")
	    print()

	    print("Correlation results by pair:")
	    for pair_name, pair_data in results['correlations'].items():
	        print(f"\n{pair_name}:")
	        if 'data_quality' in pair_data:
	            dq = pair_data['data_quality']
	            print(f" - Initial records: {dq['initial_records']}")
	            print(f" - Valid records: {dq['valid_records']}")
	            print(f" - Completion rate: {dq['completion_rate']}")

	        pearson_r = pair_data['pearson']['correlation']
	        if pearson_r is None:
	            print(" - Correlations: Not computed (insufficient data)")
	            continue

	        print(f" - Pearson r: {pearson_r:.4f}")
	        # Spearman may be missing independently of Pearson; formatting
	        # None with ':.4f' would raise.
	        spearman_rho = pair_data['spearman']['correlation']
	        if spearman_rho is None:
	            print(" - Spearman ρ: Not computed")
	        else:
	            print(f" - Spearman ρ: {spearman_rho:.4f}")

	    print("\n=== TEST CONCLUSION ===")
	    if result.returncode != 0:
	        # Do not claim success when the analysis itself reported failure.
	        print(f"WARNING: analysis exited with status {result.returncode}")
	        return False

	    print("The script correctly:")
	    print("✓ Identified matched emails between the two files")
	    print("✓ Reported how many emails were used for each correlation")
	    print("✓ Handled empty values appropriately")
	    print("✓ Showed completion rates for each correlation pair")

	    # Additional note for Excel files
	    if kpi_file and kpi_file.lower().endswith(('.xls', '.xlsx', '.xlsm')):
	        print("✓ Successfully processed Excel file format")

	    return True

	def main():
	    """Parse command-line arguments, run the test, and exit with its status.

	    Accepts an optional ``-k/--kpi`` path that is passed to ``run_test``.
	    The process exits with status 0 when ``run_test`` returns a truthy
	    value and 1 otherwise.
	    """
	    # Parse command line arguments
	    parser = argparse.ArgumentParser(
	        description='Test correlation analysis robustness with different KPI files',
	        formatter_class=argparse.RawDescriptionHelpFormatter,
	        epilog='''Examples:
	python3 test_step3.py # Use default test_kpi.csv
	python3 test_step3.py -k "../../data/Copy of 联想 kpi copy.xlsx" # Use specific Excel file
	python3 test_step3.py -k custom_kpi.csv # Use custom CSV file'''
	    )
	    parser.add_argument('-k', '--kpi', dest='kpi_file',
	                        help='Path to the KPI file to test (CSV or Excel format)')

	    args = parser.parse_args()

	    # Run the test with the specified KPI file
	    success = run_test(args.kpi_file)
	    # Raise SystemExit directly: the bare exit() helper is injected by the
	    # site module for interactive use and is not guaranteed to exist.
	    raise SystemExit(0 if success else 1)

	# Run the test only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()