Spaces:

zh3036
/

kpi_analysis

Sleeping

App Files Files Community

kpi_analysis / scripts /verify_email_matching.py

zh3036

Deploy KPI snapshot 2025-06-12

4e67a93 10 months ago

raw

history blame contribute delete

3.8 kB

	#!/usr/bin/env python3
	"""
	Verification script to confirm email matching works correctly with actual data
	"""

	import pandas as pd
	from csv_utils import robust_csv_loader, find_ipm_columns
	from kpi_correlation_app import convert_percentage_to_numeric

	def verify_email_matching():
	"""Verify email matching with actual data files"""

	print("=" * 70)
	print("Email Matching Verification with Actual Data")
	print("=" * 70)

	# Load the actual files
	print("\n1. Loading actual data files...")

	scores_df = pd.read_csv('lenovo-scores-0603.csv')
	kpi_df = robust_csv_loader('lenovo_kpi_filled.csv')

	print(f"Scores file: {len(scores_df)} records")
	print(f"KPI file: {len(kpi_df)} records")

	# Find IPM columns
	fy2425_col, fy2324_col = find_ipm_columns(kpi_df)
	print(f"\nIPM columns found:")
	print(f" FY23/24: {fy2324_col}")
	print(f" FY24/25: {fy2425_col}")

	# Normalize emails
	print("\n2. Normalizing emails for matching...")
	scores_df['email_normalized'] = scores_df['email'].str.lower()
	kpi_df['email_normalized'] = kpi_df['Email'].str.lower()

	# Merge datasets
	print("\n3. Merging datasets...")
	merged_df = pd.merge(scores_df, kpi_df, on='email_normalized', how='inner')
	print(f"Matched {len(merged_df)} emails")

	# Test with specific emails
	print("\n4. Testing specific email matches:")

	# Get first 3 matched emails for demonstration
	test_emails = merged_df['email_normalized'].head(3).tolist()

	for email in test_emails:
	row = merged_df[merged_df['email_normalized'] == email].iloc[0]

	print(f"\n{email}:")
	print(f" From scores file:")
	print(f" - problem_score: {row['problem_score']}")
	print(f" - ability_score: {row['ability_score']}")
	print(f" From KPI file:")
	print(f" - Original email: {row['Email']}")
	print(f" - {fy2324_col}: {row[fy2324_col]}")
	print(f" - {fy2425_col}: {row[fy2425_col]}")

	# Convert and show numeric values
	if pd.notna(row[fy2324_col]) and pd.notna(row[fy2425_col]):
	fy2324_numeric = convert_percentage_to_numeric(pd.Series([row[fy2324_col]]))[0]
	fy2425_numeric = convert_percentage_to_numeric(pd.Series([row[fy2425_col]]))[0]
	print(f" Converted IPM values:")
	print(f" - FY23/24: {row[fy2324_col]} → {fy2324_numeric:.3f}")
	print(f" - FY24/25: {row[fy2425_col]} → {fy2425_numeric:.3f}")

	# Show some statistics
	print("\n5. Email matching statistics:")

	# Check for case differences
	case_differences = 0
	for _, row in merged_df.iterrows():
	if row['email'] != row['Email']:
	case_differences += 1

	print(f" - Emails with case differences: {case_differences}")
	print(f" - Percentage of emails matched: {len(merged_df)/len(scores_df)*100:.1f}%")

	# Show unmatched emails count
	unmatched_scores = len(scores_df) - len(merged_df)
	unmatched_kpi = len(kpi_df) - len(merged_df)
	print(f" - Unmatched emails in scores: {unmatched_scores}")
	print(f" - Unmatched emails in KPI: {unmatched_kpi}")

	return merged_df


	if __name__ == '__main__':
	try:
	merged_df = verify_email_matching()
	print("\n\n✅ Verification complete!")
	print("\nConclusion: The email matching correctly:")
	print("1. Matches emails case-insensitively")
	print("2. Associates the correct values from both files")
	print("3. Converts percentage strings to numeric values")
	print("4. Handles multi-level column headers in the KPI file")
	except Exception as e:
	print(f"\n❌ Error during verification: {str(e)}")