kpi_analysis / scripts /verify_email_matching.py
zh3036's picture
Deploy KPI snapshot 2025-06-12
4e67a93
#!/usr/bin/env python3
"""
Verification script to confirm email matching works correctly with actual data
"""
import pandas as pd
from csv_utils import robust_csv_loader, find_ipm_columns
from kpi_correlation_app import convert_percentage_to_numeric
def verify_email_matching():
"""Verify email matching with actual data files"""
print("=" * 70)
print("Email Matching Verification with Actual Data")
print("=" * 70)
# Load the actual files
print("\n1. Loading actual data files...")
scores_df = pd.read_csv('lenovo-scores-0603.csv')
kpi_df = robust_csv_loader('lenovo_kpi_filled.csv')
print(f"Scores file: {len(scores_df)} records")
print(f"KPI file: {len(kpi_df)} records")
# Find IPM columns
fy2425_col, fy2324_col = find_ipm_columns(kpi_df)
print(f"\nIPM columns found:")
print(f" FY23/24: {fy2324_col}")
print(f" FY24/25: {fy2425_col}")
# Normalize emails
print("\n2. Normalizing emails for matching...")
scores_df['email_normalized'] = scores_df['email'].str.lower()
kpi_df['email_normalized'] = kpi_df['Email'].str.lower()
# Merge datasets
print("\n3. Merging datasets...")
merged_df = pd.merge(scores_df, kpi_df, on='email_normalized', how='inner')
print(f"Matched {len(merged_df)} emails")
# Test with specific emails
print("\n4. Testing specific email matches:")
# Get first 3 matched emails for demonstration
test_emails = merged_df['email_normalized'].head(3).tolist()
for email in test_emails:
row = merged_df[merged_df['email_normalized'] == email].iloc[0]
print(f"\n{email}:")
print(f" From scores file:")
print(f" - problem_score: {row['problem_score']}")
print(f" - ability_score: {row['ability_score']}")
print(f" From KPI file:")
print(f" - Original email: {row['Email']}")
print(f" - {fy2324_col}: {row[fy2324_col]}")
print(f" - {fy2425_col}: {row[fy2425_col]}")
# Convert and show numeric values
if pd.notna(row[fy2324_col]) and pd.notna(row[fy2425_col]):
fy2324_numeric = convert_percentage_to_numeric(pd.Series([row[fy2324_col]]))[0]
fy2425_numeric = convert_percentage_to_numeric(pd.Series([row[fy2425_col]]))[0]
print(f" Converted IPM values:")
print(f" - FY23/24: {row[fy2324_col]}{fy2324_numeric:.3f}")
print(f" - FY24/25: {row[fy2425_col]}{fy2425_numeric:.3f}")
# Show some statistics
print("\n5. Email matching statistics:")
# Check for case differences
case_differences = 0
for _, row in merged_df.iterrows():
if row['email'] != row['Email']:
case_differences += 1
print(f" - Emails with case differences: {case_differences}")
print(f" - Percentage of emails matched: {len(merged_df)/len(scores_df)*100:.1f}%")
# Show unmatched emails count
unmatched_scores = len(scores_df) - len(merged_df)
unmatched_kpi = len(kpi_df) - len(merged_df)
print(f" - Unmatched emails in scores: {unmatched_scores}")
print(f" - Unmatched emails in KPI: {unmatched_kpi}")
return merged_df
if __name__ == '__main__':
try:
merged_df = verify_email_matching()
print("\n\n✅ Verification complete!")
print("\nConclusion: The email matching correctly:")
print("1. Matches emails case-insensitively")
print("2. Associates the correct values from both files")
print("3. Converts percentage strings to numeric values")
print("4. Handles multi-level column headers in the KPI file")
except Exception as e:
print(f"\n❌ Error during verification: {str(e)}")