capstone_backend / folder1 /utils /data_processing.py
logeswari's picture
msg
79d167d
import pandas as pd
# Numeric score for each textual performance rating. Keys are casefolded so
# lookups tolerate differences in capitalisation.
_PERFORMANCE_SCORE_MAP = {
    "exceeds": 5,
    "fully meets": 4,
    "needs improvement": 3,
    "pip": 2,
}


def _clean_performance_score(value):
    """Map one raw performance rating to a number, or None if unrecognised."""
    # Values that are already numeric (including NaN) pass through untouched.
    if isinstance(value, (int, float)):
        return value
    if isinstance(value, str):
        return _PERFORMANCE_SCORE_MAP.get(value.strip().casefold())
    return None


def _age_in_years(dob: pd.Series) -> pd.Series:
    """Return completed calendar years between each date in *dob* and today."""
    today = pd.Timestamp.today().normalize()
    # A birthday has been reached this year when (month, day) <= today's.
    # Comparisons against NaT components are False, and the subtraction below
    # then yields NaN for those rows.
    birthday_reached = (dob.dt.month < today.month) | (
        (dob.dt.month == today.month) & (dob.dt.day <= today.day)
    )
    return today.year - dob.dt.year - (~birthday_reached).astype(int)


def preprocess_data(data: pd.DataFrame) -> pd.DataFrame:
    """Clean and preprocess HR analytics data.

    The frame is modified in place and the same object is returned.

    Steps, each applied only when the relevant column exists:

    * 'Survey Date', 'StartDate' and 'DOB' are parsed to datetimes;
      unparseable entries become NaT.
    * 'Age' is derived from 'DOB' as completed calendar years as of today
      (NaN where 'DOB' is missing).
    * 'Performance Score' labels ("Exceeds", "Fully Meets",
      "Needs Improvement", "PIP") are mapped to 5, 4, 3 and 2. Matching
      ignores surrounding whitespace and letter case. Numeric values are
      kept as they are; anything else becomes NaN.

    Args:
        data: Raw HR records.

    Returns:
        The same DataFrame, with the conversions above applied.
    """
    # Convert date columns
    for col in ('Survey Date', 'StartDate', 'DOB'):
        if col in data.columns:
            data[col] = pd.to_datetime(data[col], errors='coerce')

    # Calculate Age from DOB. Dividing elapsed days by 365 ignores leap days
    # and reports a birthday a few days early, so compare calendar fields.
    if 'DOB' in data.columns:
        data['Age'] = _age_in_years(data['DOB'])

    # Convert Performance Score to Numeric
    if 'Performance Score' in data.columns:
        scores = data['Performance Score'].apply(_clean_performance_score)
        data['Performance Score'] = pd.to_numeric(scores, errors='coerce')

    return data