# Provenance: akwel_performance / src/preprocessor.py
# (originally uploaded by ArkenB, commit 62a4c11, "Create preprocessor.py")
# src/preprocessor.py
import pandas as pd
import numpy as np
# Helper function to recursively convert numpy types to standard Python types
def convert_numpy_types(data):
    """Recursively converts numpy types in nested data structures to standard Python types."""
    if isinstance(data, dict):
        # Keys can be numpy scalars too, so both sides are converted.
        return {convert_numpy_types(key): convert_numpy_types(value)
                for key, value in data.items()}
    if isinstance(data, list):
        return [convert_numpy_types(item) for item in data]
    # Numpy scalar families map directly onto Python builtins.
    for numpy_family, builtin in ((np.integer, int),
                                  (np.floating, float),
                                  (np.bool_, bool)):
        if isinstance(data, numpy_family):
            return builtin(data)
    if isinstance(data, np.ndarray):
        # Arrays become (possibly nested) lists, then converted element-wise.
        return convert_numpy_types(data.tolist())
    if isinstance(data, (str, int, float, bool, type(None))):
        return data  # already a plain Python type
    # Last resort for other numpy-like scalars: .item() yields the Python equivalent.
    try:
        if hasattr(data, 'item'):
            return data.item()
    except Exception:
        pass
    # Fallback: return string representation if unsure
    print(f"Warning: Type {type(data)} not explicitly handled in convert_numpy_types. Converting to string.")
    return str(data)
def transform_wide_to_long(df_wide: pd.DataFrame) -> pd.DataFrame:
    """
    Transforms a wide DataFrame (subjects as columns) to a long DataFrame
    (Subject, Feedback_Stars, Instructor_Rating columns).

    Assumes subject columns are named like 'SubjectName_Feedback_Stars' and
    'SubjectName_Instructor_Rating', and that a 'Department' column exists.

    Args:
        df_wide: Wide-format survey data. The caller's DataFrame is not modified.

    Returns:
        Long-format DataFrame with the id columns plus 'Subject',
        'Feedback_Stars' and 'Instructor_Rating'. Rows where both rating
        values are missing are dropped.
    """
    id_col_options = ['EmployeeID', 'Employee ID', 'ID']  # Common names for an ID column
    id_vars = ['Department']
    present_id_cols = [col for col in id_col_options if col in df_wide.columns]
    if present_id_cols:
        # Prefer the canonical 'EmployeeID' spelling when several candidates exist.
        chosen_id_col = 'EmployeeID' if 'EmployeeID' in present_id_cols else present_id_cols[0]
        id_vars.append(chosen_id_col)
        print(f"Using '{chosen_id_col}' as part of ID variables for melting.")
    else:
        # No ID column: synthesize a positional row id. Assigning a fresh column
        # is robust against named or non-default indexes, unlike the previous
        # reset_index().rename(columns={'index': ...}) approach, which produced
        # a differently named column (and a KeyError in melt) when the index
        # carried a name.
        df_wide = df_wide.copy()
        df_wide['_TempRowID'] = range(len(df_wide))
        id_vars.append('_TempRowID')
        print("No standard 'EmployeeID' found. Using temporary row ID for melting.")
    feedback_cols = [col for col in df_wide.columns if col.endswith('_Feedback_Stars')]
    rating_cols = [col for col in df_wide.columns if col.endswith('_Instructor_Rating')]
    if not feedback_cols and not rating_cols:
        print("Warning: No columns found ending with '_Feedback_Stars' or '_Instructor_Rating'. Transformation might be incorrect.")
        return pd.DataFrame(columns=id_vars + ['Subject', 'Feedback_Stars', 'Instructor_Rating'])
    df_feedback_long = pd.DataFrame(columns=id_vars + ['Subject', 'Feedback_Stars'])
    if feedback_cols:
        df_feedback_long = pd.melt(df_wide,
                                   id_vars=id_vars,
                                   value_vars=feedback_cols,
                                   var_name='Subject_Raw_FB',
                                   value_name='Feedback_Stars')
        # Strip the metric suffix to recover the subject name. regex=False makes
        # this a literal replacement regardless of the pandas version's default.
        df_feedback_long['Subject'] = df_feedback_long['Subject_Raw_FB'].str.replace('_Feedback_Stars', '', regex=False)
        df_feedback_long.drop(columns=['Subject_Raw_FB'], inplace=True)
    df_rating_long = pd.DataFrame(columns=id_vars + ['Subject', 'Instructor_Rating'])
    if rating_cols:
        df_rating_long = pd.melt(df_wide,
                                 id_vars=id_vars,
                                 value_vars=rating_cols,
                                 var_name='Subject_Raw_IR',
                                 value_name='Instructor_Rating')
        df_rating_long['Subject'] = df_rating_long['Subject_Raw_IR'].str.replace('_Instructor_Rating', '', regex=False)
        df_rating_long.drop(columns=['Subject_Raw_IR'], inplace=True)
    # Merge feedback and ratings; outer join keeps subjects present in only one metric.
    if not df_feedback_long.empty and not df_rating_long.empty:
        df_long = pd.merge(df_feedback_long, df_rating_long, on=id_vars + ['Subject'], how='outer')
    elif not df_feedback_long.empty:
        df_long = df_feedback_long
        df_long['Instructor_Rating'] = np.nan
    elif not df_rating_long.empty:
        df_long = df_rating_long
        df_long['Feedback_Stars'] = np.nan
    else:
        df_long = pd.DataFrame(columns=id_vars + ['Subject', 'Feedback_Stars', 'Instructor_Rating'])
    # The synthetic row id has served its purpose (keeping melt rows aligned).
    if '_TempRowID' in df_long.columns:
        df_long.drop(columns=['_TempRowID'], inplace=True)
    # Remove rows where BOTH Feedback_Stars and Instructor_Rating are NaN
    df_long.dropna(subset=['Feedback_Stars', 'Instructor_Rating'], how='all', inplace=True)
    # Ensure Subject and Department are strings
    df_long['Subject'] = df_long['Subject'].astype(str)
    df_long['Department'] = df_long['Department'].astype(str)
    print(f"Data transformed to long format. Shape: {df_long.shape}")
    return df_long
def preprocess_data(df_wide: pd.DataFrame) -> pd.DataFrame:
    """
    Transforms wide data to long, cleans it, and ensures appropriate types.
    """
    print("Starting preprocessing...")
    df_long = transform_wide_to_long(df_wide)
    if df_long.empty:
        print("Warning: Data transformation resulted in an empty DataFrame.")
        return df_long
    # Nullable Int64 keeps missing ratings as <NA> rather than forcing floats.
    for rating_col in ('Feedback_Stars', 'Instructor_Rating'):
        df_long[rating_col] = pd.to_numeric(df_long[rating_col], errors='coerce').astype('Int64')
    # Re-assert string dtype for the categorical identifiers
    # (transform_wide_to_long already does this; kept as a safeguard).
    for text_col in ('Subject', 'Department'):
        df_long[text_col] = df_long[text_col].astype(str)
    print("Data preprocessing (type conversion, NaN handling) complete.")
    return df_long
def get_feedback_distribution(df: pd.DataFrame) -> dict:
    """Calculates feedback distribution per subject, ensuring standard Python types.

    Returns a mapping {subject: {rating: count}} using plain Python str/int.
    Subjects with no non-null feedback are omitted. The caller's DataFrame
    is not modified.
    """
    if 'Feedback_Stars' not in df.columns or df['Feedback_Stars'].isnull().all():
        return {}
    distribution = {}
    # Use a local string-typed Series for grouping instead of writing the
    # conversion back into df (the previous version mutated the caller's frame).
    subjects = df['Subject'].astype(str)
    for subject in subjects.unique():
        # Drop NaNs before counting so missing ratings never appear as keys.
        counts_series = df.loc[subjects == subject, 'Feedback_Stars'].dropna().value_counts().sort_index()
        if not counts_series.empty:
            # Explicitly convert keys (ratings) and values (counts) to standard int
            distribution[subject] = {int(k): int(v) for k, v in counts_series.items()}
    # Although direct conversion is done, run through helper as a final safeguard for nested types
    return convert_numpy_types(distribution)
def get_instructor_rating_distribution(df: pd.DataFrame) -> dict:
    """Calculates instructor rating distribution per subject, ensuring standard Python types.

    Returns a mapping {subject: {rating: count}} using plain Python str/int.
    Subjects with no non-null ratings are omitted. The caller's DataFrame
    is not modified.
    """
    if 'Instructor_Rating' not in df.columns or df['Instructor_Rating'].isnull().all():
        return {}
    distribution = {}
    # Use a local string-typed Series for grouping instead of writing the
    # conversion back into df (the previous version mutated the caller's frame).
    subjects = df['Subject'].astype(str)
    for subject in subjects.unique():
        counts_series = df.loc[subjects == subject, 'Instructor_Rating'].dropna().value_counts().sort_index()
        if not counts_series.empty:
            distribution[subject] = {int(k): int(v) for k, v in counts_series.items()}
    # Final safeguard against stray numpy types in the nested structure.
    return convert_numpy_types(distribution)
def get_average_scores(df: pd.DataFrame) -> dict:
    """Calculates average scores (rounded) and counts, returning dict of DataFrames.

    Returns:
        Dict with keys 'avg_scores_subject', 'avg_scores_dept' and
        'avg_scores_subject_dept', each a DataFrame of means rounded to one
        decimal place (plus response counts for the first two).
    """
    # Work on a copy: the type coercion below must not mutate the caller's
    # DataFrame (the previous version wrote all four columns back in place).
    df = df.copy()
    df['Subject'] = df['Subject'].astype(str)
    df['Department'] = df['Department'].astype(str)
    # Ensure ratings are numeric for mean calculation (already done in preprocess_data, but good practice)
    df['Feedback_Stars'] = pd.to_numeric(df['Feedback_Stars'], errors='coerce')
    df['Instructor_Rating'] = pd.to_numeric(df['Instructor_Rating'], errors='coerce')
    # --- Aggregation and Rounding ---
    # Shared named-aggregation spec; .round(1) is a no-op on the integer counts.
    full_agg = dict(
        Average_Feedback_Stars=('Feedback_Stars', 'mean'),
        Average_Instructor_Rating=('Instructor_Rating', 'mean'),
        Total_Responses_Feedback=('Feedback_Stars', 'count'),
        Total_Responses_Instructor=('Instructor_Rating', 'count'),
    )
    avg_scores_subject = df.groupby('Subject').agg(**full_agg).round(1).reset_index()
    avg_scores_dept = df.groupby('Department').agg(**full_agg).round(1).reset_index()
    avg_scores_subject_dept = df.groupby(['Department', 'Subject']).agg(
        Average_Feedback_Stars=('Feedback_Stars', 'mean'),
        Average_Instructor_Rating=('Instructor_Rating', 'mean'),
    ).round(1).reset_index()
    # --- End Rounding ---
    # Normalize count columns to plain int for downstream serialization.
    for frame in (avg_scores_subject, avg_scores_dept):
        for col in ('Total_Responses_Feedback', 'Total_Responses_Instructor'):
            if col in frame.columns:
                # Use nullable Int64 if counts can be 0, then convert safely
                frame[col] = frame[col].astype('Int64').fillna(0).astype(int)
    # Means may remain float64; pandas display helpers (e.g. to_markdown) handle
    # that fine — the critical conversions are the dicts from the distribution functions.
    return {
        "avg_scores_subject": avg_scores_subject,
        "avg_scores_dept": avg_scores_dept,
        "avg_scores_subject_dept": avg_scores_subject_dept
    }