# src/preprocessor.py
import pandas as pd
import numpy as np


def convert_numpy_types(data):
    """Recursively convert numpy types in nested data structures to standard Python types.

    Handles dicts, lists, tuples, numpy scalars (integer/floating/bool) and
    numpy arrays. Standard Python scalars pass through unchanged. Anything
    else is converted via ``.item()`` when available, otherwise stringified
    with a warning.
    """
    if isinstance(data, dict):
        return {convert_numpy_types(k): convert_numpy_types(v) for k, v in data.items()}
    elif isinstance(data, list):
        return [convert_numpy_types(i) for i in data]
    elif isinstance(data, tuple):
        # Fix: tuples previously fell through to the str() fallback below.
        return tuple(convert_numpy_types(i) for i in data)
    elif isinstance(data, np.integer):  # Catches numpy ints (int32, int64 etc.)
        return int(data)
    elif isinstance(data, np.floating):  # Catches numpy floats
        return float(data)
    elif isinstance(data, np.bool_):  # Catches numpy bools
        return bool(data)
    elif isinstance(data, np.ndarray):  # Handle arrays if they appear unexpectedly
        return convert_numpy_types(data.tolist())
    elif isinstance(data, (str, int, float, bool, type(None))):  # Keep standard types
        return data
    else:
        # Attempt conversion for other numpy types (e.g. np.str_) via the
        # generic .item() accessor; fall back to a string representation.
        try:
            if hasattr(data, 'item'):  # Generic item() method for numpy scalars
                return data.item()
        except Exception:
            pass
        print(f"Warning: Type {type(data)} not explicitly handled in convert_numpy_types. Converting to string.")
        return str(data)


def transform_wide_to_long(df_wide: pd.DataFrame) -> pd.DataFrame:
    """
    Transforms a wide DataFrame (subjects as columns) to a long DataFrame
    (Subject, Feedback_Stars, Instructor_Rating columns).

    Assumes subject columns are named like 'SubjectName_Feedback_Stars' and
    'SubjectName_Instructor_Rating'.

    NOTE(review): a 'Department' column is assumed to exist in df_wide
    (melt raises KeyError otherwise) -- confirm upstream guarantees this.
    """
    id_col_options = ['EmployeeID', 'Employee ID', 'ID']  # Common names for an ID column
    id_vars = ['Department']
    present_id_cols = [col for col in id_col_options if col in df_wide.columns]
    if present_id_cols:
        # Prefer 'EmployeeID' when several candidate ID columns exist.
        chosen_id_col = 'EmployeeID' if 'EmployeeID' in present_id_cols else present_id_cols[0]
        id_vars.append(chosen_id_col)
        print(f"Using '{chosen_id_col}' as part of ID variables for melting.")
    else:
        # No recognizable ID column: synthesize one from the row index so the
        # two melted frames can be merged back together row-by-row.
        df_wide = df_wide.reset_index().rename(columns={'index': '_TempRowID'})
        id_vars.append('_TempRowID')
        print("No standard 'EmployeeID' found. Using temporary row ID for melting.")

    feedback_cols = [col for col in df_wide.columns if col.endswith('_Feedback_Stars')]
    rating_cols = [col for col in df_wide.columns if col.endswith('_Instructor_Rating')]

    if not feedback_cols and not rating_cols:
        print("Warning: No columns found ending with '_Feedback_Stars' or '_Instructor_Rating'. Transformation might be incorrect.")
        return pd.DataFrame(columns=id_vars + ['Subject', 'Feedback_Stars', 'Instructor_Rating'])

    df_feedback_long = pd.DataFrame(columns=id_vars + ['Subject', 'Feedback_Stars'])
    if feedback_cols:
        df_feedback_long = pd.melt(df_wide, id_vars=id_vars, value_vars=feedback_cols,
                                   var_name='Subject_Raw_FB', value_name='Feedback_Stars')
        # regex=False: strip the suffix as a literal string (default for the
        # `regex` parameter changed across pandas versions).
        df_feedback_long['Subject'] = df_feedback_long['Subject_Raw_FB'].str.replace('_Feedback_Stars', '', regex=False)
        df_feedback_long.drop(columns=['Subject_Raw_FB'], inplace=True)

    df_rating_long = pd.DataFrame(columns=id_vars + ['Subject', 'Instructor_Rating'])
    if rating_cols:
        df_rating_long = pd.melt(df_wide, id_vars=id_vars, value_vars=rating_cols,
                                 var_name='Subject_Raw_IR', value_name='Instructor_Rating')
        df_rating_long['Subject'] = df_rating_long['Subject_Raw_IR'].str.replace('_Instructor_Rating', '', regex=False)
        df_rating_long.drop(columns=['Subject_Raw_IR'], inplace=True)

    # Merge feedback and ratings on the ID columns plus Subject; when only one
    # side exists, fill the missing metric column with NaN.
    if not df_feedback_long.empty and not df_rating_long.empty:
        df_long = pd.merge(df_feedback_long, df_rating_long, on=id_vars + ['Subject'], how='outer')
    elif not df_feedback_long.empty:
        df_long = df_feedback_long
        df_long['Instructor_Rating'] = np.nan
    elif not df_rating_long.empty:
        df_long = df_rating_long
        df_long['Feedback_Stars'] = np.nan
    else:
        df_long = pd.DataFrame(columns=id_vars + ['Subject', 'Feedback_Stars', 'Instructor_Rating'])

    # The synthetic row ID (if any) has served its purpose as a merge key.
    if '_TempRowID' in df_long.columns:
        df_long.drop(columns=['_TempRowID'], inplace=True)

    # Remove rows where BOTH Feedback_Stars and Instructor_Rating are NaN.
    df_long.dropna(subset=['Feedback_Stars', 'Instructor_Rating'], how='all', inplace=True)

    # Ensure Subject and Department are strings.
    df_long['Subject'] = df_long['Subject'].astype(str)
    df_long['Department'] = df_long['Department'].astype(str)

    print(f"Data transformed to long format. Shape: {df_long.shape}")
    return df_long


def preprocess_data(df_wide: pd.DataFrame) -> pd.DataFrame:
    """
    Transforms wide data to long, cleans it, and ensures appropriate types.

    Returns a long-format DataFrame whose rating columns use the nullable
    'Int64' dtype (missing ratings become <NA>).
    """
    print("Starting preprocessing...")
    df_long = transform_wide_to_long(df_wide)
    if df_long.empty:
        print("Warning: Data transformation resulted in an empty DataFrame.")
        return df_long

    # Convert rating columns to numeric, coercing errors; use nullable Int64.
    # NOTE(review): astype('Int64') raises on non-integral floats -- this
    # assumes ratings are whole numbers (1-5); confirm with the data source.
    df_long['Feedback_Stars'] = pd.to_numeric(df_long['Feedback_Stars'], errors='coerce').astype('Int64')
    df_long['Instructor_Rating'] = pd.to_numeric(df_long['Instructor_Rating'], errors='coerce').astype('Int64')

    # Subject and Department should already be strings from transform_wide_to_long.
    df_long['Subject'] = df_long['Subject'].astype(str)
    df_long['Department'] = df_long['Department'].astype(str)

    print("Data preprocessing (type conversion, NaN handling) complete.")
    return df_long


def get_feedback_distribution(df: pd.DataFrame) -> dict:
    """Calculates feedback distribution per subject, ensuring standard Python types.

    Returns {subject: {stars: count}} with plain str/int keys and values.
    Does not modify the input DataFrame.
    """
    if 'Feedback_Stars' not in df.columns or df['Feedback_Stars'].isnull().all():
        return {}

    distribution = {}
    # Build a string view of Subject for grouping WITHOUT mutating the
    # caller's DataFrame (the previous version assigned back into df).
    subjects = df['Subject'].astype(str)
    for subject in subjects.unique():
        # dropna() before value_counts so missing ratings are excluded.
        counts_series = df.loc[subjects == subject, 'Feedback_Stars'].dropna().value_counts().sort_index()
        if not counts_series.empty:
            # Explicitly convert keys (ratings) and values (counts) to standard int.
            distribution[subject] = {int(k): int(v) for k, v in counts_series.items()}

    # Although direct conversion is done, run through helper as a final safeguard.
    return convert_numpy_types(distribution)


def get_instructor_rating_distribution(df: pd.DataFrame) -> dict:
    """Calculates instructor rating distribution per subject, ensuring standard Python types.

    Returns {subject: {rating: count}} with plain str/int keys and values.
    Does not modify the input DataFrame.
    """
    if 'Instructor_Rating' not in df.columns or df['Instructor_Rating'].isnull().all():
        return {}

    distribution = {}
    # String view of Subject for grouping; no in-place mutation of df.
    subjects = df['Subject'].astype(str)
    for subject in subjects.unique():
        counts_series = df.loc[subjects == subject, 'Instructor_Rating'].dropna().value_counts().sort_index()
        if not counts_series.empty:
            distribution[subject] = {int(k): int(v) for k, v in counts_series.items()}

    return convert_numpy_types(distribution)


def get_average_scores(df: pd.DataFrame) -> dict:
    """Calculates average scores (rounded to 1 decimal) and response counts.

    Returns a dict of three DataFrames keyed by 'avg_scores_subject',
    'avg_scores_dept' and 'avg_scores_subject_dept'.
    Does not modify the input DataFrame.
    """
    # Work on a copy so the caller's column dtypes are not changed in place
    # (the previous version coerced the input's columns as a side effect).
    df = df.copy()
    df['Subject'] = df['Subject'].astype(str)
    df['Department'] = df['Department'].astype(str)
    # Ensure ratings are numeric for mean calculation.
    df['Feedback_Stars'] = pd.to_numeric(df['Feedback_Stars'], errors='coerce')
    df['Instructor_Rating'] = pd.to_numeric(df['Instructor_Rating'], errors='coerce')

    # --- Aggregation and rounding (averages rounded to 1 decimal place) ---
    avg_scores_subject = df.groupby('Subject').agg(
        Average_Feedback_Stars=('Feedback_Stars', 'mean'),
        Average_Instructor_Rating=('Instructor_Rating', 'mean'),
        Total_Responses_Feedback=('Feedback_Stars', 'count'),
        Total_Responses_Instructor=('Instructor_Rating', 'count')
    ).round(1).reset_index()

    avg_scores_dept = df.groupby('Department').agg(
        Average_Feedback_Stars=('Feedback_Stars', 'mean'),
        Average_Instructor_Rating=('Instructor_Rating', 'mean'),
        Total_Responses_Feedback=('Feedback_Stars', 'count'),
        Total_Responses_Instructor=('Instructor_Rating', 'count')
    ).round(1).reset_index()

    avg_scores_subject_dept = df.groupby(['Department', 'Subject']).agg(
        Average_Feedback_Stars=('Feedback_Stars', 'mean'),
        Average_Instructor_Rating=('Instructor_Rating', 'mean')
    ).round(1).reset_index()

    # Convert count columns explicitly to standard int for downstream
    # consumers that expect plain Python ints.
    for col in ['Total_Responses_Feedback', 'Total_Responses_Instructor']:
        if col in avg_scores_subject.columns:
            avg_scores_subject[col] = avg_scores_subject[col].astype('Int64').fillna(0).astype(int)
        if col in avg_scores_dept.columns:
            avg_scores_dept[col] = avg_scores_dept[col].astype('Int64').fillna(0).astype(int)

    # Mean columns may remain float64; pandas renderers (e.g. to_markdown)
    # handle those correctly, so no further conversion is needed here.
    return {
        "avg_scores_subject": avg_scores_subject,
        "avg_scores_dept": avg_scores_dept,
        "avg_scores_subject_dept": avg_scores_subject_dept
    }