Spaces:
Paused
Paused
# src/preprocessor.py
import numpy as np
import pandas as pd
# Helper function to recursively convert numpy types to standard Python types
def convert_numpy_types(data):
    """Recursively convert numpy types in nested structures to standard Python types.

    Handles dicts (both keys and values), lists, tuples, ndarrays (via
    ``tolist``), and every numpy scalar (``np.generic`` covers integer, float,
    bool, str_, datetime64, ... — ``.item()`` yields the Python equivalent).
    Standard Python scalars pass through unchanged. Anything else falls back
    to its string representation with a warning.
    """
    if isinstance(data, dict):
        return {convert_numpy_types(k): convert_numpy_types(v) for k, v in data.items()}
    if isinstance(data, list):
        return [convert_numpy_types(i) for i in data]
    if isinstance(data, tuple):
        # Previously tuples hit the fallback and were stringified; preserve them.
        return tuple(convert_numpy_types(i) for i in data)
    if isinstance(data, np.ndarray):
        # tolist() converts elements to Python scalars, but recurse anyway in
        # case of object-dtype arrays holding numpy values.
        return convert_numpy_types(data.tolist())
    if isinstance(data, np.generic):
        # Single check replaces separate np.integer / np.floating / np.bool_
        # branches and also covers np.str_, np.datetime64, etc.
        return data.item()
    if isinstance(data, (str, int, float, bool, type(None))):
        return data
    # Fallback: return string representation if unsure
    print(f"Warning: Type {type(data)} not explicitly handled in convert_numpy_types. Converting to string.")
    return str(data)
def _melt_measure(df_wide: pd.DataFrame, id_vars: list, value_cols: list,
                  suffix: str, value_name: str) -> pd.DataFrame:
    """Melt one family of '<Subject><suffix>' columns into long form.

    Returns a DataFrame with columns id_vars + ['Subject', value_name];
    when value_cols is empty, returns an empty DataFrame with those columns.
    """
    if not value_cols:
        return pd.DataFrame(columns=id_vars + ['Subject', value_name])
    melted = pd.melt(df_wide,
                     id_vars=id_vars,
                     value_vars=value_cols,
                     var_name='_Subject_Raw',
                     value_name=value_name)
    # Strip only the *trailing* suffix. str.replace() would also remove
    # interior occurrences (e.g. 'A_Feedback_Stars_B_Feedback_Stars').
    melted['Subject'] = melted['_Subject_Raw'].str[:-len(suffix)]
    melted.drop(columns=['_Subject_Raw'], inplace=True)
    return melted


def transform_wide_to_long(df_wide: pd.DataFrame) -> pd.DataFrame:
    """
    Transforms a wide DataFrame (subjects as columns) to a long DataFrame
    (Subject, Feedback_Stars, Instructor_Rating columns).

    Assumes subject columns are named like 'SubjectName_Feedback_Stars' and
    'SubjectName_Instructor_Rating'. A 'Department' column is expected; if it
    is missing, a placeholder 'Unknown' department is added instead of raising
    a KeyError during the melt. The input DataFrame is not modified.
    """
    # Work on a copy so reset_index()/column additions never leak to the caller.
    df_wide = df_wide.copy()
    if 'Department' not in df_wide.columns:
        print("Warning: No 'Department' column found. Using placeholder 'Unknown'.")
        df_wide['Department'] = 'Unknown'

    id_col_options = ['EmployeeID', 'Employee ID', 'ID']  # Common names for an ID column
    id_vars = ['Department']
    present_id_cols = [col for col in id_col_options if col in df_wide.columns]
    if present_id_cols:
        # Prefer the canonical 'EmployeeID' spelling when several variants exist.
        chosen_id_col = 'EmployeeID' if 'EmployeeID' in present_id_cols else present_id_cols[0]
        id_vars.append(chosen_id_col)
        print(f"Using '{chosen_id_col}' as part of ID variables for melting.")
    else:
        # No natural key: synthesize a row id so melt rows can be re-merged.
        df_wide = df_wide.reset_index().rename(columns={'index': '_TempRowID'})
        id_vars.append('_TempRowID')
        print("No standard 'EmployeeID' found. Using temporary row ID for melting.")

    feedback_cols = [col for col in df_wide.columns if col.endswith('_Feedback_Stars')]
    rating_cols = [col for col in df_wide.columns if col.endswith('_Instructor_Rating')]
    if not feedback_cols and not rating_cols:
        print("Warning: No columns found ending with '_Feedback_Stars' or '_Instructor_Rating'. Transformation might be incorrect.")
        return pd.DataFrame(columns=id_vars + ['Subject', 'Feedback_Stars', 'Instructor_Rating'])

    df_feedback_long = _melt_measure(df_wide, id_vars, feedback_cols,
                                     '_Feedback_Stars', 'Feedback_Stars')
    df_rating_long = _melt_measure(df_wide, id_vars, rating_cols,
                                   '_Instructor_Rating', 'Instructor_Rating')

    # Merge feedback and ratings; pad the missing measure with NaN when only
    # one family of columns was present.
    if not df_feedback_long.empty and not df_rating_long.empty:
        df_long = pd.merge(df_feedback_long, df_rating_long,
                           on=id_vars + ['Subject'], how='outer')
    elif not df_feedback_long.empty:
        df_long = df_feedback_long
        df_long['Instructor_Rating'] = np.nan
    elif not df_rating_long.empty:
        df_long = df_rating_long
        df_long['Feedback_Stars'] = np.nan
    else:
        df_long = pd.DataFrame(columns=id_vars + ['Subject', 'Feedback_Stars', 'Instructor_Rating'])

    # The synthetic row id was only needed to key the melt/merge.
    if '_TempRowID' in df_long.columns:
        df_long.drop(columns=['_TempRowID'], inplace=True)
    # Remove rows where BOTH Feedback_Stars and Instructor_Rating are NaN
    df_long.dropna(subset=['Feedback_Stars', 'Instructor_Rating'], how='all', inplace=True)
    # Ensure Subject and Department are strings
    df_long['Subject'] = df_long['Subject'].astype(str)
    df_long['Department'] = df_long['Department'].astype(str)
    print(f"Data transformed to long format. Shape: {df_long.shape}")
    return df_long
def preprocess_data(df_wide: pd.DataFrame) -> pd.DataFrame:
    """
    Transforms wide data to long, cleans it, and ensures appropriate types.

    Rating columns become nullable Int64 (invalid entries coerced to <NA>);
    Subject/Department become strings. Returns an empty DataFrame unchanged
    if the wide-to-long transformation produced no rows.
    """
    print("Starting preprocessing...")
    df_long = transform_wide_to_long(df_wide)
    if df_long.empty:
        print("Warning: Data transformation resulted in an empty DataFrame.")
        return df_long
    # to_numeric coerces garbage to NaN. round() before the cast is required:
    # astype('Int64') raises TypeError on non-integral floats (e.g. a "4.5"
    # feedback entry), it does not truncate.
    for col in ('Feedback_Stars', 'Instructor_Rating'):
        df_long[col] = pd.to_numeric(df_long[col], errors='coerce').round().astype('Int64')
    # Subject and Department should already be strings from transform_wide_to_long
    df_long['Subject'] = df_long['Subject'].astype(str)
    df_long['Department'] = df_long['Department'].astype(str)
    # Optional: Validate range (1-5) if needed, though clipping/rounding happens later
    print("Data preprocessing (type conversion, NaN handling) complete.")
    return df_long
def get_feedback_distribution(df: pd.DataFrame) -> dict:
    """Calculates feedback distribution per subject, ensuring standard Python types.

    Returns {subject: {star_value: count}}; subjects with no non-null feedback
    are omitted. Returns {} when the column is missing or entirely null.
    The input DataFrame is not modified.
    """
    if 'Feedback_Stars' not in df.columns or df['Feedback_Stars'].isnull().all():
        return {}
    distribution = {}
    # Use a local string Series for grouping so the caller's DataFrame is not
    # mutated (the previous version reassigned df['Subject'] in place).
    subjects = df['Subject'].astype(str)
    for subject in subjects.unique():
        # Use dropna() before value_counts
        counts_series = df.loc[subjects == subject, 'Feedback_Stars'].dropna().value_counts().sort_index()
        if not counts_series.empty:
            # Explicitly convert keys (ratings) and values (counts) to standard int
            distribution[subject] = {int(k): int(v) for k, v in counts_series.items()}
    # Although direct conversion is done, run through helper as a final safeguard for nested types
    return convert_numpy_types(distribution)
def get_instructor_rating_distribution(df: pd.DataFrame) -> dict:
    """Calculates instructor rating distribution per subject, ensuring standard Python types.

    Returns {subject: {rating_value: count}}; subjects with no non-null rating
    are omitted. Returns {} when the column is missing or entirely null.
    The input DataFrame is not modified.
    """
    if 'Instructor_Rating' not in df.columns or df['Instructor_Rating'].isnull().all():
        return {}
    distribution = {}
    # Use a local string Series for grouping so the caller's DataFrame is not
    # mutated (the previous version reassigned df['Subject'] in place).
    subjects = df['Subject'].astype(str)
    for subject in subjects.unique():
        counts_series = df.loc[subjects == subject, 'Instructor_Rating'].dropna().value_counts().sort_index()
        if not counts_series.empty:
            # Explicitly convert keys (ratings) and values (counts) to standard int
            distribution[subject] = {int(k): int(v) for k, v in counts_series.items()}
    # Final safeguard for any remaining numpy scalars in nested structures
    return convert_numpy_types(distribution)
def get_average_scores(df: pd.DataFrame) -> dict:
    """Calculates average scores (rounded) and counts, returning dict of DataFrames.

    Returns keys 'avg_scores_subject', 'avg_scores_dept' and
    'avg_scores_subject_dept'. Averages are rounded to 1 decimal place;
    count columns are plain Python int. The input DataFrame is not modified.
    """
    # Copy first: the type coercions below previously mutated the caller's frame.
    df = df.copy()
    df['Subject'] = df['Subject'].astype(str)
    df['Department'] = df['Department'].astype(str)
    # Ensure ratings are numeric for mean calculation (already done in
    # preprocess_data, but cheap insurance against raw input).
    df['Feedback_Stars'] = pd.to_numeric(df['Feedback_Stars'], errors='coerce')
    df['Instructor_Rating'] = pd.to_numeric(df['Instructor_Rating'], errors='coerce')

    # Shared aggregation spec: count() skips NaN, so totals reflect responses only.
    full_agg = dict(
        Average_Feedback_Stars=('Feedback_Stars', 'mean'),
        Average_Instructor_Rating=('Instructor_Rating', 'mean'),
        Total_Responses_Feedback=('Feedback_Stars', 'count'),
        Total_Responses_Instructor=('Instructor_Rating', 'count'),
    )
    # Round averages to 1 decimal place
    avg_scores_subject = df.groupby('Subject').agg(**full_agg).round(1).reset_index()
    avg_scores_dept = df.groupby('Department').agg(**full_agg).round(1).reset_index()
    avg_scores_subject_dept = df.groupby(['Department', 'Subject']).agg(
        Average_Feedback_Stars=('Feedback_Stars', 'mean'),
        Average_Instructor_Rating=('Instructor_Rating', 'mean'),
    ).round(1).reset_index()

    # Normalize count columns to standard int (via nullable Int64 so a
    # hypothetical NaN count becomes 0 instead of raising).
    for frame in (avg_scores_subject, avg_scores_dept):
        for col in ('Total_Responses_Feedback', 'Total_Responses_Instructor'):
            if col in frame.columns:
                frame[col] = frame[col].astype('Int64').fillna(0).astype(int)

    # Means may remain float64; pandas renderers (e.g. to_markdown) handle that.
    return {
        "avg_scores_subject": avg_scores_subject,
        "avg_scores_dept": avg_scores_dept,
        "avg_scores_subject_dept": avg_scores_subject_dept
    }