File size: 10,846 Bytes
62a4c11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
# src/preprocessor.py
import pandas as pd
import numpy as np

# Helper function to recursively convert numpy types to standard Python types
def convert_numpy_types(data):
    """Recursively convert numpy types in nested data to standard Python types.

    Handles dicts (both keys and values), lists, tuples, numpy integer/float/
    bool scalars, and numpy arrays. Standard Python scalars pass through
    unchanged. Anything unrecognized is stringified (with a printed warning)
    so downstream serialization never chokes on an exotic type.
    """
    if isinstance(data, dict):
        return {convert_numpy_types(k): convert_numpy_types(v) for k, v in data.items()}
    elif isinstance(data, (list, tuple)):
        # BUGFIX: tuples previously fell through to the stringify fallback,
        # destroying their contents. Convert them like lists (result is a
        # list, which is the JSON-friendly shape callers expect).
        return [convert_numpy_types(i) for i in data]
    elif isinstance(data, np.integer):  # Catches numpy ints (int32, int64 etc.)
        return int(data)
    elif isinstance(data, np.floating):  # Catches numpy floats
        return float(data)
    elif isinstance(data, np.bool_):  # Catches numpy bools
        return bool(data)
    elif isinstance(data, np.ndarray):  # Handle arrays if they appear unexpectedly
        return convert_numpy_types(data.tolist())
    elif isinstance(data, (str, int, float, bool, type(None))):  # Keep standard types
        return data
    else:
        # Generic numpy scalars (e.g. datetime64) expose .item(); try that first.
        try:
            if hasattr(data, 'item'):
                return data.item()
        except Exception:
            pass
        # Fallback: return string representation if unsure
        print(f"Warning: Type {type(data)} not explicitly handled in convert_numpy_types. Converting to string.")
        return str(data)


def transform_wide_to_long(df_wide: pd.DataFrame) -> pd.DataFrame:
    """
    Transforms a wide DataFrame (subjects as columns) to a long DataFrame
    (Subject, Feedback_Stars, Instructor_Rating columns).

    Assumes subject columns are named like 'SubjectName_Feedback_Stars' and
    'SubjectName_Instructor_Rating', and that a 'Department' column exists
    (it is used as an id variable for melting).

    Returns a DataFrame with columns [Department, <ID column if found>,
    Subject, Feedback_Stars, Instructor_Rating]; rows where both metrics
    are NaN are dropped.
    """
    id_col_options = ['EmployeeID', 'Employee ID', 'ID']  # Common names for an ID column
    id_vars = ['Department']

    present_id_cols = [col for col in id_col_options if col in df_wide.columns]
    if present_id_cols:
        # Prefer the canonical 'EmployeeID' spelling when several variants exist.
        chosen_id_col = 'EmployeeID' if 'EmployeeID' in present_id_cols else present_id_cols[0]
        id_vars.append(chosen_id_col)
        print(f"Using '{chosen_id_col}' as part of ID variables for melting.")
    else:
        # No ID column: synthesize a row ID so melt/merge can align rows.
        # It is internal only and is dropped before returning.
        df_wide = df_wide.reset_index().rename(columns={'index': '_TempRowID'})
        id_vars.append('_TempRowID')
        print("No standard 'EmployeeID' found. Using temporary row ID for melting.")

    feedback_cols = [col for col in df_wide.columns if col.endswith('_Feedback_Stars')]
    rating_cols = [col for col in df_wide.columns if col.endswith('_Instructor_Rating')]

    if not feedback_cols and not rating_cols:
        print("Warning: No columns found ending with '_Feedback_Stars' or '_Instructor_Rating'. Transformation might be incorrect.")
        # BUGFIX: do not leak the internal '_TempRowID' column to callers.
        public_ids = [c for c in id_vars if c != '_TempRowID']
        return pd.DataFrame(columns=public_ids + ['Subject', 'Feedback_Stars', 'Instructor_Rating'])

    df_feedback_long = pd.DataFrame(columns=id_vars + ['Subject', 'Feedback_Stars'])
    if feedback_cols:
        df_feedback_long = pd.melt(df_wide,
                                   id_vars=id_vars,
                                   value_vars=feedback_cols,
                                   var_name='Subject_Raw_FB',
                                   value_name='Feedback_Stars')
        # regex=False: the suffix is a literal string, not a pattern (the
        # default regex behavior differs across pandas versions).
        df_feedback_long['Subject'] = df_feedback_long['Subject_Raw_FB'].str.replace('_Feedback_Stars', '', regex=False)
        df_feedback_long.drop(columns=['Subject_Raw_FB'], inplace=True)

    df_rating_long = pd.DataFrame(columns=id_vars + ['Subject', 'Instructor_Rating'])
    if rating_cols:
        df_rating_long = pd.melt(df_wide,
                                 id_vars=id_vars,
                                 value_vars=rating_cols,
                                 var_name='Subject_Raw_IR',
                                 value_name='Instructor_Rating')
        df_rating_long['Subject'] = df_rating_long['Subject_Raw_IR'].str.replace('_Instructor_Rating', '', regex=False)
        df_rating_long.drop(columns=['Subject_Raw_IR'], inplace=True)

    # Merge feedback and ratings; outer join keeps subjects present in only one set.
    if not df_feedback_long.empty and not df_rating_long.empty:
        df_long = pd.merge(df_feedback_long, df_rating_long, on=id_vars + ['Subject'], how='outer')
    elif not df_feedback_long.empty:
        df_long = df_feedback_long
        df_long['Instructor_Rating'] = np.nan
    elif not df_rating_long.empty:
        df_long = df_rating_long
        df_long['Feedback_Stars'] = np.nan
    else:
        df_long = pd.DataFrame(columns=id_vars + ['Subject', 'Feedback_Stars', 'Instructor_Rating'])

    if '_TempRowID' in df_long.columns:
        df_long.drop(columns=['_TempRowID'], inplace=True)

    # Remove rows where BOTH Feedback_Stars and Instructor_Rating are NaN
    df_long.dropna(subset=['Feedback_Stars', 'Instructor_Rating'], how='all', inplace=True)

    # Ensure Subject and Department are strings
    df_long['Subject'] = df_long['Subject'].astype(str)
    df_long['Department'] = df_long['Department'].astype(str)

    print(f"Data transformed to long format. Shape: {df_long.shape}")
    return df_long


def preprocess_data(df_wide: pd.DataFrame) -> pd.DataFrame:
    """
    Transforms wide data to long, cleans it, and ensures appropriate types.

    Rating columns are coerced to numeric (invalid entries become NaN),
    rounded to the nearest integer, and stored as nullable Int64 so missing
    values survive the integer conversion.
    """
    print("Starting preprocessing...")
    df_long = transform_wide_to_long(df_wide)

    if df_long.empty:
        print("Warning: Data transformation resulted in an empty DataFrame.")
        return df_long

    # Convert rating columns to numeric, coercing errors; use nullable Int64.
    # BUGFIX: round before the Int64 cast — astype('Int64') raises
    # "cannot safely cast" on fractional values (e.g. a "4.5" entry).
    for col in ('Feedback_Stars', 'Instructor_Rating'):
        df_long[col] = pd.to_numeric(df_long[col], errors='coerce').round().astype('Int64')

    # Subject and Department should already be strings from transform_wide_to_long
    df_long['Subject'] = df_long['Subject'].astype(str)
    df_long['Department'] = df_long['Department'].astype(str)

    # Optional: Validate range (1-5) if needed, though clipping/rounding happens later
    # df_long['Feedback_Stars'] = df_long['Feedback_Stars'].apply(lambda x: x if pd.isna(x) or 1 <= x <= 5 else pd.NA)
    # df_long['Instructor_Rating'] = df_long['Instructor_Rating'].apply(lambda x: x if pd.isna(x) or 1 <= x <= 5 else pd.NA)

    print("Data preprocessing (type conversion, NaN handling) complete.")
    return df_long


def get_feedback_distribution(df: pd.DataFrame) -> dict:
    """Calculates feedback distribution per subject, ensuring standard Python types.

    Returns ``{subject: {rating: count}}`` with plain ``str`` subject keys and
    plain ``int`` rating keys/count values (NaN ratings are excluded).
    """
    if 'Feedback_Stars' not in df.columns or df['Feedback_Stars'].isnull().all():
        return {}
    distribution = {}
    # BUGFIX: do not mutate the caller's DataFrame — group by a local
    # string-typed series instead of overwriting df['Subject'].
    subject_keys = df['Subject'].astype(str)

    for subject, stars in df.groupby(subject_keys)['Feedback_Stars']:
        # Use dropna() before value_counts
        counts_series = stars.dropna().value_counts().sort_index()
        if not counts_series.empty:
            # Explicitly convert keys (ratings) and values (counts) to standard int;
            # subject keys are already plain str, so no further conversion is needed.
            distribution[subject] = {int(k): int(v) for k, v in counts_series.items()}

    return distribution

def get_instructor_rating_distribution(df: pd.DataFrame) -> dict:
    """Calculates instructor rating distribution per subject, ensuring standard Python types.

    Returns ``{subject: {rating: count}}`` with plain ``str`` subject keys and
    plain ``int`` rating keys/count values (NaN ratings are excluded).
    """
    if 'Instructor_Rating' not in df.columns or df['Instructor_Rating'].isnull().all():
        return {}
    distribution = {}
    # BUGFIX: do not mutate the caller's DataFrame — group by a local
    # string-typed series instead of overwriting df['Subject'].
    subject_keys = df['Subject'].astype(str)

    for subject, ratings in df.groupby(subject_keys)['Instructor_Rating']:
        counts_series = ratings.dropna().value_counts().sort_index()
        if not counts_series.empty:
            # Subject keys are already plain str; ratings/counts converted to int.
            distribution[subject] = {int(k): int(v) for k, v in counts_series.items()}

    return distribution

def get_average_scores(df: pd.DataFrame) -> dict:
    """Calculates average scores (rounded) and counts, returning dict of DataFrames.

    Returns three DataFrames keyed as 'avg_scores_subject', 'avg_scores_dept',
    and 'avg_scores_subject_dept'. Averages are rounded to 1 decimal place;
    response counts are standard ints.
    """
    # BUGFIX: operate on a copy so the type coercions below do not mutate
    # the caller's DataFrame.
    df = df.copy()
    df['Subject'] = df['Subject'].astype(str)
    df['Department'] = df['Department'].astype(str)
    # Ensure ratings are numeric for mean calculation (already done in
    # preprocess_data, but good practice for direct callers).
    df['Feedback_Stars'] = pd.to_numeric(df['Feedback_Stars'], errors='coerce')
    df['Instructor_Rating'] = pd.to_numeric(df['Instructor_Rating'], errors='coerce')

    # Shared aggregation spec keeps the per-subject and per-department
    # groupbys consistent.
    agg_spec = dict(
        Average_Feedback_Stars=('Feedback_Stars', 'mean'),
        Average_Instructor_Rating=('Instructor_Rating', 'mean'),
        Total_Responses_Feedback=('Feedback_Stars', 'count'),
        Total_Responses_Instructor=('Instructor_Rating', 'count'),
    )
    # Round averages to 1 decimal place (rounding int counts is a no-op).
    avg_scores_subject = df.groupby('Subject').agg(**agg_spec).round(1).reset_index()
    avg_scores_dept = df.groupby('Department').agg(**agg_spec).round(1).reset_index()
    avg_scores_subject_dept = df.groupby(['Department', 'Subject']).agg(
        Average_Feedback_Stars=('Feedback_Stars', 'mean'),
        Average_Instructor_Rating=('Instructor_Rating', 'mean')
    ).round(1).reset_index()

    # groupby 'count' never yields NaN, but convert explicitly to standard int
    # for consistency with downstream consumers.
    for col in ('Total_Responses_Feedback', 'Total_Responses_Instructor'):
        if col in avg_scores_subject.columns:
            avg_scores_subject[col] = avg_scores_subject[col].astype(int)
        if col in avg_scores_dept.columns:
            avg_scores_dept[col] = avg_scores_dept[col].astype(int)

    # Means may remain float64; pandas rendering (e.g. to_markdown) handles
    # those fine — the dict outputs from the distribution helpers were the
    # conversion-sensitive part.
    return {
        "avg_scores_subject": avg_scores_subject,
        "avg_scores_dept": avg_scores_dept,
        "avg_scores_subject_dept": avg_scores_subject_dept
    }