File size: 10,846 Bytes
62a4c11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
# src/preprocessor.py
import pandas as pd
import numpy as np

# Helper function to recursively convert numpy types to standard Python types
def convert_numpy_types(data):
    """Recursively convert numpy types in nested data to standard Python types.

    Handles dicts (both keys and values), lists, tuples, numpy integer/float/
    bool scalars, and numpy arrays. Standard Python scalars pass through
    unchanged. Anything unrecognized is stringified (with a printed warning)
    so downstream serialization never chokes on an exotic type.
    """
    if isinstance(data, dict):
        return {convert_numpy_types(k): convert_numpy_types(v) for k, v in data.items()}
    elif isinstance(data, (list, tuple)):
        # BUGFIX: tuples previously fell through to the stringify fallback,
        # destroying their contents. Convert them like lists (result is a
        # list, which is the JSON-friendly shape callers expect).
        return [convert_numpy_types(i) for i in data]
    elif isinstance(data, np.integer):  # Catches numpy ints (int32, int64 etc.)
        return int(data)
    elif isinstance(data, np.floating):  # Catches numpy floats
        return float(data)
    elif isinstance(data, np.bool_):  # Catches numpy bools
        return bool(data)
    elif isinstance(data, np.ndarray):  # Handle arrays if they appear unexpectedly
        return convert_numpy_types(data.tolist())
    elif isinstance(data, (str, int, float, bool, type(None))):  # Keep standard types
        return data
    else:
        # Generic numpy scalars (e.g. datetime64) expose .item(); try that first.
        try:
            if hasattr(data, 'item'):
                return data.item()
        except Exception:
            pass
        # Fallback: return string representation if unsure
        print(f"Warning: Type {type(data)} not explicitly handled in convert_numpy_types. Converting to string.")
        return str(data)


def transform_wide_to_long(df_wide: pd.DataFrame) -> pd.DataFrame:
    """
    Transforms a wide DataFrame (subjects as columns) to a long DataFrame
    (Subject, Feedback_Stars, Instructor_Rating columns).

    Assumes subject columns are named like 'SubjectName_Feedback_Stars' and
    'SubjectName_Instructor_Rating', and that a 'Department' column exists
    (it is used as an id variable for melting).

    Returns a DataFrame with columns [Department, <ID column if found>,
    Subject, Feedback_Stars, Instructor_Rating]; rows where both metrics
    are NaN are dropped.
    """
    id_col_options = ['EmployeeID', 'Employee ID', 'ID']  # Common names for an ID column
    id_vars = ['Department']

    present_id_cols = [col for col in id_col_options if col in df_wide.columns]
    if present_id_cols:
        # Prefer the canonical 'EmployeeID' spelling when several variants exist.
        chosen_id_col = 'EmployeeID' if 'EmployeeID' in present_id_cols else present_id_cols[0]
        id_vars.append(chosen_id_col)
        print(f"Using '{chosen_id_col}' as part of ID variables for melting.")
    else:
        # No ID column: synthesize a row ID so melt/merge can align rows.
        # It is internal only and is dropped before returning.
        df_wide = df_wide.reset_index().rename(columns={'index': '_TempRowID'})
        id_vars.append('_TempRowID')
        print("No standard 'EmployeeID' found. Using temporary row ID for melting.")

    feedback_cols = [col for col in df_wide.columns if col.endswith('_Feedback_Stars')]
    rating_cols = [col for col in df_wide.columns if col.endswith('_Instructor_Rating')]

    if not feedback_cols and not rating_cols:
        print("Warning: No columns found ending with '_Feedback_Stars' or '_Instructor_Rating'. Transformation might be incorrect.")
        # BUGFIX: do not leak the internal '_TempRowID' column to callers.
        public_ids = [c for c in id_vars if c != '_TempRowID']
        return pd.DataFrame(columns=public_ids + ['Subject', 'Feedback_Stars', 'Instructor_Rating'])

    df_feedback_long = pd.DataFrame(columns=id_vars + ['Subject', 'Feedback_Stars'])
    if feedback_cols:
        df_feedback_long = pd.melt(df_wide,
                                   id_vars=id_vars,
                                   value_vars=feedback_cols,
                                   var_name='Subject_Raw_FB',
                                   value_name='Feedback_Stars')
        # regex=False: the suffix is a literal string, not a pattern (the
        # default regex behavior differs across pandas versions).
        df_feedback_long['Subject'] = df_feedback_long['Subject_Raw_FB'].str.replace('_Feedback_Stars', '', regex=False)
        df_feedback_long.drop(columns=['Subject_Raw_FB'], inplace=True)

    df_rating_long = pd.DataFrame(columns=id_vars + ['Subject', 'Instructor_Rating'])
    if rating_cols:
        df_rating_long = pd.melt(df_wide,
                                 id_vars=id_vars,
                                 value_vars=rating_cols,
                                 var_name='Subject_Raw_IR',
                                 value_name='Instructor_Rating')
        df_rating_long['Subject'] = df_rating_long['Subject_Raw_IR'].str.replace('_Instructor_Rating', '', regex=False)
        df_rating_long.drop(columns=['Subject_Raw_IR'], inplace=True)

    # Merge feedback and ratings; outer join keeps subjects present in only one set.
    if not df_feedback_long.empty and not df_rating_long.empty:
        df_long = pd.merge(df_feedback_long, df_rating_long, on=id_vars + ['Subject'], how='outer')
    elif not df_feedback_long.empty:
        df_long = df_feedback_long
        df_long['Instructor_Rating'] = np.nan
    elif not df_rating_long.empty:
        df_long = df_rating_long
        df_long['Feedback_Stars'] = np.nan
    else:
        df_long = pd.DataFrame(columns=id_vars + ['Subject', 'Feedback_Stars', 'Instructor_Rating'])

    if '_TempRowID' in df_long.columns:
        df_long.drop(columns=['_TempRowID'], inplace=True)

    # Remove rows where BOTH Feedback_Stars and Instructor_Rating are NaN
    df_long.dropna(subset=['Feedback_Stars', 'Instructor_Rating'], how='all', inplace=True)

    # Ensure Subject and Department are strings
    df_long['Subject'] = df_long['Subject'].astype(str)
    df_long['Department'] = df_long['Department'].astype(str)

    print(f"Data transformed to long format. Shape: {df_long.shape}")
    return df_long


def preprocess_data(df_wide: pd.DataFrame) -> pd.DataFrame:
    """
    Transforms wide data to long, cleans it, and ensures appropriate types.

    Rating columns are coerced to numeric (invalid entries become NaN),
    rounded to the nearest integer, and stored as nullable Int64 so missing
    values survive the integer conversion.
    """
    print("Starting preprocessing...")
    df_long = transform_wide_to_long(df_wide)

    if df_long.empty:
        print("Warning: Data transformation resulted in an empty DataFrame.")
        return df_long

    # Convert rating columns to numeric, coercing errors; use nullable Int64.
    # BUGFIX: round before the Int64 cast — astype('Int64') raises
    # "cannot safely cast" on fractional values (e.g. a "4.5" entry).
    for col in ('Feedback_Stars', 'Instructor_Rating'):
        df_long[col] = pd.to_numeric(df_long[col], errors='coerce').round().astype('Int64')

    # Subject and Department should already be strings from transform_wide_to_long
    df_long['Subject'] = df_long['Subject'].astype(str)
    df_long['Department'] = df_long['Department'].astype(str)

    # Optional: Validate range (1-5) if needed, though clipping/rounding happens later
    # df_long['Feedback_Stars'] = df_long['Feedback_Stars'].apply(lambda x: x if pd.isna(x) or 1 <= x <= 5 else pd.NA)
    # df_long['Instructor_Rating'] = df_long['Instructor_Rating'].apply(lambda x: x if pd.isna(x) or 1 <= x <= 5 else pd.NA)

    print("Data preprocessing (type conversion, NaN handling) complete.")
    return df_long


def get_feedback_distribution(df: pd.DataFrame) -> dict:
    """Calculates feedback distribution per subject, ensuring standard Python types.

    Returns ``{subject: {rating: count}}`` with plain ``str`` subject keys and
    plain ``int`` rating keys/count values (NaN ratings are excluded).
    """
    if 'Feedback_Stars' not in df.columns or df['Feedback_Stars'].isnull().all():
        return {}
    distribution = {}
    # BUGFIX: do not mutate the caller's DataFrame — group by a local
    # string-typed series instead of overwriting df['Subject'].
    subject_keys = df['Subject'].astype(str)

    for subject, stars in df.groupby(subject_keys)['Feedback_Stars']:
        # Use dropna() before value_counts
        counts_series = stars.dropna().value_counts().sort_index()
        if not counts_series.empty:
            # Explicitly convert keys (ratings) and values (counts) to standard int;
            # subject keys are already plain str, so no further conversion is needed.
            distribution[subject] = {int(k): int(v) for k, v in counts_series.items()}

    return distribution

def get_instructor_rating_distribution(df: pd.DataFrame) -> dict:
    """Calculates instructor rating distribution per subject, ensuring standard Python types.

    Returns ``{subject: {rating: count}}`` with plain ``str`` subject keys and
    plain ``int`` rating keys/count values (NaN ratings are excluded).
    """
    if 'Instructor_Rating' not in df.columns or df['Instructor_Rating'].isnull().all():
        return {}
    distribution = {}
    # BUGFIX: do not mutate the caller's DataFrame — group by a local
    # string-typed series instead of overwriting df['Subject'].
    subject_keys = df['Subject'].astype(str)

    for subject, ratings in df.groupby(subject_keys)['Instructor_Rating']:
        counts_series = ratings.dropna().value_counts().sort_index()
        if not counts_series.empty:
            # Subject keys are already plain str; ratings/counts converted to int.
            distribution[subject] = {int(k): int(v) for k, v in counts_series.items()}

    return distribution

def get_average_scores(df: pd.DataFrame) -> dict:
    """Calculates average scores (rounded) and counts, returning dict of DataFrames.

    Returns three DataFrames keyed as 'avg_scores_subject', 'avg_scores_dept',
    and 'avg_scores_subject_dept'. Averages are rounded to 1 decimal place;
    response counts are standard ints.
    """
    # BUGFIX: operate on a copy so the type coercions below do not mutate
    # the caller's DataFrame.
    df = df.copy()
    df['Subject'] = df['Subject'].astype(str)
    df['Department'] = df['Department'].astype(str)
    # Ensure ratings are numeric for mean calculation (already done in
    # preprocess_data, but good practice for direct callers).
    df['Feedback_Stars'] = pd.to_numeric(df['Feedback_Stars'], errors='coerce')
    df['Instructor_Rating'] = pd.to_numeric(df['Instructor_Rating'], errors='coerce')

    # Shared aggregation spec keeps the per-subject and per-department
    # groupbys consistent.
    agg_spec = dict(
        Average_Feedback_Stars=('Feedback_Stars', 'mean'),
        Average_Instructor_Rating=('Instructor_Rating', 'mean'),
        Total_Responses_Feedback=('Feedback_Stars', 'count'),
        Total_Responses_Instructor=('Instructor_Rating', 'count'),
    )
    # Round averages to 1 decimal place (rounding int counts is a no-op).
    avg_scores_subject = df.groupby('Subject').agg(**agg_spec).round(1).reset_index()
    avg_scores_dept = df.groupby('Department').agg(**agg_spec).round(1).reset_index()
    avg_scores_subject_dept = df.groupby(['Department', 'Subject']).agg(
        Average_Feedback_Stars=('Feedback_Stars', 'mean'),
        Average_Instructor_Rating=('Instructor_Rating', 'mean')
    ).round(1).reset_index()

    # groupby 'count' never yields NaN, but convert explicitly to standard int
    # for consistency with downstream consumers.
    for col in ('Total_Responses_Feedback', 'Total_Responses_Instructor'):
        if col in avg_scores_subject.columns:
            avg_scores_subject[col] = avg_scores_subject[col].astype(int)
        if col in avg_scores_dept.columns:
            avg_scores_dept[col] = avg_scores_dept[col].astype(int)

    # Means may remain float64; pandas rendering (e.g. to_markdown) handles
    # those fine — the dict outputs from the distribution helpers were the
    # conversion-sensitive part.
    return {
        "avg_scores_subject": avg_scores_subject,
        "avg_scores_dept": avg_scores_dept,
        "avg_scores_subject_dept": avg_scores_subject_dept
    }