# Page header (scraped): Spaces: Paused | File size: 10,846 Bytes | 62a4c11
# src/preprocessor.py
import pandas as pd
import numpy as np
# Helper function to recursively convert numpy types to standard Python types
def convert_numpy_types(data):
    """Recursively convert numpy values in nested data to standard Python types.

    Containers are rebuilt with converted contents:
      * dict  -> dict (both keys and values are converted)
      * list  -> list
      * tuple -> tuple (kept as a tuple so it remains usable as a dict key)
      * numpy.ndarray -> nested lists via ``tolist()``

    Scalars: numpy integers, floats, booleans and strings become ``int``,
    ``float``, ``bool`` and ``str``; ``str``/``int``/``float``/``bool``/``None``
    are returned unchanged.

    Any other object is converted with its ``item()`` method when it has a
    working one; otherwise a warning is printed and ``str(data)`` is returned.
    """
    if isinstance(data, dict):
        return {convert_numpy_types(k): convert_numpy_types(v) for k, v in data.items()}
    if isinstance(data, list):
        return [convert_numpy_types(item) for item in data]
    if isinstance(data, tuple):
        # Without this branch a tuple would hit the string fallback below and
        # come back as e.g. "(1, 2)" instead of its converted elements.
        return tuple(convert_numpy_types(item) for item in data)
    if isinstance(data, np.ndarray):
        # tolist() may still contain numpy scalars for object arrays, so recurse.
        return convert_numpy_types(data.tolist())
    if isinstance(data, np.integer):  # numpy ints (int32, int64, ...)
        return int(data)
    if isinstance(data, np.floating):  # numpy floats; checked before plain float
        return float(data)
    if isinstance(data, np.bool_):
        return bool(data)
    if isinstance(data, np.str_):
        # np.str_ subclasses str, so it must be handled before the pass-through.
        return str(data)
    if data is None or isinstance(data, (str, int, float, bool)):
        return data

    # Other numpy-like scalars: try the generic item() accessor.
    item = getattr(data, 'item', None)
    if callable(item):
        try:
            return item()
        except Exception:
            # Deliberate best effort: a failing item() is reported through the
            # warning below rather than raised to the caller.
            pass

    # Fallback: return the string representation when the type is unknown.
    print(f"Warning: Type {type(data)} not explicitly handled in convert_numpy_types. Converting to string.")
    return str(data)
def _melt_subject_metric(df_wide, id_vars, suffix, value_name):
    """Melt every '<Subject><suffix>' column into rows of (id_vars..., 'Subject', value_name)."""
    metric_cols = [
        col for col in df_wide.columns
        if isinstance(col, str) and col.endswith(suffix)
    ]
    if not metric_cols:
        return pd.DataFrame(columns=id_vars + ['Subject', value_name])

    melted = pd.melt(
        df_wide,
        id_vars=id_vars,
        value_vars=metric_cols,
        var_name='Subject',
        value_name=value_name,
    )
    # Strip only the trailing suffix; a replace-all would also remove the
    # same text if it happened to occur inside the subject name itself.
    subject_by_col = {col: col[:-len(suffix)] for col in metric_cols}
    melted['Subject'] = melted['Subject'].map(subject_by_col)
    return melted


def transform_wide_to_long(df_wide: pd.DataFrame) -> pd.DataFrame:
    """
    Transforms a wide DataFrame (subjects as columns) to a long DataFrame.

    Subject columns are expected to be named 'SubjectName_Feedback_Stars' and
    'SubjectName_Instructor_Rating'. The result has one row per source row and
    subject, with the columns (in this order):

        'Department', [the ID column, if one was found], 'Subject',
        'Feedback_Stars', 'Instructor_Rating'

    The ID column is the first of 'EmployeeID', 'Employee ID', 'ID' present in
    `df_wide`. Rows where both ratings are missing are dropped, and 'Subject'
    and 'Department' are cast to str. `df_wide` itself is not modified.

    A 'Department' column is required whenever rating columns exist; it is
    passed to pandas.melt as an id variable. If no rating columns exist at
    all, an empty DataFrame with the columns above is returned.
    """
    id_col_options = ['EmployeeID', 'Employee ID', 'ID']  # Common names for an ID column
    id_vars = ['Department']
    present_id_cols = [col for col in id_col_options if col in df_wide.columns]
    if present_id_cols:
        # id_col_options is ordered by preference, so the first hit wins.
        chosen_id_col = present_id_cols[0]
        id_vars.append(chosen_id_col)
        print(f"Using '{chosen_id_col}' as part of ID variables for melting.")
    else:
        print("No standard 'EmployeeID' found. Using temporary row ID for melting.")

    output_columns = id_vars + ['Subject', 'Feedback_Stars', 'Instructor_Rating']

    has_rating_columns = any(
        isinstance(col, str) and col.endswith(('_Feedback_Stars', '_Instructor_Rating'))
        for col in df_wide.columns
    )
    if not has_rating_columns:
        print("Warning: No columns found ending with '_Feedback_Stars' or '_Instructor_Rating'. Transformation might be incorrect.")
        return pd.DataFrame(columns=output_columns)

    # Always carry a positional row key through the melt/merge:
    #  * it is created directly, so it exists even when the index is named or
    #    a column called 'index' is present (reset_index + rename would not
    #    produce it in those cases);
    #  * it keeps the merge one-to-one even when an ID value is repeated,
    #    which would otherwise pair every feedback row with every rating row
    #    of that ID.
    df_keyed = df_wide.assign(_TempRowID=np.arange(len(df_wide)))
    merge_keys = id_vars + ['_TempRowID']

    df_feedback_long = _melt_subject_metric(df_keyed, merge_keys, '_Feedback_Stars', 'Feedback_Stars')
    df_rating_long = _melt_subject_metric(df_keyed, merge_keys, '_Instructor_Rating', 'Instructor_Rating')

    # Merge feedback and ratings
    if not df_feedback_long.empty and not df_rating_long.empty:
        df_long = pd.merge(df_feedback_long, df_rating_long, on=merge_keys + ['Subject'], how='outer')
    elif not df_feedback_long.empty:
        df_long = df_feedback_long.assign(Instructor_Rating=np.nan)
    elif not df_rating_long.empty:
        df_long = df_rating_long.assign(Feedback_Stars=np.nan)
    else:
        df_long = pd.DataFrame(columns=output_columns)

    # The row key is internal only and never part of the result.
    df_long = df_long.drop(columns=['_TempRowID'], errors='ignore')

    # Remove rows where BOTH Feedback_Stars and Instructor_Rating are NaN
    df_long = df_long.dropna(subset=['Feedback_Stars', 'Instructor_Rating'], how='all')

    # Same column order regardless of which branch built the frame.
    df_long = df_long[output_columns].copy()

    # Ensure Subject and Department are strings
    df_long['Subject'] = df_long['Subject'].astype(str)
    df_long['Department'] = df_long['Department'].astype(str)
    print(f"Data transformed to long format. Shape: {df_long.shape}")
    return df_long
def preprocess_data(df_wide: pd.DataFrame) -> pd.DataFrame:
    """
    Transforms wide data to long, cleans it, and ensures appropriate types.

    The wide frame is reshaped with transform_wide_to_long(). If that yields
    an empty frame it is returned as-is (after printing a warning). Otherwise:

      * 'Feedback_Stars' and 'Instructor_Rating' are parsed with
        pandas.to_numeric (unparseable values become missing), rounded to the
        nearest whole number with Series.round(), and stored as nullable
        'Int64';
      * 'Subject' and 'Department' are cast to str.
    """
    print("Starting preprocessing...")
    df_long = transform_wide_to_long(df_wide)
    if df_long.empty:
        print("Warning: Data transformation resulted in an empty DataFrame.")
        return df_long

    # Convert rating columns to numeric, coercing errors; use nullable Int64.
    # Round first: casting a non-integer float (e.g. 4.5) straight to Int64
    # raises TypeError instead of producing a value.
    for rating_col in ('Feedback_Stars', 'Instructor_Rating'):
        numeric = pd.to_numeric(df_long[rating_col], errors='coerce')
        df_long[rating_col] = numeric.round().astype('Int64')

    # Subject and Department should already be strings from transform_wide_to_long
    df_long['Subject'] = df_long['Subject'].astype(str)
    df_long['Department'] = df_long['Department'].astype(str)

    # NOTE: no range validation (e.g. 1-5) is applied to the ratings here.
    print("Data preprocessing (type conversion, NaN handling) complete.")
    return df_long
def get_feedback_distribution(df: pd.DataFrame) -> dict:
    """Calculates feedback distribution per subject, ensuring standard Python types.

    Returns a dict of the form ``{subject: {rating: count}}`` built from the
    'Subject' and 'Feedback_Stars' columns, with subjects in order of first
    appearance and ratings in ascending order. Missing ratings are ignored and
    subjects without any rating are omitted. Ratings are converted with
    ``int()``, so a non-integer rating is truncated.

    Returns ``{}`` when 'Feedback_Stars' is absent or entirely missing.
    The input DataFrame is not modified.
    """
    if 'Feedback_Stars' not in df.columns or df['Feedback_Stars'].isnull().all():
        return {}

    # Group on a string view of Subject without writing it back into `df`,
    # so the caller's frame is left untouched.
    subjects = df['Subject'].astype(str)
    distribution = {}
    # sort=False keeps subjects in order of first appearance.
    for subject, ratings in df['Feedback_Stars'].groupby(subjects, sort=False):
        counts_series = ratings.dropna().value_counts().sort_index()
        if not counts_series.empty:
            # Explicitly convert keys (ratings) and values (counts) to standard int
            distribution[subject] = {int(k): int(v) for k, v in counts_series.items()}

    # Direct conversion is done above; the helper is a final safeguard for nested types.
    return convert_numpy_types(distribution)
def get_instructor_rating_distribution(df: pd.DataFrame) -> dict:
    """Calculates instructor rating distribution per subject, ensuring standard Python types.

    Returns a dict of the form ``{subject: {rating: count}}`` built from the
    'Subject' and 'Instructor_Rating' columns, with subjects in order of first
    appearance and ratings in ascending order. Missing ratings are ignored and
    subjects without any rating are omitted. Ratings are converted with
    ``int()``, so a non-integer rating is truncated.

    Returns ``{}`` when 'Instructor_Rating' is absent or entirely missing.
    The input DataFrame is not modified.
    """
    if 'Instructor_Rating' not in df.columns or df['Instructor_Rating'].isnull().all():
        return {}

    # Group on a string view of Subject without writing it back into `df`,
    # so the caller's frame is left untouched.
    subjects = df['Subject'].astype(str)
    distribution = {}
    # sort=False keeps subjects in order of first appearance.
    for subject, ratings in df['Instructor_Rating'].groupby(subjects, sort=False):
        counts_series = ratings.dropna().value_counts().sort_index()
        if not counts_series.empty:
            # Explicitly convert keys (ratings) and values (counts) to standard int
            distribution[subject] = {int(k): int(v) for k, v in counts_series.items()}

    return convert_numpy_types(distribution)
def get_average_scores(df: pd.DataFrame) -> dict:
    """Calculates average scores (rounded) and counts, returning dict of DataFrames.

    Reads the 'Subject', 'Department', 'Feedback_Stars' and
    'Instructor_Rating' columns of `df`. Ratings are parsed with
    pandas.to_numeric (unparseable values become missing and are excluded
    from means and counts). The input DataFrame is not modified.

    Returns a dict with three DataFrames:
      * "avg_scores_subject": one row per Subject with
        Average_Feedback_Stars, Average_Instructor_Rating,
        Total_Responses_Feedback, Total_Responses_Instructor;
      * "avg_scores_dept": the same columns, one row per Department;
      * "avg_scores_subject_dept": one row per (Department, Subject) with the
        two average columns only.
    Averages are rounded to 1 decimal place; counts are plain ints.
    """
    # Work on a private copy so the type coercions below never leak into the
    # caller's DataFrame.
    work = df[['Subject', 'Department', 'Feedback_Stars', 'Instructor_Rating']].copy()
    work['Subject'] = work['Subject'].astype(str)
    work['Department'] = work['Department'].astype(str)
    # Ensure ratings are numeric for the mean calculation.
    work['Feedback_Stars'] = pd.to_numeric(work['Feedback_Stars'], errors='coerce')
    work['Instructor_Rating'] = pd.to_numeric(work['Instructor_Rating'], errors='coerce')

    # Named aggregations shared by the group-bys below.
    mean_aggs = {
        'Average_Feedback_Stars': ('Feedback_Stars', 'mean'),
        'Average_Instructor_Rating': ('Instructor_Rating', 'mean'),
    }
    count_aggs = {
        'Total_Responses_Feedback': ('Feedback_Stars', 'count'),
        'Total_Responses_Instructor': ('Instructor_Rating', 'count'),
    }

    # --- Aggregation and Rounding (averages to 1 decimal place) ---
    avg_scores_subject = (
        work.groupby('Subject').agg(**mean_aggs, **count_aggs).round(1).reset_index()
    )
    avg_scores_dept = (
        work.groupby('Department').agg(**mean_aggs, **count_aggs).round(1).reset_index()
    )
    avg_scores_subject_dept = (
        work.groupby(['Department', 'Subject']).agg(**mean_aggs).round(1).reset_index()
    )

    # Convert count columns explicitly to standard int; going through nullable
    # Int64 first lets any missing count be filled with 0 before the cast.
    for frame in (avg_scores_subject, avg_scores_dept):
        for col in count_aggs:
            if col in frame.columns:
                frame[col] = frame[col].astype('Int64').fillna(0).astype(int)

    # The mean columns stay as pandas floats; only the counts are normalised.
    return {
        "avg_scores_subject": avg_scores_subject,
        "avg_scores_dept": avg_scores_dept,
        "avg_scores_subject_dept": avg_scores_subject_dept
    }
|