# Provenance: akwel_performance / src/preprocessor.py
# (originally uploaded by ArkenB, commit 62a4c11, "Create preprocessor.py")
# src/preprocessor.py
import pandas as pd
import numpy as np
# Helper function to recursively convert numpy types to standard Python types
def convert_numpy_types(data):
    """Recursively converts numpy types in nested data structures to standard Python types."""
    if isinstance(data, dict):
        # Keys can be numpy scalars too, so both sides are converted.
        return {convert_numpy_types(key): convert_numpy_types(value)
                for key, value in data.items()}
    if isinstance(data, list):
        return [convert_numpy_types(item) for item in data]
    # Numpy scalar families map directly onto Python builtins.
    for numpy_family, builtin in ((np.integer, int),
                                  (np.floating, float),
                                  (np.bool_, bool)):
        if isinstance(data, numpy_family):
            return builtin(data)
    if isinstance(data, np.ndarray):
        # Arrays become (possibly nested) lists, then converted element-wise.
        return convert_numpy_types(data.tolist())
    if isinstance(data, (str, int, float, bool, type(None))):
        return data  # already a plain Python type
    # Last resort for other numpy-like scalars: .item() yields the Python equivalent.
    try:
        if hasattr(data, 'item'):
            return data.item()
    except Exception:
        pass
    # Fallback: return string representation if unsure
    print(f"Warning: Type {type(data)} not explicitly handled in convert_numpy_types. Converting to string.")
    return str(data)
def transform_wide_to_long(df_wide: pd.DataFrame) -> pd.DataFrame:
    """
    Transforms a wide DataFrame (subjects as columns) to a long DataFrame
    (Subject, Feedback_Stars, Instructor_Rating columns).

    Assumes subject columns are named like 'SubjectName_Feedback_Stars' and
    'SubjectName_Instructor_Rating', and that a 'Department' column exists.

    Args:
        df_wide: Wide-format survey data. The caller's DataFrame is not modified.

    Returns:
        Long-format DataFrame with the id columns plus 'Subject',
        'Feedback_Stars' and 'Instructor_Rating'. Rows where both rating
        values are missing are dropped.
    """
    id_col_options = ['EmployeeID', 'Employee ID', 'ID']  # Common names for an ID column
    id_vars = ['Department']
    present_id_cols = [col for col in id_col_options if col in df_wide.columns]
    if present_id_cols:
        # Prefer the canonical 'EmployeeID' spelling when several candidates exist.
        chosen_id_col = 'EmployeeID' if 'EmployeeID' in present_id_cols else present_id_cols[0]
        id_vars.append(chosen_id_col)
        print(f"Using '{chosen_id_col}' as part of ID variables for melting.")
    else:
        # No ID column: synthesize a positional row id. Assigning a fresh column
        # is robust against named or non-default indexes, unlike the previous
        # reset_index().rename(columns={'index': ...}) approach, which produced
        # a differently named column (and a KeyError in melt) when the index
        # carried a name.
        df_wide = df_wide.copy()
        df_wide['_TempRowID'] = range(len(df_wide))
        id_vars.append('_TempRowID')
        print("No standard 'EmployeeID' found. Using temporary row ID for melting.")
    feedback_cols = [col for col in df_wide.columns if col.endswith('_Feedback_Stars')]
    rating_cols = [col for col in df_wide.columns if col.endswith('_Instructor_Rating')]
    if not feedback_cols and not rating_cols:
        print("Warning: No columns found ending with '_Feedback_Stars' or '_Instructor_Rating'. Transformation might be incorrect.")
        return pd.DataFrame(columns=id_vars + ['Subject', 'Feedback_Stars', 'Instructor_Rating'])
    df_feedback_long = pd.DataFrame(columns=id_vars + ['Subject', 'Feedback_Stars'])
    if feedback_cols:
        df_feedback_long = pd.melt(df_wide,
                                   id_vars=id_vars,
                                   value_vars=feedback_cols,
                                   var_name='Subject_Raw_FB',
                                   value_name='Feedback_Stars')
        # Strip the metric suffix to recover the subject name. regex=False makes
        # this a literal replacement regardless of the pandas version's default.
        df_feedback_long['Subject'] = df_feedback_long['Subject_Raw_FB'].str.replace('_Feedback_Stars', '', regex=False)
        df_feedback_long.drop(columns=['Subject_Raw_FB'], inplace=True)
    df_rating_long = pd.DataFrame(columns=id_vars + ['Subject', 'Instructor_Rating'])
    if rating_cols:
        df_rating_long = pd.melt(df_wide,
                                 id_vars=id_vars,
                                 value_vars=rating_cols,
                                 var_name='Subject_Raw_IR',
                                 value_name='Instructor_Rating')
        df_rating_long['Subject'] = df_rating_long['Subject_Raw_IR'].str.replace('_Instructor_Rating', '', regex=False)
        df_rating_long.drop(columns=['Subject_Raw_IR'], inplace=True)
    # Merge feedback and ratings; outer join keeps subjects present in only one metric.
    if not df_feedback_long.empty and not df_rating_long.empty:
        df_long = pd.merge(df_feedback_long, df_rating_long, on=id_vars + ['Subject'], how='outer')
    elif not df_feedback_long.empty:
        df_long = df_feedback_long
        df_long['Instructor_Rating'] = np.nan
    elif not df_rating_long.empty:
        df_long = df_rating_long
        df_long['Feedback_Stars'] = np.nan
    else:
        df_long = pd.DataFrame(columns=id_vars + ['Subject', 'Feedback_Stars', 'Instructor_Rating'])
    # The synthetic row id has served its purpose (keeping melt rows aligned).
    if '_TempRowID' in df_long.columns:
        df_long.drop(columns=['_TempRowID'], inplace=True)
    # Remove rows where BOTH Feedback_Stars and Instructor_Rating are NaN
    df_long.dropna(subset=['Feedback_Stars', 'Instructor_Rating'], how='all', inplace=True)
    # Ensure Subject and Department are strings
    df_long['Subject'] = df_long['Subject'].astype(str)
    df_long['Department'] = df_long['Department'].astype(str)
    print(f"Data transformed to long format. Shape: {df_long.shape}")
    return df_long
def preprocess_data(df_wide: pd.DataFrame) -> pd.DataFrame:
    """
    Transforms wide data to long, cleans it, and ensures appropriate types.
    """
    print("Starting preprocessing...")
    df_long = transform_wide_to_long(df_wide)
    if df_long.empty:
        print("Warning: Data transformation resulted in an empty DataFrame.")
        return df_long
    # Nullable Int64 keeps missing ratings as <NA> rather than forcing floats.
    for rating_col in ('Feedback_Stars', 'Instructor_Rating'):
        df_long[rating_col] = pd.to_numeric(df_long[rating_col], errors='coerce').astype('Int64')
    # Re-assert string dtype for the categorical identifiers
    # (transform_wide_to_long already does this; kept as a safeguard).
    for text_col in ('Subject', 'Department'):
        df_long[text_col] = df_long[text_col].astype(str)
    print("Data preprocessing (type conversion, NaN handling) complete.")
    return df_long
def get_feedback_distribution(df: pd.DataFrame) -> dict:
    """Calculates feedback distribution per subject, ensuring standard Python types.

    Returns a mapping {subject: {rating: count}} using plain Python str/int.
    Subjects with no non-null feedback are omitted. The caller's DataFrame
    is not modified.
    """
    if 'Feedback_Stars' not in df.columns or df['Feedback_Stars'].isnull().all():
        return {}
    distribution = {}
    # Use a local string-typed Series for grouping instead of writing the
    # conversion back into df (the previous version mutated the caller's frame).
    subjects = df['Subject'].astype(str)
    for subject in subjects.unique():
        # Drop NaNs before counting so missing ratings never appear as keys.
        counts_series = df.loc[subjects == subject, 'Feedback_Stars'].dropna().value_counts().sort_index()
        if not counts_series.empty:
            # Explicitly convert keys (ratings) and values (counts) to standard int
            distribution[subject] = {int(k): int(v) for k, v in counts_series.items()}
    # Although direct conversion is done, run through helper as a final safeguard for nested types
    return convert_numpy_types(distribution)
def get_instructor_rating_distribution(df: pd.DataFrame) -> dict:
    """Calculates instructor rating distribution per subject, ensuring standard Python types.

    Returns a mapping {subject: {rating: count}} using plain Python str/int.
    Subjects with no non-null ratings are omitted. The caller's DataFrame
    is not modified.
    """
    if 'Instructor_Rating' not in df.columns or df['Instructor_Rating'].isnull().all():
        return {}
    distribution = {}
    # Use a local string-typed Series for grouping instead of writing the
    # conversion back into df (the previous version mutated the caller's frame).
    subjects = df['Subject'].astype(str)
    for subject in subjects.unique():
        counts_series = df.loc[subjects == subject, 'Instructor_Rating'].dropna().value_counts().sort_index()
        if not counts_series.empty:
            distribution[subject] = {int(k): int(v) for k, v in counts_series.items()}
    # Final safeguard against stray numpy types in the nested structure.
    return convert_numpy_types(distribution)
def get_average_scores(df: pd.DataFrame) -> dict:
    """Calculates average scores (rounded) and counts, returning dict of DataFrames.

    Returns:
        Dict with keys 'avg_scores_subject', 'avg_scores_dept' and
        'avg_scores_subject_dept', each a DataFrame of means rounded to one
        decimal place (plus response counts for the first two).
    """
    # Work on a copy: the type coercion below must not mutate the caller's
    # DataFrame (the previous version wrote all four columns back in place).
    df = df.copy()
    df['Subject'] = df['Subject'].astype(str)
    df['Department'] = df['Department'].astype(str)
    # Ensure ratings are numeric for mean calculation (already done in preprocess_data, but good practice)
    df['Feedback_Stars'] = pd.to_numeric(df['Feedback_Stars'], errors='coerce')
    df['Instructor_Rating'] = pd.to_numeric(df['Instructor_Rating'], errors='coerce')
    # --- Aggregation and Rounding ---
    # Shared named-aggregation spec; .round(1) is a no-op on the integer counts.
    full_agg = dict(
        Average_Feedback_Stars=('Feedback_Stars', 'mean'),
        Average_Instructor_Rating=('Instructor_Rating', 'mean'),
        Total_Responses_Feedback=('Feedback_Stars', 'count'),
        Total_Responses_Instructor=('Instructor_Rating', 'count'),
    )
    avg_scores_subject = df.groupby('Subject').agg(**full_agg).round(1).reset_index()
    avg_scores_dept = df.groupby('Department').agg(**full_agg).round(1).reset_index()
    avg_scores_subject_dept = df.groupby(['Department', 'Subject']).agg(
        Average_Feedback_Stars=('Feedback_Stars', 'mean'),
        Average_Instructor_Rating=('Instructor_Rating', 'mean'),
    ).round(1).reset_index()
    # --- End Rounding ---
    # Normalize count columns to plain int for downstream serialization.
    for frame in (avg_scores_subject, avg_scores_dept):
        for col in ('Total_Responses_Feedback', 'Total_Responses_Instructor'):
            if col in frame.columns:
                # Use nullable Int64 if counts can be 0, then convert safely
                frame[col] = frame[col].astype('Int64').fillna(0).astype(int)
    # Means may remain float64; pandas display helpers (e.g. to_markdown) handle
    # that fine — the critical conversions are the dicts from the distribution functions.
    return {
        "avg_scores_subject": avg_scores_subject,
        "avg_scores_dept": avg_scores_dept,
        "avg_scores_subject_dept": avg_scores_subject_dept
    }