Spaces:
Runtime error
Runtime error
| # feature_engineering.py | |
| import pandas as pd | |
| import numpy as np | |
def simulate_student_interactions(df_qa, num_students, interactions_per_student):
    """Generate a synthetic, time-ordered interaction log from static Q&A data.

    For every simulated student, ``interactions_per_student`` rows are drawn
    from ``df_qa`` with replacement. Each student starts with the same low
    mastery for every ``source``; answering correctly moves that source's
    mastery a quarter of the way towards 1, answering incorrectly shrinks it
    by 10 %.

    Args:
        df_qa: DataFrame of questions; must contain a ``source`` column.
        num_students: Number of students to simulate.
        interactions_per_student: Number of rows sampled per student.

    Returns:
        A DataFrame holding the sampled ``df_qa`` columns plus ``student_id``,
        ``is_correct`` (0/1), ``response_time_sec`` (clipped to [5, 300]) and
        ``timestamp`` (one-minute steps starting at the current time). An
        empty DataFrame is returned when ``df_qa`` is ``None``/empty or when
        no students are simulated.

    Note:
        Randomness comes from NumPy's global random state, so callers can make
        the output reproducible with ``np.random.seed``.
    """
    if df_qa is None or df_qa.empty:
        return pd.DataFrame()

    initial_mastery = 0.1

    print(f"\nSimulating interaction logs for {num_students} students...")
    sources = df_qa['source'].unique()
    all_interactions = []
    for student_id in range(num_students):
        student_interactions = df_qa.sample(n=interactions_per_student, replace=True).copy()
        student_interactions['student_id'] = student_id

        mastery = dict.fromkeys(sources, initial_mastery)
        correct_list = []
        # Only the source column is needed, so iterate it directly instead of
        # materialising a Series per row with iterrows().
        for source in student_interactions['source']:
            # .get() on both read and write: a source that cannot be found in
            # the table (e.g. NaN, which never compares equal to itself) is
            # treated as "initial mastery" instead of raising KeyError.
            level = mastery.get(source, initial_mastery)
            is_correct = 1 if np.random.rand() < level else 0
            correct_list.append(is_correct)
            if is_correct:
                level += (1 - level) * 0.25
            else:
                level -= level * 0.1
            mastery[source] = level
        student_interactions['is_correct'] = correct_list

        n_rows = len(student_interactions)
        # Both candidate time vectors are drawn for every row; np.where then
        # picks the fast one for correct answers and the slow one otherwise.
        correct_times = np.random.normal(25, 5, size=n_rows)
        incorrect_times = np.random.normal(60, 15, size=n_rows)
        student_interactions['response_time_sec'] = np.where(
            student_interactions['is_correct'] == 1, correct_times, incorrect_times
        ).clip(5, 300)

        # One interaction per minute, starting now.
        student_interactions['timestamp'] = (
            pd.Timestamp.now() + pd.to_timedelta(np.arange(n_rows), unit='min')
        )
        all_interactions.append(student_interactions)

    df_simulated = pd.concat(all_interactions, ignore_index=True) if all_interactions else pd.DataFrame()
    print(f"Simulation complete. Generated {len(df_simulated):,} interactions.")
    return df_simulated
def create_features(df, skill_encoder):
    """Engineer the per-interaction features consumed by the LGBM model.

    Rows whose ``source`` is not among ``skill_encoder.classes_`` are
    discarded. The remaining rows are ordered by student and timestamp and
    gain these columns:

    * ``skill_id_encoded``    - ``skill_encoder.transform`` of ``source``
    * ``prior_is_correct``    - the student's previous ``is_correct``
    * ``prior_response_time`` - the student's previous ``response_time_sec``
    * ``skill_attempts``      - earlier attempts by the student on this skill
    * ``skill_correct_rate``  - share of those earlier attempts answered
      correctly (0.5 when there are none)
    * ``question_length``     - character count of ``question`` (0 if missing)

    Rows without a previous interaction for the student are dropped. The
    input frame is not modified. An empty DataFrame is returned when no row
    has a known source.
    """
    recognised = df['source'].isin(skill_encoder.classes_)
    features = df[recognised].copy()
    if features.empty:
        return pd.DataFrame()

    features['skill_id_encoded'] = skill_encoder.transform(features['source'])
    # A stable sort keeps the incoming order for rows with equal timestamps.
    features = features.sort_values(['student_id', 'timestamp'], kind='mergesort')

    # Lag features: whatever the same student did on the preceding row.
    per_student = features.groupby('student_id')
    features['prior_is_correct'] = per_student['is_correct'].shift(1)
    features['prior_response_time'] = per_student['response_time_sec'].shift(1)

    # Running per-(student, skill) history that excludes the current row.
    per_student_skill = features.groupby(['student_id', 'skill_id_encoded'])
    features['skill_attempts'] = per_student_skill.cumcount()
    earlier_correct = per_student_skill['is_correct'].cumsum() - features['is_correct']
    # First attempt on a skill is 0/0 -> NaN, replaced by a neutral 0.5.
    features['skill_correct_rate'] = earlier_correct.div(features['skill_attempts']).fillna(0.5)

    features['question_length'] = features['question'].str.len().fillna(0)

    # The first interaction of each student has no lag values; drop it.
    return features.dropna(subset=['prior_is_correct', 'prior_response_time'])