psychology-tutor-engine / feature_engineering.py
adfras's picture
Initial commit: Psychology tutor engine and data pipelines
1da14e1
# feature_engineering.py
import pandas as pd
import numpy as np
def simulate_student_interactions(df_qa, num_students, interactions_per_student):
    """Build a synthetic, time-ordered interaction log from static Q&A rows.

    For every simulated student, ``interactions_per_student`` rows are drawn
    from ``df_qa`` with replacement. Each row is then annotated with:

    * ``student_id``        -- the student's index in ``range(num_students)``
    * ``is_correct``        -- 1/0 outcome drawn against a per-``source``
      mastery level that starts at 0.1, moves 25% of the way towards 1 after
      a correct answer and shrinks by 10% after an incorrect one
    * ``response_time_sec`` -- drawn from N(25, 5) for correct answers and
      N(60, 15) for incorrect ones, clipped to [5, 300]
    * ``timestamp``         -- the current time plus one minute per row

    Args:
        df_qa: Q&A DataFrame; must contain a ``source`` column.
        num_students: Number of students to simulate.
        interactions_per_student: Number of rows sampled for each student.

    Returns:
        The concatenated log for all students (fresh 0..n-1 index), or an
        empty ``pd.DataFrame()`` when ``df_qa`` is ``None``/empty or no
        student was simulated.
    """
    if df_qa is None or df_qa.empty:
        return pd.DataFrame()

    print(f"\nSimulating interaction logs for {num_students} students...")

    skills = df_qa['source'].unique()
    per_student_logs = []

    for student_id in range(num_students):
        log = df_qa.sample(n=interactions_per_student, replace=True).copy()
        log['student_id'] = student_id
        n_rows = len(log)

        # Every student starts with the same low mastery for every skill.
        mastery = {skill: 0.1 for skill in skills}
        outcomes = []
        for source in log['source']:
            level = mastery.get(source, 0.1)
            answered_right = 1 if np.random.rand() < level else 0
            outcomes.append(answered_right)
            if answered_right:
                # Correct: close a quarter of the remaining gap to full mastery.
                delta = (1 - level) * 0.25
            else:
                # Incorrect: lose a tenth of the current mastery.
                delta = -level * 0.1
            mastery[source] += delta
        log['is_correct'] = outcomes

        # Draw both candidate time vectors up front (fast first, then slow),
        # then pick per row according to the outcome.
        fast_times = np.random.normal(25, 5, size=n_rows)
        slow_times = np.random.normal(60, 15, size=n_rows)
        chosen_times = np.where(log['is_correct'] == 1, fast_times, slow_times)
        log['response_time_sec'] = chosen_times.clip(5, 300)

        # One interaction per minute, starting from "now".
        minute_offsets = pd.to_timedelta(np.arange(n_rows), 'm')
        log['timestamp'] = pd.to_datetime(pd.Timestamp.now() + minute_offsets)

        per_student_logs.append(log)

    if per_student_logs:
        df_simulated = pd.concat(per_student_logs, ignore_index=True)
    else:
        df_simulated = pd.DataFrame()

    print(f"Simulation complete. Generated {len(df_simulated):,} interactions.")
    return df_simulated
def create_features(df, skill_encoder):
    """
    Takes a dataframe of student interactions and engineers the features
    needed for the LGBM model.

    Args:
        df: Interaction log with the columns ``student_id``, ``timestamp``,
            ``source``, ``is_correct``, ``response_time_sec`` and
            ``question``. ``None`` or an empty frame (including one with no
            columns at all) is accepted and yields an empty result.
        skill_encoder: Fitted encoder exposing ``classes_`` and
            ``transform`` (presumably a scikit-learn ``LabelEncoder`` fitted
            on ``source`` values -- confirm against the caller).

    Returns:
        A new DataFrame (the input is never modified), stably sorted by
        ``student_id`` then ``timestamp``, restricted to rows whose
        ``source`` is known to the encoder, with these columns added:

        * ``skill_id_encoded``    -- ``skill_encoder.transform(source)``
        * ``prior_is_correct``    -- the student's previous ``is_correct``
        * ``prior_response_time`` -- the student's previous response time
        * ``skill_attempts``      -- earlier attempts by this student on
          this skill (0 for the first attempt)
        * ``skill_correct_rate``  -- share of those earlier attempts that
          were correct; 0.5 when there are none
        * ``question_length``     -- character length of ``question``
          (0 when missing)

        Rows with no previous interaction for the student (each student's
        first row) are dropped. An empty ``pd.DataFrame()`` is returned when
        there is nothing to process.
    """
    # simulate_student_interactions() hands back a column-less frame when it
    # has no data; indexing ['source'] on that would raise KeyError, so bail
    # out early using the same "empty DataFrame" convention.
    if df is None or df.empty:
        return pd.DataFrame()

    # Filter first, then copy: only the rows that are kept get duplicated,
    # and the result is an independent frame that is safe to add columns to.
    known_mask = df['source'].isin(skill_encoder.classes_)
    processed_df = df[known_mask].copy()
    if processed_df.empty:
        return pd.DataFrame()

    processed_df['skill_id_encoded'] = skill_encoder.transform(processed_df['source'])

    # Stable sort so interactions sharing a timestamp keep their input order.
    processed_df = processed_df.sort_values(['student_id', 'timestamp'], kind='mergesort')

    # Lag features: the student's immediately preceding interaction.
    by_student = processed_df.groupby('student_id')
    prior_correct = by_student['is_correct'].shift(1)
    prior_time = by_student['response_time_sec'].shift(1)
    processed_df['prior_is_correct'] = prior_correct
    processed_df['prior_response_time'] = prior_time

    # Per (student, skill) history, excluding the current row so the target
    # does not leak into its own features.
    by_student_skill = processed_df.groupby(['student_id', 'skill_id_encoded'])
    attempts = by_student_skill.cumcount()
    correct_including_current = by_student_skill['is_correct'].cumsum()
    processed_df['skill_attempts'] = attempts
    prior_correct_count = correct_including_current - processed_df['is_correct']
    # First attempt on a skill is 0/0 -> NaN; use a neutral 0.5 prior there.
    processed_df['skill_correct_rate'] = (prior_correct_count / attempts).fillna(0.5)

    processed_df['question_length'] = processed_df['question'].str.len().fillna(0)

    # A student's first interaction has no "prior" values and cannot be used.
    return processed_df.dropna(subset=['prior_is_correct', 'prior_response_time'])