"""
Feature Extraction Module for ContextFlow RL Model

This module extracts the 64-dimensional state vector used by the RL model
for doubt prediction.

State Vector Structure (64 features):
├── Topic Embedding (32 dims) - TF-IDF of learning topic
├── Progress (1 dim) - Session progress 0.0-1.0
├── Confusion Signals (16 dims) - Behavioral indicators
├── Gesture Signals (14 dims) - Hand gesture frequencies
└── Time Spent (1 dim) - Normalized session time
"""
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import Dict, List, Optional
class FeatureExtractor:
    """Extract the 64-dimensional state vector consumed by the RL model.

    Vector layout:
        indices  0-31 : topic embedding (TF-IDF of the learning topic)
        index   32    : session progress, clipped to [0, 1]
        indices 33-48 : 16 normalized confusion (behavioral) signals
        indices 49-62 : 14 normalized gesture frequencies
        index   63    : session time, normalized to a 0-30 minute scale
    """

    def __init__(self) -> None:
        self.state_dim = 64
        # TF-IDF vectorizer for topics; vocabulary capped at 32 terms so the
        # topic slice of the state vector is at most 32-dimensional.
        self.topic_vectorizer = TfidfVectorizer(max_features=32)
        self._fit_topic_vectorizer()
        # Signal names for interpretability; list order defines which vector
        # slot each signal occupies.
        self.confusion_signal_names = [
            'mouse_hesitation', 'scroll_reversals', 'time_on_page',
            'eye_tracking_x', 'eye_tracking_y', 'page_scrolling',
            'click_frequency', 'back_button', 'tab_switches',
            'copy_attempts', 'zoom_level', 'scroll_speed',
            'reading_pauses', 'search_usage', 'bookmark_usage', 'print_usage'
        ]
        self.gesture_signal_names = [
            'pinch', 'swipe_up', 'swipe_down', 'swipe_left', 'swipe_right',
            'two_finger_swipe', 'point', 'wave', 'thumbs_up', 'thumbs_down',
            'fist', 'open_palm', 'rotation', 'zoom_gesture'
        ]

    def _fit_topic_vectorizer(self) -> None:
        """Fit the TF-IDF vocabulary on a fixed corpus of common learning topics."""
        topics = [
            'machine learning', 'deep learning', 'neural networks',
            'python programming', 'data science', 'statistics',
            'linear algebra', 'calculus', 'probability',
            'natural language processing', 'computer vision',
            'reinforcement learning', 'supervised learning', 'unsupervised learning',
            'classification', 'regression', 'clustering',
            'backpropagation', 'gradient descent', 'optimization',
            'transformers', 'attention mechanism', 'bert', 'gpt',
            'cnn', 'rnn', 'lstm', 'gru',
            'overfitting', 'underfitting', 'regularization',
            'cross validation', 'hyperparameters', 'training'
        ]
        self.topic_vectorizer.fit(topics)

    def extract_topic_embedding(self, topic: str) -> np.ndarray:
        """Return a 32-dimensional TF-IDF embedding of ``topic``.

        Args:
            topic: Learning topic string (lower-cased before transform).

        Returns:
            np.ndarray of shape (32,); zero-padded if the fitted vocabulary
            is smaller than 32 terms, truncated otherwise.
        """
        topic_vec = self.topic_vectorizer.transform([topic.lower()]).toarray()[0]
        if len(topic_vec) < 32:
            # The fitted vocabulary may contain fewer than 32 terms.
            topic_vec = np.pad(topic_vec, (0, 32 - len(topic_vec)))
        return topic_vec[:32]

    def extract_confusion_signals(self, signals: Dict) -> np.ndarray:
        """Return a 16-dimensional vector of confusion signals in [0, 1].

        Args:
            signals: Mapping from signal name (see
                ``confusion_signal_names``) to a raw numeric value; missing
                signals default to 0.

        Returns:
            np.ndarray of shape (16,), every entry clipped to [0, 1].
        """
        # Raw-value scale used to normalize each named signal; signals not
        # listed here are assumed to already be on a 0-1 scale.
        scales = {
            'mouse_hesitation': 5.0,   # raw 0-5 scale
            'scroll_reversals': 10.0,  # raw 0-10 scale
            'time_on_page': 300.0,     # raw seconds, 0-5 min
        }
        result = np.zeros(16)
        for i, name in enumerate(self.confusion_signal_names):
            if name not in signals:
                continue
            value = float(signals[name])
            if 'eye_tracking' in name:
                # Eye-tracking coordinates arrive on a -1..1 scale; use the
                # magnitude so the feature stays in [0, 1].
                result[i] = min(abs(value), 1.0)
            else:
                # Clip (not just cap) so negative raw inputs cannot leak
                # out-of-range values into the state vector.
                result[i] = float(np.clip(value / scales.get(name, 1.0), 0.0, 1.0))
        return result

    def extract_gesture_signals(self, gestures: Dict) -> np.ndarray:
        """Return a 14-dimensional vector of gesture frequencies in [0, 1].

        Args:
            gestures: Mapping from gesture name (see
                ``gesture_signal_names``) to a count/frequency; missing
                gestures default to 0.

        Returns:
            np.ndarray of shape (14,), each count normalized by a 0-20
            range and clipped to [0, 1].
        """
        result = np.zeros(14)
        for i, name in enumerate(self.gesture_signal_names):
            if name in gestures:
                # Clip so negative counts cannot produce negative features.
                result[i] = float(np.clip(float(gestures[name]) / 20.0, 0.0, 1.0))
        return result

    def extract_state(
        self,
        topic: str,
        progress: float,
        confusion_signals: Dict,
        gesture_signals: Dict,
        time_spent: float
    ) -> np.ndarray:
        """Extract the complete 64-dimensional state vector.

        Args:
            topic: Learning topic string.
            progress: Session progress (clipped to 0.0-1.0).
            confusion_signals: Dict of behavioral signals.
            gesture_signals: Dict of gesture counts.
            time_spent: Time spent in seconds (normalized over 30 min).

        Returns:
            np.ndarray of shape (64,).
        """
        topic_emb = self.extract_topic_embedding(topic)           # 32 dims
        progress_arr = np.array([np.clip(progress, 0.0, 1.0)])    # 1 dim
        confusion_arr = self.extract_confusion_signals(confusion_signals)  # 16 dims
        gesture_arr = self.extract_gesture_signals(gesture_signals)        # 14 dims
        # 1 dim; 1800 s == 30 min full scale, clipped to [0, 1].
        time_arr = np.array([float(np.clip(time_spent / 1800.0, 0.0, 1.0))])
        state = np.concatenate([
            topic_emb,
            progress_arr,
            confusion_arr,
            gesture_arr,
            time_arr
        ])
        # Internal sanity check on the fixed layout documented above.
        assert len(state) == 64, f"State vector should be 64 dims, got {len(state)}"
        return state

    def get_feature_names(self) -> List[str]:
        """Return interpretable names for all 64 features, in vector order."""
        names = [f"topic_{i}" for i in range(32)]
        names.append("progress")
        names.extend(self.confusion_signal_names)
        names.extend(self.gesture_signal_names)
        names.append("time_spent")
        return names
def create_sample_state() -> np.ndarray:
    """Build one representative 64-dimensional state vector for testing."""
    sample_confusion = {
        'mouse_hesitation': 2.5,
        'scroll_reversals': 4,
        'time_on_page': 120,
        'click_frequency': 8,
        'back_button': 2,
    }
    sample_gestures = {
        'pinch': 5,
        'swipe_right': 3,
        'point': 2,
    }
    return FeatureExtractor().extract_state(
        topic="machine learning",
        progress=0.5,
        confusion_signals=sample_confusion,
        gesture_signals=sample_gestures,
        time_spent=300,
    )
if __name__ == "__main__":
    # Smoke test: build a sample state and report basic statistics.
    # (The previously constructed spare FeatureExtractor was unused —
    # create_sample_state builds its own — so it has been removed.)
    state = create_sample_state()
    print(f"State vector shape: {state.shape}")
    print(f"Sum of features: {state.sum():.4f}")
    print(f"Features > 0: {(state > 0).sum()}")