File size: 7,558 Bytes
357af64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
"""
Feature Extraction Module for ContextFlow RL Model

This module extracts the 64-dimensional state vector used by the RL model
for doubt prediction.

State Vector Structure (64 features):
β”œβ”€β”€ Topic Embedding (32 dims)     - TF-IDF of learning topic
β”œβ”€β”€ Progress (1 dim)              - Session progress 0.0-1.0
β”œβ”€β”€ Confusion Signals (16 dims)    - Behavioral indicators
β”œβ”€β”€ Gesture Signals (14 dims)      - Hand gesture frequencies
└── Time Spent (1 dim)            - Normalized session time
"""

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import Dict, List, Optional


class FeatureExtractor:
    """Extract the 64-dimensional state vector consumed by the RL model.

    Layout of the state vector (64 features total):
        [0:32]   topic embedding (TF-IDF of the learning topic)
        [32]     session progress, clipped to 0.0-1.0
        [33:49]  confusion signals (16 behavioral indicators)
        [49:63]  gesture signals (14 hand-gesture frequencies)
        [63]     time spent, normalized against a 30-minute session
    """

    # Per-signal normalization divisors for confusion signals; signals not
    # listed here are assumed to already be on a 0-1 scale.
    _CONFUSION_SCALES = {
        'mouse_hesitation': 5.0,   # raw scale 0-5
        'scroll_reversals': 10.0,  # raw scale 0-10
        'time_on_page': 300.0,     # raw scale 0-300 s (5 min)
    }

    def __init__(self):
        # Total dimensionality of the emitted state vector.
        self.state_dim = 64

        # TF-IDF vectorizer for topics (32 dimensions), fit once on a fixed
        # vocabulary of common learning topics at construction time.
        self.topic_vectorizer = TfidfVectorizer(max_features=32)
        self._fit_topic_vectorizer()

        # Signal names for interpretability; list order defines the position
        # of each signal inside the state vector.
        self.confusion_signal_names = [
            'mouse_hesitation', 'scroll_reversals', 'time_on_page',
            'eye_tracking_x', 'eye_tracking_y', 'page_scrolling',
            'click_frequency', 'back_button', 'tab_switches',
            'copy_attempts', 'zoom_level', 'scroll_speed',
            'reading_pauses', 'search_usage', 'bookmark_usage', 'print_usage'
        ]

        self.gesture_signal_names = [
            'pinch', 'swipe_up', 'swipe_down', 'swipe_left', 'swipe_right',
            'two_finger_swipe', 'point', 'wave', 'thumbs_up', 'thumbs_down',
            'fist', 'open_palm', 'rotation', 'zoom_gesture'
        ]

    def _fit_topic_vectorizer(self) -> None:
        """Fit the TF-IDF vectorizer on a fixed list of common learning topics."""
        topics = [
            'machine learning', 'deep learning', 'neural networks',
            'python programming', 'data science', 'statistics',
            'linear algebra', 'calculus', 'probability',
            'natural language processing', 'computer vision',
            'reinforcement learning', 'supervised learning', 'unsupervised learning',
            'classification', 'regression', 'clustering',
            'backpropagation', 'gradient descent', 'optimization',
            'transformers', 'attention mechanism', 'bert', 'gpt',
            'cnn', 'rnn', 'lstm', 'gru',
            'overfitting', 'underfitting', 'regularization',
            'cross validation', 'hyperparameters', 'training'
        ]
        self.topic_vectorizer.fit(topics)

    def extract_topic_embedding(self, topic: str) -> np.ndarray:
        """Extract a 32-dimensional TF-IDF embedding for *topic*.

        Topics outside the fitted vocabulary produce an all-zero vector.
        The result is padded/truncated to exactly 32 dimensions in case the
        fitted vocabulary has fewer than 32 terms.
        """
        topic_vec = self.topic_vectorizer.transform([topic.lower()]).toarray()[0]

        # Pad with zeros if the fitted vocabulary yielded fewer than 32 terms.
        if len(topic_vec) < 32:
            topic_vec = np.pad(topic_vec, (0, 32 - len(topic_vec)))

        return topic_vec[:32]

    def extract_confusion_signals(self, signals: Dict) -> np.ndarray:
        """Extract the 16-dimensional confusion-signal vector.

        Args:
            signals: Dict with keys like 'mouse_hesitation', 'scroll_reversals',
                etc. (see ``self.confusion_signal_names``). Missing keys
                contribute 0.0.

        Returns:
            Normalized confusion signals, each clipped to 0.0-1.0.
        """
        result = np.zeros(16)

        for i, name in enumerate(self.confusion_signal_names):
            if name not in signals:
                continue
            value = float(signals[name])
            if 'eye_tracking' in name:
                # Eye-tracking coordinates are on a -1..1 scale; use the
                # magnitude so the feature stays in 0..1.
                result[i] = min(abs(value), 1.0)
            else:
                # np.clip (not just min) so negative raw values cannot leak
                # through and violate the documented 0.0-1.0 range.
                scale = self._CONFUSION_SCALES.get(name, 1.0)
                result[i] = np.clip(value / scale, 0.0, 1.0)

        return result

    def extract_gesture_signals(self, gestures: Dict) -> np.ndarray:
        """Extract the 14-dimensional gesture-signal vector.

        Args:
            gestures: Dict mapping gesture names (see
                ``self.gesture_signal_names``) to counts or frequencies.
                Missing keys contribute 0.0.

        Returns:
            Normalized gesture signals, each clipped to 0.0-1.0.
        """
        result = np.zeros(14)

        for i, name in enumerate(self.gesture_signal_names):
            if name in gestures:
                # Counts are normalized against a 0-20 range and clipped so
                # negative inputs cannot produce out-of-range features.
                result[i] = np.clip(float(gestures[name]) / 20.0, 0.0, 1.0)

        return result

    def extract_state(
        self,
        topic: str,
        progress: float,
        confusion_signals: Dict,
        gesture_signals: Dict,
        time_spent: float
    ) -> np.ndarray:
        """Extract the complete 64-dimensional state vector.

        Args:
            topic: Learning topic string.
            progress: Session progress (0.0-1.0; clipped if outside).
            confusion_signals: Dict of behavioral signals.
            gesture_signals: Dict of gesture counts.
            time_spent: Time spent in seconds (normalized against 30 min).

        Returns:
            64-dimensional state vector with all features in 0.0-1.0.
        """
        # Topic embedding: 32 dims
        topic_emb = self.extract_topic_embedding(topic)

        # Progress: 1 dim
        progress_arr = np.array([np.clip(progress, 0.0, 1.0)])

        # Confusion signals: 16 dims
        confusion_arr = self.extract_confusion_signals(confusion_signals)

        # Gesture signals: 14 dims
        gesture_arr = self.extract_gesture_signals(gesture_signals)

        # Time spent: 1 dim, normalized to a 0-30 min window and clipped so
        # negative timestamps cannot produce out-of-range features.
        time_arr = np.array([np.clip(time_spent / 1800.0, 0.0, 1.0)])

        # Concatenate all features in the documented layout order.
        state = np.concatenate([
            topic_emb,      # 32 dims
            progress_arr,   # 1 dim
            confusion_arr,  # 16 dims
            gesture_arr,    # 14 dims
            time_arr        # 1 dim
        ])

        # Internal invariant; the component sizes above must total 64.
        assert len(state) == 64, f"State vector should be 64 dims, got {len(state)}"

        return state

    def get_feature_names(self) -> List[str]:
        """Return the 64 interpretable feature names, in state-vector order."""
        names = []

        # Topic features (TF-IDF components are not individually meaningful).
        for i in range(32):
            names.append(f"topic_{i}")

        names.append("progress")

        # Confusion signals
        names.extend(self.confusion_signal_names)

        # Gesture signals
        names.extend(self.gesture_signal_names)

        names.append("time_spent")

        return names


def create_sample_state() -> np.ndarray:
    """Build a representative 64-dim state vector for testing/demo purposes."""
    sample_confusion = {
        'mouse_hesitation': 2.5,
        'scroll_reversals': 4,
        'time_on_page': 120,
        'click_frequency': 8,
        'back_button': 2,
    }
    sample_gestures = {
        'pinch': 5,
        'swipe_right': 3,
        'point': 2,
    }

    extractor = FeatureExtractor()
    return extractor.extract_state(
        topic="machine learning",
        progress=0.5,
        confusion_signals=sample_confusion,
        gesture_signals=sample_gestures,
        time_spent=300,
    )


if __name__ == "__main__":
    # Smoke test: build a sample state and report basic statistics.
    # (The previous version also constructed an unused FeatureExtractor here,
    # needlessly fitting a second TF-IDF vectorizer; removed.)
    state = create_sample_state()

    print(f"State vector shape: {state.shape}")
    print(f"Sum of features: {state.sum():.4f}")
    print(f"Features > 0: {(state > 0).sum()}")