"""
Feature engineering for behavioral fingerprinting
Extracts behavioral features from raw events to create employee baselines
"""
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, timezone
from typing import List, Dict, Optional
import models

# Regex alternation (matched case-insensitively) that flags a file path as
# sensitive. Matching is by substring, so e.g. 'key' also matches 'monkey'.
_SENSITIVE_PATH_PATTERN = '|'.join(
    ['secret', 'password', 'credential', 'key', '/etc/', '/root/', 'config']
)


def _nan_to_zero(value) -> float:
    """Return ``value`` as a plain float, mapping NaN/None to 0.0."""
    return 0.0 if pd.isna(value) else float(value)


def _compute_window_features(events, baseline_location, per_day_divisor, per_week_divisor) -> Dict[str, float]:
    """
    Compute the behavioral feature dictionary for one window of events

    Shared implementation behind calculate_behavioral_fingerprint and
    extract_features_from_recent_events so both always produce the same keys
    with the same definitions.

    Args:
        events: Non-empty list of event objects exposing event_type,
            timestamp, location, port, file_path and success attributes
            (cpu_usage / memory_usage are optional)
        baseline_location: The employee's baseline location, or a falsy value
            when none is known
        per_day_divisor: Divisor used for the "per day" rate features
        per_week_divisor: Divisor used for the firewall change rate

    Returns:
        Dictionary of behavioral features
    """
    events_df = pd.DataFrame([{
        'event_type': e.event_type,
        'timestamp': e.timestamp,
        'location': e.location,
        'port': e.port,
        'file_path': e.file_path,
        'success': e.success,
        # Events without resource metrics count as 0.0 usage
        'cpu_usage': getattr(e, 'cpu_usage', 0.0),
        'memory_usage': getattr(e, 'memory_usage', 0.0),
    } for e in events])

    total_events = len(events_df)
    event_types = events_df['event_type']
    hours = events_df['timestamp'].dt.hour
    day_of_week = events_df['timestamp'].dt.dayofweek

    def count_of(event_type: str) -> int:
        # Plain int so the derived rates stay plain Python floats
        return int((event_types == event_type).sum())

    features = {}

    # 1. Login time patterns
    is_login = event_types == 'login'
    login_hours = hours[is_login]
    if len(login_hours) > 0:
        features['avg_login_hour'] = float(login_hours.mean())
        # Sample std is undefined for a single observation
        features['login_hour_std'] = float(login_hours.std()) if len(login_hours) > 1 else 0.0
    else:
        features['avg_login_hour'] = 9.0  # Default 9 AM
        features['login_hour_std'] = 2.0

    # 2. Location patterns
    locations = events_df['location'].dropna()
    features['unique_locations_count'] = int(locations.nunique())
    if baseline_location and len(locations) > 0:
        # Simplified "distance": share of located events away from baseline
        away_from_baseline = (locations != baseline_location).sum()
        features['avg_location_distance'] = float(away_from_baseline / len(locations))
    else:
        features['avg_location_distance'] = 0.0

    # 3. Port usage patterns
    ports = events_df['port'].dropna()
    if len(ports) > 0:
        features['unique_ports_count'] = int(ports.nunique())
        features['avg_port_number'] = float(ports.mean())
    else:
        features['unique_ports_count'] = 0
        features['avg_port_number'] = 0.0

    # 4. File access patterns
    features['file_access_rate'] = count_of('file_access') / per_day_divisor
    file_paths = events_df.loc[event_types == 'file_access', 'file_path']
    if len(file_paths) > 0:
        # na=False: file events without a path are not counted as sensitive
        is_sensitive = file_paths.str.contains(_SENSITIVE_PATH_PATTERN, case=False, na=False)
        features['sensitive_file_access_rate'] = int(is_sensitive.sum()) / per_day_divisor
    else:
        features['sensitive_file_access_rate'] = 0.0

    # 5. Privilege escalation patterns
    features['privilege_escalation_rate'] = count_of('privilege_escalation') / per_day_divisor

    # 6. Firewall changes (per week)
    features['firewall_change_rate'] = count_of('firewall') / per_week_divisor

    # 7. Network activity volume (simplified: count of network events)
    features['network_activity_volume'] = count_of('network') / per_day_divisor

    # 8. Failed login rate. eq(False) rather than negation so that logins
    # with an unknown (None) outcome are not counted as failures.
    failed_logins = int(events_df.loc[is_login, 'success'].eq(False).sum())
    features['failed_login_rate'] = failed_logins / per_day_divisor

    # 9. Time-based patterns: weekday share and night share (22:00 - 06:00)
    features['weekday_activity_ratio'] = int((day_of_week < 5).sum()) / total_events
    features['night_activity_ratio'] = int(((hours >= 22) | (hours < 6)).sum()) / total_events

    # 10. System resource patterns. Values are coerced to numeric and NaN
    # results (no usable values, or a single value for std) become 0.0 so no
    # NaN reaches downstream consumers.
    for column in ('cpu_usage', 'memory_usage'):
        usage = pd.to_numeric(events_df[column], errors='coerce')
        features['avg_' + column] = _nan_to_zero(usage.mean())
        features['std_' + column] = _nan_to_zero(usage.std())

    return features


async def calculate_behavioral_fingerprint(employee_id: str, days_back: int = 30) -> Optional[Dict[str, float]]:
    """
    Calculate behavioral fingerprint for an employee based on historical events

    Args:
        employee_id: Employee ID
        days_back: Number of days to look back for baseline calculation

    Returns:
        Dictionary of behavioral features, the default fingerprint when the
        employee has no events in the window, or None when the employee does
        not exist
    """
    cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_back)

    # Get employee
    employee = await models.Employee.get(employee_id)
    if not employee:
        return None

    # Get events. The cutoff is made naive before querying; presumably event
    # timestamps are stored as naive UTC -- TODO confirm.
    events = await models.BehavioralEvent.find(
        models.BehavioralEvent.employee_id == employee.id,
        models.BehavioralEvent.timestamp >= cutoff_date.replace(tzinfo=None),
    ).to_list()

    if not events:
        # Return default fingerprint for new employees
        return get_default_fingerprint()

    # Rates are per day, except firewall changes which are per week; both
    # divisors are floored at 1 so short windows never inflate the rates.
    return _compute_window_features(
        events,
        employee.baseline_location,
        per_day_divisor=max(days_back, 1),
        per_week_divisor=max(days_back / 7, 1),
    )


def get_default_fingerprint() -> Dict[str, float]:
    """Return default fingerprint for employees with no history"""
    return {
        'avg_login_hour': 9.0,
        'login_hour_std': 2.0,
        'unique_locations_count': 1,
        'avg_location_distance': 0.0,
        'unique_ports_count': 3,
        'avg_port_number': 443.0,
        'file_access_rate': 5.0,
        'sensitive_file_access_rate': 0.1,
        'privilege_escalation_rate': 0.5,
        'firewall_change_rate': 0.0,
        'network_activity_volume': 10.0,
        'failed_login_rate': 0.0,
        'weekday_activity_ratio': 0.8,
        'night_activity_ratio': 0.05,
        'avg_cpu_usage': 10.0,
        'std_cpu_usage': 5.0,
        'avg_memory_usage': 40.0,
        'std_memory_usage': 5.0,
    }


async def extract_features_from_recent_events(employee_id: str, hours_back: int = 24) -> Dict[str, float]:
    """
    Extract features from recent events for real-time anomaly detection

    Produces the same features as calculate_behavioral_fingerprint, computed
    over a shorter time window.

    Args:
        employee_id: Employee ID
        hours_back: Number of hours to look back

    Returns:
        Dictionary of behavioral features; the default fingerprint when the
        employee is unknown or has no events in the window
    """
    cutoff_time = datetime.now(timezone.utc) - timedelta(hours=hours_back)

    # Get employee
    employee = await models.Employee.get(employee_id)
    if not employee:
        # This might happen for new unknown IDs in events, fallback to default
        return get_default_fingerprint()

    events = await models.BehavioralEvent.find(
        models.BehavioralEvent.employee_id == employee.id,
        models.BehavioralEvent.timestamp >= cutoff_time.replace(tzinfo=None),
    ).to_list()

    if not events:
        return get_default_fingerprint()

    # Window length is expressed in days (weeks for firewall changes) and
    # floored at 1, so windows shorter than a day/week yield raw counts.
    return _compute_window_features(
        events,
        employee.baseline_location,
        per_day_divisor=max(hours_back / 24, 1),
        per_week_divisor=max(hours_back / (24 * 7), 1),
    )


def get_feature_names() -> List[str]:
    """
    Return the feature names in the canonical, stable order

    A new list is built on every call, so callers may mutate the result.
    """
    login = ('avg_login_hour', 'login_hour_std')
    location = ('unique_locations_count', 'avg_location_distance')
    ports = ('unique_ports_count', 'avg_port_number')
    rates = (
        'file_access_rate',
        'sensitive_file_access_rate',
        'privilege_escalation_rate',
        'firewall_change_rate',
        'network_activity_volume',
        'failed_login_rate',
    )
    timing = ('weekday_activity_ratio', 'night_activity_ratio')
    resources = (
        'avg_cpu_usage',
        'std_cpu_usage',
        'avg_memory_usage',
        'std_memory_usage',
    )
    # Group order below defines the column order of the feature vector
    return [*login, *location, *ports, *rates, *timing, *resources]


def features_to_array(features: Dict[str, float]) -> np.ndarray:
    """
    Convert a feature dictionary into a single-row numpy array

    Values are laid out in the order given by get_feature_names(); any
    feature missing from the dictionary is filled with 0.0.
    """
    ordered_values = []
    for feature_name in get_feature_names():
        ordered_values.append(features.get(feature_name, 0.0))
    # Shape (1, n_features): one sample row
    row = np.array(ordered_values)
    return row.reshape(1, -1)