import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import numpy as np
from typing import Optional, Dict, Any, List, Tuple, Union
from dataclasses import dataclass, field
from enum import Enum
import logging

logger = logging.getLogger(__name__)


class ReasoningPath(Enum):
    """Available reasoning paths with different compute requirements.

    Members are declared from cheapest to most expensive. Declaration order
    is significant: DynamicReasoningEngine.route maps the router's arg-max
    index onto ``list(ReasoningPath)``, and the relative cost vector used for
    the auxiliary loss in DynamicReasoningEngine.forward is written in this
    same order.
    """
    FAST = "fast"  # <100ms - cached/simple responses
    STANDARD = "standard"  # 1-5s - normal forward pass
    EXPERT = "expert"  # expert MoE path (activates experts); default latency estimate 3-10s
    DEEP = "deep"  # 10-60s - chain-of-thought
    ULTRA_DEEP = "ultra_deep"  # minutes - recursive reasoning


@dataclass
class ComplexityFeatures:
    """Features used for complexity scoring.

    Instances are produced by ComplexityScorer.extract_features. The first
    nine fields (token_length .. user_preference_score) are the ones that
    ComplexityScorer.forward and RouterNetwork.forward scale and pack into
    their scalar feature vector.
    """
    # Number of tokens in the input (``len(tokens)`` at extraction time).
    token_length: int
    # Entropy value attached to the token sequence (divided by 10 when vectorised).
    token_entropy: float
    # True when the text contains one of the math markers checked at extraction.
    has_math: bool
    # True when the text contains one of the code markers checked at extraction.
    has_code: bool
    # Count of distinct capitalised words found in the text (simplified NER).
    named_entities_count: int
    # Heuristic built from the number of '.'- and ','-separated segments.
    syntactic_depth: float
    # extract_features always sets this to 0; presumably filled in by a
    # conversation manager outside this file - TODO confirm.
    conversation_depth: int
    # Divided by 5 when vectorised; never updated anywhere in this file.
    prior_failures: int = 0
    # Used as-is (unscaled) in the feature vector.
    user_preference_score: float = 0.5
    # Not read anywhere in this file; the engine derives its own 'use_moe'
    # flag from the selected path instead.
    use_moe: bool = False  # Whether to use MoE for this path
    # Free-form per-domain scores; not read anywhere in this file.
    domain_signals: Dict[str, float] = field(default_factory=dict)


@dataclass
class RoutingDecision:
    """Routing decision output returned by DynamicReasoningEngine.route."""
    # Reasoning path the engine should execute.
    path: ReasoningPath
    # Router confidence as a Python float (1.0 when the path was overridden).
    confidence: float
    # Blended complexity score, averaged to a single Python float.
    complexity_score: float
    # Latency estimate in milliseconds for the chosen path (or the
    # probability-weighted estimate when soft routing is requested).
    estimated_latency_ms: float
    # Extra diagnostics; route() stores keys such as 'override', 'probs',
    # 'soft_routing' and 'features' here depending on the branch taken.
    debug_info: Dict[str, Any] = field(default_factory=dict)


class ComplexityScorer(nn.Module):
    """Neural network for scoring input complexity"""
    
    def __init__(self, feature_dim: int = 128, hidden_dim: int = 256):
        super().__init__()
        
        # Feature extractors
        self.text_encoder = nn.Sequential(
            nn.Linear(feature_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, hidden_dim // 2)
        )
        
        # Domain-specific encoders
        self.math_encoder = nn.Linear(32, hidden_dim // 4)
        self.code_encoder = nn.Linear(32, hidden_dim // 4)
        
        # Complexity predictor
        self.complexity_head = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )
        
        # Feature statistics
        self.register_buffer('feature_mean', torch.zeros(feature_dim))
        self.register_buffer('feature_std', torch.ones(feature_dim))
        
    def extract_features(self, text: str, tokens: torch.Tensor) -> ComplexityFeatures:
        """Extract complexity features from input"""
        # Token statistics
        token_length = len(tokens)
        
        # Calculate token entropy
        token_probs = torch.softmax(torch.randn(len(tokens)), dim=-1)  # Placeholder
        token_entropy = -torch.sum(token_probs * torch.log(token_probs + 1e-10)).item()
        
        # Domain detection
        has_math = any(symbol in text for symbol in ['=', '∫', '∑', '∂', 'sqrt', 'log'])
        has_code = any(keyword in text for keyword in ['def', 'class', 'function', '{', '}', '()', '[]'])
        
        # Named entities (simplified)
        import re
        capitals = re.findall(r'\b[A-Z][a-z]+\b', text)
        named_entities_count = len(set(capitals))
        
        # Syntactic complexity (simplified - could use actual parser)
        syntactic_depth = len(text.split('.')) * np.log(1 + len(text.split(',')))
        
        return ComplexityFeatures(
            token_length=token_length,
            token_entropy=token_entropy,
            has_math=has_math,
            has_code=has_code,
            named_entities_count=named_entities_count,
            syntactic_depth=syntactic_depth,
            conversation_depth=0  # Set by conversation manager
        )
    
    def forward(self, features: ComplexityFeatures) -> torch.Tensor:
        """Compute complexity score from features"""
        # Create feature vector
        dtype = next(self.parameters()).dtype
        device = next(self.parameters()).device
        feature_vec = torch.tensor([
            features.token_length / 1000.0,  # Normalize
            features.token_entropy / 10.0,
            float(features.has_math),
            float(features.has_code),
            features.named_entities_count / 20.0,
            features.syntactic_depth / 100.0,
            features.conversation_depth / 10.0,
            features.prior_failures / 5.0,
            features.user_preference_score
        ], dtype=dtype, device=device).unsqueeze(0)
        
        # Pad to feature_dim
        if feature_vec.shape[1] < self.feature_mean.shape[0]:
            padding = torch.zeros((1, self.feature_mean.shape[0] - feature_vec.shape[1]), dtype=dtype, device=device)
            feature_vec = torch.cat([feature_vec, padding], dim=1)
        
        # Normalize features
        feature_vec = (feature_vec - self.feature_mean.to(dtype=dtype, device=device)) / (self.feature_std.to(dtype=dtype, device=device) + 1e-8)
        
        # Encode features
        text_features = self.text_encoder(feature_vec)
        
        # Add domain-specific features if present
        if features.has_math:
            math_features = self.math_encoder(torch.randn(1, 32, dtype=dtype, device=device))  # Placeholder
            text_features = torch.cat([text_features, math_features], dim=-1)
        
        if features.has_code:
            code_features = self.code_encoder(torch.randn(1, 32, dtype=dtype, device=device))  # Placeholder
            text_features = torch.cat([text_features, code_features], dim=-1)
        
        # Pad if necessary
        if text_features.shape[1] < 256:
            padding = torch.zeros((1, 256 - text_features.shape[1]), dtype=dtype, device=device)
            text_features = torch.cat([text_features, padding], dim=1)
        
        # Predict complexity
        complexity_score = self.complexity_head(text_features)
        
        return complexity_score.squeeze()


class RouterNetwork(nn.Module):
    """Neural router for path selection"""
    
    def __init__(self, hidden_dim: int = 4096, router_hidden: int = 1024, n_paths: int = 4):
        super().__init__()
        
        self.n_paths = n_paths
        
        # Router MLP
        self.router = nn.Sequential(
            nn.Linear(hidden_dim + 9, router_hidden),  # +9 for complexity features
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(router_hidden, router_hidden // 2),
            nn.ReLU(),
            nn.Linear(router_hidden // 2, n_paths)
        )
        
        # Confidence predictor
        self.confidence = nn.Sequential(
            nn.Linear(hidden_dim + n_paths, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
            nn.Sigmoid()
        )
        
    def forward(self, hidden_states: torch.Tensor, complexity_features: ComplexityFeatures) -> Tuple[torch.Tensor, torch.Tensor]:
        """Route to appropriate path based on input"""
        batch_size = hidden_states.shape[0]
        
        # Pool hidden states
        pooled = hidden_states.mean(dim=1)  # [batch, hidden_dim]
        
        # Create feature vector
        dtype = hidden_states.dtype
        device = hidden_states.device
        feature_vec = torch.tensor([
            complexity_features.token_length / 1000.0,
            complexity_features.token_entropy / 10.0,
            float(complexity_features.has_math),
            float(complexity_features.has_code),
            complexity_features.named_entities_count / 20.0,
            complexity_features.syntactic_depth / 100.0,
            complexity_features.conversation_depth / 10.0,
            complexity_features.prior_failures / 5.0,
            complexity_features.user_preference_score
        ], dtype=dtype, device=device).unsqueeze(0).repeat(batch_size, 1)
        
        # Concatenate features
        router_input = torch.cat([pooled, feature_vec], dim=-1)
        
        # Get routing probabilities
        logits = self.router(router_input)
        probs = F.softmax(logits, dim=-1)
        
        # Predict confidence
        conf_input = torch.cat([pooled, probs], dim=-1)
        confidence = self.confidence(conf_input).squeeze(-1)
        
        return probs, confidence


class DynamicReasoningEngine(nn.Module):
    """Main DRE orchestrator for adaptive inference"""
    
    def __init__(
        self,
        base_model: nn.Module,
        config: Dict[str, Any],
        fast_model: Optional[nn.Module] = None,
        enable_caching: bool = True
    ):
        super().__init__()
        
        self.base_model = base_model
        self.fast_model = fast_model or self._create_distilled_model()
        self.config = config
        
        # Components
        self.complexity_scorer = ComplexityScorer()
        self.router = RouterNetwork(
            hidden_dim=config.get('hidden_dim', 4096),
            n_paths=len(ReasoningPath)
        )
        # Hidden-state based complexity head to avoid placeholder randomness and to vary per-input
        self.hidden_complexity_head = nn.Sequential(
            nn.Linear(config.get('hidden_dim', 4096), 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, 1),
            nn.Sigmoid(),
        )
        
        # Caching
        self.enable_caching = enable_caching
        self.cache = {} if enable_caching else None
        self.cache_hits = 0
        self.cache_misses = 0
        
        # Thresholds for routing (can be learned)
        # Reordered so EXPERT sits between STANDARD and DEEP for better MoE engagement
        self.complexity_thresholds = {
            ReasoningPath.FAST: 0.2,
            ReasoningPath.STANDARD: 0.35,
            ReasoningPath.EXPERT: 0.5,      # MoE experts - moderate complexity
            ReasoningPath.DEEP: 0.75,       # Chain-of-thought - high complexity
            ReasoningPath.ULTRA_DEEP: 0.9   # Recursive reasoning - very high complexity
        }
        
        # Latency tracking
        self.latency_history = {path: [] for path in ReasoningPath}
        
        # DRE metrics tracking
        self.activation_counts = {path: 0 for path in ReasoningPath}
        self.total_activations = 0
        self.complexity_scores = []
        self.confidence_scores = []
        self.reasoning_steps = []
    
    def _create_distilled_model(self):
        """Create a smaller distilled version of the base model"""
        # Placeholder - in practice, load a pre-distilled model
        return nn.Sequential(
            nn.Linear(self.base_model.config.n_embd, 512),
            nn.ReLU(),
            nn.Linear(512, self.base_model.config.vocab_size)
        )
    
    def _check_cache(self, input_hash: str) -> Optional[torch.Tensor]:
        """Check if response is cached"""
        if not self.enable_caching:
            return None
            
        if input_hash in self.cache:
            self.cache_hits += 1
            logger.info(f"Cache hit! Hits: {self.cache_hits}, Misses: {self.cache_misses}")
            return self.cache[input_hash]
        
        self.cache_misses += 1
        return None
    
    def _fast_inference(self, input_ids: torch.Tensor, **kwargs) -> torch.Tensor:
        """Fast path: cached or distilled model inference"""
        # Check cache first
        input_hash = hash(input_ids.cpu().numpy().tobytes())
        cached = self._check_cache(str(input_hash))
        if cached is not None:
            return cached
        
        # Use distilled model
        if self.fast_model is not None:
            with torch.no_grad():
                embeddings = self.base_model.embed_tokens(input_ids)
                pooled = embeddings.mean(dim=1)
                output = self.fast_model(pooled)
                
                # Cache result
                if self.enable_caching:
                    self.cache[str(input_hash)] = output
                    
                return output
        
        return None
    
    def _standard_inference(self, input_ids: torch.Tensor, **kwargs) -> Dict[str, torch.Tensor]:
        """Standard path: normal forward pass"""
        return self.base_model(input_ids, **kwargs)
    
    def _deep_inference(
        self, 
        input_ids: torch.Tensor,
        max_steps: int = 10,
        **kwargs
    ) -> Dict[str, torch.Tensor]:
        """Deep path: chain-of-thought reasoning"""
        outputs = []
        current_input = input_ids
        
        for step in range(max_steps):
            # Generate reasoning step
            step_output = self.base_model(current_input, **kwargs)
            outputs.append(step_output)
            
            # Check if reasoning is complete (simplified)
            if self._is_reasoning_complete(step_output):
                break
            
            # Prepare next input (would include generated tokens in practice)
            current_input = input_ids  # Placeholder
        
        # Aggregate outputs
        final_output = self._aggregate_reasoning_steps(outputs)
        return final_output
    
    def _ultra_deep_inference(
        self,
        input_ids: torch.Tensor,
        max_depth: int = 5,
        **kwargs
    ) -> Dict[str, torch.Tensor]:
        """Ultra-deep path: recursive reasoning with self-reflection"""
        def recursive_reason(input_ids, depth):
            if depth == 0:
                return self._standard_inference(input_ids, **kwargs)
            
            # Generate initial response
            response = self._deep_inference(input_ids, **kwargs)
            
            # Self-critique (placeholder)
            critique = self._generate_critique(response)
            
            # Refine based on critique
            refined = recursive_reason(input_ids, depth - 1)
            
            return self._merge_responses(response, refined)
        
        return recursive_reason(input_ids, max_depth)
    
    def _is_reasoning_complete(self, output: Dict[str, torch.Tensor]) -> bool:
        """Check if reasoning chain is complete"""
        # Simplified - check for end token or confidence threshold
        logits = output.get('logits', None)
        if logits is not None:
            probs = F.softmax(logits[:, -1, :], dim=-1)
            max_prob = probs.max().item()
            return max_prob > 0.95  # High confidence
        return False
    
    def _aggregate_reasoning_steps(self, outputs: List[Dict]) -> Dict[str, torch.Tensor]:
        """Aggregate multiple reasoning steps"""
        # Simple averaging (can be more sophisticated)
        aggregated = {}
        for key in outputs[0].keys():
            if isinstance(outputs[0][key], torch.Tensor):
                stacked = torch.stack([o[key] for o in outputs])
                aggregated[key] = stacked.mean(dim=0)
            else:
                aggregated[key] = outputs[-1][key]  # Take last
        return aggregated
    
    def _generate_critique(self, response: Dict[str, torch.Tensor]) -> torch.Tensor:
        """Generate self-critique of response"""
        # Placeholder - would use a critique model
        return torch.randn_like(response['logits'])
    
    def _merge_responses(self, response1: Dict, response2: Dict) -> Dict[str, torch.Tensor]:
        """Merge two responses"""
        merged = {}
        for key in response1.keys():
            if isinstance(response1[key], torch.Tensor):
                # Weighted average
                merged[key] = 0.6 * response1[key] + 0.4 * response2[key]
            else:
                merged[key] = response1[key]
        return merged
    
    def route(
        self,
        input_ids: torch.Tensor,
        text: str = "",
        use_soft_routing: bool = False,
        override_path: Optional[ReasoningPath] = None
    ) -> RoutingDecision:
        """Decide which reasoning path to use"""
        
        # Extract features
        features = self.complexity_scorer.extract_features(text, input_ids[0])
        
        # Get complexity score - combine hidden-state signal with features for better variation
        # Use base embeddings as input signal but DETACH to avoid training the base model from DRE aux loss
        embeddings = self.base_model.embed_tokens(input_ids).detach()
        pooled = embeddings.mean(dim=1)  # [batch, hidden_dim]
        complexity_hidden = self.hidden_complexity_head(pooled).squeeze(-1)  # [batch]
        complexity_features = self.complexity_scorer(features).squeeze()
        # Blend signals; if batch, average feature score across batch for stability
        if isinstance(complexity_features, torch.Tensor) and complexity_features.dim() == 0:
            complexity_features_tensor = complexity_features
        else:
            # Coerce to tensor on the right device/dtype
            complexity_features_tensor = torch.as_tensor(complexity_features, dtype=complexity_hidden.dtype, device=complexity_hidden.device)
        complexity_score_tensor = 0.7 * complexity_hidden + 0.3 * complexity_features_tensor
        complexity_score = float(complexity_score_tensor.mean().detach().cpu().item())
        
        # Get router prediction (allow grads for router so it can learn via aux loss)
        probs, confidence = self.router(embeddings, features)
        
        # Override if specified
        if override_path:
            return RoutingDecision(
                path=override_path,
                confidence=1.0,
                complexity_score=complexity_score,
                estimated_latency_ms=self._estimate_latency(override_path),
                debug_info={'override': True}
            )
        
        # Soft routing: combine outputs from multiple paths
        if use_soft_routing:
            # Return probabilities for weighted combination
            probs_np = probs.detach().to(torch.float32).cpu().numpy()
            return RoutingDecision(
                path=ReasoningPath.STANDARD,  # Default
                confidence=confidence.item(),
                complexity_score=complexity_score,
                estimated_latency_ms=self._estimate_latency_weighted(probs),
                debug_info={'probs': probs_np, 'soft_routing': True}
            )
        
        # Hard routing: select single path
        path_idx = probs.argmax(dim=-1).item()
        selected_path = list(ReasoningPath)[path_idx]
        
        # Apply complexity threshold override only when NOT training
        # During training, allow the router to learn the mapping; rely on thresholds at inference time
        if not self.training:
            if complexity_score < self.complexity_thresholds[ReasoningPath.FAST]:
                selected_path = ReasoningPath.FAST
            elif complexity_score < self.complexity_thresholds[ReasoningPath.STANDARD]:
                selected_path = ReasoningPath.STANDARD
            elif complexity_score < self.complexity_thresholds[ReasoningPath.DEEP]:
                selected_path = ReasoningPath.DEEP
            elif complexity_score >= self.complexity_thresholds[ReasoningPath.ULTRA_DEEP]:
                selected_path = ReasoningPath.ULTRA_DEEP
        
        # Stash tensors for aux loss computation during forward()
        self._last_router_tensors = {
            'probs': probs,  # [batch, n_paths]
            'confidence': confidence,  # [batch]
            'complexity': complexity_score_tensor,  # [batch]
        }
        probs_np = probs.detach().to(torch.float32).cpu().numpy()
        return RoutingDecision(
            path=selected_path,
            confidence=confidence.item(),
            complexity_score=complexity_score,
            estimated_latency_ms=self._estimate_latency(selected_path),
            debug_info={
                'probs': probs_np,
                'features': features.__dict__
            }
        )
    
    def _estimate_latency(self, path: ReasoningPath) -> float:
        """Estimate latency for a given path"""
        latency_ranges = {
            ReasoningPath.FAST: (10, 100),
            ReasoningPath.STANDARD: (1000, 5000),
            ReasoningPath.EXPERT: (3000, 10000),  # MoE experts - slower than standard, faster than deep
            ReasoningPath.DEEP: (10000, 60000),
            ReasoningPath.ULTRA_DEEP: (60000, 300000)
        }
        
        if self.latency_history[path]:
            # Use historical average
            return np.mean(self.latency_history[path][-10:])
        
        # Use midpoint of range
        min_lat, max_lat = latency_ranges[path]
        return (min_lat + max_lat) / 2
    
    def _estimate_latency_weighted(self, probs: torch.Tensor) -> float:
        """Estimate weighted latency for soft routing"""
        latencies = [self._estimate_latency(path) for path in ReasoningPath]
        weighted_latency = sum(p * l for p, l in zip(probs[0].detach().to(torch.float32).cpu().numpy(), latencies))
        return weighted_latency
    
    def get_current_metrics(self) -> Dict[str, Any]:
        """Get current DRE metrics for logging"""
        if self.total_activations == 0:
            return {
                'activation_rate': 0.0,
                'avg_complexity': 0.0,
                'avg_confidence': 0.0,
                'avg_reasoning_steps': 0.0,
                'path_distribution': {path.value: 0.0 for path in ReasoningPath}
            }
        
        # Calculate activation rates per path
        path_distribution = {
            path.value: self.activation_counts[path] / self.total_activations * 100
            for path in ReasoningPath
        }
        
        # Calculate averages
        avg_complexity = float(np.mean(self.complexity_scores[-100:])) if self.complexity_scores else 0.0
        avg_confidence = float(np.mean(self.confidence_scores[-100:])) if self.confidence_scores else 0.0
        avg_reasoning_steps = float(np.mean(self.reasoning_steps[-50:])) if self.reasoning_steps else 0.0
        
        # Cache efficiency
        cache_hit_rate = 0.0
        if self.enable_caching and (self.cache_hits + self.cache_misses) > 0:
            cache_hit_rate = self.cache_hits / (self.cache_hits + self.cache_misses) * 100
        
        return {
            'activation_rate': self.total_activations,
            'avg_complexity': avg_complexity,
            'avg_confidence': avg_confidence,
            'avg_reasoning_steps': avg_reasoning_steps,
            'path_distribution': path_distribution,
            'cache_hit_rate': cache_hit_rate,
            'total_cache_hits': self.cache_hits,
            'total_cache_misses': self.cache_misses
        }
    
    def forward(
        self,
        input_ids: torch.Tensor,
        text: str = "",
        override_path: Optional[ReasoningPath] = None,
        **kwargs
    ) -> Dict[str, Any]:
        """Main forward pass with dynamic routing"""
        
        # Route to appropriate path
        routing_decision = self.route(input_ids, text, override_path=override_path)
        
        # Track timing
        start_time = time.time()
        
        # Execute selected path
        if routing_decision.path == ReasoningPath.FAST:
            # During training with labels, run STANDARD inference to get valid loss/hidden_states
            train_needs_loss = self.training and (kwargs.get('labels', None) is not None)
            if train_needs_loss:
                output = self._standard_inference(input_ids, **kwargs)
            else:
                output = self._fast_inference(input_ids, **kwargs)
                # Convert to standard format if needed
                if not isinstance(output, dict):
                    output = {'logits': output}
                
        elif routing_decision.path == ReasoningPath.STANDARD:
            output = self._standard_inference(input_ids, **kwargs)
            
        elif routing_decision.path == ReasoningPath.EXPERT:
            # Expert path shares the same base forward; UltraThinkCore will apply MoE based on routing_info['use_moe']
            output = self._standard_inference(input_ids, **kwargs)
            
        elif routing_decision.path == ReasoningPath.DEEP:
            output = self._deep_inference(input_ids, **kwargs)
            
        elif routing_decision.path == ReasoningPath.ULTRA_DEEP:
            output = self._ultra_deep_inference(input_ids, **kwargs)
            
        else:
            raise ValueError(f"Unknown reasoning path: {routing_decision.path}")
        
        # Track latency
        latency_ms = (time.time() - start_time) * 1000
        self.latency_history[routing_decision.path].append(latency_ms)
        
        # Update DRE metrics
        self.activation_counts[routing_decision.path] += 1
        self.total_activations += 1
        self.complexity_scores.append(routing_decision.complexity_score)
        self.confidence_scores.append(routing_decision.confidence)
        
        # Compute a small auxiliary loss to train the router (balance + latency + confidence)
        dre_aux_loss = None
        try:
            if self.training and hasattr(self, '_last_router_tensors'):
                probs = self._last_router_tensors['probs']  # [batch, n_paths]
                confidence = self._last_router_tensors['confidence']  # [batch]
                # Encourage balanced usage across paths (Switch-Transformer style)
                target_uniform = torch.full_like(probs[0], 1.0 / probs.shape[-1])
                balance_loss = (probs.mean(dim=0) - target_uniform).pow(2).mean()
                # Penalize expected latency (prefer cheaper paths unless LM loss demands otherwise)
                # Relative costs for FAST, STANDARD, EXPERT, DEEP, ULTRA_DEEP
                path_costs = torch.tensor([0.1, 1.0, 1.5, 2.5, 4.0], dtype=probs.dtype, device=probs.device)
                expected_cost = (probs * path_costs).sum(dim=-1).mean()
                # Encourage higher confidence
                conf_loss = -torch.log(confidence.clamp_min(1e-6)).mean()
                dre_aux_loss = balance_loss + 0.1 * expected_cost + 0.01 * conf_loss
        except Exception:
            dre_aux_loss = None
        
        # Track reasoning steps for deep paths
        if routing_decision.path in [ReasoningPath.DEEP, ReasoningPath.ULTRA_DEEP]:
            steps = routing_decision.debug_info.get('reasoning_steps', 1)
            self.reasoning_steps.append(steps)
        
        # Add routing info to output
        output['routing_info'] = {
            'path': routing_decision.path.value,
            'complexity_score': routing_decision.complexity_score,
            'confidence': routing_decision.confidence,
            'latency_ms': latency_ms,
            'debug': routing_decision.debug_info,
            'dre_metrics': self.get_current_metrics(),
            'use_moe': (routing_decision.path == ReasoningPath.EXPERT)
        }
        # Expose aux loss to the trainer for joint optimization
        if dre_aux_loss is not None:
            output['dre_aux_loss'] = dre_aux_loss
        
        # Avoid issues with torch.compile/torch._dynamo tracing Python f-strings and time
        try:
            is_compiling = getattr(torch._dynamo, 'is_compiling', lambda: False)()
        except Exception:
            is_compiling = False
        if not is_compiling:
            # Use logger parameter interpolation to avoid formatting issues
            logger.info("DRE: Path=%s, Complexity=%.3f, Latency=%.1fms",
                        routing_decision.path.value,
                        float(routing_decision.complexity_score),
                        float(latency_ms))

        return output