from state import Model, Parallelism, Training
from dtypes import DType


class MemoryCalculation:
    def __init__(
        self,
        modelconfig: Model,
        parallelismconfig: Parallelism,
        trainingconfig: Training,
    ):
        self.model = modelconfig
        self.parallelism = parallelismconfig
        self.training = trainingconfig

    def calculate_num_parameters(self) -> float:
        # https://michaelwornow.net/2024/01/18/counting-params-in-transformer
        # https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=memory_usage_in_transformers
        # Biases are not added/omitted on a per-model basis for simplicity.
        # Just include them where they could appear. They're small in comparison
        # to weights anyway, and this forms an upper bound.
        h, i, l, v, e = (
            self.model.hidden_dim,
            self.model.intermediate_size,
            self.model.num_layers,
            self.model.vocab_size,
            self.model.total_experts,
        )
        tp, pp, ep = (
            self.parallelism.tensor_parallelism,
            self.parallelism.pipeline_parallelism,
            self.parallelism.expert_parallelism,
        )

        # Embedding layers
        input_embedding = v * h / tp
        unembedding = 0
        if not self.model.weight_tied_embeddings:
            unembedding = h * v / tp

        # Attention
        layer_norm_attn_in = 2 * h  # weight and bias; not tp sharded
        qkv = (3 * h * h + 3 * h) / tp  # column-parallel: weights and biases sharded
        attn_output_proj = h * h / tp + h  # row-parallel: weight sharded, bias replicated
        attn = layer_norm_attn_in + qkv + attn_output_proj

        # MLP
        layer_norm_mlp_in = 2 * h  # weight and bias; not tp sharded
        router = h * e + e  # assuming replicated for simplicity
        mlp_up_proj = (h * i + i) / tp  # column-parallel: weight and bias sharded
        mlp_gate_proj = (h * i + i) / tp  # column-parallel: weight and bias sharded
        mlp_down_proj = i * h / tp + h  # row-parallel: weight sharded, bias replicated
        expert = mlp_up_proj + mlp_gate_proj + mlp_down_proj
        experts = expert * e / ep
        mlp = layer_norm_mlp_in + router + experts

        layer = attn + mlp
        layers = layer * l
        final_layer_norm = 2 * h  # not tp sharded

        # pp and weight tying make it hard to know which stage holds the embedding,
        # so assume the "worst" case: it sits at the end with the final layer norm,
        # even though that contribution is pretty small.
        total_params = 0
        if pp == 1:
            total_params = input_embedding + layers + unembedding + final_layer_norm
        if pp > 1:
            total_params = max(input_embedding, unembedding) + layers / pp + final_layer_norm
        return total_params

    def calculate_activation_parameters(self) -> float:
        # https://blog.eleuther.ai/transformer-math/#activations-and-batch-size
        # https://arxiv.org/abs/2205.05198
        # pp is not considered since most pp schedules run multiple concurrent
        # microbatches to reduce the bubble.
        b, s = self.training.batch_size, self.training.sequence_length
        h, i, l, v, e, ae = (
            self.model.hidden_dim,
            self.model.intermediate_size,
            self.model.num_layers,
            self.model.vocab_size,
            self.model.total_experts,
            self.model.active_experts,
        )
        tp, cp, pp, ep = (
            self.parallelism.tensor_parallelism,
            self.parallelism.context_parallelism,
            self.parallelism.pipeline_parallelism,
            self.parallelism.expert_parallelism,
        )
        sp = tp

        if self.training.gradient_checkpointing:
            # full recomputation
            embed = 0
            layer = s * b * h / cp / tp  # only keep the initial input to each layer
            layers = layer * l
            final_layer_out = s * b * h / cp / sp
            final_norm = s * b * h / cp / sp
            unembed = s * b * v / cp / tp
            logits = s * b * v / cp / sp  # come back to this
            num_params = embed + layers + final_layer_out + final_norm + unembed + logits
            return num_params
        else:
            # assume flash attention, i.e. do selective recomputation
            # assume tensor parallel + sequence parallel as described in https://arxiv.org/abs/2205.05198
            # each variable below is the activation output of that op

            # Attention Block
            layer_in = s * b * h / cp / tp
            attn_norm = s * b * h / cp / sp
            flash = s * b * h / cp / tp  # everything else is recalculated by flash attention
            projection = s * b * h / cp / tp
            attn = layer_in + attn_norm + flash + projection

            # MLP Block
            mlp_norm = s * b * h / cp / sp
            mlp_up = s * b * i / cp / tp
            mlp_gate = s * b * i / cp / tp
            hadamard_swiglu = s * b * i / cp / tp
            mlp_down = s * b * h / cp / tp
            if self.model.is_moe:
                router = s * b * e / cp / sp  # makes sense to sp shard if the mlp_norm output is sp sharded
                expert = mlp_up + mlp_gate + hadamard_swiglu + mlp_down
                experts = expert * ae
                mlp = mlp_norm + router + experts
            else:
                mlp = mlp_norm + mlp_up + mlp_gate + hadamard_swiglu + mlp_down

            layer = attn + mlp
            layers = layer * l  # no decrease from pp because schedules will increase microbatches

            # Other
            embed = 0
            final_layer_out = s * b * h / cp / tp  # both sequence and context parallelism
            final_norm = s * b * h / cp / sp
            unembed = s * b * v / cp / tp
            logits = s * b * v / cp / tp
            num_params = embed + layers + final_layer_out + final_norm + unembed + logits
            return num_params

    def calculate_parameter_memory(self) -> float:
        if self.training.mixed_precision:
            master_copy = self.calculate_num_parameters() * self.training.precision
            working_copy = self.calculate_num_parameters() * self.training.param_dtype
            return master_copy + working_copy
        else:
            return self.calculate_num_parameters() * self.training.precision

    def calculate_gradient_memory(self) -> float:
        # https://blog.eleuther.ai/transformer-math/#gradients
        # one gradient per parameter, assumed stored in fp32 (4 bytes each)
        return self.calculate_num_parameters() * 4

    def calculate_optimizer_memory(self) -> float:
        # https://blog.eleuther.ai/transformer-math/#optimizer-states
        # https://www.determined.ai/blog/act-mem-2
        # https://web.archive.org/web/20250308172134/https://www.determined.ai/blog/act-mem-2
        # Adam keeps 2 states per parameter, assumed to always be fp32
        return 2 * self.calculate_num_parameters() * DType.FP32

    def calculate_activation_memory(self) -> float:
        if self.training.mixed_precision:
            return self.calculate_activation_parameters() * self.training.param_dtype
        else:
            return self.calculate_activation_parameters() * self.training.precision
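

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): how the four calculators are meant to be
# combined into a rough per-device training memory estimate. It assumes the
# Model / Parallelism / Training objects from state.py carry the fields read
# above and that DType members (e.g. DType.FP32) are byte sizes; the helper
# name below is hypothetical, adjust to the real definitions if they differ.
# ---------------------------------------------------------------------------
def estimate_total_training_memory(calc: MemoryCalculation) -> float:
    """Sum parameter, gradient, optimizer-state, and activation memory (bytes)."""
    return (
        calc.calculate_parameter_memory()
        + calc.calculate_gradient_memory()
        + calc.calculate_optimizer_memory()
        + calc.calculate_activation_memory()
    )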