| """Model utility functions.""" |
|
|
| from __future__ import annotations |
|
|
| import math |
| from typing import TYPE_CHECKING |
|
|
| from llm_lab.config import ModelConfig |
|
|
| if TYPE_CHECKING: |
| from .llm_model import LLMModel |
|
|
|
|
def count_parameters_detailed(model: "LLMModel") -> dict:
    """Return a detailed breakdown of the model's parameter count by component.

    Counts the token embedding, the transformer layers (layer 0 is counted
    and scaled by the layer count, which assumes all layers are identical),
    and the final norm. The LM head is assumed to share weights with the
    token embedding ("weight tying"), so it contributes no extra parameters.

    Args:
        model: Model exposing ``token_embedding``, ``layers`` and
            ``final_norm`` attributes.

    Returns:
        Mapping from component name to parameter count, including per-layer
        detail and a ``"total"`` entry.
    """
    breakdown: dict = {}

    # Token embedding table.
    emb_params = model.token_embedding.weight.numel()
    breakdown["token_embedding"] = emb_params
    total = emb_params

    # Per-layer parameters. All layers are assumed to share the same
    # architecture, so count layer 0 once and scale by len(model.layers).
    layer_detail: dict = {}
    layer_total = 0
    if model.layers:  # guard: an empty layer stack would raise IndexError
        for name, param in model.layers[0].named_parameters():
            n = param.numel()
            layer_detail[name] = n
            layer_total += n

    breakdown["per_layer"] = layer_detail
    breakdown["per_layer_total"] = layer_total
    breakdown["all_layers_total"] = layer_total * len(model.layers)
    total += layer_total * len(model.layers)

    # Final normalization layer (weight only).
    norm_params = model.final_norm.weight.numel()
    breakdown["final_norm"] = norm_params
    total += norm_params

    # Output projection reuses the embedding weights, so it adds nothing.
    breakdown["lm_head"] = "weight tying (0 additional)"
    breakdown["total"] = total

    return breakdown
|
|
|
|
def estimate_memory_gb(config: ModelConfig, batch_size: int = 4, dtype_bytes: int = 2) -> dict:
    """Estimate GPU memory usage of the model.

    Args:
        config: Model architecture configuration.
        batch_size: Training batch size used for the activation estimate.
        dtype_bytes: 2 (bf16/fp16) or 4 (fp32)

    Returns:
        Dict with the total parameter count and per-component memory
        estimates in (decimal) gigabytes, rounded to two places.
    """
    d = config.hidden_dim

    # Parameter count: embedding table + N identical layers + final norm.
    embedding_params = config.vocab_size * d
    attn_qkv = d * (config.num_heads + 2 * config.num_kv_heads) * config.head_dim
    attn_out = config.num_heads * config.head_dim * d
    # 3 projections of size d x intermediate — presumably a gated MLP
    # (gate/up/down); confirm against the model definition.
    mlp = 3 * d * config.intermediate_dim
    norms = 2 * d
    params_per_layer = attn_qkv + attn_out + mlp + norms
    total_params = embedding_params + params_per_layer * config.num_layers + d

    gb = 1e9
    model_gb = total_params * dtype_bytes / gb
    optimizer_gb = total_params * 8 / gb  # e.g. Adam: two fp32 moments per param
    gradient_gb = total_params * dtype_bytes / gb

    # Rough activation heuristic; grows only with sqrt(depth) rather than
    # linearly (consistent with activation checkpointing).
    activation_gb = batch_size * config.max_seq_len * d * 4 * math.sqrt(config.num_layers) / gb

    component_sum = model_gb + optimizer_gb + gradient_gb + activation_gb
    return {
        "total_parameters": total_params,
        "model_weights_gb": round(model_gb, 2),
        "optimizer_states_gb": round(optimizer_gb, 2),
        "gradients_gb": round(gradient_gb, 2),
        "activations_estimated_gb": round(activation_gb, 2),
        "total_estimated_gb": round(component_sum, 2),
    }
|
|