docs: translate all Korean comments and docstrings to English

858e8b2 about 1 month ago

5.79 kB

	"""Perplexity (PPL) evaluator."""

	import math
	import time
	from typing import Dict, List

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from torch.utils.data import DataLoader

	from llm_lab.config import EvalConfig


	class PerplexityEvaluator:
	"""Measures Perplexity (PPL).

	What is Perplexity?
	PPL = exp(average cross-entropy loss)

	Intuitive meaning:
	- PPL = 1: Perfect prediction (impossible)
	- PPL = 10: Equivalent to picking from 10 candidates each time
	- PPL = 100: Equivalent to picking from 100 candidates (close to random)
	- PPL = 32000: Random selection from the entire vocab (initial random model)

	Good benchmark for a 1B model (English web text):
	- Trained on 5B tokens: PPL ~30-40
	- Trained on 10B tokens: PPL ~20-30
	- Trained on 20B tokens: PPL ~15-25

	Measurement method:
	- Compute cross-entropy over all tokens in the validation dataset
	- Average per token, then apply exp()
	- Padding tokens are excluded (ignore_index=-100)
	"""

	def __init__(self, config: EvalConfig):
	self.config = config

	@torch.no_grad()
	def evaluate(
	self,
	model: nn.Module,
	dataloader: DataLoader,
	device: torch.device,
	dtype: torch.dtype = torch.bfloat16,
	desc: str = "Evaluation",
	) -> Dict[str, float]:
	"""Measures Perplexity.

	Returns:
	{
	"loss": average cross-entropy loss,
	"perplexity": exp(loss),
	"num_tokens": total number of tokens used for evaluation,
	"num_batches": number of batches used for evaluation,
	}
	"""
	model.eval()

	total_loss = 0.0
	total_tokens = 0
	num_batches = 0

	print(f"\n📊 {desc}")
	start_time = time.time()

	for i, batch in enumerate(dataloader):
	if i >= self.config.max_eval_batches:
	break

	input_ids = batch["input_ids"].to(device)
	targets = batch["targets"].to(device)

	with torch.amp.autocast(device_type="cuda", dtype=dtype, enabled=(dtype != torch.float32)):
	logits, _ = model(input_ids)

	# Per-token cross-entropy (reduction='none')
	# logits: (B, S, V) → (B*S, V)
	# targets: (B, S) → (B*S,)
	loss_per_token = F.cross_entropy(
	logits.view(-1, logits.size(-1)),
	targets.view(-1),
	ignore_index=-100,
	reduction="none",
	)

	# Count only valid tokens that are not -100
	valid_mask = (targets.view(-1) != -100)
	valid_tokens = valid_mask.sum().item()

	total_loss += loss_per_token[valid_mask].sum().item()
	total_tokens += valid_tokens
	num_batches += 1

	if (i + 1) % 20 == 0:
	running_ppl = math.exp(min(total_loss / max(total_tokens, 1), 20))
	print(f" Batch {i+1}/{self.config.max_eval_batches}: running PPL = {running_ppl:.2f}")

	elapsed = time.time() - start_time
	avg_loss = total_loss / max(total_tokens, 1)
	perplexity = math.exp(min(avg_loss, 100)) # prevent overflow

	results = {
	"loss": round(avg_loss, 4),
	"perplexity": round(perplexity, 2),
	"num_tokens": total_tokens,
	"num_batches": num_batches,
	"eval_time_sec": round(elapsed, 1),
	}

	print(f" ────────────────────────────────")
	print(f" Loss: {results['loss']:.4f}")
	print(f" Perplexity: {results['perplexity']:.2f}")
	print(f" Eval tokens: {total_tokens:,}")
	print(f" Elapsed: {elapsed:.1f}s")

	return results

	@torch.no_grad()
	def evaluate_per_position(
	self,
	model: nn.Module,
	dataloader: DataLoader,
	device: torch.device,
	dtype: torch.dtype = torch.bfloat16,
	max_batches: int = 50,
	) -> List[float]:
	"""Measures loss per position within a sequence.

	Learning insight:
	- Positions 0~10: Higher loss (insufficient context)
	- Positions 100+: Loss stabilizes lower (context is leveraged)
	- This pattern demonstrates the Transformer's in-context learning capability
	"""
	model.eval()
	seq_len = None
	position_loss_sum = None
	position_count = None

	for i, batch in enumerate(dataloader):
	if i >= max_batches:
	break

	input_ids = batch["input_ids"].to(device)
	targets = batch["targets"].to(device)
	B, S = targets.shape

	if seq_len is None:
	seq_len = S
	position_loss_sum = torch.zeros(S, device=device)
	position_count = torch.zeros(S, device=device)

	with torch.amp.autocast(device_type="cuda", dtype=dtype, enabled=(dtype != torch.float32)):
	logits, _ = model(input_ids)

	# Per-token loss in shape (B, S)
	loss_per_token = F.cross_entropy(
	logits.view(-1, logits.size(-1)),
	targets.view(-1),
	ignore_index=-100,
	reduction="none",
	).view(B, S)

	valid_mask = (targets != -100).float()
	position_loss_sum += (loss_per_token * valid_mask).sum(dim=0)
	position_count += valid_mask.sum(dim=0)

	# Average loss per position
	position_avg_loss = (position_loss_sum / position_count.clamp(min=1)).cpu().tolist()
	return position_avg_loss