# codsworth-3.8m/codsworth/eval/evaluator.py
# Jaqshanahan's picture
# Initial upload of Codsworth model
# b84d85a verified
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
import logging
from typing import Optional, Callable
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from codsworth.config import CodsworthConfig
from codsworth.model import CodsworthTransformer
from codsworth.utils import get_device, AverageMeter
from codsworth.eval.perplexity import Perplexity
logger = logging.getLogger("codsworth")
class Evaluator:
    """Evaluator for Codsworth model.

    Bundles a model and a tokenizer and offers helpers to:

    * compute loss / perplexity over a dataloader (``evaluate``),
    * generate text from a prompt (``evaluate_generation``),
    * compute perplexity over raw strings (``evaluate_perplexity``),
    * evaluate a second model on the same data (``compare_models``).
    """

    def __init__(
        self,
        model: CodsworthTransformer,
        tokenizer,
        config: Optional[CodsworthConfig] = None,
        device: Optional[torch.device] = None,
    ):
        """Move ``model`` to the target device and put it in eval mode.

        Args:
            model: Model to evaluate.
            tokenizer: Tokenizer object; this class uses its ``encode``,
                ``decode``, ``pad_id`` and ``eos_id`` members.
            config: Optional configuration; stored on the instance but not
                read by any method of this class.
            device: Device to run on. When omitted, ``get_device()`` picks one.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.config = config
        self.device = device or get_device()
        self.model = self.model.to(self.device)
        self.model.eval()
        self.perplexity_metric = Perplexity()

    def evaluate(
        self,
        dataloader: DataLoader,
        max_batches: Optional[int] = None,
    ) -> dict:
        """Run the model over ``dataloader`` and collect loss statistics.

        Each batch must be a mapping with ``"input_ids"`` and ``"labels"``
        tensors, and the model output must be indexable by ``"loss"``.

        Args:
            dataloader: Source of evaluation batches.
            max_batches: If given, stop after this many batches.

        Returns:
            A dict with:

            * ``"loss"``: per-batch loss average from ``AverageMeter``.
            * ``"perplexity"``: average of the per-batch ``exp(loss)`` values.
            * ``"avg_perplexity"``: ``exp`` of the token-weighted mean loss.
            * ``"total_tokens"``: number of label positions that differ from
              ``tokenizer.pad_id``.
        """
        self.model.eval()
        loss_meter = AverageMeter("loss")
        perplexity_meter = AverageMeter("perplexity")
        total_tokens = 0
        total_loss = 0.0

        with torch.no_grad():
            for batch_idx, batch in enumerate(dataloader):
                if max_batches is not None and batch_idx >= max_batches:
                    break

                input_ids = batch["input_ids"].to(self.device)
                labels = batch["labels"].to(self.device)
                outputs = self.model(input_ids=input_ids, labels=labels)
                loss = outputs["loss"]

                # Read the scalar once; every .item() call forces a
                # device-to-host synchronisation.
                loss_value = loss.item()
                loss_meter.update(loss_value)
                perplexity_meter.update(torch.exp(loss).item())

                # NOTE(review): weights assume the model's loss is a mean over
                # exactly the non-pad label positions -- confirm in the model.
                batch_tokens = (labels != self.tokenizer.pad_id).sum().item()
                total_tokens += batch_tokens
                total_loss += loss_value * batch_tokens

        # max(1, ...) keeps an empty dataloader from dividing by zero.
        avg_loss = total_loss / max(1, total_tokens)
        avg_perplexity = torch.exp(torch.tensor(avg_loss)).item()

        results = {
            "loss": loss_meter.avg,
            "perplexity": perplexity_meter.avg,
            "avg_perplexity": avg_perplexity,
            "total_tokens": total_tokens,
        }
        logger.info("Evaluation Results:")
        logger.info(" Loss: %.4f", results["loss"])
        logger.info(" Perplexity: %.4f", results["perplexity"])
        return results

    def evaluate_generation(
        self,
        prompt: str,
        max_new_tokens: int = 100,
        temperature: float = 1.0,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
    ) -> str:
        """Generate a continuation of ``prompt`` and return it decoded.

        The prompt is encoded with a BOS token and without an EOS token, and
        ``tokenizer.eos_id`` is handed to ``model.generate`` as
        ``eos_token_id``. The first sequence returned by ``generate`` is
        decoded with ``remove_special_tokens=True``.

        Args:
            prompt: Text to condition on.
            max_new_tokens: Forwarded to ``model.generate``.
            temperature: Forwarded to ``model.generate``.
            top_k: Forwarded to ``model.generate``.
            top_p: Forwarded to ``model.generate``.

        Returns:
            The decoded text for the first generated sequence.
        """
        self.model.eval()
        prompt_ids = self.tokenizer.encode(
            prompt,
            add_bos=True,
            add_eos=False,
        )
        # Wrap in a list to add the batch dimension.
        input_ids = torch.tensor([prompt_ids], dtype=torch.long).to(self.device)

        with torch.no_grad():
            generated = self.model.generate(
                input_ids=input_ids,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                eos_token_id=self.tokenizer.eos_id,
            )

        return self.tokenizer.decode(
            generated[0],
            remove_special_tokens=True,
        )

    def evaluate_perplexity(
        self,
        texts: list[str],
        context_length: int = 2048,
    ) -> float:
        """Compute token-weighted perplexity over a list of raw texts.

        Each text is encoded without BOS/EOS and cut into consecutive windows
        of at most ``context_length`` tokens. For every window the labels are
        the same tokens shifted one position to the right, so the label for
        the last position of a window is the first token of the next window.

        Args:
            texts: Strings to score.
            context_length: Maximum number of input tokens per forward pass.

        Returns:
            ``exp`` of the mean loss, weighted by the number of non-pad label
            tokens, or ``float("inf")`` when no token could be scored.

        Raises:
            ValueError: If ``context_length`` is smaller than 1.
        """
        if context_length < 1:
            raise ValueError(
                f"context_length must be a positive integer, got {context_length}"
            )

        self.model.eval()
        pad_id = self.tokenizer.pad_id
        total_loss = 0.0
        total_tokens = 0

        with torch.no_grad():
            for text in texts:
                token_ids = self.tokenizer.encode(
                    text,
                    add_bos=False,
                    add_eos=False,
                )
                for start in range(0, len(token_ids), context_length):
                    # Targets are the inputs shifted by one token.
                    # NOTE(review): labels are pre-shifted here; confirm the
                    # model does not shift them again internally.
                    labels = token_ids[start + 1 : start + context_length + 1]
                    if not labels:
                        # Lone trailing token: nothing left to predict.
                        continue

                    # The last window of a text has no successor for its final
                    # token, so the label slice is one shorter than the input
                    # window. Trim the inputs so both sequences have the same
                    # length before they reach the model.
                    chunk_ids = token_ids[start : start + len(labels)]

                    input_tensor = torch.tensor(
                        [chunk_ids], dtype=torch.long
                    ).to(self.device)
                    labels_tensor = torch.tensor(
                        [labels], dtype=torch.long
                    ).to(self.device)

                    outputs = self.model(
                        input_ids=input_tensor, labels=labels_tensor
                    )
                    token_loss = outputs["loss"].item()

                    num_valid_tokens = sum(
                        1 for token in labels if token != pad_id
                    )
                    total_loss += token_loss * num_valid_tokens
                    total_tokens += num_valid_tokens

        if total_tokens == 0:
            return float("inf")

        avg_loss = total_loss / total_tokens
        return torch.exp(torch.tensor(avg_loss)).item()

    def compare_models(
        self,
        other_model: CodsworthTransformer,
        dataloader: DataLoader,
        max_batches: Optional[int] = None,
    ) -> dict:
        """Evaluate this model and ``other_model`` on the same dataloader.

        ``other_model`` is wrapped in a new ``Evaluator`` that shares this
        instance's tokenizer, config and device. Constructing that evaluator
        moves ``other_model`` to the device and puts it in eval mode.

        Args:
            other_model: Second model to evaluate.
            dataloader: Batches fed to both models; it is iterated twice.
            max_batches: Optional per-model cap on the number of batches.

        Returns:
            A dict with ``"model_1"`` (this model's results), ``"model_2"``
            (``other_model``'s results) and ``"difference"``, which holds
            ``"loss"`` and ``"perplexity"`` computed as model_1 minus model_2.
        """
        self_results = self.evaluate(dataloader, max_batches)

        other_evaluator = Evaluator(
            other_model,
            self.tokenizer,
            self.config,
            self.device,
        )
        other_results = other_evaluator.evaluate(dataloader, max_batches)

        return {
            "model_1": self_results,
            "model_2": other_results,
            "difference": {
                "loss": self_results["loss"] - other_results["loss"],
                "perplexity": (
                    self_results["perplexity"] - other_results["perplexity"]
                ),
            },
        }
def evaluate(
    model: CodsworthTransformer,
    tokenizer,
    dataloader: DataLoader,
    device: Optional[torch.device] = None,
    max_batches: Optional[int] = None,
) -> dict:
    """Build a one-off ``Evaluator`` and run ``Evaluator.evaluate`` on it.

    Args:
        model: Model to evaluate.
        tokenizer: Tokenizer handed to the ``Evaluator``.
        dataloader: Source of evaluation batches.
        device: Optional device, forwarded to the ``Evaluator`` constructor.
        max_batches: Optional cap on the number of batches to process.

    Returns:
        The results dict produced by ``Evaluator.evaluate``.
    """
    return Evaluator(model, tokenizer, device=device).evaluate(
        dataloader, max_batches
    )