| import os |
| import sys |
| import argparse |
| import logging |
| from pathlib import Path |
|
|
| import torch |
|
|
| sys.path.insert(0, str(Path(__file__).parent.parent.parent)) |
|
|
| from codsworth.config import CodsworthConfig |
| from codsworth.model import CodsworthTransformer |
| from codsworth.tokenizer import Tokenizer |
| from codsworth.utils import setup_logging, load_checkpoint, get_device |
| from codsworth.train.dataset import CodsworthDataset, CodsworthDataLoader |
| from codsworth.eval.evaluator import Evaluator |
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Parse the evaluation script's command-line arguments from ``sys.argv``.

    Returns:
        The populated ``argparse.Namespace``.  ``--checkpoint`` and
        ``--tokenizer`` are required; every other option has a default.

    Raises:
        SystemExit: raised by ``argparse`` (exit status 2) when a required
            option is missing or a value is invalid, e.g. an unknown
            ``--dtype``.
    """
    parser = argparse.ArgumentParser(description="Evaluate Codsworth model")

    # Model artifacts and evaluation data.
    parser.add_argument("--checkpoint", type=str, required=True, help="Path to model checkpoint")
    parser.add_argument("--tokenizer", type=str, required=True, help="Path to tokenizer")
    parser.add_argument("--eval_files", type=str, nargs="+", default=["data/val/*.txt"])
    parser.add_argument("--output", type=str, default=None, help="Output file for results")

    # Evaluation sizing.
    parser.add_argument("--context_length", type=int, default=2048)
    parser.add_argument("--batch_size", type=int, default=16)
    parser.add_argument("--max_batches", type=int, default=None, help="Max batches to evaluate")

    # Runtime placement and precision.
    parser.add_argument("--device", type=str, default=None, help="Device to use")
    parser.add_argument(
        "--dtype",
        type=str,
        default="bf16",
        # Restrict to the names main() maps to torch dtypes so that a typo
        # (e.g. "float32") is rejected here instead of silently becoming bf16.
        choices=("bf16", "fp32", "fp16"),
        help="Data type (bf16/fp32/fp16)",
    )

    parser.add_argument("--log_level", type=str, default="INFO")

    return parser.parse_args()
|
|
|
|
def main() -> None:
    """Run a Codsworth evaluation from the command line and report metrics.

    Parses the CLI arguments, configures logging, selects the device, loads
    the tokenizer, builds a ``CodsworthTransformer`` and calls
    ``load_checkpoint`` on it, builds the evaluation dataset and loader, and
    runs ``Evaluator.evaluate``.  The ``loss``, ``perplexity`` and
    ``total_tokens`` entries of the result are logged; when ``--output`` is
    given, the whole result mapping is also written to that path as JSON.
    """
    args = parse_args()

    logger = setup_logging(log_level=args.log_level)

    # An explicit --device wins; otherwise defer to the project's get_device().
    device = get_device() if args.device is None else torch.device(args.device)
    logger.info(f"Using device: {device}")

    tokenizer = Tokenizer.load(args.tokenizer)
    logger.info(f"Loaded tokenizer from {args.tokenizer}")

    dtype_map = {"bf16": torch.bfloat16, "fp32": torch.float32, "fp16": torch.float16}
    dtype = dtype_map.get(args.dtype)
    if dtype is None:
        # Keep the historical bf16 fallback, but make it visible: silently
        # evaluating in a precision the user did not ask for skews metrics.
        logger.warning(
            f"Unknown dtype {args.dtype!r}; expected one of {sorted(dtype_map)}. "
            "Falling back to bf16."
        )
        dtype = torch.bfloat16

    config = CodsworthConfig(context_length=args.context_length)
    model = CodsworthTransformer(config)

    # The return value is not needed here.
    # NOTE(review): presumably load_checkpoint restores the weights into
    # `model` in place -- confirm against codsworth.utils.
    load_checkpoint(model, args.checkpoint, device=device)
    logger.info(f"Loaded checkpoint from {args.checkpoint}")

    model = model.to(device=device, dtype=dtype)
    model.eval()

    eval_dataset = CodsworthDataset(
        file_paths=args.eval_files,
        tokenizer=tokenizer,
        context_length=args.context_length,
        shuffle=False,
    )

    eval_loader = CodsworthDataLoader(
        eval_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=0,
    )

    logger.info(f"Eval dataset size: {len(eval_dataset)}")

    evaluator = Evaluator(model, tokenizer, config, device)

    logger.info("Starting evaluation...")
    results = evaluator.evaluate(eval_loader, max_batches=args.max_batches)

    logger.info("\n=== Evaluation Results ===")
    logger.info(f"Loss: {results['loss']:.4f}")
    logger.info(f"Perplexity: {results['perplexity']:.4f}")
    logger.info(f"Tokens evaluated: {results['total_tokens']:,}")

    if args.output:
        import json

        output_path = Path(args.output)
        # Create the target directory first so a finished evaluation is not
        # lost to a missing folder at the very last step.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with output_path.open("w", encoding="utf-8") as f:
            json.dump(results, f, indent=2)
        logger.info(f"Results saved to {args.output}")
|
|
|
|
# Script entry point: run the evaluation only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()