Spaces:

pomilon-lab
/

Aetheris-Inference

Sleeping

Aetheris-Inference / scripts /validate.py

Pomilon

Deploy Aetheris to HF Space

1df0e33 14 days ago

3.14 kB

	import argparse
	import os
	import torch
	import math
	import time
	import sys
	from pathlib import Path

	# Add the project root (the parent of this script's directory) to sys.path.
	# Presumably this lets the `aetheris` package imports below resolve when the
	# script is run directly rather than as an installed package -- must stay
	# ahead of those imports.
	sys.path.append(str(Path(__file__).resolve().parent.parent))

	from aetheris.config import AetherisConfig
	from aetheris.model import HybridMambaMoE
	from aetheris.data import create_streaming_loader, get_tokenizer
	from aetheris.utils import load_latest_checkpoint

	@torch.no_grad()
	def evaluate_model(model, val_loader, device, max_batches=100):
	    """Run a bounded validation pass and report average loss and perplexity.

	    Args:
	        model: Callable invoked as ``model(input_ids, labels)``; must expose
	            ``eval()`` and return a mapping whose ``"loss"`` entry is a
	            scalar tensor.
	        val_loader: Iterable yielding ``(input_ids, labels)`` pairs of tensors.
	        device: Device (``torch.device`` or device string) the batches are
	            moved to before the forward pass.
	        max_batches: Upper bound on the number of batches consumed.

	    Returns:
	        The mean loss over the processed batches, or ``float('inf')`` when
	        no batch was processed.
	    """
	    # Function-scope import keeps this block self-contained.
	    from itertools import islice

	    banner = "=" * 50
	    print(f"\n{banner}\nStarting Validation (Max {max_batches} batches)\n{banner}")

	    model.eval()
	    total_loss = 0.0
	    num_batches = 0
	    start_time = time.time()

	    # CUDA autocast only affects CUDA ops; enabling it solely for CUDA devices
	    # keeps the numerics unchanged while avoiding PyTorch's
	    # "CUDA is not available" warning on CPU-only hosts.
	    use_cuda_amp = torch.device(device).type == "cuda"

	    # islice stops after max_batches items without pulling (and discarding)
	    # one extra batch from a potentially slow streaming loader.
	    limited_batches = islice(val_loader, max(max_batches, 0))
	    for num_batches, (input_ids, labels) in enumerate(limited_batches, start=1):
	        input_ids = input_ids.to(device, non_blocking=True)
	        labels = labels.to(device, non_blocking=True)

	        with torch.amp.autocast('cuda', dtype=torch.float16, enabled=use_cuda_amp):
	            output = model(input_ids, labels)
	            loss = output["loss"]

	        total_loss += loss.item()

	        if num_batches % 20 == 0:
	            print(f"-> Processed {num_batches}/{max_batches} batches...")

	    end_time = time.time()

	    if num_batches == 0:
	        print("No validation batches were processed.")
	        return float('inf')

	    avg_loss = total_loss / num_batches
	    try:
	        perplexity = math.exp(avg_loss)
	    except OverflowError:
	        # A diverged model can produce a loss too large for exp(); report an
	        # infinite perplexity instead of crashing after the whole pass.
	        perplexity = float('inf')

	    print(f"\n--- Validation Results ---")
	    print(f"Total batches processed: {num_batches}")
	    print(f"Time taken: {end_time - start_time:.2f} seconds")
	    print(f"Average Loss: {avg_loss:.4f}")
	    print(f"Perplexity: {perplexity:.2f}")
	    print(f"--------------------------\n")

	    return avg_loss

	def main():
	    """Parse CLI arguments, build the model and data loader, and run validation.

	    Reads the model configuration from the YAML file given by ``--config``,
	    instantiates ``HybridMambaMoE`` on CUDA when available (otherwise CPU),
	    loads weights via ``load_latest_checkpoint``, builds a streaming loader
	    over the dataset's "validation" split, and hands everything to
	    ``evaluate_model``.
	    """
	    parser = argparse.ArgumentParser(description="Validate Aetheris Model")
	    parser.add_argument("--config", type=str, default="configs/default.yaml", help="Path to config file")
	    parser.add_argument("--checkpoint_dir", type=str, default="checkpoints", help="Directory with checkpoints")
	    parser.add_argument("--checkpoint_name", type=str, default="checkpoint_current.pth", help="Checkpoint file name")
	    parser.add_argument("--batch_size", type=int, default=2, help="Batch size")
	    # Defaults to the HF_TOKEN environment variable so the token need not
	    # appear on the command line.
	    parser.add_argument("--hf_token", type=str, default=os.environ.get("HF_TOKEN"), help="HuggingFace Token")
	    parser.add_argument("--dataset", type=str, default="cerebras/SlimPajama-627B", help="Dataset to validate on")
	    parser.add_argument("--dataset_mode", type=str, default="pretrain", help="pretrain or sft")
	    # Previously fixed at evaluate_model's default; 100 keeps existing
	    # invocations behaving exactly as before.
	    parser.add_argument("--max_batches", type=int, default=100, help="Maximum number of validation batches to evaluate")

	    args = parser.parse_args()

	    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	    config = AetherisConfig.from_yaml(args.config)
	    tokenizer = get_tokenizer()

	    model = HybridMambaMoE(config).to(device).to(config.torch_dtype)

	    # NOTE(review): the two None arguments are presumably the optimizer and
	    # scaler/scheduler slots, unused for validation -- confirm against
	    # aetheris.utils.load_latest_checkpoint.
	    load_latest_checkpoint(model, None, None, device, args.checkpoint_dir, args.checkpoint_name)

	    val_loader = create_streaming_loader(
	        args.dataset,
	        "validation",
	        tokenizer,
	        config,
	        args.batch_size,
	        mode=args.dataset_mode,
	        hf_token=args.hf_token,
	    )

	    evaluate_model(model, val_loader, device, max_batches=args.max_batches)

	# Run validation only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()