Spaces:
Sleeping
Sleeping
File size: 3,141 Bytes
1df0e33 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
import argparse
import os
import torch
import math
import time
import sys
from pathlib import Path
# Add project root to path
sys.path.append(str(Path(__file__).resolve().parent.parent))
from aetheris.config import AetherisConfig
from aetheris.model import HybridMambaMoE
from aetheris.data import create_streaming_loader, get_tokenizer
from aetheris.utils import load_latest_checkpoint
@torch.no_grad()
def evaluate_model(model, val_loader, device, max_batches=100):
    """Compute the mean validation loss over at most ``max_batches`` batches.

    The model is switched to eval mode (and is left in eval mode on return).
    Each batch is moved to ``device`` and passed through
    ``model(input_ids, labels)``; the ``"loss"`` entry of the result is
    accumulated and averaged. Progress and a final summary, including the
    perplexity (``exp`` of the mean loss), are printed to stdout.

    Args:
        model: Module called as ``model(input_ids, labels)``; it must return
            a mapping whose ``"loss"`` entry supports ``.item()``.
        val_loader: Iterable yielding ``(input_ids, labels)`` tensor pairs.
        device: Device the batch tensors are moved to.
        max_batches: Upper bound on the number of batches to evaluate.

    Returns:
        The mean loss as a float, or ``float('inf')`` if no batch was
        processed.
    """
    # Function-scope import so the block does not depend on file-level imports.
    from itertools import islice

    print(f"\n{'='*50}\nStarting Validation (Max {max_batches} batches)\n{'='*50}")
    model.eval()

    total_loss = 0.0
    num_batches = 0
    start_time = time.time()

    # islice stops right at the limit, so no extra batch is pulled from the
    # (potentially streaming) loader only to be thrown away.
    for input_ids, labels in islice(val_loader, max(max_batches, 0)):
        input_ids = input_ids.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        # Half-precision autocast only makes sense when the batch actually
        # lives on a CUDA device; on CPU it is explicitly disabled.
        with torch.amp.autocast('cuda', dtype=torch.float16, enabled=input_ids.is_cuda):
            output = model(input_ids, labels)
        loss = output["loss"]

        total_loss += loss.item()
        num_batches += 1

        if num_batches % 20 == 0:
            print(f"-> Processed {num_batches}/{max_batches} batches...")

    end_time = time.time()

    if num_batches == 0:
        print("No validation batches were processed.")
        return float('inf')

    avg_loss = total_loss / num_batches
    try:
        perplexity = math.exp(avg_loss)
    except OverflowError:
        # A diverged model can yield a mean loss too large for exp();
        # report infinite perplexity instead of crashing after the work is done.
        perplexity = float('inf')

    print(f"\n--- Validation Results ---")
    print(f"Total batches processed: {num_batches}")
    print(f"Time taken: {end_time - start_time:.2f} seconds")
    print(f"Average Loss: {avg_loss:.4f}")
    print(f"Perplexity: {perplexity:.2f}")
    print(f"--------------------------\n")

    return avg_loss
def main():
    """Parse command-line options, load a checkpoint, and run validation.

    Builds the model from the YAML config, moves it to CUDA when available
    (CPU otherwise) and to ``config.torch_dtype``, loads the requested
    checkpoint, creates a streaming loader for the "validation" split, and
    calls ``evaluate_model`` on it.
    """
    parser = argparse.ArgumentParser(description="Validate Aetheris Model")
    parser.add_argument("--config", type=str, default="configs/default.yaml", help="Path to config file")
    parser.add_argument("--checkpoint_dir", type=str, default="checkpoints", help="Directory with checkpoints")
    parser.add_argument("--checkpoint_name", type=str, default="checkpoint_current.pth", help="Checkpoint file name")
    parser.add_argument("--batch_size", type=int, default=2, help="Batch size")
    parser.add_argument("--hf_token", type=str, default=os.environ.get("HF_TOKEN"), help="HuggingFace Token")
    parser.add_argument("--dataset", type=str, default="cerebras/SlimPajama-627B", help="Dataset to validate on")
    parser.add_argument("--dataset_mode", type=str, default="pretrain", help="pretrain or sft")
    # Exposes evaluate_model's batch limit; the default matches its own default.
    parser.add_argument("--max_batches", type=int, default=100, help="Maximum number of validation batches to evaluate")
    args = parser.parse_args()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    config = AetherisConfig.from_yaml(args.config)
    tokenizer = get_tokenizer()

    model = HybridMambaMoE(config).to(device).to(config.torch_dtype)
    # NOTE(review): the two None arguments are presumably the optimizer and
    # scaler/scheduler slots, unused for validation; confirm against
    # load_latest_checkpoint's signature.
    load_latest_checkpoint(model, None, None, device, args.checkpoint_dir, args.checkpoint_name)

    val_loader = create_streaming_loader(
        args.dataset,
        "validation",
        tokenizer,
        config,
        args.batch_size,
        mode=args.dataset_mode,
        hf_token=args.hf_token,
    )
    evaluate_model(model, val_loader, device, max_batches=args.max_batches)
# Script entry point: run validation only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|