| """ | |
| Quantumaurora: Advanced Transformer-based Language Model | |
| Version: 1.0.0 | |
| Created: 2025 | |
| """ | |
| import numpy as np | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| from torch.utils.data import Dataset, DataLoader | |
| from transformers import PreTrainedTokenizerFast | |
| from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders | |
| import math | |
| from typing import Optional, Dict, List, Tuple | |
| from torch.cuda.amp import autocast, GradScaler | |
| from torch.nn.parallel import DistributedDataParallel | |
| import torch.distributed as dist | |
| import torch.multiprocessing as mp | |
| from torch.utils.checkpoint import checkpoint | |
| import json | |
| import os | |
| from datetime import datetime | |
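

# --- Assumed helper module -------------------------------------------------
# The listing references `PositionalEncoding` without defining it. The sketch
# below is a minimal sinusoidal positional encoding assumed for illustration;
# the actual module used by Quantumaurora may differ.
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to token embeddings."""
    def __init__(self, d_model: int, max_len: int = 2048):
        super().__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        # Registered as a buffer so it moves with the model but is not trained.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, seq_len, d_model)
        return x + self.pe[:, :x.size(1)]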


class QuantumauroraConfig:
    """Configuration class for Quantumaurora model"""
    def __init__(self,
                 vocab_size: int = 50000,
                 d_model: int = 512,
                 num_heads: int = 8,
                 num_layers: int = 6,
                 d_ff: int = 2048,
                 dropout: float = 0.1,
                 attention_type: str = "full",
                 use_checkpointing: bool = True,
                 max_sequence_length: int = 2048,
                 model_version: str = "1.0.0"):
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.d_ff = d_ff
        self.dropout = dropout
        self.attention_type = attention_type
        self.use_checkpointing = use_checkpointing
        self.max_sequence_length = max_sequence_length
        self.model_version = model_version
        self.model_type = "quantumaurora"
    def save(self, path: str):
        """Save configuration to JSON file"""
        # Copy so the timestamp is not added to the live config instance.
        config_dict = dict(self.__dict__)
        config_dict['timestamp'] = datetime.now().isoformat()
        with open(path, 'w') as f:
            json.dump(config_dict, f, indent=2)
    @classmethod
    def load(cls, path: str) -> 'QuantumauroraConfig':
        """Load configuration from JSON file"""
        with open(path, 'r') as f:
            config_dict = json.load(f)
        # Drop saved fields that are not constructor arguments
        config_dict.pop('timestamp', None)
        config_dict.pop('model_type', None)
        return cls(**config_dict)
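

# --- Assumed building blocks -----------------------------------------------
# `TransformerBlock` and `PreTrainingObjectives` are referenced by the model
# but not defined in this listing. The sketches below are minimal, assumed
# implementations so the file runs end to end: the block uses standard full
# self-attention via nn.MultiheadAttention (the sparse/local patterns implied
# by `attention_type` are not reproduced here), and the pre-training head is
# a single language-modeling projection returning a dict of outputs.
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: self-attention followed by a feed-forward net."""
    def __init__(self, d_model: int, num_heads: int, d_ff: int,
                 dropout: float = 0.1, attention_type: str = "full"):
        super().__init__()
        self.attention_type = attention_type  # kept for compatibility; full attention is used
        self.attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout, batch_first=True)
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        # mask, if given, is treated as an additive/boolean attention mask.
        h = self.norm1(x)
        attn_out, _ = self.attn(h, h, h, attn_mask=mask, need_weights=False)
        x = x + self.dropout(attn_out)
        x = x + self.dropout(self.ff(self.norm2(x)))
        return x


class PreTrainingObjectives(nn.Module):
    """Projects hidden states to vocabulary logits for the language-modeling objective."""
    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.lm_head = nn.Linear(d_model, vocab_size)

    def forward(self, hidden_states: torch.Tensor) -> Dict[str, torch.Tensor]:
        return {
            'hidden_states': hidden_states,
            'logits': self.lm_head(hidden_states),
        }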


class Quantumaurora(nn.Module):
    """
    Quantumaurora: Advanced Transformer-based Language Model

    A state-of-the-art language model featuring:
    - Multi-head attention with sparse/local patterns
    - Multiple pre-training objectives
    - Gradient checkpointing
    - Mixed precision training
    - Distributed training support
    """
    def __init__(self, config: QuantumauroraConfig):
        super().__init__()
        self.config = config
        # Model components
        self.token_embedding = nn.Embedding(config.vocab_size, config.d_model)
        self.positional_encoding = PositionalEncoding(config.d_model)
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(
                config.d_model,
                config.num_heads,
                config.d_ff,
                config.dropout,
                config.attention_type
            ) for _ in range(config.num_layers)
        ])
        self.pretraining_objectives = PreTrainingObjectives(
            config.d_model,
            config.vocab_size
        )
        self.dropout = nn.Dropout(config.dropout)
    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        x = self.token_embedding(x)
        x = self.positional_encoding(x)
        x = self.dropout(x)
        for transformer_block in self.transformer_blocks:
            if self.config.use_checkpointing and self.training:
                # Gradient checkpointing trades extra compute for lower activation memory.
                x = checkpoint(transformer_block, x, mask, use_reentrant=False)
            else:
                x = transformer_block(x, mask)
        return self.pretraining_objectives(x)
    def save_pretrained(self, path: str):
        """Save model and configuration"""
        os.makedirs(path, exist_ok=True)
        # Save configuration
        config_path = os.path.join(path, 'config.json')
        self.config.save(config_path)
        # Save model weights
        model_path = os.path.join(path, 'model.pt')
        torch.save(self.state_dict(), model_path)
        # Save tokenizer if available
        if hasattr(self, 'tokenizer'):
            tokenizer_path = os.path.join(path, 'tokenizer.json')
            if isinstance(self.tokenizer, PreTrainedTokenizerFast):
                # PreTrainedTokenizerFast wraps a `tokenizers.Tokenizer`.
                self.tokenizer.backend_tokenizer.save(tokenizer_path)
            else:
                self.tokenizer.save(tokenizer_path)
    @classmethod
    def from_pretrained(cls, path: str) -> 'Quantumaurora':
        """Load pretrained model and configuration"""
        config = QuantumauroraConfig.load(os.path.join(path, 'config.json'))
        model = cls(config)
        model_path = os.path.join(path, 'model.pt')
        model.load_state_dict(torch.load(model_path, map_location='cpu'))
        # Load tokenizer if available
        tokenizer_path = os.path.join(path, 'tokenizer.json')
        if os.path.exists(tokenizer_path):
            model.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)
        return model
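

# Example round trip with the persistence helpers above (illustrative only;
# the directory name is arbitrary):
#
#   model = Quantumaurora(QuantumauroraConfig())
#   model.save_pretrained('checkpoints/quantumaurora-base')
#   restored = Quantumaurora.from_pretrained('checkpoints/quantumaurora-base')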


class QuantumauroraTrainer:
    """Training manager for Quantumaurora model"""
    def __init__(self,
                 model: Quantumaurora,
                 train_dataloader: DataLoader,
                 optimizer: torch.optim.Optimizer,
                 device: str = "cuda",
                 use_mixed_precision: bool = True,
                 distributed: bool = True):
        self.model = model.to(device)
        self.train_dataloader = train_dataloader
        self.optimizer = optimizer
        self.device = device
        self.use_mixed_precision = use_mixed_precision
        self.distributed = distributed
        if use_mixed_precision:
            self.scaler = GradScaler()
        if distributed:
            # Requires the caller to have initialised the process group.
            self.model = DistributedDataParallel(self.model)
    def train(self, num_epochs: int, save_dir: str = None):
        """Main training loop"""
        best_loss = float('inf')
        for epoch in range(num_epochs):
            losses = self.train_epoch(epoch)
            # Save a checkpoint if this is the best model so far
            if save_dir and losses['total'] < best_loss:
                best_loss = losses['total']
                # Unwrap DistributedDataParallel before saving, if necessary.
                model_to_save = getattr(self.model, 'module', self.model)
                model_to_save.save_pretrained(os.path.join(save_dir, f'checkpoint-{epoch}'))
            print(f"Epoch {epoch+1}/{num_epochs}")
            for loss_name, loss_value in losses.items():
                print(f"{loss_name}: {loss_value:.4f}")


def main():
    """Example usage of Quantumaurora"""
    # Initialize configuration
    config = QuantumauroraConfig(
        vocab_size=50000,
        d_model=768,
        num_heads=12,
        num_layers=12,
        attention_type="sparse"
    )
    # Initialize model
    model = Quantumaurora(config)
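
    # `dataset` and `train_dataloader` are used below but not defined in the
    # original listing. The placeholder below builds a small random-token
    # dataset purely so the example runs; substitute a real tokenized corpus.
    class RandomTokenDataset(Dataset):
        def __init__(self, num_samples: int = 1024, seq_len: int = 128, vocab_size: int = 50000):
            self.data = torch.randint(0, vocab_size, (num_samples, seq_len))

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            tokens = self.data[idx]
            # Inputs and next-token labels for a language-modeling objective.
            return tokens[:-1], tokens[1:]

    dataset = RandomTokenDataset(vocab_size=config.vocab_size)
    train_dataloader = DataLoader(dataset, batch_size=8, shuffle=True)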
    # Multi-GPU training if available
    world_size = torch.cuda.device_count()
    if world_size > 1:
        mp.spawn(
            train_distributed,
            args=(world_size, model, dataset),
            nprocs=world_size,
            join=True
        )
    else:
        # Single-device training (falls back to CPU if no GPU is present)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        trainer = QuantumauroraTrainer(
            model=model,
            train_dataloader=train_dataloader,
            optimizer=torch.optim.Adam(model.parameters()),
            device=device,
            use_mixed_precision=torch.cuda.is_available(),
            distributed=False
        )
        trainer.train(
            num_epochs=10,
            save_dir='quantumaurora_checkpoints'
        )
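

# `train_distributed` is spawned by main() but not included in the listing.
# The sketch below is an assumed per-process entry point: it initialises the
# default process group (MASTER_ADDR/MASTER_PORT defaults are placeholders),
# shards the dataset with a DistributedSampler, and reuses QuantumauroraTrainer
# with distributed=True. Backend and batch size are illustrative choices.
def train_distributed(rank: int, world_size: int, model: Quantumaurora, dataset: Dataset):
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ.setdefault('MASTER_PORT', '29500')
    dist.init_process_group('nccl', rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

    sampler = torch.utils.data.distributed.DistributedSampler(
        dataset, num_replicas=world_size, rank=rank
    )
    train_dataloader = DataLoader(dataset, batch_size=8, sampler=sampler)

    trainer = QuantumauroraTrainer(
        model=model,
        train_dataloader=train_dataloader,
        optimizer=torch.optim.Adam(model.parameters()),
        device=f'cuda:{rank}',
        use_mixed_precision=True,
        distributed=True
    )
    # Only rank 0 writes checkpoints to avoid concurrent writes.
    trainer.train(num_epochs=10, save_dir='quantumaurora_checkpoints' if rank == 0 else None)
    dist.destroy_process_group()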


if __name__ == "__main__":
    main()