teszenofficial committed on
Commit
5277939
·
verified ·
1 Parent(s): 40c7704

Delete trainer.py

Browse files
Files changed (1) hide show
  1. trainer.py +0 -446
trainer.py DELETED
@@ -1,446 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
- from torch.utils.data import DataLoader, random_split
4
- from torch.optim import AdamW
5
- from torch.optim.lr_scheduler import CosineAnnealingLR
6
- from tqdm import tqdm
7
- import yaml
8
- import os
9
- import pickle
10
- import math
11
-
12
- from model import MTPMiniModel
13
- from tokenizer import MTPTokenizer
14
- from dataset import MTPDataset, collate_fn
15
-
16
-
17
class MTPTrainer:
    """Improved x20 trainer (basic CPU/GPU version).

    Wires together tokenizer, model, datasets, optimizer and LR schedule
    from a YAML config file, then drives the training loop (see ``train``).
    """

    def __init__(self, config_path='config.yaml'):
        """Build every training component from the YAML config at *config_path*.

        Side effects: may train and/or load a tokenizer model on disk,
        loads and splits the corpus, and resumes from ``checkpoint.pt``
        when one exists in the working directory.
        """
        with open(config_path, 'r', encoding='utf-8') as f:
            self.config = yaml.safe_load(f)

        # Device detection
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Cap CPU thread usage per config.
        torch.set_num_threads(self.config['training']['num_threads'])

        print("=" * 70)
        print("MTP MINI x20 - Transformer Avanzado")
        print("=" * 70)
        print(f"\n🔥 Device: {self.device}")

        if self.device.type == 'cuda':
            print(f"🔥 GPU: {torch.cuda.get_device_name(0)}")
            print(f"🔥 VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

        # Tokenizer: reuse the on-disk model when present, otherwise train one.
        print("\n[1/6] Inicializando tokenizer mejorado...")
        self.tokenizer = MTPTokenizer()

        tokenizer_path = 'mtp_tokenizer.model'
        if not os.path.exists(tokenizer_path):
            print(" -> Entrenando nuevo tokenizer...")
            self.tokenizer.train(
                self.config['data']['corpus_path'],
                vocab_size=self.config['model']['vocab_size'],
                model_prefix='mtp_tokenizer'
            )
        else:
            print(f" -> Cargando tokenizer: {tokenizer_path}")
            self.tokenizer.load(tokenizer_path)

        print(f" ✅ Vocabulario: {self.tokenizer.vocab_size()} tokens")

        # Model
        print("\n[2/6] Inicializando modelo GRANDE...")

        model_config = self.config['model']

        self.model = MTPMiniModel(
            vocab_size=self.tokenizer.vocab_size(),
            d_model=model_config['d_model'],
            n_layers=model_config['n_layers'],
            n_heads=model_config['n_heads'],
            d_ff=model_config['d_ff'],
            max_seq_len=model_config['max_seq_len'],
            dropout=model_config['dropout'],
            use_swiglu=model_config.get('use_swiglu', True),
            use_flash_attention=model_config.get('use_flash_attention', True),
            use_reasoning_layer=model_config.get('use_reasoning_layer', True),
            reasoning_steps=model_config.get('reasoning_steps', 3),
            use_confidence_score=model_config.get('use_confidence_score', True)
        )

        param_count = self.model.count_parameters()
        print(f" ✅ Parámetros: {param_count:,} ({param_count/1e6:.1f}M)")
        print(f" ✅ Arquitectura: {model_config['n_layers']} layers, "
              f"{model_config['n_heads']} heads, dim={model_config['d_model']}")

        improvements = [
            "RoPE", "RMSNorm", "SwiGLU", "Flash Attention",
            "Reasoning Layers", "Confidence Score", "Anti-Hallucination",
            "Label Smoothing", "Advanced Repetition Penalty", "Early Stopping"
        ]
        print(f" ✅ Mejoras: {', '.join(improvements)}")

        # Move to device
        self.model.to(self.device)

        if self.device.type == 'cuda':
            memory_allocated = torch.cuda.memory_allocated(0) / 1e9
            print(f" ✅ VRAM usada: {memory_allocated:.2f} GB")

        # Dataset
        print("\n[3/6] Cargando dataset con filtrado de calidad...")
        full_dataset = MTPDataset(
            self.config['data']['corpus_path'],
            self.tokenizer,
            max_seq_len=model_config['max_seq_len'],
            use_augmentation=self.config['data'].get('use_augmentation', True),
            augmentation_prob=self.config['data'].get('augmentation_prob', 0.4),
            min_quality_score=self.config['data'].get('min_quality_score', 0.3),
            remove_duplicates=self.config['data'].get('remove_duplicates', True)
        )

        total_examples = len(full_dataset)

        if total_examples < 100:
            print(f" ⚠️ WARNING: Dataset pequeño ({total_examples} ejemplos)")
            print(f" ⚠️ Se recomienda 1000+ ejemplos para óptimo rendimiento")

        # Deterministic train/validation split (seeded generator below).
        val_split = self.config.get('data', {}).get('validation_split', 0.12)
        val_size = max(1, int(total_examples * val_split))
        train_size = total_examples - val_size

        if train_size > 0:
            self.train_dataset, self.val_dataset = random_split(
                full_dataset,
                [train_size, val_size],
                generator=torch.Generator().manual_seed(42)
            )

            print(f" ✅ Train: {len(self.train_dataset)} ejemplos")
            print(f" ✅ Validation: {len(self.val_dataset)} ejemplos")
        else:
            # Degenerate corpus: validate on the training data rather than crash.
            self.train_dataset = full_dataset
            self.val_dataset = full_dataset
            print(f" ⚠️ Dataset muy pequeño - usando todo para train y val")

        self.train_loader = DataLoader(
            self.train_dataset,
            batch_size=self.config['training']['batch_size'],
            shuffle=True,
            collate_fn=lambda batch: collate_fn(batch, self.tokenizer.pad_id()),
            num_workers=0
        )

        self.val_loader = DataLoader(
            self.val_dataset,
            batch_size=self.config['training']['batch_size'],
            shuffle=False,
            collate_fn=lambda batch: collate_fn(batch, self.tokenizer.pad_id()),
            num_workers=0
        )

        # Optimizer with differentiated parameter groups
        print("\n[4/6] Configurando optimizer avanzado...")

        decay_params = []
        no_decay_params = []
        reasoning_params = []

        # Biases, norms and embeddings receive no weight decay; parameters
        # whose name contains 'reasoning' get reduced decay and LR below.
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                if 'reasoning' in name:
                    reasoning_params.append(param)
                elif 'bias' in name or 'norm' in name or 'embedding' in name:
                    no_decay_params.append(param)
                else:
                    decay_params.append(param)

        param_groups = [
            {'params': decay_params, 'weight_decay': self.config['training']['weight_decay']},
            {'params': no_decay_params, 'weight_decay': 0.0},
        ]

        if reasoning_params:
            param_groups.append({
                'params': reasoning_params,
                'weight_decay': self.config['training']['weight_decay'] * 0.5,
                'lr': self.config['training']['learning_rate'] * 0.8
            })

        self.optimizer = AdamW(
            param_groups,
            lr=self.config['training']['learning_rate'],
            betas=(0.9, 0.95),
            eps=1e-8
        )

        print(f" ✅ Optimizer: AdamW")
        print(f" ✅ LR: {self.config['training']['learning_rate']}")
        print(f" ✅ Weight decay: {self.config['training']['weight_decay']}")

        # Learning rate scheduler (cosine decay applied after linear warmup)
        print("\n[5/6] Configurando LR scheduler...")
        self.warmup_steps = self.config['training'].get('warmup_steps', 500)
        total_steps = len(self.train_loader) * self.config['training']['epochs']

        if self.config['training'].get('use_lr_scheduler', True):
            self.scheduler = CosineAnnealingLR(
                self.optimizer,
                T_max=total_steps - self.warmup_steps,
                eta_min=self.config['training'].get('min_lr', 0.000005)
            )
            print(f" ✅ Scheduler: Cosine Annealing")
        else:
            self.scheduler = None
            print(f" ✅ Scheduler: None")

        print(f" ✅ Warmup steps: {self.warmup_steps}")

        self.start_epoch = 0
        self.global_step = 0
        self.best_val_loss = float('inf')

        # Early stopping
        self.patience = self.config['training'].get('patience', 8)
        self.min_delta = self.config['training'].get('min_delta', 0.0005)
        self.patience_counter = 0
        print(f" ✅ Early stopping: patience={self.patience}, min_delta={self.min_delta}")

        # Gradient accumulation
        self.accumulation_steps = self.config['training'].get('accumulation_steps', 8)
        effective_batch = self.config['training']['batch_size'] * self.accumulation_steps
        print(f" ✅ Gradient accumulation: {self.accumulation_steps} steps")
        print(f" ✅ Effective batch size: {effective_batch}")

        self.use_eos_weight = self.config['training'].get('use_eos_loss_weight', True)

        # Resume checkpoint
        if os.path.exists('checkpoint.pt'):
            print("\n[6/6] Cargando checkpoint...")
            self.load_checkpoint('checkpoint.pt')
        else:
            print("\n[6/6] Listo para entrenar!")
228
-
229
- def get_lr(self):
230
- """Get current learning rate with warmup"""
231
- if self.global_step < self.warmup_steps:
232
- return self.config['training']['learning_rate'] * (self.global_step / self.warmup_steps)
233
- return self.optimizer.param_groups[0]['lr']
234
-
235
- def train_epoch(self, epoch):
236
- """Train one epoch"""
237
- self.model.train()
238
- total_loss = 0
239
- total_confidence = 0
240
- confidence_samples = 0
241
-
242
- progress_bar = tqdm(self.train_loader, desc=f"Epoch {epoch+1}")
243
-
244
- self.optimizer.zero_grad()
245
-
246
- for batch_idx, (input_ids, target_ids) in enumerate(progress_bar):
247
- input_ids = input_ids.to(self.device, non_blocking=True)
248
- target_ids = target_ids.to(self.device, non_blocking=True)
249
-
250
- if self.model.use_confidence:
251
- logits, loss, confidence = self.model(
252
- input_ids, target_ids,
253
- use_eos_weight=self.use_eos_weight,
254
- return_confidence=True
255
- )
256
- mask = (target_ids != 0).float()
257
- avg_conf = (confidence * mask).sum() / mask.sum()
258
- total_confidence += avg_conf.item()
259
- confidence_samples += 1
260
- else:
261
- logits, loss = self.model(
262
- input_ids, target_ids,
263
- use_eos_weight=self.use_eos_weight
264
- )
265
-
266
- loss = loss / self.accumulation_steps
267
- loss.backward()
268
-
269
- if (batch_idx + 1) % self.accumulation_steps == 0:
270
- torch.nn.utils.clip_grad_norm_(
271
- self.model.parameters(),
272
- self.config['training']['max_grad_norm']
273
- )
274
-
275
- if self.global_step < self.warmup_steps:
276
- lr = self.get_lr()
277
- for param_group in self.optimizer.param_groups:
278
- param_group['lr'] = lr
279
-
280
- self.optimizer.step()
281
-
282
- if self.scheduler and self.global_step >= self.warmup_steps:
283
- self.scheduler.step()
284
-
285
- self.optimizer.zero_grad()
286
- self.global_step += 1
287
-
288
- total_loss += loss.item() * self.accumulation_steps
289
-
290
- postfix = {
291
- 'loss': f"{loss.item() * self.accumulation_steps:.4f}",
292
- 'lr': f"{self.get_lr():.6f}"
293
- }
294
-
295
- if confidence_samples > 0:
296
- postfix['conf'] = f"{total_confidence/confidence_samples:.3f}"
297
-
298
- progress_bar.set_postfix(postfix)
299
-
300
- avg_loss = total_loss / len(self.train_loader)
301
- avg_confidence = total_confidence / confidence_samples if confidence_samples > 0 else 0
302
-
303
- return avg_loss, avg_confidence
304
-
305
- def validate(self):
306
- """Validate model"""
307
- self.model.eval()
308
- total_loss = 0
309
- total_confidence = 0
310
- confidence_samples = 0
311
-
312
- with torch.no_grad():
313
- for input_ids, target_ids in self.val_loader:
314
- input_ids = input_ids.to(self.device, non_blocking=True)
315
- target_ids = target_ids.to(self.device, non_blocking=True)
316
-
317
- if self.model.use_confidence:
318
- logits, loss, confidence = self.model(
319
- input_ids, target_ids,
320
- return_confidence=True
321
- )
322
- mask = (target_ids != 0).float()
323
- avg_conf = (confidence * mask).sum() / mask.sum()
324
- total_confidence += avg_conf.item()
325
- confidence_samples += 1
326
- else:
327
- logits, loss = self.model(input_ids, target_ids)
328
-
329
- total_loss += loss.item()
330
-
331
- avg_loss = total_loss / len(self.val_loader)
332
- avg_confidence = total_confidence / confidence_samples if confidence_samples > 0 else 0
333
-
334
- return avg_loss, avg_confidence
335
-
336
    def train(self):
        """Main training loop.

        Runs up to ``config['training']['epochs']`` epochs with per-epoch
        validation, early stopping (``patience`` / ``min_delta``), periodic
        checkpointing every ``save_every`` epochs, and finally restores the
        best checkpoint (when present) before exporting via ``save_model``.
        """
        print("\n" + "=" * 70)
        print("INICIANDO ENTRENAMIENTO")
        print("=" * 70)

        epochs = self.config['training']['epochs']

        for epoch in range(self.start_epoch, epochs):
            train_loss, train_conf = self.train_epoch(epoch)
            val_loss, val_conf = self.validate()

            # Release cached CUDA blocks between epochs.
            if self.device.type == 'cuda':
                torch.cuda.empty_cache()

            print(f"\nEpoch {epoch+1}/{epochs}")
            print(f" Train Loss: {train_loss:.4f}")
            print(f" Val Loss: {val_loss:.4f}")
            print(f" Train Confidence: {train_conf:.3f}")
            print(f" Val Confidence: {val_conf:.3f}")
            print(f" LR: {self.get_lr():.6f}")

            # Early stopping: only improvements of at least min_delta reset
            # the patience counter and refresh the best checkpoint.
            if val_loss < (self.best_val_loss - self.min_delta):
                self.best_val_loss = val_loss
                self.patience_counter = 0
                self.save_checkpoint('best_model.pt', epoch + 1, is_best=True)
                print(f" ✅ Nuevo mejor modelo! (Val Loss: {val_loss:.4f})")
            else:
                self.patience_counter += 1
                print(f" -> No improvement. Patience: {self.patience_counter}/{self.patience}")

            if self.patience_counter >= self.patience:
                print(f"\n⚠️ Early stopping. Mejor val loss: {self.best_val_loss:.4f}")
                break

            # Periodic resumable checkpoint (epoch + 1 = next epoch to run).
            if (epoch + 1) % self.config['training']['save_every'] == 0:
                self.save_checkpoint('checkpoint.pt', epoch + 1)

        print("\n" + "=" * 70)
        print("ENTRENAMIENTO COMPLETADO")
        print(f"Mejor Val Loss: {self.best_val_loss:.4f}")
        print("=" * 70)

        # Restore the best-performing weights before the final export.
        if os.path.exists('best_model.pt'):
            print("\nCargando mejor modelo...")
            checkpoint = torch.load('best_model.pt', map_location=self.device)
            self.model.load_state_dict(checkpoint['model_state_dict'])

        self.save_model()
386
-
387
- def save_checkpoint(self, path, epoch, is_best=False):
388
- """Save checkpoint"""
389
- checkpoint = {
390
- 'epoch': epoch,
391
- 'global_step': self.global_step,
392
- 'model_state_dict': self.model.state_dict(),
393
- 'optimizer_state_dict': self.optimizer.state_dict(),
394
- 'scheduler_state_dict': self.scheduler.state_dict() if self.scheduler else None,
395
- 'best_val_loss': self.best_val_loss,
396
- 'patience_counter': self.patience_counter,
397
- 'config': self.config
398
- }
399
- torch.save(checkpoint, path)
400
- if not is_best:
401
- print(f" 💾 Checkpoint guardado: {path}")
402
-
403
    def load_checkpoint(self, path):
        """Restore model/optimizer/scheduler state and training counters.

        Counterpart of ``save_checkpoint``. Keys absent from older
        checkpoints ('best_val_loss', 'patience_counter') fall back to
        fresh defaults via ``dict.get``.
        """
        checkpoint = torch.load(path, map_location=self.device)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        if self.scheduler and checkpoint['scheduler_state_dict']:
            self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        # 'epoch' was stored as epoch + 1 at save time, so this is the next
        # epoch to run — range(self.start_epoch, epochs) resumes correctly.
        self.start_epoch = checkpoint['epoch']
        self.global_step = checkpoint['global_step']
        self.best_val_loss = checkpoint.get('best_val_loss', float('inf'))
        self.patience_counter = checkpoint.get('patience_counter', 0)
        print(f" ✅ Resumido desde epoch {self.start_epoch}")
        print(f" ✅ Mejor val loss: {self.best_val_loss:.4f}")
416
-
417
    def save_model(self):
        """Export the final model plus metadata to ``output/mtp_mini.pkl``.

        NOTE(review): the model is moved to CPU and left there; a caller
        that keeps using it afterwards must move it back to ``self.device``.
        NOTE(review): the artifact is a pickle — only load it from trusted
        sources (pickle.load executes arbitrary code).
        """
        os.makedirs('output', exist_ok=True)

        # Serialize CPU tensors so the artifact loads on CUDA-less machines.
        self.model.to('cpu')

        model_data = {
            'model_state_dict': self.model.state_dict(),
            'config': self.config,
            'vocab_size': self.tokenizer.vocab_size(),
            'tokenizer_path': self.tokenizer.model_path,
            'training_info': {
                'final_epoch': self.start_epoch,
                'best_val_loss': self.best_val_loss,
                'total_parameters': self.model.count_parameters()
            }
        }

        output_path = 'output/mtp_mini.pkl'
        with open(output_path, 'wb') as f:
            pickle.dump(model_data, f)

        print(f"\n✅ Modelo final guardado: {output_path}")
        print(f"💾 Tamaño: {os.path.getsize(output_path) / (1024*1024):.2f} MB")
        print(f"🧠 Parámetros: {self.model.count_parameters()/1e6:.1f}M")
442
-
443
-
444
def main() -> None:
    """Entry point: build a trainer from config.yaml and start training."""
    trainer = MTPTrainer('config.yaml')
    trainer.train()


if __name__ == '__main__':
    main()