TeszenAI
/

MTP3.6

+import torch
+from torch.utils.data import Dataset
+import json
+import random
class MTPDataset(Dataset):
    """Instruction/response dataset for next-token training.

    The corpus is a JSONL file. Only JSON objects that contain both an
    ``instruction`` and a ``response`` key are kept. Each item is returned as
    an ``(input_ids, target_ids)`` pair of ``torch.long`` tensors in which the
    target is the input shifted one position to the left.

    Optional, very light text augmentation can be applied on the fly to the
    instruction and the response independently.
    """

    def __init__(self, corpus_path, tokenizer, max_seq_len=512,
                 use_augmentation=False, augmentation_prob=0.3):
        """Load the corpus into memory.

        Args:
            corpus_path: Path to a UTF-8 JSONL file.
            tokenizer: Object exposing ``encode(text)``, ``bos_id()`` and
                ``eos_id()``.
            max_seq_len: Maximum number of tokens per example, counting the
                BOS and EOS markers. Must be at least 2.
            use_augmentation: Enable random text augmentation.
            augmentation_prob: Probability that a given text is considered
                for augmentation.

        Raises:
            ValueError: If ``max_seq_len`` is smaller than 2, because the
                truncation logic needs room for both BOS and EOS.
        """
        if max_seq_len < 2:
            raise ValueError(
                f"max_seq_len must be >= 2 to hold BOS and EOS, got {max_seq_len}"
            )
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.use_augmentation = use_augmentation
        self.augmentation_prob = augmentation_prob
        self.data = []

        with open(corpus_path, 'r', encoding='utf-8') as f:
            for line in f:
                # Blank lines (e.g. a trailing newline) are not valid JSON;
                # skip them instead of aborting the whole load.
                if not line.strip():
                    continue
                entry = json.loads(line)
                # Only JSON objects can carry the two required keys.
                if (isinstance(entry, dict)
                        and 'instruction' in entry and 'response' in entry):
                    self.data.append(entry)

        print(f"✓ Loaded {len(self.data)} examples from corpus")
        if use_augmentation:
            print(f"✓ Data augmentation enabled (prob={augmentation_prob})")

    def __len__(self):
        """Return the number of loaded examples."""
        return len(self.data)

    def augment_text(self, text):
        """Return ``text``, possibly with small random formatting changes.

        Nothing is changed when augmentation is disabled or when the random
        draw exceeds ``augmentation_prob``. Otherwise two independent
        variations may be applied: trimming surrounding whitespace, and
        toggling a final period.
        """
        if not self.use_augmentation or random.random() > self.augmentation_prob:
            return text

        # Variation 1: trim leading/trailing whitespace.
        if random.random() < 0.3:
            text = text.strip()

        # Variation 2: toggle the final period. Text that already ends in
        # '!' or '?' is left untouched.
        if random.random() < 0.2:
            if text.endswith('.'):
                text = text[:-1]
            elif not text.endswith(('!', '?')):
                text = text + '.'
        return text

    def __getitem__(self, idx):
        """Build the ``(input_ids, target_ids)`` pair for example ``idx``."""
        entry = self.data[idx]
        instruction = self.augment_text(entry['instruction'])
        response = self.augment_text(entry['response'])

        # Prompt template; the section headers are part of the training text.
        full_text = f"### Instrucción:\n{instruction}\n\n### Respuesta:\n{response}"

        bos = self.tokenizer.bos_id()
        eos = self.tokenizer.eos_id()
        tokens = [bos] + self.tokenizer.encode(full_text) + [eos]

        # Truncate to max_seq_len while keeping BOS first and EOS last.
        if len(tokens) > self.max_seq_len:
            tokens = tokens[:self.max_seq_len - 1] + [eos]

        # Shift by one: the model sees tokens[:-1] and predicts tokens[1:].
        input_ids = torch.tensor(tokens[:-1], dtype=torch.long)
        target_ids = torch.tensor(tokens[1:], dtype=torch.long)
        return input_ids, target_ids
def collate_fn(batch, pad_id=0):
    """Pad a batch of ``(input_ids, target_ids)`` pairs to a common length.

    Args:
        batch: Non-empty sequence of items whose first two elements are 1-D
            integer tensors (input ids and target ids).
        pad_id: Value used to right-pad both inputs and targets.

    Returns:
        A tuple ``(inputs, targets)`` of ``torch.long`` tensors, each of shape
        ``(len(batch), max_len)`` where ``max_len`` is the longest input in
        the batch.

    Raises:
        ValueError: If ``batch`` is empty.
    """
    if not batch:
        raise ValueError("collate_fn received an empty batch")

    max_len = max(item[0].size(0) for item in batch)
    shape = (len(batch), max_len)

    # Allocate the padded outputs once and copy each sequence into its row,
    # rather than building and concatenating a pad tensor per sample.
    inputs = torch.full(shape, pad_id, dtype=torch.long)
    targets = torch.full(shape, pad_id, dtype=torch.long)
    for row, item in enumerate(batch):
        inp, tgt = item[0], item[1]
        inputs[row, :inp.size(0)] = inp
        targets[row, :tgt.size(0)] = tgt

    # NOTE(review): targets are padded with pad_id as well; presumably the
    # training loss ignores that index -- confirm against the training loop.
    return inputs, targets

tokenizer.py ADDED Viewed

	@@ -0,0 +1,138 @@

+import sentencepiece as spm
+import os
+import json
class MTPTokenizer:
    """Thin wrapper around a SentencePiece BPE model.

    The wrapped processor is stored in ``self.sp`` and is ``None`` until a
    model has been loaded or trained.
    """

    def __init__(self, model_path=None):
        """Create the tokenizer, loading ``model_path`` if that file exists.

        A missing file is not an error: the tokenizer simply stays unloaded
        so that ``train`` can be called afterwards.
        """
        self.sp = None
        self.model_path = model_path
        if model_path and os.path.exists(model_path):
            self.load(model_path)

    def _require_loaded(self):
        """Return the SentencePiece processor or raise if none is loaded."""
        if self.sp is None:
            raise ValueError("Tokenizer not loaded. Train or load a model first.")
        return self.sp

    def train(self, corpus_path, vocab_size=4000, model_prefix='mtp_tokenizer'):
        """Train a SentencePiece BPE model on a JSONL corpus and load it.

        Args:
            corpus_path: UTF-8 JSONL file; the ``instruction`` and
                ``response`` values of every line are used as training text.
            vocab_size: Requested vocabulary size. The size actually used is
                capped at 15% of the corpus character count.
            model_prefix: Output prefix; the model is written to
                ``<model_prefix>.model``.

        Raises:
            ValueError: If the corpus contains no usable text.
            RuntimeError: Propagated from SentencePiece when training fails
                for a reason other than a recoverable "vocabulary size too
                high" error.
        """
        import re
        import tempfile

        # Collect the raw training text from the JSONL corpus.
        texts = []
        with open(corpus_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue  # blank lines are not valid JSON
                data = json.loads(line)
                if 'instruction' in data:
                    texts.append(data['instruction'])
                if 'response' in data:
                    texts.append(data['response'])

        if not texts:
            raise ValueError(
                f"No 'instruction'/'response' text found in {corpus_path}"
            )

        # Heuristic cap: at most ~15% of the corpus character count.
        total_chars = sum(len(text) for text in texts)
        max_vocab = min(vocab_size, int(total_chars * 0.15))
        print(f"   → Corpus stats: {len(texts)} texts, {total_chars} characters")
        print(f"   → Adjusted vocab size: {max_vocab} (requested: {vocab_size})")

        # Options shared by the first attempt and the retry.
        train_kwargs = dict(
            model_prefix=model_prefix,
            model_type='bpe',
            pad_id=0,
            unk_id=1,
            bos_id=2,
            eos_id=3,
            character_coverage=1.0,
            normalization_rule_name='identity',
            num_threads=4,
            split_digits=True,
            allow_whitespace_only_pieces=False,
            byte_fallback=False,
            max_sentencepiece_length=16,
        )

        # Use a unique temporary file so concurrent runs cannot clobber each
        # other, and always delete it, even when training fails.
        tmp = tempfile.NamedTemporaryFile(
            'w', encoding='utf-8', suffix='.txt', delete=False
        )
        try:
            with tmp:
                tmp.write('\n'.join(texts))
            try:
                spm.SentencePieceTrainer.train(
                    input=tmp.name, vocab_size=max_vocab, **train_kwargs
                )
            except RuntimeError as e:
                # SentencePiece reports the largest feasible vocabulary in
                # its error message; retry once with that value.
                match = None
                if "Vocabulary size too high" in str(e):
                    match = re.search(r'value <= (\d+)', str(e))
                if match is None:
                    raise
                suggested_max = int(match.group(1))
                print(f"   → Retrying with vocab size: {suggested_max}")
                spm.SentencePieceTrainer.train(
                    input=tmp.name, vocab_size=suggested_max, **train_kwargs
                )
        finally:
            os.remove(tmp.name)

        # Load the freshly trained model.
        self.model_path = f"{model_prefix}.model"
        self.load(self.model_path)
        print(f"✓ Tokenizer trained: {self.vocab_size()} tokens")
        print(f"✓ Model saved: {self.model_path}")

    def load(self, model_path):
        """Load a trained SentencePiece model from ``model_path``."""
        self.sp = spm.SentencePieceProcessor()
        self.sp.load(model_path)
        self.model_path = model_path

    def encode(self, text):
        """Encode ``text`` into a list of token ids.

        Raises:
            ValueError: If no model is loaded.
        """
        return self._require_loaded().encode_as_ids(text)

    def decode(self, ids):
        """Decode a sequence of token ids back into text.

        Raises:
            ValueError: If no model is loaded.
        """
        return self._require_loaded().decode_ids(ids)

    def vocab_size(self):
        """Return the vocabulary size, or 0 when no model is loaded."""
        if self.sp is None:
            return 0
        return self.sp.get_piece_size()

    def bos_id(self):
        """Return the beginning-of-sentence token id."""
        return self._require_loaded().bos_id()

    def eos_id(self):
        """Return the end-of-sentence token id."""
        return self._require_loaded().eos_id()

    def pad_id(self):
        """Return the padding token id."""
        return self._require_loaded().pad_id()

    def unk_id(self):
        """Return the unknown-token id."""
        return self._require_loaded().unk_id()