teszenofficial committed on
Commit
f6f958e
·
verified ·
1 Parent(s): 72003ee

Upload 2 files

Browse files
Files changed (2) hide show
  1. dataset.py +98 -0
  2. tokenizer.py +138 -0
dataset.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch.utils.data import Dataset
3
+ import json
4
+ import random
5
+
6
+
7
class MTPDataset(Dataset):
    """Instruction/response dataset with optional lightweight text augmentation.

    The corpus is a JSONL file; each kept line must be a JSON object with
    'instruction' and 'response' keys. Other objects are ignored.
    """

    def __init__(self, corpus_path, tokenizer, max_seq_len=512,
                 use_augmentation=False, augmentation_prob=0.3):
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.use_augmentation = use_augmentation
        self.augmentation_prob = augmentation_prob
        self.data = []

        # Load corpus (JSONL). Blank lines are skipped so a trailing
        # newline or empty line does not crash json.loads.
        with open(corpus_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                entry = json.loads(line)
                if 'instruction' in entry and 'response' in entry:
                    self.data.append(entry)

        print(f"✓ Loaded {len(self.data)} examples from corpus")
        if use_augmentation:
            print(f"✓ Data augmentation enabled (prob={augmentation_prob})")

    def __len__(self):
        return len(self.data)

    def augment_text(self, text):
        """Randomly apply simple formatting variations to ``text``.

        Returns ``text`` unchanged unless augmentation is enabled and the
        per-example augmentation probability fires.
        """
        if not self.use_augmentation or random.random() > self.augmentation_prob:
            return text

        # Variation 1: normalize surrounding whitespace (simulates
        # formatting differences between corpus examples).
        if random.random() < 0.3:
            text = text.strip()

        # Variation 2: toggle the final punctuation mark.
        if random.random() < 0.2:
            if text.endswith('.'):
                text = text[:-1]
            elif not text.endswith(('.', '!', '?')):
                text = text + '.'

        return text

    def __getitem__(self, idx):
        """Return ``(input_ids, target_ids)`` shifted by one for LM training."""
        entry = self.data[idx]

        # Apply (optional) augmentation to both sides of the example.
        instruction = self.augment_text(entry['instruction'])
        response = self.augment_text(entry['response'])

        # Prompt template used across the project (kept verbatim).
        full_text = f"### Instrucción:\n{instruction}\n\n### Respuesta:\n{response}"

        # Tokenize, then add BOS and EOS markers.
        tokens = self.tokenizer.encode(full_text)
        tokens = [self.tokenizer.bos_id()] + tokens + [self.tokenizer.eos_id()]

        # Truncate over-long sequences while keeping BOS at the start and
        # guaranteeing EOS at the end; result is exactly max_seq_len tokens.
        if len(tokens) > self.max_seq_len:
            tokens = [tokens[0]] + tokens[1:self.max_seq_len - 1] + [self.tokenizer.eos_id()]

        # Next-token objective: inputs are tokens[:-1], targets tokens[1:].
        input_ids = torch.tensor(tokens[:-1], dtype=torch.long)
        target_ids = torch.tensor(tokens[1:], dtype=torch.long)

        return input_ids, target_ids
79
+
80
+
81
def collate_fn(batch, pad_id=0):
    """Collate ``(input_ids, target_ids)`` pairs, right-padding to the batch max.

    Args:
        batch: list of ``(input_ids, target_ids)`` pairs of 1-D LongTensors.
        pad_id: token id used to pad both inputs and targets.

    Returns:
        Two LongTensors of shape ``(len(batch), max_len)``.
    """
    input_ids = [item[0] for item in batch]
    target_ids = [item[1] for item in batch]

    # pad_sequence right-pads every sequence to the batch maximum in one
    # C-level call, replacing the manual torch.cat/torch.full loop.
    input_ids_padded = torch.nn.utils.rnn.pad_sequence(
        input_ids, batch_first=True, padding_value=pad_id)
    target_ids_padded = torch.nn.utils.rnn.pad_sequence(
        target_ids, batch_first=True, padding_value=pad_id)

    # NOTE(review): targets are padded with pad_id, so the training loss
    # should use ignore_index=pad_id (or a mask) — confirm in the trainer.
    return input_ids_padded, target_ids_padded
tokenizer.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sentencepiece as spm
2
+ import os
3
+ import json
4
+
5
+
6
class MTPTokenizer:
    """Tokenizer using SentencePiece BPE.

    Special token ids are fixed at training time: pad=0, unk=1, bos=2, eos=3.
    """

    def __init__(self, model_path=None):
        """Create the tokenizer, loading ``model_path`` if it exists on disk."""
        self.sp = None
        self.model_path = model_path

        if model_path and os.path.exists(model_path):
            self.load(model_path)

    def _run_spm_training(self, input_file, model_prefix, vocab_size):
        """Invoke SentencePiece training with the project's fixed settings.

        Extracted so the "vocab too high" retry path reuses the exact same
        parameters instead of duplicating the whole call.
        """
        spm.SentencePieceTrainer.train(
            input=input_file,
            model_prefix=model_prefix,
            vocab_size=vocab_size,
            model_type='bpe',
            pad_id=0,
            unk_id=1,
            bos_id=2,
            eos_id=3,
            character_coverage=1.0,
            normalization_rule_name='identity',
            num_threads=4,
            split_digits=True,
            allow_whitespace_only_pieces=False,
            byte_fallback=False,
            max_sentencepiece_length=16
        )

    def train(self, corpus_path, vocab_size=4000, model_prefix='mtp_tokenizer'):
        """Train a SentencePiece BPE tokenizer on a JSONL corpus.

        Args:
            corpus_path: JSONL file whose objects may hold 'instruction'
                and/or 'response' text fields.
            vocab_size: requested vocabulary size (may be reduced for
                small corpora, or on SentencePiece's advice).
            model_prefix: prefix for the emitted .model/.vocab files.
        """
        # Extract instruction/response text from the JSONL corpus.
        texts = []
        with open(corpus_path, 'r', encoding='utf-8') as f:
            for line in f:
                data = json.loads(line)
                if 'instruction' in data:
                    texts.append(data['instruction'])
                if 'response' in data:
                    texts.append(data['response'])

        # SentencePiece trains from a plain-text file, one sentence per line.
        temp_file = 'temp_corpus.txt'
        with open(temp_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(texts))

        # Cap the vocabulary for tiny corpora (heuristic: ~15% of chars).
        total_chars = sum(len(text) for text in texts)
        max_vocab = min(vocab_size, int(total_chars * 0.15))

        print(f" → Corpus stats: {len(texts)} texts, {total_chars} characters")
        print(f" → Adjusted vocab size: {max_vocab} (requested: {vocab_size})")

        try:
            try:
                self._run_spm_training(temp_file, model_prefix, max_vocab)
            except RuntimeError as e:
                # SentencePiece reports the largest feasible vocab size in
                # its error message; parse it and retry once with that value.
                if "Vocabulary size too high" not in str(e):
                    raise
                import re
                match = re.search(r'value <= (\d+)', str(e))
                if not match:
                    raise
                suggested_max = int(match.group(1))
                print(f" → Retrying with vocab size: {suggested_max}")
                self._run_spm_training(temp_file, model_prefix, suggested_max)
        finally:
            # Clean up the temp corpus even when training raises (the
            # previous version leaked it on failure).
            if os.path.exists(temp_file):
                os.remove(temp_file)

        # Load the freshly trained model.
        self.model_path = f"{model_prefix}.model"
        self.load(self.model_path)

        print(f"✓ Tokenizer trained: {self.vocab_size()} tokens")
        print(f"✓ Model saved: {self.model_path}")

    def load(self, model_path):
        """Load a trained SentencePiece model from ``model_path``."""
        self.sp = spm.SentencePieceProcessor()
        self.sp.load(model_path)
        self.model_path = model_path

    def encode(self, text):
        """Encode ``text`` into a list of token IDs.

        Raises:
            ValueError: if no model has been trained or loaded yet.
        """
        if self.sp is None:
            raise ValueError("Tokenizer not loaded. Train or load a model first.")
        return self.sp.encode_as_ids(text)

    def decode(self, ids):
        """Decode a list of token IDs back into text.

        Raises:
            ValueError: if no model has been trained or loaded yet.
        """
        if self.sp is None:
            raise ValueError("Tokenizer not loaded. Train or load a model first.")
        return self.sp.decode_ids(ids)

    def vocab_size(self):
        """Return the vocabulary size, or 0 when no model is loaded."""
        if self.sp is None:
            return 0
        return self.sp.get_piece_size()

    def bos_id(self):
        """Beginning-of-sentence token ID (trained as 2)."""
        return self.sp.bos_id()

    def eos_id(self):
        """End-of-sentence token ID (trained as 3)."""
        return self.sp.eos_id()

    def pad_id(self):
        """Padding token ID (trained as 0)."""
        return self.sp.pad_id()

    def unk_id(self):
        """Unknown token ID (trained as 1)."""
        return self.sp.unk_id()