amarorn committed on
Commit
22dfad5
·
1 Parent(s): a6fbb35

fix: corrigir crash de treinamento e adicionar salvamento de resultados

Browse files

- Remover quantização 4-bit que causa erro de GPU RAM
- Adicionar salvamento de resultados em logs/ (JSON e TXT)
- Reduzir batch_size para 2 e aumentar gradient_accumulation para 4
- Adicionar tratamento de erros durante treinamento
- Adicionar variáveis de ambiente OMP_NUM_THREADS e PYTORCH_CUDA_ALLOC_CONF
- Usar LoRA apenas (sem quantização) para reduzir memória

Files changed (1) hide show
  1. train.py +167 -19
train.py CHANGED
@@ -12,9 +12,12 @@ from transformers import (
12
  Trainer,
13
  DataCollatorForLanguageModeling,
14
  )
15
- from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
16
- from transformers import BitsAndBytesConfig
17
  import torch
 
 
 
 
18
 
19
  # Configuração
20
  MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
@@ -25,28 +28,32 @@ OUTPUT_REPO = "beAnalytic/eda-llm-model"
25
  print(f"Carregando dataset: {DATASET_REPO}")
26
  dataset = load_dataset(DATASET_REPO)
27
 
 
 
 
 
28
  # Carregar modelo e tokenizer
29
  print(f"Carregando modelo: {MODEL_NAME}")
30
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
31
  tokenizer.pad_token = tokenizer.eos_token
32
 
33
- # Configurar quantização 4-bit
34
- bnb_config = BitsAndBytesConfig(
35
- load_in_4bit=True,
36
- bnb_4bit_compute_dtype=torch.float16,
37
- bnb_4bit_quant_type="nf4",
38
- bnb_4bit_use_double_quant=True,
39
- )
40
 
 
 
 
41
  model = AutoModelForCausalLM.from_pretrained(
42
  MODEL_NAME,
43
- quantization_config=bnb_config,
44
- device_map="auto",
45
  trust_remote_code=True,
 
46
  )
47
 
48
- # Preparar modelo para LoRA
49
- model = prepare_model_for_kbit_training(model)
50
 
51
  # Configurar LoRA
52
  peft_config = LoraConfig(
@@ -119,12 +126,16 @@ def tokenize_function(examples):
119
  train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
120
  eval_dataset = eval_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
121
 
 
 
 
 
122
  # Configurar argumentos de treinamento
123
  training_args = TrainingArguments(
124
  output_dir="./results",
125
  num_train_epochs=3,
126
- per_device_train_batch_size=4,
127
- per_device_eval_batch_size=4,
128
  learning_rate=3e-05,
129
  warmup_steps=100,
130
  logging_steps=10,
@@ -133,8 +144,9 @@ training_args = TrainingArguments(
133
  eval_steps=500,
134
  save_total_limit=3,
135
  load_best_model_at_end=True,
136
- fp16=True,
137
- gradient_accumulation_steps=2,
 
138
  push_to_hub=True,
139
  hub_model_id=OUTPUT_REPO,
140
  hub_strategy="checkpoint",
@@ -157,11 +169,147 @@ trainer = Trainer(
157
 
158
  # Treinar
159
  print("Iniciando treinamento...")
160
- trainer.train()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
  # Fazer push final
163
  print(f"Fazendo push do modelo final para {OUTPUT_REPO}")
164
- trainer.push_to_hub()
 
 
 
 
 
165
 
166
  print("✅ Treinamento concluído!")
167
 
 
12
  Trainer,
13
  DataCollatorForLanguageModeling,
14
  )
15
+ from peft import LoraConfig, get_peft_model
 
16
  import torch
17
+ import os
18
+ import json
19
+ from datetime import datetime
20
+ from pathlib import Path
21
 
22
  # Configuração
23
  MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
 
28
  print(f"Carregando dataset: {DATASET_REPO}")
29
  dataset = load_dataset(DATASET_REPO)
30
 
31
+ # Configurar variáveis de ambiente para evitar problemas de memória
32
+ os.environ["OMP_NUM_THREADS"] = "1"
33
+ os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
34
+
35
  # Carregar modelo e tokenizer
36
  print(f"Carregando modelo: {MODEL_NAME}")
37
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
38
  tokenizer.pad_token = tokenizer.eos_token
39
 
40
+ # Verificar se há GPU disponível
41
+ device = "cuda" if torch.cuda.is_available() else "cpu"
42
+ print(f"Usando dispositivo: {device}")
 
 
 
 
43
 
44
+ # Carregar modelo sem quantização (LoRA é suficiente para reduzir memória)
45
+ # Quantização 4-bit está causando problemas de GPU RAM no HuggingFace Space
46
+ print("Carregando modelo (sem quantização, usando LoRA para eficiência)...")
47
  model = AutoModelForCausalLM.from_pretrained(
48
  MODEL_NAME,
49
+ torch_dtype=torch.float16 if device == "cuda" else torch.float32,
50
+ device_map="auto" if device == "cuda" else None,
51
  trust_remote_code=True,
52
+ use_cache=False,
53
  )
54
 
55
+ if device == "cpu":
56
+ print("⚠️ Modelo carregado em CPU - treinamento será mais lento")
57
 
58
  # Configurar LoRA
59
  peft_config = LoraConfig(
 
126
  train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
127
  eval_dataset = eval_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
128
 
129
+ # Criar diretório de logs
130
+ logs_dir = Path("./logs")
131
+ logs_dir.mkdir(exist_ok=True)
132
+
133
  # Configurar argumentos de treinamento
134
  training_args = TrainingArguments(
135
  output_dir="./results",
136
  num_train_epochs=3,
137
+ per_device_train_batch_size=2,
138
+ per_device_eval_batch_size=2,
139
  learning_rate=3e-05,
140
  warmup_steps=100,
141
  logging_steps=10,
 
144
  eval_steps=500,
145
  save_total_limit=3,
146
  load_best_model_at_end=True,
147
+ fp16=device == "cuda",
148
+ gradient_accumulation_steps=4,
149
+ dataloader_pin_memory=False,
150
  push_to_hub=True,
151
  hub_model_id=OUTPUT_REPO,
152
  hub_strategy="checkpoint",
 
169
 
170
  # Treinar
171
  print("Iniciando treinamento...")
172
+ try:
173
+ train_output = trainer.train()
174
+ except Exception as e:
175
+ print(f"❌ Erro durante treinamento: {e}")
176
+ # Tentar salvar resultados mesmo em caso de erro
177
+ train_output = None
178
+
179
+ # Coletar estado atual se possível
180
+ try:
181
+ state = trainer.state
182
+ final_log_history = state.log_history if hasattr(state, 'log_history') and state.log_history else []
183
+ except:
184
+ final_log_history = []
185
+
186
+ # Salvar log de erro
187
+ error_info = {
188
+ "timestamp": datetime.utcnow().isoformat() + "Z",
189
+ "error": str(e),
190
+ "model_name": MODEL_NAME,
191
+ "dataset_repo": DATASET_REPO,
192
+ "status": "failed"
193
+ }
194
+
195
+ error_file = logs_dir / f"training_error_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}.json"
196
+ with open(error_file, 'w', encoding='utf-8') as f:
197
+ json.dump(error_info, f, indent=2, ensure_ascii=False)
198
+ print(f"✅ Informações de erro salvas em: {error_file}")
199
+
200
+ raise
201
+
202
+ # Coletar métricas finais do estado do trainer
203
+ state = trainer.state
204
+ final_log_history = state.log_history if hasattr(state, 'log_history') and state.log_history else []
205
+
206
+ # Tentar obter loss final de diferentes fontes
207
+ final_train_loss = None
208
+ if train_output and hasattr(train_output, 'training_loss'):
209
+ final_train_loss = train_output.training_loss
210
+ elif final_log_history:
211
+ for log_entry in reversed(final_log_history):
212
+ if 'loss' in log_entry and 'eval_loss' not in log_entry:
213
+ final_train_loss = log_entry.get('loss')
214
+ break
215
+
216
+ # Buscar últimas métricas de validação
217
+ last_eval_metrics = {}
218
+ if final_log_history:
219
+ for log_entry in reversed(final_log_history):
220
+ if 'eval_loss' in log_entry:
221
+ last_eval_metrics = {k: v for k, v in log_entry.items() if k.startswith('eval_')}
222
+ break
223
+
224
+ # Coletar informações do treinamento
225
+ training_info = {
226
+ "timestamp": datetime.utcnow().isoformat() + "Z",
227
+ "model_name": MODEL_NAME,
228
+ "dataset_repo": DATASET_REPO,
229
+ "output_repo": OUTPUT_REPO,
230
+ "training_config": {
231
+ "num_train_epochs": training_args.num_train_epochs,
232
+ "per_device_train_batch_size": training_args.per_device_train_batch_size,
233
+ "per_device_eval_batch_size": training_args.per_device_eval_batch_size,
234
+ "gradient_accumulation_steps": training_args.gradient_accumulation_steps,
235
+ "learning_rate": training_args.learning_rate,
236
+ "warmup_steps": training_args.warmup_steps,
237
+ "fp16": training_args.fp16,
238
+ },
239
+ "dataset_info": {
240
+ "train_samples": len(train_dataset),
241
+ "eval_samples": len(eval_dataset) if eval_dataset else 0,
242
+ },
243
+ "training_results": {
244
+ "final_train_loss": final_train_loss,
245
+ "final_eval_metrics": last_eval_metrics,
246
+ "total_steps": len(final_log_history) if final_log_history else 0,
247
+ "log_history": final_log_history[-50:],
248
+ },
249
+ "status": "completed",
250
+ }
251
+
252
+ # Salvar resultados em JSON
253
+ results_file = logs_dir / f"training_results_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}.json"
254
+ with open(results_file, 'w', encoding='utf-8') as f:
255
+ json.dump(training_info, f, indent=2, ensure_ascii=False)
256
+ print(f"✅ Resultados salvos em: {results_file}")
257
+
258
+ # Criar resumo em texto legível
259
+ summary_file = logs_dir / f"training_summary_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}.txt"
260
+ with open(summary_file, 'w', encoding='utf-8') as f:
261
+ f.write("=" * 80 + "\n")
262
+ f.write("RESUMO DO TREINAMENTO\n")
263
+ f.write("=" * 80 + "\n\n")
264
+ f.write(f"Data/Hora: {training_info['timestamp']}\n")
265
+ f.write(f"Modelo: {MODEL_NAME}\n")
266
+ f.write(f"Dataset: {DATASET_REPO}\n")
267
+ f.write(f"Output: {OUTPUT_REPO}\n\n")
268
+
269
+ f.write("CONFIGURAÇÃO DE TREINAMENTO:\n")
270
+ f.write("-" * 80 + "\n")
271
+ config = training_info['training_config']
272
+ f.write(f"Épocas: {config['num_train_epochs']}\n")
273
+ f.write(f"Batch Size (train): {config['per_device_train_batch_size']}\n")
274
+ f.write(f"Batch Size (eval): {config['per_device_eval_batch_size']}\n")
275
+ f.write(f"Gradient Accumulation Steps: {config['gradient_accumulation_steps']}\n")
276
+ f.write(f"Learning Rate: {config['learning_rate']}\n")
277
+ f.write(f"Warmup Steps: {config['warmup_steps']}\n")
278
+ f.write(f"FP16: {config['fp16']}\n\n")
279
+
280
+ f.write("DATASET:\n")
281
+ f.write("-" * 80 + "\n")
282
+ dataset_info = training_info['dataset_info']
283
+ f.write(f"Amostras de Treino: {dataset_info['train_samples']}\n")
284
+ f.write(f"Amostras de Validação: {dataset_info['eval_samples']}\n\n")
285
+
286
+ f.write("RESULTADOS:\n")
287
+ f.write("-" * 80 + "\n")
288
+ results = training_info['training_results']
289
+ if results['final_train_loss'] is not None:
290
+ f.write(f"Loss Final (Treino): {results['final_train_loss']:.6f}\n")
291
+
292
+ if results['final_eval_metrics']:
293
+ f.write("\nMétricas Finais de Validação:\n")
294
+ for key, value in results['final_eval_metrics'].items():
295
+ if isinstance(value, float):
296
+ f.write(f" {key}: {value:.6f}\n")
297
+ else:
298
+ f.write(f" {key}: {value}\n")
299
+
300
+ f.write(f"\nTotal de Steps: {results['total_steps']}\n")
301
+ f.write(f"Status: {training_info['status']}\n")
302
+
303
+ print(f"✅ Resumo salvo em: {summary_file}")
304
 
305
  # Fazer push final
306
  print(f"Fazendo push do modelo final para {OUTPUT_REPO}")
307
+ try:
308
+ trainer.push_to_hub()
309
+ print("✅ Push para Hub concluído!")
310
+ except Exception as e:
311
+ print(f"⚠️ Aviso: Erro ao fazer push para Hub: {e}")
312
+ print("Os checkpoints estão salvos localmente em ./results")
313
 
314
  print("✅ Treinamento concluído!")
315