amarorn committed on
Commit
6889ee6
·
1 Parent(s): 212ba47

feat: adicionar logging de resultados de treinamento em logs/

Browse files

- Criar diretório logs/ no Dockerfile
- Salvar resultados completos em JSON (logs/training_results_*.json)
- Gerar resumo legível em texto (logs/training_summary_*.txt)
- Incluir métricas finais, configuração, e histórico de logs
- Formato estruturado para fácil avaliação dos resultados

Files changed (2) hide show
  1. Dockerfile +3 -0
  2. train.py +125 -2
Dockerfile CHANGED
@@ -25,6 +25,9 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
25
  COPY train.py /app/train.py
26
  COPY app.py /app/app.py
27
 
 
 
 
28
  # Configurar variáveis de ambiente padrão (podem ser sobrescritas)
29
  ENV MODEL_NAME=microsoft/Phi-3-mini-4k-instruct
30
  ENV DATASET_REPO=beAnalytic/eda-training-dataset
 
25
  COPY train.py /app/train.py
26
  COPY app.py /app/app.py
27
 
28
+ # Criar diretório de logs
29
+ RUN mkdir -p /app/logs
30
+
31
  # Configurar variáveis de ambiente padrão (podem ser sobrescritas)
32
  ENV MODEL_NAME=microsoft/Phi-3-mini-4k-instruct
33
  ENV DATASET_REPO=beAnalytic/eda-training-dataset
train.py CHANGED
@@ -15,9 +15,11 @@ from transformers import (
15
  from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
16
  from transformers import BitsAndBytesConfig
17
  from huggingface_hub import login as hf_login, logout as hf_logout
18
- from huggingface_hub import login as hf_login
19
  import torch
20
  import os
 
 
 
21
 
22
  # Configuração (pode ser sobrescrita por variáveis de ambiente)
23
  MODEL_NAME = os.getenv("MODEL_NAME", "microsoft/Phi-3-mini-4k-instruct")
@@ -362,9 +364,130 @@ except Exception as e:
362
  else:
363
  raise
364
 
 
 
 
 
365
  # Treinar
366
  print("Iniciando treinamento...")
367
- trainer.train()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
368
 
369
  # Fazer push final apenas se autenticado
370
  if push_to_hub_enabled:
 
15
  from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
16
  from transformers import BitsAndBytesConfig
17
  from huggingface_hub import login as hf_login, logout as hf_logout
 
18
  import torch
19
  import os
20
+ import json
21
+ from datetime import datetime
22
+ from pathlib import Path
23
 
24
  # Configuração (pode ser sobrescrita por variáveis de ambiente)
25
  MODEL_NAME = os.getenv("MODEL_NAME", "microsoft/Phi-3-mini-4k-instruct")
 
364
  else:
365
  raise
366
 
367
+ # Criar diretório de logs
368
+ logs_dir = Path("./logs")
369
+ logs_dir.mkdir(exist_ok=True)
370
+
371
  # Treinar
372
  print("Iniciando treinamento...")
373
+ train_output = trainer.train()
374
+
375
+ # Coletar métricas finais do estado do trainer
376
+ state = trainer.state
377
+ final_log_history = state.log_history if hasattr(state, 'log_history') and state.log_history else []
378
+
379
+ # Tentar obter loss final de diferentes fontes
380
+ final_train_loss = None
381
+ if hasattr(train_output, 'training_loss'):
382
+ final_train_loss = train_output.training_loss
383
+ elif final_log_history:
384
+ for log_entry in reversed(final_log_history):
385
+ if 'loss' in log_entry and 'eval_loss' not in log_entry:
386
+ final_train_loss = log_entry.get('loss')
387
+ break
388
+
389
+ # Buscar últimas métricas de validação
390
+ last_eval_metrics = {}
391
+ if final_log_history:
392
+ for log_entry in reversed(final_log_history):
393
+ if 'eval_loss' in log_entry:
394
+ last_eval_metrics = {k: v for k, v in log_entry.items() if k.startswith('eval_')}
395
+ break
396
+
397
+ # Coletar informações do treinamento
398
+ training_info = {
399
+ "timestamp": datetime.utcnow().isoformat() + "Z",
400
+ "model_name": MODEL_NAME,
401
+ "dataset_repo": DATASET_REPO,
402
+ "output_repo": OUTPUT_REPO,
403
+ "training_config": {
404
+ "num_train_epochs": training_args.num_train_epochs,
405
+ "per_device_train_batch_size": training_args.per_device_train_batch_size,
406
+ "per_device_eval_batch_size": training_args.per_device_eval_batch_size,
407
+ "gradient_accumulation_steps": training_args.gradient_accumulation_steps,
408
+ "learning_rate": training_args.learning_rate,
409
+ "warmup_steps": training_args.warmup_steps,
410
+ "fp16": training_args.fp16,
411
+ },
412
+ "dataset_info": {
413
+ "train_samples": len(train_dataset),
414
+ "eval_samples": len(eval_dataset) if eval_dataset else 0,
415
+ },
416
+ "training_results": {
417
+ "final_train_loss": final_train_loss,
418
+ "final_eval_metrics": last_eval_metrics,
419
+ "total_steps": len(final_log_history) if final_log_history else 0,
420
+ "log_history": final_log_history[-50:], # Últimas 50 entradas
421
+ },
422
+ "status": "completed",
423
+ "push_to_hub_enabled": push_to_hub_enabled,
424
+ }
425
+
426
+ # Salvar resultados em JSON
427
+ results_file = logs_dir / f"training_results_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}.json"
428
+ with open(results_file, 'w', encoding='utf-8') as f:
429
+ json.dump(training_info, f, indent=2, ensure_ascii=False)
430
+ print(f"✅ Resultados salvos em: {results_file}")
431
+
432
+ # Criar resumo em texto legível
433
+ summary_file = logs_dir / f"training_summary_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}.txt"
434
+ with open(summary_file, 'w', encoding='utf-8') as f:
435
+ f.write("=" * 80 + "\n")
436
+ f.write("RESUMO DO TREINAMENTO\n")
437
+ f.write("=" * 80 + "\n\n")
438
+ f.write(f"Data/Hora: {training_info['timestamp']}\n")
439
+ f.write(f"Modelo: {MODEL_NAME}\n")
440
+ f.write(f"Dataset: {DATASET_REPO}\n")
441
+ f.write(f"Output: {OUTPUT_REPO}\n\n")
442
+
443
+ f.write("CONFIGURAÇÃO DE TREINAMENTO:\n")
444
+ f.write("-" * 80 + "\n")
445
+ config = training_info['training_config']
446
+ f.write(f"Épocas: {config['num_train_epochs']}\n")
447
+ f.write(f"Batch Size (train): {config['per_device_train_batch_size']}\n")
448
+ f.write(f"Batch Size (eval): {config['per_device_eval_batch_size']}\n")
449
+ f.write(f"Gradient Accumulation Steps: {config['gradient_accumulation_steps']}\n")
450
+ f.write(f"Learning Rate: {config['learning_rate']}\n")
451
+ f.write(f"Warmup Steps: {config['warmup_steps']}\n")
452
+ f.write(f"FP16: {config['fp16']}\n\n")
453
+
454
+ f.write("DATASET:\n")
455
+ f.write("-" * 80 + "\n")
456
+ dataset_info = training_info['dataset_info']
457
+ f.write(f"Amostras de Treino: {dataset_info['train_samples']}\n")
458
+ f.write(f"Amostras de Validação: {dataset_info['eval_samples']}\n\n")
459
+
460
+ f.write("RESULTADOS:\n")
461
+ f.write("-" * 80 + "\n")
462
+ results = training_info['training_results']
463
+ if results['final_train_loss'] is not None:
464
+ f.write(f"Loss Final (Treino): {results['final_train_loss']:.6f}\n")
465
+
466
+ if results['final_eval_metrics']:
467
+ f.write("\nMétricas Finais de Validação:\n")
468
+ for key, value in results['final_eval_metrics'].items():
469
+ if isinstance(value, float):
470
+ f.write(f" {key}: {value:.6f}\n")
471
+ else:
472
+ f.write(f" {key}: {value}\n")
473
+
474
+ f.write(f"\nTotal de Steps: {results['total_steps']}\n")
475
+ f.write(f"Status: {training_info['status']}\n")
476
+ f.write(f"Push para Hub: {'Sim' if training_info['push_to_hub_enabled'] else 'Não'}\n")
477
+
478
+ if results['log_history']:
479
+ f.write("\n" + "=" * 80 + "\n")
480
+ f.write("ÚLTIMAS MÉTRICAS DO LOG:\n")
481
+ f.write("=" * 80 + "\n")
482
+ for i, log_entry in enumerate(results['log_history'][-10:], 1):
483
+ f.write(f"\nLog Entry {i}:\n")
484
+ for key, value in log_entry.items():
485
+ if isinstance(value, float):
486
+ f.write(f" {key}: {value:.6f}\n")
487
+ else:
488
+ f.write(f" {key}: {value}\n")
489
+
490
+ print(f"✅ Resumo salvo em: {summary_file}")
491
 
492
  # Fazer push final apenas se autenticado
493
  if push_to_hub_enabled: