Spaces:
Runtime error
Runtime error
feat: adicionar logging de resultados de treinamento em logs/
Browse files
- Criar diretório logs/ no Dockerfile
- Salvar resultados completos em JSON (logs/training_results_*.json)
- Gerar resumo legível em texto (logs/training_summary_*.txt)
- Incluir métricas finais, configuração, e histórico de logs
- Formato estruturado para fácil avaliação dos resultados
- Dockerfile +3 -0
- train.py +125 -2
Dockerfile
CHANGED
|
@@ -25,6 +25,9 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
|
|
| 25 |
COPY train.py /app/train.py
|
| 26 |
COPY app.py /app/app.py
|
| 27 |
|
|
|
|
|
|
|
|
|
|
| 28 |
# Configurar variáveis de ambiente padrão (podem ser sobrescritas)
|
| 29 |
ENV MODEL_NAME=microsoft/Phi-3-mini-4k-instruct
|
| 30 |
ENV DATASET_REPO=beAnalytic/eda-training-dataset
|
|
|
|
| 25 |
COPY train.py /app/train.py
|
| 26 |
COPY app.py /app/app.py
|
| 27 |
|
| 28 |
+
# Criar diretório de logs
|
| 29 |
+
RUN mkdir -p /app/logs
|
| 30 |
+
|
| 31 |
# Configurar variáveis de ambiente padrão (podem ser sobrescritas)
|
| 32 |
ENV MODEL_NAME=microsoft/Phi-3-mini-4k-instruct
|
| 33 |
ENV DATASET_REPO=beAnalytic/eda-training-dataset
|
train.py
CHANGED
|
@@ -15,9 +15,11 @@ from transformers import (
|
|
| 15 |
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
|
| 16 |
from transformers import BitsAndBytesConfig
|
| 17 |
from huggingface_hub import login as hf_login, logout as hf_logout
|
| 18 |
-
from huggingface_hub import login as hf_login
|
| 19 |
import torch
|
| 20 |
import os
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
# Configuração (pode ser sobrescrita por variáveis de ambiente)
|
| 23 |
MODEL_NAME = os.getenv("MODEL_NAME", "microsoft/Phi-3-mini-4k-instruct")
|
|
@@ -362,9 +364,130 @@ except Exception as e:
|
|
| 362 |
else:
|
| 363 |
raise
|
| 364 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
# Treinar
|
| 366 |
print("Iniciando treinamento...")
|
| 367 |
-
trainer.train()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 368 |
|
| 369 |
# Fazer push final apenas se autenticado
|
| 370 |
if push_to_hub_enabled:
|
|
|
|
| 15 |
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
|
| 16 |
from transformers import BitsAndBytesConfig
|
| 17 |
from huggingface_hub import login as hf_login, logout as hf_logout
|
|
|
|
| 18 |
import torch
|
| 19 |
import os
|
| 20 |
+
import json
|
| 21 |
+
from datetime import datetime
|
| 22 |
+
from pathlib import Path
|
| 23 |
|
| 24 |
# Configuração (pode ser sobrescrita por variáveis de ambiente)
|
| 25 |
MODEL_NAME = os.getenv("MODEL_NAME", "microsoft/Phi-3-mini-4k-instruct")
|
|
|
|
| 364 |
else:
|
| 365 |
raise
|
| 366 |
|
| 367 |
+
# Create the directory that receives the training result files.
logs_dir = Path("./logs")
logs_dir.mkdir(parents=True, exist_ok=True)

# Train
print("Iniciando treinamento...")
train_output = trainer.train()

# Read the clock exactly once so the "timestamp" field and both file names
# always agree; separate reads could straddle a second boundary.
# datetime.utcnow() is deprecated since Python 3.12, hence the aware variant.
from datetime import timezone  # local import: only this section needs it

finished_at = datetime.now(timezone.utc)
run_stamp = finished_at.strftime('%Y%m%d_%H%M%S')

# Collect the final metrics from the trainer state.
state = trainer.state
final_log_history = list(getattr(state, 'log_history', None) or [])

# Prefer the loss reported by train(); fall back to the most recent training
# log entry (one that has 'loss' but is not an evaluation entry).
final_train_loss = getattr(train_output, 'training_loss', None)
if final_train_loss is None:
    final_train_loss = next(
        (
            entry['loss']
            for entry in reversed(final_log_history)
            if 'loss' in entry and 'eval_loss' not in entry
        ),
        None,
    )

# Most recent evaluation entry, reduced to its 'eval_*' keys.
last_eval_entry = next(
    (entry for entry in reversed(final_log_history) if 'eval_loss' in entry),
    {},
)
last_eval_metrics = {
    k: v for k, v in last_eval_entry.items() if k.startswith('eval_')
}

# Everything worth keeping about this run, in a JSON-friendly structure.
training_info = {
    # Naive ISO form plus "Z" keeps the exact timestamp format used before.
    "timestamp": finished_at.replace(tzinfo=None).isoformat() + "Z",
    "model_name": MODEL_NAME,
    "dataset_repo": DATASET_REPO,
    "output_repo": OUTPUT_REPO,
    "training_config": {
        "num_train_epochs": training_args.num_train_epochs,
        "per_device_train_batch_size": training_args.per_device_train_batch_size,
        "per_device_eval_batch_size": training_args.per_device_eval_batch_size,
        "gradient_accumulation_steps": training_args.gradient_accumulation_steps,
        "learning_rate": training_args.learning_rate,
        "warmup_steps": training_args.warmup_steps,
        "fp16": training_args.fp16,
    },
    "dataset_info": {
        "train_samples": len(train_dataset),
        "eval_samples": len(eval_dataset) if eval_dataset is not None else 0,
    },
    "training_results": {
        "final_train_loss": final_train_loss,
        "final_eval_metrics": last_eval_metrics,
        # The number of log entries is not the number of optimizer steps;
        # the trainer state tracks the real step counter.
        "total_steps": getattr(state, 'global_step', 0),
        "log_entries": len(final_log_history),
        "log_history": final_log_history[-50:],  # last 50 entries
    },
    "status": "completed",
    "push_to_hub_enabled": push_to_hub_enabled,
}


def _format_metric(key, value):
    """Return one summary line for a metric; floats get six decimals."""
    if isinstance(value, float):
        return f" {key}: {value:.6f}\n"
    return f" {key}: {value}\n"


results_file = logs_dir / f"training_results_{run_stamp}.json"
summary_file = logs_dir / f"training_summary_{run_stamp}.txt"

# Writing the logs is auxiliary: a failure here must not discard a finished
# training run, so it is reported and the script carries on.
try:
    # Serialize first so a serialization error never leaves a truncated file.
    results_payload = json.dumps(training_info, indent=2, ensure_ascii=False)
    results_file.write_text(results_payload, encoding='utf-8')
    print(f"✅ Resultados salvos em: {results_file}")

    # Build the human-readable summary.
    config = training_info['training_config']
    dataset_info = training_info['dataset_info']
    results = training_info['training_results']

    summary_lines = [
        "=" * 80 + "\n",
        "RESUMO DO TREINAMENTO\n",
        "=" * 80 + "\n\n",
        f"Data/Hora: {training_info['timestamp']}\n",
        f"Modelo: {MODEL_NAME}\n",
        f"Dataset: {DATASET_REPO}\n",
        f"Output: {OUTPUT_REPO}\n\n",
        "CONFIGURAÇÃO DE TREINAMENTO:\n",
        "-" * 80 + "\n",
        f"Épocas: {config['num_train_epochs']}\n",
        f"Batch Size (train): {config['per_device_train_batch_size']}\n",
        f"Batch Size (eval): {config['per_device_eval_batch_size']}\n",
        f"Gradient Accumulation Steps: {config['gradient_accumulation_steps']}\n",
        f"Learning Rate: {config['learning_rate']}\n",
        f"Warmup Steps: {config['warmup_steps']}\n",
        f"FP16: {config['fp16']}\n\n",
        "DATASET:\n",
        "-" * 80 + "\n",
        f"Amostras de Treino: {dataset_info['train_samples']}\n",
        f"Amostras de Validação: {dataset_info['eval_samples']}\n\n",
        "RESULTADOS:\n",
        "-" * 80 + "\n",
    ]

    if results['final_train_loss'] is not None:
        summary_lines.append(
            f"Loss Final (Treino): {results['final_train_loss']:.6f}\n"
        )

    if results['final_eval_metrics']:
        summary_lines.append("\nMétricas Finais de Validação:\n")
        summary_lines.extend(
            _format_metric(key, value)
            for key, value in results['final_eval_metrics'].items()
        )

    summary_lines.append(f"\nTotal de Steps: {results['total_steps']}\n")
    summary_lines.append(f"Status: {training_info['status']}\n")
    summary_lines.append(
        f"Push para Hub: {'Sim' if training_info['push_to_hub_enabled'] else 'Não'}\n"
    )

    if results['log_history']:
        summary_lines.append("\n" + "=" * 80 + "\n")
        summary_lines.append("ÚLTIMAS MÉTRICAS DO LOG:\n")
        summary_lines.append("=" * 80 + "\n")
        # Only the ten most recent entries go into the readable summary.
        for i, log_entry in enumerate(results['log_history'][-10:], 1):
            summary_lines.append(f"\nLog Entry {i}:\n")
            summary_lines.extend(
                _format_metric(key, value) for key, value in log_entry.items()
            )

    summary_file.write_text("".join(summary_lines), encoding='utf-8')
    print(f"✅ Resumo salvo em: {summary_file}")
except (OSError, TypeError, ValueError) as log_error:
    print(f"⚠️ Não foi possível salvar os logs de treinamento: {log_error}")
|
| 491 |
|
| 492 |
# Fazer push final apenas se autenticado
|
| 493 |
if push_to_hub_enabled:
|