"""
Script para analisar métricas de treinamento do TensorBoard e gerar relatório.

Uso:
    python analyze_training_metrics.py [diretório_com_eventos]

Exemplo:
    python analyze_training_metrics.py ./results
    python analyze_training_metrics.py .
"""
|
|
| import os |
| import sys |
| import json |
| import glob |
| from pathlib import Path |
| from typing import Dict, List, Optional |
| from datetime import datetime |
|
|
# Optional dependency: the script degrades gracefully when TensorBoard is
# missing (metric analysis is skipped, config extraction still works).
try:
    from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
except ImportError:
    TENSORBOARD_AVAILABLE = False
    print("⚠️ TensorBoard não instalado. Instalando...")
    print("Execute: pip install tensorboard")
else:
    TENSORBOARD_AVAILABLE = True
|
|
|
|
def find_event_files(directory: str) -> List[str]:
    """Find TensorBoard event files in *directory*, newest first.

    Returns:
        Paths matching ``events.out.tfevents.*`` sorted by modification
        time in descending order (most recently written file first).
    """
    matches = glob.glob(os.path.join(directory, "events.out.tfevents.*"))
    matches.sort(key=os.path.getmtime, reverse=True)
    return matches
|
|
|
|
def _extract_json_block(text: str, marker: str) -> Optional[str]:
    """Return the first balanced ``{...}`` substring found after *marker*.

    Scans *text* for *marker*, then for the next ``{``, and walks forward
    counting braces until they balance.  Returns None when the marker, the
    opening brace, or a balanced closing brace cannot be found.
    """
    start_idx = text.find(marker)
    if start_idx == -1:
        return None
    json_start = text.find('{', start_idx)
    if json_start == -1:
        return None
    brace_count = 0
    for i in range(json_start, len(text)):
        ch = text[i]
        if ch == '{':
            brace_count += 1
        elif ch == '}':
            brace_count -= 1
            if brace_count == 0:
                return text[json_start:i + 1]
    # Braces never balanced: treat as "not found", same as the original scan.
    return None


def extract_config_from_event_file(filepath: str) -> Dict:
    """Extract the training configuration from an event file.

    TensorBoard event files are binary, but the text summaries logged by the
    trainer ("args/text_summary", "model_config/text_summary") embed JSON
    blobs that survive a lossy UTF-8 decode, so we scrape them directly.

    Args:
        filepath: Path to an ``events.out.tfevents.*`` file.

    Returns:
        Dict with optional keys 'training_args' and 'model_config'; empty
        on read or parse failure (errors are printed, never raised).
    """
    config = {}
    try:
        # Keep the file handle scope minimal; decode once with errors
        # ignored so the binary framing bytes don't abort the scrape.
        with open(filepath, 'rb') as f:
            text_content = f.read().decode('utf-8', errors='ignore')

        # Same scan for both summaries — previously duplicated inline.
        for key, marker, label in (
            ('training_args', 'args/text_summary', 'args'),
            ('model_config', 'model_config/text_summary', 'model_config'),
        ):
            raw_json = _extract_json_block(text_content, marker)
            if raw_json is None:
                continue
            try:
                config[key] = json.loads(raw_json)
            except json.JSONDecodeError as e:
                print(f"⚠️ Erro ao parsear {label}: {e}")
    except Exception as e:
        print(f"⚠️ Erro ao ler arquivo {filepath}: {e}")

    return config
|
|
|
|
def analyze_metrics_with_tensorboard(directory: str) -> Dict:
    """Collect scalar metrics from event files using EventAccumulator.

    Args:
        directory: Directory containing TensorBoard event files.

    Returns:
        Dict with 'tags' (raw EventAccumulator tag map) and 'scalars'
        (per-tag summary: first/latest values and steps, min/max, full
        series).  Empty dict when TensorBoard is unavailable or fails.
    """
    if not TENSORBOARD_AVAILABLE:
        return {}

    try:
        accumulator = EventAccumulator(directory)
        accumulator.Reload()
        tags = accumulator.Tags()

        metrics: Dict = {'scalars': {}, 'tags': tags}

        for tag in tags.get('scalars', []):
            events = accumulator.Scalars(tag)
            if not events:
                continue
            values = [e.value for e in events]
            steps = [e.step for e in events]
            metrics['scalars'][tag] = {
                'count': len(events),
                'latest': values[-1],
                'latest_step': steps[-1],
                'first': values[0],
                'first_step': steps[0],
                'min': min(values),
                'max': max(values),
                'all_values': values,
                'all_steps': steps,
            }

        return metrics
    except Exception as e:
        print(f"⚠️ Erro ao analisar com TensorBoard: {e}")
        return {}
|
|
|
|
def _loss_stats(data: Dict) -> Dict:
    """Summarize one loss series produced by analyze_metrics_with_tensorboard().

    Explicit ``is not None`` checks are used throughout: a loss value of
    exactly 0.0 is falsy but perfectly valid, and the original truthiness
    tests would have reported its improvement as missing.
    """
    first = data['first']
    latest = data['latest']
    have_both = first is not None and latest is not None
    return {
        'current': latest,
        'initial': first,
        'improvement': (first - latest) if have_both else None,
        'improvement_pct': ((first - latest) / first * 100) if have_both and first > 0 else None,
        'min': data['min'],
        'max': data['max'],
        'steps': data['count'],
    }


def calculate_training_stats(metrics: Dict) -> Dict:
    """Compute useful statistics about the training run.

    Args:
        metrics: Dict as returned by analyze_metrics_with_tensorboard().

    Returns:
        Dict with optional keys 'train_loss', 'eval_loss',
        'learning_rate' and 'overfitting' (each absent when the
        corresponding scalar series was not logged).
    """
    stats: Dict = {}
    scalars = metrics.get('scalars', {})

    if 'train/loss' in scalars:
        stats['train_loss'] = _loss_stats(scalars['train/loss'])

    if 'eval/loss' in scalars:
        stats['eval_loss'] = _loss_stats(scalars['eval/loss'])

    if 'train/learning_rate' in scalars:
        lr = scalars['train/learning_rate']
        stats['learning_rate'] = {
            'current': lr['latest'],
            'initial': lr['first'],
            'final': lr['latest'],
            'steps': lr['count'],
        }

    # Overfitting heuristic: eval-vs-train gap of the latest values.
    # A gap above 20% of the train loss flags overfitting; a negative gap
    # (eval below train) is labeled underfitting.
    if 'train/loss' in scalars and 'eval/loss' in scalars:
        train_latest = scalars['train/loss']['latest']
        eval_latest = scalars['eval/loss']['latest']
        # None checks, not truthiness: a latest loss of 0.0 is valid.
        if train_latest is not None and eval_latest is not None:
            gap = eval_latest - train_latest
            stats['overfitting'] = {
                'gap': gap,
                'gap_pct': (gap / train_latest * 100) if train_latest > 0 else None,
                'status': 'overfitting' if gap > train_latest * 0.2 else 'ok' if gap > 0 else 'underfitting',
            }

    return stats
|
|
|
|
def _fmt(value, spec: str = '.6f') -> str:
    """Format *value* with format-spec *spec*, or return 'N/A' when None.

    Uses an explicit ``is not None`` check so legitimate zero values
    (e.g. a loss of exactly 0.0) are formatted instead of shown as 'N/A'.
    """
    return format(value, spec) if value is not None else 'N/A'


def generate_report(config: Dict, metrics: Dict, stats: Dict, output_file: Optional[str] = None) -> str:
    """Generate a formatted text report of the training metrics.

    Args:
        config: Dict possibly containing 'model_config' and 'training_args'
            (as extracted by extract_config_from_event_file()).
        metrics: Dict as returned by analyze_metrics_with_tensorboard().
        stats: Dict as returned by calculate_training_stats().
        output_file: Optional path; when given, the report is also written
            there (UTF-8) and a confirmation line is printed.

    Returns:
        The full report as a single newline-joined string.
    """
    report = []
    report.append("=" * 80)
    report.append("RELATÓRIO DE MÉTRICAS DE TREINAMENTO")
    report.append("=" * 80)
    report.append(f"Gerado em: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    report.append("")

    # --- Model configuration ------------------------------------------------
    model_config = config.get('model_config', {})
    if model_config:
        report.append("🤖 CONFIGURAÇÃO DO MODELO")
        report.append("-" * 80)
        # `or ['N/A']` also guards a *present but empty* architectures list,
        # which would make `get(..., ['N/A'])[0]` raise IndexError.
        architectures = model_config.get('architectures') or ['N/A']
        report.append(f"   Arquitetura: {architectures[0]}")
        report.append(f"   Tipo: {model_config.get('model_type', 'N/A')}")
        report.append(f"   Dimensões Ocultas: {model_config.get('hidden_size', 'N/A')}")
        report.append(f"   Camadas: {model_config.get('num_hidden_layers', 'N/A')}")
        report.append(f"   Cabeças de Atenção: {model_config.get('num_attention_heads', 'N/A')}")
        report.append(f"   Contexto Máximo: {model_config.get('max_position_embeddings', 'N/A')} tokens")
        report.append(f"   Vocabulário: {model_config.get('vocab_size', 'N/A')} tokens")

        quant = model_config.get('quantization_config', {})
        if quant:
            report.append(f"   Quantização: {'4-bit NF4' if quant.get('load_in_4bit') else '8-bit' if quant.get('load_in_8bit') else 'Nenhuma'}")
        report.append("")

    # --- Training arguments -------------------------------------------------
    training_args = config.get('training_args', {})
    if training_args:
        report.append("⚙️ CONFIGURAÇÃO DE TREINAMENTO")
        report.append("-" * 80)
        report.append(f"   Learning Rate: {training_args.get('learning_rate', 'N/A')}")
        report.append(f"   Batch Size: {training_args.get('per_device_train_batch_size', 'N/A')} × {training_args.get('gradient_accumulation_steps', 1)} = {training_args.get('per_device_train_batch_size', 0) * training_args.get('gradient_accumulation_steps', 1)}")
        report.append(f"   Épocas: {training_args.get('num_train_epochs', 'N/A')}")
        report.append(f"   Warmup Steps: {training_args.get('warmup_steps', 'N/A')}")
        report.append(f"   Logging Steps: {training_args.get('logging_steps', 'N/A')}")
        report.append(f"   Eval Steps: {training_args.get('eval_steps', 'N/A')}")
        report.append(f"   Save Steps: {training_args.get('save_steps', 'N/A')}")
        report.append(f"   FP16: {'✅' if training_args.get('fp16') else '❌'}")
        report.append(f"   Push to Hub: {'✅' if training_args.get('push_to_hub') else '❌'}")
        if training_args.get('hub_model_id'):
            report.append(f"   Hub Model ID: {training_args.get('hub_model_id')}")
        report.append("")

    # --- Training statistics ------------------------------------------------
    if stats:
        report.append("📊 ESTATÍSTICAS DE TREINAMENTO")
        report.append("-" * 80)

        if 'train_loss' in stats:
            tl = stats['train_loss']
            report.append("   📉 Loss de Treinamento:")
            report.append(f"      • Atual: {_fmt(tl['current'])}")
            report.append(f"      • Inicial: {_fmt(tl['initial'])}")
            # _fmt on the percentage too: improvement can exist while the
            # percentage is None (non-positive initial loss).
            if tl.get('improvement') is not None:
                report.append(f"      • Melhoria: {_fmt(tl['improvement'])} ({_fmt(tl['improvement_pct'], '.2f')}%)")
            report.append(f"      • Mínimo: {_fmt(tl['min'])}")
            report.append(f"      • Máximo: {_fmt(tl['max'])}")
            report.append(f"      • Steps registrados: {tl['steps']}")
            report.append("")

        if 'eval_loss' in stats:
            el = stats['eval_loss']
            report.append("   📉 Loss de Validação:")
            report.append(f"      • Atual: {_fmt(el['current'])}")
            report.append(f"      • Inicial: {_fmt(el['initial'])}")
            if el.get('improvement') is not None:
                report.append(f"      • Melhoria: {_fmt(el['improvement'])} ({_fmt(el['improvement_pct'], '.2f')}%)")
            report.append(f"      • Mínimo: {_fmt(el['min'])}")
            report.append(f"      • Steps registrados: {el['steps']}")
            report.append("")

        if 'learning_rate' in stats:
            lr = stats['learning_rate']
            report.append("   📈 Learning Rate:")
            report.append(f"      • Atual: {_fmt(lr['current'], '.2e')}")
            report.append(f"      • Inicial: {_fmt(lr['initial'], '.2e')}")
            report.append("")

        if 'overfitting' in stats:
            of = stats['overfitting']
            report.append("   ⚠️ Análise de Overfitting:")
            # is-not-None check: a gap of exactly 0.0 is a real measurement.
            if of['gap'] is not None:
                report.append(f"      • Gap (eval - train): {of['gap']:.6f}")
            else:
                report.append("      • Gap: N/A")
            if of.get('gap_pct') is not None:
                report.append(f"      • Gap percentual: {of['gap_pct']:.2f}%")
            report.append(f"      • Status: {of['status'].upper()}")
            if of['status'] == 'overfitting':
                report.append("      • ⚠️ ATENÇÃO: Modelo pode estar overfitting!")
                report.append("      • Considere: early stopping, mais dados, ou regularização")
            elif of['status'] == 'underfitting':
                report.append("      • ⚠️ ATENÇÃO: Modelo pode estar underfitting!")
                report.append("      • Considere: mais épocas, modelo maior, ou learning rate maior")
            report.append("")

    # --- Available metrics (latest value per scalar tag) ----------------------
    tags = metrics.get('tags', {})
    if tags.get('scalars'):
        report.append("📋 MÉTRICAS DISPONÍVEIS")
        report.append("-" * 80)
        for tag in sorted(tags['scalars']):
            scalar = metrics['scalars'].get(tag, {})
            latest = scalar.get('latest')
            if latest is not None:
                report.append(f"   • {tag}: {latest:.6f} (step {scalar.get('latest_step', 'N/A')})")
        report.append("")

    # --- Recommendations ------------------------------------------------------
    report.append("💡 RECOMENDAÇÕES")
    report.append("-" * 80)

    if not stats:
        report.append("   ⚠️ Nenhuma métrica de treinamento encontrada ainda.")
        report.append("   • Aguarde o treinamento avançar alguns steps")
        report.append("   • Os logs são salvos a cada step (logging_steps=1)")
    else:
        if 'train_loss' in stats and stats['train_loss'].get('current') is not None:
            current_loss = stats['train_loss']['current']
            if current_loss > 2.0:
                report.append("   • Loss ainda alto (>2.0). Treinamento pode precisar mais tempo.")
            elif current_loss < 0.5:
                report.append("   • Loss baixo (<0.5). Modelo pode estar convergindo bem.")

        if 'overfitting' in stats:
            if stats['overfitting']['status'] == 'overfitting':
                report.append("   • ⚠️ Detecção de overfitting. Considere:")
                report.append("     - Early stopping")
                report.append("     - Aumentar dataset de treinamento")
                report.append("     - Aumentar dropout ou regularização")
            elif stats['overfitting']['status'] == 'underfitting':
                report.append("   • ⚠️ Possível underfitting. Considere:")
                report.append("     - Mais épocas de treinamento")
                report.append("     - Aumentar learning rate")
                report.append("     - Modelo com mais parâmetros")

    report.append("")
    report.append("=" * 80)

    report_text = "\n".join(report)

    if output_file:
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(report_text)
        print(f"✅ Relatório salvo em: {output_file}")

    return report_text
|
|
|
|
def main():
    """Entry point: locate event files, analyze them and emit the reports.

    Reads an optional directory from argv (default '.'), exits with code 1
    when the directory or event files are missing, and writes both a text
    report and a JSON summary into that directory.
    """
    directory = sys.argv[1] if len(sys.argv) > 1 else "."

    if not os.path.isdir(directory):
        print(f"❌ Erro: Diretório '{directory}' não encontrado")
        sys.exit(1)

    print(f"🔍 Analisando métricas em: {os.path.abspath(directory)}")

    event_files = find_event_files(directory)
    if not event_files:
        print(f"⚠️ Nenhum arquivo de eventos encontrado em '{directory}'")
        print("   Procure por arquivos: events.out.tfevents.*")
        sys.exit(1)

    print(f"📄 Encontrados {len(event_files)} arquivo(s) de eventos")

    # Config is scraped from the newest event file only.
    config = extract_config_from_event_file(event_files[0])

    if TENSORBOARD_AVAILABLE:
        print("📊 Analisando métricas com TensorBoard...")
        metrics = analyze_metrics_with_tensorboard(directory)
    else:
        print("⚠️ TensorBoard não disponível. Instale com: pip install tensorboard")
        metrics = {}

    stats = calculate_training_stats(metrics)

    report_path = os.path.join(directory, "training_metrics_report.txt")
    report = generate_report(config, metrics, stats, report_path)
    print("\n" + report)

    # Compact per-tag summary for the machine-readable output (the full
    # series stays only in the text report pipeline).
    scalar_summary = {
        tag: {
            'latest': data.get('latest'),
            'latest_step': data.get('latest_step'),
            'count': data.get('count'),
        }
        for tag, data in metrics.get('scalars', {}).items()
    }
    json_output = {
        'config': config,
        'metrics': {'scalars': scalar_summary},
        'stats': stats,
        'generated_at': datetime.now().isoformat(),
    }

    json_path = os.path.join(directory, "training_metrics.json")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(json_output, f, indent=2, ensure_ascii=False)
    print(f"✅ Métricas em JSON salvas em: {json_path}")
|
|
|
|
if __name__ == "__main__":
    # Run only when executed as a script, not on import.
    main()
|
|
|
|