#!/usr/bin/env python3
"""
Summarize evaluation results and generate tables

Usage:
    python summarize.py --results_dir RESULTS_DIR --output OUTPUT.md

Features:
- Summarize all metrics
- Separate Chinese and English + merged statistics
- Bold highest scores (bold lowest PER)
- Append to history records
- Generate visualization table images
"""

import argparse
import json
import math
import os
import sys
from collections import defaultdict
from datetime import datetime

import pandas as pd

# The non-interactive backend is selected BEFORE pyplot is imported so that no
# GUI backend is ever resolved on headless machines. matplotlib is treated as
# optional: without it the Markdown summary and history are still produced and
# only the PNG export is skipped.
try:
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
except ImportError:
    matplotlib = None
    plt = None

# Metric definitions
METRICS = {
    'songeval': ['Coherence', 'Musicality', 'Memorability', 'Clarity', 'Naturalness'],
    'audiobox': ['CE', 'CU', 'PC', 'PQ', 'Score'],
    'mulan_t': ['Mulan-T'],
    'per': ['PER'],
}
ALL_METRICS = METRICS['audiobox'] + METRICS['songeval'] + METRICS['mulan_t'] + METRICS['per']

# Two scores closer than this are considered tied for "best".
_BEST_TOLERANCE = 1e-9


def _warn(message):
    """Print a warning to stderr so it never pollutes the Markdown on stdout."""
    print(f"Warning: {message}", file=sys.stderr)


def _is_valid_number(value):
    """Return True if *value* is a usable metric value.

    Usable means an ``int`` or ``float`` that is not NaN. Booleans, strings,
    ``None`` and NaN are all treated as "missing".
    """
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return False
    return not (isinstance(value, float) and math.isnan(value))


def _image_path_for(output_path):
    """Return the PNG path that accompanies the Markdown file *output_path*.

    Only the final extension is swapped, so directory names containing
    ``.md`` are untouched. If the output itself already ends in ``.png`` an
    extra suffix is appended so the image never overwrites the Markdown file.
    """
    img_path = os.path.splitext(output_path)[0] + '.png'
    if os.path.abspath(img_path) == os.path.abspath(output_path):
        img_path = output_path + '.png'
    return img_path


def load_results(results_dir):
    """Load all results.

    Reads every ``*.json`` file (except those with ``_details`` in the name)
    from ``<results_dir>/<metric_type>/`` for each key of ``METRICS``.

    Args:
        results_dir: Directory containing one sub-directory per metric type.

    Returns:
        A ``defaultdict`` mapping model name -> {metric name: value}. The model
        name comes from the record's ``model`` field, falling back to the file
        name without ``.json``.

    Unreadable, malformed or unexpectedly shaped files are skipped with a
    warning on stderr instead of aborting the whole summary.
    """
    data = defaultdict(dict)
    for metric_type in METRICS:
        metric_dir = os.path.join(results_dir, metric_type)
        if not os.path.isdir(metric_dir):
            continue
        # Sorted so that, when two files describe the same model, which one
        # wins does not depend on the filesystem's listing order.
        for fname in sorted(os.listdir(metric_dir)):
            if not fname.endswith('.json') or '_details' in fname:
                continue
            path = os.path.join(metric_dir, fname)
            try:
                with open(path, encoding='utf-8') as fp:
                    rec = json.load(fp)
            except (OSError, ValueError) as exc:
                # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
                _warn(f"skipping {path}: {exc}")
                continue
            if not isinstance(rec, dict):
                _warn(f"skipping {path}: top-level JSON value is not an object")
                continue
            metrics = rec.get('metrics', {})
            if not isinstance(metrics, dict):
                _warn(f"skipping {path}: 'metrics' is not an object")
                continue
            model = rec.get('model')
            if not isinstance(model, str):
                model = fname[:-len('.json')]
            for key, value in metrics.items():
                data[model][key] = value
    return data


def merge_cn_en(data):
    """Merge Chinese and English results.

    For every base name that has a ``<base>_cn`` and/or ``<base>_en`` entry,
    each metric is averaged over the languages that provide a valid number
    for it (a metric present in only one language keeps that value).

    Args:
        data: Mapping of model name -> {metric name: value}.

    Returns:
        A dict mapping base model name -> {metric name: averaged value}.
    """
    base_models = {model[:-3] for model in data if model.endswith(('_cn', '_en'))}
    merged = {}
    for base in sorted(base_models):
        cn = data.get(f"{base}_cn", {})
        en = data.get(f"{base}_en", {})
        if not cn and not en:
            continue
        averaged = {}
        # dict.fromkeys gives the union of keys in a deterministic order.
        for key in dict.fromkeys([*cn, *en]):
            vals = [v for v in (cn.get(key), en.get(key)) if _is_valid_number(v)]
            if vals:
                averaged[key] = sum(vals) / len(vals)
        merged[base] = averaged
    return merged


def find_best(data, metric):
    """Find best value.

    Args:
        data: Mapping of model name -> {metric name: value}.
        metric: Metric name to inspect.

    Returns:
        The lowest value for ``'PER'``, the highest value for any other
        metric, or ``None`` when no model has a valid number for *metric*.
        Missing, non-numeric and NaN values are ignored.
    """
    vals = [
        value
        for value in (metrics.get(metric) for metrics in data.values())
        if _is_valid_number(value)
    ]
    if not vals:
        return None
    return min(vals) if metric == 'PER' else max(vals)


def generate_markdown_table(data, title="Results"):
    """Generate Markdown table.

    Args:
        data: Mapping of model name -> {metric name: value}.
        title: Text of the ``##`` heading placed above the table.

    Returns:
        The Markdown text (heading, blank line, header, separator and one row
        per model sorted by name), or ``""`` when *data* is empty. Best values
        per column are wrapped in ``**``; missing or invalid values are ``-``.
    """
    if not data:
        return ""

    # Find best values
    best = {m: find_best(data, m) for m in ALL_METRICS}

    # Table header
    lines = [f"## {title}", ""]
    header = "| Model | " + " | ".join(ALL_METRICS) + " |"
    sep = "| --- | " + " | ".join(["---"] * len(ALL_METRICS)) + " |"
    lines.extend([header, sep])

    # Data rows
    for model in sorted(data):
        row = [model]
        for m in ALL_METRICS:
            val = data[model].get(m)
            if not _is_valid_number(val):
                row.append("-")
                continue
            s = f"{val:.4f}"
            if best[m] is not None and abs(val - best[m]) < _BEST_TOLERANCE:
                s = f"**{s}**"
            row.append(s)
        lines.append("| " + " | ".join(row) + " |")

    return "\n".join(lines)


def save_table_image(data, output_path):
    """Generate table image.

    Renders the same table as the Markdown summary to a PNG stored next to
    *output_path* (same name, ``.png`` extension). Best values per column are
    drawn bold red.

    Args:
        data: Mapping of model name -> {metric name: value}.
        output_path: Path of the Markdown summary the image accompanies.

    Does nothing when *data* is empty; prints a warning and returns when
    matplotlib is not installed.
    """
    if not data:
        return
    if plt is None:
        _warn("matplotlib is not installed; skipping table image")
        return

    # Prepare DataFrame (invalid values become None so they render as "-")
    rows = []
    for model in sorted(data):
        row = {'Model': model}
        for m in ALL_METRICS:
            val = data[model].get(m)
            row[m] = val if _is_valid_number(val) else None
        rows.append(row)
    df = pd.DataFrame(rows)

    # Find best values; stored as (row index, table column index)
    best_indices = set()
    for col_idx, col in enumerate(ALL_METRICS):
        if col not in df.columns:
            continue
        numeric_series = pd.to_numeric(df[col], errors='coerce')
        if numeric_series.isnull().all():
            continue
        best_val = numeric_series.min() if col == 'PER' else numeric_series.max()
        for row_idx, val in enumerate(numeric_series):
            if pd.notna(val) and abs(val - best_val) < _BEST_TOLERANCE:
                # +1 because table column 0 is the model name
                best_indices.add((row_idx, col_idx + 1))

    # Prepare cell text
    cell_text = []
    for _, row in df.iterrows():
        row_text = [str(row['Model'])]
        for m in ALL_METRICS:
            val = row.get(m)
            row_text.append(f"{val:.4f}" if val is not None and pd.notna(val) else "-")
        cell_text.append(row_text)
    col_labels = ['Model'] + ALL_METRICS

    # Draw table
    num_rows, num_cols = len(df), len(df.columns)
    fig, ax = plt.subplots(figsize=(max(15, num_cols * 1.5), max(4, num_rows * 0.5 + 2)))
    try:
        ax.axis('off')
        table = ax.table(cellText=cell_text, colLabels=col_labels,
                         loc='center', cellLoc='center')
        table.auto_set_font_size(False)
        table.set_fontsize(9)
        table.scale(1.0, 1.5)

        # Styling: table row 0 is the header, data rows start at 1
        for (row, col), cell in table.get_celld().items():
            if row == 0:
                cell.set_text_props(weight='bold', color='white')
                cell.set_facecolor('#40466e')
            else:
                if (row - 1, col) in best_indices:
                    cell.set_text_props(weight='bold', color='#d62728')
                cell.set_facecolor('#f2f2f2' if (row - 1) % 2 == 0 else 'white')

        ax.set_title("Evaluation Summary", fontsize=14, fontweight='bold', pad=10)
        fig.tight_layout()

        img_path = _image_path_for(output_path)
        fig.savefig(img_path, dpi=200, bbox_inches='tight')
    finally:
        # Close this specific figure even if rendering/saving failed.
        plt.close(fig)
    print(f"Table image: {img_path}")


def append_history(results_dir, data):
    """Append history records.

    Appends one JSON line, ``{"timestamp": ..., "models": {...}}``, to
    ``<results_dir>/history.jsonl``.

    Args:
        results_dir: Directory in which ``history.jsonl`` lives.
        data: Mapping of model name -> {metric name: value} to record.
    """
    history_file = os.path.join(results_dir, "history.jsonl")
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    record = {
        "timestamp": timestamp,
        "models": dict(data),
    }

    with open(history_file, 'a', encoding='utf-8') as f:
        f.write(json.dumps(record, ensure_ascii=False) + '\n')
    print(f"History appended: {history_file}")


def main():
    """Command-line entry point: load, merge, record, and render results."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--results_dir", required=True)
    parser.add_argument("--output", default="summary.md")
    args = parser.parse_args()

    if not os.path.isdir(args.results_dir):
        parser.error(f"results directory not found: {args.results_dir}")

    data = load_results(args.results_dir)
    merged = merge_cn_en(data)

    # Merge into data
    # NOTE(review): a merged base name replaces an identically named model
    # loaded from disk - confirm this precedence is intended.
    all_data = dict(data)
    all_data.update(merged)
    if not all_data:
        _warn(f"no results found under {args.results_dir}")

    # Append history records
    append_history(args.results_dir, all_data)

    # Generate Markdown table
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    output = [
        "# Baseline Evaluation Results Summary",
        f"\n**Update Time**: {timestamp}\n",
        generate_markdown_table(all_data, "All Results"),
        "",
    ]

    # Write to file
    with open(args.output, 'w', encoding='utf-8') as f:
        f.write("\n".join(output))
    print("\n" + "\n".join(output))
    print(f"\nSaved: {args.output}")

    # Generate table image
    save_table_image(all_data, args.output)


if __name__ == "__main__":
    main()