#!/usr/bin/env python3
"""
Extract all benchmark results from GlycanAA logs, SweetTalk result CSVs and
the BERTv4 publication summary.
Creates comprehensive comparison CSV files.
"""

import os
import re
import glob
import pandas as pd
from collections import defaultdict

# Define tasks
TASKS = ['domain', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus',
         'species', 'immunogenicity', 'link']
DATASETS = ['strict', 'strict_3']

# Directory holding the SweetTalk result CSVs for each dataset (relative to
# the current working directory).
_SWEETTALK_RESULT_DIRS = {
    'strict': 'sweettalk/results',
    'strict_3': 'sweettalk/results_strict_3',
}

# Columns that identify one benchmark combination in the summary table.
_SUMMARY_KEYS = ['dataset', 'model', 'task']

# Optionally signed decimal number with an optional exponent. The sign matters:
# the Matthews correlation coefficient can be negative.
_NUMBER_PATTERN = r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?'


def _parse_glycanaa_filename(basename):
    """Split a GlycanAA log file name into its components.

    Expected shape: ``AA_{dataset}_{model}_{task}_{jobid}.out``, e.g.
    ``AA_strict_3_GearNet-Edge_species_9209924.out`` or
    ``AA_strict_GearNet_species_9209017.out``.

    Returns:
        A ``(dataset, model, task, job_id)`` tuple, or ``None`` when the name
        does not follow the expected shape or names an unknown task.
    """
    stem = basename[:-len('.out')] if basename.endswith('.out') else basename
    parts = stem.split('_')
    if len(parts) < 4 or parts[1] != 'strict':
        return None

    # "strict_3" is split into two parts by the underscore separator.
    if parts[2] == '3':
        dataset, model_start = 'strict_3', 3
    else:
        dataset, model_start = 'strict', 2

    job_id = parts[-1]
    # Everything between the dataset and the job id: "{model}_{task}".
    remaining = '_'.join(parts[model_start:-1])

    for task in TASKS:
        if remaining == task:
            # No model component in the file name.
            return dataset, '', task, job_id
        # Require the separator so that e.g. "GearNetclass" is not mistaken
        # for model "GearNe" + task "class".
        if remaining.endswith('_' + task):
            return dataset, remaining[:-(len(task) + 1)], task, job_id
    return None


def _last_metric(content, metric, task):
    """Return the last ``"<metric> [<task>]: <number>"`` value in *content*.

    The last occurrence is used because the metrics of interest are the ones
    printed at the end of the log; earlier occurrences of the same line (if
    any) are ignored. Returns ``None`` when the metric is never logged.
    """
    pattern = r'{} \[{}\]: ({})'.format(
        re.escape(metric), re.escape(task), _NUMBER_PATTERN)
    values = re.findall(pattern, content)
    return float(values[-1]) if values else None


def _summarize(df):
    """Reduce raw results to one row per (dataset, model, task).

    For every combination the single row with the highest accuracy is kept
    (ties go to the row that appears later), so ``accuracy`` and ``macro_f1``
    in the summary always come from the same run.

    Returns:
        DataFrame with columns dataset, model, task, accuracy, macro_f1,
        source, sorted by the three key columns.
    """
    ranked = df.assign(
        accuracy=pd.to_numeric(df['accuracy'], errors='coerce'),
        macro_f1=pd.to_numeric(df['macro_f1'], errors='coerce'),
    ).dropna(subset=_SUMMARY_KEYS)
    # Stable ascending sort with missing accuracies first: the last row of
    # each combination is then its best (and, on ties, latest) result.
    ranked = ranked.sort_values('accuracy', na_position='first', kind='mergesort')
    best = ranked.drop_duplicates(subset=_SUMMARY_KEYS, keep='last')
    columns = _SUMMARY_KEYS + ['accuracy', 'macro_f1', 'source']
    return best.sort_values(_SUMMARY_KEYS)[columns].reset_index(drop=True)


def extract_glycanaa_results(log_dir='GlycanAA/logs'):
    """Extract results from GlycanAA logs.

    Args:
        log_dir: Directory containing ``AA_*.out`` log files.

    Returns:
        A list of result dicts (keys: source, dataset, model, task, accuracy,
        macro_f1, mcc, auprc, job_id, log_file). Logs whose name cannot be
        parsed, that cannot be read, or that contain no accuracy line for
        their task are skipped. Optional metrics that are absent are ``None``.
    """
    results = []

    # Pattern: AA_{dataset}_{model}_{task}_{jobid}.out
    # Sorted because glob returns files in arbitrary order; sorting keeps the
    # output reproducible.
    log_files = sorted(glob.glob(os.path.join(log_dir, 'AA_*.out')))
    print(f"Found {len(log_files)} GlycanAA log files")

    for log_file in log_files:
        basename = os.path.basename(log_file)
        parsed = _parse_glycanaa_filename(basename)
        if parsed is None:
            continue
        dataset, model, task, job_id = parsed

        try:
            # errors='replace': stray non-UTF-8 bytes in a log must not stop
            # us from reading the (ASCII) metric lines.
            with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
                content = f.read()
        except OSError as e:
            print(f"Error reading {log_file}: {e}")
            continue

        # Format: "accuracy [species]: 0.394995"
        accuracy = _last_metric(content, 'accuracy', task)
        if accuracy is None:
            # No accuracy logged for this task: nothing to report.
            continue

        results.append({
            'source': 'GlycanAA',
            'dataset': dataset,
            'model': model,
            'task': task,
            'accuracy': accuracy,
            'macro_f1': _last_metric(content, 'macrof1', task),
            'mcc': _last_metric(content, 'matthews correlation coefficient', task),
            'auprc': _last_metric(content, 'auprc', task),
            'job_id': job_id,
            'log_file': basename,
        })

    return results


def extract_sweettalk_results():
    """Extract results from SweetTalk result CSVs.

    For each dataset and task, reads ``<result_dir>/<task>/results_<task>.csv``
    and uses its first row.

    Returns:
        A list of result dicts with the same keys as the GlycanAA results;
        ``mcc``, ``auprc`` and ``job_id`` are always ``None``.
    """
    results = []

    for dataset, result_dir in _SWEETTALK_RESULT_DIRS.items():
        if not os.path.exists(result_dir):
            print(f"SweetTalk {dataset} results directory not found: {result_dir}")
            continue

        for task in TASKS:
            csv_path = os.path.join(result_dir, task, f'results_{task}.csv')
            if not os.path.exists(csv_path):
                continue

            try:
                df = pd.read_csv(csv_path)
            except Exception as e:
                # Best effort: report the unreadable file and keep going.
                print(f"Error reading {csv_path}: {e}")
                continue

            if df.empty:
                continue

            row = df.iloc[0]
            results.append({
                'source': 'SweetTalk',
                'dataset': dataset,
                'model': 'SweetTalk',
                'task': task,
                'accuracy': row.get('accuracy'),
                'macro_f1': row.get('macro_f1'),
                'mcc': None,
                'auprc': None,
                'job_id': None,
                'log_file': csv_path,
            })

    return results


def extract_our_results():
    """Extract our BERTv4 results from publication summary.

    Reads ``downstream_tasks/results_publication_summary/<dataset>_results.csv``
    for each dataset. The columns ``Task``, ``Accuracy`` and ``Macro-F1`` are
    required; ``MCC`` and ``AUPRC`` are optional.

    Returns:
        A list of result dicts with the same keys as the GlycanAA results.
        A file that cannot be read or lacks a required column is reported and
        contributes no rows.
    """
    results = []

    for dataset in DATASETS:
        csv_path = f'downstream_tasks/results_publication_summary/{dataset}_results.csv'
        if not os.path.exists(csv_path):
            continue

        try:
            df = pd.read_csv(csv_path)
            rows = [
                {
                    'source': 'BERTv4',
                    'dataset': dataset,
                    'model': 'BERTv4-Topology',
                    'task': row['Task'],
                    'accuracy': row['Accuracy'],
                    'macro_f1': row['Macro-F1'],
                    'mcc': row.get('MCC'),
                    'auprc': row.get('AUPRC'),
                    'job_id': None,
                    'log_file': csv_path,
                }
                for _, row in df.iterrows()
            ]
        except Exception as e:
            # Best effort: report the problem (e.g. a missing column) and
            # continue with the next dataset.
            print(f"Error reading {csv_path}: {e}")
            continue

        results.extend(rows)

    return results


def main():
    """Collect results from all sources and write the comparison CSV files.

    Writes, into the current working directory:
      * ``all_benchmark_results_raw.csv`` - every extracted result row
      * ``all_benchmark_results_summary.csv`` - best row per combination
      * ``benchmark_comparison_<dataset>.csv`` - model x task accuracy table
        for each dataset that has results
    """
    print("="*60)
    print("EXTRACTING ALL BENCHMARK RESULTS")
    print("="*60)

    all_results = []

    # Extract from each source
    print("\n1. Extracting GlycanAA results...")
    glycanaa_results = extract_glycanaa_results()
    all_results.extend(glycanaa_results)
    print(f" Found {len(glycanaa_results)} GlycanAA results")

    print("\n2. Extracting SweetTalk results...")
    sweettalk_results = extract_sweettalk_results()
    all_results.extend(sweettalk_results)
    print(f" Found {len(sweettalk_results)} SweetTalk results")

    print("\n3. Extracting BERTv4 results...")
    our_results = extract_our_results()
    all_results.extend(our_results)
    print(f" Found {len(our_results)} BERTv4 results")

    # Convert to DataFrame
    df = pd.DataFrame(all_results)

    if df.empty:
        print("\nNo results found!")
        return

    # Save raw results
    df.to_csv('all_benchmark_results_raw.csv', index=False)
    print(f"\nSaved raw results to all_benchmark_results_raw.csv ({len(df)} rows)")

    # One row per (dataset, model, task): the run with the best accuracy.
    summary = _summarize(df)

    summary.to_csv('all_benchmark_results_summary.csv', index=False)
    print(f"Saved summary to all_benchmark_results_summary.csv ({len(summary)} rows)")

    # Create pivot tables for easy comparison
    for dataset in DATASETS:
        subset = summary[summary['dataset'] == dataset]
        if subset.empty:
            continue

        # Pivot: rows=models, columns=tasks
        pivot = subset.pivot(index='model', columns='task', values='accuracy')

        # Reorder columns to the canonical task order
        cols = [t for t in TASKS if t in pivot.columns]
        pivot = pivot[cols]

        pivot.to_csv(f'benchmark_comparison_{dataset}.csv')
        print(f"\nSaved {dataset} comparison to benchmark_comparison_{dataset}.csv")
        print(pivot.to_string())

    # Print summary counts
    print("\n" + "="*60)
    print("RESULTS SUMMARY")
    print("="*60)

    for dataset in DATASETS:
        print(f"\n{dataset.upper()}:")
        subset = summary[summary['dataset'] == dataset]
        models = subset['model'].unique()
        print(f" Models: {len(models)}")
        for model in sorted(models):
            tasks_complete = len(subset[subset['model'] == model])
            print(f" - {model}: {tasks_complete}/{len(TASKS)} tasks")


if __name__ == '__main__':
    main()