File size: 4,270 Bytes
16b9846
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import os
import json
import pandas as pd
from typing import Dict, List
import argparse

# Command-line interface. Parsed at import time so that `args` is available
# to the __main__ block at the bottom of the script.
_parser = argparse.ArgumentParser(description='Process model results')
_parser.add_argument('--modelname', type=str, help='Model name to use as prefix')
args = _parser.parse_args()

def find_result_files(root_dir: str) -> List[tuple]:
    """
    Locate results JSON files under *root_dir* together with their checkpoint label.

    Walks the directory tree, considering only directories whose path contains
    'result' (case-insensitive) and files named 'result*.json'. The checkpoint
    label is the first path component starting with 'checkpoint-'; files whose
    path has no such component are skipped.

    Returns:
        List[tuple]: (checkpoint_label, file_path) pairs, in os.walk order.
    """
    found = []

    for current_dir, _subdirs, filenames in os.walk(root_dir):
        if 'result' not in current_dir.lower():
            continue

        # The checkpoint label is shared by every file in this directory,
        # so resolve it once per directory.
        checkpoint = next(
            (part for part in current_dir.split(os.sep)
             if part.startswith('checkpoint-')),
            None,
        )
        if checkpoint is None:
            continue

        for name in filenames:
            if name.startswith('result') and name.endswith('.json'):
                found.append((checkpoint, os.path.join(current_dir, name)))

    return found

def extract_metrics(json_path: str) -> Dict[str, float]:
    """
    Extract specific accuracy metrics from a results JSON file and convert to percentages.

    Parameters:
        json_path (str): Path to a JSON file with a top-level "results" mapping
            of task name -> metrics dict containing an "acc,none" fraction
            (presumably lm-evaluation-harness output — verify with producer).

    Returns:
        Dict[str, float]: one key per target task with the accuracy as a
        percentage rounded to 3 decimal places (None when the task or its
        "acc,none" value is absent), plus an 'average' key over the non-None
        task values (None if no task had a value).
    """
    with open(json_path, 'r') as f:
        data = json.load(f)

    target_tasks = [
        'medmcqa',
        'medqa_4options',
        'mmlu_anatomy',
        'mmlu_clinical_knowledge',
        'mmlu_college_biology',
        'mmlu_college_medicine',
        'mmlu_medical_genetics',
        'mmlu_professional_medicine',
        'pubmedqa'
    ]

    results = data.get('results', {})
    metrics: Dict[str, float] = {}
    for task in target_tasks:
        # Fix: always emit the key — previously tasks absent from `results`
        # were dropped entirely, giving different checkpoints different
        # column sets in the final CSV.
        value = results.get(task, {}).get('acc,none')
        metrics[task] = round(value * 100, 3) if value is not None else None

    # Average only over tasks that actually reported a value.
    valid_metrics = [v for v in metrics.values() if v is not None]
    if valid_metrics:
        metrics['average'] = round(sum(valid_metrics) / len(valid_metrics), 3)
    else:
        metrics['average'] = None

    return metrics

def process_all_results(root_dir: str, output_file: str = 'model_metrics.csv', checkpoint_prefix: str = None):
    """
    Process all result files and create a CSV with metrics as percentages.

    Improvement: rows are sorted by the numeric suffix of the checkpoint name
    ('checkpoint-<N>'), so the CSV is deterministic and in training order
    instead of following arbitrary filesystem walk order.

    Parameters:
        root_dir (str): Root directory to search for result files
        output_file (str): Output CSV filename
        checkpoint_prefix (str): Optional prefix to add before checkpoint numbers (e.g., "model_name_")
    """
    result_files = find_result_files(root_dir)

    def _checkpoint_key(item):
        # Sort numerically on 'checkpoint-<N>'; unexpected names fall back
        # to string order after all numeric ones.
        suffix = item[0].split('-')[-1]
        return (0, int(suffix)) if suffix.isdigit() else (1, suffix)

    result_files.sort(key=_checkpoint_key)

    all_metrics = []
    for checkpoint, file_path in result_files:
        metrics = extract_metrics(file_path)
        # Add prefix to checkpoint if provided
        if checkpoint_prefix:
            metrics['checkpoint'] = f"{checkpoint_prefix}{checkpoint}"
        else:
            metrics['checkpoint'] = checkpoint
        all_metrics.append(metrics)

    if not all_metrics:
        print("No result files found.")
        return

    df = pd.DataFrame(all_metrics)
    # Reorder columns to put checkpoint and average first
    cols = ['checkpoint', 'average'] + [col for col in df.columns if col not in ('checkpoint', 'average')]
    df = df[cols]

    # Format float columns to 3 decimal places; missing values pass through.
    for col in df.columns:
        if col != 'checkpoint':
            df[col] = df[col].apply(lambda x: f"{x:.3f}" if pd.notnull(x) else x)

    df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

# Script entry point
if __name__ == "__main__":
    # Search the current working directory by default; adjust as needed.
    root_directory = "."

    # Without a prefix (original behavior): process_all_results(root_directory)

    process_all_results(
        root_directory,
        output_file='model_metrics.csv',
        checkpoint_prefix=args.modelname,
    )