import argparse
import json
import os
from typing import Dict, List

import pandas as pd
| |
|
| | |
# Command-line interface: an optional model name that is prepended to
# each checkpoint label in the output CSV.
parser = argparse.ArgumentParser(description='Process model results')
parser.add_argument(
    '--modelname',
    type=str,
    help='Model name to use as prefix',
)
args = parser.parse_args()
| |
|
def find_result_files(root_dir: str) -> List[tuple]:
    """
    Locate results JSON files beneath *root_dir*.

    A file qualifies when its directory path contains 'result'
    (case-insensitive), its name starts with 'result' and ends with
    '.json', and some component of its directory path starts with
    'checkpoint-'.

    Returns:
        List of (checkpoint_label, file_path) tuples, where
        checkpoint_label is the full 'checkpoint-<n>' path component.
    """
    found = []
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        if 'result' not in dirpath.lower():
            continue
        # The checkpoint label is a property of the directory, so look it
        # up once per directory rather than once per file: take the first
        # path component (left to right) that names a checkpoint.
        checkpoint = next(
            (part for part in dirpath.split(os.sep)
             if part.startswith('checkpoint-')),
            None,
        )
        if checkpoint is None:
            continue
        for name in filenames:
            if name.startswith('result') and name.endswith('.json'):
                found.append((checkpoint, os.path.join(dirpath, name)))
    return found
| |
|
def extract_metrics(json_path: str) -> Dict[str, float]:
    """
    Extract accuracy metrics from a results JSON file, as percentages.

    Reads the 'results' mapping of the JSON file at *json_path* and pulls
    the 'acc,none' value for each target task.

    Parameters:
        json_path (str): Path to the results JSON file.

    Returns:
        Dict mapping each target task name to its accuracy in percent
        (rounded to 3 decimals), or None when the task or its 'acc,none'
        value is absent.  An 'average' key holds the mean of the
        available task values (None when no task has a value).

    Every target task is always present as a key — previously a task
    missing from 'results' was dropped entirely, so rows built from
    different result files could disagree on their column sets.
    """
    with open(json_path, 'r') as f:
        data = json.load(f)

    target_tasks = [
        'medmcqa',
        'medqa_4options',
        'mmlu_anatomy',
        'mmlu_clinical_knowledge',
        'mmlu_college_biology',
        'mmlu_college_medicine',
        'mmlu_medical_genetics',
        'mmlu_professional_medicine',
        'pubmedqa',
    ]

    results = data.get('results', {})
    metrics = {}
    for task in target_tasks:
        value = results.get(task, {}).get('acc,none')
        # Convert fraction -> percentage; keep None for missing data so
        # every file yields the same set of keys.
        metrics[task] = round(value * 100, 3) if value is not None else None

    # Average over the tasks that actually have a value; None values are
    # excluded from both the sum and the count.
    valid = [v for v in metrics.values() if v is not None]
    metrics['average'] = round(sum(valid) / len(valid), 3) if valid else None

    return metrics
| |
|
| | def process_all_results(root_dir: str, output_file: str = 'model_metrics.csv', checkpoint_prefix: str = None): |
| | """ |
| | Process all result files and create a CSV with metrics as percentages. |
| | |
| | Parameters: |
| | root_dir (str): Root directory to search for result files |
| | output_file (str): Output CSV filename |
| | checkpoint_prefix (str): Optional prefix to add before checkpoint numbers (e.g., "model_name_") |
| | """ |
| | result_files = find_result_files(root_dir) |
| | all_metrics = [] |
| | |
| | for checkpoint, file_path in result_files: |
| | metrics = extract_metrics(file_path) |
| | |
| | if checkpoint_prefix: |
| | metrics['checkpoint'] = f"{checkpoint_prefix}{checkpoint}" |
| | else: |
| | metrics['checkpoint'] = checkpoint |
| | all_metrics.append(metrics) |
| | |
| | if all_metrics: |
| | df = pd.DataFrame(all_metrics) |
| | |
| | cols = ['checkpoint', 'average'] + [col for col in df.columns if col not in ['checkpoint', 'average']] |
| | df = df[cols] |
| | |
| | |
| | float_cols = [col for col in df.columns if col != 'checkpoint'] |
| | for col in float_cols: |
| | df[col] = df[col].apply(lambda x: f"{x:.3f}" if pd.notnull(x) else x) |
| | |
| | df.to_csv(output_file, index=False) |
| | print(f"Results saved to {output_file}") |
| | else: |
| | print("No result files found.") |
| |
|
| | |
if __name__ == "__main__":
    # Scan the tree rooted at the working directory; --modelname (when
    # given) becomes a prefix on each checkpoint label in the CSV.
    process_all_results(
        ".",
        output_file='model_metrics.csv',
        checkpoint_prefix=args.modelname,
    )