airevartis
/

medical-benchmark-scripts

Model card Files Files and versions

xet

Community

airevartis committed on Sep 14, 2025

Commit

f55049d

verified ·

1 Parent(s): eb6e3d8

Upload post_finetune_evaluation.py with huggingface_hub

Browse files

Files changed (1) hide show

post_finetune_evaluation.py +412 -0

post_finetune_evaluation.py ADDED Viewed

	@@ -0,0 +1,412 @@

+#!/usr/bin/env python3
+"""
+Post fine-tuning evaluation on Hugging Face infrastructure
+"""
+import torch
+import json
+import os
+from transformers import (
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    pipeline,
+    BitsAndBytesConfig
+)
+from datasets import load_dataset
+import numpy as np
+from typing import Dict, List, Tuple
+import logging
+import re
+from pathlib import Path
+# Configure root logging at INFO level so this script's progress messages are emitted.
+logging.basicConfig(level=logging.INFO)
+# Module-level logger used by the evaluator class and by main().
+logger = logging.getLogger(__name__)
+class HFPostFineTuneEvaluator:
+    """Evaluate fine-tuned medical LLMs on multiple-choice questions.
+    For every entry in ``self.models`` the evaluator tries to load a fine-tuned
+    checkpoint (falling back to the base model), generates a short completion
+    per question, parses an answer letter out of it and reports overall and
+    per-option accuracy.
+    """
+    # Letters used to label answer options, in prompt order.
+    OPTION_LETTERS = "ABCDE"
+    # Answer patterns tried in order by ``extract_answer``, most explicit first.
+    # The word boundaries stop ordinary words ("Diabetes", "DNA.") from being
+    # read as an answer letter.
+    _ANSWER_PATTERNS = (
+        re.compile(r'[Tt]he correct answer is:?\s*\(?([A-E])\b'),
+        re.compile(r'[Aa]nswer:\s*\(?([A-E])\b'),
+        re.compile(r'^\s*\(?([A-E])\)?\s*(?:[.:,\-]|$)'),
+        re.compile(r'\b([A-E])[.)]'),
+        re.compile(r'\b([A-E])\b'),
+    )
+    def __init__(self):
+        """Pick the compute device and set up the model table and 4-bit quantization config."""
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        logger.info(f"Using device: {self.device}")
+        # Short model key -> base model repo on the Hub. The keys are also used
+        # to build the fine-tuned repo name and to look up baseline results.
+        # NOTE(review): the "qwen3_7b" key points at a Qwen2.5 checkpoint; the key
+        # is left unchanged because other files may index results by it.
+        self.models = {
+            "biomistral_7b": "BioMistral/BioMistral-7B",
+            "qwen3_7b": "Qwen/Qwen2.5-7B-Instruct",
+            "meditron_7b": "epfl-llm/meditron-7b",
+            "internist_7b": "internistai/internist-7b"
+        }
+        # Quantization config (only applied when running on CUDA)
+        self.quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type="nf4"
+        )
+    def _load_model_and_tokenizer(self, repo_id: str) -> Tuple:
+        """Load ``(model, tokenizer)`` from ``repo_id`` with the device-dependent settings.
+        Raises whatever ``from_pretrained`` raises; callers decide how to recover.
+        """
+        on_gpu = self.device == "cuda"
+        # SECURITY NOTE: trust_remote_code=True executes Python shipped in the
+        # model repo; only point this at repositories you trust.
+        tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
+        model = AutoModelForCausalLM.from_pretrained(
+            repo_id,
+            quantization_config=self.quantization_config if on_gpu else None,
+            device_map="auto" if on_gpu else None,
+            torch_dtype=torch.float16 if on_gpu else torch.float32,
+            trust_remote_code=True
+        )
+        return model, tokenizer
+    def load_finetuned_model(self, model_name: str) -> Tuple:
+        """Load the fine-tuned model for ``model_name``, falling back to its base model.
+        Args:
+            model_name: A key of ``self.models``.
+        Returns:
+            ``(model, tokenizer, is_finetuned)``. ``is_finetuned`` is True only when
+            the fine-tuned repo loaded; ``(None, None, False)`` when both the
+            fine-tuned and the base model fail to load (including an unknown key).
+        """
+        logger.info(f"Loading fine-tuned model: {model_name}")
+        # NOTE(review): this repo id has no namespace prefix — confirm that the
+        # fine-tuned checkpoints are really published under this exact name.
+        finetuned_repo = f"medical-{model_name}-finetuned"
+        try:
+            model, tokenizer = self._load_model_and_tokenizer(finetuned_repo)
+        except Exception as e:
+            logger.warning(f"Could not load fine-tuned model from HF Hub: {e}")
+        else:
+            logger.info(f"Successfully loaded fine-tuned {model_name}")
+            return model, tokenizer, True
+        logger.info(f"Loading base model {model_name} instead")
+        try:
+            # A KeyError for an unknown model_name is reported like any load failure.
+            model, tokenizer = self._load_model_and_tokenizer(self.models[model_name])
+        except Exception as e:
+            logger.error(f"Failed to load {model_name}: {e}")
+            return None, None, False
+        return model, tokenizer, False
+    @staticmethod
+    def _option_text(option) -> str:
+        """Return the display text of one answer option."""
+        if isinstance(option, dict):
+            # NOTE(review): some MedQA exports presumably store options as
+            # {'key': 'A', 'value': '...'} records — confirm against the dataset.
+            return str(option.get('value', option))
+        return str(option)
+    @classmethod
+    def _letter_choices(cls, count: int) -> str:
+        """Return the letter list shown in the prompt, e.g. ``"A, B, C, or D"`` for 4 options.
+        Counts outside 1..5 fall back to the full ``"A, B, C, D, or E"`` wording.
+        """
+        if 0 < count <= len(cls.OPTION_LETTERS):
+            letters = list(cls.OPTION_LETTERS[:count])
+        else:
+            letters = list(cls.OPTION_LETTERS)
+        if len(letters) == 1:
+            return letters[0]
+        if len(letters) == 2:
+            return " or ".join(letters)
+        return ", ".join(letters[:-1]) + f", or {letters[-1]}"
+    @classmethod
+    def _normalize_answer(cls, answer, options) -> str:
+        """Map a gold label to the option letter used in the prompt.
+        Accepts an option letter, an integer index, or the text of one of
+        ``options``; anything else is returned as its stripped string form.
+        """
+        if isinstance(answer, int) and not isinstance(answer, bool):
+            # NOTE(review): integer labels are assumed to be 0-based option
+            # indices — confirm against the dataset in use.
+            if 0 <= answer < len(cls.OPTION_LETTERS):
+                return cls.OPTION_LETTERS[answer]
+            return str(answer)
+        text = str(answer).strip()
+        if len(text) == 1 and text.upper() in cls.OPTION_LETTERS:
+            return text.upper()
+        # The label may be the answer text itself; find its position among the options.
+        for index, option in enumerate(options or []):
+            if index < len(cls.OPTION_LETTERS) and cls._option_text(option).strip() == text:
+                return cls.OPTION_LETTERS[index]
+        return text
+    def create_prompt(self, question: str, options: List[str], model_name: str) -> str:
+        """Build the model-specific prompt for one multiple-choice question.
+        Args:
+            question: The question text.
+            options: Answer options; they are labelled A, B, C, ... in order.
+            model_name: Model key; selects the chat template (Qwen, Mistral or generic).
+        Returns:
+            The prompt string, ending where the model should emit its answer.
+        """
+        options_text = "\n".join(
+            f"{chr(65+i)}. {self._option_text(opt)}" for i, opt in enumerate(options)
+        )
+        # Only offer the letters that actually exist for this question.
+        letter_choices = self._letter_choices(len(options))
+        lowered = model_name.lower()
+        if "qwen" in lowered:
+            return f"""<|im_start|>user
+{question}
+{options_text}
+Please select the correct answer ({letter_choices}).<|im_end|>
+<|im_start|>assistant
+The correct answer is"""
+        if "mistral" in lowered:
+            # "mistral" also matches "biomistral".
+            return f"""<s>[INST] {question}
+{options_text}
+Please select the correct answer ({letter_choices}). [/INST] The correct answer is"""
+        # Generic format
+        return f"""Question: {question}
+{options_text}
+Answer:"""
+    def extract_answer(self, text: str) -> str:
+        """Extract the answer letter (A-E) from generated text.
+        Returns:
+            The first letter found by the ordered patterns, or ``""`` when no
+            answer can be parsed. No letter is guessed, because a default guess
+            would credit the model on every question whose gold answer is that letter.
+        """
+        if not text:
+            return ""
+        for pattern in self._ANSWER_PATTERNS:
+            match = pattern.search(text)
+            if match:
+                return match.group(1)
+        return ""
+    def evaluate_model(self, model_name: str, test_dataset) -> Dict:
+        """Evaluate a single model on the test dataset.
+        Args:
+            model_name: A key of ``self.models``.
+            test_dataset: Sized iterable of examples with 'question', 'options'
+                and 'answer' entries.
+        Returns:
+            A metrics dict (model_name, is_finetuned, total_examples,
+            correct_predictions, accuracy, option_accuracies), or
+            ``{"error": ...}`` when the model cannot be loaded. Examples that
+            raise during processing count as incorrect.
+        """
+        logger.info(f"Evaluating {model_name}")
+        model, tokenizer, is_finetuned = self.load_finetuned_model(model_name)
+        if model is None or tokenizer is None:
+            return {"error": f"Failed to load {model_name}"}
+        # Greedy decoding: no temperature is passed because it only applies
+        # when sampling is enabled.
+        generator = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            max_new_tokens=50,
+            do_sample=False,
+            pad_token_id=tokenizer.eos_token_id
+        )
+        total = len(test_dataset)
+        correct = 0
+        failed = 0
+        option_total = dict.fromkeys(self.OPTION_LETTERS, 0)
+        option_correct = dict.fromkeys(self.OPTION_LETTERS, 0)
+        logger.info(f"Running evaluation on {total} examples")
+        for i, example in enumerate(test_dataset):
+            try:
+                options = example['options']
+                prompt = self.create_prompt(example['question'], options, model_name)
+                response = generator(prompt, return_full_text=False)
+                generated_text = response[0]['generated_text']
+                predicted_answer = self.extract_answer(generated_text)
+                true_answer = self._normalize_answer(example['answer'], options)
+            except Exception as e:
+                logger.error(f"Error processing example {i}: {e}")
+                failed += 1
+                continue
+            # An unparsed prediction ("") must never count as a match.
+            is_correct = bool(predicted_answer) and predicted_answer == true_answer
+            if is_correct:
+                correct += 1
+            if true_answer in option_total:
+                option_total[true_answer] += 1
+                if is_correct:
+                    option_correct[true_answer] += 1
+        if failed:
+            logger.warning(f"{failed} of {total} examples failed and were counted as incorrect")
+        # Calculate metrics
+        accuracy = correct / total if total > 0 else 0
+        option_accuracies = {
+            option: option_correct[option] / option_total[option] if option_total[option] > 0 else 0
+            for option in self.OPTION_LETTERS
+        }
+        metrics = {
+            'model_name': f"{model_name}_finetuned" if is_finetuned else f"{model_name}_base",
+            'is_finetuned': is_finetuned,
+            'total_examples': total,
+            'correct_predictions': correct,
+            'accuracy': accuracy,
+            'option_accuracies': option_accuracies
+        }
+        logger.info(f"{model_name} ({'finetuned' if is_finetuned else 'base'}) - Accuracy: {accuracy:.4f}")
+        # Drop the references before the next model is loaded, then release cached GPU memory.
+        del model, tokenizer, generator
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        return metrics
+    def run_evaluation(self, test_dataset) -> Dict:
+        """Evaluate every model in ``self.models`` and return ``{model key: metrics or error dict}``."""
+        results = {}
+        for model_name in self.models:
+            logger.info(f"Starting evaluation for {model_name}")
+            results[model_name] = self.evaluate_model(model_name, test_dataset)
+        return results
+    def compare_with_baseline(self, post_results: Dict, baseline_file: str = "/tmp/zero_shot_results.json") -> Dict:
+        """Compare post-fine-tuning accuracy with stored zero-shot baseline accuracy.
+        Args:
+            post_results: Mapping as returned by ``run_evaluation``.
+            baseline_file: JSON file mapping model keys to dicts with an 'accuracy' entry.
+        Returns:
+            ``{model key: {baseline_accuracy, post_accuracy, improvement,
+            relative_improvement_pct}}`` for every model present and error-free
+            on both sides; ``{}`` when the baseline file is missing or unreadable.
+        """
+        try:
+            with open(baseline_file, 'r', encoding='utf-8') as f:
+                baseline_results = json.load(f)
+        except FileNotFoundError:
+            logger.warning("Baseline results not found, skipping comparison")
+            return {}
+        except (OSError, json.JSONDecodeError) as e:
+            logger.warning(f"Could not read baseline results from {baseline_file}: {e}")
+            return {}
+        if not isinstance(baseline_results, dict):
+            logger.warning(f"Unexpected baseline format in {baseline_file}, skipping comparison")
+            return {}
+        comparison = {}
+        for model_name, post_result in post_results.items():
+            if 'error' in post_result:
+                continue
+            baseline_key = model_name.replace('_finetuned', '')
+            baseline = baseline_results.get(baseline_key)
+            # Skip models without a usable baseline entry instead of raising.
+            if not isinstance(baseline, dict) or 'error' in baseline or 'accuracy' not in baseline:
+                continue
+            baseline_accuracy = baseline['accuracy']
+            post_accuracy = post_result['accuracy']
+            improvement = post_accuracy - baseline_accuracy
+            # Relative improvement is undefined for a zero baseline; report 0 in that case.
+            relative_improvement = (improvement / baseline_accuracy * 100) if baseline_accuracy > 0 else 0
+            comparison[model_name] = {
+                'baseline_accuracy': baseline_accuracy,
+                'post_accuracy': post_accuracy,
+                'improvement': improvement,
+                'relative_improvement_pct': relative_improvement
+            }
+        return comparison
+    def save_results(self, results: Dict, comparison: Dict, output_path: str = "/tmp/post_finetune_results.json"):
+        """Write evaluation results and the baseline comparison to ``output_path`` as JSON.
+        Successful models go under 'post_finetune_results'; models whose result
+        carries an 'error' are listed under 'failed_models' so failures stay
+        visible in the saved file.
+        Returns:
+            ``output_path``, unchanged.
+        """
+        metric_keys = (
+            'model_name',
+            'is_finetuned',
+            'total_examples',
+            'correct_predictions',
+            'accuracy',
+            'option_accuracies',
+        )
+        serializable_results = {}
+        failed_models = {}
+        for model_name, result in results.items():
+            if 'error' in result:
+                failed_models[model_name] = str(result['error'])
+            else:
+                serializable_results[model_name] = {key: result[key] for key in metric_keys}
+        output_data = {
+            'post_finetune_results': serializable_results,
+            'comparison_with_baseline': comparison,
+            'failed_models': failed_models
+        }
+        # Make sure the target directory exists before writing.
+        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+        with open(output_path, 'w', encoding='utf-8') as f:
+            json.dump(output_data, f, indent=2)
+        logger.info(f"Results saved to {output_path}")
+        return output_path
+def main():
+    """Run the post-fine-tuning evaluation job end to end.
+    Loads the MedQA test split, evaluates every configured model, compares the
+    scores with the stored zero-shot baseline, saves the results to disk, prints
+    a summary and finally makes a best-effort upload of the results file to the
+    Hugging Face Hub. Returns early (after logging an error) when the dataset
+    or its test split cannot be obtained.
+    """
+    logger.info("Starting post fine-tuning evaluation on Hugging Face infrastructure")
+    # Load MedQA dataset, trying each known dataset id in turn.
+    logger.info("Loading MedQA dataset...")
+    dataset = None
+    for dataset_id in ("bigbio/med_qa", "medqa"):
+        try:
+            dataset = load_dataset(dataset_id)
+            break
+        except Exception as e:
+            # Exception (not a bare except) so Ctrl-C / SystemExit still stop the job.
+            logger.warning(f"Could not load dataset {dataset_id}: {e}")
+    if dataset is None:
+        logger.error("Could not load MedQA dataset")
+        return
+    if 'test' not in dataset:
+        logger.error(f"MedQA dataset has no 'test' split (available: {list(dataset.keys())})")
+        return
+    def first_present(example, keys):
+        """Return the value of the first key of ``keys`` found in ``example``."""
+        for key in keys:
+            if key in example:
+                return example[key]
+        raise KeyError(f"none of {keys} found in example")
+    def process_example(example):
+        """Normalize one raw example to 'question' / 'options' / 'answer' fields."""
+        question = first_present(example, ('question', 'text', 'input'))
+        if 'options' in example:
+            options = example['options']
+        elif 'choices' in example:
+            options = example['choices']
+        else:
+            # Options spread over numbered columns: option_0..option_4 or choice_0..choice_4.
+            options = []
+            for i in range(5):
+                key = f'option_{i}' if f'option_{i}' in example else f'choice_{i}'
+                if key in example:
+                    options.append(example[key])
+        # NOTE(review): some MedQA exports presumably keep the gold letter in
+        # 'answer_idx' and the answer text in 'answer' — confirm against the
+        # dataset card. The letter is preferred because predictions are letters.
+        answer_idx = example['answer_idx'] if 'answer_idx' in example else None
+        if isinstance(answer_idx, str) and answer_idx.strip().upper() in ('A', 'B', 'C', 'D', 'E'):
+            answer = answer_idx.strip().upper()
+        else:
+            answer = first_present(example, ('answer', 'label', 'output'))
+        return {
+            'question': question,
+            'options': options,
+            'answer': answer
+        }
+    test_dataset = dataset['test'].map(process_example)
+    logger.info(f"Processed {len(test_dataset)} test examples")
+    # Initialize evaluator
+    evaluator = HFPostFineTuneEvaluator()
+    # Run evaluation
+    logger.info("Starting post fine-tuning evaluation...")
+    results = evaluator.run_evaluation(test_dataset)
+    # Compare with baseline
+    comparison = evaluator.compare_with_baseline(results)
+    # Save results
+    output_path = evaluator.save_results(results, comparison)
+    # Print summary
+    print("\n" + "="*60)
+    print("POST FINE-TUNING EVALUATION RESULTS")
+    print("="*60)
+    for model_name, result in results.items():
+        if 'error' not in result:
+            status = "finetuned" if result['is_finetuned'] else "base"
+            print(f"{model_name} ({status}): {result['accuracy']:.4f} accuracy")
+        else:
+            # Surface failures instead of silently omitting the model.
+            print(f"{model_name}: FAILED ({result['error']})")
+    if comparison:
+        print("\n" + "="*60)
+        print("IMPROVEMENT ANALYSIS")
+        print("="*60)
+        for model_name, comp in comparison.items():
+            print(f"{model_name}: {comp['baseline_accuracy']:.4f} → {comp['post_accuracy']:.4f} ({comp['relative_improvement_pct']:+.2f}%)")
+    # Upload results to HF Hub (best effort: a failure is logged, not raised)
+    try:
+        from huggingface_hub import HfApi
+        api = HfApi()
+        repo_name = "medical-benchmark-results"
+        repo_id = repo_name
+        try:
+            # The repo must be created as a dataset repo, matching the
+            # repo_type used for the upload below.
+            repo_url = api.create_repo(repo_name, repo_type="dataset", exist_ok=True)
+            # Prefer the fully qualified id reported by the Hub when available.
+            repo_id = getattr(repo_url, "repo_id", repo_name)
+        except Exception as e:
+            logger.warning(f"Could not create results repo {repo_name}: {e}")
+        api.upload_file(
+            path_or_fileobj=output_path,
+            path_in_repo="post_finetune_evaluation.json",
+            repo_id=repo_id,
+            repo_type="dataset"
+        )
+        logger.info(f"Results uploaded to {repo_id}/post_finetune_evaluation.json")
+    except Exception as e:
+        logger.warning(f"Could not upload results to HF Hub: {e}")
+    logger.info("Post fine-tuning evaluation completed!")
+# Run the evaluation job only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()