Charlie81
/

LoRE

TensorBoard

Safetensors

Model card Files Files and versions

xet

Metrics Training metrics Community

Charlie81 committed on Jul 28, 2025

Commit

3e8c5b1

1 Parent(s): c37387b

claude to deepseek

Browse files

Files changed (1) hide show

scripts/evalexperts.py +684 -0

scripts/evalexperts.py ADDED Viewed

	@@ -0,0 +1,684 @@

+#!/usr/bin/env python3
+"""
+eval_with_expert_tracking.py - Evaluation script for OLMoE models with expert usage tracking
+This script extends the standard evaluation to track:
+1. Which experts are being used
+2. Frequency of expert usage
+3. Distribution across experts
+4. Small vs regular expert usage
+"""
+import argparse
+import json
+import os
+import sys
+import logging
+from typing import Dict, List, Optional, Any, Tuple
+import numpy as np
+import torch
+from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
+# lm-eval imports
+from lm_eval import evaluator
+from lm_eval.models.huggingface import HFLM
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+class ExpertTrackingHFLM(HFLM):
+    """Wrapper around HFLM that tracks expert usage statistics."""
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.expert_stats = {
+            'total_tokens': 0,
+            'regular_expert_usage': {},
+            'small_expert_usage': {},
+            'layer_stats': {}
+        }
+        self._register_hooks()
+    def _register_hooks(self):
+        """Register forward hooks to track expert usage."""
+        if not hasattr(self.model, 'model') or not hasattr(self.model.model, 'layers'):
+            logger.warning("Model doesn't have expected layer structure - expert tracking disabled")
+            return
+        for layer_idx, layer in enumerate(self.model.model.layers):
+            if hasattr(layer, 'mlp') and hasattr(layer.mlp, 'experts'):
+                # Register hook for this MoE layer
+                layer.mlp._expert_hook_handle = layer.mlp.register_forward_hook(
+                    self._make_expert_hook(layer_idx)
+                )
+    def _make_expert_hook(self, layer_idx: int):
+        """Create a forward hook for tracking expert usage in a specific layer."""
+        def expert_hook(module, input, output):
+            if not hasattr(module, 'gate') or not hasattr(module, 'experts'):
+                return
+            hidden_states, router_logits = input[0], output[1]
+            batch_size, seq_len, hidden_dim = hidden_states.shape
+            # Get routing probabilities
+            routing_probs = torch.softmax(router_logits, dim=-1)
+            # Get top-k experts
+            topk_probs, topk_experts = torch.topk(
+                routing_probs,
+                k=module.top_k,
+                dim=-1
+            )
+            # Update statistics
+            self._update_expert_stats(
+                layer_idx=layer_idx,
+                topk_experts=topk_experts,
+                topk_probs=topk_probs,
+                num_regular_experts=module.num_experts,
+                num_small_experts=module.num_small_experts if hasattr(module, 'num_small_experts') else 0,
+                batch_size=batch_size,
+                seq_len=seq_len
+            )
+        return expert_hook
+    def _update_expert_stats(self, layer_idx: int, topk_experts: torch.Tensor,
+                            topk_probs: torch.Tensor, num_regular_experts: int,
+                            num_small_experts: int, batch_size: int, seq_len: int):
+        """Update expert usage statistics."""
+        # Flatten the batch and sequence dimensions
+        topk_experts_flat = topk_experts.view(-1, topk_experts.size(-1))
+        topk_probs_flat = topk_probs.view(-1, topk_probs.size(-1))
+        # Initialize layer stats if not present
+        if layer_idx not in self.expert_stats['layer_stats']:
+            self.expert_stats['layer_stats'][layer_idx] = {
+                'total_tokens': 0,
+                'regular_expert_counts': torch.zeros(num_regular_experts, dtype=torch.long),
+                'small_expert_counts': torch.zeros(num_small_experts, dtype=torch.long) if num_small_experts > 0 else None,
+                'regular_expert_load': torch.zeros(num_regular_experts, dtype=torch.float),
+                'small_expert_load': torch.zeros(num_small_experts, dtype=torch.float) if num_small_experts > 0 else None
+            }
+        layer_stats = self.expert_stats['layer_stats'][layer_idx]
+        num_tokens = topk_experts_flat.size(0)
+        # Update global stats
+        self.expert_stats['total_tokens'] += num_tokens
+        # Update layer stats
+        layer_stats['total_tokens'] += num_tokens
+        # Track regular experts
+        for expert_idx in range(num_regular_experts):
+            mask = (topk_experts_flat == expert_idx)
+            count = mask.sum().item()
+            load = topk_probs_flat[mask].sum().item()
+            layer_stats['regular_expert_counts'][expert_idx] += count
+            layer_stats['regular_expert_load'][expert_idx] += load
+            if expert_idx not in self.expert_stats['regular_expert_usage']:
+                self.expert_stats['regular_expert_usage'][expert_idx] = 0
+            self.expert_stats['regular_expert_usage'][expert_idx] += count
+        # Track small experts if they exist
+        if num_small_experts > 0:
+            for expert_idx in range(num_small_experts):
+                small_expert_num = expert_idx + num_regular_experts
+                mask = (topk_experts_flat == small_expert_num)
+                count = mask.sum().item()
+                load = topk_probs_flat[mask].sum().item()
+                layer_stats['small_expert_counts'][expert_idx] += count
+                layer_stats['small_expert_load'][expert_idx] += load
+                if expert_idx not in self.expert_stats['small_expert_usage']:
+                    self.expert_stats['small_expert_usage'][expert_idx] = 0
+                self.expert_stats['small_expert_usage'][expert_idx] += count
+    def get_expert_stats(self) -> Dict[str, Any]:
+        """Return expert usage statistics in a serializable format."""
+        stats = {
+            'total_tokens': self.expert_stats['total_tokens'],
+            'regular_expert_usage': {},
+            'small_expert_usage': {},
+            'layer_stats': {}
+        }
+        # Convert regular expert usage
+        for expert_idx, count in self.expert_stats['regular_expert_usage'].items():
+            stats['regular_expert_usage'][expert_idx] = {
+                'count': count,
+                'percentage': count / (self.expert_stats['total_tokens'] * self.model.config.top_k) * 100
+            }
+        # Convert small expert usage if they exist
+        if self.expert_stats['small_expert_usage']:
+            for expert_idx, count in self.expert_stats['small_expert_usage'].items():
+                stats['small_expert_usage'][expert_idx] = {
+                    'count': count,
+                    'percentage': count / (self.expert_stats['total_tokens'] * self.model.config.top_k) * 100
+                }
+        # Convert layer stats
+        for layer_idx, layer_stat in self.expert_stats['layer_stats'].items():
+            stats['layer_stats'][layer_idx] = {
+                'total_tokens': layer_stat['total_tokens'],
+                'regular_expert_counts': layer_stat['regular_expert_counts'].tolist(),
+                'regular_expert_load': layer_stat['regular_expert_load'].tolist(),
+                'small_expert_counts': layer_stat['small_expert_counts'].tolist() if layer_stat['small_expert_counts'] is not None else None,
+                'small_expert_load': layer_stat['small_expert_load'].tolist() if layer_stat['small_expert_load'] is not None else None
+            }
+        return stats
+    def print_expert_stats(self) -> None:
+        """Print expert usage statistics in a human-readable format."""
+        if not self.expert_stats['total_tokens']:
+            print("No expert usage statistics collected.")
+            return
+        total_tokens = self.expert_stats['total_tokens']
+        top_k = getattr(self.model.config, 'top_k', 1)
+        total_expert_activations = total_tokens * top_k
+        print("\n" + "="*80)
+        print("EXPERT USAGE STATISTICS")
+        print("="*80)
+        print(f"Total tokens processed: {total_tokens:,}")
+        print(f"Total expert activations (top-{top_k}): {total_expert_activations:,}")
+        print("\nOverall Expert Usage:")
+        # Print regular experts
+        if self.expert_stats['regular_expert_usage']:
+            print("\nRegular Experts:")
+            for expert_idx, count in sorted(self.expert_stats['regular_expert_usage'].items()):
+                percentage = count / total_expert_activations * 100
+                print(f"  Expert {expert_idx}: {count:,} ({percentage:.2f}%)")
+        # Print small experts if they exist
+        if self.expert_stats['small_expert_usage']:
+            print("\nSmall Experts:")
+            for expert_idx, count in sorted(self.expert_stats['small_expert_usage'].items()):
+                percentage = count / total_expert_activations * 100
+                print(f"  Small Expert {expert_idx}: {count:,} ({percentage:.2f}%)")
+        # Print layer-wise statistics
+        print("\nLayer-wise Statistics:")
+        for layer_idx, layer_stat in self.expert_stats['layer_stats'].items():
+            print(f"\nLayer {layer_idx}:")
+            print(f"  Tokens processed: {layer_stat['total_tokens']:,}")
+            # Regular experts
+            print("  Regular Experts:")
+            for expert_idx, (count, load) in enumerate(zip(
+                layer_stat['regular_expert_counts'],
+                layer_stat['regular_expert_load']
+            )):
+                count_pct = count / (layer_stat['total_tokens'] * top_k) * 100
+                load_pct = load / layer_stat['total_tokens'] * 100
+                print(f"    Expert {expert_idx}: Count={count:,} ({count_pct:.2f}%), Load={load:.2f} ({load_pct:.2f}%)")
+            # Small experts if they exist
+            if layer_stat['small_expert_counts'] is not None:
+                print("  Small Experts:")
+                for expert_idx, (count, load) in enumerate(zip(
+                    layer_stat['small_expert_counts'],
+                    layer_stat['small_expert_load']
+                )):
+                    count_pct = count / (layer_stat['total_tokens'] * top_k) * 100
+                    load_pct = load / layer_stat['total_tokens'] * 100
+                    print(f"    Small Expert {expert_idx}: Count={count:,} ({count_pct:.2f}%), Load={load:.2f} ({load_pct:.2f}%)")
+        print("="*80 + "\n")
+def parse_args():
+    """Parse command line arguments."""
+    parser = argparse.ArgumentParser(
+        description="Evaluate OLMoE models with expert usage tracking",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Standard evaluation with expert tracking
+  python eval_with_expert_tracking.py --model_type transformers --tasks mmlu arc_easy
+  # Custom model evaluation with expert tracking
+  python eval_with_expert_tracking.py --model_type custom --tasks mmlu hellaswag
+        """
+    )
+    # Model arguments
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        default="allenai/OLMoE-1B-7B-0924",
+        help="Path or name of the pretrained model"
+    )
+    parser.add_argument(
+        "--model_type",
+        type=str,
+        default="transformers",
+        choices=["transformers", "custom"],
+        help="Model type: 'transformers' for standard OLMoE, 'custom' for MyOLMoE"
+    )
+    parser.add_argument(
+        "--custom_model_path",
+        type=str,
+        default="./myolmoe_model",
+        help="Path to custom MyOLMoE model code (when using --model_type custom)"
+    )
+    # Evaluation arguments
+    parser.add_argument(
+        "--tasks",
+        type=str,
+        nargs="+",
+        default=["mmlu"],
+        help="Tasks to evaluate on (e.g., mmlu, hellaswag, arc_easy, gsm8k)"
+    )
+    parser.add_argument(
+        "--num_fewshot",
+        type=int,
+        default=0,
+        help="Number of few-shot examples"
+    )
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=8,
+        help="Batch size for evaluation"
+    )
+    parser.add_argument(
+        "--max_batch_size",
+        type=int,
+        default=None,
+        help="Maximum batch size (auto if None)"
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="auto",
+        help="Device to use ('auto', 'cuda', 'cpu')"
+    )
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        default="auto",
+        choices=["auto", "float16", "bfloat16", "float32"],
+        help="Data type for model weights"
+    )
+    # Output arguments
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="./eval_results",
+        help="Directory to save evaluation results"
+    )
+    parser.add_argument(
+        "--output_filename",
+        type=str,
+        default=None,
+        help="Custom filename for results (auto-generated if not provided)"
+    )
+    # Additional arguments
+    parser.add_argument(
+        "--limit",
+        type=int,
+        default=None,
+        help="Limit number of examples per task (for testing)"
+    )
+    parser.add_argument(
+        "--write_out",
+        action="store_true",
+        help="Write out individual predictions to files"
+    )
+    parser.add_argument(
+        "--trust_remote_code",
+        action="store_true",
+        help="Trust remote code when loading model"
+    )
+    parser.add_argument(
+        "--verbosity",
+        type=str,
+        default="INFO",
+        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
+        help="Logging verbosity level"
+    )
+    return parser.parse_args()
+def load_transformers_model(args) -> ExpertTrackingHFLM:
+    """
+    Load standard Transformers OLMoE model with expert tracking.
+    Args:
+        args: Parsed command line arguments
+    Returns:
+        ExpertTrackingHFLM: Wrapped model ready for evaluation with expert tracking
+    """
+    logger.info(f"Loading Transformers OLMoE model with expert tracking: {args.model_path}")
+    # Create ExpertTrackingHFLM model
+    model = ExpertTrackingHFLM(
+        pretrained=args.model_path,
+        device=args.device,
+        batch_size=args.batch_size,
+        max_batch_size=args.max_batch_size,
+        dtype=args.dtype,
+        trust_remote_code=args.trust_remote_code
+    )
+    logger.info("Transformers model with expert tracking loaded successfully")
+    return model
+def load_custom_model(args) -> ExpertTrackingHFLM:
+    """
+    Load custom MyOLMoE model with expert tracking.
+    Args:
+        args: Parsed command line arguments
+    Returns:
+        ExpertTrackingHFLM: Wrapped model ready for evaluation with expert tracking
+    """
+    logger.info(f"Loading custom MyOLMoE model with expert tracking: {args.model_path}")
+    # Add custom model path to Python path
+    if os.path.exists(args.custom_model_path):
+        sys.path.insert(0, args.custom_model_path)
+        logger.info(f"Added {args.custom_model_path} to Python path")
+    else:
+        logger.warning(f"Custom model path not found: {args.custom_model_path}")
+    try:
+        # Import custom model class
+        from modeling_myolmoe import MyOlmoeForCausalLM
+        logger.info("Successfully imported MyOlmoeForCausalLM")
+    except ImportError as e:
+        logger.error(f"Failed to import custom model: {e}")
+        logger.error("Make sure the custom model code is available in the specified path")
+        raise
+    # Load model configuration
+    config = AutoConfig.from_pretrained(
+        args.model_path,
+        trust_remote_code=args.trust_remote_code
+    )
+    logger.info("Model will use default top-k routing configuration")
+    # Determine torch dtype
+    if args.dtype == "auto":
+        torch_dtype = "auto"
+    else:
+        torch_dtype = {
+            "float16": torch.float16,
+            "bfloat16": torch.bfloat16,
+            "float32": torch.float32
+        }[args.dtype]
+    # Load the custom model
+    hf_model = MyOlmoeForCausalLM.from_pretrained(
+        args.model_path,
+        config=config,
+        torch_dtype=torch_dtype,
+        device_map="auto" if args.device == "auto" else None,
+        trust_remote_code=args.trust_remote_code
+    ).eval()
+    # Wrap in ExpertTrackingHFLM
+    model = ExpertTrackingHFLM(
+        pretrained=args.model_path,
+        device=args.device,
+        batch_size=args.batch_size,
+        max_batch_size=args.max_batch_size,
+        dtype=args.dtype
+    )
+    logger.info("Custom model with expert tracking loaded successfully")
+    return model
+def run_evaluation(args) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+    """
+    Run evaluation on the specified model and return both task results and expert stats.
+    Args:
+        args: Parsed command line arguments
+    Returns:
+        Tuple of (evaluation_results, expert_stats)
+    """
+    logger.info("Starting evaluation with expert tracking...")
+    # Load appropriate model
+    if args.model_type == "transformers":
+        model = load_transformers_model(args)
+    elif args.model_type == "custom":
+        model = load_custom_model(args)
+    else:
+        raise ValueError(f"Unknown model type: {args.model_type}")
+    # Run evaluation
+    logger.info(f"Running evaluation on tasks: {args.tasks}")
+    logger.info(f"Few-shot examples: {args.num_fewshot}")
+    logger.info(f"Batch size: {args.batch_size}")
+    results = evaluator.simple_evaluate(
+        model=model,
+        tasks=args.tasks,
+        num_fewshot=args.num_fewshot,
+        limit=args.limit,
+        write_out=args.write_out,
+    )
+    # Get expert statistics
+    expert_stats = model.get_expert_stats()
+    logger.info("Evaluation completed successfully")
+    return results, expert_stats
+def save_results(results: Dict[str, Any], expert_stats: Dict[str, Any], args) -> str:
+    """
+    Save evaluation results and expert statistics to file.
+    Args:
+        results: Evaluation results
+        expert_stats: Expert usage statistics
+        args: Parsed command line arguments
+    Returns:
+        str: Path to saved results file
+    """
+    os.makedirs(args.output_dir, exist_ok=True)
+    # Generate filename if not provided
+    if args.output_filename is None:
+        model_name = os.path.basename(args.model_path.rstrip('/'))
+        tasks_str = "_".join(args.tasks[:3])
+        if len(args.tasks) > 3:
+            tasks_str += f"_and_{len(args.tasks)-3}_more"
+        if args.model_type == "custom":
+            filename = f"{model_name}_custom_{tasks_str}_results_with_expert_stats.json"
+        else:
+            filename = f"{model_name}_transformers_{tasks_str}_results_with_expert_stats.json"
+    else:
+        filename = args.output_filename
+    if not filename.endswith('.json'):
+        filename += '.json'
+    output_path = os.path.join(args.output_dir, filename)
+    # Prepare metadata
+    metadata = {
+        "model_path": args.model_path,
+        "model_type": args.model_type,
+        "tasks": args.tasks,
+        "num_fewshot": args.num_fewshot,
+        "batch_size": args.batch_size,
+        "device": args.device,
+        "dtype": args.dtype,
+        "limit": args.limit,
+    }
+    # Add routing info for custom models
+    if args.model_type == "custom":
+        metadata["routing_type"] = "top-k (default)"
+    combined_results = {
+        "metadata": metadata,
+        "task_results": results,
+        "expert_statistics": expert_stats
+    }
+    # Save to file
+    with open(output_path, 'w') as f:
+        json.dump(combined_results, f, indent=2)
+    logger.info(f"Results saved to {output_path}")
+    return output_path
+def print_summary(results: Dict[str, Any], expert_stats: Dict[str, Any], args) -> None:
+    """
+    Print a formatted summary of evaluation results and expert statistics.
+    Args:
+        results: Evaluation results
+        expert_stats: Expert usage statistics
+        args: Parsed command line arguments
+    """
+    print(f"\n{'='*80}")
+    print(f"EVALUATION SUMMARY")
+    print(f"Model: {args.model_path}")
+    print(f"Type: {args.model_type.upper()}")
+    if args.model_type == "custom":
+        print(f"Routing: TOP-K (default)")
+    print(f"Tasks: {', '.join(args.tasks)}")
+    print(f"{'='*80}")
+    # Print task results
+    if "results" in results:
+        for task, metrics in results["results"].items():
+            if isinstance(metrics, dict):
+                print(f"\n📊 {task.upper()}:")
+                for metric, value in metrics.items():
+                    if isinstance(value, (int, float)) and not metric.endswith('_stderr'):
+                        stderr_key = f"{metric}_stderr"
+                        stderr = metrics.get(stderr_key, 0)
+                        print(f"   {metric:.<20} {value:.4f} (±{stderr:.4f})")
+    else:
+        print("\n⚠️  No results found in evaluation output")
+    # Print expert statistics
+    if expert_stats:
+        total_tokens = expert_stats.get('total_tokens', 0)
+        if total_tokens > 0:
+            top_k = getattr(args, 'top_k', 1)  # Default to 1 if not specified
+            total_expert_activations = total_tokens * top_k
+            print(f"\n🔍 EXPERT USAGE SUMMARY (Top-{top_k})")
+            print(f"Total tokens processed: {total_tokens:,}")
+            print(f"Total expert activations: {total_expert_activations:,}")
+            # Regular experts
+            if expert_stats.get('regular_expert_usage'):
+                print("\nRegular Experts:")
+                for expert_idx, stats in sorted(expert_stats['regular_expert_usage'].items()):
+                    print(f"  Expert {expert_idx}: {stats['count']:,} ({stats['percentage']:.2f}%)")
+            # Small experts
+            if expert_stats.get('small_expert_usage'):
+                print("\nSmall Experts:")
+                for expert_idx, stats in sorted(expert_stats['small_expert_usage'].items()):
+                    print(f"  Small Expert {expert_idx}: {stats['count']:,} ({stats['percentage']:.2f}%)")
+            # Layer statistics
+            if expert_stats.get('layer_stats'):
+                print("\nLayer-wise Statistics (Top 3 most used experts per layer):")
+                for layer_idx, layer_stat in expert_stats['layer_stats'].items():
+                    print(f"\nLayer {layer_idx}:")
+                    print(f"  Tokens processed: {layer_stat['total_tokens']:,}")
+                    # Regular experts
+                    if layer_stat.get('regular_expert_counts'):
+                        counts = layer_stat['regular_expert_counts']
+                        top_indices = np.argsort(counts)[-3:][::-1]
+                        print("  Top Regular Experts:")
+                        for idx in top_indices:
+                            count = counts[idx]
+                            load = layer_stat['regular_expert_load'][idx]
+                            count_pct = count / (layer_stat['total_tokens'] * top_k) * 100
+                            load_pct = load / layer_stat['total_tokens'] * 100
+                            print(f"    Expert {idx}: Count={count:,} ({count_pct:.2f}%), Load={load:.2f} ({load_pct:.2f}%)")
+                    # Small experts
+                    if layer_stat.get('small_expert_counts'):
+                        counts = layer_stat['small_expert_counts']
+                        top_indices = np.argsort(counts)[-3:][::-1]
+                        print("  Top Small Experts:")
+                        for idx in top_indices:
+                            count = counts[idx]
+                            load = layer_stat['small_expert_load'][idx]
+                            count_pct = count / (layer_stat['total_tokens'] * top_k) * 100
+                            load_pct = load / layer_stat['total_tokens'] * 100
+                            print(f"    Small Expert {idx}: Count={count:,} ({count_pct:.2f}%), Load={load:.2f} ({load_pct:.2f}%)")
+    print(f"\n{'='*80}")
+def main():
+    """Main evaluation function with expert tracking."""
+    args = parse_args()
+    # Set logging level
+    numeric_level = getattr(logging, args.verbosity.upper(), None)
+    if isinstance(numeric_level, int):
+        logging.getLogger().setLevel(numeric_level)
+        logger.setLevel(numeric_level)
+    try:
+        logger.info("="*80)
+        logger.info("Starting OLMoE Model Evaluation with Expert Tracking")
+        logger.info("="*80)
+        # Run evaluation
+        results, expert_stats = run_evaluation(args)
+        # Save results
+        output_path = save_results(results, expert_stats, args)
+        # Print summary
+        print_summary(results, expert_stats, args)
+        logger.info(f"✅ Evaluation completed successfully!")
+        logger.info(f"📁 Results saved to: {output_path}")
+    except KeyboardInterrupt:
+        logger.info("Evaluation interrupted by user")
+        sys.exit(1)
+    except Exception as e:
+        logger.error(f"❌ Evaluation failed: {e}")
+        logger.debug("Full traceback:", exc_info=True)
+        sys.exit(1)
+# Run the evaluation only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()