Revert "expert usage stats"

Browse files

This reverts commit a875a536fb5cb362c243ac2a09bbf7e6ec37db66.

Files changed (2)

myolmoe/modeling_myolmoe.py +11 -23
scripts/evalexperts.py +0 -441

myolmoe/modeling_myolmoe.py CHANGED

@@ -1,6 +1,5 @@
 import math
 from typing import List, Optional, Tuple, Union
-from collections import defaultdict
 import torch
 import torch.nn.functional as F
 import torch.utils.checkpoint
@@ -559,17 +558,20 @@ class OlmoeSparseMoeBlock(nn.Module):
         self.top_k = config.num_experts_per_tok
         self.norm_topk_prob = config.norm_topk_prob
         in_second_half = layer_idx >= self.total_layers // 2
         if in_second_half:
             second_half_idx = layer_idx - (self.total_layers // 2)
             num_second_half_blocks = self.total_layers - (self.total_layers // 2)
             if config.small_expert_strategy == "constant":
                 self.num_small_experts = config.max_small_expert_count // num_second_half_blocks
             elif config.small_expert_strategy == "increment":
                 self.num_small_experts = (
-                    (second_half_idx + 1) * config.max_small_expert_count //
-                    ((num_second_half_blocks * (num_second_half_blocks + 1)) // 2)
                 )
             else:
                 raise ValueError(f"Unknown strategy: {config.small_expert_strategy}")
@@ -582,19 +584,20 @@ class OlmoeSparseMoeBlock(nn.Module):
         ]) if self.num_small_experts > 0 else None
         self.gate = nn.Linear(config.hidden_size, self.num_experts, bias=False)
-        self.small_gate = nn.Linear(config.hidden_size, self.num_small_experts, bias=False) \
-            if self.num_small_experts > 0 else None
-        self.small_expert_sparsity_coef = config.small_expert_sparsity_coef
-        # Usage tracking (not a buffer, no gradient)
-        self.expert_usage = defaultdict(int)
     def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         batch_size, sequence_length, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
         router_logits = self.gate(hidden_states)
         if self.num_small_experts > 0:
             small_router_logits = self.small_gate(hidden_states)
             combined_logits = torch.cat([router_logits, small_router_logits], dim=-1)
@@ -604,12 +607,6 @@ class OlmoeSparseMoeBlock(nn.Module):
         routing_probs = F.softmax(combined_logits, dim=1, dtype=torch.float)
         routing_weights, selected_experts = torch.topk(routing_probs, self.top_k, dim=-1)
-        # Track expert usage
-        for i in range(selected_experts.size(0)):
-            for j in range(self.top_k):
-                expert_id = selected_experts[i, j].item()
-                self.expert_usage[expert_id] += 1
         if self.norm_topk_prob:
             routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
@@ -635,15 +632,6 @@ class OlmoeSparseMoeBlock(nn.Module):
         return final_hidden_states.view(batch_size, sequence_length, hidden_dim), combined_logits
-    def __del__(self):
-        """Print a per-expert usage report for this layer when the block is destroyed.
-        Nothing is printed when no expert usage has been recorded.
-        """
-        usage = self.expert_usage
-        if not usage:
-            return
-        print(f"\n[Expert Usage Report for Layer {self.layer_idx}]")
-        total = sum(usage.values())
-        # Expert ids are unique keys, so sorting the items orders them by id.
-        for expert_id, count in sorted(usage.items()):
-            percent = 0.0 if total <= 0 else 100.0 * count / total
-            print(f"  Expert {expert_id:2d}: {count} times ({percent:.2f}%)")
 class OlmoeDecoderLayer(nn.Module):
     def __init__(self, config: OlmoeConfig, layer_idx: int):

 import math
 from typing import List, Optional, Tuple, Union
 import torch
 import torch.nn.functional as F
 import torch.utils.checkpoint
         self.top_k = config.num_experts_per_tok
         self.norm_topk_prob = config.norm_topk_prob
+        # Determine if this block is in the second half
         in_second_half = layer_idx >= self.total_layers // 2
+        # Determine small expert count for this layer
         if in_second_half:
             second_half_idx = layer_idx - (self.total_layers // 2)
             num_second_half_blocks = self.total_layers - (self.total_layers // 2)
             if config.small_expert_strategy == "constant":
                 self.num_small_experts = config.max_small_expert_count // num_second_half_blocks
             elif config.small_expert_strategy == "increment":
+                # Linearly scale small experts from 1 to max_small_expert_count
                 self.num_small_experts = (
+                    (second_half_idx + 1) * config.max_small_expert_count // ((num_second_half_blocks * (num_second_half_blocks + 1)) // 2)
                 )
             else:
                 raise ValueError(f"Unknown strategy: {config.small_expert_strategy}")
         ]) if self.num_small_experts > 0 else None
         self.gate = nn.Linear(config.hidden_size, self.num_experts, bias=False)
+        if self.num_small_experts > 0:
+            self.small_gate = nn.Linear(config.hidden_size, self.num_small_experts, bias=False)
+        else:
+            self.small_gate = None
+        self.small_expert_sparsity_coef = config.small_expert_sparsity_coef
     def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         batch_size, sequence_length, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
         router_logits = self.gate(hidden_states)
         if self.num_small_experts > 0:
             small_router_logits = self.small_gate(hidden_states)
             combined_logits = torch.cat([router_logits, small_router_logits], dim=-1)
         routing_probs = F.softmax(combined_logits, dim=1, dtype=torch.float)
         routing_weights, selected_experts = torch.topk(routing_probs, self.top_k, dim=-1)
         if self.norm_topk_prob:
             routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
         return final_hidden_states.view(batch_size, sequence_length, hidden_dim), combined_logits
 class OlmoeDecoderLayer(nn.Module):
     def __init__(self, config: OlmoeConfig, layer_idx: int):

scripts/evalexperts.py DELETED

@@ -1,441 +0,0 @@
-#!/usr/bin/env python3
-"""
-evalexperts.py - Evaluation script for MyOLMoE models with expert usage tracking
-This script evaluates a custom MyOLMoE model on benchmark tasks and tracks expert usage per layer.
-Usage Example:
-    python scripts/evalexperts.py --model_path allenai/OLMoE-1B-7B-0924 --tasks mmlu hellaswag --num_fewshot 5
-"""
-import argparse
-import json
-import os
-import sys
-import logging
-from typing import Dict, List, Tuple, Any
-import torch
-import numpy as np
-from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
-from lm_eval import evaluator
-from lm_eval.models.huggingface import HFLM
-# Set up logging: configure the root logger once at import time (INFO level,
-# timestamped records) and create the module-level logger used in this script.
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-)
-logger = logging.getLogger(__name__)
-def parse_args():
-    """Build the command-line interface and return the parsed namespace.
-    Options fall into three groups: model loading, evaluation settings, and
-    output location. Arguments are read from ``sys.argv``.
-    """
-    cli = argparse.ArgumentParser(
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        description="Evaluate MyOLMoE model on benchmark tasks with expert usage tracking",
-    )
-    option = cli.add_argument
-    # Model options
-    option("--model_path", type=str, default="allenai/OLMoE-1B-7B-0924",
-           help="Path or name of the pretrained MyOLMoE model")
-    option("--custom_model_path", type=str, default="./myolmoe_model",
-           help="Path to custom MyOLMoE model code")
-    option("--device", type=str, default="auto",
-           help="Device to use ('auto', 'cuda', 'cpu')")
-    option("--dtype", type=str, default="auto",
-           choices=["auto", "float16", "bfloat16", "float32"],
-           help="Data type for model weights")
-    option("--trust_remote_code", action="store_true",
-           help="Trust remote code when loading model")
-    # Evaluation options
-    option("--tasks", type=str, nargs="+", default=["mmlu"],
-           help="Tasks to evaluate on (e.g., mmlu, hellaswag, arc_easy)")
-    option("--num_fewshot", type=int, default=0,
-           help="Number of few-shot examples")
-    option("--batch_size", type=int, default=8,
-           help="Batch size for evaluation")
-    option("--max_batch_size", type=int, default=None,
-           help="Maximum batch size (auto if None)")
-    option("--limit", type=int, default=None,
-           help="Limit number of examples per task (for testing)")
-    # Output options
-    option("--output_dir", type=str, default="./eval_results",
-           help="Directory to save evaluation results and expert usage")
-    option("--output_filename", type=str, default=None,
-           help="Custom filename for results (auto-generated if not provided)")
-    return cli.parse_args()
-def load_custom_model(args) -> Tuple[AutoModelForCausalLM, AutoTokenizer, HFLM]:
-    """
-    Load the custom MyOLMoE model, its tokenizer, and an HFLM wrapper around them.
-    The HFLM wrapper is built around the already-loaded model instance, so the
-    model being evaluated is the same object that expert usage is tracked on,
-    and the checkpoint is loaded only once.
-    Args:
-        args: Parsed command line arguments
-    Returns:
-        Tuple of (model, tokenizer, HFLM wrapper)
-    Raises:
-        FileNotFoundError: If ``args.custom_model_path`` does not exist.
-        ImportError: If ``modeling_myolmoe`` cannot be imported from that path.
-    """
-    logger.info(f"Loading custom MyOLMoE model: {args.model_path}")
-    # Make the custom modeling code importable
-    if not os.path.exists(args.custom_model_path):
-        logger.error(f"Custom model path not found: {args.custom_model_path}")
-        raise FileNotFoundError(f"Custom model path not found: {args.custom_model_path}")
-    if args.custom_model_path not in sys.path:
-        # Avoid stacking duplicate entries when this function is called repeatedly
-        sys.path.insert(0, args.custom_model_path)
-    logger.info(f"Added {args.custom_model_path} to Python path")
-    try:
-        from modeling_myolmoe import MyOlmoeForCausalLM
-        logger.info("Successfully imported MyOlmoeForCausalLM")
-    except ImportError as e:
-        logger.error(f"Failed to import custom model: {e}")
-        raise
-    # Load model configuration
-    config = AutoConfig.from_pretrained(
-        args.model_path,
-        trust_remote_code=args.trust_remote_code
-    )
-    # "auto" is passed through as a string; explicit names map to torch dtypes
-    dtype_by_name = {"float16": torch.float16, "bfloat16": torch.bfloat16, "float32": torch.float32}
-    torch_dtype = dtype_by_name.get(args.dtype, args.dtype)
-    # Load model and tokenizer
-    model = MyOlmoeForCausalLM.from_pretrained(
-        args.model_path,
-        config=config,
-        torch_dtype=torch_dtype,
-        device_map="auto" if args.device == "auto" else None,
-        trust_remote_code=args.trust_remote_code
-    ).eval()
-    if args.device != "auto":
-        # Without a device_map the weights stay where they were loaded; honour
-        # the explicitly requested device.
-        model = model.to(args.device)
-    tokenizer = AutoTokenizer.from_pretrained(
-        args.model_path,
-        trust_remote_code=args.trust_remote_code
-    )
-    # Wrap the loaded instance rather than the model path: passing the path
-    # would make HFLM load a second, separate copy of the checkpoint.
-    # NOTE(review): relies on HFLM accepting an initialized model as
-    # `pretrained` - confirm against the installed lm_eval version.
-    hf_model = HFLM(
-        pretrained=model,
-        tokenizer=tokenizer,
-        batch_size=args.batch_size,
-        max_batch_size=args.max_batch_size,
-    )
-    logger.info("Custom model, tokenizer, and HFLM wrapper loaded successfully")
-    return model, tokenizer, hf_model
-def track_expert_usage(model, input_ids: torch.Tensor) -> List[Dict[int, int]]:
-    """
-    Track expert usage per layer during a single forward pass.
-    Forward hooks are attached to each layer's ``moe`` submodule, the model is
-    run once under ``torch.no_grad()``, and the hooks are always removed
-    afterwards, even if the forward pass raises.
-    Args:
-        model: MyOLMoE model
-        input_ids: Input token IDs (batched)
-    Returns:
-        List of dictionaries, one per hidden layer, each mapping expert indices
-        to their usage counts for that layer
-    """
-    expert_usage = [{} for _ in range(model.config.num_hidden_layers)]
-    def record(layer_idx, expert_indices):
-        # Add one count per selected expert to this layer's tally
-        counts = expert_usage[layer_idx]
-        for expert_idx in expert_indices.flatten().tolist():
-            counts[expert_idx] = counts.get(expert_idx, 0) + 1
-    def hook_fn(module, inputs, output, layer_idx):
-        # NOTE(review): `selected_experts` / `routing_weights` are assumed
-        # attributes of the MoE module - confirm the module actually sets them.
-        if hasattr(module, 'selected_experts'):
-            record(layer_idx, module.selected_experts)
-        elif hasattr(module, 'routing_weights'):
-            # NOTE(review): confirm the config exposes `top_k`
-            top_k_indices = torch.topk(module.routing_weights, k=model.config.top_k, dim=-1).indices
-            record(layer_idx, top_k_indices)
-    hooks = []
-    try:
-        # NOTE(review): `model.transformer.layers` and `layer.moe` must match
-        # the actual model structure - adjust if the attribute names differ.
-        for i, layer in enumerate(model.transformer.layers):
-            if hasattr(layer, 'moe'):
-                # Bind the index as a default argument: a plain closure would
-                # see the loop variable's final value in every hook.
-                hooks.append(layer.moe.register_forward_hook(
-                    lambda m, inp, out, layer_idx=i: hook_fn(m, inp, out, layer_idx)
-                ))
-        with torch.no_grad():
-            model(input_ids)
-    finally:
-        for hook in hooks:
-            hook.remove()
-    return expert_usage
-def run_evaluation_with_tracking(model, hf_model, tokenizer, args) -> Tuple[Dict[str, Any], Dict[str, List[Dict[int, int]]]]:
-    """
-    Run evaluation on benchmark tasks and track expert usage.
-    ``hf_model.loglikelihood`` is temporarily replaced by a wrapper that records
-    expert usage for each request's context before delegating to the original
-    method; the original is restored even if evaluation fails.
-    Args:
-        model: MyOLMoE model
-        hf_model: HFLM wrapper for evaluation
-        tokenizer: Tokenizer
-        args: Parsed command line arguments
-    Returns:
-        Tuple of (evaluation results, task-wise expert usage). The usage dict has
-        an entry for every task in ``args.tasks`` plus any other task name seen
-        on a request.
-    """
-    logger.info(f"Running evaluation on tasks: {args.tasks}")
-    logger.info(f"Few-shot examples: {args.num_fewshot}")
-    logger.info(f"Batch size: {args.batch_size}")
-    # Initialize expert usage tracking for each task
-    task_expert_usage = {task: [] for task in args.tasks}
-    original_loglikelihood = hf_model.loglikelihood
-    # Custom per-request processing to track expert usage
-    def custom_loglikelihood(self, requests):
-        res = []
-        for request in requests:
-            input_ids = tokenizer(request.arguments[0], return_tensors="pt").input_ids.to(model.device)
-            # Track expert usage; a request may carry a task name that was not
-            # listed on the command line, so create its bucket on demand.
-            batch_expert_usage = track_expert_usage(model, input_ids)
-            task_expert_usage.setdefault(request.task_name, []).append(batch_expert_usage)
-            # Delegate scoring to the method this wrapper replaced
-            res.append(original_loglikelihood([request]))
-        return [item for sublist in res for item in sublist]
-    # Override HFLM's loglikelihood method for the duration of the evaluation
-    hf_model.loglikelihood = custom_loglikelihood.__get__(hf_model, HFLM)
-    try:
-        results = evaluator.simple_evaluate(
-            model=hf_model,
-            tasks=args.tasks,
-            num_fewshot=args.num_fewshot,
-            limit=args.limit,
-            batch_size=args.batch_size,
-            max_batch_size=args.max_batch_size,
-        )
-    finally:
-        # Restore original method
-        hf_model.loglikelihood = original_loglikelihood
-    # Aggregate expert usage per task: sum the per-request counts layer by layer
-    aggregated_usage = {}
-    for task, per_request_usage in task_expert_usage.items():
-        if per_request_usage:
-            aggregated_usage[task] = [
-                {k: sum(d.get(k, 0) for d in layer_usages) for k in set().union(*layer_usages)}
-                for layer_usages in zip(*per_request_usage)
-            ]
-        else:
-            aggregated_usage[task] = [{} for _ in range(model.config.num_hidden_layers)]
-    logger.info("Evaluation and expert usage tracking completed")
-    return results, aggregated_usage
-def make_serializable(obj: Any) -> Any:
-    """
-    Convert objects to JSON-serializable format.
-    Containers are converted recursively. NumPy scalars (including ``np.bool_``)
-    become Python scalars, NumPy arrays and torch tensors become nested lists,
-    and NumPy/torch dtypes become strings. Anything else is returned unchanged.
-    Args:
-        obj: Object to convert
-    Returns:
-        JSON-serializable version of the object
-    """
-    if isinstance(obj, dict):
-        # NumPy scalar keys are rejected by json.dump, so unwrap them as well
-        return {
-            (k.item() if isinstance(k, np.generic) else k): make_serializable(v)
-            for k, v in obj.items()
-        }
-    if isinstance(obj, list):
-        return [make_serializable(v) for v in obj]
-    if isinstance(obj, tuple):
-        return tuple(make_serializable(v) for v in obj)
-    if isinstance(obj, np.generic):
-        return obj.item()
-    if isinstance(obj, (np.ndarray, torch.Tensor)):
-        return obj.tolist()
-    if isinstance(obj, (np.dtype, torch.dtype)):
-        return str(obj)
-    return obj
-def save_results(results: Dict[str, Any], expert_usage: Dict[str, List[Dict[int, int]]], args) -> str:
-    """
-    Save evaluation results and expert usage to a JSON file.
-    When ``args.output_filename`` is not set, the name is derived from the model
-    name and the evaluated tasks (the first three, plus a count of the rest), so
-    runs on different task sets do not overwrite each other.
-    Args:
-        results: Evaluation results
-        expert_usage: Expert usage per task and layer
-        args: Parsed command line arguments
-    Returns:
-        str: Path to saved results file
-    """
-    os.makedirs(args.output_dir, exist_ok=True)
-    # Generate filename
-    if args.output_filename is None:
-        model_name = os.path.basename(args.model_path.rstrip('/'))
-        tasks_str = "_".join(args.tasks[:3])
-        if len(args.tasks) > 3:
-            tasks_str += f"_and_{len(args.tasks)-3}_more"
-        filename = f"{model_name}_{tasks_str}_eval_expert_usage.json"
-    else:
-        filename = args.output_filename
-    if not filename.endswith('.json'):
-        filename += '.json'
-    output_path = os.path.join(args.output_dir, filename)
-    # Prepare results
-    results_with_metadata = {
-        "metadata": {
-            "model_path": args.model_path,
-            "tasks": args.tasks,
-            "num_fewshot": args.num_fewshot,
-            "batch_size": args.batch_size,
-            "device": args.device,
-            "dtype": args.dtype,
-            "limit": args.limit,
-            "routing_type": "top-k (default)",
-        },
-        "results": results,
-        # JSON object keys must be strings, so expert indices are stringified
-        "expert_usage": {
-            task: [{str(k): v for k, v in layer_usage.items()} for layer_usage in task_usage]
-            for task, task_usage in expert_usage.items()
-        }
-    }
-    # Convert to JSON-serializable format
-    serializable_results = make_serializable(results_with_metadata)
-    # Save to file
-    with open(output_path, 'w', encoding='utf-8') as f:
-        json.dump(serializable_results, f, indent=2)
-    logger.info(f"Results saved to {output_path}")
-    return output_path
-def print_summary(results: Dict[str, Any], expert_usage: Dict[str, List[Dict[int, int]]], args) -> None:
-    """
-    Print a summary of evaluation results and expert usage.
-    Numeric metrics are printed with their standard error. Metric keys may be
-    plain (``acc``) or carry a suffix after a comma (``acc,none``); the matching
-    stderr entry is looked up as ``acc_stderr`` / ``acc_stderr,none``. A missing
-    stderr prints as 0 and a non-numeric one (e.g. ``"N/A"``) is printed as-is.
-    Args:
-        results: Evaluation results
-        expert_usage: Expert usage per task and layer
-        args: Parsed command line arguments
-    """
-    banner = '=' * 80
-    print(f"\n{banner}")
-    print("EVALUATION SUMMARY")
-    print(f"Model: {args.model_path}")
-    print(f"Tasks: {', '.join(args.tasks)}")
-    print(banner)
-    if "results" in results:
-        for task, metrics in results["results"].items():
-            if not isinstance(metrics, dict):
-                continue
-            print(f"\n📊 {task.upper()}:")
-            for metric, value in metrics.items():
-                name, sep, suffix = metric.partition(",")
-                # Skip non-numeric entries and the stderr entries themselves
-                if not isinstance(value, (int, float)) or name.endswith('_stderr'):
-                    continue
-                stderr = metrics.get(f"{name}_stderr{sep}{suffix}", 0)
-                if isinstance(stderr, (int, float)):
-                    print(f"   {metric:.<20} {value:.4f} (±{stderr:.4f})")
-                else:
-                    # A float format spec would raise on a string stderr
-                    print(f"   {metric:.<20} {value:.4f} (±{stderr})")
-    print("\nEXPERT USAGE PER TASK AND LAYER")
-    for task, task_usage in expert_usage.items():
-        print(f"\nTask: {task.upper()}")
-        for i, layer_usage in enumerate(task_usage):
-            print(f"  Layer {i}:")
-            for expert_idx, count in layer_usage.items():
-                print(f"    Expert {expert_idx}: {count} times")
-    print(f"\n{banner}")
-def main():
-    """Main function for evaluation with expert usage tracking.
-    Parses arguments, loads the model, runs the tracked evaluation, then saves
-    and prints the results. Exits with status 1 on interruption or failure.
-    """
-    args = parse_args()
-    try:
-        logger.info("="*80)
-        logger.info("Starting MyOLMoE Evaluation with Expert Usage Tracking")
-        logger.info("="*80)
-        # Load model, tokenizer, and HFLM wrapper
-        model, tokenizer, hf_model = load_custom_model(args)
-        # Run evaluation with expert usage tracking
-        results, expert_usage = run_evaluation_with_tracking(model, hf_model, tokenizer, args)
-        # Save results
-        output_path = save_results(results, expert_usage, args)
-        # Print summary
-        print_summary(results, expert_usage, args)
-        logger.info("✅ Evaluation completed successfully!")
-        logger.info(f"📁 Results saved to: {output_path}")
-    except KeyboardInterrupt:
-        logger.info("Evaluation interrupted by user")
-        sys.exit(1)
-    except Exception as e:
-        # Log at ERROR level with the traceback attached: logging is configured
-        # at INFO, so a DEBUG-level traceback would never be emitted.
-        logger.exception(f"❌ Evaluation failed: {e}")
-        sys.exit(1)
-if __name__ == "__main__":
-    main()