Revert "18k checkpoint"
This reverts commit 555d9b9825121563de4138cbe24ac43bd5bf5f89.
- myolmoe/modeling_myolmoe.py +105 -155
- scripts/eval.py +50 -107
myolmoe/modeling_myolmoe.py
CHANGED
@@ -14,124 +14,107 @@ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from transformers.modeling_utils import PreTrainedModel
 from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
 from transformers.utils import logging
-# from transformers.models.olmoe.configuration_olmoe import
+# from transformers.models.olmoe.configuration_olmoe import OlmoeConfig
 from transformers.configuration_utils import PretrainedConfig
 from transformers.modeling_rope_utils import rope_config_validation
 
-from dataclasses import dataclass, field
-from typing import Optional, List, Dict, Any
-from transformers import PretrainedConfig
-
-@dataclass
-class MyOlmoeConfig(PretrainedConfig):
-    """
-    Configuration class for MyOlmoe model.
-    """
-    model_type: str = "olmoe"  # Keep as "olmoe" to match your trained model
-
-    # Core model parameters
-    vocab_size: int = 50304
-    hidden_size: int = 2048
-    intermediate_size: int = 1024
-    num_hidden_layers: int = 16
-    num_attention_heads: int = 16
-    num_key_value_heads: int = 16
-    max_position_embeddings: int = 4096
-
-    # Expert parameters
-    num_experts: int = 64
-    num_experts_per_tok: int = 2
-    num_small_experts: int = 0
-    small_expert_count: int = 64
-    small_expert_intermediate_ratio: int = 16
-    small_expert_intermediate_size: int = 0
-    small_expert_sparsity_coef: float = 0.1
-    small_expert_strategy: str = "constant"
-    max_small_expert_count: int = 64
-
-    # Attention parameters
-    attention_bias: bool = False
-    attention_dropout: float = 0.0
-    clip_qkv: Optional[float] = None
-
-    # Normalization and activation
-    hidden_act: str = "silu"
-    rms_norm_eps: float = 1e-05
-    norm_topk_prob: bool = False
-
-    # Router parameters
-    router_aux_loss_coef: float = 0.01
-    output_router_logits: bool = False
-
-    [... removed lines not recovered from the page ...]
-        config_dict['model_type'] = "olmoe"  # Keep as olmoe
+class OlmoeConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`OlmoeModel`].
+    [Previous docstring remains the same...]
+
+    Args:
+        [Previous args remain the same...]
+        small_expert_intermediate_ratio (`float`, *optional*, defaults to 0.5):
+            Ratio of intermediate size for small experts compared to regular experts.
+        small_expert_count (`int`, *optional*, defaults to 64):
+            Frequency of small experts - every Nth expert will be small.
+        small_expert_sparsity_coef (`float`, *optional*, defaults to 0.1):
+            Coefficient for small expert load balancing loss.
+    """
+    model_type = "olmoe"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=50304,
+        hidden_size=2048,
+        intermediate_size=2048,
+        num_hidden_layers=16,
+        num_attention_heads=16,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=4096,
+        initializer_range=0.02,
+        rms_norm_eps=1e-05,
+        use_cache=True,
+        pad_token_id=1,
+        bos_token_id=None,
+        eos_token_id=50279,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        clip_qkv=None,
+        num_experts_per_tok=8,
+        num_experts=64,
+        output_router_logits=False,
+        router_aux_loss_coef=0.01,
+        norm_topk_prob=False,
+        small_expert_intermediate_ratio=64,
+        small_expert_count=64,
+        small_expert_sparsity_coef=0.1,
+        small_expert_strategy="constant",  # increment
+        max_small_expert_count=64,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.clip_qkv = clip_qkv
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_experts = num_experts
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.norm_topk_prob = norm_topk_prob
+
+        # Small expert parameters
+        self.small_expert_intermediate_ratio = small_expert_intermediate_ratio
+        self.small_expert_count = small_expert_count
+        self.small_expert_sparsity_coef = small_expert_sparsity_coef
+        self.small_expert_strategy = small_expert_strategy
+        self.max_small_expert_count = max_small_expert_count
+
+        # Validate the correctness of rotary position embeddings parameters
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
 
 logger = logging.get_logger(__name__)

@@ -203,7 +186,7 @@ ALL_LAYERNORM_LAYERS.append(OlmoeRMSNorm)
 
 
 class OlmoeRotaryEmbedding(nn.Module):
-    def __init__(self, config: MyOlmoeConfig, device=None):
+    def __init__(self, config: OlmoeConfig, device=None):
         super().__init__()
         if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
             self.rope_type = config.rope_scaling.get(

@@ -289,7 +272,7 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 
 
 class OlmoeAttention(nn.Module):
-    def __init__(self, config: MyOlmoeConfig, layer_idx: Optional[int] = None):
+    def __init__(self, config: OlmoeConfig, layer_idx: Optional[int] = None):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx

@@ -574,14 +557,11 @@ class OlmoeSparseMoeBlock(nn.Module):
         self.num_experts = config.num_experts
         self.top_k = config.num_experts_per_tok
         self.norm_topk_prob = config.norm_topk_prob
-
-        #########
-        self.register_buffer('expert_usage_counts', torch.zeros(config.num_experts + config.max_small_expert_count, dtype=torch.long))
-        self.expert_usage_counts: torch.Tensor  # For type hinting
-        #########
 
+        # Determine if this block is in the second half
         in_second_half = layer_idx >= self.total_layers // 2
 
+        # Determine small expert count for this layer
        if in_second_half:
             second_half_idx = layer_idx - (self.total_layers // 2)
             num_second_half_blocks = self.total_layers - (self.total_layers // 2)

@@ -589,6 +569,7 @@ class OlmoeSparseMoeBlock(nn.Module):
             if config.small_expert_strategy == "constant":
                 self.num_small_experts = config.max_small_expert_count // num_second_half_blocks
             elif config.small_expert_strategy == "increment":
+                # Linearly scale small experts from 1 to max_small_expert_count
                 self.num_small_experts = (
                     (second_half_idx + 1) * config.max_small_expert_count // ((num_second_half_blocks * (num_second_half_blocks + 1)) // 2)
                 )

@@ -629,12 +610,6 @@ class OlmoeSparseMoeBlock(nn.Module):
         if self.norm_topk_prob:
             routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
 
-        #########
-        expert_indices = selected_experts.flatten()
-        unique_experts, counts = torch.unique(expert_indices, return_counts=True)
-        self.expert_usage_counts[unique_experts] += counts.to(self.expert_usage_counts.device)
-        #########
-
         final_hidden_states = torch.zeros_like(hidden_states)
         expert_mask = torch.nn.functional.one_hot(
             selected_experts,

@@ -656,35 +631,10 @@ class OlmoeSparseMoeBlock(nn.Module):
             final_hidden_states.index_add_(0, top_x, current_output.to(hidden_states.dtype))
 
         return final_hidden_states.view(batch_size, sequence_length, hidden_dim), combined_logits
-
-    #########
-    def __del__(self):
-        # Print expert usage statistics when the block is deconstructed
-        if hasattr(self, 'expert_usage_counts'):
-            total_usage = self.expert_usage_counts.sum().item()
-            if total_usage > 0:
-                print(f"\nExpert Usage Statistics for Layer {self.layer_idx}:")
-                print(f"Total tokens processed: {total_usage}")
-
-                # Regular experts
-                if self.num_experts > 0:
-                    regular_usage = self.expert_usage_counts[:self.num_experts]
-                    print("\nRegular Experts:")
-                    for i, count in enumerate(regular_usage):
-                        print(f"Expert {i}: {count.item()} uses ({count.item()/total_usage:.2%})")
-
-                # Small experts
-                if self.num_small_experts > 0:
-                    small_usage = self.expert_usage_counts[self.num_experts:self.num_experts+self.num_small_experts]
-                    print("\nSmall Experts:")
-                    for i, count in enumerate(small_usage):
-                        print(f"Small Expert {i}: {count.item()} uses ({count.item()/total_usage:.2%})")
-
-                print("\n")
-    #########
+
 
 class OlmoeDecoderLayer(nn.Module):
-    def __init__(self, config: MyOlmoeConfig, layer_idx: int):
+    def __init__(self, config: OlmoeConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
         self.self_attn = OLMOE_ATTENTION_CLASSES[config._attn_implementation](

@@ -740,7 +690,7 @@ class OlmoeDecoderLayer(nn.Module):
 
 
 class OlmoePreTrainedModel(PreTrainedModel):
-    config_class = MyOlmoeConfig
+    config_class = OlmoeConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
     _no_split_modules = ["OlmoeDecoderLayer"]

@@ -766,7 +716,7 @@ class OlmoePreTrainedModel(PreTrainedModel):
 
 
 class OlmoeModel(OlmoePreTrainedModel):
-    def __init__(self, config: MyOlmoeConfig):
+    def __init__(self, config: OlmoeConfig):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size

@@ -1171,4 +1121,4 @@ class MyOlmoeForCausalLM(OlmoePreTrainedModel, GenerationMixin):
             router_logits=outputs.router_logits,
         )
 
-__all__ = ["MyOlmoeForCausalLM", "OlmoeModel", "OlmoePreTrainedModel", "MyOlmoeConfig"]
+__all__ = ["MyOlmoeForCausalLM", "OlmoeModel", "OlmoePreTrainedModel", "OlmoeConfig"]
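Note on the small-expert schedule in the hunks above: with this config's defaults (num_hidden_layers=16, max_small_expert_count=64), only the second half of the layers receives small experts. A minimal sketch of the per-layer arithmetic, assuming those defaults (the helper is illustrative, not part of the file):

# Illustrative helper mirroring the allocation logic in OlmoeSparseMoeBlock.
def small_experts_per_layer(layer_idx, total_layers=16, max_small=64, strategy="constant"):
    half = total_layers // 2
    if layer_idx < half:
        return 0  # first-half layers get no small experts
    second_half_idx = layer_idx - half
    n = total_layers - half  # number of second-half blocks
    if strategy == "constant":
        return max_small // n  # 64 // 8 = 8 per second-half layer
    if strategy == "increment":
        # Triangular split: layer i gets a share proportional to (i + 1).
        return (second_half_idx + 1) * max_small // ((n * (n + 1)) // 2)
    raise ValueError(f"unknown strategy: {strategy}")

print([small_experts_per_layer(i, strategy="constant") for i in range(8, 16)])
# [8, 8, 8, 8, 8, 8, 8, 8]  -> 64 total
print([small_experts_per_layer(i, strategy="increment") for i in range(8, 16)])
# [1, 3, 5, 7, 8, 10, 12, 14]  -> 60 total

Because of the floor division, the "increment" schedule can allocate slightly fewer than max_small_expert_count overall (60 of 64 here).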
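The usage-tracking code dropped by this revert accumulates a histogram of router selections into a registered buffer. The idiom in isolation, as a toy standalone module (not the full MoE block):

import torch
import torch.nn as nn

class ExpertUsageCounter(nn.Module):
    # Isolates the counting idiom from the reverted OlmoeSparseMoeBlock code.
    def __init__(self, num_experts):
        super().__init__()
        # A buffer moves with the module across devices but is not a parameter.
        self.register_buffer('expert_usage_counts', torch.zeros(num_experts, dtype=torch.long))

    def update(self, selected_experts):
        # selected_experts: (num_tokens, top_k) expert indices from the router.
        unique_experts, counts = torch.unique(selected_experts.flatten(), return_counts=True)
        self.expert_usage_counts[unique_experts] += counts.to(self.expert_usage_counts.device)

counter = ExpertUsageCounter(num_experts=4)
counter.update(torch.tensor([[0, 2], [2, 3], [0, 2]]))
print(counter.expert_usage_counts)  # tensor([2, 0, 3, 1])

One caveat with the reverted reporting path: __del__ runs (if at all) during interpreter teardown, so printing statistics there is best-effort; an explicit report method called at the end of evaluation is the safer design.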
scripts/eval.py
CHANGED
@@ -183,6 +183,12 @@ def load_transformers_model(args) -> HFLM:
 def load_custom_model(args) -> HFLM:
     """
     Load custom MyOLMoE model (uses top-k routing by default).
+
+    Args:
+        args: Parsed command line arguments
+
+    Returns:
+        HFLM: Wrapped model ready for evaluation
     """
     logger.info(f"Loading custom MyOLMoE model: {args.model_path}")
     logger.info("Using top-k routing (default)")

@@ -195,84 +201,49 @@ def load_custom_model(args) -> HFLM:
         logger.warning(f"Custom model path not found: {args.custom_model_path}")
 
     try:
-        # Import custom model class
-        from modeling_myolmoe import MyOlmoeForCausalLM, MyOlmoeConfig
-        logger.info("Successfully imported MyOlmoeForCausalLM and MyOlmoeConfig")
-
-        # IMPORTANT: Register with "olmoe" since that's what your model was trained with
-        from transformers import AutoConfig, AutoModelForCausalLM
-        AutoConfig.register("olmoe", MyOlmoeConfig, exist_ok=True)  # Use exist_ok=True
-        AutoModelForCausalLM.register(MyOlmoeConfig, MyOlmoeForCausalLM, exist_ok=True)
-        logger.info("Registered MyOlmoeForCausalLM with MyOlmoeConfig for 'olmoe' type")
-
+        # Import custom model class
+        from modeling_myolmoe import MyOlmoeForCausalLM
+        logger.info("Successfully imported MyOlmoeForCausalLM")
     except ImportError as e:
         logger.error(f"Failed to import custom model: {e}")
         logger.error("Make sure the custom model code is available in the specified path")
         raise
 
-    # Load model
-    [... removed lines not recovered from the page ...]
-        # Add torch_dtype if specified
-        if args.dtype == "bfloat16":
-            model_kwargs['torch_dtype'] = torch.bfloat16
-        elif args.dtype == "float16":
-            model_kwargs['torch_dtype'] = torch.float16
-        elif args.dtype == "float32":
-            model_kwargs['torch_dtype'] = torch.float32
-
-        # Load model instance
-        model_instance = AutoModelForCausalLM.from_pretrained(
-            args.model_path,
-            **model_kwargs
-        )
-
-        logger.info(f"Loaded model type: {type(model_instance)}")
-
-        # Create HFLM wrapper
-        model = HFLM(
-            pretrained=model_instance,
-            tokenizer=tokenizer,
-            device=args.device,
-            batch_size=args.batch_size,
-            max_batch_size=args.max_batch_size
-        )
-
-    except Exception as e:
-        logger.error(f"Failed to load custom model: {e}")
-        logger.error(f"Error type: {type(e)}")
-        import traceback
-        logger.error(f"Traceback: {traceback.format_exc()}")
-        raise
+    # Load model configuration
+    config = AutoConfig.from_pretrained(
+        args.model_path,
+        trust_remote_code=args.trust_remote_code
+    )
 
+    logger.info("Model will use default top-k routing configuration")
+
+    # Determine torch dtype
+    if args.dtype == "auto":
+        torch_dtype = "auto"
+    else:
+        torch_dtype = {
+            "float16": torch.float16,
+            "bfloat16": torch.bfloat16,
+            "float32": torch.float32
+        }[args.dtype]
+
+    # Load the custom model
+    hf_model = MyOlmoeForCausalLM.from_pretrained(
+        args.model_path,
+        config=config,
+        torch_dtype=torch_dtype,
+        device_map="auto" if args.device == "auto" else None,
+        trust_remote_code=args.trust_remote_code
+    ).eval()
+
+    # Wrap in HFLM
+    model = HFLM(
+        pretrained=hf_model,
+        device=args.device,
+        batch_size=args.batch_size,
+        max_batch_size=args.max_batch_size,
+        dtype=args.dtype
+    )
 
     logger.info("Custom model loaded successfully")
     return model

@@ -368,41 +339,13 @@ def run_evaluation(args) -> Dict[str, Any]:
     logger.info(f"Few-shot examples: {args.num_fewshot}")
     logger.info(f"Batch size: {args.batch_size}")
 
-    [... removed lines not recovered from the page ...]
-        print("Model config: Not accessible")
-
-    # Ensure model is properly initialized
-    if hasattr(model, '_model') and model._model is not None:
-        logger.info("Model is properly loaded and wrapped")
-    else:
-        logger.warning("Model wrapper may not be properly initialized")
-
-    try:
-        results = evaluator.simple_evaluate(
-            model=model,
-            tasks=args.tasks,
-            num_fewshot=args.num_fewshot,
-            limit=args.limit,
-            write_out=args.write_out,
-        )
-    except Exception as e:
-        logger.error(f"Evaluation failed with error: {e}")
-        logger.error("This might be due to model registration or configuration issues")
-
-        # Additional debugging
-        logger.error(f"Model type: {type(model)}")
-        if hasattr(model, '_model'):
-            logger.error(f"Internal model type: {type(model._model)}")
-            if hasattr(model._model, 'config'):
-                logger.error(f"Internal model config type: {type(model._model.config)}")
-
-        raise
+    results = evaluator.simple_evaluate(
+        model=model,
+        tasks=args.tasks,
+        num_fewshot=args.num_fewshot,
+        limit=args.limit,
+        write_out=args.write_out,
+    )
 
     logger.info("Evaluation completed successfully")
     return results
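For context on the two loading paths in the hunk above: the reverted code made any checkpoint whose config.json declares model_type "olmoe" resolve to the custom classes via the Auto registry, while the restored code calls MyOlmoeForCausalLM.from_pretrained directly. A minimal sketch of the registration pattern, using the names from the diff (the checkpoint path is a placeholder):

import torch
from transformers import AutoConfig, AutoModelForCausalLM
from modeling_myolmoe import MyOlmoeConfig, MyOlmoeForCausalLM

# exist_ok=True overwrites the stock "olmoe" registration instead of raising.
AutoConfig.register("olmoe", MyOlmoeConfig, exist_ok=True)
AutoModelForCausalLM.register(MyOlmoeConfig, MyOlmoeForCausalLM, exist_ok=True)

# From here on, "olmoe" checkpoints load through the custom class.
model = AutoModelForCausalLM.from_pretrained("/path/to/checkpoint", torch_dtype=torch.bfloat16)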
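The restored run_evaluation body is the standard lm-evaluation-harness entry point. A minimal end-to-end sketch, assuming lm-eval v0.4-style APIs (checkpoint path, task, and limit are illustrative):

import lm_eval
from lm_eval.models.huggingface import HFLM

# HFLM accepts either a model name/path or an already-loaded PreTrainedModel.
hflm = HFLM(pretrained="/path/to/checkpoint", batch_size=8)

results = lm_eval.simple_evaluate(
    model=hflm,
    tasks=["hellaswag"],  # illustrative task
    num_fewshot=0,
    limit=100,            # cap examples per task for a quick smoke test
    write_out=False,
)
# Per-task metrics live under results["results"].
print(results["results"]["hellaswag"])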