irodkin committed on
Commit
8eadfe3
·
verified ·
1 Parent(s): 4d6c6fe

Training checkpoint at step 9000

Browse files
Files changed (1) hide show
  1. modeling_armt.py +3 -3
modeling_armt.py CHANGED
@@ -440,8 +440,6 @@ def attn_mask_to_4d(attn_mask, upper, query_len):
440
  return mask
441
 
442
  def invert_attn_mask(attn_mask, dtype):
443
- if os.environ.get("NOT_INVERT_ATTN_MASK"):
444
- return attn_mask
445
  min_dtype = torch.finfo(dtype).min
446
  # Use the same dtype as attn_mask to avoid dtype conversion
447
  one = torch.tensor(1.0, dtype=attn_mask.dtype, device=attn_mask.device)
@@ -1829,7 +1827,7 @@ except Exception as e:
1829
  raise e
1830
 
1831
  # Reuse utilities from the existing implementation to ensure identical math
1832
- # inlined language_modeling: removed import DPFP, invert_attn_mask, attn_mask_to_4d
1833
 
1834
  def reverse_invert_attn_mask(mask: torch.Tensor) -> torch.Tensor:
1835
  if os.environ.get("NOT_INVERT_ATTN_MASK"):
@@ -1856,6 +1854,8 @@ def is_empty_past_key_values(past_key_values: Optional[DynamicCache], layer_idx:
1856
  return True
1857
  return False
1858
 
 
 
1859
  def segment_tensor(t: torch.Tensor, start_idx: int, end_idx: int, seq_len: int) -> torch.Tensor:
1860
  if not isinstance(t, torch.Tensor):
1861
  return t
 
440
  return mask
441
 
442
  def invert_attn_mask(attn_mask, dtype):
 
 
443
  min_dtype = torch.finfo(dtype).min
444
  # Use the same dtype as attn_mask to avoid dtype conversion
445
  one = torch.tensor(1.0, dtype=attn_mask.dtype, device=attn_mask.device)
 
1827
  raise e
1828
 
1829
  # Reuse utilities from the existing implementation to ensure identical math
1830
+ # inlined language_modeling: removed import DPFP, invert_attn_mask as _invert_attn_mask, attn_mask_to_4d
1831
 
1832
  def reverse_invert_attn_mask(mask: torch.Tensor) -> torch.Tensor:
1833
  if os.environ.get("NOT_INVERT_ATTN_MASK"):
 
1854
  return True
1855
  return False
1856
 
1857
+ invert_attn_mask = lambda mask, dtype: (_invert_attn_mask(mask, dtype) if not os.environ.get("NOT_INVERT_ATTN_MASK") else mask)
1858
+
1859
  def segment_tensor(t: torch.Tensor, start_idx: int, end_idx: int, seq_len: int) -> torch.Tensor:
1860
  if not isinstance(t, torch.Tensor):
1861
  return t