zaydzuhri committed
Commit 5fe432f · verified · 1 parent: 1a7a6e1

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. fla/layers/__init__.py +44 -0
  2. fla/layers/nsa.py +138 -0
  3. fla/models/bitnet/__init__.py +13 -0
  4. fla/models/delta_net/__pycache__/configuration_delta_net.cpython-312.pyc +0 -0
  5. fla/models/delta_net/__pycache__/modeling_delta_net.cpython-312.pyc +0 -0
  6. fla/models/forgetting_transformer/__pycache__/__init__.cpython-312.pyc +0 -0
  7. fla/models/forgetting_transformer/__pycache__/modeling_forgetting_transformer.cpython-312.pyc +0 -0
  8. fla/models/gated_deltanet/__pycache__/__init__.cpython-312.pyc +0 -0
  9. fla/models/gated_deltanet/__pycache__/configuration_gated_deltanet.cpython-312.pyc +0 -0
  10. fla/models/gated_deltanet/__pycache__/modeling_gated_deltanet.cpython-312.pyc +0 -0
  11. fla/models/gated_deltaproduct/__pycache__/__init__.cpython-312.pyc +0 -0
  12. fla/models/gated_deltaproduct/__pycache__/configuration_gated_deltaproduct.cpython-312.pyc +0 -0
  13. fla/models/gated_deltaproduct/__pycache__/modeling_gated_deltaproduct.cpython-312.pyc +0 -0
  14. fla/models/gla/__pycache__/configuration_gla.cpython-312.pyc +0 -0
  15. fla/models/gsa/__pycache__/configuration_gsa.cpython-312.pyc +0 -0
  16. fla/models/gsa/__pycache__/modeling_gsa.cpython-312.pyc +0 -0
  17. fla/models/hgrn/__pycache__/configuration_hgrn.cpython-312.pyc +0 -0
  18. fla/models/lightnet/__pycache__/__init__.cpython-312.pyc +0 -0
  19. fla/models/lightnet/__pycache__/configuration_lightnet.cpython-312.pyc +0 -0
  20. fla/models/lightnet/__pycache__/modeling_lightnet.cpython-312.pyc +0 -0
  21. fla/models/linear_attn/__pycache__/__init__.cpython-312.pyc +0 -0
  22. fla/models/linear_attn/__pycache__/modeling_linear_attn.cpython-312.pyc +0 -0
  23. fla/models/mamba/__pycache__/configuration_mamba.cpython-312.pyc +0 -0
  24. fla/models/mamba/__pycache__/modeling_mamba.cpython-312.pyc +0 -0
  25. fla/models/nsa/__pycache__/configuration_nsa.cpython-312.pyc +0 -0
  26. fla/models/nsa/__pycache__/modeling_nsa.cpython-312.pyc +0 -0
  27. fla/models/retnet/__pycache__/__init__.cpython-312.pyc +0 -0
  28. fla/models/retnet/__pycache__/modeling_retnet.cpython-312.pyc +0 -0
  29. fla/models/retnet/modeling_retnet.py +425 -0
  30. fla/models/transformer/__init__.py +13 -0
  31. fla/models/transformer/__pycache__/__init__.cpython-312.pyc +0 -0
  32. fla/models/transformer/__pycache__/configuration_transformer.cpython-312.pyc +0 -0
  33. fla/models/transformer/__pycache__/modeling_transformer.cpython-312.pyc +0 -0
  34. fla/models/transformer/configuration_transformer.py +71 -0
  35. fla/models/transformer_mtp/__pycache__/configuration_transformer.cpython-312.pyc +0 -0
  36. fla/models/transformer_mtp/__pycache__/modeling_transformer.cpython-312.pyc +0 -0
  37. fla/models/transformer_top/__init__.py +13 -0
  38. fla/modules/__pycache__/feature_map.cpython-312.pyc +0 -0
  39. fla/modules/__pycache__/fused_bitlinear.cpython-312.pyc +0 -0
  40. fla/modules/__pycache__/fused_cross_entropy.cpython-312.pyc +0 -0
  41. fla/modules/__pycache__/fused_kl_div.cpython-312.pyc +0 -0
  42. fla/modules/__pycache__/fused_linear_listnet_loss.cpython-312.pyc +0 -0
  43. fla/modules/__pycache__/fused_norm_gate.cpython-312.pyc +0 -0
  44. fla/modules/__pycache__/grpo.cpython-312.pyc +0 -0
  45. fla/modules/__pycache__/l2norm.cpython-312.pyc +0 -0
  46. fla/modules/__pycache__/mlp.cpython-312.pyc +0 -0
  47. fla/modules/__pycache__/rotary.cpython-312.pyc +0 -0
  48. fla/modules/__pycache__/seq_to_top.cpython-312.pyc +0 -0
  49. logs/none_enyj3lod/attempt_0/0/stdout.log +117 -0
  50. logs/none_enyj3lod/attempt_0/1/stderr.log +0 -0
fla/layers/__init__.py ADDED
@@ -0,0 +1,44 @@
+ # -*- coding: utf-8 -*-
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+ from .abc import ABCAttention
+ from .attn import Attention
+ from .based import BasedLinearAttention
+ from .bitattn import BitAttention
+ from .delta_net import DeltaNet
+ from .forgetting_attn import ForgettingAttention
+ from .gated_deltanet import GatedDeltaNet
+ from .gated_deltaproduct import GatedDeltaProduct
+ from .gla import GatedLinearAttention
+ from .gsa import GatedSlotAttention
+ from .hgrn import HGRNAttention
+ from .hgrn2 import HGRN2Attention
+ from .lightnet import LightNetAttention
+ from .linear_attn import LinearAttention
+ from .multiscale_retention import MultiScaleRetention
+ from .nsa import NativeSparseAttention
+ from .rebased import ReBasedLinearAttention
+ from .rwkv6 import RWKV6Attention
+ from .rwkv7 import RWKV7Attention
+
+ __all__ = [
+     'ABCAttention',
+     'Attention',
+     'BasedLinearAttention',
+     'BitAttention',
+     'DeltaNet',
+     'ForgettingAttention',
+     'GatedDeltaNet',
+     'GatedDeltaProduct',
+     'GatedLinearAttention',
+     'GatedSlotAttention',
+     'HGRNAttention',
+     'HGRN2Attention',
+     'LightNetAttention',
+     'LinearAttention',
+     'MultiScaleRetention',
+     'NativeSparseAttention',
+     'ReBasedLinearAttention',
+     'RWKV6Attention',
+     'RWKV7Attention',
+ ]
fla/layers/nsa.py ADDED
@@ -0,0 +1,138 @@
+ # -*- coding: utf-8 -*-
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING, Optional, Tuple, Union
+
+ import torch
+ import torch.nn as nn
+ from einops import rearrange
+ from transformers.utils import logging
+
+ from fla.modules import RotaryEmbedding
+ from fla.ops.nsa.parallel import parallel_nsa
+
+ if TYPE_CHECKING:
+     from fla.models.utils import Cache
+
+ logger = logging.get_logger(__name__)
+
+
+ class NativeSparseAttention(nn.Module):
+
+     def __init__(
+         self,
+         hidden_size: int = 2048,
+         num_heads: int = 64,
+         num_kv_heads: Optional[int] = 4,
+         head_dim: int = 64,
+         qkv_bias: bool = False,
+         block_size: Optional[int] = 64,
+         block_counts: Optional[Union[torch.LongTensor, int]] = 16,
+         window_size: Optional[int] = 512,
+         rope_theta: Optional[float] = 10000.,
+         max_position_embeddings: Optional[int] = None,
+         layer_idx: int = None
+     ):
+         super().__init__()
+
+         self.hidden_size = hidden_size
+         self.num_heads = num_heads
+         if num_kv_heads is None:
+             self.num_kv_heads = self.num_heads
+         else:
+             self.num_kv_heads = num_kv_heads
+         self.num_kv_groups = num_heads // self.num_kv_heads
+         self.head_dim = head_dim
+         self.kv_dim = self.num_kv_heads * self.head_dim
+         self.qkv_bias = qkv_bias
+
+         self.block_size = block_size
+         self.block_counts = block_counts
+         self.window_size = window_size
+         self.rope_theta = rope_theta
+         self.max_position_embeddings = max_position_embeddings
+         self.layer_idx = layer_idx
+
+         self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=self.qkv_bias)
+         self.k_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=self.qkv_bias)
+         self.v_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=self.qkv_bias)
+         self.g_proj = nn.Linear(self.hidden_size, self.num_heads * 3, bias=False)
+         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+
+         self.rotary = RotaryEmbedding(dim=self.head_dim, base=self.rope_theta)
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[Cache] = None,
+         output_attentions: bool = False,
+         use_cache: bool = False,
+         **kwargs,
+     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+         if attention_mask is not None:
+             assert len(attention_mask.shape) == 2, (
+                 "Expected attention_mask as a 0-1 matrix with shape [batch_size, seq_len] "
+                 "for padding purposes (0 indicating padding). "
+                 "Arbitrary attention masks of shape [batch_size, seq_len, seq_len] are not allowed."
+             )
+
+         batch_size, seq_len, _ = hidden_states.size()
+
+         q = rearrange(self.q_proj(hidden_states), '... (h d) -> ... h d', d=self.head_dim)
+         k = rearrange(self.k_proj(hidden_states), '... (h d) -> ... h d', d=self.head_dim)
+         v = rearrange(self.v_proj(hidden_states), '... (h d) -> ... h d', d=self.head_dim)
+         g = rearrange(self.g_proj(hidden_states), '... (h d) -> ... h d', d=3)
+         g_cmp, g_slc, g_swa = g.sigmoid().unbind(-1)
+
+         cu_seqlens = kwargs.get('cu_seqlens', None)
+
+         seqlen_offset, max_seqlen = 0, seq_len
+         if past_key_values is not None:
+             seqlen_offset = past_key_values.get_seq_length(self.layer_idx)
+             max_seqlen = q.shape[1] + seqlen_offset
+
+             if attention_mask is not None:
+                 # to deliminate the offsets of padding tokens
+                 seqlen_offset = seqlen_offset + attention_mask.sum(-1) - attention_mask.shape[-1]
+                 max_seqlen = q.shape[1] + max(seqlen_offset)
+
+             if self.max_position_embeddings is not None:
+                 max_seqlen = max(max_seqlen, self.max_position_embeddings)
+         q, k = self.rotary(q, k, seqlen_offset=seqlen_offset, max_seqlen=max_seqlen, cu_seqlens=cu_seqlens)
+
+         if past_key_values is not None:
+             cache_has_content = past_key_values.get_seq_length(self.layer_idx) > 0
+             k_cached, v_cached = past_key_values.update(
+                 attn_state=(k.flatten(-2, -1), v.flatten(-2, -1)),
+                 layer_idx=self.layer_idx,
+                 offset=seq_len,
+                 cache_kwargs=dict(window_size=self.window_size)
+             )['attn_state']
+             if cache_has_content:
+                 k, v = k_cached, v_cached
+                 k = rearrange(k, '... (h d) -> ... h d', d=self.head_dim)
+                 v = rearrange(v, '... (h d) -> ... h d', d=self.head_dim)
+
+         o = parallel_nsa(
+             q=q,
+             k=k,
+             v=v,
+             g_cmp=g_cmp,
+             g_slc=g_slc,
+             g_swa=g_swa,
+             block_size=self.block_size,
+             block_counts=self.block_counts,
+             window_size=self.window_size,
+             cu_seqlens=cu_seqlens,
+             head_first=False
+         )
+         o = o.reshape(batch_size, seq_len, -1)
+         o = self.o_proj(o)
+
+         if not output_attentions:
+             attentions = None
+
+         return o, attentions, past_key_values
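For context, a minimal smoke-test sketch of the new NativeSparseAttention layer (not part of this commit). It assumes a CUDA device, since the underlying parallel_nsa kernel is Triton-based, and the sizes chosen here are purely illustrative.

# Hypothetical usage sketch; shapes and device requirements are assumptions.
import torch
from fla.layers.nsa import NativeSparseAttention

layer = NativeSparseAttention(hidden_size=256, num_heads=16, num_kv_heads=1, head_dim=64).to('cuda', torch.bfloat16)
x = torch.randn(2, 128, 256, device='cuda', dtype=torch.bfloat16)  # [batch, seq_len, hidden]
o, _, _ = layer(x)  # output keeps the input shape: [2, 128, 256]
print(o.shape)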
fla/models/bitnet/__init__.py ADDED
@@ -0,0 +1,13 @@
+ # -*- coding: utf-8 -*-
+
+ from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
+
+ from fla.models.bitnet.configuration_bitnet import BitNetConfig
+ from fla.models.bitnet.modeling_bitnet import BitNetForCausalLM, BitNetModel
+
+ AutoConfig.register(BitNetConfig.model_type, BitNetConfig)
+ AutoModel.register(BitNetConfig, BitNetModel)
+ AutoModelForCausalLM.register(BitNetConfig, BitNetForCausalLM)
+
+
+ __all__ = ['BitNetConfig', 'BitNetForCausalLM', 'BitNetModel']
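Because this __init__ registers the config and model classes with the transformers Auto* factories, importing the submodule is enough for the generic Auto API to resolve them. A hedged sketch of that pattern (the config values are illustrative, not from this commit); the same applies to the other registered model families in this upload:

# Illustrative only: relies on the AutoConfig/AutoModelForCausalLM registration above.
from transformers import AutoModelForCausalLM
from fla.models.bitnet import BitNetConfig

config = BitNetConfig(hidden_size=512, num_hidden_layers=4, vocab_size=32000)  # illustrative sizes
model = AutoModelForCausalLM.from_config(config)
print(type(model).__name__)  # BitNetForCausalLM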
fla/models/delta_net/__pycache__/configuration_delta_net.cpython-312.pyc ADDED
Binary file (3.59 kB). View file
 
fla/models/delta_net/__pycache__/modeling_delta_net.cpython-312.pyc ADDED
Binary file (18.5 kB). View file
 
fla/models/forgetting_transformer/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (817 Bytes). View file
 
fla/models/forgetting_transformer/__pycache__/modeling_forgetting_transformer.cpython-312.pyc ADDED
Binary file (17.2 kB). View file
 
fla/models/gated_deltanet/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (746 Bytes). View file
 
fla/models/gated_deltanet/__pycache__/configuration_gated_deltanet.cpython-312.pyc ADDED
Binary file (3.34 kB). View file
 
fla/models/gated_deltanet/__pycache__/modeling_gated_deltanet.cpython-312.pyc ADDED
Binary file (18.5 kB). View file
 
fla/models/gated_deltaproduct/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (777 Bytes). View file
 
fla/models/gated_deltaproduct/__pycache__/configuration_gated_deltaproduct.cpython-312.pyc ADDED
Binary file (3.38 kB). View file
 
fla/models/gated_deltaproduct/__pycache__/modeling_gated_deltaproduct.cpython-312.pyc ADDED
Binary file (20.7 kB). View file
 
fla/models/gla/__pycache__/configuration_gla.cpython-312.pyc ADDED
Binary file (3.73 kB). View file
 
fla/models/gsa/__pycache__/configuration_gsa.cpython-312.pyc ADDED
Binary file (3.84 kB). View file
 
fla/models/gsa/__pycache__/modeling_gsa.cpython-312.pyc ADDED
Binary file (18.7 kB). View file
 
fla/models/hgrn/__pycache__/configuration_hgrn.cpython-312.pyc ADDED
Binary file (3.28 kB). View file
 
fla/models/lightnet/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (699 Bytes). View file
 
fla/models/lightnet/__pycache__/configuration_lightnet.cpython-312.pyc ADDED
Binary file (3.36 kB). View file
 
fla/models/lightnet/__pycache__/modeling_lightnet.cpython-312.pyc ADDED
Binary file (18.3 kB). View file
 
fla/models/linear_attn/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (737 Bytes). View file
 
fla/models/linear_attn/__pycache__/modeling_linear_attn.cpython-312.pyc ADDED
Binary file (18.5 kB). View file
 
fla/models/mamba/__pycache__/configuration_mamba.cpython-312.pyc ADDED
Binary file (7.06 kB). View file
 
fla/models/mamba/__pycache__/modeling_mamba.cpython-312.pyc ADDED
Binary file (41.5 kB). View file
 
fla/models/nsa/__pycache__/configuration_nsa.cpython-312.pyc ADDED
Binary file (2.64 kB). View file
 
fla/models/nsa/__pycache__/modeling_nsa.cpython-312.pyc ADDED
Binary file (17.6 kB). View file
 
fla/models/retnet/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (682 Bytes). View file
 
fla/models/retnet/__pycache__/modeling_retnet.cpython-312.pyc ADDED
Binary file (18.4 kB). View file
 
fla/models/retnet/modeling_retnet.py ADDED
@@ -0,0 +1,425 @@
+ # -*- coding: utf-8 -*-
+
+ from __future__ import annotations
+
+ import math
+ import warnings
+ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
+
+ import torch
+ import torch.nn as nn
+ import torch.utils.checkpoint
+ from transformers.generation import GenerationMixin
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+ from transformers.modeling_utils import PreTrainedModel
+ from transformers.utils import logging
+ from transformers.utils.deprecation import deprecate_kwarg
+
+ from fla.layers.attn import Attention
+ from fla.layers.multiscale_retention import MultiScaleRetention
+ from fla.models.retnet.configuration_retnet import RetNetConfig
+ from fla.models.utils import Cache
+ from fla.modules import FusedCrossEntropyLoss, FusedLinearCrossEntropyLoss
+ from fla.modules import GatedMLP as RetNetMLP
+ from fla.modules import RMSNorm
+
+ if TYPE_CHECKING:
+     from transformers.processing_utils import Unpack
+
+ logger = logging.get_logger(__name__)
+
+
+ class RetNetBlock(nn.Module):
+     def __init__(self, config: RetNetConfig, layer_idx: int):
+         super().__init__()
+
+         self.config = config
+         self.layer_idx = layer_idx
+
+         self.attn_norm = (RMSNorm if config.fuse_norm else nn.RMSNorm)(config.hidden_size, eps=config.norm_eps)
+         if config.attn is not None and layer_idx in config.attn['layers']:
+             self.attn = Attention(
+                 hidden_size=config.hidden_size,
+                 num_heads=config.attn['num_heads'],
+                 num_kv_heads=config.attn['num_kv_heads'],
+                 qkv_bias=config.attn['qkv_bias'],
+                 window_size=config.attn['window_size'],
+                 rope_theta=config.attn['rope_theta'],
+                 max_position_embeddings=config.max_position_embeddings,
+                 layer_idx=layer_idx
+             )
+         else:
+             self.attn = MultiScaleRetention(
+                 mode=config.attn_mode,
+                 hidden_size=config.hidden_size,
+                 expand_k=config.expand_k,
+                 expand_v=config.expand_v,
+                 num_heads=config.num_heads,
+                 num_kv_heads=config.num_kv_heads,
+                 feature_map=config.feature_map,
+                 use_output_gate=config.use_output_gate,
+                 gate_fn=config.hidden_act,
+                 elementwise_affine=config.elementwise_affine,
+                 norm_eps=config.norm_eps,
+                 fuse_norm=config.fuse_norm,
+                 layer_idx=layer_idx
+             )
+         self.mlp_norm = (RMSNorm if config.fuse_norm else nn.RMSNorm)(config.hidden_size, eps=config.norm_eps)
+         self.mlp = RetNetMLP(
+             hidden_size=config.hidden_size,
+             hidden_ratio=config.hidden_ratio,
+             intermediate_size=config.intermediate_size,
+             hidden_act=config.hidden_act,
+             fuse_swiglu=config.fuse_swiglu
+         )
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         past_key_values: Optional[List[torch.FloatTensor]] = None,
+         use_cache: Optional[bool] = False,
+         output_attentions: Optional[bool] = False,
+         **kwargs: Unpack[Dict]
+     ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+
+         residual = hidden_states
+
+         hidden_states = self.attn_norm(hidden_states)
+         hidden_states, attentions, past_key_values = self.attn(
+             hidden_states=hidden_states,
+             attention_mask=attention_mask,
+             past_key_values=past_key_values,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             **kwargs
+         )
+         if self.config.fuse_norm:
+             hidden_states, residual = self.mlp_norm(hidden_states, residual, True)
+         else:
+             hidden_states = residual + hidden_states
+             residual = hidden_states
+             hidden_states = self.mlp_norm(hidden_states)
+         hidden_states = self.mlp(hidden_states, **kwargs)
+         hidden_states = residual + hidden_states
+
+         outputs = (hidden_states, attentions, past_key_values)
+
+         return outputs
+
+
+ class RetNetPreTrainedModel(PreTrainedModel):
+
+     config_class = RetNetConfig
+     base_model_prefix = 'model'
+     supports_gradient_checkpointing = True
+     _no_split_modules = ['RetNetBlock']
+     _supports_cache_class = True
+
+     def __init__(self, *inputs, **kwargs):
+         super().__init__(*inputs, **kwargs)
+
+     def _init_weights(
+         self,
+         module: nn.Module,
+         prenorm_residual_strategy: Optional[str] = 'rescale',
+         num_residuals_per_layer: int = 2,
+     ):
+         if isinstance(module, (nn.Linear, nn.Conv1d)):
+             # Slightly different from the TF version which uses truncated_normal for initialization
+             # cf https://github.com/pytorch/pytorch/pull/5617
+             nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
+             if module.bias is not None:
+                 nn.init.zeros_(module.bias)
+         elif isinstance(module, nn.Embedding):
+             nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
+         elif hasattr(module, 'reset_parameters'):
+             module.reset_parameters()
+
+         if prenorm_residual_strategy is not None:
+             # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
+             # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
+             # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
+             # > -- GPT-2 :: https://openai.com/blog/better-language-models/
+             #
+             # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
+             p = None
+             if hasattr(module, 'o_proj'):
+                 p = module.o_proj.weight
+             elif hasattr(module, 'down_proj'):
+                 p = module.down_proj.weight
+             if p is not None:
+                 # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
+                 # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
+                 # We need to reinit p since this code could be called multiple times
+                 # Having just p *= scale would repeatedly scale it down
+                 if prenorm_residual_strategy == 'rescale':
+                     nn.init.kaiming_uniform_(p, a=math.sqrt(5))
+                     with torch.no_grad():
+                         p /= math.sqrt(num_residuals_per_layer * self.config.num_hidden_layers)
+                 elif prenorm_residual_strategy == 'zero':
+                     nn.init.zeros_(p)
+                 else:
+                     raise ValueError(f"Invalid prenorm_residual_strategy: {prenorm_residual_strategy}")
+
+
+ class RetNetModel(RetNetPreTrainedModel):
+
+     def __init__(self, config: RetNetConfig):
+         super().__init__(config)
+         self.padding_idx = config.pad_token_id
+         self.vocab_size = config.vocab_size
+
+         self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+         self.layers = nn.ModuleList(
+             [RetNetBlock(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+         )
+         self.norm = (RMSNorm if config.fuse_norm else nn.RMSNorm)(config.hidden_size, eps=config.norm_eps)
+
+         self.gradient_checkpointing = False
+
+         self.post_init()
+
+     def get_input_embeddings(self):
+         return self.embeddings
+
+     def set_input_embeddings(self, value):
+         self.embeddings = value
+
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,  # noqa
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         past_key_values: Optional[List[torch.FloatTensor]] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         **kwargs: Unpack[Dict]
+     ) -> Union[Tuple, BaseModelOutputWithPast]:
+         if output_attentions:
+             warnings.warn(
+                 "`RetNetModel` does not support output attention weights now, so `output_attentions` is set to `False`."
+             )
+             output_attentions = False
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False)
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         # retrieve input_ids and inputs_embeds
+         if input_ids is not None and inputs_embeds is not None:
+             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+         if input_ids is None and inputs_embeds is None:
+             raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+         if inputs_embeds is None:
+             inputs_embeds = self.embeddings(input_ids)
+         hidden_states = inputs_embeds
+
+         if use_cache and not isinstance(past_key_values, Cache):
+             past_key_values = Cache.from_legacy_cache(past_key_values)
+
+         if self.gradient_checkpointing and self.training and use_cache:
+             logger.warning_once("`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")
+             use_cache = False
+
+         all_hidden_states = () if output_hidden_states else None
+         all_attns = () if output_attentions else None
+         for layer in self.layers:
+             if output_hidden_states:
+                 all_hidden_states += (hidden_states,)
+
+             if self.gradient_checkpointing and self.training:
+                 hidden_states, attentions, past_key_values = self._gradient_checkpointing_func(
+                     layer.__call__,
+                     hidden_states,
+                     attention_mask,
+                     past_key_values,
+                     use_cache,
+                     output_attentions,
+                     **kwargs
+                 )
+             else:
+                 hidden_states, attentions, past_key_values = layer(
+                     hidden_states,
+                     attention_mask=attention_mask,
+                     past_key_values=past_key_values,
+                     use_cache=use_cache,
+                     output_attentions=output_attentions,
+                     **kwargs
+                 )
+
+             if output_attentions:
+                 all_attns += (attentions,)
+
+         hidden_states = self.norm(hidden_states)
+
+         # add hidden states from the last decoder layer
+         if output_hidden_states:
+             all_hidden_states += (hidden_states,)
+
+         if not return_dict:
+             return tuple(i for i in [hidden_states, past_key_values, all_hidden_states, all_attns] if i is not None)
+         return BaseModelOutputWithPast(
+             last_hidden_state=hidden_states,
+             past_key_values=past_key_values,
+             hidden_states=all_hidden_states,
+             attentions=all_attns
+         )
+
+
+ class RetNetForCausalLM(RetNetPreTrainedModel, GenerationMixin):
+
+     _tied_weights_keys = ["lm_head.weight"]
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.model = RetNetModel(config)
+         self.vocab_size = config.vocab_size
+         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+         self.criterion = None
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def get_input_embeddings(self):
+         return self.model.embeddings
+
+     def set_input_embeddings(self, value):
+         self.model.embeddings = value
+
+     def get_output_embeddings(self):
+         return self.lm_head
+
+     def set_output_embeddings(self, new_embeddings):
+         self.lm_head = new_embeddings
+
+     def set_decoder(self, decoder):
+         self.model = decoder
+
+     def get_decoder(self):
+         return self.model
+
+     def generate(self, *args, **kwargs):
+         try:
+             return super().generate(*args, **kwargs)
+         except AttributeError as exception:
+             # Expected exception: "AttributeError: '(object name)' object has no attribute 'past_key_values'"
+             if 'past_key_values' in str(exception):
+                 raise AttributeError(
+                     f"You tried to call `generate` with a decoding strategy that manipulates `past_key_values`, "
+                     f"which is not supported for {self.__class__.__name__}. "
+                     f"Try another generation strategy instead. "
+                     f"For the available generation strategies, check this doc: "
+                     f"https://huggingface.co/docs/transformers/en/generation_strategies#decoding-strategies"
+                 )
+             else:
+                 raise exception
+
+     @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
+     def prepare_inputs_for_generation(
+         self,
+         input_ids: torch.LongTensor = None,
+         past_key_values: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         use_cache: Optional[bool] = True,
+         logits_to_keep: Optional[int] = None,
+         **kwargs: Unpack[Dict]
+     ):
+         # only last token for `inputs_ids` if the `past_key_values` is passed along.
+         if past_key_values is not None:
+             input_ids = input_ids[:, -1:]
+
+         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+         if inputs_embeds is not None and len(past_key_values) == 0:
+             model_inputs = {'inputs_embeds': inputs_embeds}
+         else:
+             # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise
+             # recompiles graphs as the stride of the inputs is a guard.
+             # Ref: https://github.com/huggingface/transformers/pull/29114
+             # TODO: use `next_tokens` directly instead.
+             model_inputs = {'input_ids': input_ids.contiguous()}
+
+         if logits_to_keep is not None:
+             model_inputs['logits_to_keep'] = logits_to_keep
+
+         model_inputs.update({
+             'past_key_values': past_key_values,
+             'use_cache': use_cache,
+             'attention_mask': attention_mask,
+         })
+         return model_inputs
+
+     @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
+     def forward(
+         self,
+         input_ids: torch.LongTensor = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         past_key_values: Optional[List[torch.FloatTensor]] = None,
+         labels: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         logits_to_keep: Optional[int] = 0,
+         **kwargs: Unpack[Dict]
+     ) -> Union[Tuple, CausalLMOutputWithPast]:
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+         outputs = self.model(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             inputs_embeds=inputs_embeds,
+             past_key_values=past_key_values,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+             **kwargs
+         )
+
+         hidden_states = outputs[0]
+         fuse_linear_and_cross_entropy = self.config.fuse_cross_entropy and self.training
+
+         loss, logits = None, None
+         if not fuse_linear_and_cross_entropy or labels is None:
+             logits = self.lm_head(hidden_states if logits_to_keep is None else hidden_states[:, -logits_to_keep:])
+         if labels is not None:
+             if getattr(self, 'criterion', None) is None:
+                 if fuse_linear_and_cross_entropy:
+                     criterion = FusedLinearCrossEntropyLoss()
+                 elif self.config.fuse_cross_entropy:
+                     criterion = FusedCrossEntropyLoss(inplace_backward=True)
+                 else:
+                     criterion = nn.CrossEntropyLoss()
+             else:
+                 criterion = self.criterion
+             labels = labels.to(hidden_states.device)
+             labels = torch.cat((labels[..., 1:], torch.full_like(labels[:, :1], criterion.ignore_index)), 1)
+             if fuse_linear_and_cross_entropy:
+                 loss = criterion(hidden_states, labels, self.lm_head.weight, self.lm_head.bias)
+             else:
+                 loss = criterion(logits.view(labels.numel(), -1), labels.view(-1))
+
+         if not return_dict:
+             output = (logits,) + outputs[1:]
+             return (loss,) + output if loss is not None else output
+
+         return CausalLMOutputWithPast(
+             loss=loss,
+             logits=logits,
+             past_key_values=outputs.past_key_values,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
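A hedged sketch of running the RetNet causal LM end to end (not part of the commit). It assumes a CUDA device, since the retention kernels are Triton-based; the tiny sizes are illustrative, and fuse_cross_entropy=False keeps the loss in plain PyTorch.

# Illustrative usage of RetNetForCausalLM; config field names follow configuration_retnet.py.
import torch
from fla.models.retnet import RetNetConfig, RetNetForCausalLM

config = RetNetConfig(hidden_size=256, num_hidden_layers=2, num_heads=4, vocab_size=1000, fuse_cross_entropy=False)
model = RetNetForCausalLM(config).to('cuda', torch.bfloat16)

input_ids = torch.randint(0, 1000, (2, 32), device='cuda')
out = model(input_ids=input_ids, labels=input_ids)  # labels are shifted inside forward
print(out.loss, out.logits.shape)  # logits: [2, 32, 1000]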
fla/models/transformer/__init__.py ADDED
@@ -0,0 +1,13 @@
+ # -*- coding: utf-8 -*-
+
+ from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
+
+ from fla.models.transformer.configuration_transformer import TransformerConfig
+ from fla.models.transformer.modeling_transformer import TransformerForCausalLM, TransformerModel
+
+ AutoConfig.register(TransformerConfig.model_type, TransformerConfig)
+ AutoModel.register(TransformerConfig, TransformerModel)
+ AutoModelForCausalLM.register(TransformerConfig, TransformerForCausalLM)
+
+
+ __all__ = ['TransformerConfig', 'TransformerForCausalLM', 'TransformerModel']
fla/models/transformer/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (728 Bytes). View file
 
fla/models/transformer/__pycache__/configuration_transformer.cpython-312.pyc ADDED
Binary file (2.52 kB). View file
 
fla/models/transformer/__pycache__/modeling_transformer.cpython-312.pyc ADDED
Binary file (17.1 kB). View file
 
fla/models/transformer/configuration_transformer.py ADDED
@@ -0,0 +1,71 @@
+ # -*- coding: utf-8 -*-
+
+ from typing import Optional
+
+ from transformers.configuration_utils import PretrainedConfig
+
+
+ class TransformerConfig(PretrainedConfig):
+
+     model_type = 'transformer'
+     keys_to_ignore_at_inference = ['past_key_values']
+
+     def __init__(
+         self,
+         hidden_size: int = 2048,
+         num_hidden_layers: int = 24,
+         num_heads: int = 32,
+         num_kv_heads: int = None,
+         qkv_bias: bool = False,
+         qk_norm: bool = False,
+         window_size: Optional[int] = None,
+         rope_theta: Optional[float] = 10000.,
+         max_position_embeddings: int = 2048,
+         hidden_ratio: Optional[int] = 4,
+         intermediate_size: Optional[int] = None,
+         hidden_act: str = "swish",
+         initializer_range: float = 0.006,
+         elementwise_affine: Optional[bool] = True,
+         norm_eps: float = 1e-6,
+         use_cache: bool = True,
+         pad_token_id: int = None,
+         bos_token_id: int = 1,
+         eos_token_id: int = 2,
+         tie_word_embeddings: bool = False,
+         fuse_norm: bool = True,
+         fuse_swiglu: bool = True,
+         fuse_cross_entropy: bool = True,
+         vocab_size: int = 32000,
+         **kwargs,
+     ):
+         self.hidden_size = hidden_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_heads = num_heads
+         self.num_kv_heads = num_kv_heads
+         self.qkv_bias = qkv_bias
+         self.qk_norm = qk_norm
+         self.window_size = window_size
+         self.rope_theta = rope_theta
+         self.max_position_embeddings = max_position_embeddings
+
+         self.hidden_ratio = hidden_ratio
+         self.intermediate_size = intermediate_size
+         self.hidden_act = hidden_act
+
+         self.initializer_range = initializer_range
+         self.elementwise_affine = elementwise_affine
+         self.norm_eps = norm_eps
+         self.use_cache = use_cache
+
+         self.fuse_norm = fuse_norm
+         self.fuse_swiglu = fuse_swiglu
+         self.fuse_cross_entropy = fuse_cross_entropy
+         self.vocab_size = vocab_size
+
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
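A short, hedged example of how this config might be instantiated and round-tripped through the standard PretrainedConfig serialization; the chosen values are illustrative, not defaults of the commit.

# Illustrative only; importing the configuration module directly avoids pulling in the modeling code.
from fla.models.transformer.configuration_transformer import TransformerConfig

config = TransformerConfig(
    hidden_size=1024,
    num_hidden_layers=12,
    num_heads=16,
    num_kv_heads=4,    # fewer KV heads than query heads enables grouped-query attention
    window_size=None,  # None means full attention; an int enables sliding-window attention
    vocab_size=32000,
)
config.save_pretrained('./transformer-config')  # writes config.json
restored = TransformerConfig.from_pretrained('./transformer-config')
assert restored.num_kv_heads == 4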
fla/models/transformer_mtp/__pycache__/configuration_transformer.cpython-312.pyc ADDED
Binary file (2.69 kB). View file
 
fla/models/transformer_mtp/__pycache__/modeling_transformer.cpython-312.pyc ADDED
Binary file (24.9 kB). View file
 
fla/models/transformer_top/__init__.py ADDED
@@ -0,0 +1,13 @@
+ # -*- coding: utf-8 -*-
+
+ from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
+
+ from fla.models.transformer_top.configuration_transformer import TOPTransformerConfig
+ from fla.models.transformer_top.modeling_transformer import TOPTransformerForCausalLM, TOPTransformerModel
+
+ AutoConfig.register(TOPTransformerConfig.model_type, TOPTransformerConfig)
+ AutoModel.register(TOPTransformerConfig, TOPTransformerModel)
+ AutoModelForCausalLM.register(TOPTransformerConfig, TOPTransformerForCausalLM)
+
+
+ __all__ = ['TOPTransformerConfig', 'TOPTransformerForCausalLM', 'TOPTransformerModel']
fla/modules/__pycache__/feature_map.cpython-312.pyc ADDED
Binary file (17.6 kB). View file
 
fla/modules/__pycache__/fused_bitlinear.cpython-312.pyc ADDED
Binary file (23.6 kB). View file
 
fla/modules/__pycache__/fused_cross_entropy.cpython-312.pyc ADDED
Binary file (16 kB). View file
 
fla/modules/__pycache__/fused_kl_div.cpython-312.pyc ADDED
Binary file (11.7 kB). View file
 
fla/modules/__pycache__/fused_linear_listnet_loss.cpython-312.pyc ADDED
Binary file (17.8 kB). View file
 
fla/modules/__pycache__/fused_norm_gate.cpython-312.pyc ADDED
Binary file (35.3 kB). View file
 
fla/modules/__pycache__/grpo.cpython-312.pyc ADDED
Binary file (18.6 kB). View file
 
fla/modules/__pycache__/l2norm.cpython-312.pyc ADDED
Binary file (6.96 kB). View file
 
fla/modules/__pycache__/mlp.cpython-312.pyc ADDED
Binary file (6.23 kB). View file
 
fla/modules/__pycache__/rotary.cpython-312.pyc ADDED
Binary file (23.2 kB). View file
 
fla/modules/__pycache__/seq_to_top.cpython-312.pyc ADDED
Binary file (4.08 kB). View file
 
logs/none_enyj3lod/attempt_0/0/stdout.log ADDED
@@ -0,0 +1,117 @@
+ 2025-09-11T14:21:46.554380Z  WARN Status Code: 502. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T14:22:46.610973Z  WARN Status Code: 502. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T14:22:46.611254Z  WARN Status Code: 502. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T14:24:33.933975Z  WARN Status Code: 502. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T14:24:52.783218Z  WARN Status Code: 504. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:44:05.149637Z  WARN Status Code: 502. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:44:06.469994Z  WARN Status Code: 502. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:44:16.415816Z  WARN Status Code: 504. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:44:32.328096Z  WARN Status Code: 504. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:44:54.227188Z  WARN Status Code: 502. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:45:06.678665Z  WARN Status Code: 504. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:45:08.271528Z  WARN Status Code: 504. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:45:22.066729Z  WARN Status Code: 502. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:45:52.295651Z  WARN Status Code: 504. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:45:53.499997Z  WARN Status Code: 504. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:45:55.640780Z  WARN Status Code: 504. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:45:57.556113Z  WARN Status Code: 504. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:45:57.711116Z  WARN Status Code: 504. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:45:58.341464Z  WARN Status Code: 504. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:46:08.529175Z  WARN Status Code: 502. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:46:21.196718Z  WARN Status Code: 504. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:46:24.177897Z  WARN Status Code: 504. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:46:24.642941Z  WARN Status Code: 504. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:46:38.977757Z  WARN Status Code: 502. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:46:39.966855Z  WARN Status Code: 502. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:46:39.981239Z  WARN Status Code: 502. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:46:50.642843Z  WARN Status Code: 504. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:46:51.023410Z  WARN Status Code: 504. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:46:51.611516Z  WARN Status Code: 504. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:47:50.666111Z  WARN Status Code: 504. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:48:34.977417Z  WARN Status Code: 500. Retrying..., request_id: "01K4XJWHDKPTCC9T33QCYA6MNQ"
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-11T23:48:47.297181Z  WARN Status Code: 504. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-12T19:43:44.698287Z  WARN Status Code: 504. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-12T19:45:32.431801Z  WARN Status Code: 502. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-12T19:45:32.455977Z  WARN Status Code: 502. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-12T19:48:33.481189Z  WARN Status Code: 502. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-12T19:48:34.040804Z  WARN Status Code: 502. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-12T19:50:34.408642Z  WARN Status Code: 502. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
+ 2025-09-12T19:56:40.114757Z  WARN Status Code: 502. Retrying..., request_id: ""
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
+
logs/none_enyj3lod/attempt_0/1/stderr.log ADDED
The diff for this file is too large to render. See raw diff