mohammed-aljafry committed on
Commit
def2824
·
verified ·
1 Parent(s): 94dcf53

Final fix: Upload model with original __init__ and smart wrapper

Browse files
Files changed (2) hide show
  1. config.json +8 -8
  2. modeling_interfuser.py +25 -8
config.json CHANGED
@@ -4,24 +4,20 @@
4
  "InterfuserForHuggingFace"
5
  ],
6
  "img_size": 224,
 
 
7
  "in_chans": 3,
8
  "embed_dim": 256,
9
  "enc_depth": 6,
10
  "dec_depth": 6,
11
  "dim_feedforward": 2048,
 
12
  "rgb_backbone_name": "r50",
13
  "lidar_backbone_name": "r18",
14
  "num_heads": 8,
15
- "direct_concat": true,
16
- "use_different_backbone": true,
17
- "waypoints_pred_head": "gru",
18
- "traffic_pred_head_type": "det",
19
- "use_view_embed": true,
20
- "multi_view_img_size": 112,
21
- "patch_size": 8,
22
- "normalize_before": false,
23
  "dropout": 0.1,
24
  "end2end": false,
 
25
  "separate_view_attention": false,
26
  "separate_all_attention": false,
27
  "act_layer": null,
@@ -30,7 +26,11 @@
30
  "with_lidar": false,
31
  "with_right_left_sensors": true,
32
  "with_center_sensor": false,
 
 
33
  "reverse_pos": true,
 
 
34
  "use_mmad_pretrain": null,
35
  "auto_map": {
36
  "AutoModel": "modeling_interfuser.InterfuserForHuggingFace"
 
4
  "InterfuserForHuggingFace"
5
  ],
6
  "img_size": 224,
7
+ "multi_view_img_size": 112,
8
+ "patch_size": 8,
9
  "in_chans": 3,
10
  "embed_dim": 256,
11
  "enc_depth": 6,
12
  "dec_depth": 6,
13
  "dim_feedforward": 2048,
14
+ "normalize_before": false,
15
  "rgb_backbone_name": "r50",
16
  "lidar_backbone_name": "r18",
17
  "num_heads": 8,
 
 
 
 
 
 
 
 
18
  "dropout": 0.1,
19
  "end2end": false,
20
+ "direct_concat": true,
21
  "separate_view_attention": false,
22
  "separate_all_attention": false,
23
  "act_layer": null,
 
26
  "with_lidar": false,
27
  "with_right_left_sensors": true,
28
  "with_center_sensor": false,
29
+ "traffic_pred_head_type": "det",
30
+ "waypoints_pred_head": "gru",
31
  "reverse_pos": true,
32
+ "use_different_backbone": true,
33
+ "use_view_embed": true,
34
  "use_mmad_pretrain": null,
35
  "auto_map": {
36
  "AutoModel": "modeling_interfuser.InterfuserForHuggingFace"
modeling_interfuser.py CHANGED
@@ -1,6 +1,5 @@
1
 
2
  # -*- coding: utf-8 -*-
3
- # This is a self-contained file for the Interfuser model.
4
  import torch, math, copy
5
  from torch import nn, Tensor
6
  from functools import partial
@@ -14,12 +13,18 @@ try:
14
  except ImportError:
15
  raise ImportError("This model requires timm. Please install with 'pip install timm==0.4.12' or a compatible version.")
16
 
 
 
 
 
17
  def _get_clones(module, N): return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
 
18
  def build_attn_mask(mask_type):
19
  mask=torch.ones((151,151),dtype=torch.bool).cuda()
20
  if mask_type=="seperate_all":mask[:50,:50]=False;mask[50:67,50:67]=False;mask[67:84,67:84]=False;mask[84:101,84:101]=False;mask[101:151,101:151]=False
21
  elif mask_type=="seperate_view":mask[:50,:50]=False;mask[50:67,50:67]=False;mask[67:84,67:84]=False;mask[84:101,84:101]=False;mask[101:151,:]=False;mask[:,101:151]=False
22
  return mask
 
23
  class HybridEmbed(nn.Module):
24
  def __init__(self,backbone,img_size=224,patch_size=1,feature_size=None,in_chans=3,embed_dim=768):
25
  super().__init__();assert isinstance(backbone,nn.Module);img_size=to_2tuple(img_size);patch_size=to_2tuple(patch_size);self.img_size=img_size;self.patch_size=patch_size;self.backbone=backbone
@@ -39,6 +44,7 @@ class HybridEmbed(nn.Module):
39
  x=self.backbone(x)
40
  if isinstance(x,(list,tuple)):x=x[-1]
41
  x=self.proj(x);global_x=torch.mean(x,[2,3],keepdim=False)[:,:,None];return x,global_x
 
42
  class PositionEmbeddingSine(nn.Module):
43
  def __init__(self,num_pos_feats=64,temperature=10000,normalize=False,scale=None):
44
  super().__init__();self.num_pos_feats=num_pos_feats;self.temperature=temperature;self.normalize=normalize
@@ -50,6 +56,7 @@ class PositionEmbeddingSine(nn.Module):
50
  if self.normalize:eps=1e-6;y_embed=y_embed/(y_embed[:,-1:,:]+eps)*self.scale;x_embed=x_embed/(x_embed[:,:,-1:]+eps)*self.scale
51
  dim_t=torch.arange(self.num_pos_feats,dtype=torch.float32,device=x.device);dim_t=self.temperature**(2*(dim_t//2)/self.num_pos_feats);pos_x=x_embed[:,:,:,None]/dim_t;pos_y=y_embed[:,:,:,None]/dim_t
52
  pos_x=torch.stack((pos_x[:,:,:,0::2].sin(),pos_x[:,:,:,1::2].cos()),dim=4).flatten(3);pos_y=torch.stack((pos_y[:,:,:,0::2].sin(),pos_y[:,:,:,1::2].cos()),dim=4).flatten(3);return torch.cat((pos_y,pos_x),dim=3).permute(0,3,1,2)
 
53
  class TransformerEncoder(nn.Module):
54
  def __init__(self,encoder_layer,num_layers,norm=None):super().__init__();self.layers=_get_clones(encoder_layer,num_layers);self.num_layers=num_layers;self.norm=norm
55
  def forward(self,src,mask:Optional[Tensor]=None,src_key_padding_mask:Optional[Tensor]=None,pos:Optional[Tensor]=None):
@@ -57,21 +64,25 @@ class TransformerEncoder(nn.Module):
57
  for layer in self.layers:output=layer(output,src_mask=mask,src_key_padding_mask=src_key_padding_mask,pos=pos)
58
  if self.norm is not None:output=self.norm(output)
59
  return output
 
60
  class GRUWaypointsPredictor(nn.Module):
61
  def __init__(self,input_dim,waypoints=10,**kwargs):super().__init__();self.gru=torch.nn.GRU(input_size=input_dim,hidden_size=64,batch_first=True);self.encoder=nn.Linear(2,64);self.decoder=nn.Linear(64,2);self.waypoints=waypoints
62
  def forward(self,x,target_point):bs=x.shape[0];z=self.encoder(target_point).unsqueeze(0);output,_=self.gru(x,z);output=self.decoder(output.reshape(bs*self.waypoints,-1)).reshape(bs,self.waypoints,2);return torch.cumsum(output,1)
 
63
  class TransformerEncoderLayer(nn.Module):
64
  def __init__(self,d_model,nhead,dim_feedforward=2048,dropout=0.1,activation=nn.ReLU(),normalize_before=False,**kwargs):
65
  super().__init__();self.self_attn=nn.MultiheadAttention(d_model,nhead,dropout=dropout);self.linear1=nn.Linear(d_model,dim_feedforward);self.dropout=nn.Dropout(dropout);self.linear2=nn.Linear(dim_feedforward,d_model);self.norm1=nn.LayerNorm(d_model);self.norm2=nn.LayerNorm(d_model);self.dropout1=nn.Dropout(dropout);self.dropout2=nn.Dropout(dropout);self.activation=activation();self.normalize_before=normalize_before
66
  def with_pos_embed(self,tensor,pos:Optional[Tensor]):return tensor if pos is None else tensor+pos
67
  def forward(self,src,src_mask:Optional[Tensor]=None,src_key_padding_mask:Optional[Tensor]=None,pos:Optional[Tensor]=None):
68
  q=k=self.with_pos_embed(src,pos);src2=self.self_attn(q,k,value=src,attn_mask=src_mask,key_padding_mask=src_key_padding_mask)[0];src=src+self.dropout1(src2);src=self.norm1(src);src2=self.linear2(self.dropout(self.activation(self.linear1(src))));src=src+self.dropout2(src2);src=self.norm2(src);return src
 
69
  class TransformerDecoderLayer(nn.Module):
70
  def __init__(self,d_model,nhead,dim_feedforward=2048,dropout=0.1,activation=nn.ReLU(),normalize_before=False,**kwargs):
71
  super().__init__();self.self_attn=nn.MultiheadAttention(d_model,nhead,dropout=dropout);self.multihead_attn=nn.MultiheadAttention(d_model,nhead,dropout=dropout);self.linear1=nn.Linear(d_model,dim_feedforward);self.dropout=nn.Dropout(dropout);self.linear2=nn.Linear(dim_feedforward,d_model);self.norm1=nn.LayerNorm(d_model);self.norm2=nn.LayerNorm(d_model);self.norm3=nn.LayerNorm(d_model);self.dropout1=nn.Dropout(dropout);self.dropout2=nn.Dropout(dropout);self.dropout3=nn.Dropout(dropout);self.activation=activation();self.normalize_before=normalize_before
72
  def with_pos_embed(self,tensor,pos:Optional[Tensor]):return tensor if pos is None else tensor+pos
73
  def forward(self,tgt,memory,tgt_mask:Optional[Tensor]=None,memory_mask:Optional[Tensor]=None,tgt_key_padding_mask:Optional[Tensor]=None,memory_key_padding_mask:Optional[Tensor]=None,pos:Optional[Tensor]=None,query_pos:Optional[Tensor]=None):
74
  q=k=self.with_pos_embed(tgt,query_pos);tgt2=self.self_attn(q,k,value=tgt,attn_mask=tgt_mask,key_padding_mask=tgt_key_padding_mask)[0];tgt=tgt+self.dropout1(tgt2);tgt=self.norm1(tgt);tgt2=self.multihead_attn(query=self.with_pos_embed(tgt,query_pos),key=self.with_pos_embed(memory,pos),value=memory,attn_mask=memory_mask,key_padding_mask=memory_key_padding_mask)[0];tgt=tgt+self.dropout2(tgt2);tgt=self.norm2(tgt);tgt2=self.linear2(self.dropout(self.activation(self.linear1(tgt))));tgt=tgt+self.dropout3(tgt2);tgt=self.norm3(tgt);return tgt
 
75
  class TransformerDecoder(nn.Module):
76
  def __init__(self,decoder_layer,num_layers,norm=None,return_intermediate=False,**kwargs):
77
  super().__init__();self.layers=_get_clones(decoder_layer,num_layers);self.num_layers=num_layers;self.norm=norm;self.return_intermediate=return_intermediate
@@ -80,6 +91,10 @@ class TransformerDecoder(nn.Module):
80
  for layer in self.layers:output=layer(output,memory,**kwargs)
81
  if self.norm is not None:output=self.norm(output)
82
  return output.unsqueeze(0)
 
 
 
 
83
  class Interfuser(nn.Module):
84
  def __init__(self,img_size=224,multi_view_img_size=112,patch_size=8,in_chans=3,embed_dim=768,enc_depth=6,dec_depth=6,dim_feedforward=2048,normalize_before=False,rgb_backbone_name="r26",lidar_backbone_name="r26",num_heads=8,norm_layer=None,dropout=0.1,end2end=False,direct_concat=True,separate_view_attention=False,separate_all_attention=False,act_layer=None,weight_init="",freeze_num=-1,with_lidar=False,with_right_left_sensors=True,with_center_sensor=False,traffic_pred_head_type="det",waypoints_pred_head="heatmap",reverse_pos=True,use_different_backbone=False,use_view_embed=True,use_mmad_pretrain=None):
85
  super().__init__();self.traffic_pred_head_type=traffic_pred_head_type;self.num_features=self.embed_dim=embed_dim;norm_layer=norm_layer or partial(nn.LayerNorm,eps=1e-6);act_layer=act_layer or nn.GELU;self.reverse_pos=reverse_pos;self.waypoints_pred_head=waypoints_pred_head;self.with_lidar=with_lidar;self.with_right_left_sensors=with_right_left_sensors;self.with_center_sensor=with_center_sensor;self.direct_concat=direct_concat;self.separate_view_attention=separate_view_attention;self.separate_all_attention=separate_all_attention;self.end2end=end2end;self.use_view_embed=use_view_embed
@@ -101,13 +116,10 @@ class Interfuser(nn.Module):
101
  else:front_image_token=front_image_token+self.position_encoding(front_image_token)
102
  front_image_token=front_image_token.flatten(2).permute(2,0,1);front_image_token_global=(front_image_token_global+self.view_embed[:,:,0,:]+self.global_embed[:,:,0:1]).permute(2,0,1);features.extend([front_image_token,front_image_token_global])
103
  if self.with_right_left_sensors:
104
- left_image_token,left_image_token_global=self.rgb_patch_embed(left_image)
105
- if self.use_view_embed:left_image_token=left_image_token+self.view_embed[:,:,1:2,:]+self.position_encoding(left_image_token)
106
- else:left_image_token=left_image_token+self.position_encoding(left_image_token)
107
  left_image_token=left_image_token.flatten(2).permute(2,0,1);left_image_token_global=(left_image_token_global+self.view_embed[:,:,1,:]+self.global_embed[:,:,1:2]).permute(2,0,1)
108
- right_image_token,right_image_token_global=self.rgb_patch_embed(right_image)
109
- if self.use_view_embed:right_image_token=right_image_token+self.view_embed[:,:,2:3,:]+self.position_encoding(right_image_token)
110
- else:right_image_token=right_image_token+self.position_encoding(right_image_token)
111
  right_image_token=right_image_token.flatten(2).permute(2,0,1);right_image_token_global=(right_image_token_global+self.view_embed[:,:,2,:]+self.global_embed[:,:,2:3]).permute(2,0,1)
112
  features.extend([left_image_token,left_image_token_global,right_image_token,right_image_token_global])
113
  return torch.cat(features,0)
@@ -124,6 +136,9 @@ class Interfuser(nn.Module):
124
  velocity=measurements[:,6:7].unsqueeze(-1).repeat(1,400,32);traffic_feature_with_vel=torch.cat([hs[:,:400],velocity],dim=2);traffic=self.traffic_pred_head(traffic_feature_with_vel)
125
  return traffic,waypoints,is_junction,traffic_light_state,stop_sign,hs[:,:400]
126
 
 
 
 
127
  class InterfuserConfig(PretrainedConfig):
128
  model_type="interfuser"
129
  def __init__(self, **kwargs):
@@ -135,9 +150,11 @@ class InterfuserForHuggingFace(PreTrainedModel):
135
  config_class = InterfuserConfig
136
  def __init__(self, config: InterfuserConfig):
137
  super().__init__(config)
 
138
  init_args = config.to_dict()
139
- # Remove keys that are not in the original __init__
140
  for key in ["model_type", "architectures", "auto_map", "transformers_version"]: init_args.pop(key, None)
 
141
  self.interfuser = Interfuser(**init_args)
142
  def forward(self, rgb, rgb_left, rgb_right, rgb_center, lidar, measurements, target_point, **kwargs):
143
  inputs_dict = {'rgb':rgb, 'rgb_left':rgb_left, 'rgb_right':rgb_right, 'rgb_center':rgb_center, 'lidar':lidar, 'measurements':measurements, 'target_point':target_point}
 
1
 
2
  # -*- coding: utf-8 -*-
 
3
  import torch, math, copy
4
  from torch import nn, Tensor
5
  from functools import partial
 
13
  except ImportError:
14
  raise ImportError("This model requires timm. Please install with 'pip install timm==0.4.12' or a compatible version.")
15
 
16
+ # =========================================================
17
+ # SECTION 1: ALL HELPER CLASSES (COPIED FROM YOUR NOTEBOOK)
18
+ # =========================================================
19
+
20
  def _get_clones(module, N): return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
21
+
22
  def build_attn_mask(mask_type):
23
  mask=torch.ones((151,151),dtype=torch.bool).cuda()
24
  if mask_type=="seperate_all":mask[:50,:50]=False;mask[50:67,50:67]=False;mask[67:84,67:84]=False;mask[84:101,84:101]=False;mask[101:151,101:151]=False
25
  elif mask_type=="seperate_view":mask[:50,:50]=False;mask[50:67,50:67]=False;mask[67:84,67:84]=False;mask[84:101,84:101]=False;mask[101:151,:]=False;mask[:,101:151]=False
26
  return mask
27
+
28
  class HybridEmbed(nn.Module):
29
  def __init__(self,backbone,img_size=224,patch_size=1,feature_size=None,in_chans=3,embed_dim=768):
30
  super().__init__();assert isinstance(backbone,nn.Module);img_size=to_2tuple(img_size);patch_size=to_2tuple(patch_size);self.img_size=img_size;self.patch_size=patch_size;self.backbone=backbone
 
44
  x=self.backbone(x)
45
  if isinstance(x,(list,tuple)):x=x[-1]
46
  x=self.proj(x);global_x=torch.mean(x,[2,3],keepdim=False)[:,:,None];return x,global_x
47
+
48
  class PositionEmbeddingSine(nn.Module):
49
  def __init__(self,num_pos_feats=64,temperature=10000,normalize=False,scale=None):
50
  super().__init__();self.num_pos_feats=num_pos_feats;self.temperature=temperature;self.normalize=normalize
 
56
  if self.normalize:eps=1e-6;y_embed=y_embed/(y_embed[:,-1:,:]+eps)*self.scale;x_embed=x_embed/(x_embed[:,:,-1:]+eps)*self.scale
57
  dim_t=torch.arange(self.num_pos_feats,dtype=torch.float32,device=x.device);dim_t=self.temperature**(2*(dim_t//2)/self.num_pos_feats);pos_x=x_embed[:,:,:,None]/dim_t;pos_y=y_embed[:,:,:,None]/dim_t
58
  pos_x=torch.stack((pos_x[:,:,:,0::2].sin(),pos_x[:,:,:,1::2].cos()),dim=4).flatten(3);pos_y=torch.stack((pos_y[:,:,:,0::2].sin(),pos_y[:,:,:,1::2].cos()),dim=4).flatten(3);return torch.cat((pos_y,pos_x),dim=3).permute(0,3,1,2)
59
+
60
  class TransformerEncoder(nn.Module):
61
  def __init__(self,encoder_layer,num_layers,norm=None):super().__init__();self.layers=_get_clones(encoder_layer,num_layers);self.num_layers=num_layers;self.norm=norm
62
  def forward(self,src,mask:Optional[Tensor]=None,src_key_padding_mask:Optional[Tensor]=None,pos:Optional[Tensor]=None):
 
64
  for layer in self.layers:output=layer(output,src_mask=mask,src_key_padding_mask=src_key_padding_mask,pos=pos)
65
  if self.norm is not None:output=self.norm(output)
66
  return output
67
+
68
  class GRUWaypointsPredictor(nn.Module):
69
  def __init__(self,input_dim,waypoints=10,**kwargs):super().__init__();self.gru=torch.nn.GRU(input_size=input_dim,hidden_size=64,batch_first=True);self.encoder=nn.Linear(2,64);self.decoder=nn.Linear(64,2);self.waypoints=waypoints
70
  def forward(self,x,target_point):bs=x.shape[0];z=self.encoder(target_point).unsqueeze(0);output,_=self.gru(x,z);output=self.decoder(output.reshape(bs*self.waypoints,-1)).reshape(bs,self.waypoints,2);return torch.cumsum(output,1)
71
+
72
  class TransformerEncoderLayer(nn.Module):
73
  def __init__(self,d_model,nhead,dim_feedforward=2048,dropout=0.1,activation=nn.ReLU(),normalize_before=False,**kwargs):
74
  super().__init__();self.self_attn=nn.MultiheadAttention(d_model,nhead,dropout=dropout);self.linear1=nn.Linear(d_model,dim_feedforward);self.dropout=nn.Dropout(dropout);self.linear2=nn.Linear(dim_feedforward,d_model);self.norm1=nn.LayerNorm(d_model);self.norm2=nn.LayerNorm(d_model);self.dropout1=nn.Dropout(dropout);self.dropout2=nn.Dropout(dropout);self.activation=activation();self.normalize_before=normalize_before
75
  def with_pos_embed(self,tensor,pos:Optional[Tensor]):return tensor if pos is None else tensor+pos
76
  def forward(self,src,src_mask:Optional[Tensor]=None,src_key_padding_mask:Optional[Tensor]=None,pos:Optional[Tensor]=None):
77
  q=k=self.with_pos_embed(src,pos);src2=self.self_attn(q,k,value=src,attn_mask=src_mask,key_padding_mask=src_key_padding_mask)[0];src=src+self.dropout1(src2);src=self.norm1(src);src2=self.linear2(self.dropout(self.activation(self.linear1(src))));src=src+self.dropout2(src2);src=self.norm2(src);return src
78
+
79
  class TransformerDecoderLayer(nn.Module):
80
  def __init__(self,d_model,nhead,dim_feedforward=2048,dropout=0.1,activation=nn.ReLU(),normalize_before=False,**kwargs):
81
  super().__init__();self.self_attn=nn.MultiheadAttention(d_model,nhead,dropout=dropout);self.multihead_attn=nn.MultiheadAttention(d_model,nhead,dropout=dropout);self.linear1=nn.Linear(d_model,dim_feedforward);self.dropout=nn.Dropout(dropout);self.linear2=nn.Linear(dim_feedforward,d_model);self.norm1=nn.LayerNorm(d_model);self.norm2=nn.LayerNorm(d_model);self.norm3=nn.LayerNorm(d_model);self.dropout1=nn.Dropout(dropout);self.dropout2=nn.Dropout(dropout);self.dropout3=nn.Dropout(dropout);self.activation=activation();self.normalize_before=normalize_before
82
  def with_pos_embed(self,tensor,pos:Optional[Tensor]):return tensor if pos is None else tensor+pos
83
  def forward(self,tgt,memory,tgt_mask:Optional[Tensor]=None,memory_mask:Optional[Tensor]=None,tgt_key_padding_mask:Optional[Tensor]=None,memory_key_padding_mask:Optional[Tensor]=None,pos:Optional[Tensor]=None,query_pos:Optional[Tensor]=None):
84
  q=k=self.with_pos_embed(tgt,query_pos);tgt2=self.self_attn(q,k,value=tgt,attn_mask=tgt_mask,key_padding_mask=tgt_key_padding_mask)[0];tgt=tgt+self.dropout1(tgt2);tgt=self.norm1(tgt);tgt2=self.multihead_attn(query=self.with_pos_embed(tgt,query_pos),key=self.with_pos_embed(memory,pos),value=memory,attn_mask=memory_mask,key_padding_mask=memory_key_padding_mask)[0];tgt=tgt+self.dropout2(tgt2);tgt=self.norm2(tgt);tgt2=self.linear2(self.dropout(self.activation(self.linear1(tgt))));tgt=tgt+self.dropout3(tgt2);tgt=self.norm3(tgt);return tgt
85
+
86
  class TransformerDecoder(nn.Module):
87
  def __init__(self,decoder_layer,num_layers,norm=None,return_intermediate=False,**kwargs):
88
  super().__init__();self.layers=_get_clones(decoder_layer,num_layers);self.num_layers=num_layers;self.norm=norm;self.return_intermediate=return_intermediate
 
91
  for layer in self.layers:output=layer(output,memory,**kwargs)
92
  if self.norm is not None:output=self.norm(output)
93
  return output.unsqueeze(0)
94
+
95
+ # =========================================================
96
+ # SECTION 2: THE ORIGINAL INTERFUSER MODEL (UNMODIFIED)
97
+ # =========================================================
98
  class Interfuser(nn.Module):
99
  def __init__(self,img_size=224,multi_view_img_size=112,patch_size=8,in_chans=3,embed_dim=768,enc_depth=6,dec_depth=6,dim_feedforward=2048,normalize_before=False,rgb_backbone_name="r26",lidar_backbone_name="r26",num_heads=8,norm_layer=None,dropout=0.1,end2end=False,direct_concat=True,separate_view_attention=False,separate_all_attention=False,act_layer=None,weight_init="",freeze_num=-1,with_lidar=False,with_right_left_sensors=True,with_center_sensor=False,traffic_pred_head_type="det",waypoints_pred_head="heatmap",reverse_pos=True,use_different_backbone=False,use_view_embed=True,use_mmad_pretrain=None):
100
  super().__init__();self.traffic_pred_head_type=traffic_pred_head_type;self.num_features=self.embed_dim=embed_dim;norm_layer=norm_layer or partial(nn.LayerNorm,eps=1e-6);act_layer=act_layer or nn.GELU;self.reverse_pos=reverse_pos;self.waypoints_pred_head=waypoints_pred_head;self.with_lidar=with_lidar;self.with_right_left_sensors=with_right_left_sensors;self.with_center_sensor=with_center_sensor;self.direct_concat=direct_concat;self.separate_view_attention=separate_view_attention;self.separate_all_attention=separate_all_attention;self.end2end=end2end;self.use_view_embed=use_view_embed
 
116
  else:front_image_token=front_image_token+self.position_encoding(front_image_token)
117
  front_image_token=front_image_token.flatten(2).permute(2,0,1);front_image_token_global=(front_image_token_global+self.view_embed[:,:,0,:]+self.global_embed[:,:,0:1]).permute(2,0,1);features.extend([front_image_token,front_image_token_global])
118
  if self.with_right_left_sensors:
119
+ left_image_token,left_image_token_global=self.rgb_patch_embed(left_image);right_image_token,right_image_token_global=self.rgb_patch_embed(right_image)
120
+ if self.use_view_embed:left_image_token=left_image_token+self.view_embed[:,:,1:2,:]+self.position_encoding(left_image_token);right_image_token=right_image_token+self.view_embed[:,:,2:3,:]+self.position_encoding(right_image_token)
121
+ else:left_image_token=left_image_token+self.position_encoding(left_image_token);right_image_token=right_image_token+self.position_encoding(right_image_token)
122
  left_image_token=left_image_token.flatten(2).permute(2,0,1);left_image_token_global=(left_image_token_global+self.view_embed[:,:,1,:]+self.global_embed[:,:,1:2]).permute(2,0,1)
 
 
 
123
  right_image_token=right_image_token.flatten(2).permute(2,0,1);right_image_token_global=(right_image_token_global+self.view_embed[:,:,2,:]+self.global_embed[:,:,2:3]).permute(2,0,1)
124
  features.extend([left_image_token,left_image_token_global,right_image_token,right_image_token_global])
125
  return torch.cat(features,0)
 
136
  velocity=measurements[:,6:7].unsqueeze(-1).repeat(1,400,32);traffic_feature_with_vel=torch.cat([hs[:,:400],velocity],dim=2);traffic=self.traffic_pred_head(traffic_feature_with_vel)
137
  return traffic,waypoints,is_junction,traffic_light_state,stop_sign,hs[:,:400]
138
 
139
+ # =========================================================
140
+ # SECTION 3: HUGGING FACE WRAPPER CLASSES
141
+ # =========================================================
142
  class InterfuserConfig(PretrainedConfig):
143
  model_type="interfuser"
144
  def __init__(self, **kwargs):
 
150
  config_class = InterfuserConfig
151
  def __init__(self, config: InterfuserConfig):
152
  super().__init__(config)
153
+ # ** الغلاف الذكي: يقرأ كل معامل من config ويمرره إلى Interfuser **
154
  init_args = config.to_dict()
155
+ # نزيل المفاتيح التي لا تخص Interfuser.__init__
156
  for key in ["model_type", "architectures", "auto_map", "transformers_version"]: init_args.pop(key, None)
157
+ # نستدعي النموذج الأصلي بكل معاملاته
158
  self.interfuser = Interfuser(**init_args)
159
  def forward(self, rgb, rgb_left, rgb_right, rgb_center, lidar, measurements, target_point, **kwargs):
160
  inputs_dict = {'rgb':rgb, 'rgb_left':rgb_left, 'rgb_right':rgb_right, 'rgb_center':rgb_center, 'lidar':lidar, 'measurements':measurements, 'target_point':target_point}