davesalvi
/

ispl_safe

Model card Files and versions

xet

Community

davesalvi committed on Mar 28, 2025

Commit

b1fcba0

1 Parent(s): afd55da

moe transformer

Browse files

Files changed (3) hide show

.idea/workspace.xml +2 -1
script.py +2 -1
src/moe_model.py +28 -66

.idea/workspace.xml CHANGED Viewed

@@ -6,6 +6,7 @@
   <component name="ChangeListManager">
     <list default="true" id="23565123-73ab-4f40-a9ef-1086e0c9e1ec" name="Changes" comment="">
       <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
       <change beforePath="$PROJECT_DIR$/src/moe_model.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/moe_model.py" afterDir="false" />
     </list>
     <option name="SHOW_DIALOG" value="false" />
@@ -136,7 +137,7 @@
       <workItem from="1743062628099" duration="35000" />
       <workItem from="1743063082652" duration="7000" />
       <workItem from="1743092790258" duration="1395000" />
-      <workItem from="1743151940209" duration="1035000" />
     </task>
     <servers />
   </component>

   <component name="ChangeListManager">
     <list default="true" id="23565123-73ab-4f40-a9ef-1086e0c9e1ec" name="Changes" comment="">
       <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/script.py" beforeDir="false" afterPath="$PROJECT_DIR$/script.py" afterDir="false" />
       <change beforePath="$PROJECT_DIR$/src/moe_model.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/moe_model.py" afterDir="false" />
     </list>
     <option name="SHOW_DIALOG" value="false" />
       <workItem from="1743062628099" duration="35000" />
       <workItem from="1743063082652" duration="7000" />
       <workItem from="1743092790258" duration="1395000" />
+      <workItem from="1743151940209" duration="1772000" />
     </task>
     <servers />
   </component>

script.py CHANGED Viewed

@@ -69,7 +69,8 @@ expert_6 = LCNN(return_emb=True).to(device)
 # model_path = './checkpoints/MOE_ULTIMATE.pth'
 model = MOE_attention(experts=[expert_1, expert_2, expert_3, expert_4, expert_5, expert_6], device=device, freezing=True)
-model_path = './checkpoints/MOE_ATTENTION.pth'
 model = (model).to(device)
 model.load_state_dict(torch.load(model_path, map_location=device))

 # model_path = './checkpoints/MOE_ULTIMATE.pth'
 model = MOE_attention(experts=[expert_1, expert_2, expert_3, expert_4, expert_5, expert_6], device=device, freezing=True)
+# model_path = './checkpoints/MOE_ATTENTION.pth'
+model_path = './checkpoints/MOE_TRANSF.pth'
 model = (model).to(device)
 model.load_state_dict(torch.load(model_path, map_location=device))

src/moe_model.py CHANGED Viewed

@@ -61,61 +61,34 @@ class UltimateMOE(nn.Module):
 class MOE_attention(nn.Module):
     def __init__(self, experts, device, input_dim=128, freezing=False):
         super(MOE_attention, self).__init__()
         self.threshold = 0.4
         self.device = device
         self.experts = nn.ModuleList(experts)
         self.num_experts = len(experts)
-        self.proc_emb_1 = nn.Sequential(
-            nn.Linear(128, 128),
-            nn.BatchNorm1d(128),
-            nn.GLU(),
-            nn.Linear(64, 32)
         )
-        self.proc_emb_2 = nn.Sequential(
-            nn.Linear(128, 128),
-            nn.BatchNorm1d(128),
-            nn.GLU(),
-            nn.Linear(64, 32)
-        )
-        self.proc_emb_3 = nn.Sequential(
-            nn.Linear(128, 128),
-            nn.BatchNorm1d(128),
-            nn.GLU(),
-            nn.Linear(64, 32)
-        )
-        self.proc_emb_4 = nn.Sequential(
-            nn.Linear(128, 128),
-            nn.BatchNorm1d(128),
-            nn.GLU(),
-            nn.Linear(64, 32)
-        )
-        self.proc_emb_5 = nn.Sequential(
-            nn.Linear(128, 128),
-            nn.BatchNorm1d(128),
-            nn.GLU(),
-            nn.Linear(64, 32)
-        )
-        self.proc_emb_6 = nn.Sequential(
-            nn.Linear(128, 128),
-            nn.BatchNorm1d(128),
-            nn.GLU(),
-            nn.Linear(64, 32)
-        )
-        self.MHead_Attn = nn.MultiheadAttention(
-            embed_dim=32, num_heads=4,
-            dropout=0.1, batch_first=True
-        )
-        self.query_proj = nn.Linear(input_dim * self.num_experts, 32)
         self.softmax = nn.Softmax(dim=1)
         if freezing:
@@ -125,31 +98,20 @@ class MOE_attention(nn.Module):
     def forward(self, x):
-        outputs = [expert(x)[0] for expert in self.experts]
-        embeddings = [expert(x)[1] for expert in self.experts]
-        proc_emb_1 = self.proc_emb_1(embeddings[0])
-        proc_emb_2 = self.proc_emb_2(embeddings[1])
-        proc_emb_3 = self.proc_emb_3(embeddings[2])
-        proc_emb_4 = self.proc_emb_4(embeddings[3])
-        proc_emb_5 = self.proc_emb_5(embeddings[4])
-        proc_emb_6 = self.proc_emb_6(embeddings[5])
-        processed_embs = torch.stack([proc_emb_1, proc_emb_2, proc_emb_3, proc_emb_4, proc_emb_5, proc_emb_6], dim=1)
-        query_input = torch.cat(embeddings, dim=1)
-        query = self.query_proj(query_input).unsqueeze(1)
-        attn_output, attn_weights = self.MHead_Attn(
-            query=query,
-            key=processed_embs,
-            value=processed_embs
-        )
-        gating_weights = attn_weights.mean(dim=1).unsqueeze(1)
         expert_outputs = torch.stack(outputs, dim=1)
         combined_output = torch.bmm(gating_weights, expert_outputs).squeeze(1)
-        score = self.softmax(combined_output)
-        return score

 class MOE_attention(nn.Module):
     def __init__(self, experts, device, input_dim=128, freezing=False):
         super(MOE_attention, self).__init__()
         self.threshold = 0.4
         self.device = device
         self.experts = nn.ModuleList(experts)
         self.num_experts = len(experts)
+        self.proc_emb = nn.ModuleList([
+            nn.Sequential(
+                nn.Linear(128, 128),
+                nn.BatchNorm1d(128),
+                nn.GLU(),
+                nn.Linear(64, 32)
+            ) for _ in range(self.num_experts)
+        ])
+        self.TransfEnc = nn.Sequential(
+            nn.TransformerEncoderLayer(d_model=32, nhead=4, dropout=0.1, dim_feedforward=512),
+            nn.TransformerEncoderLayer(d_model=32, nhead=4, dropout=0.1, dim_feedforward=128)
         )
+        self.linear_out = nn.Linear(32, 1)
+        # self.MHead_Attn = nn.MultiheadAttention(
+        #     embed_dim=32, num_heads=4,
+        #     dropout=0.1, batch_first=True
+        # )
+        # self.query_proj = nn.Linear(input_dim * self.num_experts, 32)
         self.softmax = nn.Softmax(dim=1)
         if freezing:
     def forward(self, x):
+        results = [expert(x) for expert in self.experts]
+        outputs = [res[0] for res in results]
+        embeddings = [res[1] for res in results]
+        processed_embs = torch.stack([proc_emb(emb) for proc_emb, emb in zip(self.proc_emb, embeddings)], dim=1)
+        # pdb.set_trace()
+        transf_out = self.TransfEnc(processed_embs)
+        gating_weights = self.linear_out(transf_out).squeeze(2)
+        gating_weights = self.softmax(gating_weights).unsqueeze(1)
         expert_outputs = torch.stack(outputs, dim=1)
         combined_output = torch.bmm(gating_weights, expert_outputs).squeeze(1)
+        return combined_output