asdfasdfdsafdsa committed on
Commit
cc4d3bc
·
verified ·
1 Parent(s): aef03a7

Fix token tensor dimensions - should be [batch, 1, seq_len]

Browse files
Files changed (1) hide show
  1. simple_inference.py +11 -6
simple_inference.py CHANGED
@@ -46,12 +46,14 @@ def simple_process_input(image, text_input, model, src_lang, tgt_lang, cfg):
46
  sect_tag_indices = [1] * text_len # Default to [PROB]
47
  class_tag_indices = [1] * text_len # Default to [GEN]
48
 
49
- # MLM pretrain expects token to be [batch, seq_len, num_tokens_per_word]
50
- # For single word tokens, num_tokens_per_word = 1
51
- # So we need to add an extra dimension
 
52
  if cfg.use_MLM_pretrain:
53
- # Reshape token tensor to [batch, seq_len, 1] then expand to match expected format
54
- token_tensor_3d = token_tensor.unsqueeze(-1) # [batch, seq_len, 1]
 
55
 
56
  text_dict = {
57
  'token': token_tensor_3d,
@@ -60,8 +62,11 @@ def simple_process_input(image, text_input, model, src_lang, tgt_lang, cfg):
60
  'len': torch.LongTensor([text_len]).to(device)
61
  }
62
  else:
 
 
 
63
  text_dict = {
64
- 'token': token_tensor,
65
  'sect_tag': torch.LongTensor([sect_tag_indices]).to(device),
66
  'class_tag': torch.LongTensor([class_tag_indices]).to(device),
67
  'len': torch.LongTensor([text_len]).to(device)
 
46
  sect_tag_indices = [1] * text_len # Default to [PROB]
47
  class_tag_indices = [1] * text_len # Default to [GEN]
48
 
49
+ # The model expects token to be [batch, num_subwords_per_token, seq_len]
50
+ # For simple case, we have 1 subword per token, so shape is [batch, 1, seq_len]
51
+ # This gets embedded and summed over dim=1 to get [batch, seq_len, embed_dim]
52
+
53
  if cfg.use_MLM_pretrain:
54
+ # Create 3D tensor: [batch_size, 1, text_len]
55
+ # Each token is a single subword, so middle dimension is 1
56
+ token_tensor_3d = token_tensor.unsqueeze(1) # [batch, 1, seq_len]
57
 
58
  text_dict = {
59
  'token': token_tensor_3d,
 
62
  'len': torch.LongTensor([text_len]).to(device)
63
  }
64
  else:
65
+ # Non-MLM path also needs 3D tensor for consistency
66
+ token_tensor_3d = token_tensor.unsqueeze(1) # [batch, 1, seq_len]
67
+
68
  text_dict = {
69
+ 'token': token_tensor_3d,
70
  'sect_tag': torch.LongTensor([sect_tag_indices]).to(device),
71
  'class_tag': torch.LongTensor([class_tag_indices]).to(device),
72
  'len': torch.LongTensor([text_len]).to(device)