Spaces:

asdfasdfdsafdsa
/

pgps-demo

Sleeping

App Files Files Community

asdfasdfdsafdsa committed on Aug 24, 2025

Commit

aef03a7

verified ·

1 Parent(s): bf8c161

Fix tensor dimension mismatch in MLM pretrain path

Browse files

Files changed (1) hide show

simple_inference.py +29 -6

simple_inference.py CHANGED Viewed

@@ -38,14 +38,37 @@ def simple_process_input(image, text_input, model, src_lang, tgt_lang, cfg):
     batch_size = 1
     text_len = len(text_indices)
-    text_dict = {
-        'token': torch.LongTensor([text_indices]).to(device),
-        'sect_tag': torch.ones(batch_size, text_len, dtype=torch.long).to(device),
-        'class_tag': torch.ones(batch_size, text_len, dtype=torch.long).to(device),
-        'len': torch.LongTensor([text_len]).to(device)
-    }
     # Simple var dict (no variables detected)
     var_dict = {
         'pos': torch.zeros(batch_size, 1, dtype=torch.long).to(device),
         'len': torch.zeros(batch_size, dtype=torch.long).to(device),

     batch_size = 1
     text_len = len(text_indices)
+    # Build the base token tensor as 2D: [batch, seq_len]
+    # (when cfg.use_MLM_pretrain is set, a trailing dimension is added below)
+    token_tensor = torch.LongTensor([text_indices]).to(device)
+    # Ensure sect_tag and class_tag match token length
+    sect_tag_indices = [1] * text_len  # Default to [PROB]
+    class_tag_indices = [1] * text_len  # Default to [GEN]
+    # MLM pretrain expects token to be [batch, seq_len, num_tokens_per_word]
+    # For single word tokens, num_tokens_per_word = 1
+    # So we need to add an extra dimension
+    if cfg.use_MLM_pretrain:
+        # Add a trailing dimension so the token tensor becomes [batch, seq_len, 1]
+        token_tensor_3d = token_tensor.unsqueeze(-1)  # [batch, seq_len, 1]
+        text_dict = {
+            'token': token_tensor_3d,
+            'sect_tag': torch.LongTensor([sect_tag_indices]).to(device),
+            'class_tag': torch.LongTensor([class_tag_indices]).to(device),
+            'len': torch.LongTensor([text_len]).to(device)
+        }
+    else:
+        text_dict = {
+            'token': token_tensor,
+            'sect_tag': torch.LongTensor([sect_tag_indices]).to(device),
+            'class_tag': torch.LongTensor([class_tag_indices]).to(device),
+            'len': torch.LongTensor([text_len]).to(device)
+        }
     # Simple var dict (no variables detected)
+    # Note: var positions need to account for the diagram token that will be added
     var_dict = {
         'pos': torch.zeros(batch_size, 1, dtype=torch.long).to(device),
         'len': torch.zeros(batch_size, dtype=torch.long).to(device),