Tu2003716
/

COCOM_disabled_flash_attn

@@ -1,4 +1,4 @@
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, PreTrainedModel, PretrainedConfig, AutoModel,LongformerForCausalLM, LongformerTokenizer, LongformerConfig
 import torch
 import math
 from peft import get_peft_model, LoraConfig, TaskType
@@ -71,8 +71,7 @@ class COCOMConfig(PretrainedConfig):
                 lora = False,
                 training_form="both",
                 lora_r=16,
-                attn_implementation="longformer",
-                attention_window=512,
                 device_map = "cuda",
                  **kwargs):
         super().__init__(**kwargs)
@@ -96,28 +95,6 @@ class COCOM(PreTrainedModel):
         super().__init__(cfg)
         # define models
         attn_impl = cfg.attn_implementation
-        if cfg.attn_implementation == "longformer":
-            # Initialize Longformer
-            longformer_config = LongformerConfig.from_pretrained(cfg.decoder_model_name)
-            longformer_config.attention_window = 512  # Modify based on context window size
-            self.decoder = LongformerForCausalLM.from_pretrained(
-                cfg.decoder_model_name,
-                config=longformer_config,
-                torch_dtype=torch.float16,
-                low_cpu_mem_usage=True,
-                device_map=cfg.device_map
-            )
-        else:
-            # Original decoder initialization
-            self.decoder = AutoModelForCausalLM.from_pretrained(
-                cfg.decoder_model_name,
-                torch_dtype=torch.float16,
-                attn_implementation=attn_impl,
-                low_cpu_mem_usage=True,
-                device_map=cfg.device_map
-            )
         # model could be loaded in three quantization modes: no, int4, int8
         if cfg.quantization == "no":
             self.decoder = AutoModelForCausalLM.from_pretrained(
@@ -216,20 +193,15 @@ class COCOM(PreTrainedModel):
         self.compr_rate = cfg.compr_rate
         self.local_rank = os.getenv('LOCAL_RANK', '0')
-    def compress_and_replace_emb(self, enc_input_ids, enc_attention_mask, dec_input_ids, dec_attention_mask):
         indices = range(0, enc_input_ids.size(0) + 1, self.generation_top_k)
-        # Perform compression
         if self.compr:
             compressed_embs = self.compr(enc_input_ids, enc_attention_mask)
         else:
             compressed_embs = self.compr_decoder(enc_input_ids, enc_attention_mask)
-        # Replace embeddings with compressed ones
-        input_embeds = self.replace_embeddings(compressed_embs, dec_input_ids, dec_attention_mask, indices)
         return input_embeds
     def compr_decoder(self, input_ids, attention_mask):
         emb = self.decoder(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True).hidden_states[-1]
@@ -240,23 +212,19 @@ class COCOM(PreTrainedModel):
     def replace_embeddings(self, compressed_embs, dec_input_ids, indices):
         # Embed the decoder input
         inputs_embeds = self.decoder.get_input_embeddings()(dec_input_ids)
-        # Number of compressed embeddings
         num_embs = compressed_embs.size(1)
-        # Define slot length for memory tokens
-        slot_len = num_embs + 1 if self.sep else num_embs
-        # Find the first memory token indices
         first_mem_token_indices = torch.argmax((dec_input_ids == self.decoder_tokenizer.mem_token_id).int(), dim=1)
         batch_size = inputs_embeds.size(0)
-        # Replace memory tokens with compressed embeddings
         for i in range(batch_size):
             for j in range(indices[i], indices[i + 1]):
-                start_idx = first_mem_token_indices[i].item() + (j - indices[i]) * slot_len
                 inputs_embeds[i, start_idx:start_idx + num_embs, :] = compressed_embs[j]
         return inputs_embeds
@@ -267,13 +235,19 @@ class COCOM(PreTrainedModel):
             dec_attention_mask: torch.LongTensor = None,
             labels: torch.LongTensor = None):
-        inputs_embeds = self.compress_and_replace_emb(enc_input_ids, enc_attention_mask, dec_input_ids, dec_attention_mask)
-        # Detach inputs_embeds if training compressor only
         if (self.training_form == "compressor") and (self.compr is None):
-            inputs_embeds = inputs_embeds.detach()
-        # Pass through the decoder
         decoder_outputs = self.decoder(inputs_embeds=inputs_embeds, attention_mask=dec_attention_mask, labels=labels)
         return {"loss": decoder_outputs.loss, "logits": decoder_outputs.logits}
@@ -289,7 +263,7 @@ class COCOM(PreTrainedModel):
             attention_mask=dec_attention_mask.to(device),
             do_sample=False,
             top_p=None,
-            max_new_tokens=min(max_new_tokens, 4096)
             )
         decoded = self.decoder_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
         return decoded

+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, PreTrainedModel, PretrainedConfig, AutoModel
 import torch
 import math
 from peft import get_peft_model, LoraConfig, TaskType
                 lora = False,
                 training_form="both",
                 lora_r=16,
+                attn_implementation="eager",
                 device_map = "cuda",
                  **kwargs):
         super().__init__(**kwargs)
         super().__init__(cfg)
         # define models
         attn_impl = cfg.attn_implementation
         # model could be loaded in three quantization modes: no, int4, int8
         if cfg.quantization == "no":
             self.decoder = AutoModelForCausalLM.from_pretrained(
         self.compr_rate = cfg.compr_rate
         self.local_rank = os.getenv('LOCAL_RANK', '0')
+    def compress_and_replace_emb(self, enc_input_ids, enc_attention_mask, dec_input_ids):
         indices = range(0, enc_input_ids.size(0) + 1, self.generation_top_k)
         if self.compr:
             compressed_embs = self.compr(enc_input_ids, enc_attention_mask)
+            input_embeds = self.replace_embeddings(compressed_embs, dec_input_ids, indices)
         else:
             compressed_embs = self.compr_decoder(enc_input_ids, enc_attention_mask)
+            input_embeds = self.replace_embeddings(compressed_embs, dec_input_ids, indices)
         return input_embeds
     def compr_decoder(self, input_ids, attention_mask):
         emb = self.decoder(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True).hidden_states[-1]
     def replace_embeddings(self, compressed_embs, dec_input_ids, indices):
         # Embed the decoder input
         inputs_embeds = self.decoder.get_input_embeddings()(dec_input_ids)
         num_embs = compressed_embs.size(1)
+        if self.sep:
+            slot_len = num_embs + 1
+        else:
+            slot_len = num_embs
+        # get first mem_token indices
         first_mem_token_indices = torch.argmax((dec_input_ids == self.decoder_tokenizer.mem_token_id).int(), dim=1)
         batch_size = inputs_embeds.size(0)
+        # for each example in batch, replace them with compressed embeddings
         for i in range(batch_size):
             for j in range(indices[i], indices[i + 1]):
+                start_idx = first_mem_token_indices[i].item() + (j-indices[i]) * slot_len
                 inputs_embeds[i, start_idx:start_idx + num_embs, :] = compressed_embs[j]
         return inputs_embeds
             dec_attention_mask: torch.LongTensor = None,
             labels: torch.LongTensor = None):
+        # enc_input_ids: stores the contexts, should be flattened from all queries before input, dimension (batch_size*generation_top_k, token_length)
+        # enc_attention_mask: attention mask of enc_input_ids
+        # dec_input_ids: stores the prompts (including mem tokens), dimension (batch_size, token_length)
+        # dec_attention_mask: attention mask of dec_input_ids
+        # Perform compression with gradient tracking
+        inputs_embeds = self.compress_and_replace_emb(enc_input_ids, enc_attention_mask, dec_input_ids)
+        # if training_form is compressor, then detach the inputs_embeds, to make gradient not count in decoder
         if (self.training_form == "compressor") and (self.compr is None):
+            inputs_embeds  = inputs_embeds.detach()
+        # decoding
         decoder_outputs = self.decoder(inputs_embeds=inputs_embeds, attention_mask=dec_attention_mask, labels=labels)
         return {"loss": decoder_outputs.loss, "logits": decoder_outputs.logits}
             attention_mask=dec_attention_mask.to(device),
             do_sample=False,
             top_p=None,
+            max_new_tokens=max_new_tokens
             )
         decoded = self.decoder_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
         return decoded