GenerTeam committed
Commit 29fed02 · verified · 1 Parent(s): 078c762

Update README.md

Files changed (1)
  1. README.md +31 -23
README.md CHANGED
@@ -101,24 +101,27 @@ print(decoded_sequences)
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
-# Load the tokenizer and model.
+# Load the tokenizer and model
 tokenizer = AutoTokenizer.from_pretrained("GENERator-v2-prokaryote-1.2b-base", trust_remote_code=True)
 model = AutoModelForCausalLM.from_pretrained("GENERator-v2-prokaryote-1.2b-base")
 
+# Get model configuration
 config = model.config
 max_length = config.max_position_embeddings
 
-# Define input sequences.
+# Define input sequences
 sequences = [
     "ATGAGGTGGCAAGAAATGGGCTAC",
     "GAATTCCATGAGGCTATAGAATAATCTAAGAGAAAT"
 ]
 
-# Tokenize the sequences with add_special_tokens=True to automatically add special tokens,
-# such as the BOS and EOS tokens, at the appropriate positions.
+# Truncate each sequence to the nearest multiple of 6 and prepend the BOS token
+processed_sequences = [tokenizer.bos_token + seq[:len(seq)//6*6] for seq in sequences]
+
+# Tokenization
 tokenizer.padding_side = "right"
 inputs = tokenizer(
-    sequences,
+    processed_sequences,
     add_special_tokens=True,
     return_tensors="pt",
     padding=True,
@@ -126,29 +129,34 @@ inputs = tokenizer(
     max_length=max_length
 )
 
-# Perform a forward pass through the model to obtain the outputs, including hidden states.
+# Model Inference
 with torch.inference_mode():
     outputs = model(**inputs, output_hidden_states=True)
 
-# Retrieve the hidden states from the last layer.
-hidden_states = outputs.hidden_states[-1]  # Shape: (batch_size, sequence_length, hidden_size)
-
-# Use the attention_mask to determine the index of the last token in each sequence.
-# Since add_special_tokens=True is used, the last token is typically the EOS token.
+hidden_states = outputs.hidden_states[-1]
 attention_mask = inputs["attention_mask"]
-last_token_indices = attention_mask.sum(dim=1) - 1  # Index of the last token for each sequence
-
-# Extract the embedding corresponding to the EOS token for each sequence.
-seq_embeddings = []
-for i, token_index in enumerate(last_token_indices):
-    # Fetch the embedding for the last token (EOS token).
-    seq_embedding = hidden_states[i, token_index, :]
-    seq_embeddings.append(seq_embedding)
-
-# Stack the embeddings into a tensor with shape (batch_size, hidden_size)
-seq_embeddings = torch.stack(seq_embeddings)
 
-print("Sequence Embeddings:", seq_embeddings)
+# Option 1: Last token (EOS) embedding
+last_token_indices = attention_mask.sum(dim=1) - 1
+eos_embeddings = hidden_states[torch.arange(hidden_states.size(0)), last_token_indices, :]
+
+# Option 2: Mean pooling over all tokens
+expanded_mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).to(torch.float32)
+sum_embeddings = torch.sum(hidden_states * expanded_mask, dim=1)
+mean_embeddings = sum_embeddings / expanded_mask.sum(dim=1)
+
+# Output
+print("EOS (Last Token) Embeddings:", eos_embeddings)
+print("Mean Pooling Embeddings:", mean_embeddings)
+
+# ============================================================================
+# Additional notes:
+# - The preprocessing step keeps sequence lengths at multiples of 6 for the 6-mer tokenizer
+# - For a causal LM, the last-token (EOS) embedding is commonly used
+# - Mean pooling considers all tokens, including BOS and content tokens
+# - The choice depends on your downstream task requirements
+# - Both methods handle variable sequence lengths via the attention mask
+# ============================================================================
 
 ```
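The functional change in the first hunk is the new preprocessing line. Below is a minimal sanity check of what it does, with a placeholder "<BOS>" string standing in for the actual tokenizer.bos_token (whose value is not shown in this commit); the helper name preprocess is hypothetical.

```python
# Placeholder for tokenizer.bos_token; the real value is model-specific and
# is not reproduced in this commit.
BOS = "<BOS>"

def preprocess(seq: str) -> str:
    # Drop trailing bases so the length is a multiple of 6 (whole 6-mers),
    # then prepend the BOS marker, mirroring the list comprehension in the diff.
    return BOS + seq[: len(seq) // 6 * 6]

# Both README sequences are already multiples of 6 (24 and 36 bases), so only
# BOS is prepended; a 25-base sequence loses its trailing base.
assert preprocess("ATGAGGTGGCAAGAAATGGGCTAC") == BOS + "ATGAGGTGGCAAGAAATGGGCTAC"
assert preprocess("ATGAGGTGGCAAGAAATGGGCTACG") == BOS + "ATGAGGTGGCAAGAAATGGGCTAC"
```

Truncating the tail (rather than padding it out) keeps the input divisible into whole 6-mer tokens, which is what the commit's added notes say the tokenizer expects.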
 
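The second hunk replaces the per-sequence Python loop with vectorized indexing and adds mean pooling as a second option. The sketch below exercises both strategies on dummy tensors; the random values merely stand in for GENERator hidden states, and the shapes are illustrative.

```python
import torch

torch.manual_seed(0)
# Batch of 2 sequences, padded length 5, hidden size 4; the second sequence
# has two right-padding positions, matching padding_side="right" above.
hidden_states = torch.randn(2, 5, 4)
attention_mask = torch.tensor([[1, 1, 1, 1, 1],
                               [1, 1, 1, 0, 0]])

# Option 1: last non-padding token, i.e. the EOS position under right padding.
last_token_indices = attention_mask.sum(dim=1) - 1  # tensor([4, 2])
eos_embeddings = hidden_states[torch.arange(hidden_states.size(0)), last_token_indices, :]
assert torch.equal(eos_embeddings[1], hidden_states[1, 2, :])

# Option 2: mean over non-padding tokens only; padded positions contribute
# zero to the numerator and are excluded from the denominator.
expanded_mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).to(torch.float32)
mean_embeddings = torch.sum(hidden_states * expanded_mask, dim=1) / expanded_mask.sum(dim=1)
assert torch.allclose(mean_embeddings[1], hidden_states[1, :3].mean(dim=0))

print(eos_embeddings.shape, mean_embeddings.shape)  # both torch.Size([2, 4])
```

In both cases the attention mask, not the padded length, drives the computation, which is the "variable sequence lengths" property the commit's added notes mention.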