Upload files

Files changed (13) hide show

STR-Bamba_8.bin +3 -0
STR-Bamba_8.pt +3 -0
config.json +62 -0
requirements.txt +12 -0
str_bamba/bamba.py +534 -0
str_bamba/bamba_config.py +28 -0
str_bamba/bamba_modules.py +229 -0
str_bamba/config/config_encoder-decoder_436M.json +62 -0
str_bamba/generation.py +398 -0
str_bamba/load.py +60 -0
str_bamba/tokenizer/special_tokens.py +39 -0
str_bamba/tokenizer/str_bamba_tokenizer.json +0 -0
str_bamba/tokenizer/str_tokenizer.py +101 -0

STR-Bamba_8.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:db6d7a2561bfbaf9bd8a5f910321b2ff21671b6bc47cad955a323898203a9967
+size 1372194320

STR-Bamba_8.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:db6d7a2561bfbaf9bd8a5f910321b2ff21671b6bc47cad955a323898203a9967
+size 1372194320

config.json ADDED Viewed

	@@ -0,0 +1,62 @@

+{
+    "encoder_config": {
+        "d_model": 1024,
+        "d_intermediate": 0,
+        "n_layer": 24,
+        "vocab_size": 5000,
+        "max_position_embeddings": 4096,
+        "ssm_cfg": {
+            "layer": "Mamba2"
+        },
+        "attn_layer_idx": [
+            6,
+            18
+        ],
+        "attn_cfg": {
+            "causal": false,
+            "d_conv": 0,
+            "head_dim": 64,
+            "num_heads": 16,
+            "num_heads_kv": 8,
+            "out_proj_bias": false,
+            "qkv_proj_bias": false,
+            "rotary_emb_dim": 64
+        },
+        "rms_norm": true,
+        "residual_in_fp32": true,
+        "fused_add_norm": true,
+        "pad_vocab_size_multiple": 8,
+        "tie_embeddings": false
+    },
+    "decoder_config": {
+        "d_model": 1024,
+        "d_intermediate": 0,
+        "n_layer": 24,
+        "vocab_size": 5000,
+        "max_position_embeddings": 4096,
+        "ssm_cfg": {
+            "layer": "Mamba2"
+        },
+        "attn_layer_idx": [
+            6,
+            18
+        ],
+        "attn_cfg": {
+            "causal": true,
+            "d_conv": 0,
+            "head_dim": 64,
+            "num_heads": 16,
+            "num_heads_kv": 8,
+            "out_proj_bias": false,
+            "qkv_proj_bias": false,
+            "rotary_emb_dim": 64
+        },
+        "rms_norm": true,
+        "residual_in_fp32": true,
+        "fused_add_norm": true,
+        "pad_vocab_size_multiple": 8,
+        "tie_embeddings": false
+    },
+    "tie_word_embeddings": true,
+    "seed": 0
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+numpy==1.26.4
+pandas==2.2.3
+scikit-learn>=1.6.1
+datasets==3.5.0
+transformers==4.52.1
+tokenizers==0.21.1
+deepspeed==0.16.7
+einops==0.8.1
+tqdm==4.67.1
+torch-optimizer==0.3.0
+rdkit>=2024.3.5
+selfies>=2.2.0

str_bamba/bamba.py ADDED Viewed

	@@ -0,0 +1,534 @@

+from .generation import GenerationMixin
+from mamba_ssm.modules.mamba2 import Mamba2
+from mamba_ssm.modules.mha import MHA
+from mamba_ssm.modules.mlp import GatedMLP
+from mamba_ssm.modules.block import Block
+from mamba_ssm.models.mixer_seq_simple import _init_weights
+from mamba_ssm.utils.hf import load_config_hf, load_state_dict_hf
+from .bamba_modules import BertEmbeddings, BertPooler, BertPreTrainingHeads, BlockCrossAttention
+from .bamba_config import BambaConfig, BambaEncoderDecoderConfig
+try:
+    from mamba_ssm.ops.triton.layer_norm import RMSNorm, layer_norm_fn, rms_norm_fn
+except ImportError:
+    RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None
+from typing import List, Optional, Tuple, Union
+from collections import namedtuple
+import torch.backends.cudnn as cudnn
+import math
+import random
+from functools import partial
+import json
+import os
+import copy
+import torch
+import torch.nn as nn
+import pandas as pd
+import numpy as np
+import gc
+from tqdm import tqdm
+def create_block(
+    d_model,
+    d_intermediate,
+    block_class,
+    ssm_cfg=None,
+    attn_layer_idx=None,
+    attn_cfg=None,
+    norm_epsilon=1e-5,
+    rms_norm=False,
+    residual_in_fp32=False,
+    fused_add_norm=False,
+    layer_idx=None,
+    device=None,
+    dtype=None,
+):
+    if ssm_cfg is None:
+        ssm_cfg = {}
+    if attn_layer_idx is None:
+        attn_layer_idx = []
+    if attn_cfg is None:
+        attn_cfg = {}
+    factory_kwargs = {"device": device, "dtype": dtype}
+    if layer_idx not in attn_layer_idx:
+        # Create a copy of the config to modify
+        ssm_cfg = copy.deepcopy(ssm_cfg) if ssm_cfg is not None else {}
+        ssm_layer = ssm_cfg.pop("layer", "Mamba1")
+        if ssm_layer not in ["Mamba1", "Mamba2"]:
+            raise ValueError(f"Invalid ssm_layer: {ssm_layer}, only support Mamba1 and Mamba2")
+        mixer_cls = partial(
+            Mamba2 if ssm_layer == "Mamba2" else Mamba,
+            layer_idx=layer_idx,
+            **ssm_cfg,
+            **factory_kwargs
+        )
+    else:
+        mixer_cls = partial(MHA, layer_idx=layer_idx, **attn_cfg, **factory_kwargs)
+    norm_cls = partial(
+        nn.LayerNorm if not rms_norm else RMSNorm, eps=norm_epsilon, **factory_kwargs
+    )
+    if d_intermediate == 0:
+        mlp_cls = nn.Identity
+    else:
+        mlp_cls = partial(
+            GatedMLP, hidden_features=d_intermediate, out_features=d_model, **factory_kwargs
+        )
+    block = block_class(
+        d_model,
+        mixer_cls,
+        mlp_cls,
+        norm_cls=norm_cls,
+        fused_add_norm=fused_add_norm,
+        residual_in_fp32=residual_in_fp32,
+    )
+    if isinstance(block, BlockCrossAttention) and factory_kwargs["dtype"] is not None:
+        block.encoder_attn.type(factory_kwargs["dtype"]).to(factory_kwargs["device"])
+    block.layer_idx = layer_idx
+    return block
+class BambaMixerModel(nn.Module):
+    def __init__(
+        self,
+        d_model: int,
+        n_layer: int,
+        d_intermediate: int,
+        vocab_size: int,
+        max_position_embeddings: int,
+        is_decoder: bool = False,
+        ssm_cfg=None,
+        attn_layer_idx=None,
+        attn_cfg=None,
+        norm_epsilon: float = 1e-5,
+        rms_norm: bool = False,
+        initializer_cfg=None,
+        fused_add_norm=False,
+        residual_in_fp32=False,
+        device=None,
+        dtype=None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.residual_in_fp32 = residual_in_fp32
+        self.is_decoder = is_decoder
+        if is_decoder:
+            self.embedding = nn.Embedding(vocab_size, d_model, **factory_kwargs)
+        else:
+            self.embedding = BertEmbeddings(vocab_size, d_model, max_position_embeddings, **factory_kwargs)
+        # We change the order of residual and layer norm:
+        # Instead of LN -> Attn / MLP -> Add, we do:
+        # Add -> LN -> Attn / MLP / Mixer, returning both the residual branch (output of Add) and
+        # the main branch (output of MLP / Mixer). The model definition is unchanged.
+        # This is for performance reason: we can fuse add + layer_norm.
+        self.fused_add_norm = fused_add_norm
+        if self.fused_add_norm:
+            if layer_norm_fn is None or rms_norm_fn is None:
+                raise ImportError("Failed to import Triton LayerNorm / RMSNorm kernels")
+        if is_decoder:
+            block_class = BlockCrossAttention
+        else:
+            block_class = Block
+        self.layers = nn.ModuleList(
+            [
+                create_block(
+                    d_model,
+                    d_intermediate=d_intermediate,
+                    block_class=block_class,
+                    ssm_cfg=ssm_cfg,
+                    attn_layer_idx=attn_layer_idx,
+                    attn_cfg=attn_cfg,
+                    norm_epsilon=norm_epsilon,
+                    rms_norm=rms_norm,
+                    residual_in_fp32=residual_in_fp32,
+                    fused_add_norm=fused_add_norm,
+                    layer_idx=i,
+                    **factory_kwargs,
+                )
+                for i in range(n_layer)
+            ]
+        )
+        self.norm_f = (nn.LayerNorm if not rms_norm else RMSNorm)(
+            d_model, eps=norm_epsilon, **factory_kwargs
+        )
+        if not is_decoder:
+            self.pooler = BertPooler(d_model, **factory_kwargs)
+        self.apply(
+            partial(
+                _init_weights,
+                n_layer=n_layer,
+                **(initializer_cfg if initializer_cfg is not None else {}),
+                n_residuals_per_layer=1 if d_intermediate == 0 else 2,  # 2 if we have MLP
+            )
+        )
+    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
+        return {
+            i: layer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)
+            for i, layer in enumerate(self.layers)
+        }
+    def forward(self, input_ids, token_type_ids=None, inference_params=None, encoder_hidden_states=None, attention_mask=None, **mixer_kwargs):
+        if self.is_decoder:
+            hidden_states = self.embedding(input_ids)
+        else:
+            hidden_states = self.embedding(input_ids, token_type_ids)
+        residual = None
+        for layer in self.layers:
+            if self.is_decoder:
+                hidden_states, residual = layer(
+                    hidden_states, residual, inference_params=inference_params, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, **mixer_kwargs
+                )
+            else:
+                hidden_states, residual = layer(
+                    hidden_states, residual, inference_params=inference_params, **mixer_kwargs
+                )
+        if not self.fused_add_norm:
+            residual = (hidden_states + residual) if residual is not None else hidden_states
+            hidden_states = self.norm_f(residual.to(dtype=self.norm_f.weight.dtype))
+        else:
+            # Set prenorm=False here since we don't need the residual
+            hidden_states = layer_norm_fn(
+                hidden_states,
+                self.norm_f.weight,
+                self.norm_f.bias,
+                eps=self.norm_f.eps,
+                residual=residual,
+                prenorm=False,
+                residual_in_fp32=self.residual_in_fp32,
+                is_rms_norm=isinstance(self.norm_f, RMSNorm)
+            )
+        if not self.is_decoder:
+            pooled_output = self.pooler(hidden_states)
+            return hidden_states, pooled_output
+        return hidden_states
+class BambaEncoder(nn.Module):
+    def __init__(
+        self,
+        config: BambaConfig,
+        initializer_cfg=None,
+        device=None,
+        dtype=None,
+    ) -> None:
+        self.config = config
+        d_model = config.d_model
+        n_layer = config.n_layer
+        d_intermediate = config.d_intermediate
+        vocab_size = config.vocab_size
+        max_position_embeddings = config.max_position_embeddings
+        ssm_cfg = config.ssm_cfg
+        attn_layer_idx = config.attn_layer_idx
+        attn_cfg = config.attn_cfg
+        rms_norm = config.rms_norm
+        residual_in_fp32 = config.residual_in_fp32
+        fused_add_norm = config.fused_add_norm
+        pad_vocab_size_multiple = config.pad_vocab_size_multiple
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        if vocab_size % pad_vocab_size_multiple != 0:
+            vocab_size += pad_vocab_size_multiple - (vocab_size % pad_vocab_size_multiple)
+        self.backbone = BambaMixerModel(
+            d_model=d_model,
+            n_layer=n_layer,
+            d_intermediate=d_intermediate,
+            vocab_size=vocab_size,
+            max_position_embeddings=max_position_embeddings,
+            is_decoder=False,
+            ssm_cfg=ssm_cfg,
+            attn_layer_idx=attn_layer_idx,
+            attn_cfg=attn_cfg,
+            rms_norm=rms_norm,
+            initializer_cfg=initializer_cfg,
+            fused_add_norm=fused_add_norm,
+            residual_in_fp32=residual_in_fp32,
+            **factory_kwargs,
+        )
+        self.cls = BertPreTrainingHeads(vocab_size, d_model, **factory_kwargs)
+        # Initialize weights and apply final processing
+        self.apply(
+            partial(
+                _init_weights,
+                n_layer=n_layer,
+                **(initializer_cfg if initializer_cfg is not None else {}),
+            )
+        )
+        self.tie_weights()
+    def tie_weights(self):
+        if self.config.tie_embeddings:
+            self.lm_head.weight = self.backbone.embedding.weight
+    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
+        return self.backbone.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)
+    def forward(self, input_ids, token_type_ids=None, position_ids=None, inference_params=None, num_last_tokens=0, **mixer_kwargs):
+        """
+        "position_ids" is just to be compatible with Transformer generation. We don't use it.
+        num_last_tokens: if > 0, only return the logits for the last n tokens
+        """
+        hidden_states, pooled_output = self.backbone(input_ids, token_type_ids, inference_params=inference_params, **mixer_kwargs)
+        if num_last_tokens > 0:
+            hidden_states = hidden_states[:, -num_last_tokens:]
+        lm_logits, seq_relationship_score = self.cls(hidden_states, pooled_output)
+        CausalLMOutput = namedtuple("CausalLMOutput", ["logits", "seq_relationship_logits", "hidden_states"])
+        return CausalLMOutput(logits=lm_logits, seq_relationship_logits=seq_relationship_score, hidden_states=hidden_states)
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name, device=None, dtype=None, **kwargs):
+        config_data = load_config_hf(pretrained_model_name)
+        config = MambaConfig(**config_data)
+        model = cls(config, device=device, dtype=dtype, **kwargs)
+        model.load_state_dict(load_state_dict_hf(pretrained_model_name, device=device, dtype=dtype))
+        return model
+    def save_pretrained(self, save_directory):
+        """
+        Minimal implementation of save_pretrained for MambaLMHeadModel.
+        Save the model and its configuration file to a directory.
+        """
+        # Ensure save_directory exists
+        os.makedirs(save_directory, exist_ok=True)
+        # Save the model's state_dict
+        model_path = os.path.join(save_directory, 'pytorch_model.bin')
+        torch.save(self.state_dict(), model_path)
+        # Save the configuration of the model
+        config_path = os.path.join(save_directory, 'config.json')
+        with open(config_path, 'w') as f:
+            json.dump(self.config.__dict__, f, indent=4)
+class BambaDecoder(nn.Module, GenerationMixin):
+    def __init__(
+        self,
+        config: BambaConfig,
+        initializer_cfg=None,
+        device=None,
+        dtype=None,
+    ) -> None:
+        self.config = config
+        d_model = config.d_model
+        n_layer = config.n_layer
+        d_intermediate = config.d_intermediate
+        vocab_size = config.vocab_size
+        max_position_embeddings = config.max_position_embeddings
+        ssm_cfg = config.ssm_cfg
+        attn_layer_idx = config.attn_layer_idx
+        attn_cfg = config.attn_cfg
+        rms_norm = config.rms_norm
+        residual_in_fp32 = config.residual_in_fp32
+        fused_add_norm = config.fused_add_norm
+        pad_vocab_size_multiple = config.pad_vocab_size_multiple
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        if vocab_size % pad_vocab_size_multiple != 0:
+            vocab_size += pad_vocab_size_multiple - (vocab_size % pad_vocab_size_multiple)
+        self.backbone = BambaMixerModel(
+            d_model=d_model,
+            n_layer=n_layer,
+            d_intermediate=d_intermediate,
+            vocab_size=vocab_size,
+            max_position_embeddings=max_position_embeddings,
+            is_decoder=True,
+            ssm_cfg=ssm_cfg,
+            attn_layer_idx=attn_layer_idx,
+            attn_cfg=attn_cfg,
+            rms_norm=rms_norm,
+            initializer_cfg=initializer_cfg,
+            fused_add_norm=fused_add_norm,
+            residual_in_fp32=residual_in_fp32,
+            **factory_kwargs,
+        )
+        self.lm_head = nn.Linear(d_model, vocab_size, bias=False, **factory_kwargs)
+        # Initialize weights and apply final processing
+        self.apply(
+            partial(
+                _init_weights,
+                n_layer=n_layer,
+                **(initializer_cfg if initializer_cfg is not None else {}),
+            )
+        )
+        self.tie_weights()
+    def tie_weights(self):
+        if self.config.tie_embeddings:
+            self.lm_head.weight = self.backbone.embedding.weight
+    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
+        return self.backbone.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)
+    def forward(self, input_ids, token_type_ids=None, position_ids=None, inference_params=None, num_last_tokens=0, encoder_hidden_states=None, attention_mask=None, **mixer_kwargs):
+        """
+        "position_ids" is just to be compatible with Transformer generation. We don't use it.
+        num_last_tokens: if > 0, only return the logits for the last n tokens
+        """
+        hidden_states = self.backbone(
+            input_ids, token_type_ids, inference_params=inference_params, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, **mixer_kwargs
+        )
+        if num_last_tokens > 0:
+            hidden_states = hidden_states[:, -num_last_tokens:]
+        lm_logits = self.lm_head(hidden_states)
+        CausalLMOutput = namedtuple("CausalLMOutput", ["logits"])
+        return CausalLMOutput(logits=lm_logits)
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name, device=None, dtype=None, **kwargs):
+        config_data = load_config_hf(pretrained_model_name)
+        config = MambaConfig(**config_data)
+        model = cls(config, device=device, dtype=dtype, **kwargs)
+        model.load_state_dict(load_state_dict_hf(pretrained_model_name, device=device, dtype=dtype))
+        return model
+    def save_pretrained(self, save_directory):
+        """
+        Minimal implementation of save_pretrained for MambaLMHeadModel.
+        Save the model and its configuration file to a directory.
+        """
+        # Ensure save_directory exists
+        os.makedirs(save_directory, exist_ok=True)
+        # Save the model's state_dict
+        model_path = os.path.join(save_directory, 'pytorch_model.bin')
+        torch.save(self.state_dict(), model_path)
+        # Save the configuration of the model
+        config_path = os.path.join(save_directory, 'config.json')
+        with open(config_path, 'w') as f:
+            json.dump(self.config.__dict__, f, indent=4)
+class BambaEncoderDecoder(nn.Module, GenerationMixin):
+    def __init__(
+        self,
+        config: BambaEncoderDecoderConfig,
+        tokenizer=None,
+        initializer_cfg=None,
+        device=None,
+        dtype=None,
+    ) -> None:
+        self.config = config
+        self.encoder_config = config.encoder_config
+        self.decoder_config = config.decoder_config
+        factory_kwargs = {"device": device, "dtype": dtype}
+        self.tokenizer = tokenizer
+        super().__init__()
+        self.encoder = BambaEncoder(self.encoder_config, **factory_kwargs)
+        self.decoder = BambaDecoder(self.decoder_config, **factory_kwargs)
+        self.device = device
+        self.tie_weights()
+        self._set_seed(config.seed)
+    def tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self.decoder.backbone.embedding.weight = self.encoder.backbone.embedding.word_embeddings.weight
+    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
+        return self.decoder.backbone.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)
+    def forward(self, encoder_input_ids, decoder_input_ids, token_type_ids=None, attention_mask=None, position_ids=None, inference_params=None, num_last_tokens=0, **mixer_kwargs):
+        """
+        "position_ids" is just to be compatible with Transformer generation. We don't use it.
+        num_last_tokens: if > 0, only return the logits for the last n tokens
+        """
+        encoder_hidden_states = self.encoder(encoder_input_ids, inference_params=inference_params, **mixer_kwargs).hidden_states
+        lm_logits = self.decoder(decoder_input_ids, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, inference_params=inference_params, **mixer_kwargs).logits
+        if num_last_tokens > 0:
+            hidden_states = hidden_states[:, -num_last_tokens:]
+        CausalLMOutput = namedtuple("CausalLMOutput", ["logits"])
+        return CausalLMOutput(logits=lm_logits)
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name, device=None, dtype=None, **kwargs):
+        config_data = load_config_hf(pretrained_model_name)
+        config = MambaConfig(**config_data)
+        model = cls(config, device=device, dtype=dtype, **kwargs)
+        model.load_state_dict(load_state_dict_hf(pretrained_model_name, device=device, dtype=dtype))
+        return model
+    def save_pretrained(self, save_directory):
+        """
+        Minimal implementation of save_pretrained for MambaLMHeadModel.
+        Save the model and its configuration file to a directory.
+        """
+        # Ensure save_directory exists
+        os.makedirs(save_directory, exist_ok=True)
+        # Save the model's state_dict
+        model_path = os.path.join(save_directory, 'pytorch_model.bin')
+        torch.save(self.state_dict(), model_path)
+        # Save the configuration of the model
+        config_path = os.path.join(save_directory, 'config.json')
+        with open(config_path, 'w') as f:
+            json.dump(self.config.__dict__, f, indent=4)
+    def _set_seed(self, value):
+        print('Random Seed:', value)
+        random.seed(value)
+        torch.manual_seed(value)
+        torch.cuda.manual_seed(value)
+        torch.cuda.manual_seed_all(value)
+        np.random.seed(value)
+        cudnn.deterministic = True
+        cudnn.benchmark = False
+    def extract_embeddings(self, smiles):
+        tokens = self.tokenizer(smiles, padding=True, truncation=True, return_tensors='pt')
+        idx = tokens['input_ids'].to(self.device)
+        mask = tokens['attention_mask'].to(self.device)
+        outputs = self.encoder(input_ids=idx)
+        hidden_states = outputs.hidden_states
+        token_embeddings = hidden_states
+        input_mask_expanded = mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
+        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+        embeddings = sum_embeddings / sum_mask
+        return embeddings
+    def encode(self, smiles, useCuda=False, batch_size=100, return_torch=False):
+        """Extract efficiently SMILES embeddings per batches."""
+        # TODO: remove useCuda argument
+        # handle single str or a list of str
+        smiles = pd.Series(smiles) if isinstance(smiles, str) else pd.Series(list(smiles))
+        # process in batches
+        n_split = smiles.shape[0] // batch_size if smiles.shape[0] >= batch_size else smiles.shape[0]
+        embeddings = [
+            self.extract_embeddings(list(batch)).cpu().detach().numpy()
+                for batch in tqdm(np.array_split(smiles, n_split))
+        ]
+        flat_list = [item for sublist in embeddings for item in sublist]
+        # clear GPU memory
+        torch.cuda.empty_cache()
+        gc.collect()
+        if return_torch:
+            return torch.tensor(flat_list)
+        return pd.DataFrame(flat_list)

str_bamba/bamba_config.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from dataclasses import dataclass, field
+@dataclass
+class BambaConfig:
+    """Hyper-parameters of one Bamba backbone; consumed by BambaEncoder and BambaDecoder."""
+    # Model (hidden) width.
+    d_model: int = 2560
+    # Hidden width of the per-block gated MLP; 0 builds the blocks without an MLP.
+    d_intermediate: int = 0
+    # Number of stacked blocks.
+    n_layer: int = 64
+    # Vocabulary size before padding (see pad_vocab_size_multiple).
+    vocab_size: int = 50277
+    # Forwarded to the encoder's BertEmbeddings.
+    max_position_embeddings: int = 262144
+    # Keyword arguments for the Mamba mixer; the optional "layer" key selects "Mamba1" or "Mamba2".
+    ssm_cfg: dict = field(default_factory=dict)
+    # Indices of the layers that use attention (MHA) instead of a Mamba mixer.
+    attn_layer_idx: list = field(default_factory=list)
+    # Keyword arguments for the attention (MHA) mixer.
+    attn_cfg: dict = field(default_factory=dict)
+    # Use RMSNorm instead of LayerNorm.
+    rms_norm: bool = True
+    # Forwarded to the blocks and to the final fused norm.
+    residual_in_fp32: bool = True
+    # Use the fused add + norm kernels (requires the Triton layer-norm ops).
+    fused_add_norm: bool = True
+    # The models round vocab_size up to a multiple of this value.
+    pad_vocab_size_multiple: int = 8
+    # Read by tie_weights() to decide whether output and input embeddings are shared.
+    tie_embeddings: bool = True
+@dataclass
+class BambaEncoderDecoderConfig:
+    encoder_config: BambaConfig = None
+    decoder_config: BambaConfig = None
+    tie_word_embeddings: bool = True
+    seed: int = 0

str_bamba/bamba_modules.py ADDED Viewed

	@@ -0,0 +1,229 @@

+from typing import Optional
+import torch
+from torch import nn, Tensor
+from mamba_ssm.ops.triton.layer_norm import RMSNorm, layer_norm_fn
+from transformers.models.bart.modeling_bart import BartSdpaAttention
+from transformers.activations import ACT2FN
+class BertEmbeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings."""
+    def __init__(self, vocab_size, hidden_size, max_position_embeddings, type_vocab_size=2, pad_token_id=2, layer_norm_eps=1e-12, hidden_dropout_prob=0.1, device=None, dtype=None):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.word_embeddings = nn.Embedding(vocab_size, hidden_size, padding_idx=pad_token_id, **factory_kwargs)
+        self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size, **factory_kwargs)
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps, **factory_kwargs)
+        self.dropout = nn.Dropout(hidden_dropout_prob)
+        # self.position_embedding_type = "rotary"
+        # self.register_buffer(
+        #     "position_ids", torch.arange(max_position_embeddings).expand((1, -1)), persistent=False
+        # )
+        # self.register_buffer(
+        #     "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+        # )
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        past_key_values_length: int = 0,
+    ) -> torch.Tensor:
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+        seq_length = input_shape[1]
+        # if position_ids is None:
+        #     position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
+        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
+        # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
+        # issue #5664
+        if token_type_ids is None:
+            # if hasattr(self, "token_type_ids"):
+            #     import ipdb; ipdb.set_trace()
+            #     buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+            #     buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
+            #     token_type_ids = buffered_token_type_ids_expanded
+            # else:
+            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=input_ids.device)
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+        embeddings = inputs_embeds + token_type_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+class BertPooler(nn.Module):
+    def __init__(self, hidden_size, device=None, dtype=None):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.dense = nn.Linear(hidden_size, hidden_size, **factory_kwargs)
+        self.activation = nn.Tanh()
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+class BertPredictionHeadTransform(nn.Module):
+    def __init__(self, hidden_size, hidden_act="gelu", layer_norm_eps=1e-12, device=None, dtype=None):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.dense = nn.Linear(hidden_size, hidden_size, **factory_kwargs)
+        if isinstance(hidden_act, str):
+            self.transform_act_fn = ACT2FN[hidden_act]
+        else:
+            self.transform_act_fn = hidden_act
+        self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps, **factory_kwargs)
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.transform_act_fn(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states)
+        return hidden_states
+class BertLMPredictionHead(nn.Module):
+    def __init__(self, vocab_size, hidden_size, hidden_act="gelu", layer_norm_eps=1e-12, device=None, dtype=None):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.transform = BertPredictionHeadTransform(hidden_size, hidden_act, layer_norm_eps, **factory_kwargs)
+        # The output weights are the same as the input embeddings, but there is
+        # an output-only bias for each token.
+        self.decoder = nn.Linear(hidden_size, vocab_size, bias=False, **factory_kwargs)
+        self.bias = nn.Parameter(torch.zeros(vocab_size, **factory_kwargs))
+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+    def _tie_weights(self):
+        self.decoder.bias = self.bias
+    def forward(self, hidden_states):
+        hidden_states = self.transform(hidden_states)
+        hidden_states = self.decoder(hidden_states)
+        return hidden_states
+class BertPreTrainingHeads(nn.Module):
+    def __init__(self, vocab_size, hidden_size, hidden_act="gelu", layer_norm_eps=1e-12, device=None, dtype=None):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.predictions = BertLMPredictionHead(vocab_size, hidden_size, hidden_act, layer_norm_eps, **factory_kwargs)
+        self.seq_relationship = nn.Linear(hidden_size, 2, **factory_kwargs)
+    def forward(self, sequence_output, pooled_output):
+        prediction_scores = self.predictions(sequence_output)
+        seq_relationship_score = self.seq_relationship(pooled_output)
+        return prediction_scores, seq_relationship_score
+class BlockCrossAttention(nn.Module):
+    def __init__(
+        self, dim, mixer_cls, mlp_cls, norm_cls=nn.LayerNorm, fused_add_norm=False, residual_in_fp32=False
+    ):
+        """
+        Simple block wrapping a mixer class with LayerNorm/RMSNorm and a residual connection,
+        followed by a single-head cross-attention over encoder states.
+        This Block has a slightly different structure compared to a regular
+        prenorm Transformer block.
+        The standard block is: LN -> MHA/MLP -> Add.
+        [Ref: https://arxiv.org/abs/2002.04745]
+        Here we have: Add -> LN -> Mixer -> cross-attention, returning both
+        the hidden_states (output of the last sub-layer) and the residual.
+        This is purely for performance reasons, as we can fuse add and LayerNorm.
+        The residual needs to be provided (except for the very first block).
+        Args:
+            dim: size passed to `norm_cls`, `mixer_cls`, `mlp_cls` and used as the
+                cross-attention `embed_dim`.
+            mixer_cls: callable building the mixer from `dim`.
+            mlp_cls: callable building the MLP from `dim`; pass `nn.Identity` to
+                disable the MLP sub-layer (no `norm2`/`mlp` is created then).
+            norm_cls: callable building the normalization layer from `dim`.
+            fused_add_norm: use `layer_norm_fn` to fuse the residual add and the norm.
+            residual_in_fp32: keep the residual stream in float32.
+        """
+        super().__init__()
+        self.residual_in_fp32 = residual_in_fp32
+        self.fused_add_norm = fused_add_norm
+        self.norm = norm_cls(dim)
+        self.mixer = mixer_cls(dim)
+        # Cross-attention is always built with a single head.
+        self.encoder_attn = BartSdpaAttention(embed_dim=dim, num_heads=1)
+        if mlp_cls is not nn.Identity:
+            self.norm2 = norm_cls(dim)
+            self.mlp = mlp_cls(dim)
+        else:
+            self.mlp = None
+        if self.fused_add_norm:
+            # The fused path only understands LayerNorm / RMSNorm parameters.
+            assert RMSNorm is not None, "RMSNorm import fails"
+            assert isinstance(
+                self.norm, (nn.LayerNorm, RMSNorm)
+            ), "Only LayerNorm and RMSNorm are supported for fused_add_norm"
+    def forward(
+            self, hidden_states: Tensor, residual: Optional[Tensor] = None, inference_params=None, encoder_hidden_states=None, attention_mask=None, **mixer_kwargs
+    ):
+        r"""Run Add -> Norm -> Mixer -> cross-attention (-> Add -> Norm -> MLP).
+        Args:
+            hidden_states: the sequence fed to this block (required).
+            residual: running residual stream; ``None`` for the very first block.
+                hidden_states = Mixer(LN(residual))
+            inference_params: forwarded to the mixer.
+            encoder_hidden_states: second positional input of the cross-attention
+                (presumably the key/value source -- confirm against `BartSdpaAttention`).
+            attention_mask: forwarded to the cross-attention.
+            **mixer_kwargs: extra keyword arguments forwarded to the mixer.
+        Returns:
+            Tuple ``(hidden_states, residual)``.
+        """
+        if not self.fused_add_norm:
+            # Unfused path: explicit residual add followed by the norm.
+            residual = (hidden_states + residual) if residual is not None else hidden_states
+            hidden_states = self.norm(residual.to(dtype=self.norm.weight.dtype))
+            if self.residual_in_fp32:
+                residual = residual.to(torch.float32)
+        else:
+            # Fused path: one kernel call returns both the normed output and the new residual.
+            hidden_states, residual = layer_norm_fn(
+                hidden_states,
+                self.norm.weight,
+                self.norm.bias,
+                residual=residual,
+                prenorm=True,
+                residual_in_fp32=self.residual_in_fp32,
+                eps=self.norm.eps,
+                is_rms_norm=isinstance(self.norm, RMSNorm)
+            )
+        hidden_states = self.mixer(hidden_states, inference_params=inference_params, **mixer_kwargs)
+        # cross-attention: the mixer output is combined with encoder_hidden_states and the
+        # result overwrites hidden_states; the two extra return values are discarded.
+        hidden_states, _, _ = self.encoder_attn(hidden_states, encoder_hidden_states, attention_mask=attention_mask)
+        if self.mlp is not None:
+            # Second Add -> Norm stage in front of the MLP, mirroring the one above.
+            if not self.fused_add_norm:
+                residual = hidden_states + residual
+                hidden_states = self.norm2(residual.to(dtype=self.norm2.weight.dtype))
+                if self.residual_in_fp32:
+                    residual = residual.to(torch.float32)
+            else:
+                hidden_states, residual = layer_norm_fn(
+                    hidden_states,
+                    self.norm2.weight,
+                    self.norm2.bias,
+                    residual=residual,
+                    prenorm=True,
+                    residual_in_fp32=self.residual_in_fp32,
+                    eps=self.norm2.eps,
+                    is_rms_norm=isinstance(self.norm2, RMSNorm)
+                )
+            hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
+        # Only the mixer owns an inference cache; delegate to it unchanged.
+        return self.mixer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)

str_bamba/config/config_encoder-decoder_436M.json ADDED Viewed

	@@ -0,0 +1,62 @@

+{
+    "encoder_config": {
+        "d_model": 1024,
+        "d_intermediate": 0,
+        "n_layer": 24,
+        "vocab_size": 5000,
+        "max_position_embeddings": 4096,
+        "ssm_cfg": {
+            "layer": "Mamba2"
+        },
+        "attn_layer_idx": [
+            6,
+            18
+        ],
+        "attn_cfg": {
+            "causal": false,
+            "d_conv": 0,
+            "head_dim": 64,
+            "num_heads": 16,
+            "num_heads_kv": 8,
+            "out_proj_bias": false,
+            "qkv_proj_bias": false,
+            "rotary_emb_dim": 64
+        },
+        "rms_norm": true,
+        "residual_in_fp32": true,
+        "fused_add_norm": true,
+        "pad_vocab_size_multiple": 8,
+        "tie_embeddings": false
+    },
+    "decoder_config": {
+        "d_model": 1024,
+        "d_intermediate": 0,
+        "n_layer": 24,
+        "vocab_size": 5000,
+        "max_position_embeddings": 4096,
+        "ssm_cfg": {
+            "layer": "Mamba2"
+        },
+        "attn_layer_idx": [
+            6,
+            18
+        ],
+        "attn_cfg": {
+            "causal": true,
+            "d_conv": 0,
+            "head_dim": 64,
+            "num_heads": 16,
+            "num_heads_kv": 8,
+            "out_proj_bias": false,
+            "qkv_proj_bias": false,
+            "rotary_emb_dim": 64
+        },
+        "rms_norm": true,
+        "residual_in_fp32": true,
+        "fused_add_norm": true,
+        "pad_vocab_size_multiple": 8,
+        "tie_embeddings": false
+    },
+    "tie_word_embeddings": true,
+    "seed": 0
+}

str_bamba/generation.py ADDED Viewed

	@@ -0,0 +1,398 @@

+# Copyright (c) 2023, Albert Gu, Tri Dao.
+import gc
+import time
+from collections import namedtuple
+from dataclasses import dataclass, field
+from functools import partial
+from typing import Callable, Optional, Sequence, Union
+import torch
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from torch import Tensor
+from torch.profiler import ProfilerActivity, profile, record_function
+from transformers.generation import GreedySearchDecoderOnlyOutput, SampleDecoderOnlyOutput, TextStreamer
+@dataclass
+class InferenceParams:
+    """Inference parameters that are passed to the main model in order
+    to efficienly calculate and store the context during inference."""
+    max_seqlen: int
+    max_batch_size: int
+    seqlen_offset: int = 0
+    batch_size_offset: int = 0
+    key_value_memory_dict: dict = field(default_factory=dict)
+    lengths_per_sample: Optional[Tensor] = None
+    def reset(self, max_seqlen, max_batch_size):
+        self.max_seqlen = max_seqlen
+        self.max_batch_size = max_batch_size
+        self.seqlen_offset = 0
+        if self.lengths_per_sample is not None:
+            self.lengths_per_sample.zero_()
+def modify_logits_for_min_p_filtering(logits, min_p):
+    """Set the logits for none min_p values to -inf. Done in-place."""
+    if min_p <= 0.0 or min_p >= 1.0:
+        return
+    indices_to_remove = logits < min_p
+    logits.masked_fill_(indices_to_remove, float("-Inf"))
+# https://github.com/NVIDIA/Megatron-LM/blob/0bb597b42c53355a567aba2a1357cc34b9d99ddd/megatron/text_generation/sampling.py
+# https://github.com/huggingface/transformers/blob/a44985b41cfa2de48a5e1de7f1f93b7483da25d1/src/transformers/generation/logits_process.py#L231
+def modify_logits_for_top_k_filtering(logits, top_k):
+    """Set the logits for none top-k values to -inf. Done in-place."""
+    indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
+    logits.masked_fill_(indices_to_remove, float("-Inf"))
+# https://github.com/NVIDIA/Megatron-LM/blob/0bb597b42c53355a567aba2a1357cc34b9d99ddd/megatron/text_generation/sampling.py
+# https://github.com/huggingface/transformers/blob/a44985b41cfa2de48a5e1de7f1f93b7483da25d1/src/transformers/generation/logits_process.py#L170
+def modify_logits_for_top_p_filtering(logits, top_p):
+    """Set the logits for none top-p values to -inf. Done in-place."""
+    if top_p <= 0.0 or top_p >= 1.0:
+        return
+    # First sort and calculate cumulative sum of probabilities.
+    sorted_logits, sorted_indices = torch.sort(logits, descending=False)
+    cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
+    # Remove tokens with cumulative top_p above the threshold (token with 0 are kept)
+    sorted_indices_to_remove = cumulative_probs <= (1 - top_p)
+    # scatter sorted tensors to original indexing
+    indices_to_remove = sorted_indices_to_remove.scatter(
+        1, sorted_indices, sorted_indices_to_remove
+    )
+    logits.masked_fill_(indices_to_remove, float("-inf"))
+def modify_logit_for_repetition_penalty(logits, prev_output_tokens, repetition_penalty=1.0):
+    """Apply repetition penalty. See https://arxiv.org/abs/1909.05858
+    logits: (batch_size, vocab_size)
+    prev_output_tokens: (batch_size, seq_len)
+    """
+    if repetition_penalty == 1.0:
+        return logits
+    score = torch.gather(logits, 1, prev_output_tokens)
+    # if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability
+    score = torch.where(score < 0, score * repetition_penalty, score / repetition_penalty)
+    logits.scatter_(1, prev_output_tokens, score)
+    return logits
+def sample(logits, top_k=1, top_p=0.0, min_p=0.0, temperature=1.0):
+    """Sample from top-k logits.
+    Arguments:
+        logits: Tensor of shape (batch_size, vocab_size)
+    """
+    if top_k == 1:  # Short-circuit for greedy decoding
+        return logits.argmax(dim=-1)
+    else:
+        if top_p > 0.0:
+            assert top_p <= 1.0, "top-p should be in (0, 1]."
+        if top_k > 0:
+            top_k = min(top_k, logits.size(-1))  # Safety check
+            logits_top, indices = torch.topk(logits, top_k, dim=-1)
+            if temperature != 1.0:
+                logits_top /= temperature
+            modify_logits_for_top_p_filtering(logits_top, top_p)
+            return indices[
+                torch.arange(indices.shape[0], device=indices.device),
+                torch.multinomial(torch.softmax(logits_top, dim=-1), num_samples=1).squeeze(dim=-1),
+            ]
+        else:
+            if min_p > 0.0:
+                logits_top = logits.clone()
+                max_prob = logits_top[..., 0].item()
+                min_prob = max_prob * min_p
+                modify_logits_for_min_p_filtering(logits_top, min_prob)
+                if temperature != 1.0:
+                    logits_top /= temperature
+                return torch.multinomial(torch.softmax(logits_top, dim=-1), num_samples=1).squeeze(dim=-1)
+            # Clone so that when we modify for top_p we don't change the original logits
+            logits_top = logits / temperature if temperature != 1.0 else logits.clone()
+            modify_logits_for_top_p_filtering(logits_top, top_p)
+            return torch.multinomial(torch.softmax(logits_top, dim=-1), num_samples=1).squeeze(
+                dim=-1
+            )
+@torch.inference_mode()
+def decode(
+    input_ids,
+    encoder_hidden_states,
+    model,
+    max_length,
+    top_k=1,
+    top_p=0.0,
+    min_p=0.0,
+    temperature=1.0,
+    repetition_penalty=1.0,
+    eos_token_id=None,
+    teacher_outputs=None,
+    vocab_size=None,
+    cg=False,
+    enable_timing=False,
+    output_scores=False,
+    streamer: Optional[TextStreamer] = None
+):
+    """Decoding, either greedy or with top-k or top-p sampling.
+    If top-k = 0, don't limit the number of candidates (pure sampling).
+    Top-k and top-p can be used together. If top_k > 0 and top_p > 0, then top-k is applied first,
+    then top-p.
+    We assume that all sequences in the same batch have the same length.
+    Arguments:
+        input_ids: (batch, seq_len)
+        encoder_hidden_states: passed unchanged to every model call (and to graph
+            capture when ``cg`` is set).
+        model: callable whose output exposes ``.logits``.
+        max_length: int
+        top_k, top_p, min_p, temperature: forwarded to ``sample``.
+        repetition_penalty: when different from 1.0, applied to the logits against all
+            tokens seen so far (prompt included) before sampling.
+        eos_token_id (optional): decoding stops once every row's latest token equals it.
+        teacher_outputs (optional): (batch, seq_len). If provided, instead of sampling from the
+            logits, the next token is taken from the teacher_outputs. Useful for testing.
+        vocab_size (optional): if given, logits are truncated to their first ``vocab_size`` entries.
+        cg: if True, steps after the first one run through ``model._decoding_cache.run``
+            (CUDA-graph replay) instead of calling the model directly.
+        enable_timing: if True, print the elapsed time measured with CUDA events.
+        output_scores: if True, collect a copy of the logits of every step.
+        streamer (optional): receives the prompt and every sampled token (moved to CPU).
+    Returns: GreedySearchDecoderOnlyOutput or SampleDecoderOnlyOutput, with the following fields:
+        sequences: (batch, max_length)
+        scores: tuples of (batch, vocab_size)
+    """
+    if streamer is not None:
+        streamer.put(input_ids.cpu())
+    batch_size, seqlen_og = input_ids.shape
+    teacher_output_len = teacher_outputs.shape[1] if teacher_outputs is not None else 0
+    if cg:
+        # Reuse (or build) the CUDA-graph cache stored on the model itself.
+        if not hasattr(model, "_decoding_cache"):
+            model._decoding_cache = None
+        model._decoding_cache = update_graph_cache(
+            model,
+            encoder_hidden_states,
+            model._decoding_cache,
+            batch_size,
+            seqlen_og,
+            max_length,
+        )
+        inference_params = model._decoding_cache.inference_params
+        inference_params.reset(max_length, batch_size)
+    else:
+        inference_params = InferenceParams(max_seqlen=max_length, max_batch_size=batch_size)
+    def get_logits(input_ids, inference_params):
+        # seqlen_offset == 0 means the prompt has not been processed yet.
+        decoding = inference_params.seqlen_offset > 0
+        if decoding:
+            # Single-token step: every row sits at the same absolute position.
+            position_ids = torch.full(
+                (batch_size, 1),
+                inference_params.seqlen_offset,
+                dtype=torch.long,
+                device=input_ids.device,
+            )
+        else:
+            position_ids = None
+        if not cg or not decoding:
+            logits = model(
+                input_ids,
+                encoder_hidden_states=encoder_hidden_states,
+                position_ids=position_ids,
+                inference_params=inference_params,
+                num_last_tokens=1,
+            ).logits.squeeze(dim=1)
+        else:
+            logits = model._decoding_cache.run(
+                input_ids, position_ids, inference_params.seqlen_offset
+            ).squeeze(dim=1)
+        return logits[..., :vocab_size] if vocab_size is not None else logits
+    def sample_tokens(logits, inference_params):
+        # Teacher forcing applies only while teacher_outputs still covers the current offset.
+        if teacher_outputs is None or teacher_output_len <= inference_params.seqlen_offset:
+            token = sample(logits, top_k=top_k, top_p=top_p, min_p=min_p, temperature=temperature)
+        else:
+            token = teacher_outputs[:, inference_params.seqlen_offset]
+        # return rearrange(token, "b -> b 1")
+        return token.unsqueeze(1)
+    def should_stop(current_token, inference_params):
+        # Never stop before the first forward pass has happened.
+        if inference_params.seqlen_offset == 0:
+            return False
+        if eos_token_id is not None and (current_token == eos_token_id).all():
+            return True
+        if inference_params.seqlen_offset >= max_length - 1:
+            return True
+        return False
+    start = torch.cuda.Event(enable_timing=enable_timing)
+    end = torch.cuda.Event(enable_timing=enable_timing)
+    if enable_timing:
+        start.record()
+    # `sequences` holds the prompt followed by one (batch, 1) tensor per generated step;
+    # `sequences_cat` is the running concatenation used by the repetition penalty.
+    scores, sequences = [], [input_ids]
+    sequences_cat = input_ids
+    while not should_stop(sequences[-1], inference_params):
+        logits = get_logits(sequences[-1], inference_params)
+        if output_scores:
+            # Cloned before any in-place repetition penalty is applied below.
+            scores.append(logits.clone())
+        # Advance the offset before sampling: sample_tokens indexes teacher_outputs with it.
+        inference_params.seqlen_offset += sequences[-1].shape[1]
+        if repetition_penalty == 1.0:
+            sampled_tokens = sample_tokens(logits, inference_params)
+        else:
+            logits = modify_logit_for_repetition_penalty(
+                logits, sequences_cat, repetition_penalty
+            )
+            sampled_tokens = sample_tokens(logits, inference_params)
+            sequences_cat = torch.cat([sequences_cat, sampled_tokens], dim=1)
+        sequences.append(sampled_tokens)
+        if streamer is not None:
+            streamer.put(sampled_tokens.cpu())
+    if streamer is not None:
+        streamer.end()
+    if enable_timing:
+        end.record()
+        torch.cuda.synchronize()
+        print(f"Prompt processing + decoding time: {(start.elapsed_time(end)):.0f}ms")
+    output_cls = GreedySearchDecoderOnlyOutput if top_k == 1 else SampleDecoderOnlyOutput
+    return output_cls(sequences=torch.cat(sequences, dim=1), scores=tuple(scores))
+class GenerationMixin:
+    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
+        raise NotImplementedError
+    def generate(
+        self,
+        input_ids,
+        encoder_hidden_states,
+        max_length,
+        top_k=1,
+        top_p=0.0,
+        min_p=0.0,
+        temperature=1.0,
+        return_dict_in_generate=False,
+        output_scores=False,
+        **kwargs,
+    ):
+        output = decode(
+            input_ids, encoder_hidden_states, self, max_length, top_k=top_k, top_p=top_p, min_p = min_p, temperature=temperature, output_scores=output_scores, **kwargs
+        )
+        if not output_scores:
+            output.scores = None
+        return output if return_dict_in_generate else output.sequences
+@dataclass
+class DecodingCGCache:
+    # State kept between CUDA-graph decoding calls (filled in by `update_graph_cache`).
+    # Largest batch size / sequence length the cached state was built for.
+    max_batch_size: int = 0
+    max_seqlen: int = 0
+    # `device`, `dtype` and `mempool` carry no annotation, so @dataclass leaves them as
+    # plain class attributes instead of turning them into __init__ fields.
+    device = None
+    dtype = None
+    # Captured graph runners, keyed by (batch_size, decoding_seqlen).
+    callables: dict = field(default_factory=dict)
+    mempool = None
+    inference_params: Optional[InferenceParams] = None
+    # Dispatch function selecting the captured runner that matches the input shape.
+    run: Optional[Callable] = None
+@torch.inference_mode()
+def update_graph_cache(
+    model,
+    encoder_hidden_states,
+    cache,
+    batch_size,
+    seqlen_og,
+    max_seqlen,
+    decoding_seqlens=(1,),
+    dtype=None,
+    n_warmups=2,
+):
+    """Create or refresh the CUDA-graph decoding cache and return it.
+    Arguments:
+        model: model to capture; must provide ``allocate_inference_cache``.
+        encoder_hidden_states: forwarded to ``capture_graph``.
+        cache: existing ``DecodingCGCache`` or None to start a fresh one.
+        batch_size, max_seqlen: sizes required for the upcoming decode.
+        seqlen_og: prompt length, used to initialise the inference parameters.
+        decoding_seqlens: per-step token counts for which a graph is captured.
+        dtype: cache dtype; defaults to the dtype of the model's first parameter.
+        n_warmups: warm-up iterations run before each capture.
+    Returns:
+        The cache, with ``run`` set to a dispatcher over the captured graphs and
+        ``inference_params.seqlen_offset`` reset to 0.
+    """
+    if cache is None:
+        cache = DecodingCGCache()
+    param_example = next(iter(model.parameters()))
+    device = param_example.device
+    if dtype is None:
+        dtype = param_example.dtype
+    if (
+        (device, dtype) != (cache.device, cache.dtype)
+        or batch_size > cache.max_batch_size
+        or max_seqlen > cache.max_seqlen
+    ):  # Invalidate the cache
+        # Drop every captured graph and its state before allocating replacements.
+        cache.callables = {}
+        cache.mempool = None
+        cache.inference_params = None
+        gc.collect()
+        cache.device, cache.dtype = device, dtype
+        cache.max_batch_size, cache.max_seqlen = batch_size, max_seqlen
+        assert hasattr(model, "allocate_inference_cache"), "CUDA graph decoding requires that the model has a method allocate_inference_cache"
+        inf_cache = model.allocate_inference_cache(batch_size, max_seqlen, dtype)
+        lengths_per_sample = torch.full((batch_size,), seqlen_og, dtype=torch.int32, device=device)
+        cache.inference_params = InferenceParams(
+            max_seqlen=max_seqlen,
+            max_batch_size=batch_size,
+            seqlen_offset=seqlen_og,
+            key_value_memory_dict=inf_cache,
+            lengths_per_sample=lengths_per_sample,
+        )
+        cache.mempool = torch.cuda.graphs.graph_pool_handle()
+    # NOTE(review): runners are keyed only by (batch_size, decoding_seqlen), so a later call
+    # with a different encoder_hidden_states tensor looks like it reuses a graph captured
+    # against the earlier tensor -- confirm whether that is intended.
+    for decoding_seqlen in decoding_seqlens:
+        if (batch_size, decoding_seqlen) not in cache.callables:
+            cache.callables[batch_size, decoding_seqlen] = capture_graph(
+                model,
+                encoder_hidden_states,
+                cache.inference_params,
+                batch_size,
+                max_seqlen,
+                decoding_seqlen=decoding_seqlen,
+                mempool=cache.mempool,
+                n_warmups=n_warmups,
+            )
+    def dispatch(input_ids, position_ids, seqlen):
+        # Pick the runner captured for this exact (batch, step-length) shape.
+        batch_size, decoding_seqlen = input_ids.shape[:2]
+        return cache.callables[batch_size, decoding_seqlen](input_ids, position_ids, seqlen)
+    cache.run = dispatch
+    cache.inference_params.seqlen_offset = 0  # Reset so it's not confusing
+    return cache
+def capture_graph(
+    model, encoder_hidden_states, inference_params, batch_size, max_seqlen, decoding_seqlen=1, mempool=None, n_warmups=2
+):
+    """Capture one decoding step of ``model`` as a CUDA graph and return a runner.
+    The returned ``run(new_input_ids, new_position_ids, seqlen)`` copies its inputs into
+    the buffers used during capture, replays the graph and returns a clone of the logits.
+    ``inference_params.seqlen_offset`` is restored to its incoming value before returning.
+    """
+    device = next(iter(model.parameters())).device
+    # Placeholder input buffers; `run` overwrites them before every replay.
+    input_ids = torch.full((batch_size, decoding_seqlen), 0, dtype=torch.long, device=device)
+    position_ids = torch.full((batch_size, decoding_seqlen), 0, dtype=torch.long, device=device)
+    seqlen_offset_og = inference_params.seqlen_offset
+    # Warm up and capture at the last position that still fits within max_seqlen.
+    inference_params.seqlen_offset = max_seqlen - decoding_seqlen
+    inference_params.lengths_per_sample[:] = inference_params.seqlen_offset
+    # Warmup before capture
+    s = torch.cuda.Stream()
+    s.wait_stream(torch.cuda.current_stream())
+    with torch.cuda.stream(s):
+        for _ in range(n_warmups):
+            logits = model(
+                input_ids,
+                encoder_hidden_states=encoder_hidden_states,
+                position_ids=position_ids,
+                inference_params=inference_params,
+                num_last_tokens=decoding_seqlen,
+            ).logits
+        s.synchronize()
+        # This might be needed for correctness if we run with NCCL_GRAPH_MIXING_SUPPORT=0,
+        # which requires that graph launch and non-captured launch to not overlap (I think,
+        # that's how I interpret the documentation). I'm not sure if this is required.
+        if torch.distributed.is_initialized():
+            torch.distributed.barrier()
+    torch.cuda.current_stream().wait_stream(s)
+    # Captures the graph
+    # To allow capture, automatically sets a side stream as the current stream in the context
+    graph = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(graph, pool=mempool):
+        logits = model(
+            input_ids,
+            encoder_hidden_states=encoder_hidden_states,
+            position_ids=position_ids,
+            inference_params=inference_params,
+            num_last_tokens=decoding_seqlen,
+        ).logits
+    def run(new_input_ids, new_position_ids, seqlen):
+        # Update the buffers the graph was captured with, in place, then replay.
+        inference_params.lengths_per_sample[:] = seqlen
+        input_ids.copy_(new_input_ids)
+        position_ids.copy_(new_position_ids)
+        graph.replay()
+        # Clone so the caller's result is not overwritten by the next replay.
+        return logits.clone()
+    inference_params.seqlen_offset = seqlen_offset_og
+    return run

str_bamba/load.py ADDED Viewed

	@@ -0,0 +1,60 @@

+from .bamba_config import BambaEncoderDecoderConfig
+from .bamba import BambaConfig, BambaEncoderDecoder
+from .tokenizer.str_tokenizer import load_tokenizer
+import torch
+import numpy as np
+import random
+import json
+import os
+def load_strbamba(ckpt_filename,
+			   base_folder='./str_bamba',
+			   config_filename='config_encoder-decoder_436M.json',
+			   tokenizer_filename='str_bamba_tokenizer.json',
+			   eval_model=True,
+			   device='cuda:0',
+			   dtype=torch.float32
+			   ):
+	# load config
+	with open(os.path.join(base_folder, f'config/{config_filename}')) as json_data:
+		config_json = json.load(json_data)
+	bamba_config = BambaEncoderDecoderConfig(
+        encoder_config=BambaConfig(**config_json['encoder_config']),
+        decoder_config=BambaConfig(**config_json['decoder_config']),
+        tie_word_embeddings=config_json['tie_word_embeddings'],
+        seed=config_json['seed']
+    )
+	# load tokenizer
+	tokenizer = load_tokenizer(os.path.join(base_folder, f'tokenizer/{tokenizer_filename}'))
+	# load model
+	model = BambaEncoderDecoder(bamba_config, tokenizer, device=device, dtype=dtype)
+	# load weights
+	ckpt_dict = torch.load(
+		os.path.join(base_folder, f'checkpoints/{ckpt_filename}'),
+		map_location=device,
+		weights_only=False
+	)
+	model.load_state_dict(ckpt_dict['module'])
+	# load RNG states each time the model and states are loaded from checkpoint
+	if 'rng' in ckpt_dict:
+		rng = ckpt_dict['rng']
+		for key, value in rng.items():
+			if key =='torch_state':
+				torch.set_rng_state(value.cpu())
+			elif key =='cuda_state':
+				torch.cuda.set_rng_state(value.cpu())
+			elif key =='numpy_state':
+				np.random.set_state(value)
+			elif key =='python_state':
+				random.setstate(value)
+			else:
+				print('unrecognized state')
+	if eval_model:
+		return model.eval()
+	return model

str_bamba/tokenizer/special_tokens.py ADDED Viewed

	@@ -0,0 +1,39 @@

+# Mapping from a symbolic token name to the literal token string, grouped by the kind of
+# text the token belongs to. Entries commented with "force ... to be a unique token" are
+# plain punctuation/prefix strings rather than `<...>` markers.
+STR_SPECIAL_TOKENS = {
+    ### basic tokens ###
+    "BOS_TOKEN": "<bos>",
+    "EOS_TOKEN": "<sep>",
+    "PAD_TOKEN": "<pad>",
+    "MASK_TOKEN": "<mask>",
+    "UNK_TOKEN": "<unk>",
+    ### molecular representations ###
+    # molecular formula
+    "MOLECULAR_FORMULA_TOKEN": "<formula>",
+    # canonical SMILES
+    "SMILES_TOKEN": "<smiles>",
+    # IUPAC name
+    "IUPAC_TOKEN": "<iupac>",
+    # InChI
+    "INCHI_TOKEN": "<inchi>",
+    "INCHI_INITIAL_TOKEN": "InChI=",  # force `InChI=` to be a unique token
+    "INCHI_COMMA_TOKEN": ",",  # force `,` to be a unique token
+    "INCHI_DASH_TOKEN": "-",  # force `-` to be a unique token
+    "INCHI_FORWARDSLASH_TOKEN": "/",  # force `/` to be a unique token
+    "INCHI_QUESTIONMARK_TOKEN": "?",  # force `?` to be a unique token
+    "INCHI_PARENTHESIS_OPEN_TOKEN": "(",  # force `(` to be a unique token
+    "INCHI_PARENTHESIS_CLOSE_TOKEN": ")",  # force `)` to be a unique token
+    # SELFIES
+    "SELFIES_TOKEN": "<selfies>",
+    # polymer SPG
+    "POLYMER_SPG_TOKEN": "<polymer_spg>",
+    "POLYMER_ARROW_TOKEN": "->",  # force `->` to be a unique token
+    # formulation
+    "FORMULATION_START_TOKEN": "<formulation_start>",
+    "FORMULATION_END_TOKEN": "<formulation_end>",
+}

str_bamba/tokenizer/str_bamba_tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

str_bamba/tokenizer/str_tokenizer.py ADDED Viewed

	@@ -0,0 +1,101 @@

+from typing import List
+from tokenizers import NormalizedString, PreTokenizedString
+from tokenizers.pre_tokenizers import PreTokenizer
+from transformers import PreTrainedTokenizerFast
+import re
+# Split pattern applied to inputs that start with <smiles>, <selfies> or <polymer_spg>.
+ATOM_REGEX_PATTERN = r"""(<(.*?)>|\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"""
+# Split pattern applied to inputs that start with <formulation_start>; same alternatives as
+# above plus signed decimal / exponent / integer numbers tried first.
+FORMULATION_REGEX_PATTERN = r"""(<(.*?)>|[-+]?\d*\.\d+|[-+]?\d+\.?\d*[eE][-+]?\d+|[-+]?\d+|\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"""
+# Split pattern applied to inputs that start with <formula> or <inchi>.
+NUMBER_REGEX_PATTERN = r"""(\d{2}|\d[a-zA-Z]\d|\d[a-zA-Z]|[a-zA-Z]\d+|\(|\))"""
+# Earlier variants of NUMBER_REGEX_PATTERN, kept for reference only:
+# NUMBER_REGEX_PATTERN = r"""((?<!\d)\d{2}(?!\d)|\d[a-zA-Z]\d|\d[a-zA-Z]|[a-zA-Z]\d)"""
+# NUMBER_REGEX_PATTERN = r"""(\d[a-zA-Z]\d|\d[a-zA-Z]|[a-zA-Z]\d|\b\d{2}\b)"""
+# Matches any `<...>` marker; used for inputs with none of the prefixes above.
+SPECIAL_REGEX_PATTERN = r"""<(.*?)>"""
+class MoleculePreTokenizer:
+    def molecule_based_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
+        splits = []
+        if str(normalized_string).startswith(('<smiles>', '<selfies>', '<polymer_spg>')):
+            for m in re.finditer(ATOM_REGEX_PATTERN, str(normalized_string)):
+                start = m.start(0)
+                stop = m.end(0)
+                if start == 0:  # remove special tokens
+                    continue
+                splits.append(normalized_string[start:stop])
+        elif str(normalized_string).startswith('<formulation_start>'):
+            for m in re.finditer(FORMULATION_REGEX_PATTERN, str(normalized_string)):
+                start = m.start(0)
+                stop = m.end(0)
+                if start == 0 or stop == len(str(normalized_string)):  # remove special tokens
+                    continue
+                splits.append(normalized_string[start:stop])
+        elif str(normalized_string).startswith(('<formula>', '<inchi>')):
+            for m in re.finditer(NUMBER_REGEX_PATTERN, str(normalized_string)):
+                start = m.start(0)
+                stop = m.end(0)
+                if start == 0:  # remove special tokens
+                    continue
+                splits.append(normalized_string[start:stop])
+        else:
+            last = 0
+            for m in re.finditer(SPECIAL_REGEX_PATTERN, str(normalized_string)):  # remove special tokens
+                start = m.start(0)
+                stop = m.end(0)
+                # splits.append(normalized_string[start:stop])
+                last = stop
+            splits.append(normalized_string[last:])
+        return splits
+    def pre_tokenize(self, pretok: PreTokenizedString):
+        pretok.split(self.molecule_based_split)
+class MultiMolTranBertTokenizer(PreTrainedTokenizerFast):
+        def __init__(self, vocab_file: str = '',
+                    do_lower_case=False,
+                    cls_token='<bos>',
+                    eos_token='<sep>',
+                    pad_token='<pad>',
+                    unk_token='<unk>',
+                    mask_token='<mask>',
+                    **kwargs):
+            super().__init__(
+                tokenizer_file=vocab_file,
+                bos_token=cls_token,
+                eos_token=eos_token,
+                pad_token=pad_token,
+                unk_token=unk_token,
+                mask_token=mask_token
+            )
+        def get_padding_idx(self):
+            return 2
+        def convert_idx_to_tokens(self, idx_tensor):
+            tokens = [self.convert_ids_to_tokens(idx) for idx in idx_tensor.tolist()]
+            return tokens
+        def convert_tokens_to_string(self, tokens):
+            stopwords = ['<bos>', '<eos>']
+            clean_tokens = [word for word in tokens if word not in stopwords]
+            out_string = ''.join(clean_tokens)
+            return out_string
+        def idx_to_smiles(self, torch_model, idx):
+            '''Convert tokens idx back to SMILES text'''
+            rev_tokens = torch_model.tokenizer.convert_idx_to_tokens(idx)
+            flat_list_tokens = [item for sublist in rev_tokens for item in sublist]
+            decoded_smiles = torch_model.tokenizer.convert_tokens_to_string(flat_list_tokens)
+            return decoded_smiles
+def load_tokenizer(vocab_file, **kwargs):
+    tokenizer = MultiMolTranBertTokenizer(vocab_file, **kwargs)
+    tokenizer.backend_tokenizer.pre_tokenizer = PreTokenizer.custom(MoleculePreTokenizer())
+    return tokenizer