YAML Metadata Warning: empty or missing YAML metadata in repo card

Check out the documentation for more information.

To use the pretrained model, refer to the code below and adapt it to your own sequences.

import os
import torch
import importlib.util
from tokenizers import Tokenizer
from huggingface_hub import hf_hub_download

# Function to download and load a Python module from the Hugging Face Hub.
def load_module_from_hub(repo_id, filename):
    """Download a Python source file from the Hugging Face Hub and import it.

    NOTE(security): the downloaded file is executed as Python code, so only
    point this at repositories you trust.

    Args:
        repo_id: Hub repository identifier, forwarded to ``hf_hub_download``.
        filename: Name of the Python file inside the repository.

    Returns:
        The executed module object. Its module name is the downloaded file's
        base name without the extension.

    Raises:
        ImportError: If no import spec (or loader) can be built for the
            downloaded file, e.g. because it lacks a ``.py`` suffix.
    """
    file_path = hf_hub_download(repo_id=repo_id, filename=filename)
    module_name = os.path.splitext(os.path.basename(file_path))[0]
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    # spec_from_file_location returns None when it cannot pick a loader for
    # the file; fail with a clear message instead of an AttributeError on None.
    if spec is None or spec.loader is None:
        raise ImportError(
            f"Cannot import {filename!r} from {repo_id!r}: "
            f"no loader available for {file_path!r}"
        )
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module

# Hub repository that hosts the model code, tokenizer and checkpoint.
repo_id = "dzjxzyd/PepBERT-small-UniRef100"

# 1) Fetch model.py and config.py (in that order), import them as modules,
#    and pull out the two factory functions used further down.
model_module, config_module = (
    load_module_from_hub(repo_id, source_file)
    for source_file in ("model.py", "config.py")
)
build_transformer, get_config = (
    model_module.build_transformer,
    config_module.get_config,
)

# 2) Fetch tokenizer.json and build the tokenizer from it.
tokenizer_path = hf_hub_download(
    repo_id=repo_id,
    filename="tokenizer.json",
)
tokenizer = Tokenizer.from_file(tokenizer_path)

# 3) Fetch the checkpoint file (tmodel_12.pt); it is loaded later.
weights_path = hf_hub_download(
    repo_id=repo_id,
    filename="tmodel_12.pt",
)

# 4) Initialize the model structure and load the weights.
# Pick the compute device. For MPS only `is_available()` is consulted:
# `is_built()` merely reports that this PyTorch binary was compiled with MPS
# support, which can be true on machines where MPS cannot actually be used.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

config = get_config()
model = build_transformer(
    src_vocab_size=tokenizer.get_vocab_size(),
    src_seq_len=config["seq_len"],
    d_model=config["d_model"],
)
# NOTE(security): torch.load unpickles the checkpoint, which can execute
# arbitrary code. Only load checkpoints from a trusted repository; consider
# passing weights_only=True if the checkpoint is compatible with it.
state = torch.load(weights_path, map_location=torch.device(device))
model.load_state_dict(state["model_state_dict"])
# Move the model to the selected device so inference really runs there
# (map_location alone only affects where the checkpoint tensors are loaded).
model.to(device)
model.eval()

# 5) Generate embeddings for an example amino acid sequence.
# Add special tokens [SOS] and [EOS] as in training.
sequence = "KRKGFLGI"
encoded_ids = (
    [tokenizer.token_to_id("[SOS]")]
    + tokenizer.encode(sequence).ids
    + [tokenizer.token_to_id("[EOS]")]
)
# Inputs must live on the same device as the model.
input_ids = torch.tensor([encoded_ids], dtype=torch.int64, device=device)

with torch.no_grad():
    # Create a simple attention mask (all ones, since no padding is used here)
    encoder_mask = torch.ones(
        (1, 1, 1, input_ids.size(1)), dtype=torch.int64, device=device
    )
    # Forward pass through the encoder to get token embeddings
    emb = model.encode(input_ids, encoder_mask)
    # Remove first ([SOS]) and last ([EOS]) embeddings
    emb_no_first_last = emb[:, 1:-1, :]
    # Apply average pooling over the remaining tokens to get a fixed-size
    # vector, then bring it back to the CPU for printing / downstream use.
    emb_avg = emb_no_first_last.mean(dim=1).cpu()

print("Shape of emb_avg:", emb_avg.shape)
print("emb_avg:", emb_avg)

Downloads last month: -; Downloads are not tracked for this model. How to track

Inference Providers NEW

This model isn't deployed by any Inference Provider. 🙋 Ask for provider support