# Load Mistral-7B-Instruct-v0.2 with transformers-neuronx, embed a padded
# prompt with the model's embedding layer, and save the result as a .npy file.
import numpy as np
import torch.nn.functional as F
from transformers import AutoTokenizer
from transformers_neuronx import MistralForSampling, GQA, NeuronConfig

# Shard the grouped-query attention KV cache along the head dimension across NeuronCores
neuron_config = NeuronConfig(
    group_query_attention=GQA.SHARD_OVER_HEADS
)

# Load the model in bf16 and compile it for two NeuronCores (tp_degree=2)
# with a maximum sequence length of 2048 positions
model_neuron = MistralForSampling.from_pretrained(
    'mistralai/Mistral-7B-Instruct-v0.2',
    amp='bf16',
    batch_size=1,
    tp_degree=2,
    n_positions=2048,
    neuron_config=neuron_config,
)
model_neuron.to_neuron()

tokenizer = AutoTokenizer.from_pretrained('mistralai/Mistral-7B-Instruct-v0.2')
tokenizer.pad_token_id = tokenizer.eos_token_id

# Wrap the prompt in the Mistral instruction template and tokenize it
input_prompt = 'Who are you?'
input_prompt = "[INST] " + input_prompt + " [/INST]"
encoded_input = tokenizer(input_prompt, return_tensors='pt')
original_input_ids = encoded_input.input_ids
input_ids_length = original_input_ids.shape[1]

# Round the sequence length up to the nearest bucket size (64, 128, 256, ...)
power_of_length = 64
while power_of_length < input_ids_length:
    power_of_length *= 2
padding_size = power_of_length
padding_gap = padding_size - input_ids_length

# Left-pad the input ids with the pad token so they fill the bucket
padded_input_ids = F.pad(original_input_ids, (padding_gap, 0), value=tokenizer.pad_token_id)
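
# Sanity check (illustrative, not part of the original flow): the padded ids
# now have exactly the bucket length, with pad tokens on the left.
assert padded_input_ids.shape[1] == padding_size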

# Compute the input embeddings with the underlying checkpoint model's embedding layer
input_embeds = model_neuron.chkpt_model.model.embed_tokens(padded_input_ids)

# Save the embeddings to disk as a NumPy array
input_embeds_np = input_embeds.detach().numpy()
np.save('./input_embeds.npy', input_embeds_np)
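
# Optional check (not in the original script): reload the saved array and
# inspect its shape; for Mistral-7B this is expected to be
# (batch_size, padding_size, hidden_size), i.e. (1, padding_size, 4096).
loaded_embeds = np.load('./input_embeds.npy')
print(loaded_embeds.shape, loaded_embeds.dtype)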