|
|
--- |
|
|
license: llama2 |
|
|
--- |
|
|
Quick notes — the steps I followed to get to this point:
|
|
|
|
|
``` |
|
|
# Export TencentARC/LLaMA-Pro-8B to an AWS Neuron (Inferentia2/Trainium) graph
# and save the compiled model plus its tokenizer into one local directory.
from optimum.neuron import NeuronModelForCausalLM
from transformers import AutoTokenizer

model_id = "TencentARC/LLaMA-Pro-8B"

# Compile for 2 Neuron cores with fp16 auto-casting. The Neuron compiler
# requires fixed shapes, so sequence length and batch size are pinned here.
compiler_args = {"num_cores": 2, "auto_cast_type": "fp16"}
input_shapes = {"sequence_length": 2048, "batch_size": 2}

# export=True downloads the checkpoint and compiles it to a Neuron model;
# this step is slow and memory-hungry compared to a normal from_pretrained.
llm = NeuronModelForCausalLM.from_pretrained(model_id, export=True, **input_shapes, **compiler_args)

save_directory = "Tencent_neuron"
llm.save_pretrained(save_directory)

# Save the tokenizer alongside the compiled model so the directory is
# self-contained for inference / upload.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.save_pretrained(save_directory)
|
|
``` |
|
|
|
|
|
``` |
|
|
# Sanity-check generation against the locally exported Neuron model.
from optimum.neuron import pipeline

# Directory produced by the export step above.
save_directory = "Tencent_neuron"
pipe = pipeline("text-generation", save_directory)

# Format the conversation with the tokenizer's chat template — see
# https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {"role": "user", "content": "What is 2+2?"},
]
prompt = pipe.tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Sampling parameters for the generation run.
generation_kwargs = {
    "max_new_tokens": 2048,
    "do_sample": True,
    "temperature": 0.7,
    "top_k": 50,
    "top_p": 0.95,
}
outputs = pipe(prompt, **generation_kwargs)
print(outputs[0]["generated_text"])
|
|
|
|
|
``` |
|
|
``` |
|
|
# Push the exported Neuron artifacts to the Hugging Face Hub.
from huggingface_hub import HfApi, login

# Prompt for a Hub token first, then build the API client.
login()
api = HfApi()

save_directory = "Tencent_neuron"

# Split the large upload into multiple commits for robustness.
# NOTE(review): multi_commits/multi_commits_verbose are deprecated in recent
# huggingface_hub releases — confirm the installed version still accepts them.
api.upload_folder(
    folder_path=save_directory,
    repo_id="jburtoft/TencentARC-LLaMA-Pro-8B-Neuron",
    repo_type="model",
    multi_commits=True,
    multi_commits_verbose=True,
)
|
|
``` |