# --------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
import os
from pathlib import Path

import torch
import torch.distributed as dist
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoConfig, AutoTokenizer, GenerationConfig

device_id = 0
device = torch.device(f"cuda:{device_id}")  # change to torch.device("cpu") if running on CPU
ep = "CUDAExecutionProvider"  # change to "CPUExecutionProvider" if running on CPU
ep_options = {"device_id": device_id}

model_id = "mistralai/Mistral-7B-Instruct-v0.2"
model_path = Path("./Olive/examples/llama2/models/qlora/qlora-conversion-transformers_optimization-bnb_quantization/gpu-cuda_model")
| if not (model_path / "config.json").exists(): | |
| config = AutoConfig.from_pretrained(model_id) | |
| config.save_pretrained(model_path) | |
| else: | |
| config = AutoConfig.from_pretrained(model_path) | |
| if not (model_path / "generation_config.json").exists(): | |
| gen_config = GenerationConfig.from_pretrained(model_id) | |
| gen_config.save_pretrained(model_path) | |
| else: | |
| gen_config = GenerationConfig.from_pretrained(model_path) | |

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = ORTModelForCausalLM.from_pretrained(
    model_path,
    config=config,
    generation_config=gen_config,
    use_io_binding=True,  # bind inputs/outputs directly on the device to avoid host round-trips
    provider=ep,
    provider_options=ep_options,  # for multi-GPU runs, set device_id per process, e.g. {"device_id": str(rank)}
)
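
# A minimal smoke-test sketch (not part of the original script): the prompt below is
# illustrative only, and it assumes the exported model supports the standard
# transformers generate() API, which ORTModelForCausalLM provides.
prompt = "[INST] What is ONNX Runtime? [/INST]"
inputs = tokenizer(prompt, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])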