jva96160 committed
Commit 83f95e1 · verified · 1 Parent(s): 92175c2

Upload 3 files

Files changed (3)
  1. compose.yaml +21 -0
  2. merge_quant.ipynb +169 -0
  3. run.sh +13 -0
compose.yaml ADDED
@@ -0,0 +1,21 @@
+ version: '0'  # note: '0' is not a valid Compose schema version; the top-level "version" key is obsolete in Compose v2 and can be removed
+
+ services:
+   vllm-openai:
+     restart: always
+     image: vllm/vllm-openai:latest
+     container_name: custom_service
+     shm_size: "32g"
+     ports:
+       - "8087:8087"
+       - "8088:8088"
+     volumes:
+       - "/home/jeff/Custom_service/deploy:/root"
+     entrypoint: /bin/bash /root/run.sh
+     deploy:
+       resources:
+         reservations:
+           devices:
+             - driver: nvidia
+               count: all
+               capabilities: [gpu]
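A quick way to confirm the container came up correctly, once the stack has been started with `docker compose up -d` on a host with the NVIDIA Container Toolkit installed, is to poll vLLM's health endpoint on the published port. A minimal sketch (it assumes the server runs on localhost and that vLLM has finished loading the model, which can take a few minutes):

    # Poll the vLLM OpenAI server's /health endpoint on the port published above.
    import urllib.request

    resp = urllib.request.urlopen("http://localhost:8087/health", timeout=5)
    print(resp.status)  # 200 once the server is ready to accept requests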
merge_quant.ipynb ADDED
@@ -0,0 +1,169 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 1,
+    "id": "3feede9c",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "/opt/miniconda3/envs/py10/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+       "  from .autonotebook import tqdm as notebook_tqdm\n",
+       "`torch_dtype` is deprecated! Use `dtype` instead!\n",
+       "Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 274.73it/s]\n"
+      ]
+     }
+    ],
+    "source": [
+     "from transformers import (\n",
+     "    AutoTokenizer,\n",
+     "    AutoModelForCausalLM,\n",
+     "    TrainingArguments,\n",
+     "    AutoProcessor\n",
+     ")\n",
+     "from peft import PeftModel, PeftConfig\n",
+     "import torch\n",
+     "MODEL_NAME = \"/home/jeff/Custom_service/Llama-3.1-Nemotron-Nano-8B-v1\"\n",
+     "llm = AutoModelForCausalLM.from_pretrained(\n",
+     "    MODEL_NAME,\n",
+     "    device_map=\"cpu\",\n",
+     "    trust_remote_code=True,\n",
+     "    torch_dtype=torch.bfloat16,\n",
+     "    )\n",
+     "llm = PeftModel.from_pretrained(llm, \"/home/jeff/Custom_service/train_15\")\n",
+     "llm_processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)\n",
+     "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 2,
+    "id": "a8dd97f7",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "('merged/tokenizer_config.json',\n",
+        " 'merged/special_tokens_map.json',\n",
+        " 'merged/chat_template.jinja',\n",
+        " 'merged/tokenizer.json')"
+       ]
+      },
+      "execution_count": 2,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "merged_model = llm.merge_and_unload()\n",
+     "output_des = 'merged'\n",
+     "merged_model.save_pretrained(output_des)\n",
+     "tokenizer.save_pretrained(output_des)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 3,
+    "id": "da1ec848",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "Loading checkpoint shards: 100%|██████████| 4/4 [01:36<00:00, 24.01s/it]\n"
+      ]
+     }
+    ],
+    "source": [
+     "from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer\n",
+     "import torch\n",
+     "\n",
+     "nf4_config = BitsAndBytesConfig(\n",
+     "    load_in_4bit=True,\n",
+     "    bnb_4bit_quant_type=\"nf4\",\n",
+     "    bnb_4bit_use_double_quant=True,\n",
+     "    bnb_4bit_compute_dtype=torch.bfloat16\n",
+     ")\n",
+     "\n",
+     "model_nf4 = AutoModelForCausalLM.from_pretrained('/home/jeff/Custom_service/deploy/merged', device_map=\"cpu\", quantization_config=nf4_config)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 4,
+    "id": "3675e104",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "('quant_nf4/tokenizer_config.json',\n",
+        " 'quant_nf4/special_tokens_map.json',\n",
+        " 'quant_nf4/chat_template.jinja',\n",
+        " 'quant_nf4/tokenizer.json')"
+       ]
+      },
+      "execution_count": 4,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "tokenizer = AutoTokenizer.from_pretrained('/home/jeff/Custom_service/deploy/merged')\n",
+     "tokenizer.save_pretrained('quant_nf4')"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 5,
+    "id": "267dd7eb",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "model_nf4.save_pretrained('quant_nf4')\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "f8f0b7ff",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "39713d70",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "py10",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.10.18"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
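The notebook merges the LoRA adapter into the base model, serializes an NF4-quantized copy, and saves the tokenizer alongside it. A minimal sketch of a sanity check on the result, assuming the `quant_nf4` directory produced above, a CUDA-capable GPU (bitsandbytes 4-bit weights are meant to run on GPU, not CPU), and an arbitrary test prompt:

    # Reload the serialized NF4 checkpoint and run a short generation as a smoke test.
    # The quantization_config stored in quant_nf4/config.json makes transformers
    # reload the weights in 4-bit automatically.
    from transformers import AutoModelForCausalLM, AutoTokenizer
    import torch

    model = AutoModelForCausalLM.from_pretrained("quant_nf4", device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained("quant_nf4")

    inputs = tokenizer("Hello, who are you?", return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=32)
    print(tokenizer.decode(out[0], skip_special_tokens=True))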
run.sh ADDED
@@ -0,0 +1,13 @@
+ pip install pypinyin
+ pip install rapidfuzz
+ pip install openai
+ pip install fastapi
+ pip install uvicorn
+
+ cd /root
+
+ vllm serve /root/merged --host 0.0.0.0 --port 8087 --max-model-len 16384 --quantization bitsandbytes --load-format bitsandbytes --gpu-memory-utilization 0.8 --override-generation-config '{"temperature": 0.6}' &
+
+ uvicorn main:app --host 0.0.0.0 --port 8088 --log-level info --workers 1 >> ./log.txt 2>&1 &
+
+ wait
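Once run.sh has started both processes, vLLM exposes an OpenAI-compatible API on port 8087 (the FastAPI app on 8088 is defined in a main.py not included in this commit). A minimal client-side sketch using the openai package installed above; it assumes localhost, and relies on vLLM's default of naming the served model after the path passed to `vllm serve`, i.e. /root/merged:

    # Query the vLLM OpenAI-compatible endpoint started by run.sh.
    # api_key is a placeholder since no key is configured on the server.
    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8087/v1", api_key="EMPTY")
    resp = client.chat.completions.create(
        model="/root/merged",
        messages=[{"role": "user", "content": "Say hello."}],
        max_tokens=32,
    )
    print(resp.choices[0].message.content)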