jva96160 committed
Commit 83f95e1 · verified · 1 Parent(s): 92175c2

Upload 3 files

Files changed (3)
  1. compose.yaml +21 -0
  2. merge_quant.ipynb +169 -0
  3. run.sh +13 -0
compose.yaml ADDED
@@ -0,0 +1,21 @@
+ version: '0'  # note: '0' is not a valid Compose schema version; the top-level "version" key is obsolete in Compose v2 and can be removed
+
+ services:
+   vllm-openai:
+     restart: always
+     image: vllm/vllm-openai:latest
+     container_name: custom_service
+     shm_size: "32g"
+     ports:
+       - "8087:8087"
+       - "8088:8088"
+     volumes:
+       - "/home/jeff/Custom_service/deploy:/root"
+     entrypoint: /bin/bash /root/run.sh
+     deploy:
+       resources:
+         reservations:
+           devices:
+             - driver: nvidia
+               count: all
+               capabilities: [gpu]
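A quick way to confirm the container came up correctly, once the stack has been started with `docker compose up -d` on a host with the NVIDIA Container Toolkit installed, is to poll vLLM's health endpoint on the published port. A minimal sketch (it assumes the server runs on localhost and that vLLM has finished loading the model, which can take a few minutes):

    # Poll the vLLM OpenAI server's /health endpoint on the port published above.
    import urllib.request

    resp = urllib.request.urlopen("http://localhost:8087/health", timeout=5)
    print(resp.status)  # 200 once the server is ready to accept requests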
merge_quant.ipynb ADDED
@@ -0,0 +1,169 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 1,
+    "id": "3feede9c",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "/opt/miniconda3/envs/py10/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+       "  from .autonotebook import tqdm as notebook_tqdm\n",
+       "`torch_dtype` is deprecated! Use `dtype` instead!\n",
+       "Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 274.73it/s]\n"
+      ]
+     }
+    ],
+    "source": [
+     "from transformers import (\n",
+     "    AutoTokenizer,\n",
+     "    AutoModelForCausalLM,\n",
+     "    TrainingArguments,\n",
+     "    AutoProcessor\n",
+     ")\n",
+     "from peft import PeftModel, PeftConfig\n",
+     "import torch\n",
+     "MODEL_NAME = \"/home/jeff/Custom_service/Llama-3.1-Nemotron-Nano-8B-v1\"\n",
+     "llm = AutoModelForCausalLM.from_pretrained(\n",
+     "    MODEL_NAME,\n",
+     "    device_map=\"cpu\",\n",
+     "    trust_remote_code=True,\n",
+     "    torch_dtype=torch.bfloat16,\n",
+     "    )\n",
+     "llm = PeftModel.from_pretrained(llm, \"/home/jeff/Custom_service/train_15\")\n",
+     "llm_processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)\n",
+     "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 2,
+    "id": "a8dd97f7",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "('merged/tokenizer_config.json',\n",
+        " 'merged/special_tokens_map.json',\n",
+        " 'merged/chat_template.jinja',\n",
+        " 'merged/tokenizer.json')"
+       ]
+      },
+      "execution_count": 2,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "merged_model = llm.merge_and_unload()\n",
+     "output_des = 'merged'\n",
+     "merged_model.save_pretrained(output_des)\n",
+     "tokenizer.save_pretrained(output_des)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 3,
+    "id": "da1ec848",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "Loading checkpoint shards: 100%|██████████| 4/4 [01:36<00:00, 24.01s/it]\n"
+      ]
+     }
+    ],
+    "source": [
+     "from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer\n",
+     "import torch\n",
+     "\n",
+     "nf4_config = BitsAndBytesConfig(\n",
+     "    load_in_4bit=True,\n",
+     "    bnb_4bit_quant_type=\"nf4\",\n",
+     "    bnb_4bit_use_double_quant=True,\n",
+     "    bnb_4bit_compute_dtype=torch.bfloat16\n",
+     ")\n",
+     "\n",
+     "model_nf4 = AutoModelForCausalLM.from_pretrained('/home/jeff/Custom_service/deploy/merged', device_map=\"cpu\", quantization_config=nf4_config)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 4,
+    "id": "3675e104",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "('quant_nf4/tokenizer_config.json',\n",
+        " 'quant_nf4/special_tokens_map.json',\n",
+        " 'quant_nf4/chat_template.jinja',\n",
+        " 'quant_nf4/tokenizer.json')"
+       ]
+      },
+      "execution_count": 4,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "tokenizer = AutoTokenizer.from_pretrained('/home/jeff/Custom_service/deploy/merged')\n",
+     "tokenizer.save_pretrained('quant_nf4')"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 5,
+    "id": "267dd7eb",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "model_nf4.save_pretrained('quant_nf4')\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "f8f0b7ff",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "39713d70",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "py10",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.10.18"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
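The notebook merges the LoRA adapter into the base model, serializes an NF4-quantized copy, and saves the tokenizer alongside it. A minimal sketch of a sanity check on the result, assuming the `quant_nf4` directory produced above, a CUDA-capable GPU (bitsandbytes 4-bit weights are meant to run on GPU, not CPU), and an arbitrary test prompt:

    # Reload the serialized NF4 checkpoint and run a short generation as a smoke test.
    # The quantization_config stored in quant_nf4/config.json makes transformers
    # reload the weights in 4-bit automatically.
    from transformers import AutoModelForCausalLM, AutoTokenizer
    import torch

    model = AutoModelForCausalLM.from_pretrained("quant_nf4", device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained("quant_nf4")

    inputs = tokenizer("Hello, who are you?", return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=32)
    print(tokenizer.decode(out[0], skip_special_tokens=True))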
run.sh ADDED
@@ -0,0 +1,13 @@
+ pip install pypinyin
+ pip install rapidfuzz
+ pip install openai
+ pip install fastapi
+ pip install uvicorn
+
+ cd /root
+
+ vllm serve /root/merged --host 0.0.0.0 --port 8087 --max-model-len 16384 --quantization bitsandbytes --load-format bitsandbytes --gpu-memory-utilization 0.8 --override-generation-config '{"temperature": 0.6}' &
+
+ uvicorn main:app --host 0.0.0.0 --port 8088 --log-level info --workers 1 >> ./log.txt 2>&1 &
+
+ wait
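Once run.sh has started both processes, vLLM exposes an OpenAI-compatible API on port 8087 (the FastAPI app on 8088 is defined in a main.py not included in this commit). A minimal client-side sketch using the openai package installed above; it assumes localhost, and relies on vLLM's default of naming the served model after the path passed to `vllm serve`, i.e. /root/merged:

    # Query the vLLM OpenAI-compatible endpoint started by run.sh.
    # api_key is a placeholder since no key is configured on the server.
    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8087/v1", api_key="EMPTY")
    resp = client.chat.completions.create(
        model="/root/merged",
        messages=[{"role": "user", "content": "Say hello."}],
        max_tokens=32,
    )
    print(resp.choices[0].message.content)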