Upload 01-tgi-ie-benchmark.ipynb
01-tgi-ie-benchmark.ipynb  +38 -17
CHANGED
@@ -1,5 +1,20 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a6221e83-9d8f-4716-aeda-b40847931f56",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "git clone https://github.com/philschmid/llmperf.git\n",
+    "cd llmperf\n",
+    "pip install -e . -q"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "602a8c54-b434-4d8e-bc72-824c642fbdb5",
@@ -76,16 +91,16 @@
    "outputs": [],
    "source": [
     "# Endpoint\n",
-    "ENDPOINT_NAME=\"
-    "NAMESPACE = '
-    "MODEL = '
-    "INSTANCE_TYPE = 'nvidia-
+    "ENDPOINT_NAME=\"mixtral-exp\"\n",
+    "NAMESPACE = 'HF-test-lab'\n",
+    "MODEL = 'TheBloke/mixtral-8x7b-v0.1-GPTQ'\n",
+    "INSTANCE_TYPE = 'nvidia-l4_AWQ'\n",
     "\n",
     "# Simulation\n",
     "RESULTS_DIR = proj_dir/'tgi_benchmark_results'/INSTANCE_TYPE\n",
-    "tgi_bss = [
-    "INPUT_TOKENS =
-    "OUTPUT_TOKENS =
+    "tgi_bss = [1]\n",
+    "INPUT_TOKENS = 800\n",
+    "OUTPUT_TOKENS = 1600"
    ]
   },
   {
@@ -129,8 +144,8 @@
     " region=\"us-east-1\",\n",
     " vendor=\"aws\",\n",
     " accelerator=\"gpu\",\n",
-    " instance_size=\"
-    " instance_type='nvidia-
+    " instance_size=\"x4\",\n",
+    " instance_type='nvidia-l4',\n",
     " min_replica=0,\n",
     " max_replica=1,\n",
     " namespace=NAMESPACE,\n",
@@ -141,9 +156,10 @@
     " \"MAX_TOTAL_TOKENS\": f\"{INPUT_TOKENS + OUTPUT_TOKENS}\",\n",
     " \"MAX_BATCH_SIZE\": f\"{MAX_BATCH_SIZE}\",\n",
     " \"HF_TOKEN\": get_token(),\n",
+    " \"QUANTIZE\":\"awq\",\n",
     " \"MODEL_ID\": \"/repository\",\n",
     " },\n",
-    " \"url\": \"ghcr.io/huggingface/text-generation-inference:2.0
+    " \"url\": \"ghcr.io/huggingface/text-generation-inference:2.2.0\",\n",
     " },\n",
     " type=\"protected\",\n",
     " )\n",
@@ -179,7 +195,8 @@
     " # Set environment variables\n",
     " env = os.environ.copy()\n",
     " env['HUGGINGFACE_API_BASE'] = endpoint.url\n",
-    " env['
+    " env['HUGGINGFACE_API_TOKEN'] = get_token()\n",
+    " env['MODEL_ID'] = MODEL\n",
     " # Convert pathlib.Path to string and append to PYTHONPATH\n",
     " env['PYTHONPATH'] = str(LLMPerf_path) + (os.pathsep + env.get('PYTHONPATH', ''))\n",
     "\n",
@@ -200,16 +217,16 @@
     " # Construct the command to run the benchmark script\n",
     " command = [\n",
     " \"python\", benchmark_script,\n",
-    " \"--model\", f\"
+    " \"--model\", f\"{MODEL}\",\n",
     " \"--mean-input-tokens\", f\"{INPUT_TOKENS}\",\n",
     " \"--stddev-input-tokens\", \"10\",\n",
-    " \"--mean-output-tokens\", \"
+    " \"--mean-output-tokens\", f\"{OUTPUT_TOKENS}\",\n",
     " \"--stddev-output-tokens\", \"5\",\n",
     " \"--max-num-completed-requests\", str(min(max_requests, 1500)),\n",
     " \"--timeout\", \"7200\",\n",
     " \"--num-concurrent-requests\", str(vu),\n",
     " \"--results-dir\", str(results_dir),\n",
-    " \"--llm-api\", \"
+    " \"--llm-api\", \"huggingface\",\n",
     " \"--additional-sampling-params\", '{}'\n",
     " ]\n",
     "\n",
@@ -222,7 +239,7 @@
     " return e.output.decode(), False\n",
     "\n",
     "def find_max_working_batch_size(endpoint, tgi_bs):\n",
-    " batch_sizes = [8, 16, 32
+    " batch_sizes = [8, 16, 32]\n",
     " max_working = None\n",
     " for size in tqdm(batch_sizes):\n",
     " tqdm.write(f\"Running: TGIBS {tgi_bs} Client Requests {size}\")\n",
@@ -255,7 +272,11 @@
    "source": [
     "for tgi_bs in tqdm(tgi_bss):\n",
     " name = f\"{ENDPOINT_NAME}--tgibs-{tgi_bs}\"\n",
-    "
+    " try:\n",
+    " endpoint = get_inference_endpoint(name, namespace=NAMESPACE)\n",
+    " except:\n",
+    " endpoint = create_endpoint(MAX_BATCH_SIZE=tgi_bs, name=name, instance_type=INSTANCE_TYPE) \n",
+    " pass\n",
     " endpoint.wait()\n",
     " tqdm.write(f\"Endpoint Created: {name}\")\n",
     " max_batch_size = find_max_working_batch_size(endpoint=endpoint, tgi_bs=tgi_bs)\n",
@@ -266,7 +287,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "
+   "id": "70a5f441-3da7-4888-9943-112750681067",
    "metadata": {},
    "outputs": [],
    "source": []
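The most substantive change in this commit is the get-or-create logic added to the sweep loop: the notebook now reuses an Inference Endpoint if one with the same name already exists, instead of always creating a fresh one per TGI batch size. Below is a minimal sketch of that pattern, assuming huggingface_hub's public `get_inference_endpoint` / `create_inference_endpoint` API; the notebook wraps creation in its own `create_endpoint` helper, `get_or_create_endpoint` is an illustrative name, and catching `HfHubHTTPError` is a deliberate tightening of the notebook's bare `except:`. Argument values mirror the diff above.

```python
# Sketch of the commit's get-or-create endpoint pattern (illustrative, not
# the notebook's exact helper). Assumes huggingface_hub's Inference
# Endpoints API; values mirror the diff above.
from huggingface_hub import create_inference_endpoint, get_inference_endpoint
from huggingface_hub.utils import HfHubHTTPError


def get_or_create_endpoint(name: str, namespace: str, model: str, tgi_bs: int):
    try:
        # Reuse an endpoint left over from a previous run under the same name.
        endpoint = get_inference_endpoint(name, namespace=namespace)
    except HfHubHTTPError:
        # Not found: create it with the settings shown in the diff. The
        # notebook also sets MAX_INPUT_LENGTH, MAX_TOTAL_TOKENS, and HF_TOKEN
        # in the container env; they are omitted here for brevity.
        endpoint = create_inference_endpoint(
            name,
            repository=model,  # e.g. 'TheBloke/mixtral-8x7b-v0.1-GPTQ'
            framework="pytorch",
            task="text-generation",
            vendor="aws",
            region="us-east-1",
            accelerator="gpu",
            instance_size="x4",
            instance_type="nvidia-l4",
            min_replica=0,
            max_replica=1,
            namespace=namespace,
            type="protected",
            custom_image={
                "health_route": "/health",
                "env": {
                    "MAX_BATCH_SIZE": f"{tgi_bs}",
                    "QUANTIZE": "awq",
                    "MODEL_ID": "/repository",
                },
                "url": "ghcr.io/huggingface/text-generation-inference:2.2.0",
            },
        )
    # Block until the endpoint reports ready before benchmarking against it.
    endpoint.wait()
    return endpoint
```

Catching the narrower `HfHubHTTPError` keeps genuine failures, such as a typo in `NAMESPACE` or an expired token, visible instead of silently triggering a new endpoint creation the way the notebook's bare `except:` would.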
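The other notable change is the benchmark invocation: the commit pins `--llm-api` to `huggingface` and hands the endpoint URL and token to LLMPerf through environment variables. The following is a self-contained sketch of that subprocess wiring, assuming the `token_benchmark_ray.py` entry point from the cloned philschmid/llmperf checkout; `run_benchmark` and its parameters are illustrative names, not the notebook's own.

```python
# Sketch of how the notebook drives llmperf against the endpoint; flag
# values mirror the diff above.
import os
import subprocess
from pathlib import Path


def run_benchmark(endpoint_url: str, token: str, model: str,
                  llmperf_path: Path, input_tokens: int, output_tokens: int,
                  vu: int, max_requests: int, results_dir: Path):
    env = os.environ.copy()
    env["HUGGINGFACE_API_BASE"] = endpoint_url   # point llmperf at the endpoint
    env["HUGGINGFACE_API_TOKEN"] = token         # auth for a 'protected' endpoint
    env["MODEL_ID"] = model
    env["PYTHONPATH"] = str(llmperf_path) + os.pathsep + env.get("PYTHONPATH", "")
    command = [
        "python", str(llmperf_path / "token_benchmark_ray.py"),
        "--model", model,
        "--mean-input-tokens", str(input_tokens),
        "--stddev-input-tokens", "10",
        "--mean-output-tokens", str(output_tokens),
        "--stddev-output-tokens", "5",
        "--max-num-completed-requests", str(min(max_requests, 1500)),
        "--timeout", "7200",
        "--num-concurrent-requests", str(vu),
        "--results-dir", str(results_dir),
        "--llm-api", "huggingface",
        "--additional-sampling-params", "{}",
    ]
    return subprocess.run(command, env=env, capture_output=True, text=True)
```

Passing the token through the environment rather than on the command line keeps it out of process listings and out of any logged benchmark commands.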