// k6 benchmark script for the llama.cpp server: streams OpenAI-compatible
// chat completions over SSE and records token throughput metrics.
import sse from 'k6/x/sse'
import {check, sleep} from 'k6'
import {SharedArray} from 'k6/data'
import {Counter, Rate, Trend} from 'k6/metrics'
import exec from 'k6/execution'
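// Usage sketch: k6/x/sse is provided by a k6 extension, not stock k6, so the
// binary must be built with it first (assuming the xk6 build tool and the
// phymbert/xk6-sse module):
//
//   xk6 build --with github.com/phymbert/xk6-sse
//   ./k6 run script.js --duration 10m --vus 8
//
// Every SERVER_BENCH_* constant below can be overridden with k6's -e flag,
// e.g. ./k6 run script.js -e SERVER_BENCH_MAX_TOKENS=256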
|
// Server chat completions endpoint prefix
const server_url = __ENV.SERVER_BENCH_URL ? __ENV.SERVER_BENCH_URL : 'http://localhost:8080/v1'

// Total number of prompts to select from the dataset (default 480)
const n_prompt = __ENV.SERVER_BENCH_N_PROMPTS ? parseInt(__ENV.SERVER_BENCH_N_PROMPTS) : 600 / 10 * 8

// Model alias to request
const model = __ENV.SERVER_BENCH_MODEL_ALIAS ? __ENV.SERVER_BENCH_MODEL_ALIAS : 'my-model'

// Path to the ShareGPT-format dataset
const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : './ShareGPT_V3_unfiltered_cleaned_split.json'

// Maximum number of tokens to predict per completion
const max_tokens = __ENV.SERVER_BENCH_MAX_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_TOKENS) : 512

// Maximum number of prompt tokens; longer prompts are filtered out
const n_prompt_tokens = __ENV.SERVER_BENCH_MAX_PROMPT_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_PROMPT_TOKENS) : 1024

// Maximum slot context: prompt plus completion must fit within this budget
const n_ctx_slot = __ENV.SERVER_BENCH_MAX_CONTEXT ? parseInt(__ENV.SERVER_BENCH_MAX_CONTEXT) : 2048
|
export function setup() {
    console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path} max_tokens=${max_tokens}`)
}
|
// Load and filter the dataset once; SharedArray shares it across all VUs.
const data = new SharedArray('conversations', function () {
    // Crude token count approximation: split on whitespace and punctuation
    const tokenizer = (message) => message.split(/[\s,'".?]/)

    return JSON.parse(open(dataset_path))
        // Keep conversations with at least two turns, opened by a human
        .filter(data => data["conversations"].length >= 2)
        .filter(data => data["conversations"][0]["from"] === "human")
        .map(data => {
            return {
                prompt: data["conversations"][0]["value"],
                n_prompt_tokens: tokenizer(data["conversations"][0]["value"]).length,
                n_completion_tokens: tokenizer(data["conversations"][1]["value"]).length,
            }
        })
        // Drop very short prompts and completions
        .filter(conv => conv.n_prompt_tokens >= 4 && conv.n_completion_tokens >= 4)
        // Drop conversations that would not fit in the slot context
        .filter(conv => conv.n_prompt_tokens <= n_prompt_tokens && conv.n_prompt_tokens + conv.n_completion_tokens <= n_ctx_slot)
        .slice(0, n_prompt)
})
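// For reference, each dataset record consumed above is expected to look
// roughly like this (shape inferred from the filters above):
//   {"conversations": [{"from": "human", "value": "..."},
//                      {"from": "gpt",   "value": "..."}]}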
|
// Custom metrics reported in the k6 end-of-test summary
const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens')
const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')

const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
const llamacpp_emit_first_token_second = new Trend('llamacpp_emit_first_token_second')

const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')

const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate')
const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate')
|
export const options = {
    thresholds: {
        llamacpp_completions_truncated_rate: [
            // Abort the run if more than 80% of completions hit max_tokens
            {threshold: 'rate < 0.8', abortOnFail: true, delayAbortEval: '1m'},
        ],
    },
    duration: '10m',
    vus: 8,
}
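// Optional alternative: k6's standard `stages` option supports a ramping load
// profile instead of the fixed duration/vus pair above. A minimal sketch
// (targets and durations are illustrative, not part of the original config):
//
// export const options = {
//     stages: [
//         {duration: '1m', target: 8}, // ramp up to 8 VUs
//         {duration: '8m', target: 8}, // hold
//         {duration: '1m', target: 0}, // ramp down
//     ],
// }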
|
export default function () {
    // Walk the dataset round-robin across iterations
    const conversation = data[exec.scenario.iterationInInstance % data.length]
    const payload = {
        "messages": [
            {
                "role": "system",
                "content": "You are ChatGPT, an AI assistant.",
            },
            {
                "role": "user",
                "content": conversation.prompt,
            }
        ],
        "model": model,
        "stream": true,
        "stream_options": {
            // Ask the server to report token usage in the final chunk
            "include_usage": true,
        },
        "seed": 42,
        "max_tokens": max_tokens,
        "stop": ["<|im_end|>"]
    }
|
    const params = {method: 'POST', body: JSON.stringify(payload)}

    const startTime = new Date()
    let promptEvalEndTime = null // set when the first SSE event arrives
    let prompt_tokens = 0
    let completions_tokens = 0
    let finish_reason = null
|
    const res = sse.open(`${server_url}/chat/completions`, params, function (client) {
        client.on('event', function (event) {
            // The first event marks the end of prompt processing
            if (promptEvalEndTime == null) {
                promptEvalEndTime = new Date()
                llamacpp_emit_first_token_second.add((promptEvalEndTime - startTime) / 1.e3)
            }

            if (event.data === '[DONE]' || event.data === '') {
                return
            }

            let chunk = JSON.parse(event.data)

            if (chunk.choices && chunk.choices.length > 0) {
                let choice = chunk.choices[0]
                if (choice.finish_reason) {
                    finish_reason = choice.finish_reason
                }
            }

            // With include_usage, the final chunk carries the token counts
            if (chunk.usage) {
                prompt_tokens = chunk.usage.prompt_tokens
                llamacpp_prompt_tokens.add(prompt_tokens)
                llamacpp_prompt_tokens_total_counter.add(prompt_tokens)

                completions_tokens = chunk.usage.completion_tokens
                llamacpp_completion_tokens.add(completions_tokens)
                llamacpp_completion_tokens_total_counter.add(completions_tokens)
            }
        })

        client.on('error', function (e) {
            console.log('An unexpected error occurred: ', e.error())
            throw e
        })
    })
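    // For reference, the handler above consumes OpenAI-style streaming chunks,
    // e.g. (abridged, field values illustrative):
    //   data: {"choices":[{"delta":{"content":"Hi"},"finish_reason":null}]}
    //   data: {"choices":[{"delta":{},"finish_reason":"stop"}]}
    //   data: {"choices":[],"usage":{"prompt_tokens":12,"completion_tokens":34}}
    //   data: [DONE]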
|
    check(res, {'success completion': (r) => r.status === 200})

    const endTime = new Date()

    // Prompt processing speed: prompt tokens per second before the first event
    const promptEvalTime = promptEvalEndTime - startTime
    if (promptEvalTime > 0) {
        llamacpp_prompt_processing_second.add(prompt_tokens / promptEvalTime * 1.e3)
    }

    // Generation speed: completion tokens per second after the first event
    const completion_time = endTime - promptEvalEndTime
    if (completions_tokens > 0 && completion_time > 0) {
        llamacpp_tokens_second.add(completions_tokens / completion_time * 1.e3)
    }
    llamacpp_completions_truncated_rate.add(finish_reason === 'length')
    llamacpp_completions_stop_rate.add(finish_reason === 'stop')

    sleep(0.3)
}
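// Optional: persist the end-of-test summary to disk. A minimal sketch using
// k6's standard handleSummary hook; note that defining it replaces the default
// console summary, and the output file name here is illustrative:
//
// export function handleSummary(data) {
//     return {'llamacpp-bench-summary.json': JSON.stringify(data, null, 2)}
// }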
|