train-scripts / vllm_1800.log
INFO 09-18 14:31:14 [__init__.py:241] Automatically detected platform cuda.
(APIServer pid=3508930) INFO 09-18 14:31:16 [api_server.py:1805] vLLM API server version 0.10.1.1
(APIServer pid=3508930) INFO 09-18 14:31:16 [utils.py:326] non-default args: {'model_tag': '/data/wyt/codes/DocDPO/sft/checkpoints_multilang/ted_base_balanced_en_zhdefr_320/dpo/merged/checkpoint-1800', 'host': '0.0.0.0', 'port': 8011, 'model': '/data/wyt/codes/DocDPO/sft/checkpoints_multilang/ted_base_balanced_en_zhdefr_320/dpo/merged/checkpoint-1800', 'served_model_name': ['qwen'], 'enable_prefix_caching': True}
(APIServer pid=3508930) INFO 09-18 14:31:22 [__init__.py:711] Resolved architecture: Qwen2ForCausalLM
(APIServer pid=3508930) `torch_dtype` is deprecated! Use `dtype` instead!
(APIServer pid=3508930) INFO 09-18 14:31:22 [__init__.py:1750] Using max model len 32768
(APIServer pid=3508930) INFO 09-18 14:31:23 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=8192.
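The scheduler line above means long prompts are prefilled in chunks of at most 8,192 tokens per engine step. A minimal sketch of that arithmetic, using the values from this log (a lower bound only — the real scheduler also co-batches decode tokens and other requests in the same step):

```python
import math

# Values taken from the config lines in this log.
MAX_NUM_BATCHED_TOKENS = 8192   # chunked-prefill token budget per engine step
MAX_MODEL_LEN = 32768           # "Using max model len 32768"

def min_prefill_steps(prompt_tokens: int) -> int:
    """Lower bound on engine steps needed to prefill a single prompt."""
    return math.ceil(prompt_tokens / MAX_NUM_BATCHED_TOKENS)

print(min_prefill_steps(MAX_MODEL_LEN))  # 4 steps for a max-length prompt
```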
INFO 09-18 14:31:27 [__init__.py:241] Automatically detected platform cuda.
(EngineCore_0 pid=3509752) INFO 09-18 14:31:29 [core.py:636] Waiting for init message from front-end.
(EngineCore_0 pid=3509752) INFO 09-18 14:31:29 [core.py:74] Initializing a V1 LLM engine (v0.10.1.1) with config: model='/data/wyt/codes/DocDPO/sft/checkpoints_multilang/ted_base_balanced_en_zhdefr_320/dpo/merged/checkpoint-1800', speculative_config=None, tokenizer='/data/wyt/codes/DocDPO/sft/checkpoints_multilang/ted_base_balanced_en_zhdefr_320/dpo/merged/checkpoint-1800', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=qwen, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":512,"local_cache_dir":null}
(EngineCore_0 pid=3509752) INFO 09-18 14:31:30 [parallel_state.py:1134] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
(EngineCore_0 pid=3509752) WARNING 09-18 14:31:30 [topk_topp_sampler.py:61] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
(EngineCore_0 pid=3509752) INFO 09-18 14:31:30 [gpu_model_runner.py:1953] Starting to load model /data/wyt/codes/DocDPO/sft/checkpoints_multilang/ted_base_balanced_en_zhdefr_320/dpo/merged/checkpoint-1800...
(EngineCore_0 pid=3509752) INFO 09-18 14:31:30 [gpu_model_runner.py:1985] Loading model from scratch...
(EngineCore_0 pid=3509752) INFO 09-18 14:31:30 [cuda.py:328] Using Flash Attention backend on V1 engine.
(EngineCore_0 pid=3509752) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00<?, ?it/s]
(EngineCore_0 pid=3509752) Loading safetensors checkpoint shards: 25% Completed | 1/4 [00:01<00:03, 1.29s/it]
(EngineCore_0 pid=3509748) Loading safetensors checkpoint shards: 50% Completed | 2/4 [00:03<00:03, 1.70s/it]
(EngineCore_0 pid=3509744) Loading safetensors checkpoint shards: 75% Completed | 3/4 [00:04<00:01, 1.39s/it]
(EngineCore_0 pid=3509748) Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:06<00:00, 1.53s/it]
(EngineCore_0 pid=3509748) Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:06<00:00, 1.52s/it]
(EngineCore_0 pid=3509744) INFO 09-18 14:31:37 [default_loader.py:262] Loading weights took 6.39 seconds
(EngineCore_0 pid=3509744) INFO 09-18 14:31:37 [gpu_model_runner.py:2007] Model loading took 14.2488 GiB and 6.587665 seconds
(EngineCore_0 pid=3509744) INFO 09-18 14:31:44 [backends.py:548] Using cache directory: /data/wyt/.cache/vllm/torch_compile_cache/1fe949e292/rank_0_0/backbone for vLLM's torch.compile
(EngineCore_0 pid=3509744) INFO 09-18 14:31:44 [backends.py:559] Dynamo bytecode transform time: 6.39 s
(EngineCore_0 pid=3509744) INFO 09-18 14:31:49 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 4.697 s
(EngineCore_0 pid=3509744) INFO 09-18 14:31:52 [monitor.py:34] torch.compile takes 6.39 s in total
(EngineCore_0 pid=3509744) INFO 09-18 14:31:53 [gpu_worker.py:276] Available KV cache memory: 51.38 GiB
(EngineCore_0 pid=3509744) INFO 09-18 14:31:53 [kv_cache_utils.py:849] GPU KV cache size: 962,112 tokens
(EngineCore_0 pid=3509744) INFO 09-18 14:31:53 [kv_cache_utils.py:853] Maximum concurrency for 32,768 tokens per request: 29.36x
(EngineCore_0 pid=3509744) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/67 [00:00<?, ?it/s]
(EngineCore_0 pid=3509744) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 67/67 [00:02<00:00, 23.13it/s]
(EngineCore_0 pid=3509744) INFO 09-18 14:31:56 [gpu_model_runner.py:2708] Graph capturing finished in 3 secs, took 1.56 GiB
(EngineCore_0 pid=3509744) INFO 09-18 14:31:56 [core.py:214] init engine (profile, create kv cache, warmup model) took 19.28 seconds
(APIServer pid=3508927) INFO 09-18 14:31:57 [loggers.py:142] Engine 000: vllm cache_config_info with initialization after num_gpu_blocks is: 60132
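The KV-cache figures in this log are mutually consistent: vLLM allocates the cache in fixed-size blocks (16 tokens per block is the default, and matches the numbers here), so 60,132 GPU blocks hold 962,112 tokens, and dividing by the 32,768-token max model length gives the reported 29.36x concurrency. A quick check, assuming the default block size:

```python
# Reproduce the KV-cache figures reported in the log above.
NUM_GPU_BLOCKS = 60_132   # "num_gpu_blocks is: 60132"
BLOCK_SIZE = 16           # vLLM's default KV-cache block size (tokens per block)
MAX_MODEL_LEN = 32_768    # "Using max model len 32768"

kv_cache_tokens = NUM_GPU_BLOCKS * BLOCK_SIZE
max_concurrency = kv_cache_tokens / MAX_MODEL_LEN

print(kv_cache_tokens)            # 962112 -> "GPU KV cache size: 962,112 tokens"
print(round(max_concurrency, 2))  # 29.36  -> "Maximum concurrency ...: 29.36x"
```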
(APIServer pid=3508927) INFO 09-18 14:31:57 [api_server.py:1611] Supported_tasks: ['generate']
(APIServer pid=3508927) WARNING 09-18 14:31:57 [__init__.py:1625] Default sampling parameters have been overridden by the model's Hugging Face generation config recommended from the model creator. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`.
(APIServer pid=3508927) INFO 09-18 14:31:57 [serving_responses.py:120] Using default chat sampling params from model: {'repetition_penalty': 1.05, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8}
(APIServer pid=3508927) INFO 09-18 14:31:57 [serving_chat.py:134] Using default chat sampling params from model: {'repetition_penalty': 1.05, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8}
(APIServer pid=3508927) INFO 09-18 14:31:57 [serving_completion.py:77] Using default completion sampling params from model: {'repetition_penalty': 1.05, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8}
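These defaults come from the model's generation config and apply to any sampling field a client omits. A client-side sketch of building a /v1/chat/completions body for this server — the served model name `qwen` is from the log; the merge shown here is purely illustrative, since the real fallback happens server-side:

```python
# Defaults the server reports reading from the model's generation config.
SERVER_DEFAULTS = {"repetition_penalty": 1.05, "temperature": 0.7,
                   "top_k": 20, "top_p": 0.8}

def chat_request(messages, **overrides):
    """Build a /v1/chat/completions request body. Any sampling field not
    overridden falls back to the defaults above (client-side illustration)."""
    return {"model": "qwen", "messages": messages, **SERVER_DEFAULTS, **overrides}

body = chat_request([{"role": "user", "content": "hello"}], temperature=0.0)
print(body["temperature"], body["top_k"])  # 0.0 20
```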
(APIServer pid=3508927) INFO 09-18 14:31:57 [api_server.py:1880] Starting vLLM API server 0 on http://0.0.0.0:8012
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:36] Available routes are:
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /openapi.json, Methods: HEAD, GET
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /docs, Methods: HEAD, GET
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /docs/oauth2-redirect, Methods: HEAD, GET
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /redoc, Methods: HEAD, GET
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /health, Methods: GET
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /load, Methods: GET
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /ping, Methods: POST
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /ping, Methods: GET
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /tokenize, Methods: POST
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /detokenize, Methods: POST
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /v1/models, Methods: GET
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /version, Methods: GET
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /v1/responses, Methods: POST
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /v1/responses/{response_id}, Methods: GET
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /v1/responses/{response_id}/cancel, Methods: POST
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /v1/chat/completions, Methods: POST
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /v1/completions, Methods: POST
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /v1/embeddings, Methods: POST
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /pooling, Methods: POST
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /classify, Methods: POST
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /score, Methods: POST
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /v1/score, Methods: POST
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /v1/audio/transcriptions, Methods: POST
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /v1/audio/translations, Methods: POST
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /rerank, Methods: POST
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /v1/rerank, Methods: POST
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /v2/rerank, Methods: POST
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /scale_elastic_ep, Methods: POST
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /is_scaling_elastic_ep, Methods: POST
(APIServer pid=3508927) INFO 09-18 14:31:57 [launcher.py:44] Route: /invocations, Methods: POST
(APIServer pid=3508933) INFO 09-18 14:32:00 [chat_utils.py:470] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override it.
(APIServer pid=3508930) INFO 09-18 14:32:00 [chat_utils.py:470] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override it.
(APIServer pid=3508933) INFO 09-18 14:32:07 [loggers.py:123] Engine 000: Avg prompt throughput: 47.2 tokens/s
(APIServer pid=3508930) INFO 09-18 14:32:07 [loggers.py:123] Engine 000: Avg prompt throughput: 47.2 tokens/s, Avg gene
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO 09-18 14:32:17 [loggers.py:123] Engine 000: Avg prompt throughput: 608.0 tokens/
(APIServer pid=3508930) INFO 09-18 14:32:17 [loggers.py:123] Engine 000: Avg prompt throughput: 608.0 tokens/s, Avg gener
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO 09-18 14:32:27 [loggers.py:123] Engine 000: Avg prompt throughput: 195.2 tokens/
(APIServer pid=3508930) INFO 09-18 14:32:27 [loggers.py:123] Engine 000: Avg prompt throughput: 195.2 tokens/s, Avg gener
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:39414 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:39414 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO 09-18 14:32:37 [loggers.py:123] Engine 000: Avg prompt throughput: 325.8 tokens/
(APIServer pid=3508930) INFO 09-18 14:32:37 [loggers.py:123] Engine 000: Avg prompt throughput: 325.8 tokens/s, Avg gener
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:39414 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO 09-18 14:32:47 [loggers.py:123] Engine 000: Avg prompt throughput: 694.0 tokens/
(APIServer pid=3508930) INFO 09-18 14:32:47 [loggers.py:123] Engine 000: Avg prompt throughput: 694.1 tokens/s, Avg gener
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:48072 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508930) INFO: 127.0.0.1:41594 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508930) INFO 09-18 14:32:57 [loggers.py:123] Engine 000: Avg prompt throughput: 373.7 tokens/s, Avg gener
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO 09-18 14:33:07 [loggers.py:123] Engine 000: Avg prompt throughput: 249.6 tokens/
(APIServer pid=3508930) INFO 09-18 14:33:07 [loggers.py:123] Engine 000: Avg prompt throughput: 209.7 tokens/s, Avg gener
(APIServer pid=3508933) INFO: 127.0.0.1:45372 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:45372 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO 09-18 14:33:17 [loggers.py:123] Engine 000: Avg prompt throughput: 356.6 tokens/
(APIServer pid=3508930) INFO 09-18 14:33:17 [loggers.py:123] Engine 000: Avg prompt throughput: 356.6 tokens/s, Avg generation throughput: 84.5 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.1%, Prefix cache hit
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP
(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP
(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP
(APIServer pid=3508933) INFO 09-18 14:33:27 [loggers.py:123] Engine 000: Avg prompt throughput: 786.5 tokens/
(APIServer pid=3508930) INFO 09-18 14:33:27 [loggers.py:123] Engine 000: Avg prompt throughput: 786.5 tokens/s, Avg gener
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:54502 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO 09-18 14:33:37 [loggers.py:123] Engine 000: Avg prompt throughput: 253.4 tokens/
(APIServer pid=3508930) INFO 09-18 14:33:37 [loggers.py:123] Engine 000: Avg prompt throughput: 253.4 tokens/s, Avg gener
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:54502 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508930) INFO 09-18 14:33:47 [loggers.py:123] Engine 000: Avg prompt throughput: 376.9 tokens/s, Avg generation throughput: 81.1 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.1%, Prefix cache hit
(APIServer pid=3508933) INFO: 127.0.0.1:36442 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP
(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP
(APIServer pid=3508933) INFO 09-18 14:33:57 [loggers.py:123] Engine 000: Avg prompt throughput: 848.9 tokens/
(APIServer pid=3508930) INFO 09-18 14:33:57 [loggers.py:123] Engine 000: Avg prompt throughput: 849.0 tokens/s, Avg gener
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO: 127.0.0.1:35254 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508933) INFO 09-18 14:34:07 [loggers.py:123] Engine 000: Avg prompt throughput: 269.5 tokens/
(APIServer pid=3508930) INFO 09-18 14:34:07 [loggers.py:123] Engine 000: Avg prompt throughput: 269.5 tokens/s, Avg gener
(APIServer pid=3508933) INFO: 127.0.0.1:56172 - "POST /v1/chat/completions HTTP/1.1" 200 OK
(APIServer pid=3508930) INFO: 127.0.0.1:32922 - "POST /v1/chat/completions HTTP/1.1" 200 OK
tion throughput: 84.2 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.1%, Prefix cache hit rate: 57.6%
(APIServer pid=3508927) INFO: 127.0.0.1:58182 - "POST /v1/chat/completions HTTP/1.1" 200 OK