Doesn't work in OpenAI Streaming Interface

#7
by pytokusu - opened
    def _start_server(self) -> subprocess.Popen:
        """Launch the vLLM OpenAI-compatible API server as a subprocess.

        Builds the server command line from the instance config, redirects
        the child's stdout/stderr into ``self.log_file`` (kept open so the
        log can be flushed and re-read later), and returns the running
        process handle.

        Returns:
            subprocess.Popen: handle to the detached server process.
        """
        # Assemble the server invocation flag group by flag group.
        # NOTE(review): model_name, gpu_count, gpu_memory_utilization, CFG,
        # is_rerun and VLLM_LOG_FILE come from the enclosing scope — confirm
        # they are defined before this method runs.
        args = [sys.executable, '-m', 'vllm.entrypoints.openai.api_server']
        args += ['--seed', str(self.cfg.seed)]
        args += ['--model', model_name]
        args += ['--served-model-name', self.cfg.served_model_name]
        args += ['--tensor-parallel-size', str(gpu_count)]
        args += ['--max-num-seqs', str(CFG.attempts)]
        args += ['--gpu-memory-utilization', str(gpu_memory_utilization)]
        args += ['--host', '0.0.0.0']
        args += ['--port', str(self.port)]
        args += ['--dtype', self.cfg.dtype]
        args += ['--kv-cache-dtype', self.cfg.kv_cache_dtype]
        args += ['--max-model-len', str(self.cfg.context_tokens)]
        args += ['--stream-interval', str(self.cfg.stream_interval)]
        args += ['--async-scheduling', '--enable-prefix-caching', '--trust-remote-code']

        # Suppress periodic stats logging on repeat runs.
        if is_rerun:
            args += ['--disable-log-stats']

        # Deliberately left open: _wait_for_server flushes and re-reads it.
        self.log_file = open(VLLM_LOG_FILE, 'w')

        # start_new_session detaches the server from this process group so
        # signals sent to the notebook don't kill it.
        return subprocess.Popen(
            args,
            stdout=self.log_file,
            stderr=subprocess.STDOUT,
            start_new_session=True,
        )

    def _wait_for_server(self):
    
        print('Waiting for vLLM server...')
        start_time = time.time()
    
        for _ in range(self.cfg.server_timeout):
            return_code = self.server_process.poll()
    
            if return_code is not None:
                self.log_file.flush()
    
                with open(VLLM_LOG_FILE, 'r') as log_file:
                    logs = log_file.read()
    
                raise RuntimeError(f'Server died with code {return_code}. Full logs:\n{logs}\n')
    
            try:
                self.client.models.list()
                elapsed = time.time() - start_time
                print(f'Server is ready (took {elapsed:.2f} seconds).\n')
    
                return
    
            except Exception:
                time.sleep(1)
    
        raise RuntimeError('Server failed to start (timeout).\n')
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[12], line 159
    156                     pass
    158 set_seed(CFG.seed)
--> 159 connect = AIMO3Connect(CFG)

Cell In[12], line 16, in AIMO3Connect.__init__(self, cfg, port)
     10 self.server_process = self._start_server()
     11 self.client = OpenAI(
     12     base_url=self.base_url, 
     13     api_key=self.api_key, 
     14     timeout=self.cfg.session_timeout
     15 )
---> 16 self._wait_for_server()
     17 self._initialize_kernels()

Cell In[12], line 106, in AIMO3Connect._wait_for_server(self)
    103     with open(VLLM_LOG_FILE, 'r') as log_file:
    104         logs = log_file.read()
--> 106     raise RuntimeError(f'Server died with code {return_code}. Full logs:\n{logs}\n')
    108 try:
    109     self.client.models.list()

RuntimeError: Server died with code 1. Full logs:
Skipping import of cpp extensions due to incompatible torch version 2.9.0+cu128 for torchao version 0.15.0+cu128             Please see https://github.com/pytorch/ao/issues/2919 for more info
/usr/local/lib/python3.12/dist-packages/torchao/quantization/quant_api.py:2525: SyntaxWarning: invalid escape sequence '\.'
  * regex for parameter names, must start with `re:`, e.g. `re:language\.layers\..+\.q_proj.weight`.
(APIServer pid=127809) INFO 01-17 11:42:18 [api_server.py:1351] vLLM API server version 0.13.0
(APIServer pid=127809) INFO 01-17 11:42:18 [utils.py:253] non-default args: {'host': '0.0.0.0', 'model': '/kaggle/models/text-to-text/stepfun-ai/Step3-VL-10B', 'trust_remote_code': True, 'seed': 42, 'max_model_len': 131072, 'served_model_name': ['gpt-oss'], 'gpu_memory_utilization': 0.96, 'kv_cache_dtype': 'fp8_e4m3', 'enable_prefix_caching': True, 'max_num_seqs': 8, 'async_scheduling': True, 'stream_interval': 200}
(APIServer pid=127809) The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
(APIServer pid=127809) Encountered exception while importing configuration_step_vl: No module named 'configuration_step_vl'
(APIServer pid=127809) Encountered exception while importing vision_encoder: No module named 'vision_encoder'
(APIServer pid=127809) Encountered exception while importing configuration_step_vl: No module named 'configuration_step_vl'
(APIServer pid=127809) Encountered exception while importing vision_encoder: No module named 'vision_encoder'
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51] Unable to load modeling_step_vl.Step3VL10BForCausalLM from /kaggle/models/text-to-text/stepfun-ai/Step3-VL-10B on HF Hub.
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51] Traceback (most recent call last):
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51]   File "/usr/local/lib/python3.12/dist-packages/vllm/transformers_utils/dynamic_module.py", line 33, in try_get_class_from_dynamic_module
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51]     return get_class_from_dynamic_module(
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51]   File "/usr/local/lib/python3.12/dist-packages/transformers/dynamic_module_utils.py", line 604, in get_class_from_dynamic_module
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51]     final_module = get_cached_module_file(
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51]                    ^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51]   File "/usr/local/lib/python3.12/dist-packages/transformers/dynamic_module_utils.py", line 427, in get_cached_module_file
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51]     modules_needed = check_imports(resolved_module_file)
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51]                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51]   File "/usr/local/lib/python3.12/dist-packages/transformers/dynamic_module_utils.py", line 260, in check_imports
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51]     raise ImportError(
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51] ImportError: This modeling file requires the following packages that were not found in your environment: configuration_step_vl, vision_encoder. Run `pip install configuration_step_vl vision_encoder`
(APIServer pid=127809) Encountered exception while importing configuration_step_vl: No module named 'configuration_step_vl'
(APIServer pid=127809) Encountered exception while importing vision_encoder: No module named 'vision_encoder'
(APIServer pid=127809) Encountered exception while importing configuration_step_vl: No module named 'configuration_step_vl'
(APIServer pid=127809) Encountered exception while importing vision_encoder: No module named 'vision_encoder'
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51] Unable to load modeling_step_vl.Step3VL10BForCausalLM from /kaggle/models/text-to-text/stepfun-ai/Step3-VL-10B on HF Hub.
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51] Traceback (most recent call last):
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51]   File "/usr/local/lib/python3.12/dist-packages/vllm/transformers_utils/dynamic_module.py", line 33, in try_get_class_from_dynamic_module
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51]     return get_class_from_dynamic_module(
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51]   File "/usr/local/lib/python3.12/dist-packages/transformers/dynamic_module_utils.py", line 604, in get_class_from_dynamic_module
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51]     final_module = get_cached_module_file(
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51]                    ^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51]   File "/usr/local/lib/python3.12/dist-packages/transformers/dynamic_module_utils.py", line 427, in get_cached_module_file
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51]     modules_needed = check_imports(resolved_module_file)
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51]                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51]   File "/usr/local/lib/python3.12/dist-packages/transformers/dynamic_module_utils.py", line 260, in check_imports
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51]     raise ImportError(
(APIServer pid=127809) WARNING 01-17 11:42:20 [dynamic_module.py:51] ImportError: This modeling file requires the following packages that were not found in your environment: configuration_step_vl, vision_encoder. Run `pip install configuration_step_vl vision_encoder`
(APIServer pid=127809) Traceback (most recent call last):
(APIServer pid=127809)   File "<frozen runpy>", line 198, in _run_module_as_main
(APIServer pid=127809)   File "<frozen runpy>", line 88, in _run_code
(APIServer pid=127809)   File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1469, in <module>
(APIServer pid=127809)     uvloop.run(run_server(args))
(APIServer pid=127809)   File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run
(APIServer pid=127809)     return __asyncio.run(
(APIServer pid=127809)            ^^^^^^^^^^^^^^
(APIServer pid=127809)   File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run
(APIServer pid=127809)     return runner.run(main)
(APIServer pid=127809)            ^^^^^^^^^^^^^^^^
(APIServer pid=127809)   File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
(APIServer pid=127809)     return self._loop.run_until_complete(task)
(APIServer pid=127809)            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=127809)   File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
(APIServer pid=127809)   File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper
(APIServer pid=127809)     return await main
(APIServer pid=127809)            ^^^^^^^^^^
(APIServer pid=127809)   File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1398, in run_server
(APIServer pid=127809)     await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
(APIServer pid=127809)   File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1417, in run_server_worker
(APIServer pid=127809)     async with build_async_engine_client(
(APIServer pid=127809)                ^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=127809)   File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
(APIServer pid=127809)     return await anext(self.gen)
(APIServer pid=127809)            ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=127809)   File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 172, in build_async_engine_client
(APIServer pid=127809)     async with build_async_engine_client_from_engine_args(
(APIServer pid=127809)                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=127809)   File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
(APIServer pid=127809)     return await anext(self.gen)
(APIServer pid=127809)            ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=127809)   File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 198, in build_async_engine_client_from_engine_args
(APIServer pid=127809)     vllm_config = engine_args.create_engine_config(usage_context=usage_context)
(APIServer pid=127809)                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=127809)   File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1332, in create_engine_config
(APIServer pid=127809)     model_config = self.create_model_config()
(APIServer pid=127809)                    ^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=127809)   File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1189, in create_model_config
(APIServer pid=127809)     return ModelConfig(
(APIServer pid=127809)            ^^^^^^^^^^^^
(APIServer pid=127809)   File "/usr/local/lib/python3.12/dist-packages/pydantic/_internal/_dataclasses.py", line 121, in __init__
(APIServer pid=127809)     s.__pydantic_validator__.validate_python(ArgsKwargs(args, kwargs), self_instance=s)
(APIServer pid=127809) pydantic_core._pydantic_core.ValidationError: 1 validation error for ModelConfig
(APIServer pid=127809)   Value error, Model architectures ['StepVLForConditionalGeneration'] are not supported for now. Supported architectures: dict_keys(['AfmoeForCausalLM', 'ApertusForCausalLM', 'AquilaModel', 'AquilaForCausalLM', 'ArceeForCausalLM', 'ArcticForCausalLM', 'BaiChuanForCausalLM', 'BaichuanForCausalLM', 'BailingMoeForCausalLM', 'BailingMoeV2ForCausalLM', 'BambaForCausalLM', 'BloomForCausalLM', 'ChatGLMModel', 'ChatGLMForConditionalGeneration', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CwmForCausalLM', 'DbrxForCausalLM', 'DeciLMForCausalLM', 'DeepseekForCausalLM', 'DeepseekV2ForCausalLM', 'DeepseekV3ForCausalLM', 'DeepseekV32ForCausalLM', 'Dots1ForCausalLM', 'Ernie4_5ForCausalLM', 'Ernie4_5_MoeForCausalLM', 'ExaoneForCausalLM', 'Exaone4ForCausalLM', 'Fairseq2LlamaForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FalconH1ForCausalLM', 'FlexOlmoForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'Gemma3ForCausalLM', 'Gemma3nForCausalLM', 'Qwen3NextForCausalLM', 'GlmForCausalLM', 'Glm4ForCausalLM', 'Glm4MoeForCausalLM', 'GptOssForCausalLM', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTJForCausalLM', 'GPTNeoXForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'GraniteMoeHybridForCausalLM', 'GraniteMoeSharedForCausalLM', 'GritLM', 'Grok1ModelForCausalLM', 'HunYuanMoEV1ForCausalLM', 'HunYuanDenseV1ForCausalLM', 'HCXVisionForCausalLM', 'InternLMForCausalLM', 'InternLM2ForCausalLM', 'InternLM2VEForCausalLM', 'InternLM3ForCausalLM', 'JAISLMHeadModel', 'Jais2ForCausalLM', 'JambaForCausalLM', 'KimiLinearForCausalLM', 'Lfm2ForCausalLM', 'Lfm2MoeForCausalLM', 'LlamaForCausalLM', 'Llama4ForCausalLM', 'LLaMAForCausalLM', 'LongcatFlashForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MiniCPMForCausalLM', 'MiniCPM3ForCausalLM', 'MiniMaxForCausalLM', 'MiniMaxText01ForCausalLM', 'MiniMaxM1ForCausalLM', 'MiniMaxM2ForCausalLM', 'MistralForCausalLM', 'MistralLarge3ForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MPTForCausalLM', 
'MiMoForCausalLM', 'NemotronForCausalLM', 'NemotronHForCausalLM', 'OlmoForCausalLM', 'Olmo2ForCausalLM', 'Olmo3ForCausalLM', 'OlmoeForCausalLM', 'OPTForCausalLM', 'OrionForCausalLM', 'OuroForCausalLM', 'PanguEmbeddedForCausalLM', 'PanguUltraMoEForCausalLM', 'PersimmonForCausalLM', 'PhiForCausalLM', 'Phi3ForCausalLM', 'PhiMoEForCausalLM', 'Plamo2ForCausalLM', 'Plamo3ForCausalLM', 'QWenLMHeadModel', 'Qwen2ForCausalLM', 'Qwen2MoeForCausalLM', 'Qwen3ForCausalLM', 'Qwen3MoeForCausalLM', 'RWForCausalLM', 'SeedOssForCausalLM', 'Step3TextForCausalLM', 'StableLMEpochForCausalLM', 'StableLmForCausalLM', 'Starcoder2ForCausalLM', 'SolarForCausalLM', 'TeleChatForCausalLM', 'TeleChat2ForCausalLM', 'TeleFLMForCausalLM', 'XverseForCausalLM', 'Zamba2ForCausalLM', 'BertModel', 'BertSpladeSparseEmbeddingModel', 'Gemma2Model', 'Gemma3TextModel', 'GPT2ForSequenceClassification', 'GteModel', 'GteNewModel', 'InternLM2ForRewardModel', 'JambaForSequenceClassification', 'LlamaModel', 'MistralModel', 'ModernBertModel', 'NomicBertModel', 'Qwen2Model', 'Qwen2ForRewardModel', 'Qwen2ForProcessRewardModel', 'RobertaForMaskedLM', 'RobertaModel', 'XLMRobertaModel', 'CLIPModel', 'LlavaNextForConditionalGeneration', 'Phi3VForCausalLM', 'Qwen2VLForConditionalGeneration', 'SiglipModel', 'PrithviGeoSpatialMAE', 'Terratorch', 'BertForSequenceClassification', 'BertForTokenClassification', 'GteNewForSequenceClassification', 'ModernBertForSequenceClassification', 'ModernBertForTokenClassification', 'RobertaForSequenceClassification', 'XLMRobertaForSequenceClassification', 'JinaVLForRanking', 'AriaForConditionalGeneration', 'AudioFlamingo3ForConditionalGeneration', 'AyaVisionForConditionalGeneration', 'BagelForConditionalGeneration', 'BeeForConditionalGeneration', 'Blip2ForConditionalGeneration', 'ChameleonForConditionalGeneration', 'Cohere2VisionForConditionalGeneration', 'DeepseekVLV2ForCausalLM', 'DeepseekOCRForCausalLM', 'DotsOCRForCausalLM', 'Ernie4_5_VLMoeForConditionalGeneration', 'FuyuForCausalLM', 
'Gemma3ForConditionalGeneration', 'Gemma3nForConditionalGeneration', 'GLM4VForCausalLM', 'Glm4vForConditionalGeneration', 'Glm4vMoeForConditionalGeneration', 'GraniteSpeechForConditionalGeneration', 'H2OVLChatModel', 'HunYuanVLForConditionalGeneration', 'InternVLChatModel', 'NemotronH_Nano_VL_V2', 'OpenCUAForConditionalGeneration', 'InternS1ForConditionalGeneration', 'InternVLForConditionalGeneration', 'Idefics3ForConditionalGeneration', 'SmolVLMForConditionalGeneration', 'KeyeForConditionalGeneration', 'KeyeVL1_5ForConditionalGeneration', 'RForConditionalGeneration', 'KimiVLForConditionalGeneration', 'LightOnOCRForConditionalGeneration', 'Llama_Nemotron_Nano_VL', 'Llama4ForConditionalGeneration', 'LlavaForConditionalGeneration', 'LlavaNextVideoForConditionalGeneration', 'LlavaOnevisionForConditionalGeneration', 'MantisForConditionalGeneration', 'MiDashengLMModel', 'MiniMaxVL01ForConditionalGeneration', 'MiniCPMO', 'MiniCPMV', 'Mistral3ForConditionalGeneration', 'MolmoForCausalLM', 'NVLM_D', 'Ovis', 'Ovis2_5', 'PaddleOCRVLForConditionalGeneration', 'PaliGemmaForConditionalGeneration', 'Phi4MMForCausalLM', 'PixtralForConditionalGeneration', 'QwenVLForConditionalGeneration', 'Qwen2_5_VLForConditionalGeneration', 'Qwen2AudioForConditionalGeneration', 'Qwen2_5OmniModel', 'Qwen2_5OmniForConditionalGeneration', 'Qwen3OmniMoeForConditionalGeneration', 'Qwen3VLForConditionalGeneration', 'Qwen3VLMoeForConditionalGeneration', 'SkyworkR1VChatModel', 'Step3VLForConditionalGeneration', 'TarsierForConditionalGeneration', 'Tarsier2ForConditionalGeneration', 'UltravoxModel', 'VoxtralForConditionalGeneration', 'WhisperForConditionalGeneration', 'MiMoMTPModel', 'EagleLlamaForCausalLM', 'EagleLlama4ForCausalLM', 'EagleMiniCPMForCausalLM', 'Eagle3LlamaForCausalLM', 'LlamaForCausalLMEagle3', 'Eagle3Qwen2_5vlForCausalLM', 'Eagle3Qwen3vlForCausalLM', 'EagleMistralLarge3ForCausalLM', 'EagleDeepSeekMTPModel', 'DeepSeekMTPModel', 'ErnieMTPModel', 'LongCatFlashMTPModel', 'Glm4MoeMTPModel', 
'MedusaModel', 'OpenPanguMTPModel', 'Qwen3NextMTP', 'SmolLM3ForCausalLM', 'Emu3ForConditionalGeneration', 'TransformersForCausalLM', 'TransformersMoEForCausalLM', 'TransformersMultiModalForCausalLM', 'TransformersMultiModalMoEForCausalLM', 'TransformersEmbeddingModel', 'TransformersMoEEmbeddingModel', 'TransformersMultiModalEmbeddingModel', 'TransformersForSequenceClassification', 'TransformersMoEForSequenceClassification', 'TransformersMultiModalForSequenceClassification']) [type=value_error, input_value=ArgsKwargs((), {'model': ...rocessor_plugin': None}), input_type=ArgsKwargs]
(APIServer pid=127809)     For further information visit https://errors.pydantic.dev/2.12/v/value_error
StepFun org

This model is incompatible with vLLM v0.13. Please switch to the main branch or manually apply the code from PR #32329 (https://github.com/vllm-project/vllm/pull/32329) to your current environment.

This model is incompatible with vLLM v0.13. Please switch to the main branch or manually apply the code from PR #32329 (https://github.com/vllm-project/vllm/pull/32329) to your current environment.

Is this based on vLLM 0.13? Can I directly apply the changes from that PR to vLLM 0.13?

Sign up or log in to comment