"""Launcher script: build LLM arguments and start the TensorRT-LLM server.

All settings are hard-coded below; edit the values in ``main`` to change the
model location, limits, or the address the server listens on.
"""
from tensorrt_llm.commands.serve import get_llm_args, launch_server


def main():
    """Assemble the LLM arguments and hand them to ``launch_server``."""
    # Keyword arguments forwarded verbatim to get_llm_args.
    engine_options = dict(
        # Model / tokenizer locations (local Windows paths).
        model=r'D:\AI\models\Josiefied-Qwen3-4B-ablit-gptq-engine',
        tokenizer=r'D:\AI\models\Josiefied-Qwen3-4B-abliterated-v1',
        backend='tensorrt',
        # Batch and sequence limits.
        max_beam_width=1,
        max_batch_size=1,
        max_num_tokens=16384,
        max_seq_len=16384,
        # Parallelism: every dimension is set to 1 (single GPU per node).
        tensor_parallel_size=1,
        pipeline_parallel_size=1,
        context_parallel_size=1,
        moe_expert_parallel_size=None,
        gpus_per_node=1,
        # KV-cache settings.
        free_gpu_memory_fraction=0.9,
        kv_cache_dtype='auto',
        # Remaining options.
        num_postprocess_workers=0,
        trust_remote_code=False,
        revision=None,
        reasoning_parser='qwen3',
        fail_fast_on_attention_window_too_large=False,
        otlp_traces_endpoint=None,
        enable_chunked_prefill=True,
    )

    # get_llm_args returns a pair; only the first element is used here and
    # the second one is intentionally discarded.
    llm_args, _unused = get_llm_args(**engine_options)

    launch_server(
        host='localhost',
        port=5001,
        llm_args=llm_args,
        served_model_name='Josiefied-Qwen3-4B',
    )


# NOTE(review): the guard keeps the server from starting when this module is
# imported rather than executed; presumably this also matters if worker
# processes re-import the script (e.g. on Windows) -- confirm.
if __name__ == '__main__':
    main()