"""
Example usage for the fine-tuned merged model with vLLM.
1) Start the server (from docs + project defaults):
OMP_NUM_THREADS=1 \
vllm serve outputs/mimic_qwen3vl_lora_8bit_5_merged \
--host 0.0.0.0 \
--port 8000 \
--dtype bfloat16 \
--limit-mm-per-prompt.video 0
2) Run this client script:
python3 vllm_inference.py --model outputs/mimic_qwen3vl_lora_8bit_5_merged
"""
import argparse
import base64
import mimetypes
import os
import time
from pathlib import Path
from openai import OpenAI
os.environ["CUDA_VISIBLE_DEVICES"] = "4"
DEFAULT_MODEL_PATH = "outputs/mimic_qwen3vl_lora_8bit_5_merged"
DEFAULT_BASE_URL = "http://127.0.0.1:8002/v1"
DEFAULT_SYSTEM_PROMPT_PATH = Path(__file__).with_name("new_system_prompt_new.txt")
DEFAULT_IMAGE_1 = Path(
"/home/dgxuser16/NTL/mccarthy/ahmad/cap/dataset/images_1/s50000230/7e962a95-d661c0db-4769286c-e150a106-fb9586c6.jpg"
)
DEFAULT_IMAGE_2 = Path(
"/home/dgxuser16/NTL/mccarthy/ahmad/cap/dataset/images_1/s50000230/f605b192-2e612578-c5c95dc3-b9d6d13b-e0eee500.jpg"
)
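# The default image paths above are machine-specific defaults from the author's
# dataset layout; override them with --image-1/--image-2 on other machines.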


def image_to_data_url(image_path: Path) -> str:
    """Read a local image and encode it as a base64 data URL for an image_url content part."""
    if not image_path.exists():
        raise FileNotFoundError(f"Image not found: {image_path}")
    mime_type, _ = mimetypes.guess_type(str(image_path))
    if mime_type is None:
        mime_type = "application/octet-stream"
    encoded = base64.b64encode(image_path.read_bytes()).decode("utf-8")
    return f"data:{mime_type};base64,{encoded}"


def build_messages(system_prompt: str, image_1: Path, image_2: Path) -> list[dict]:
    """Build a single user message containing both images followed by the prompt text."""
    return [
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": image_to_data_url(image_1)},
                },
                {
                    "type": "image_url",
                    "image_url": {"url": image_to_data_url(image_2)},
                },
                {
                    "type": "text",
                    "text": system_prompt,
                },
            ],
        }
    ]
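
# Note: the prompt text is sent as part of the user turn above. If the served
# chat template handles a separate system role, an alternative (not what this
# script does) would be to prepend {"role": "system", "content": system_prompt}
# and keep only the images in the user message.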


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Run inference against a vLLM server for the fine-tuned Qwen3-VL model."
    )
    parser.add_argument(
        "--base-url",
        default=DEFAULT_BASE_URL,
        help="OpenAI-compatible vLLM base URL.",
    )
    parser.add_argument(
        "--model",
        default=DEFAULT_MODEL_PATH,
        help="Model identifier served by vLLM (use the same value passed to `vllm serve`).",
    )
    parser.add_argument(
        "--system-prompt-path",
        type=Path,
        default=DEFAULT_SYSTEM_PROMPT_PATH,
        help="Path to prompt text file.",
    )
    parser.add_argument(
        "--image-1",
        type=Path,
        default=DEFAULT_IMAGE_1,
        help="Path to first image.",
    )
    parser.add_argument(
        "--image-2",
        type=Path,
        default=DEFAULT_IMAGE_2,
        help="Path to second image.",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=2048,
        help="Maximum generation tokens.",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.0,
        help="Sampling temperature.",
    )
    parser.add_argument(
        "--timeout",
        type=float,
        default=3600,
        help="Client timeout in seconds.",
    )
    return parser.parse_args()


def main() -> None:
    args = parse_args()

    if not args.system_prompt_path.exists():
        raise FileNotFoundError(f"Prompt file not found: {args.system_prompt_path}")
    system_prompt = args.system_prompt_path.read_text(encoding="utf-8").strip()

    messages = build_messages(system_prompt=system_prompt, image_1=args.image_1, image_2=args.image_2)

    # The OpenAI client needs some api_key value; vLLM only checks it when the
    # server is started with --api-key, so "EMPTY" is a common placeholder.
    api_key = os.getenv("OPENAI_API_KEY", "EMPTY")
    client = OpenAI(api_key=api_key, base_url=args.base_url, timeout=args.timeout)

    # Time the full request round trip, including prompt processing and generation.
    start = time.perf_counter()
    response = client.chat.completions.create(
        model=args.model,
        messages=messages,
        max_tokens=args.max_tokens,
        temperature=args.temperature,
    )
    elapsed = time.perf_counter() - start

    output_text = response.choices[0].message.content

    print(f"Model: {args.model}")
    print(f"Latency (s): {elapsed:.3f}")

    usage = response.usage
    if usage is not None:
        print(f"Prompt tokens: {usage.prompt_tokens}")
        print(f"Completion tokens: {usage.completion_tokens}")
        print(f"Total tokens: {usage.total_tokens}")

    print("\n--- Generated Output ---")
    print(output_text)


if __name__ == "__main__":
    main()
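
# A streaming variant is also possible with the same client; a sketch (not used
# by this script) would replace the blocking call in main() with:
#
#   stream = client.chat.completions.create(
#       model=args.model,
#       messages=messages,
#       max_tokens=args.max_tokens,
#       temperature=args.temperature,
#       stream=True,
#   )
#   for chunk in stream:
#       delta = chunk.choices[0].delta.content
#       if delta:
#           print(delta, end="", flush=True)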