| """ |
| Example usage for the fine-tuned merged model with vLLM. |
| |
| 1) Start the server (from docs + project defaults): |
| |
| OMP_NUM_THREADS=1 \ |
| vllm serve outputs/mimic_qwen3vl_lora_8bit_5_merged \ |
| --host 0.0.0.0 \ |
| --port 8000 \ |
| --dtype bfloat16 \ |
| --limit-mm-per-prompt.video 0 |
| |
| 2) Run this client script: |
| |
| python3 vllm_inference.py --model outputs/mimic_qwen3vl_lora_8bit_5_merged |
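
3) If the client cannot connect, verify the server is ready first (vLLM
serves the standard OpenAI-compatible model list endpoint):

curl http://127.0.0.1:8000/v1/models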
| """ |
|
|
| import argparse |
| import base64 |
| import mimetypes |
| import os |
| import time |
from pathlib import Path

from openai import OpenAI

# The GPU pin below only affects CUDA work inside this process; generation runs
# on the vLLM server, whose GPU is chosen where `vllm serve` is launched.
os.environ["CUDA_VISIBLE_DEVICES"] = "4"


DEFAULT_MODEL_PATH = "outputs/mimic_qwen3vl_lora_8bit_5_merged"
DEFAULT_BASE_URL = "http://127.0.0.1:8000/v1"
| DEFAULT_SYSTEM_PROMPT_PATH = Path(__file__).with_name("new_system_prompt_new.txt") |
| DEFAULT_IMAGE_1 = Path( |
| "/home/dgxuser16/NTL/mccarthy/ahmad/cap/dataset/images_1/s50000230/7e962a95-d661c0db-4769286c-e150a106-fb9586c6.jpg" |
| ) |
| DEFAULT_IMAGE_2 = Path( |
| "/home/dgxuser16/NTL/mccarthy/ahmad/cap/dataset/images_1/s50000230/f605b192-2e612578-c5c95dc3-b9d6d13b-e0eee500.jpg" |
| ) |
|
|
|
|
| def image_to_data_url(image_path: Path) -> str: |
| if not image_path.exists(): |
| raise FileNotFoundError(f"Image not found: {image_path}") |
|
|
| mime_type, _ = mimetypes.guess_type(str(image_path)) |
| if mime_type is None: |
| mime_type = "application/octet-stream" |
|
|
| encoded = base64.b64encode(image_path.read_bytes()).decode("utf-8") |
| return f"data:{mime_type};base64,{encoded}" |
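
# Example (hypothetical file): image_to_data_url(Path("frontal.jpg")) returns
# "data:image/jpeg;base64,/9j/...", a form OpenAI-style chat clients accept
# wherever an image URL is expected.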


def build_messages(system_prompt: str, image_1: Path, image_2: Path) -> list[dict]:
    """Build the chat payload: both images followed by the prompt text.

    The prompt travels inside the user turn rather than as a separate
    ``system`` message.
    """
    return [
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": image_to_data_url(image_1)}},
                {"type": "image_url", "image_url": {"url": image_to_data_url(image_2)}},
                {"type": "text", "text": system_prompt},
            ],
        }
    ]


| def parse_args() -> argparse.Namespace: |
| parser = argparse.ArgumentParser( |
| description="Run inference against a vLLM server for the fine-tuned Qwen3-VL model." |
| ) |
| parser.add_argument( |
| "--base-url", |
| default=DEFAULT_BASE_URL, |
| help="OpenAI-compatible vLLM base URL.", |
| ) |
| parser.add_argument( |
| "--model", |
| default=DEFAULT_MODEL_PATH, |
| help="Model identifier served by vLLM (use the same value passed to `vllm serve`).", |
| ) |
| parser.add_argument( |
| "--system-prompt-path", |
| type=Path, |
| default=DEFAULT_SYSTEM_PROMPT_PATH, |
| help="Path to prompt text file.", |
| ) |
| parser.add_argument( |
| "--image-1", |
| type=Path, |
| default=DEFAULT_IMAGE_1, |
| help="Path to first image.", |
| ) |
| parser.add_argument( |
| "--image-2", |
| type=Path, |
| default=DEFAULT_IMAGE_2, |
| help="Path to second image.", |
| ) |
| parser.add_argument( |
| "--max-tokens", |
| type=int, |
| default=2048, |
| help="Maximum generation tokens.", |
| ) |
| parser.add_argument( |
| "--temperature", |
| type=float, |
| default=0.0, |
| help="Sampling temperature.", |
| ) |
| parser.add_argument( |
| "--timeout", |
| type=float, |
| default=3600, |
| help="Client timeout in seconds.", |
| ) |
| return parser.parse_args() |
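
# Example invocation overriding the defaults (image paths are placeholders):
#   python3 vllm_inference.py --base-url http://127.0.0.1:8000/v1 \
#       --image-1 frontal.jpg --image-2 lateral.jpg --max-tokens 1024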


def main() -> None:
    args = parse_args()

    if not args.system_prompt_path.exists():
        raise FileNotFoundError(f"Prompt file not found: {args.system_prompt_path}")

    system_prompt = args.system_prompt_path.read_text(encoding="utf-8").strip()
    messages = build_messages(system_prompt=system_prompt, image_1=args.image_1, image_2=args.image_2)

    # vLLM's OpenAI-compatible server ignores the API key unless it was started
    # with --api-key; "EMPTY" is the conventional placeholder.
    api_key = os.getenv("OPENAI_API_KEY", "EMPTY")
    client = OpenAI(api_key=api_key, base_url=args.base_url, timeout=args.timeout)

    start = time.perf_counter()
    response = client.chat.completions.create(
        model=args.model,
        messages=messages,
        max_tokens=args.max_tokens,
        temperature=args.temperature,
    )
    elapsed = time.perf_counter() - start

    output_text = response.choices[0].message.content

    print(f"Model: {args.model}")
    print(f"Latency (s): {elapsed:.3f}")

    usage = response.usage
    if usage is not None:
        print(f"Prompt tokens: {usage.prompt_tokens}")
        print(f"Completion tokens: {usage.completion_tokens}")
        print(f"Total tokens: {usage.total_tokens}")

    print("\n--- Generated Output ---")
    print(output_text)
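

def stream_completion(client: OpenAI, model: str, messages: list[dict],
                      max_tokens: int, temperature: float) -> str:
    """Hedged sketch of a streaming variant using the same OpenAI client.

    ``stream=True`` and chunk iteration are standard OpenAI SDK behavior; this
    helper is an optional alternative to the blocking call in ``main`` and is
    not wired into the CLI.
    """
    parts: list[str] = []
    stream = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        stream=True,
    )
    for chunk in stream:
        # Some chunks (e.g. usage-only frames) can arrive without choices.
        if chunk.choices and chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="", flush=True)
            parts.append(chunk.choices[0].delta.content)
    print()
    return "".join(parts)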


| if __name__ == "__main__": |
| main() |