llm_cp2 / src /lmms-eval /examples /models /sglang_qwen3vl.sh

Upload folder using huggingface_hub

b0c0df0 verified about 2 months ago

9.37 kB

	#!/bin/bash

	# Qwen3-VL Evaluation Script with SGLang Backend
	# This script demonstrates how to evaluate Qwen3-VL models using SGLang for accelerated inference
	#
	# Requirements:
	# - sglang>=0.4.6
	# - qwen-vl-utils
	# - CUDA-enabled GPU(s)
	#
	# Installation:
	# uv add "sglang[all]>=0.4.6" qwen-vl-utils
	# OR
	# pip install "sglang[all]>=0.4.6" qwen-vl-utils

	# ============================================================================
	# Configuration
	# ============================================================================

	# ---- Model ----
	# Identifier substituted into model=... in --model_args.
	# Qwen3-VL checkpoints you can choose from:
	#   Qwen/Qwen3-VL-30B-A3B-Instruct
	#   Qwen/Qwen3-VL-30B-A3B-Thinking
	#   Qwen/Qwen3-VL-235B-A22B-Instruct
	#   Qwen/Qwen3-VL-235B-A22B-Thinking
	MODEL='Qwen/Qwen3-VL-30B-A3B-Instruct'

	# ---- Parallelism (tune to your GPU configuration) ----
	# GPU count used for tensor parallelism; SGLang names this tp_size.
	TENSOR_PARALLEL_SIZE='4'

	# ---- Memory and throughput ----
	# Fraction of GPU memory to use, 0.0 - 1.0; SGLang names this mem_fraction_static.
	GPU_MEMORY_UTILIZATION='0.85'
	# Evaluation batch size.
	BATCH_SIZE='64'

	# ---- SGLang visual-input settings ----
	# Upper bound on pixels per image.
	MAX_PIXELS='1605632'
	# Lower bound on pixels per image (28x28).
	MIN_PIXELS='784'
	# Upper bound on frames taken from a video.
	MAX_FRAME_NUM='32'
	# Thread count for decoding visuals.
	THREADS='16'

	# ---- Tasks ----
	# Comma-separated task names. Common choices: mmmu_val, mme, mathvista, ai2d, etc.
	TASKS='mmmu_val,mme'

	# ---- Output ----
	# Directory passed to --output_path.
	OUTPUT_PATH='./logs/qwen3vl_sglang'
	# When set to the literal string "true", sample logging flags are added.
	LOG_SAMPLES='true'
	# Suffix passed to --log_samples_suffix.
	LOG_SUFFIX='qwen3vl_sglang'

	# ---- Evaluation limit (optional) ----
	# Uncomment to cap the number of samples, e.g. for a quick test run.
	# LIMIT=100

	# ============================================================================
	# Environment Configuration
	# ============================================================================
	# Keep a non-empty HF_HOME supplied by the caller; otherwise default it to
	# $HOME/.cache/huggingface. Either way, export it so child processes see it.
	: "${HF_HOME:=$HOME/.cache/huggingface}"
	export HF_HOME

	# ============================================================================
	# EXAMPLE 1: Basic SGLang Usage (Without MCP Tools)
	# ============================================================================
	# This is the standard evaluation without tool calling support.
	# The model will process image/video queries and return responses directly.
	#
	# Key Parameters:
	# - model: The model identifier
	# - tensor_parallel_size: Number of GPUs for tensor parallelism
	# - gpu_memory_utilization: GPU memory fraction to use
	# - max_pixels/min_pixels: Image resolution constraints
	# - max_frame_num: Maximum frames for video processing
	# - threads: Thread count for visual processing

	# Print a summary of the run settings, one item per line, framed by rules.
	printf '%s\n' \
	  "==========================================" \
	  "Qwen3-VL Evaluation with SGLang" \
	  "==========================================" \
	  "Model: $MODEL" \
	  "Tensor Parallel Size: $TENSOR_PARALLEL_SIZE" \
	  "Tasks: $TASKS" \
	  "Batch Size: $BATCH_SIZE" \
	  "Max Pixels: $MAX_PIXELS" \
	  "Output Path: $OUTPUT_PATH" \
	  "=========================================="

	# Build the command
	#
	# The invocation is assembled as an argv array and executed directly instead
	# of being concatenated into a string and passed to eval. Each value therefore
	# reaches lmms_eval as exactly one argument, even if it contains spaces or
	# shell metacharacters (for example an OUTPUT_PATH with a space in it).
	#
	# Reads: MODEL, TENSOR_PARALLEL_SIZE, GPU_MEMORY_UTILIZATION, MAX_PIXELS,
	#        MIN_PIXELS, MAX_FRAME_NUM, THREADS, TASKS, BATCH_SIZE, OUTPUT_PATH,
	#        LOG_SAMPLES, LOG_SUFFIX and the optional LIMIT.
	# Exits: with the evaluation's exit status if the evaluation fails.

	# Comma-separated key=value list handed to --model_args as a single argument.
	MODEL_ARGS="model=${MODEL}"
	MODEL_ARGS+=",tensor_parallel_size=${TENSOR_PARALLEL_SIZE}"
	MODEL_ARGS+=",gpu_memory_utilization=${GPU_MEMORY_UTILIZATION}"
	MODEL_ARGS+=",max_pixels=${MAX_PIXELS}"
	MODEL_ARGS+=",min_pixels=${MIN_PIXELS}"
	MODEL_ARGS+=",max_frame_num=${MAX_FRAME_NUM}"
	MODEL_ARGS+=",threads=${THREADS}"

	CMD=(
	  uv run python -m lmms_eval
	  --model sglang
	  --model_args "$MODEL_ARGS"
	  --tasks "$TASKS"
	  --batch_size "$BATCH_SIZE"
	  --output_path "$OUTPUT_PATH"
	)

	# Add optional arguments
	if [[ "$LOG_SAMPLES" == true ]]; then
	  CMD+=(--log_samples --log_samples_suffix "$LOG_SUFFIX")
	fi

	# LIMIT is optional and normally unset; ${LIMIT:-} keeps the test safe even
	# if the script is run with `set -u`.
	if [[ -n "${LIMIT:-}" ]]; then
	  CMD+=(--limit "$LIMIT")
	fi

	# Execute
	echo "Running command:"
	# Space-joined rendering for display only; the array below is what runs.
	echo "${CMD[*]}"
	echo ""

	"${CMD[@]}"
	status=$?

	# Do not report success (or return 0) when the evaluation itself failed.
	if (( status != 0 )); then
	  echo "" >&2
	  echo "Evaluation failed with exit code ${status}." >&2
	  exit "$status"
	fi

	echo ""
	echo "=========================================="
	echo "Evaluation Complete!"
	echo "Results saved to: $OUTPUT_PATH"
	echo "=========================================="

	# ============================================================================
	# EXAMPLE 2: SGLang with MCP Client Tools (Tool-Enabled Evaluation)
	# ============================================================================
	# This example demonstrates how to enable MCP (Model Context Protocol) client
	# for tool calling support with SGLang.
	#
	# IMPORTANT: Before running this, you need to:
	# 1. Create an MCP server that exposes tools (e.g., image processing, web search)
	# 2. The MCP server should be a Python script that implements tool definitions
	# 3. Pass the path to the MCP server script via mcp_server_path parameter
	#
	# How MCP Tool Calling Works with SGLang:
	# ─────────────────────────────────────────
	# 1. User sends a request with a question
	# 2. SGLang processes the message and generates text
	# 3. The function_call_parser detects if tool calls are in the generated text
	#    (finish_reason == "tool_calls")
	# 4. If tool calls are detected:
	#    a. Parse the tool call function name and arguments from generated text
	#    b. Retrieve tool definition from MCPClient
	#    c. Execute the tool via MCPClient.run_tool(tool_name, arguments)
	#    d. Convert tool result to OpenAI-compatible format
	#    e. Append tool result to conversation as {"role": "tool", ...}
	#    f. Generate next response with updated context (up to max_turn times)
	# 5. Continue until model produces final text or max_turn is reached
	#
	# Tool Calling Loop in Code (from sglang.py):
	# ──────────────────────────────────────────────
	#   while keep_rolling and turn_count < max_turn:
	#       output = await self.client.async_generate(...)
	#       if function_call_parser.has_tool_call(output["text"]):
	#           tool_calls = function_call_parser.parse_non_stream(output["text"])
	#           for tool_call in tool_calls:
	#               result = await self.mcp_client.run_tool(tool_call.name, args)
	#               # Convert result to OpenAI format
	#               tool_messages.append({"role": "tool", "name": tool_call.name, "content": result})
	#           messages.append(assistant_response)
	#           messages.extend(tool_messages)
	#           # Prepare next input for model with tool results
	#       turn_count += 1
	#
	# Example with MCP tools enabled:
	# (Uncomment the following lines to use with MCP server)
	#
	# # Path to MCP server implementation
	# MCP_SERVER_PATH="/path/to/mcp_server.py"
	# WORK_DIR="/tmp/sglang_mcp_work"
	#
	# CMD="uv run python -m lmms_eval \
	# --model sglang \
	# --model_args model=${MODEL},tensor_parallel_size=${TENSOR_PARALLEL_SIZE},gpu_memory_utilization=${GPU_MEMORY_UTILIZATION},max_pixels=${MAX_PIXELS},min_pixels=${MIN_PIXELS},max_frame_num=${MAX_FRAME_NUM},threads=${THREADS},mcp_server_path=${MCP_SERVER_PATH},work_dir=${WORK_DIR},max_turn=5 \
	# --tasks ${TASKS} \
	# --batch_size 1 \
	# --output_path ${OUTPUT_PATH}_with_mcp \
	# --log_samples --log_samples_suffix ${LOG_SUFFIX}_mcp"
	#
	# eval $CMD

	# ============================================================================
	# Parameter Reference
	# ============================================================================
	# model : Model identifier (required)
	# tensor_parallel_size : Number of GPUs for tensor parallelism (default: 1)
	# gpu_memory_utilization : GPU memory fraction (0.0-1.0, default: 0.8)
	# batch_size : Batch size for evaluation (default: 1)
	# max_pixels : Max image resolution (default: 1605632)
	# min_pixels : Min image resolution (default: 28*28=784)
	# max_frame_num : Max frames for videos (default: 768)
	# fps : Frames per second for video sampling (optional)
	# nframes : Fixed number of frames for video (default: 32)
	# threads : Thread count for visual processing (default: 16)
	# mcp_server_path : Path to MCP server script for tool calling (optional)
	# work_dir : Working directory for MCP tools (default: /tmp/...)
	# max_turn : Maximum tool calling turns (default: 5)
	# chat_template : Custom chat template jinja file (optional)
	# json_model_override_args : JSON args to override model config (optional)
	#
	#
	# ============================================================================
	# Tool Calling Best Practices
	# ============================================================================
	# 1. TOOL DESIGN:
	# - Keep tools focused on single tasks
	# - Provide clear, specific descriptions
	# - Define input schema with required fields
	# - Return results in structured format
	#
	# 2. MCP SERVER:
	# - Must be a standalone Python script
	# - Should handle errors gracefully
	# - Return results in TextContent or ImageContent format
	# - Avoid long-running operations (timeouts)
	#
	# 3. CONFIGURATION:
	# - Set appropriate max_turn value (5-10 recommended)
	# - Use batch_size=1 when tools are enabled (sequential processing)
	# - Allocate sufficient work_dir space for temporary files
	# - Monitor GPU memory with tool execution
	#
	# 4. DEBUGGING:
	# - Use --verbosity DEBUG to see tool call details
	# - Check work_dir for saved images/videos
	# - Validate MCP server responds correctly:
	# `python mcp_server.py` should start without errors
	# - Test tool functions independently before evaluation
	#
	# ============================================================================