#!/bin/bash
# Example script for running inference on evaluation benchmarks
#
# Usage: bash run.sh
#
# Before running:
#   1. Update MODEL_FOLDER, MODEL_NAME, TOKENIZER_FOLDER, and TOKENIZER_NAME
#   2. Update BENCHMARK_FOLDER to point to your benchmark data directory
#   3. Update EVAL_DATASET to the desired benchmark
#   4. Adjust inference parameters as needed (temperature, top-p, etc.)

# Fail fast: abort on any error, on use of an unset variable, and on a
# failure anywhere in a pipeline.
set -euo pipefail

# Model configuration (REQUIRED)
MODEL_FOLDER="/path/to/models"
MODEL_NAME="your-model-name"
TOKENIZER_FOLDER="/path/to/tokenizers"
TOKENIZER_NAME="your-tokenizer-name"

# Data configuration (REQUIRED)
BENCHMARK_FOLDER="/path/to/benchmarks"
EVAL_DATASET="aime25"            # See README for all supported datasets

# Inference parameters (OPTIONAL - defaults shown)
TEMPERATURE=0.6                  # 0 for greedy decoding
TOP_P=0.95                       # Top-p sampling threshold
MAX_OUTPUT_LEN=32768             # Maximum output length in tokens
BATCH_SIZE=1024                  # Batch size for inference
TENSOR_PARALLEL_SIZE=1           # Number of GPUs for tensor parallelism
YARN_FACTOR=2                    # YaRN RoPE scaling factor for an extended
                                 # 64k context, suitable for long reasoning
                                 # generation

# Other options
SEED=42                          # Random seed
# DEVICE_ID="0,1,2,3"            # Uncomment to specify GPU devices
# USE_R1_FLAG="--use_r1"         # Uncomment for R1-style prompting
# NO_THINK_FLAG="--no-think"     # Uncomment to disable thinking mode

# Build the command line in an array so that the optional flags above take
# effect simply by uncommenting their assignments. (Appending commented-out
# continuation lines after the command's final argument does not work: the
# last argument line carries no trailing backslash, so uncommented lines
# would never attach to the command.)
args=(
  --model-folder "${MODEL_FOLDER}"
  --model-name "${MODEL_NAME}"
  --tokenizer-folder "${TOKENIZER_FOLDER}"
  --tokenizer-name "${TOKENIZER_NAME}"
  --benchmark-folder "${BENCHMARK_FOLDER}"
  --eval-dataset "${EVAL_DATASET}"
  --temperature "${TEMPERATURE}"
  --topp "${TOP_P}"
  --max-output-len "${MAX_OUTPUT_LEN}"
  --batch-size "${BATCH_SIZE}"
  --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}"
  --yarn-factor "${YARN_FACTOR}"
  --seed "${SEED}"
)

# Optional flags: appended only when the corresponding variable is set above.
# ${VAR:-} keeps 'set -u' happy when the variable is still commented out.
if [[ -n "${DEVICE_ID:-}" ]]; then
  args+=(--device-id "${DEVICE_ID}")
fi
if [[ -n "${USE_R1_FLAG:-}" ]]; then
  args+=("${USE_R1_FLAG}")
fi
if [[ -n "${NO_THINK_FLAG:-}" ]]; then
  args+=("${NO_THINK_FLAG}")
fi

# Run inference
python inference.py "${args[@]}"