import json
import unittest
import warnings
from types import SimpleNamespace
from sglang.srt.utils import kill_process_tree
from sglang.test.ci.ci_register import register_cuda_ci
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
is_in_ci,
popen_launch_server,
write_github_step_summary,
write_results_to_json,
)
# Register this nightly eval in the CUDA CI "stage-b" large 1-GPU suite.
# NOTE(review): est_time presumably an estimated runtime in seconds — confirm
# against the register_cuda_ci API.
register_cuda_ci(est_time=185, suite="stage-b-test-large-1-gpu")
# Minimum acceptable mgsm_en score per model; models missing from this map
# are reported but not enforced (see check_model_scores).
MODEL_SCORE_THRESHOLDS = {
    "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.825,
    "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.825,
    "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4": 0.615,
}
def parse_models(model_string):
    """Split a comma-separated model list into trimmed, non-empty names."""
    names = []
    for raw_name in model_string.split(","):
        cleaned = raw_name.strip()
        if cleaned:
            names.append(cleaned)
    return names
def popen_launch_server_wrapper(base_url, model, is_fp8, is_tp2):
    """Launch a server for *model*, deriving extra CLI flags from its name.

    Returns the process handle from popen_launch_server; the caller is
    responsible for killing it.
    """
    other_args = ["--log-level-http", "warning", "--trust-remote-code"]

    if is_fp8:
        # Per-family FP8 handling: some checkpoints only need an fp8 KV
        # cache, some only the quantization flag, and the neuralmagic
        # Mixtral-FP8 checkpoint needs no extra flags at all.
        if "Llama-3" in model or "gemma-2" in model:
            other_args += ["--kv-cache-dtype", "fp8_e5m2"]
        elif "Qwen2-72B-Instruct-FP8" in model:
            other_args += ["--quantization", "fp8"]
        elif "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8" in model:
            pass  # deliberately no extra flags for this checkpoint
        else:
            other_args += ["--quantization", "fp8", "--kv-cache-dtype", "fp8_e5m2"]

    if is_tp2:
        other_args += ["--tp", "2"]
    if "DeepSeek" in model:
        other_args += ["--mem-frac", "0.85"]

    return popen_launch_server(
        model,
        base_url,
        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
        other_args=other_args,
    )
def check_model_scores(results):
    """Validate each (model, score) pair against MODEL_SCORE_THRESHOLDS.

    Prints a markdown summary table (also written to the GitHub step
    summary when running in CI) and raises AssertionError listing every
    model whose score fell below its threshold. Models without a
    configured threshold are reported and skipped.
    """
    failed_models = []
    rows = [" | model | score | threshold |", "| ----- | ----- | --------- |"]

    for model, score in results:
        threshold = MODEL_SCORE_THRESHOLDS.get(model)
        if threshold is None:
            print(f"Warning: No threshold defined for model {model}")
            continue

        if score < threshold:
            failed_models.append(
                f"\nScore Check Failed: {model}\n"
                f"Model {model} score ({score:.4f}) is below threshold ({threshold:.4f})"
            )
        rows.append(f"| {model} | {score} | {threshold} |")

    summary = "\n".join(rows) + "\n"
    print(summary)

    if is_in_ci():
        write_github_step_summary(
            f"### TestNightlyGsm8KEval for awq, gptq, gguf\n{summary}"
        )

    if failed_models:
        raise AssertionError("\n".join(failed_models))
class TestNightlyGsm8KEval(unittest.TestCase):
    """Nightly mgsm_en accuracy eval for quantized (AWQ/GPTQ) models on 1 GPU.

    For each configured model: launch a server, run the eval, append the
    score to results.json, and finally assert every score meets its
    threshold via check_model_scores.
    """

    @classmethod
    def setUpClass(cls):
        # Each group is (models, is_fp8, is_tp2); only the TP=1 quantized
        # model set is evaluated here.
        cls.model_groups = [
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1), False, False),
        ]
        cls.base_url = DEFAULT_URL_FOR_TEST

    def test_mgsm_en_all_models(self):
        # Server teardown can leave sockets mid-close; silence the noise.
        warnings.filterwarnings(
            "ignore", category=ResourceWarning, message="unclosed.*socket"
        )
        is_first = True
        all_results = []
        for model_group, is_fp8, is_tp2 in self.model_groups:
            for model in model_group:
                with self.subTest(model=model):
                    process = popen_launch_server_wrapper(
                        self.base_url, model, is_fp8, is_tp2
                    )
                    try:
                        args = SimpleNamespace(
                            base_url=self.base_url,
                            model=model,
                            eval_name="mgsm_en",
                            num_examples=None,
                            num_threads=1024,
                        )
                        metrics = run_eval(args)
                        print(
                            f"{'=' * 42}\n{model} - metrics={metrics} score={metrics['score']}\n{'=' * 42}\n"
                        )

                        # First model truncates the file; later ones append.
                        write_results_to_json(model, metrics, "w" if is_first else "a")
                        is_first = False

                        all_results.append((model, metrics["score"]))
                    finally:
                        # Always tear the server down, even if the eval
                        # raised, so later models don't contend for the
                        # GPU/port with a leaked process.
                        kill_process_tree(process.pid)

        # Best-effort dump of the accumulated results file for the CI log.
        try:
            with open("results.json", "r") as f:
                print("\nFinal Results from results.json:")
                print(json.dumps(json.load(f), indent=2))
        except Exception as e:
            print(f"Error reading results.json: {e}")

        # Check all scores after collecting all results
        check_model_scores(all_results)
# Allow running this test file directly as a script.
if __name__ == "__main__":
    unittest.main()