manifest Qwen/Qwen3.5-9B @ B200 (ad126df3f8da4f66)

Browse files

Files changed (1) hide show

Qwen__Qwen3.5-9B/B200/tp1-seq131072-lora64x4/ad126df3f8da4f66/manifest.json +99 -0

Qwen__Qwen3.5-9B/B200/tp1-seq131072-lora64x4/ad126df3f8da4f66/manifest.json ADDED Viewed

	@@ -0,0 +1,99 @@

+{
+  "model": "Qwen/Qwen3.5-9B",
+  "gpu_type": "B200",
+  "tensor_parallel_size": 1,
+  "max_seq_length": 131072,
+  "enable_lora": true,
+  "max_lora_rank": 64,
+  "max_loras": 4,
+  "cudagraph_capture_sizes": [
+    1,
+    2,
+    4,
+    8,
+    16,
+    32,
+    64,
+    128,
+    192,
+    256,
+    384,
+    512,
+    640,
+    768,
+    896,
+    1000
+  ],
+  "use_mega_aot_artifact": true,
+  "deep_gemm_warmup": "skip",
+  "enable_prefix_caching": true,
+  "vllm_version": "0.22.0",
+  "torch_version": "2.11.0+cu129",
+  "torch": "2.11.0+cu129",
+  "torch_cuda": "12.9",
+  "vllm": "0.22.0",
+  "image_tag": "baseten/baseten-weight-sync-inference:main-15e6be27",
+  "caller": "github-actions:William-Gao1",
+  "model_revision": "c202236235762e1c871ad0ccb60c8ee5ba337b9a",
+  "build_id": "27573881259-1",
+  "opaque_sampler_payload": {
+    "tensor_parallel_size": 1,
+    "max_seq_length": 131072,
+    "enable_lora": true,
+    "max_lora_rank": 64,
+    "max_loras": 4,
+    "cudagraph_capture_sizes": [
+      1,
+      2,
+      4,
+      8,
+      16,
+      32,
+      64,
+      128,
+      192,
+      256,
+      384,
+      512,
+      640,
+      768,
+      896,
+      1000
+    ],
+    "use_mega_aot_artifact": true,
+    "deep_gemm_warmup": "skip",
+    "enable_prefix_caching": true,
+    "load_format": "fastsafetensors"
+  },
+  "build_profile": {
+    "cudagraph_capture_sizes": [
+      1,
+      2,
+      4,
+      8,
+      16,
+      32,
+      64,
+      128,
+      192,
+      256,
+      384,
+      512,
+      640,
+      768,
+      896,
+      1000
+    ],
+    "use_mega_aot_artifact": true,
+    "deep_gemm_warmup": "skip"
+  },
+  "ready_in_seconds_no_cache": 270.08,
+  "kv_cache_max_tokens": 4475980,
+  "kv_cache_max_concurrency": 34.14901960784314,
+  "kv_cache_gpu_memory_utilization": 0.92,
+  "cache_size_uncompressed_bytes": 300206080,
+  "cache_size_compressed_bytes": 20855259,
+  "compression": "zstd -9 -T0 (multithreaded)",
+  "compress_time_seconds": 0.48,
+  "upload_time_seconds": 5.16
+}