manifest Qwen/Qwen3-0.6B @ B200 (47aa25be9c0bf00c)

Browse files

Files changed (1) hide show

Qwen__Qwen3-0.6B/B200/tp1-seq16384-lora64x16/47aa25be9c0bf00c/manifest.json +99 -0

Qwen__Qwen3-0.6B/B200/tp1-seq16384-lora64x16/47aa25be9c0bf00c/manifest.json ADDED Viewed

	@@ -0,0 +1,99 @@

+{
+  "model": "Qwen/Qwen3-0.6B",
+  "gpu_type": "B200",
+  "tensor_parallel_size": 1,
+  "max_seq_length": 16384,
+  "enable_lora": true,
+  "max_lora_rank": 64,
+  "max_loras": 16,
+  "cudagraph_capture_sizes": [
+    1,
+    2,
+    4,
+    8,
+    16,
+    32,
+    64,
+    128,
+    192,
+    256,
+    384,
+    512,
+    640,
+    768,
+    896,
+    1000
+  ],
+  "use_mega_aot_artifact": true,
+  "deep_gemm_warmup": "skip",
+  "enable_prefix_caching": true,
+  "vllm_version": "0.22.0",
+  "torch_version": "2.11.0+cu129",
+  "torch": "2.11.0+cu129",
+  "torch_cuda": "12.9",
+  "vllm": "0.22.0",
+  "image_tag": "baseten/baseten-weight-sync-inference:main-15e6be27",
+  "caller": "github-actions:William-Gao1",
+  "model_revision": "c1899de289a04d12100db370d81485cdf75e47ca",
+  "build_id": "27571412154-1",
+  "opaque_sampler_payload": {
+    "tensor_parallel_size": 1,
+    "max_seq_length": 16384,
+    "enable_lora": true,
+    "max_lora_rank": 64,
+    "max_loras": 16,
+    "cudagraph_capture_sizes": [
+      1,
+      2,
+      4,
+      8,
+      16,
+      32,
+      64,
+      128,
+      192,
+      256,
+      384,
+      512,
+      640,
+      768,
+      896,
+      1000
+    ],
+    "use_mega_aot_artifact": true,
+    "deep_gemm_warmup": "skip",
+    "enable_prefix_caching": true,
+    "load_format": "fastsafetensors"
+  },
+  "build_profile": {
+    "cudagraph_capture_sizes": [
+      1,
+      2,
+      4,
+      8,
+      16,
+      32,
+      64,
+      128,
+      192,
+      256,
+      384,
+      512,
+      640,
+      768,
+      896,
+      1000
+    ],
+    "use_mega_aot_artifact": true,
+    "deep_gemm_warmup": "skip"
+  },
+  "ready_in_seconds_no_cache": 94.03,
+  "kv_cache_max_tokens": 1495856,
+  "kv_cache_max_concurrency": 91.2998046875,
+  "kv_cache_gpu_memory_utilization": 0.92,
+  "cache_size_uncompressed_bytes": 68997120,
+  "cache_size_compressed_bytes": 3386170,
+  "compression": "zstd -9 -T0 (multithreaded)",
+  "compress_time_seconds": 0.21,
+  "upload_time_seconds": 4.04
+}