baseten-admin committed on
Commit
b4ccd79
·
verified ·
1 Parent(s): 1fcb361

manifest Qwen/Qwen3.5-9B @ B200 (ad126df3f8da4f66)

Browse files
Qwen__Qwen3.5-9B/B200/tp1-seq131072-lora64x4/ad126df3f8da4f66/manifest.json ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "Qwen/Qwen3.5-9B",
3
+ "gpu_type": "B200",
4
+ "tensor_parallel_size": 1,
5
+ "max_seq_length": 131072,
6
+ "enable_lora": true,
7
+ "max_lora_rank": 64,
8
+ "max_loras": 4,
9
+ "cudagraph_capture_sizes": [
10
+ 1,
11
+ 2,
12
+ 4,
13
+ 8,
14
+ 16,
15
+ 32,
16
+ 64,
17
+ 128,
18
+ 192,
19
+ 256,
20
+ 384,
21
+ 512,
22
+ 640,
23
+ 768,
24
+ 896,
25
+ 1000
26
+ ],
27
+ "use_mega_aot_artifact": true,
28
+ "deep_gemm_warmup": "skip",
29
+ "enable_prefix_caching": true,
30
+ "vllm_version": "0.22.0",
31
+ "torch_version": "2.11.0+cu129",
32
+ "torch": "2.11.0+cu129",
33
+ "torch_cuda": "12.9",
34
+ "vllm": "0.22.0",
35
+ "image_tag": "baseten/baseten-weight-sync-inference:main-15e6be27",
36
+ "caller": "github-actions:William-Gao1",
37
+ "model_revision": "c202236235762e1c871ad0ccb60c8ee5ba337b9a",
38
+ "build_id": "27573881259-1",
39
+ "opaque_sampler_payload": {
40
+ "tensor_parallel_size": 1,
41
+ "max_seq_length": 131072,
42
+ "enable_lora": true,
43
+ "max_lora_rank": 64,
44
+ "max_loras": 4,
45
+ "cudagraph_capture_sizes": [
46
+ 1,
47
+ 2,
48
+ 4,
49
+ 8,
50
+ 16,
51
+ 32,
52
+ 64,
53
+ 128,
54
+ 192,
55
+ 256,
56
+ 384,
57
+ 512,
58
+ 640,
59
+ 768,
60
+ 896,
61
+ 1000
62
+ ],
63
+ "use_mega_aot_artifact": true,
64
+ "deep_gemm_warmup": "skip",
65
+ "enable_prefix_caching": true,
66
+ "load_format": "fastsafetensors"
67
+ },
68
+ "build_profile": {
69
+ "cudagraph_capture_sizes": [
70
+ 1,
71
+ 2,
72
+ 4,
73
+ 8,
74
+ 16,
75
+ 32,
76
+ 64,
77
+ 128,
78
+ 192,
79
+ 256,
80
+ 384,
81
+ 512,
82
+ 640,
83
+ 768,
84
+ 896,
85
+ 1000
86
+ ],
87
+ "use_mega_aot_artifact": true,
88
+ "deep_gemm_warmup": "skip"
89
+ },
90
+ "ready_in_seconds_no_cache": 270.08,
91
+ "kv_cache_max_tokens": 4475980,
92
+ "kv_cache_max_concurrency": 34.14901960784314,
93
+ "kv_cache_gpu_memory_utilization": 0.92,
94
+ "cache_size_uncompressed_bytes": 300206080,
95
+ "cache_size_compressed_bytes": 20855259,
96
+ "compression": "zstd -9 -T0 (multithreaded)",
97
+ "compress_time_seconds": 0.48,
98
+ "upload_time_seconds": 5.16
99
+ }