File size: 44,077 Bytes
a402b9b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
import argparse
import dataclasses
import logging
import os
from typing import Dict, List, Optional

from sglang_router.sglang_router_rs import get_available_tool_call_parsers

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


@dataclasses.dataclass
class RouterArgs:
    """Configuration values for the router, one dataclass field per setting.

    ``add_cli_args`` registers a command-line flag for these settings and, for
    most of them, reuses the field default declared here as the argparse
    default. A few flags declare their own argparse default instead (see the
    NOTE comments on ``log_level``, ``prometheus_port`` and
    ``prometheus_host``), so a freshly constructed ``RouterArgs()`` and a
    parsed empty command line do not agree on those values.

    List- and dict-valued fields use ``default_factory`` so that instances do
    not share a mutable default.
    """

    # Worker configuration
    worker_urls: List[str] = dataclasses.field(default_factory=list)
    host: str = "0.0.0.0"
    port: int = 30000

    # PD-specific configuration
    mini_lb: bool = False  # Enable MiniLB
    # MiniLB only: randomly assign dp ranks per request and verify the response
    test_external_dp_routing: bool = False
    pd_disaggregation: bool = False  # Enable PD disaggregated mode
    prefill_urls: List[tuple] = dataclasses.field(
        default_factory=list
    )  # List of (url, bootstrap_port)
    decode_urls: List[str] = dataclasses.field(default_factory=list)

    # Routing policy
    # NOTE: the CLI accepts "bucket" only for --prefill-policy, not for
    # --policy or --decode-policy.
    policy: str = "cache_aware"
    prefill_policy: Optional[str] = None  # Specific policy for prefill nodes in PD mode
    decode_policy: Optional[str] = None  # Specific policy for decode nodes in PD mode
    worker_startup_timeout_secs: int = 1800  # 30 minutes
    worker_startup_check_interval: int = 30  # Seconds between startup checks
    cache_threshold: float = 0.3  # Cache threshold (0.0-1.0) for cache-aware routing
    # Balancing is triggered when both the absolute and the relative load
    # thresholds below are exceeded.
    balance_abs_threshold: int = 64
    balance_rel_threshold: float = 1.5
    eviction_interval_secs: int = 60  # Seconds between cache eviction operations
    max_tree_size: int = 2**26  # Max size of the cache-aware approximation tree
    max_idle_secs: int = 4 * 3600  # 4 hours; idle time before eviction (manual policy)
    assignment_mode: str = "random"  # Mode for manual policy new routing key assignment
    max_payload_size: int = 512 * 1024 * 1024  # 512MB default for large batches
    bucket_adjust_interval_secs: int = 5  # Seconds between bucket boundary adjustments
    dp_aware: bool = False  # Enable data parallelism aware scheduling
    enable_igw: bool = False  # Enable IGW (Inference-Gateway) mode for multi-model support
    api_key: Optional[str] = None
    # Logging configuration
    log_dir: Optional[str] = None  # None means logs go to the console only
    # NOTE: the --log-level CLI flag defaults to "info", not to this None.
    log_level: Optional[str] = None
    json_log: bool = False  # Structured JSON log output instead of plain text
    # Service discovery configuration
    service_discovery: bool = False
    # Label selector; the CLI takes it as "key=value" tokens
    selector: Dict[str, str] = dataclasses.field(default_factory=dict)
    service_discovery_port: int = 80
    # None means all namespaces are watched (per the CLI help text)
    service_discovery_namespace: Optional[str] = None
    # PD service discovery configuration
    prefill_selector: Dict[str, str] = dataclasses.field(default_factory=dict)
    decode_selector: Dict[str, str] = dataclasses.field(default_factory=dict)
    bootstrap_port_annotation: str = "sglang.ai/bootstrap-port"
    # Prometheus configuration
    # NOTE: the CLI flags default to port 29000 and host "0.0.0.0" rather than
    # to the None defaults declared here.
    prometheus_port: Optional[int] = None
    prometheus_host: Optional[str] = None
    prometheus_duration_buckets: Optional[List[float]] = None
    # Request ID headers configuration
    request_id_headers: Optional[List[str]] = None
    # Request timeout in seconds
    request_timeout_secs: int = 1800
    # Grace period in seconds to wait for in-flight requests during shutdown
    shutdown_grace_period_secs: int = 180
    # Max concurrent requests for rate limiting (-1 to disable)
    max_concurrent_requests: int = -1
    # Queue size for pending requests when max concurrent limit reached
    queue_size: int = 100
    # Maximum time (in seconds) a request can wait in queue before timing out
    queue_timeout_secs: int = 60
    # Token bucket refill rate (tokens per second). If not set, defaults to max_concurrent_requests
    rate_limit_tokens_per_second: Optional[int] = None
    # CORS allowed origins
    cors_allowed_origins: List[str] = dataclasses.field(default_factory=list)
    # Retry configuration
    retry_max_retries: int = 5
    retry_initial_backoff_ms: int = 50
    retry_max_backoff_ms: int = 30_000
    retry_backoff_multiplier: float = 1.5
    retry_jitter_factor: float = 0.2  # Jitter factor (0.0-1.0) applied to retry delays
    # Per the CLI help text, equivalent to setting retry_max_retries=1
    disable_retries: bool = False
    # Health check configuration
    health_failure_threshold: int = 3
    health_success_threshold: int = 2
    health_check_timeout_secs: int = 5
    health_check_interval_secs: int = 60
    health_check_endpoint: str = "/health"
    disable_health_check: bool = False
    # Circuit breaker configuration
    cb_failure_threshold: int = 10
    cb_success_threshold: int = 3
    cb_timeout_duration_secs: int = 60
    cb_window_duration_secs: int = 120
    disable_circuit_breaker: bool = False
    # Tokenizer and chat template configuration
    model_path: Optional[str] = None  # HuggingFace model ID or local path
    tokenizer_path: Optional[str] = None  # Overrides the model_path tokenizer if set
    chat_template: Optional[str] = None
    # Tokenizer cache configuration
    tokenizer_cache_enable_l0: bool = False  # L0: whole-string exact match cache
    tokenizer_cache_l0_max_entries: int = 10000
    tokenizer_cache_enable_l1: bool = False  # L1: prefix matching cache
    tokenizer_cache_l1_max_memory: int = 50 * 1024 * 1024  # 50MB
    # Reasoning and tool-call parser configuration
    reasoning_parser: Optional[str] = None
    tool_call_parser: Optional[str] = None
    # MCP server configuration
    mcp_config_path: Optional[str] = None
    # Backend selection
    backend: str = "sglang"
    # History backend configuration
    history_backend: str = "memory"
    oracle_wallet_path: Optional[str] = None
    oracle_tns_alias: Optional[str] = None
    oracle_connect_descriptor: Optional[str] = None
    oracle_username: Optional[str] = None
    oracle_password: Optional[str] = None
    oracle_pool_min: int = 1
    oracle_pool_max: int = 16
    oracle_pool_timeout_secs: int = 30
    postgres_db_url: Optional[str] = None
    postgres_pool_max: int = 16
    redis_url: Optional[str] = None
    redis_pool_max: int = 16
    redis_retention_days: int = 30
    # mTLS configuration for worker communication
    client_cert_path: Optional[str] = None
    client_key_path: Optional[str] = None
    ca_cert_paths: List[str] = dataclasses.field(default_factory=list)
    # Server TLS configuration
    server_cert_path: Optional[str] = None
    server_key_path: Optional[str] = None
    # Trace
    enable_trace: bool = False
    otlp_traces_endpoint: str = "localhost:4317"
    # Control plane authentication
    # API keys for control plane auth (list of tuples: id, name, key, role)
    control_plane_api_keys: List[tuple] = dataclasses.field(default_factory=list)
    control_plane_audit_enabled: bool = False
    # JWT/OIDC configuration for control plane auth
    jwt_issuer: Optional[str] = None
    jwt_audience: Optional[str] = None
    jwt_jwks_uri: Optional[str] = None
    jwt_role_mapping: Dict[str, str] = dataclasses.field(default_factory=dict)
    @staticmethod
    def add_cli_args(
        parser: argparse.ArgumentParser,
        use_router_prefix: bool = False,
        exclude_host_port: bool = False,
    ):
        """
        Add router-specific arguments to an argument parser.

        Args:
            parser: The argument parser to add arguments to
            use_router_prefix: If True, prefix all arguments with 'router-' to avoid conflicts
            exclude_host_port: If True, don't add host and port arguments (used when inheriting from server)
        """
        prefix = "router-" if use_router_prefix else ""

        # Create argument groups for organized --help output
        worker_group = parser.add_argument_group(
            "Worker Configuration", "Settings for worker connections and URLs"
        )
        routing_group = parser.add_argument_group(
            "Routing Policy", "Load balancing and routing configuration"
        )
        pd_group = parser.add_argument_group(
            "PD Disaggregation", "Prefill-Decode disaggregated mode settings"
        )
        k8s_group = parser.add_argument_group(
            "Service Discovery (Kubernetes)", "Kubernetes-based worker discovery"
        )
        logging_group = parser.add_argument_group("Logging", "Log output configuration")
        prometheus_group = parser.add_argument_group(
            "Prometheus Metrics", "Metrics export configuration"
        )
        request_group = parser.add_argument_group(
            "Request Handling", "Request timeout and ID configuration"
        )
        rate_limit_group = parser.add_argument_group(
            "Rate Limiting", "Concurrent request and queue limits"
        )
        retry_group = parser.add_argument_group(
            "Retry Configuration", "Automatic retry behavior for failed requests"
        )
        cb_group = parser.add_argument_group(
            "Circuit Breaker", "Circuit breaker pattern configuration"
        )
        health_group = parser.add_argument_group(
            "Health Checks", "Worker health monitoring settings"
        )
        tokenizer_group = parser.add_argument_group(
            "Tokenizer", "Tokenizer and chat template configuration"
        )
        parser_group = parser.add_argument_group(
            "Parsers", "Reasoning and tool-call parser settings"
        )
        backend_group = parser.add_argument_group(
            "Backend", "Backend runtime and history storage selection"
        )
        oracle_group = parser.add_argument_group(
            "Oracle Database", "Oracle database backend configuration"
        )
        postgres_group = parser.add_argument_group(
            "PostgreSQL Database", "PostgreSQL database backend configuration"
        )
        redis_group = parser.add_argument_group(
            "Redis Database", "Redis database backend configuration"
        )
        tls_group = parser.add_argument_group(
            "TLS/mTLS Security", "TLS certificates for server and worker communication"
        )
        trace_group = parser.add_argument_group(
            "Tracing (OpenTelemetry)", "Distributed tracing configuration"
        )
        auth_group = parser.add_argument_group(
            "Control Plane Authentication", "API key and JWT/OIDC authentication"
        )

        # Worker configuration
        if not exclude_host_port:
            worker_group.add_argument(
                "--host",
                type=str,
                default=RouterArgs.host,
                help="Host address to bind the router server. Supports IPv4, IPv6 (e.g., ::, ::1), or 0.0.0.0 for all interfaces",
            )
            worker_group.add_argument(
                "--port",
                type=int,
                default=RouterArgs.port,
                help="Port number to bind the router server",
            )

        worker_group.add_argument(
            "--worker-urls",
            type=str,
            nargs="*",
            default=[],
            help="List of worker URLs. Supports IPv4 and IPv6 addresses (use brackets for IPv6, e.g., http://[::1]:8000 http://192.168.1.1:8000)",
        )

        # Routing policy configuration
        routing_group.add_argument(
            f"--{prefix}policy",
            type=str,
            default=RouterArgs.policy,
            choices=[
                "random",
                "round_robin",
                "cache_aware",
                "power_of_two",
                "manual",
                "consistent_hashing",
                "prefix_hash",
            ],
            help="Load balancing policy to use. In PD mode, this is used for both prefill and decode unless overridden",
        )
        routing_group.add_argument(
            f"--{prefix}prefill-policy",
            type=str,
            default=None,
            choices=[
                "random",
                "round_robin",
                "cache_aware",
                "power_of_two",
                "manual",
                "bucket",
                "consistent_hashing",
                "prefix_hash",
            ],
            help="Specific policy for prefill nodes in PD mode. If not specified, uses the main policy",
        )
        routing_group.add_argument(
            f"--{prefix}decode-policy",
            type=str,
            default=None,
            choices=[
                "random",
                "round_robin",
                "cache_aware",
                "power_of_two",
                "manual",
                "consistent_hashing",
                "prefix_hash",
            ],
            help="Specific policy for decode nodes in PD mode. If not specified, uses the main policy",
        )
        routing_group.add_argument(
            f"--{prefix}cache-threshold",
            type=float,
            default=RouterArgs.cache_threshold,
            help="Cache threshold (0.0-1.0) for cache-aware routing",
        )
        routing_group.add_argument(
            f"--{prefix}balance-abs-threshold",
            type=int,
            default=RouterArgs.balance_abs_threshold,
            help="Absolute threshold for load difference. Balancing is triggered if `(max_load - min_load) > abs_threshold` and the relative threshold is also met.",
        )
        routing_group.add_argument(
            f"--{prefix}balance-rel-threshold",
            type=float,
            default=RouterArgs.balance_rel_threshold,
            help="Relative threshold for load difference. Balancing is triggered if `max_load > min_load * rel_threshold` and the absolute threshold is also met.",
        )
        routing_group.add_argument(
            f"--{prefix}bucket-adjust-interval-secs",
            type=int,
            default=RouterArgs.bucket_adjust_interval_secs,
            help="Interval in seconds between bucket boundary adjustment operations",
        )
        routing_group.add_argument(
            f"--{prefix}eviction-interval-secs",
            type=int,
            default=RouterArgs.eviction_interval_secs,
            help="Interval in seconds between cache eviction operations",
        )
        routing_group.add_argument(
            f"--{prefix}max-tree-size",
            type=int,
            default=RouterArgs.max_tree_size,
            help="Maximum size of the approximation tree for cache-aware routing",
        )
        routing_group.add_argument(
            f"--{prefix}max-idle-secs",
            type=int,
            default=RouterArgs.max_idle_secs,
            help="Maximum idle time in seconds before eviction (for manual policy)",
        )
        routing_group.add_argument(
            f"--{prefix}assignment-mode",
            type=str,
            default=RouterArgs.assignment_mode,
            choices=["random", "min_load", "min_group"],
            help="Mode for assigning new routing keys in manual policy: random (default), min_load (worker with fewest requests), min_group (worker with fewest routing keys)",
        )
        routing_group.add_argument(
            f"--{prefix}max-payload-size",
            type=int,
            default=RouterArgs.max_payload_size,
            help="Maximum payload size in bytes",
        )
        routing_group.add_argument(
            f"--{prefix}dp-aware",
            action="store_true",
            help="Enable data parallelism aware schedule",
        )
        routing_group.add_argument(
            f"--{prefix}enable-igw",
            action="store_true",
            help="Enable IGW (Inference-Gateway) mode for multi-model support",
        )

        # PD-specific arguments
        pd_group.add_argument(
            f"--{prefix}mini-lb",
            action="store_true",
            help="Enable MiniLB",
        )
        pd_group.add_argument(
            f"--{prefix}test-external-dp-routing",
            action="store_true",
            help="(MiniLB only) Randomly assign routed_dp_rank / disagg_prefill_dp_rank per request and verify the response dp_rank matches.",
        )
        pd_group.add_argument(
            f"--{prefix}pd-disaggregation",
            action="store_true",
            help="Enable PD (Prefill-Decode) disaggregated mode",
        )
        pd_group.add_argument(
            f"--{prefix}prefill",
            nargs="+",
            action="append",
            help="Prefill server URL and optional bootstrap port. Can be specified multiple times. "
            "Format: --prefill URL [BOOTSTRAP_PORT]. "
            "BOOTSTRAP_PORT can be a port number, 'none', or omitted (defaults to none).",
        )
        pd_group.add_argument(
            f"--{prefix}decode",
            nargs=1,
            action="append",
            metavar=("URL",),
            help="Decode server URL. Can be specified multiple times.",
        )
        pd_group.add_argument(
            f"--{prefix}worker-startup-timeout-secs",
            type=int,
            default=RouterArgs.worker_startup_timeout_secs,
            help="Timeout in seconds for worker startup and registration (default: 1800 / 30 minutes). Large models can take significant time to load into GPU memory.",
        )
        pd_group.add_argument(
            f"--{prefix}worker-startup-check-interval",
            type=int,
            default=RouterArgs.worker_startup_check_interval,
            help="Interval in seconds between checks for worker startup",
        )

        # Logging configuration
        logging_group.add_argument(
            f"--{prefix}log-dir",
            type=str,
            default=None,
            help="Directory to store log files. If not specified, logs are only output to console.",
        )
        logging_group.add_argument(
            f"--{prefix}log-level",
            type=str,
            default="info",
            choices=["debug", "info", "warn", "error"],
            help="Set the logging level. If not specified, defaults to INFO.",
        )
        logging_group.add_argument(
            f"--{prefix}json-log",
            action="store_true",
            help="Enable structured JSON log output instead of plain text.",
        )

        # Service discovery configuration
        k8s_group.add_argument(
            f"--{prefix}service-discovery",
            action="store_true",
            help="Enable Kubernetes service discovery",
        )
        k8s_group.add_argument(
            f"--{prefix}selector",
            type=str,
            nargs="+",
            default={},
            help="Label selector for Kubernetes service discovery (format: key1=value1 key2=value2)",
        )
        k8s_group.add_argument(
            f"--{prefix}service-discovery-port",
            type=int,
            default=RouterArgs.service_discovery_port,
            help="Port to use for discovered worker pods",
        )
        k8s_group.add_argument(
            f"--{prefix}service-discovery-namespace",
            type=str,
            help="Kubernetes namespace to watch for pods. If not provided, watches all namespaces (requires cluster-wide permissions)",
        )
        k8s_group.add_argument(
            f"--{prefix}prefill-selector",
            type=str,
            nargs="+",
            default={},
            help="Label selector for prefill server pods in PD mode (format: key1=value1 key2=value2)",
        )
        k8s_group.add_argument(
            f"--{prefix}decode-selector",
            type=str,
            nargs="+",
            default={},
            help="Label selector for decode server pods in PD mode (format: key1=value1 key2=value2)",
        )
        # Prometheus configuration
        prometheus_group.add_argument(
            f"--{prefix}prometheus-port",
            type=int,
            default=29000,
            help="Port to expose Prometheus metrics (default: 29000).",
        )
        prometheus_group.add_argument(
            f"--{prefix}prometheus-host",
            type=str,
            default="0.0.0.0",
            help="Host address to bind the Prometheus metrics server. Supports IPv4, IPv6 (e.g., ::, ::1), or 0.0.0.0 for all interfaces",
        )
        prometheus_group.add_argument(
            f"--{prefix}prometheus-duration-buckets",
            type=float,
            nargs="+",
            help="Buckets for Prometheus duration metrics",
        )

        # Request handling configuration
        request_group.add_argument(
            f"--{prefix}request-id-headers",
            type=str,
            nargs="*",
            help="Custom HTTP headers to check for request IDs (e.g., x-request-id x-trace-id). If not specified, uses common defaults.",
        )
        request_group.add_argument(
            f"--{prefix}request-timeout-secs",
            type=int,
            default=RouterArgs.request_timeout_secs,
            help="Request timeout in seconds",
        )
        request_group.add_argument(
            f"--{prefix}shutdown-grace-period-secs",
            type=int,
            default=RouterArgs.shutdown_grace_period_secs,
            help="Grace period in seconds to wait for in-flight requests during shutdown",
        )
        request_group.add_argument(
            f"--{prefix}cors-allowed-origins",
            type=str,
            nargs="*",
            default=[],
            help="CORS allowed origins (e.g., http://localhost:3000 https://example.com)",
        )

        # Rate limiting configuration
        rate_limit_group.add_argument(
            f"--{prefix}max-concurrent-requests",
            type=int,
            default=RouterArgs.max_concurrent_requests,
            help="Maximum number of concurrent requests allowed (for rate limiting). Set to -1 to disable rate limiting.",
        )
        rate_limit_group.add_argument(
            f"--{prefix}queue-size",
            type=int,
            default=RouterArgs.queue_size,
            help="Queue size for pending requests when max concurrent limit reached (0 = no queue, return 429 immediately)",
        )
        rate_limit_group.add_argument(
            f"--{prefix}queue-timeout-secs",
            type=int,
            default=RouterArgs.queue_timeout_secs,
            help="Maximum time (in seconds) a request can wait in queue before timing out",
        )
        rate_limit_group.add_argument(
            f"--{prefix}rate-limit-tokens-per-second",
            type=int,
            default=RouterArgs.rate_limit_tokens_per_second,
            help="Token bucket refill rate (tokens per second). If not set, defaults to max_concurrent_requests",
        )

        # Retry configuration
        retry_group.add_argument(
            f"--{prefix}retry-max-retries",
            type=int,
            default=RouterArgs.retry_max_retries,
            help="Maximum number of retry attempts for failed requests",
        )
        retry_group.add_argument(
            f"--{prefix}retry-initial-backoff-ms",
            type=int,
            default=RouterArgs.retry_initial_backoff_ms,
            help="Initial backoff delay in milliseconds before first retry",
        )
        retry_group.add_argument(
            f"--{prefix}retry-max-backoff-ms",
            type=int,
            default=RouterArgs.retry_max_backoff_ms,
            help="Maximum backoff delay in milliseconds between retries",
        )
        retry_group.add_argument(
            f"--{prefix}retry-backoff-multiplier",
            type=float,
            default=RouterArgs.retry_backoff_multiplier,
            help="Multiplier for exponential backoff between retries",
        )
        retry_group.add_argument(
            f"--{prefix}retry-jitter-factor",
            type=float,
            default=RouterArgs.retry_jitter_factor,
            help="Jitter factor (0.0-1.0) to add randomness to retry delays",
        )
        retry_group.add_argument(
            f"--{prefix}disable-retries",
            action="store_true",
            help="Disable retries (equivalent to setting retry_max_retries=1)",
        )

        # Circuit breaker configuration
        cb_group.add_argument(
            f"--{prefix}cb-failure-threshold",
            type=int,
            default=RouterArgs.cb_failure_threshold,
            help="Number of failures before circuit breaker opens",
        )
        cb_group.add_argument(
            f"--{prefix}cb-success-threshold",
            type=int,
            default=RouterArgs.cb_success_threshold,
            help="Number of successes in half-open state before closing circuit",
        )
        cb_group.add_argument(
            f"--{prefix}cb-timeout-duration-secs",
            type=int,
            default=RouterArgs.cb_timeout_duration_secs,
            help="Time in seconds before attempting to close an open circuit",
        )
        cb_group.add_argument(
            f"--{prefix}cb-window-duration-secs",
            type=int,
            default=RouterArgs.cb_window_duration_secs,
            help="Sliding window duration in seconds for tracking failures",
        )
        cb_group.add_argument(
            f"--{prefix}disable-circuit-breaker",
            action="store_true",
            help="Disable circuit breaker (equivalent to setting cb_failure_threshold to u32::MAX)",
        )

        # Health check configuration
        health_group.add_argument(
            f"--{prefix}health-failure-threshold",
            type=int,
            default=RouterArgs.health_failure_threshold,
            help="Number of consecutive health check failures before marking worker unhealthy",
        )
        health_group.add_argument(
            f"--{prefix}health-success-threshold",
            type=int,
            default=RouterArgs.health_success_threshold,
            help="Number of consecutive health check successes before marking worker healthy",
        )
        health_group.add_argument(
            f"--{prefix}health-check-timeout-secs",
            type=int,
            default=RouterArgs.health_check_timeout_secs,
            help="Timeout in seconds for health check requests",
        )
        health_group.add_argument(
            f"--{prefix}health-check-interval-secs",
            type=int,
            default=RouterArgs.health_check_interval_secs,
            help="Interval in seconds between runtime health checks",
        )
        health_group.add_argument(
            f"--{prefix}health-check-endpoint",
            type=str,
            default=RouterArgs.health_check_endpoint,
            help="Health check endpoint path",
        )
        health_group.add_argument(
            f"--{prefix}disable-health-check",
            action="store_true",
            default=RouterArgs.disable_health_check,
            help="Disable all worker health checks at startup",
        )
        # Tokenizer configuration
        tokenizer_group.add_argument(
            f"--{prefix}model-path",
            type=str,
            default=None,
            help="Model path for loading tokenizer (HuggingFace model ID or local path)",
        )
        tokenizer_group.add_argument(
            f"--{prefix}tokenizer-path",
            type=str,
            default=None,
            help="Explicit tokenizer path (overrides model_path tokenizer if provided)",
        )
        tokenizer_group.add_argument(
            f"--{prefix}chat-template",
            type=str,
            default=None,
            help="Chat template path (optional)",
        )
        tokenizer_group.add_argument(
            f"--{prefix}tokenizer-cache-enable-l0",
            action="store_true",
            default=RouterArgs.tokenizer_cache_enable_l0,
            help="Enable L0 (whole-string exact match) tokenizer cache (default: False)",
        )
        tokenizer_group.add_argument(
            f"--{prefix}tokenizer-cache-l0-max-entries",
            type=int,
            default=RouterArgs.tokenizer_cache_l0_max_entries,
            help="Maximum number of entries in L0 tokenizer cache (default: 10000)",
        )
        tokenizer_group.add_argument(
            f"--{prefix}tokenizer-cache-enable-l1",
            action="store_true",
            default=RouterArgs.tokenizer_cache_enable_l1,
            help="Enable L1 (prefix matching) tokenizer cache (default: False)",
        )
        tokenizer_group.add_argument(
            f"--{prefix}tokenizer-cache-l1-max-memory",
            type=int,
            default=RouterArgs.tokenizer_cache_l1_max_memory,
            help="Maximum memory for L1 tokenizer cache in bytes (default: 50MB)",
        )

        # Parser configuration
        parser_group.add_argument(
            f"--{prefix}reasoning-parser",
            type=str,
            default=None,
            help="Specify the parser for reasoning models (e.g., deepseek-r1, qwen3)",
        )
        tool_call_parser_choices = get_available_tool_call_parsers()
        parser_group.add_argument(
            f"--{prefix}tool-call-parser",
            type=str,
            default=None,
            choices=tool_call_parser_choices,
            help=f"Specify the parser for tool-call interactions (e.g., json, qwen)",
        )
        parser_group.add_argument(
            f"--{prefix}mcp-config-path",
            type=str,
            default=None,
            help="Path to MCP (Model Context Protocol) server configuration file",
        )

        # Backend selection
        backend_group.add_argument(
            f"--{prefix}backend",
            type=str,
            default=RouterArgs.backend,
            choices=["sglang", "openai"],
            help="Backend runtime to use (default: sglang)",
        )
        backend_group.add_argument(
            f"--{prefix}history-backend",
            type=str,
            default=RouterArgs.history_backend,
            choices=["memory", "none", "oracle", "postgres", "redis"],
            help="History storage backend for conversations and responses (default: memory)",
        )

        # Oracle configuration
        oracle_group.add_argument(
            f"--{prefix}oracle-wallet-path",
            type=str,
            default=os.getenv("ATP_WALLET_PATH"),
            help="Path to Oracle ATP wallet directory (env: ATP_WALLET_PATH)",
        )
        oracle_group.add_argument(
            f"--{prefix}oracle-tns-alias",
            type=str,
            default=os.getenv("ATP_TNS_ALIAS"),
            help="Oracle TNS alias from tnsnames.ora (env: ATP_TNS_ALIAS).",
        )
        oracle_group.add_argument(
            f"--{prefix}oracle-connect-descriptor",
            type=str,
            default=os.getenv("ATP_DSN"),
            help="Oracle connection descriptor/DSN (full connection string) (env: ATP_DSN)",
        )
        oracle_group.add_argument(
            f"--{prefix}oracle-username",
            type=str,
            default=os.getenv("ATP_USER"),
            help="Oracle database username (env: ATP_USER)",
        )
        oracle_group.add_argument(
            f"--{prefix}oracle-password",
            type=str,
            default=os.getenv("ATP_PASSWORD"),
            help="Oracle database password (env: ATP_PASSWORD)",
        )
        oracle_group.add_argument(
            f"--{prefix}oracle-pool-min",
            type=int,
            default=int(os.getenv("ATP_POOL_MIN", RouterArgs.oracle_pool_min)),
            help="Minimum Oracle connection pool size (default: 1, env: ATP_POOL_MIN)",
        )
        oracle_group.add_argument(
            f"--{prefix}oracle-pool-max",
            type=int,
            default=int(os.getenv("ATP_POOL_MAX", RouterArgs.oracle_pool_max)),
            help="Maximum Oracle connection pool size (default: 16, env: ATP_POOL_MAX)",
        )
        oracle_group.add_argument(
            f"--{prefix}oracle-pool-timeout-secs",
            type=int,
            default=int(
                os.getenv("ATP_POOL_TIMEOUT_SECS", RouterArgs.oracle_pool_timeout_secs)
            ),
            help="Oracle connection pool timeout in seconds (default: 30, env: ATP_POOL_TIMEOUT_SECS)",
        )

        # Postgres configuration
        postgres_group.add_argument(
            f"--{prefix}postgres-db-url",
            type=str,
            default=os.getenv("POSTGRES_DB_URL"),
            help="PostgreSQL database connection URL (env: POSTGRES_DB_URL)",
        )
        postgres_group.add_argument(
            f"--{prefix}postgres-pool-max",
            type=int,
            default=int(os.getenv("POSTGRES_POOL_MAX", RouterArgs.postgres_pool_max)),
            help="Maximum PostgreSQL connection pool size (default: 16, env: POSTGRES_POOL_MAX)",
        )

        # Redis configuration
        redis_group.add_argument(
            f"--{prefix}redis-url",
            type=str,
            default=os.getenv("REDIS_URL"),
            help="Redis connection URL (env: REDIS_URL)",
        )
        redis_group.add_argument(
            f"--{prefix}redis-pool-max",
            type=int,
            default=int(os.getenv("REDIS_POOL_MAX", RouterArgs.redis_pool_max)),
            help="Maximum Redis connection pool size (default: 16, env: REDIS_POOL_MAX)",
        )
        redis_group.add_argument(
            f"--{prefix}redis-retention-days",
            type=int,
            default=int(
                os.getenv("REDIS_RETENTION_DAYS", RouterArgs.redis_retention_days)
            ),
            help="Redis data retention in days (-1 for persistent, default: 30, env: REDIS_RETENTION_DAYS)",
        )

        # TLS/mTLS configuration
        tls_group.add_argument(
            f"--{prefix}client-cert-path",
            type=str,
            default=None,
            help="Path to client certificate for mTLS authentication with workers",
        )
        tls_group.add_argument(
            f"--{prefix}client-key-path",
            type=str,
            default=None,
            help="Path to client private key for mTLS authentication with workers",
        )
        tls_group.add_argument(
            f"--{prefix}ca-cert-paths",
            type=str,
            nargs="*",
            default=[],
            help="Path(s) to CA certificate(s) for verifying worker TLS certificates. Can specify multiple CAs.",
        )
        tls_group.add_argument(
            f"--{prefix}tls-cert-path",
            type=str,
            default=None,
            help="Path to server TLS certificate (PEM format)",
        )
        tls_group.add_argument(
            f"--{prefix}tls-key-path",
            type=str,
            default=None,
            help="Path to server TLS private key (PEM format)",
        )

        # Tracing configuration
        trace_group.add_argument(
            f"--{prefix}enable-trace",
            action="store_true",
            help="Enable opentelemetry trace",
        )
        trace_group.add_argument(
            f"--{prefix}otlp-traces-endpoint",
            type=str,
            default="localhost:4317",
            help="Config opentelemetry collector endpoint if --enable-trace is set. format: <ip>:<port>",
        )

        # Control plane authentication
        auth_group.add_argument(
            f"--{prefix}api-key",
            type=str,
            default=None,
            help="The api key used for the authorization with the worker. Useful when the dp aware scheduling strategy is enabled.",
        )
        auth_group.add_argument(
            f"--{prefix}control-plane-api-keys",
            type=str,
            nargs="*",
            default=[],
            help="API keys for control plane authentication. Format: 'id:name:role:key' where role is 'admin' or 'user'. "
            "Example: --control-plane-api-keys 'key1:Service Account:admin:secret123' 'key2:Read Only:user:secret456'",
        )
        auth_group.add_argument(
            f"--{prefix}control-plane-audit-enabled",
            action="store_true",
            default=False,
            help="Enable audit logging for control plane operations",
        )
        auth_group.add_argument(
            f"--{prefix}jwt-issuer",
            type=str,
            default=None,
            help="OIDC issuer URL for JWT authentication (e.g., https://login.microsoftonline.com/{tenant}/v2.0)",
        )
        auth_group.add_argument(
            f"--{prefix}jwt-audience",
            type=str,
            default=None,
            help="Expected audience claim for JWT tokens (usually the client ID or API identifier)",
        )
        auth_group.add_argument(
            f"--{prefix}jwt-jwks-uri",
            type=str,
            default=None,
            help="Explicit JWKS URI. If not provided, discovered from issuer via .well-known/openid-configuration",
        )
        auth_group.add_argument(
            f"--{prefix}jwt-role-mapping",
            type=str,
            nargs="*",
            default=[],
            help="Mapping from IDP role/group names to gateway roles. Format: 'idp_role=gateway_role'. "
            "Example: --jwt-role-mapping 'Gateway.Admin=admin' 'Gateway.User=user'",
        )

    @classmethod
    def from_cli_args(
        cls, args: argparse.Namespace, use_router_prefix: bool = False
    ) -> "RouterArgs":
        """
        Build a RouterArgs instance from an argparse namespace.

        Each dataclass field is looked up in the namespace, first under its
        prefixed name and then under its bare name. Fields found under neither
        name are simply not passed to the constructor. Arguments whose CLI
        name or raw shape differs from the dataclass field are converted
        explicitly afterwards.

        Args:
            args: Parsed command line arguments
            use_router_prefix: If True, namespace entries are expected to carry
                a 'router_' prefix (bare names are still accepted as fallback
                for regular fields)

        Returns:
            A new RouterArgs built from the collected keyword arguments.
        """
        prefix = "router_" if use_router_prefix else ""
        namespace = vars(args)

        def raw(name, default=None):
            # Raw namespace value for a (possibly prefixed) CLI argument.
            return namespace.get(f"{prefix}{name}", default)

        # Copy every field present in the namespace; the prefixed spelling is
        # tried first and wins when both spellings exist.
        kwargs = {}
        for spec in dataclasses.fields(cls):
            for candidate in (f"{prefix}{spec.name}", spec.name):
                if candidate in namespace:
                    kwargs[spec.name] = namespace[candidate]
                    break

        # The server TLS options are named tls_cert_path / tls_key_path in the
        # namespace but are stored in server_cert_path / server_key_path.
        renamed = (
            ("tls_cert_path", "server_cert_path"),
            ("tls_key_path", "server_key_path"),
        )
        for cli_name, field_name in renamed:
            source_key = f"{prefix}{cli_name}"
            if source_key in namespace:
                kwargs[field_name] = namespace[source_key]

        # Arguments whose raw CLI shape must be converted before use.
        kwargs["prefill_urls"] = cls._parse_prefill_urls(raw("prefill"))
        kwargs["decode_urls"] = cls._parse_decode_urls(raw("decode"))
        kwargs["selector"] = cls._parse_selector(raw("selector"))
        kwargs["prefill_selector"] = cls._parse_selector(raw("prefill_selector"))
        kwargs["decode_selector"] = cls._parse_selector(raw("decode_selector"))

        # Mooncake-specific annotation
        kwargs["bootstrap_port_annotation"] = "sglang.ai/bootstrap-port"

        # Control plane authentication settings
        kwargs["control_plane_api_keys"] = cls._parse_control_plane_api_keys(
            raw("control_plane_api_keys", [])
        )
        kwargs["jwt_role_mapping"] = cls._parse_jwt_role_mapping(
            raw("jwt_role_mapping", [])
        )

        return cls(**kwargs)

    def _validate_router_args(self):
        # Validate configuration based on mode
        if self.pd_disaggregation:
            # Warn about policy usage in PD mode
            if self.prefill_policy and self.decode_policy and self.policy:
                logger.warning(
                    "Both --prefill-policy and --decode-policy are specified. "
                    "The main --policy flag will be ignored for PD mode."
                )
            elif self.prefill_policy and not self.decode_policy and self.policy:
                logger.info(
                    f"Using --prefill-policy '{self.prefill_policy}' for prefill nodes "
                    f"and --policy '{self.policy}' for decode nodes."
                )
            elif self.decode_policy and not self.prefill_policy and self.policy:
                logger.info(
                    f"Using --policy '{self.policy}' for prefill nodes "
                    f"and --decode-policy '{self.decode_policy}' for decode nodes."
                )

    @staticmethod
    def _parse_selector(selector_list):
        if not selector_list:
            return {}

        # Support `- --selector\n- a=b c=d` case
        if len(selector_list) == 1 and (" " in selector_list[0]):
            selector_list = selector_list[0].split(" ")

        selector = {}
        for item in selector_list:
            if "=" in item:
                key, value = item.split("=", 1)
                selector[key] = value
        return selector

    @staticmethod
    def _parse_prefill_urls(prefill_list):
        """Parse prefill URLs from --prefill arguments.

        Format: --prefill URL [BOOTSTRAP_PORT]
        Example:
            --prefill http://prefill1:8080 9000  # With bootstrap port
            --prefill http://prefill2:8080 none  # Explicitly no bootstrap port
            --prefill http://prefill3:8080       # Defaults to no bootstrap port
        """
        if not prefill_list:
            return []

        prefill_urls = []
        for prefill_args in prefill_list:

            url = prefill_args[0]

            # Handle optional bootstrap port
            if len(prefill_args) >= 2:
                bootstrap_port_str = prefill_args[1]
                # Handle 'none' as None
                if bootstrap_port_str.lower() == "none":
                    bootstrap_port = None
                else:
                    try:
                        bootstrap_port = int(bootstrap_port_str)
                    except ValueError:
                        raise ValueError(
                            f"Invalid bootstrap port: {bootstrap_port_str}. Must be a number or 'none'"
                        )
            else:
                # No bootstrap port specified, default to None
                bootstrap_port = None

            prefill_urls.append((url, bootstrap_port))

        return prefill_urls

    @staticmethod
    def _parse_decode_urls(decode_list):
        """Parse decode URLs from --decode arguments.

        Format: --decode URL
        Example: --decode http://decode1:8081 --decode http://decode2:8081
        """
        if not decode_list:
            return []

        # decode_list is a list of single-element lists due to nargs=1
        return [url[0] for url in decode_list]

    @staticmethod
    def _parse_control_plane_api_keys(api_keys_list):
        """Parse control plane API keys from --control-plane-api-keys arguments.

        Format: id:name:role:key
        Example: --control-plane-api-keys 'key1:Service Account:admin:secret123'
        """
        if not api_keys_list:
            return []

        parsed_keys = []
        for key_str in api_keys_list:
            parts = key_str.split(":", 3)  # Split into at most 4 parts
            if len(parts) != 4:
                raise ValueError(
                    f"Invalid API key format: '{key_str}'. Expected 'id:name:role:key'"
                )
            key_id, name, role, key = parts
            role_lower = role.lower()
            if role_lower not in ("admin", "user"):
                raise ValueError(f"Invalid role: '{role}'. Must be 'admin' or 'user'")
            parsed_keys.append((key_id, name, key, role_lower))
        return parsed_keys

    @staticmethod
    def _parse_jwt_role_mapping(role_mapping_list):
        """Parse JWT role mapping from --jwt-role-mapping arguments.

        Format: idp_role=gateway_role
        Example: --jwt-role-mapping 'Gateway.Admin=admin' 'Gateway.User=user'
        """
        if not role_mapping_list:
            return {}

        mapping = {}
        for mapping_str in role_mapping_list:
            if "=" not in mapping_str:
                raise ValueError(
                    f"Invalid role mapping format: '{mapping_str}'. Expected 'idp_role=gateway_role'"
                )
            idp_role, gateway_role = mapping_str.split("=", 1)
            gateway_role_lower = gateway_role.lower()
            if gateway_role_lower not in ("admin", "user"):
                raise ValueError(
                    f"Invalid gateway role: '{gateway_role}'. Must be 'admin' or 'user'"
                )
            mapping[idp_role] = gateway_role_lower
        return mapping