acnagle committed (verified)
Commit aa7a04b · 1 Parent(s): 21733ca

Upload folder using huggingface_hub

inference_hf.py ADDED
@@ -0,0 +1,383 @@
+ #!/usr/bin/env python3
+ """
+ HuggingFace-native inference for Terminator-Qwen3-8B.
+
+ Loads the frozen Qwen3 base model + trained Terminator head (FFN + optional
+ extra transformer layers) directly via HuggingFace transformers.
+
+ Generates chain-of-thought reasoning token by token. The Terminator FFN
+ predicts when the final answer has been reached; when a sliding-window
+ majority vote exceeds the threshold, an exit message is injected and the
+ model transitions to answering mode.
+
+ Usage:
+     python inference_hf.py --prompt "What is the sum of the first 100 natural numbers?"
+
+     python inference_hf.py \\
+         --prompt "Solve x^2 - 5x + 6 = 0" \\
+         --model Qwen/Qwen3-8B \\
+         --checkpoint terminator.pt \\
+         --threshold 0.7 --window-size 10
+ """
+
+ import argparse
+ import os
+ import sys
+ from pathlib import Path
+
+ import torch
+ import torch.nn.functional as F
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from transformers import TopKLogitsWarper, TopPLogitsWarper, TemperatureLogitsWarper
+ from transformers.generation.logits_process import LogitsProcessorList
+
+ # ---------------------------------------------------------------------------
+ # Imports from the project
+ # ---------------------------------------------------------------------------
+
+ # Local: TerminatorFFN + checkpoint loader
+ _script_dir = Path(__file__).resolve().parent
+ sys.path.insert(0, str(_script_dir))
+ from vllm_terminator.terminator_head import load_terminator_checkpoint
+
+ # Parent dir: ExtraTransformerLayers from terminator_utils
+ _repo_root = _script_dir.parent
+ sys.path.insert(0, str(_repo_root))
+ from terminator_utils import ExtraTransformerLayers
+
+ # ---------------------------------------------------------------------------
+ # ANSI escape codes
+ # ---------------------------------------------------------------------------
+ DIM = "\033[2m"
+ BOLD = "\033[1m"
+ RESET = "\033[0m"
+
+
+ def load_model_and_tokenizer(model_name, device):
+     """Load base Qwen3 model and tokenizer."""
+     print(f"Loading tokenizer: {model_name}")
+     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     think_token_id = tokenizer.convert_tokens_to_ids("<think>")
+     think_end_token_id = tokenizer.convert_tokens_to_ids("</think>")
+     if think_token_id == tokenizer.unk_token_id or think_end_token_id == tokenizer.unk_token_id:
+         raise ValueError(
+             f"<think>/</think> tokens not in tokenizer! "
+             f"IDs: {think_token_id}, {think_end_token_id}"
+         )
+
+     print(f"Loading model: {model_name}")
+     model = AutoModelForCausalLM.from_pretrained(
+         model_name,
+         torch_dtype=torch.bfloat16,
+         device_map={"": device},
+         trust_remote_code=True,
+     )
+     for param in model.parameters():
+         param.requires_grad = False
+     model.eval()
+
+     print(
+         f"Model loaded: {model.config.num_hidden_layers} layers, "
+         f"hidden size {model.config.hidden_size}"
+     )
+     return model, tokenizer, think_token_id, think_end_token_id
+
+
+ def build_extra_layers(base_model, checkpoint_config, extra_layers_state_dict, device):
+     """Reconstruct extra transformer layers from checkpoint state dict."""
+     num_extra_layers = checkpoint_config.get("num_extra_layers", 0)
+     if num_extra_layers == 0 or extra_layers_state_dict is None:
+         return None
+
+     print(f"Reconstructing {num_extra_layers} extra transformer layer(s)...")
+     base_layer_class = base_model.model.layers[0].__class__
+     model_config = base_model.config
+     rotary_emb = getattr(base_model.model, "rotary_emb", None)
+
+     extra_layers = ExtraTransformerLayers(
+         base_layer_class, num_extra_layers, model_config, rotary_emb=rotary_emb
+     ).to(device)
+     extra_layers.load_state_dict(extra_layers_state_dict)
+     extra_layers.eval()
+
+     param_count = sum(p.numel() for p in extra_layers.parameters())
+     print(f"Extra layers loaded ({param_count:,} parameters)")
+     return extra_layers
+
+
+ def generate_with_terminator(
+     prompt,
+     model,
+     tokenizer,
+     ffn,
+     extra_layers,
+     layer_idx,
+     think_token_id,
+     think_end_token_id,
+     threshold,
+     window_size,
+     exit_message,
+     max_tokens,
+     temperature,
+     device,
+ ):
+     """Generate a response with Terminator early-exit logic.
+
+     Follows the same generation pattern as inference_terminator.py:mode1_generate().
+     Streams thinking tokens to the terminal as they are produced.
+     """
+     # Format prompt via chat template
+     messages = [{"role": "user", "content": prompt}]
+     prompt_text = tokenizer.apply_chat_template(
+         messages, tokenize=False, add_generation_prompt=True
+     )
+
+     # Tokenize and append <think>
+     prompt_ids = tokenizer(
+         prompt_text, add_special_tokens=False, return_tensors="pt"
+     )["input_ids"].to(device).long()
+
+     input_ids = torch.cat(
+         [prompt_ids, torch.tensor([[think_token_id]], dtype=torch.long, device=device)],
+         dim=1,
+     )
+
+     # Sampling processors
+     logits_processor = LogitsProcessorList([
+         TemperatureLogitsWarper(temperature=temperature),
+         TopKLogitsWarper(top_k=20),
+         TopPLogitsWarper(top_p=0.95),
+     ])
+
+     # Sliding-window state
+     predictions_list = []
+     reasoning_tokens = []
+     early_exit = False
+
+     # Start streaming thinking output
+     sys.stdout.write(f"\n{DIM}Thinking...\n")
+     sys.stdout.flush()
+
+     for step in range(max_tokens):
+         attention_mask = torch.ones_like(input_ids)
+
+         # Hook to capture hidden states from the target layer
+         captured = {}
+
+         def hook_fn(module, input, output):
+             if isinstance(output, tuple):
+                 captured["hidden"] = output[0].detach()
+             else:
+                 captured["hidden"] = output.detach()
+
+         target_layer = model.model.layers[layer_idx]
+         handle = target_layer.register_forward_hook(hook_fn)
+
+         with torch.no_grad():
+             outputs = model(
+                 input_ids=input_ids,
+                 attention_mask=attention_mask,
+                 use_cache=False,
+             )
+
+         handle.remove()
+
+         hidden_states = captured["hidden"]  # [1, seq_len, hidden_size]
+
+         # Make prediction once we have at least one thinking token
+         if len(reasoning_tokens) > 0:
+             if extra_layers is not None:
+                 h = hidden_states.float()
+                 h = extra_layers(h, attention_mask=attention_mask)
+                 last_h = h[:, -1:, :]
+                 logits_pred = ffn(last_h.float())
+             else:
+                 last_h = hidden_states[:, -1:, :]
+                 logits_pred = ffn(last_h.float())
+
+             pred = torch.sigmoid(logits_pred)
+             predictions_list.append(pred[0, 0].item())
+
+             # Sliding-window majority vote
+             if len(predictions_list) >= window_size:
+                 window = predictions_list[-window_size:]
+                 n_above = sum(1 for p in window if p > threshold)
+                 if n_above / window_size > 0.5:
+                     early_exit = True
+                     break
+
+         # Sample next token — LogitsProcessorList expects 2D [batch, vocab]
+         next_logits = outputs.logits[:, -1, :]  # [1, vocab_size]
+         next_logits = logits_processor(input_ids, next_logits)
+         probs = F.softmax(next_logits, dim=-1)
+         next_token = torch.multinomial(probs, num_samples=1)  # [1, 1]
+
+         # Natural </think>
+         if next_token.item() == think_end_token_id:
+             break
+
+         input_ids = torch.cat([input_ids, next_token], dim=1)
+         reasoning_tokens.append(next_token.item())
+
+         # Stream the token
+         token_text = tokenizer.decode([next_token.item()], skip_special_tokens=False)
+         sys.stdout.write(token_text)
+         sys.stdout.flush()
+
+     # End thinking section
+     if early_exit and exit_message:
+         sys.stdout.write(exit_message)
+     sys.stdout.write(f"{RESET}\n")
+     sys.stdout.flush()
+
+     # Build input for final answer generation
+     if early_exit and exit_message:
+         exit_ids = tokenizer(
+             exit_message, add_special_tokens=False, return_tensors="pt"
+         )["input_ids"].to(device).long()
+         input_ids = torch.cat(
+             [input_ids, exit_ids,
+              torch.tensor([[think_end_token_id]], dtype=torch.long, device=device)],
+             dim=1,
+         )
+     else:
+         input_ids = torch.cat(
+             [input_ids,
+              torch.tensor([[think_end_token_id]], dtype=torch.long, device=device)],
+             dim=1,
+         )
+
+     # Generate final answer
+     attention_mask = torch.ones_like(input_ids)
+     with torch.no_grad():
+         final_outputs = model.generate(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             max_new_tokens=max_tokens,
+             do_sample=True,
+             temperature=temperature,
+             top_p=0.95,
+             top_k=20,
+             pad_token_id=tokenizer.pad_token_id,
+             eos_token_id=tokenizer.eos_token_id,
+         )
+
+     # Extract answer (everything after the last </think>)
+     full_seq = final_outputs[0]
+     end_positions = (full_seq == think_end_token_id).nonzero(as_tuple=True)[0]
+     if len(end_positions) > 0:
+         answer_tokens = full_seq[end_positions[-1].item() + 1 :]
+         answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
+     else:
+         answer = ""
+
+     # Print answer
+     sys.stdout.write(f"{BOLD}Answer:{RESET}\n{answer}\n")
+     sys.stdout.flush()
+
+     # Summary
+     n_reasoning = len(reasoning_tokens)
+     exit_reason = "predictor" if early_exit else "natural_end"
+     print(
+         f"\n{DIM}[{exit_reason} | "
+         f"{n_reasoning} thinking tokens | "
+         f"{len(predictions_list)} predictions]{RESET}"
+     )
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description=__doc__,
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+     )
+     parser.add_argument("--prompt", type=str, required=True, help="Input prompt")
+     parser.add_argument(
+         "--model", type=str, default="Qwen/Qwen3-8B", help="HuggingFace model name"
+     )
+     parser.add_argument(
+         "--checkpoint",
+         type=str,
+         default=None,
+         help="Path to terminator .pt checkpoint (default: ./terminator.pt)",
+     )
+     parser.add_argument(
+         "--threshold", type=float, default=0.7, help="Per-prediction binarization threshold"
+     )
+     parser.add_argument(
+         "--window-size", type=int, default=10, help="Sliding-window size for majority vote"
+     )
+     parser.add_argument(
+         "--exit-message",
+         type=str,
+         default="\nI've run out of thinking tokens. I need to commit to a final answer.",
+         help="Message injected when terminator fires (empty string to disable)",
+     )
+     parser.add_argument(
+         "--max-tokens", type=int, default=32768, help="Max tokens to generate"
+     )
+     parser.add_argument(
+         "--temperature", type=float, default=0.6, help="Sampling temperature"
+     )
+     parser.add_argument(
+         "--device", type=str, default="cuda", help="Device (default: cuda)"
+     )
+     args = parser.parse_args()
+
+     # Resolve checkpoint path
+     if args.checkpoint is None:
+         args.checkpoint = str(_script_dir / "terminator.pt")
+
+     if not Path(args.checkpoint).exists():
+         print(f"ERROR: Checkpoint not found: {args.checkpoint}", file=sys.stderr)
+         sys.exit(1)
+
+     # Handle empty exit message
+     if args.exit_message == "":
+         args.exit_message = None
+
+     device = torch.device(args.device if torch.cuda.is_available() else "cpu")
+
+     # Load base model
+     model, tokenizer, think_id, think_end_id = load_model_and_tokenizer(
+         args.model, device
+     )
+
+     # Load terminator checkpoint
+     rms_eps = getattr(model.config, "rms_norm_eps", 1e-6)
+     ffn, ckpt_config, layer_idx, num_extra_layers, extra_sd = load_terminator_checkpoint(
+         args.checkpoint, rms_norm_eps=rms_eps, device=device
+     )
+     ffn_params = sum(p.numel() for p in ffn.parameters())
+     print(
+         f"Terminator FFN loaded (layer_idx={layer_idx}, "
+         f"threshold={args.threshold}, window={args.window_size}, "
+         f"params={ffn_params:,})"
+     )
+
+     # Extra layers
+     extra_layers = build_extra_layers(model, ckpt_config, extra_sd, device)
+
+     # Generate
+     generate_with_terminator(
+         prompt=args.prompt,
+         model=model,
+         tokenizer=tokenizer,
+         ffn=ffn,
+         extra_layers=extra_layers,
+         layer_idx=layer_idx,
+         think_token_id=think_id,
+         think_end_token_id=think_end_id,
+         threshold=args.threshold,
+         window_size=args.window_size,
+         exit_message=args.exit_message,
+         max_tokens=args.max_tokens,
+         temperature=args.temperature,
+         device=device,
+     )
+
+
+ if __name__ == "__main__":
+     main()
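
For reference, the early-exit rule implemented above reduces to a small pure function: exit fires once more than half of the last window_size per-token terminator probabilities exceed threshold. A minimal standalone sketch (the function name and example values here are illustrative, not part of the repo):

from collections import deque

def should_exit(probs, threshold=0.7, window_size=10):
    """Sliding-window majority vote over terminator probabilities."""
    window = deque(maxlen=window_size)
    for p in probs:
        window.append(p)
        # Vote only once the window is full, mirroring generate_with_terminator().
        if len(window) == window_size:
            n_above = sum(1 for q in window if q > threshold)
            if n_above / window_size > 0.5:
                return True
    return False

# Nine low-confidence predictions followed by a confident run:
print(should_exit([0.1] * 9 + [0.9] * 6))  # True once 6 of the last 10 exceed 0.7

With the defaults (threshold 0.7, window size 10), at least six of the ten most recent predictions must exceed the threshold before the exit message is injected.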
serve.py ADDED
@@ -0,0 +1,95 @@
+ #!/usr/bin/env python3
+ """
+ vLLM API server launcher for Qwen3TerminatorForCausalLM.
+
+ Imports vllm_terminator BEFORE vLLM initialises, which registers
+ Qwen3TerminatorForCausalLM with vLLM's ModelRegistry.
+
+ NOTE: Terminator currently supports single-GPU, single-sequence inference only.
+ Tensor parallelism and concurrent sequences are not supported.
+
+ Environment variables:
+     VLLM_MODEL          — path to terminator model directory (required)
+     VLLM_PORT           — port (default 8000)
+     VLLM_GPU_UTIL       — GPU memory fraction (default 0.90)
+     VLLM_MAX_MODEL_LEN  — max context length
+     VLLM_DTYPE          — dtype (default "auto")
+     VLLM_API_KEY        — require this API key from clients
+     VLLM_SERVED_NAME    — override served model name
+     VLLM_HOST           — bind address (default 0.0.0.0)
+     NO_PREFIX_CACHING   — set to 1 to disable prefix caching
+     VLLM_ENFORCE_EAGER  — set to 1 to disable CUDA graphs (default 0)
+     REASONING_PARSER    — set to "qwen3" to enable <think>/</think> parsing
+                           (splits reasoning_content from content in API responses)
+
+ Example:
+     VLLM_MODEL=./model_dir python serve.py
+ """
+
+ import os
+ import runpy
+ import sys
+
+ # -----------------------------------------------------------------------
+ # CRITICAL: import vllm_terminator HERE, before any vLLM code runs.
+ # This registers Qwen3TerminatorForCausalLM with vLLM's ModelRegistry.
+ # -----------------------------------------------------------------------
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+ import vllm_terminator  # noqa: F401 (registers the model as a side effect)
+
+
+ def env(name, default=None, required=False):
+     v = os.environ.get(name, default)
+     if required and (v is None or v == ""):
+         print(f"Missing required env var: {name}", file=sys.stderr)
+         sys.exit(2)
+     return v
+
+
+ def main():
+     model = env("VLLM_MODEL", required=True)
+     host = env("VLLM_HOST", "0.0.0.0")
+     port = env("VLLM_PORT", "8000")
+     max_len = env("VLLM_MAX_MODEL_LEN", None)
+     gpu_util = env("VLLM_GPU_UTIL", "0.90")
+     served_name = env("VLLM_SERVED_NAME", None)
+     dtype = env("VLLM_DTYPE", "auto")
+     api_key = env("VLLM_API_KEY", None)
+     no_prefix_caching = env("NO_PREFIX_CACHING", "0")
+     enforce_eager = env("VLLM_ENFORCE_EAGER", "0")
+     reasoning_parser = env("REASONING_PARSER", None)
+
+     argv = [
+         "vllm.entrypoints.openai.api_server",
+         "--model", model,
+         "--host", host,
+         "--port", str(port),
+         "--dtype", dtype,
+         "--gpu-memory-utilization", str(gpu_util),
+         "--tensor-parallel-size", "1",
+         "--max-num-seqs", "1",
+     ]
+
+     if served_name:
+         argv += ["--served-model-name", served_name]
+     if max_len:
+         argv += ["--max-model-len", str(max_len)]
+     if api_key:
+         argv += ["--api-key", api_key]
+     if no_prefix_caching == "1":
+         argv += ["--enable-prefix-caching", "False"]
+     if enforce_eager == "1":
+         argv += ["--enforce-eager"]
+     if reasoning_parser:
+         argv += ["--reasoning-parser", reasoning_parser]
+
+     print("Launching vLLM Terminator server with:\n  " + " ".join(argv[1:]), flush=True)
+
+     # Replace sys.argv so vLLM's argparse sees these arguments, then run the
+     # server module in-process (so the vllm_terminator registration persists).
+     sys.argv = argv
+     runpy.run_module("vllm.entrypoints.openai.api_server", run_name="__main__")
+
+
+ if __name__ == "__main__":
+     main()
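
Once the server is up it speaks the standard OpenAI chat-completions protocol, so any OpenAI-compatible client works. A minimal sketch using the openai package that setup.sh installs (the base URL, port, model name, and api_key value below assume this repo's defaults; reasoning_content is a vLLM extension populated only when REASONING_PARSER=qwen3 is set):

from openai import OpenAI

# Assumes serve.py is running locally on the default port with no API key configured.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="Terminator-Qwen3-8B",  # matches the VLLM_SERVED_NAME default in start_server.sh
    messages=[{"role": "user", "content": "Solve x^2 - 5x + 6 = 0"}],
    temperature=0.6,
)

msg = resp.choices[0].message
# With the qwen3 reasoning parser, vLLM splits <think>...</think> into reasoning_content.
print("reasoning:", getattr(msg, "reasoning_content", None))
print("answer:", msg.content)

If VLLM_API_KEY is set on the server, pass the same value as api_key instead of the placeholder.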
setup.sh CHANGED
@@ -86,6 +86,9 @@ uv pip install vllm --torch-backend=auto
  echo " Installing openai (for client)..."
  uv pip install openai
 
+ echo " Installing accelerate (for HF inference)..."
+ uv pip install accelerate
+
  echo " Done."
 
  # ------------------------------------------------------------------
setup_model_dir.py CHANGED
@@ -121,7 +121,7 @@ def main():
  print(f"\nTo start the server:")
  print(f" ./start_server.sh")
  print(f"\nOr manually:")
- print(f" VLLM_MODEL={out_dir} REASONING_PARSER=qwen3 python host_terminator.py")
+ print(f" VLLM_MODEL={out_dir} REASONING_PARSER=qwen3 python serve.py")
 
 
  if __name__ == "__main__":
start_server.sh CHANGED
@@ -10,24 +10,20 @@ set -euo pipefail
  # Configuration (set as environment variables before running):
  #
  # VLLM_GPU_UTIL GPU memory fraction to use (default: 0.90)
- # - 80GB GPU (A100/H100): 0.90
- # - 48GB GPU (A6000/L40): 0.85
- # - 24GB GPU (4090/A5000): 0.70
  #
  # VLLM_MAX_MODEL_LEN Maximum context length in tokens (default: server picks)
- # - 80GB GPU: 32768
- # - 48GB GPU: 16384
- # - 24GB GPU: 4096 - 8192
  #
  # VLLM_PORT Server port (default: 8000)
  #
  # VLLM_ENFORCE_EAGER Set to 1 to disable CUDA graphs (default: 0)
  # Use if you encounter CUDA graph compilation errors.
+ # NOTE: VLLM_ENFORCE_EAGER=1 will result in slower responses
  #
  # VLLM_API_KEY Require this API key from clients (default: none)
  #
  # Usage:
  # ./start_server.sh
+ # or to manually override default environment variables:
  # VLLM_GPU_UTIL=0.70 VLLM_MAX_MODEL_LEN=8192 ./start_server.sh
  # ==========================================================================

@@ -49,4 +45,4 @@
  export REASONING_PARSER="${REASONING_PARSER:-qwen3}"
  export VLLM_SERVED_NAME="${VLLM_SERVED_NAME:-Terminator-Qwen3-8B}"
 
- exec python "$SCRIPT_DIR/host_terminator.py"
+ exec python "$SCRIPT_DIR/serve.py"
vllm_terminator/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (731 Bytes).
 
vllm_terminator/__pycache__/terminator_head.cpython-312.pyc ADDED
Binary file (6.8 kB).