#!/usr/bin/env python3
"""
Client for the Terminator vLLM server.
Supports single-prompt and multi-turn conversation modes with streaming
output. Thinking content is displayed in dimmed text; answer content in
normal text.
Usage:
# Single prompt
python client.py --prompt "What is the sum of the first 100 natural numbers?"
# Interactive multi-turn conversation
python client.py --interactive
# Custom server URL and max tokens
python client.py --base-url http://localhost:8001/v1 --max-tokens 8192 --prompt "Hello"
"""
import argparse
import sys

from openai import OpenAI

# ANSI escape codes
DIM = "\033[2m"
BOLD = "\033[1m"
RESET = "\033[0m"

BANNER_LINES = [
    r"β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ•—β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ•—β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ•— β–ˆβ–ˆβ–ˆβ•— β–ˆβ–ˆβ–ˆβ•—β–ˆβ–ˆβ•—β–ˆβ–ˆβ–ˆβ•— β–ˆβ–ˆβ•— β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ•— β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ•— β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ•— β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ•— ",
    r"β•šβ•β•β–ˆβ–ˆβ•”β•β•β•β–ˆβ–ˆβ•”β•β•β•β•β•β–ˆβ–ˆβ•”β•β•β–ˆβ–ˆβ•—β–ˆβ–ˆβ–ˆβ–ˆβ•— β–ˆβ–ˆβ–ˆβ–ˆβ•‘β–ˆβ–ˆβ•‘β–ˆβ–ˆβ–ˆβ–ˆβ•— β–ˆβ–ˆβ•‘β–ˆβ–ˆβ•”β•β•β–ˆβ–ˆβ•—β•šβ•β•β–ˆβ–ˆβ•”β•β•β•β–ˆβ–ˆβ•”β•β•β•β–ˆβ–ˆβ•—β–ˆβ–ˆβ•”β•β•β–ˆβ–ˆβ•—",
    r" β–ˆβ–ˆβ•‘ β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ•— β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ•”β•β–ˆβ–ˆβ•”β–ˆβ–ˆβ–ˆβ–ˆβ•”β–ˆβ–ˆβ•‘β–ˆβ–ˆβ•‘β–ˆβ–ˆβ•”β–ˆβ–ˆβ•— β–ˆβ–ˆβ•‘β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ•‘ β–ˆβ–ˆβ•‘ β–ˆβ–ˆβ•‘ β–ˆβ–ˆβ•‘β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ•”β•",
    r" β–ˆβ–ˆβ•‘ β–ˆβ–ˆβ•”β•β•β• β–ˆβ–ˆβ•”β•β•β–ˆβ–ˆβ•—β–ˆβ–ˆβ•‘β•šβ–ˆβ–ˆβ•”β•β–ˆβ–ˆβ•‘β–ˆβ–ˆβ•‘β–ˆβ–ˆβ•‘β•šβ–ˆβ–ˆβ•—β–ˆβ–ˆβ•‘β–ˆβ–ˆβ•”β•β•β–ˆβ–ˆβ•‘ β–ˆβ–ˆβ•‘ β–ˆβ–ˆβ•‘ β–ˆβ–ˆβ•‘β–ˆβ–ˆβ•”β•β•β–ˆβ–ˆβ•—",
    r" β–ˆβ–ˆβ•‘ β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ•—β–ˆβ–ˆβ•‘ β–ˆβ–ˆβ•‘β–ˆβ–ˆβ•‘ β•šβ•β• β–ˆβ–ˆβ•‘β–ˆβ–ˆβ•‘β–ˆβ–ˆβ•‘ β•šβ–ˆβ–ˆβ–ˆβ–ˆβ•‘β–ˆβ–ˆβ•‘ β–ˆβ–ˆβ•‘ β–ˆβ–ˆβ•‘ β•šβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ•”β•β–ˆβ–ˆβ•‘ β–ˆβ–ˆβ•‘",
    r" β•šβ•β• β•šβ•β•β•β•β•β•β•β•šβ•β• β•šβ•β•β•šβ•β• β•šβ•β•β•šβ•β•β•šβ•β• β•šβ•β•β•β•β•šβ•β• β•šβ•β• β•šβ•β• β•šβ•β•β•β•β•β• β•šβ•β• β•šβ•β•",
]

# Dark red -> light red gradient (one color per row)
_GRADIENT_RGB = [
    (140, 0, 0),
    (165, 15, 15),
    (190, 35, 35),
    (215, 55, 55),
    (235, 70, 70),
    (255, 90, 90),
]
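
# print_banner() uses 24-bit ANSI color: ESC[38;2;<r>;<g>;<b>m sets the
# foreground to an RGB triple. Terminals without truecolor support may
# approximate or ignore these codes.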
def print_banner() -> None:
    for line, (r, g, b) in zip(BANNER_LINES, _GRADIENT_RGB):
        print(f"\033[38;2;{r};{g};{b}m{line}{RESET}")


def detect_model(client: OpenAI) -> str:
    """Auto-detect the served model name from the server."""
    try:
        models = client.models.list()
        if not models.data:
            print("ERROR: No models available on the server.", file=sys.stderr)
            sys.exit(1)
        return models.data[0].id
    except Exception as e:
        print(f"ERROR: Could not connect to server: {e}", file=sys.stderr)
        sys.exit(1)


def stream_response(
    client: OpenAI,
    model: str,
    messages: list[dict],
    max_tokens: int | None,
    temperature: float,
) -> str:
    """Stream a chat completion response.

    Thinking content is printed in dim text, answer content in normal text.
    Returns the assistant's answer content (for conversation history).
    """
    kwargs = dict(
        model=model,
        messages=messages,
        temperature=temperature,
        stream=True,
        extra_body={"chat_template_kwargs": {"enable_thinking": True}},
    )
    if max_tokens is not None:
        kwargs["max_tokens"] = max_tokens

    stream = client.chat.completions.create(**kwargs)
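
    # Two flags track which part of the response we are in, so the
    # "Thinking..." and "Answer:" headers are each printed exactly once.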
    in_thinking = False
    in_answer = False
    full_content = ""

    try:
        for chunk in stream:
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta
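
            # Reasoning tokens arrive on a non-standard delta field; getattr
            # keeps this safe when the server emits none. (Some reasoning
            # parsers name this field reasoning_content instead.)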
            reasoning = getattr(delta, "reasoning", None)
            if reasoning:
                if not in_thinking:
                    sys.stdout.write(f"\n{DIM}Thinking...\n")
                    in_thinking = True
                sys.stdout.write(reasoning)
                sys.stdout.flush()

            if delta.content:
                if not in_answer:
                    if in_thinking:
                        sys.stdout.write(RESET)
                    sys.stdout.write(f"\n{BOLD}Answer:{RESET}\n")
                    in_answer = True
                sys.stdout.write(delta.content)
                sys.stdout.flush()
                full_content += delta.content
    except KeyboardInterrupt:
        pass
    finally:
        sys.stdout.write(RESET)
        sys.stdout.flush()
        print()

    return full_content


def run_single(client, model, prompt, max_tokens, temperature):
    """Run a single prompt and exit."""
    messages = [{"role": "user", "content": prompt}]
    stream_response(client, model, messages, max_tokens, temperature)


def run_interactive(client, model, max_tokens, temperature):
    """Interactive multi-turn conversation loop."""
    messages = []

    print()
    print_banner()
    print()
    print(f" Connected to {BOLD}{model}{RESET}")
    print(f" Type your message and press Enter. Type {BOLD}quit{RESET} or Ctrl+C to exit.")
    print(f" {DIM}Note: interactive mode is single-line only. For multiline prompts,{RESET}")
    print(f" {DIM} either flatten it to a single line or use:{RESET}")
    print(f" {DIM} python client.py --prompt 'line one{RESET}")
    print(f" {DIM} line two{RESET}")
    print(f" {DIM} line three'{RESET}")
    print()

    while True:
        try:
            user_input = input(f"{BOLD}>>>{RESET} ")
        except (KeyboardInterrupt, EOFError):
            print("\nGoodbye!")
            break
        if user_input.strip().lower() in ("quit", "exit", "q"):
            print("Goodbye!")
            break
        if not user_input.strip():
            continue
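
        # Only the final answer (not the thinking content) is stored in the
        # history; Qwen3-style chat templates expect prior turns without
        # reasoning traces.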
        messages.append({"role": "user", "content": user_input})
        content = stream_response(client, model, messages, max_tokens, temperature)
        messages.append({"role": "assistant", "content": content})
        print()


def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    mode = parser.add_mutually_exclusive_group(required=True)
    mode.add_argument("--prompt", type=str, help="Single prompt to send")
    mode.add_argument(
        "--interactive", action="store_true",
        help="Start an interactive multi-turn conversation",
    )
    parser.add_argument(
        "--base-url", default="http://localhost:8000/v1",
        help="vLLM server URL (default: http://localhost:8000/v1)",
    )
    parser.add_argument(
        "--max-tokens", type=int, default=None,
        help="Maximum tokens to generate (default: server decides based on context length)",
    )
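    # 0.6 matches the sampling temperature the Qwen3 model card recommends
    # for thinking mode (assumption: that guidance carries over to this
    # fine-tune).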
    parser.add_argument(
        "--temperature", type=float, default=0.6,
        help="Sampling temperature (default: 0.6)",
    )
    args = parser.parse_args()
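
    # The OpenAI client requires a non-empty api_key even though a default
    # vLLM deployment does not check it; "EMPTY" is the conventional placeholder.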
    client = OpenAI(base_url=args.base_url, api_key="EMPTY")
    model = detect_model(client)

    if args.prompt:
        run_single(client, model, args.prompt, args.max_tokens, args.temperature)
    else:
        run_interactive(client, model, args.max_tokens, args.temperature)


if __name__ == "__main__":
    main()