#!/usr/bin/env python3
"""
Client for the Terminator vLLM server.

Supports single-prompt and multi-turn conversation modes with streaming output.
Thinking content is displayed in dimmed text; answer content in normal text.

Usage:
    # Single prompt
    python client.py --prompt "What is the sum of the first 100 natural numbers?"

    # Interactive multi-turn conversation
    python client.py --interactive

    # Custom server URL and max tokens
    python client.py --base-url http://localhost:8001/v1 --max-tokens 8192 --prompt "Hello"
"""

import argparse
import sys

from openai import OpenAI

# ANSI escape codes
DIM = "\033[2m"
BOLD = "\033[1m"
RESET = "\033[0m"

BANNER_LINES = [
    r"████████╗███████╗██████╗ ███╗ ███╗██╗███╗ ██╗ █████╗ ████████╗ ██████╗ ██████╗ ",
    r"╚══██╔══╝██╔════╝██╔══██╗████╗ ████║██║████╗ ██║██╔══██╗╚══██╔══╝██╔═══██╗██╔══██╗",
    r" ██║ █████╗ ██████╔╝██╔████╔██║██║██╔██╗ ██║███████║ ██║ ██║ ██║██████╔╝",
    r" ██║ ██╔══╝ ██╔══██╗██║╚██╔╝██║██║██║╚██╗██║██╔══██║ ██║ ██║ ██║██╔══██╗",
    r" ██║ ███████╗██║ ██║██║ ╚═╝ ██║██║██║ ╚████║██║ ██║ ██║ ╚██████╔╝██║ ██║",
    r" ╚═╝ ╚══════╝╚═╝ ╚═╝╚═╝ ╚═╝╚═╝╚═╝ ╚═══╝╚═╝ ╚═╝ ╚═╝ ╚═════╝ ╚═╝ ╚═╝",
]

# Dark red -> light red gradient (one color per row)
_GRADIENT_RGB = [
    (140, 0, 0),
    (165, 15, 15),
    (190, 35, 35),
    (215, 55, 55),
    (235, 70, 70),
    (255, 90, 90),
]


def print_banner() -> None:
    """Print the banner, one row per gradient color, using 24-bit ANSI colors."""
    for line, (r, g, b) in zip(BANNER_LINES, _GRADIENT_RGB):
        print(f"\033[38;2;{r};{g};{b}m{line}{RESET}")


def detect_model(client: OpenAI) -> str:
    """Auto-detect the served model name from the server.

    Returns the id of the first model the server lists. Prints an error to
    stderr and exits with status 1 if the model list cannot be fetched or
    is empty.
    """
    # Keep the try body to the one call that talks to the server, so the
    # "could not connect" message is only shown for failures of that call.
    try:
        models = client.models.list()
    except Exception as e:  # top-level CLI boundary: report and exit
        print(f"ERROR: Could not connect to server: {e}", file=sys.stderr)
        sys.exit(1)

    if not models.data:
        print("ERROR: No models available on the server.", file=sys.stderr)
        sys.exit(1)
    return models.data[0].id


def stream_response(
    client: OpenAI,
    model: str,
    messages: list[dict],
    max_tokens: int | None,
    temperature: float,
) -> str:
    """Stream a chat completion response.

    Thinking content is printed in dim text, answer content in normal text.
    Pressing Ctrl+C while streaming stops the output early; whatever answer
    text was received so far is still returned.

    Args:
        client: OpenAI client used to create the chat completion.
        model: Model name to request.
        messages: Conversation so far, as chat-completion message dicts.
        max_tokens: Generation limit; when None the parameter is not sent.
        temperature: Sampling temperature.

    Returns the assistant's answer content (for conversation history).
    """
    kwargs = dict(
        model=model,
        messages=messages,
        temperature=temperature,
        stream=True,
        extra_body={"chat_template_kwargs": {"enable_thinking": True}},
    )
    if max_tokens is not None:
        kwargs["max_tokens"] = max_tokens

    stream = client.chat.completions.create(**kwargs)

    in_thinking = False
    in_answer = False
    answer_parts: list[str] = []

    try:
        for chunk in stream:
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta

            # Prefer `reasoning`; fall back to `reasoning_content` for
            # servers that expose the thinking text under that name.
            reasoning = getattr(delta, "reasoning", None) or getattr(
                delta, "reasoning_content", None
            )
            if reasoning:
                if not in_thinking:
                    sys.stdout.write(f"\n{DIM}Thinking...\n")
                    in_thinking = True
                sys.stdout.write(reasoning)
                sys.stdout.flush()

            if delta.content:
                if not in_answer:
                    if in_thinking:
                        # Leave dim mode before printing the answer header.
                        sys.stdout.write(RESET)
                    sys.stdout.write(f"\n{BOLD}Answer:{RESET}\n")
                    in_answer = True
                sys.stdout.write(delta.content)
                sys.stdout.flush()
                answer_parts.append(delta.content)
    except KeyboardInterrupt:
        # Deliberate: Ctrl+C only cuts the current response short.
        pass
    finally:
        # Always restore terminal attributes, even on an error mid-stream.
        sys.stdout.write(RESET)
        sys.stdout.flush()
        # Release the HTTP connection if iteration stopped early. Guarded
        # because not every client version exposes close() on the stream.
        close = getattr(stream, "close", None)
        if callable(close):
            close()

    print()
    return "".join(answer_parts)


def run_single(
    client: OpenAI,
    model: str,
    prompt: str,
    max_tokens: int | None,
    temperature: float,
) -> None:
    """Run a single prompt and exit."""
    messages = [{"role": "user", "content": prompt}]
    stream_response(client, model, messages, max_tokens, temperature)


def run_interactive(
    client: OpenAI,
    model: str,
    max_tokens: int | None,
    temperature: float,
) -> None:
    """Interactive multi-turn conversation loop.

    Reads one line per turn, streams the reply, and keeps both in the
    message history sent with the next turn. Ends on quit/exit/q, Ctrl+C at
    the prompt, or end of input.
    """
    messages: list[dict] = []

    print()
    print_banner()
    print()
    print(f" Connected to {BOLD}{model}{RESET}")
    print(f" Type your message and press Enter. Type {BOLD}quit{RESET} or Ctrl+C to exit.")
    print(f" {DIM}Note: interactive mode is single-line only. For multiline prompts,{RESET}")
    print(f" {DIM} either flatten it to a single line or use:{RESET}")
    print(f" {DIM} python client.py --prompt 'line one{RESET}")
    print(f" {DIM} line two{RESET}")
    print(f" {DIM} line three'{RESET}")
    print()

    while True:
        try:
            user_input = input(f"{BOLD}>>>{RESET} ")
        except (KeyboardInterrupt, EOFError):
            print("\nGoodbye!")
            break

        stripped = user_input.strip()
        if stripped.lower() in ("quit", "exit", "q"):
            print("Goodbye!")
            break
        if not stripped:
            continue

        # The unstripped input is what gets sent, as typed by the user.
        messages.append({"role": "user", "content": user_input})
        content = stream_response(client, model, messages, max_tokens, temperature)
        messages.append({"role": "assistant", "content": content})
        print()


def main() -> None:
    """Parse command-line arguments and run the selected mode."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    mode = parser.add_mutually_exclusive_group(required=True)
    mode.add_argument("--prompt", type=str, help="Single prompt to send")
    mode.add_argument(
        "--interactive",
        action="store_true",
        help="Start an interactive multi-turn conversation",
    )
    parser.add_argument(
        "--base-url",
        default="http://localhost:8000/v1",
        help="vLLM server URL (default: http://localhost:8000/v1)",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=None,
        help="Maximum tokens to generate (default: server decides based on context length)",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.6,
        help="Sampling temperature (default: 0.6)",
    )
    args = parser.parse_args()

    client = OpenAI(base_url=args.base_url, api_key="EMPTY")
    model = detect_model(client)

    # Dispatch on the flag, not on the truthiness of the prompt: an explicit
    # `--prompt ""` is falsy and must not fall through to interactive mode.
    if args.interactive:
        run_interactive(client, model, args.max_tokens, args.temperature)
    else:
        run_single(client, model, args.prompt, args.max_tokens, args.temperature)


if __name__ == "__main__":
    main()