File size: 7,235 Bytes

#!/usr/bin/env python3
"""
Client for the Terminator vLLM server.

Supports single-prompt and multi-turn conversation modes with streaming
output. Thinking content is displayed in dimmed text; answer content in
normal text.

Usage:
    # Single prompt
    python client.py --prompt "What is the sum of the first 100 natural numbers?"

    # Interactive multi-turn conversation
    python client.py --interactive

    # Custom server URL and max tokens
    python client.py --base-url http://localhost:8001/v1 --max-tokens 8192 --prompt "Hello"
"""

import argparse
import sys

from openai import OpenAI


# ANSI escape codes
DIM = "\033[2m"
BOLD = "\033[1m"
RESET = "\033[0m"

BANNER_LINES = [
    r"████████╗███████╗██████╗ ███╗   ███╗██╗███╗   ██╗ █████╗ ████████╗ ██████╗ ██████╗ ",
    r"╚══██╔══╝██╔════╝██╔══██╗████╗ ████║██║████╗  ██║██╔══██╗╚══██╔══╝██╔═══██╗██╔══██╗",
    r"   ██║   █████╗  ██████╔╝██╔████╔██║██║██╔██╗ ██║███████║   ██║   ██║   ██║██████╔╝",
    r"   ██║   ██╔══╝  ██╔══██╗██║╚██╔╝██║██║██║╚██╗██║██╔══██║   ██║   ██║   ██║██╔══██╗",
    r"   ██║   ███████╗██║  ██║██║ ╚═╝ ██║██║██║ ╚████║██║  ██║   ██║   ╚██████╔╝██║  ██║",
    r"   ╚═╝   ╚══════╝╚═╝  ╚═╝╚═╝     ╚═╝╚═╝╚═╝  ╚═══╝╚═╝  ╚═╝   ╚═╝    ╚═════╝ ╚═╝  ╚═╝",
]

# Dark red -> light red gradient (one color per row)
_GRADIENT_RGB = [
    (140, 0, 0),
    (165, 15, 15),
    (190, 35, 35),
    (215, 55, 55),
    (235, 70, 70),
    (255, 90, 90),
]


def print_banner() -> None:
    for line, (r, g, b) in zip(BANNER_LINES, _GRADIENT_RGB):
        print(f"\033[38;2;{r};{g};{b}m{line}{RESET}")


def detect_model(client: OpenAI) -> str:
    """Auto-detect the served model name from the server."""
    try:
        models = client.models.list()
        if not models.data:
            print("ERROR: No models available on the server.", file=sys.stderr)
            sys.exit(1)
        return models.data[0].id
    except Exception as e:
        print(f"ERROR: Could not connect to server: {e}", file=sys.stderr)
        sys.exit(1)


def stream_response(
    client: OpenAI,
    model: str,
    messages: list[dict],
    max_tokens: int | None,
    temperature: float,
) -> str:
    """Stream a chat completion response.

    Thinking content is printed in dim text, answer content in normal text.
    Returns the assistant's answer content (for conversation history).
    """
    kwargs = dict(
        model=model,
        messages=messages,
        temperature=temperature,
        stream=True,
        extra_body={"chat_template_kwargs": {"enable_thinking": True}},
    )
    if max_tokens is not None:
        kwargs["max_tokens"] = max_tokens

    stream = client.chat.completions.create(**kwargs)

    in_thinking = False
    in_answer = False
    full_content = ""

    try:
        for chunk in stream:
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta

            reasoning = getattr(delta, "reasoning", None)
            if reasoning:
                if not in_thinking:
                    sys.stdout.write(f"\n{DIM}Thinking...\n")
                    in_thinking = True
                sys.stdout.write(reasoning)
                sys.stdout.flush()

            if delta.content:
                if not in_answer:
                    if in_thinking:
                        sys.stdout.write(RESET)
                    sys.stdout.write(f"\n{BOLD}Answer:{RESET}\n")
                    in_answer = True
                sys.stdout.write(delta.content)
                sys.stdout.flush()
                full_content += delta.content
    except KeyboardInterrupt:
        pass
    finally:
        sys.stdout.write(RESET)
        sys.stdout.flush()

    print()
    return full_content


def run_single(client, model, prompt, max_tokens, temperature):
    """Run a single prompt and exit."""
    messages = [{"role": "user", "content": prompt}]
    stream_response(client, model, messages, max_tokens, temperature)


def run_interactive(client, model, max_tokens, temperature):
    """Interactive multi-turn conversation loop."""
    messages = []
    print()
    print_banner()
    print()
    print(f" Connected to {BOLD}{model}{RESET}")
    print(f" Type your message and press Enter. Type {BOLD}quit{RESET} or Ctrl+C to exit.")
    print(f" {DIM}Note: interactive mode is single-line only. For multiline prompts,{RESET}")
    print(f" {DIM}      either flatten it to a single line or use:{RESET}")
    print(f" {DIM}      python client.py --prompt 'line one{RESET}")
    print(f" {DIM}      line two{RESET}")
    print(f" {DIM}      line three'{RESET}")
    print()

    while True:
        try:
            user_input = input(f"{BOLD}>>>{RESET} ")
        except (KeyboardInterrupt, EOFError):
            print("\nGoodbye!")
            break

        if user_input.strip().lower() in ("quit", "exit", "q"):
            print("Goodbye!")
            break

        if not user_input.strip():
            continue

        messages.append({"role": "user", "content": user_input})
        content = stream_response(client, model, messages, max_tokens, temperature)
        messages.append({"role": "assistant", "content": content})
        print()


def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    mode = parser.add_mutually_exclusive_group(required=True)
    mode.add_argument("--prompt", type=str, help="Single prompt to send")
    mode.add_argument(
        "--interactive", action="store_true",
        help="Start an interactive multi-turn conversation",
    )
    parser.add_argument(
        "--base-url", default="http://localhost:8000/v1",
        help="vLLM server URL (default: http://localhost:8000/v1)",
    )
    parser.add_argument(
        "--max-tokens", type=int, default=None,
        help="Maximum tokens to generate (default: server decides based on context length)",
    )
    parser.add_argument(
        "--temperature", type=float, default=0.6,
        help="Sampling temperature (default: 0.6)",
    )
    args = parser.parse_args()

    client = OpenAI(base_url=args.base_url, api_key="EMPTY")
    model = detect_model(client)

    if args.prompt:
        run_single(client, model, args.prompt, args.max_tokens, args.temperature)
    else:
        run_interactive(client, model, args.max_tokens, args.temperature)


if __name__ == "__main__":
    main()