| | |
| | """ |
| | Client for the Terminator vLLM server. |
| | |
| | Supports single-prompt and multi-turn conversation modes with streaming |
| | output. Thinking content is displayed in dimmed text; answer content in |
| | normal text. |
| | |
| | Usage: |
| | # Single prompt |
| | python client.py --prompt "What is the sum of the first 100 natural numbers?" |
| | |
| | # Interactive multi-turn conversation |
| | python client.py --interactive |
| | |
| | # Custom server URL and max tokens |
| | python client.py --base-url http://localhost:8001/v1 --max-tokens 8192 --prompt "Hello" |
| | """ |
| |
|
| | import argparse |
| | import sys |
| |
|
| | from openai import OpenAI |
| |
|
| |
|
| | |
| | DIM = "\033[2m" |
| | BOLD = "\033[1m" |
| | RESET = "\033[0m" |
| |
|
| | BANNER_LINES = [ |
| | r"ββββββββββββββββββββββββ ββββ βββββββββββ βββ ββββββ βββββββββ βββββββ βββββββ ", |
| | r"ββββββββββββββββββββββββββββββ βββββββββββββ βββββββββββββββββββββββββββββββββββββ", |
| | r" βββ ββββββ ββββββββββββββββββββββββββββ βββββββββββ βββ βββ βββββββββββ", |
| | r" βββ ββββββ ββββββββββββββββββββββββββββββββββββββββ βββ βββ βββββββββββ", |
| | r" βββ βββββββββββ ββββββ βββ βββββββββ βββββββββ βββ βββ ββββββββββββ βββ", |
| | r" βββ βββββββββββ ββββββ βββββββββ ββββββββ βββ βββ βββββββ βββ βββ", |
| | ] |
| |
|
| | |
| | _GRADIENT_RGB = [ |
| | (140, 0, 0), |
| | (165, 15, 15), |
| | (190, 35, 35), |
| | (215, 55, 55), |
| | (235, 70, 70), |
| | (255, 90, 90), |
| | ] |
| |
|
| |
|
def print_banner() -> None:
    """Print the ASCII-art banner, one row at a time, tinted with a
    per-row 24-bit red gradient and reset after each row."""
    for row, rgb in zip(BANNER_LINES, _GRADIENT_RGB):
        red, green, blue = rgb
        print(f"\033[38;2;{red};{green};{blue}m{row}{RESET}")
| |
|
| |
|
def detect_model(client: OpenAI) -> str:
    """Auto-detect the served model name from the server.

    Queries the server's model listing endpoint and returns the id of the
    first served model. Exits the process with status 1 when the server is
    unreachable or serves no models.
    """
    # Keep the try body minimal: only the network call should be reported as
    # a connection failure. Previously the empty-listing check and the
    # indexing lived inside the try, so any unexpected error there would be
    # misreported as "Could not connect to server".
    try:
        models = client.models.list()
    except Exception as e:
        print(f"ERROR: Could not connect to server: {e}", file=sys.stderr)
        sys.exit(1)
    if not models.data:
        print("ERROR: No models available on the server.", file=sys.stderr)
        sys.exit(1)
    return models.data[0].id
| |
|
| |
|
def stream_response(
    client: OpenAI,
    model: str,
    messages: list[dict],
    max_tokens: int | None,
    temperature: float,
) -> str:
    """Stream a chat completion response.

    Thinking content is printed in dim text, answer content in normal text.
    Returns the assistant's answer content (for conversation history).
    """
    request: dict = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "stream": True,
        "extra_body": {"chat_template_kwargs": {"enable_thinking": True}},
    }
    # Omit max_tokens entirely when unset so the server picks its own limit.
    if max_tokens is not None:
        request["max_tokens"] = max_tokens

    stream = client.chat.completions.create(**request)

    thinking_started = False
    answer_started = False
    answer_parts: list[str] = []
    out = sys.stdout

    try:
        for chunk in stream:
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta

            # Reasoning tokens arrive on a non-standard delta attribute;
            # fall back to None when the server does not send it.
            thought = getattr(delta, "reasoning", None)
            if thought:
                if not thinking_started:
                    out.write(f"\n{DIM}Thinking...\n")
                    thinking_started = True
                out.write(thought)
                out.flush()

            if delta.content:
                if not answer_started:
                    # Close the dim region before the answer header.
                    if thinking_started:
                        out.write(RESET)
                    out.write(f"\n{BOLD}Answer:{RESET}\n")
                    answer_started = True
                out.write(delta.content)
                out.flush()
                answer_parts.append(delta.content)
    except KeyboardInterrupt:
        # Ctrl+C mid-stream: keep whatever content arrived so far.
        pass
    finally:
        # Always restore normal terminal attributes.
        out.write(RESET)
        out.flush()

    print()
    return "".join(answer_parts)
| |
|
| |
|
def run_single(client, model, prompt, max_tokens, temperature):
    """Send one prompt, stream the reply to stdout, and return."""
    stream_response(
        client,
        model,
        [{"role": "user", "content": prompt}],
        max_tokens,
        temperature,
    )
| |
|
| |
|
def run_interactive(client, model, max_tokens, temperature):
    """Interactive multi-turn conversation loop."""
    history = []

    # Intro screen: banner, connection info, and usage hints.
    print()
    print_banner()
    print()
    print(f" Connected to {BOLD}{model}{RESET}")
    print(f" Type your message and press Enter. Type {BOLD}quit{RESET} or Ctrl+C to exit.")
    print(f" {DIM}Note: interactive mode is single-line only. For multiline prompts,{RESET}")
    print(f" {DIM} either flatten it to a single line or use:{RESET}")
    print(f" {DIM} python client.py --prompt 'line one{RESET}")
    print(f" {DIM} line two{RESET}")
    print(f" {DIM} line three'{RESET}")
    print()

    while True:
        try:
            line = input(f"{BOLD}>>>{RESET} ")
        except (KeyboardInterrupt, EOFError):
            # Ctrl+C / Ctrl+D at the prompt ends the session cleanly.
            print("\nGoodbye!")
            return

        stripped = line.strip()
        if stripped.lower() in ("quit", "exit", "q"):
            print("Goodbye!")
            return
        if not stripped:
            # Ignore blank lines without touching the history.
            continue

        # Append the user turn, stream the reply, then record it so the
        # model sees the full conversation on the next turn.
        history.append({"role": "user", "content": line})
        reply = stream_response(client, model, history, max_tokens, temperature)
        history.append({"role": "assistant", "content": reply})
        print()
| |
|
| |
|
def main():
    """Parse CLI arguments, connect to the vLLM server, and dispatch to
    single-prompt or interactive mode."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    mode = parser.add_mutually_exclusive_group(required=True)
    mode.add_argument("--prompt", type=str, help="Single prompt to send")
    mode.add_argument(
        "--interactive", action="store_true",
        help="Start an interactive multi-turn conversation",
    )
    parser.add_argument(
        "--base-url", default="http://localhost:8000/v1",
        help="vLLM server URL (default: http://localhost:8000/v1)",
    )
    parser.add_argument(
        "--max-tokens", type=int, default=None,
        help="Maximum tokens to generate (default: server decides based on context length)",
    )
    parser.add_argument(
        "--temperature", type=float, default=0.6,
        help="Sampling temperature (default: 0.6)",
    )
    args = parser.parse_args()

    # api_key is required by the client library but ignored by a local
    # vLLM server, hence the "EMPTY" placeholder.
    client = OpenAI(base_url=args.base_url, api_key="EMPTY")
    model = detect_model(client)

    # BUG FIX: compare against None rather than truthiness. With the old
    # `if args.prompt:`, an explicitly empty prompt (--prompt "") satisfied
    # the required mutually-exclusive group but fell through to interactive
    # mode. `--prompt` unset leaves args.prompt as None, so this dispatch
    # is unambiguous.
    if args.prompt is not None:
        run_single(client, model, args.prompt, args.max_tokens, args.temperature)
    else:
        run_interactive(client, model, args.max_tokens, args.temperature)
| |
|
| |
|
| | if __name__ == "__main__": |
| | main() |
| |
|