#!/usr/bin/env python3
"""
Client for the Terminator vLLM server.
Supports single-prompt and multi-turn conversation modes with streaming
output. Thinking content is displayed in dimmed text; answer content in
normal text.
Usage:
# Single prompt
python client.py --prompt "What is the sum of the first 100 natural numbers?"
# Interactive multi-turn conversation
python client.py --interactive
# Custom server URL and max tokens
python client.py --base-url http://localhost:8001/v1 --max-tokens 8192 --prompt "Hello"
"""
import argparse
import sys

from openai import OpenAI

# ANSI escape codes
DIM = "\033[2m"
BOLD = "\033[1m"
RESET = "\033[0m"

BANNER_LINES = [
    r"β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ•—β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ•—β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ•— β–ˆβ–ˆβ–ˆβ•— β–ˆβ–ˆβ–ˆβ•—β–ˆβ–ˆβ•—β–ˆβ–ˆβ–ˆβ•— β–ˆβ–ˆβ•— β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ•— β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ•— β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ•— β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ•— ",
    r"β•šβ•β•β–ˆβ–ˆβ•”β•β•β•β–ˆβ–ˆβ•”β•β•β•β•β•β–ˆβ–ˆβ•”β•β•β–ˆβ–ˆβ•—β–ˆβ–ˆβ–ˆβ–ˆβ•— β–ˆβ–ˆβ–ˆβ–ˆβ•‘β–ˆβ–ˆβ•‘β–ˆβ–ˆβ–ˆβ–ˆβ•— β–ˆβ–ˆβ•‘β–ˆβ–ˆβ•”β•β•β–ˆβ–ˆβ•—β•šβ•β•β–ˆβ–ˆβ•”β•β•β•β–ˆβ–ˆβ•”β•β•β•β–ˆβ–ˆβ•—β–ˆβ–ˆβ•”β•β•β–ˆβ–ˆβ•—",
    r" β–ˆβ–ˆβ•‘ β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ•— β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ•”β•β–ˆβ–ˆβ•”β–ˆβ–ˆβ–ˆβ–ˆβ•”β–ˆβ–ˆβ•‘β–ˆβ–ˆβ•‘β–ˆβ–ˆβ•”β–ˆβ–ˆβ•— β–ˆβ–ˆβ•‘β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ•‘ β–ˆβ–ˆβ•‘ β–ˆβ–ˆβ•‘ β–ˆβ–ˆβ•‘β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ•”β•",
    r" β–ˆβ–ˆβ•‘ β–ˆβ–ˆβ•”β•β•β• β–ˆβ–ˆβ•”β•β•β–ˆβ–ˆβ•—β–ˆβ–ˆβ•‘β•šβ–ˆβ–ˆβ•”β•β–ˆβ–ˆβ•‘β–ˆβ–ˆβ•‘β–ˆβ–ˆβ•‘β•šβ–ˆβ–ˆβ•—β–ˆβ–ˆβ•‘β–ˆβ–ˆβ•”β•β•β–ˆβ–ˆβ•‘ β–ˆβ–ˆβ•‘ β–ˆβ–ˆβ•‘ β–ˆβ–ˆβ•‘β–ˆβ–ˆβ•”β•β•β–ˆβ–ˆβ•—",
    r" β–ˆβ–ˆβ•‘ β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ•—β–ˆβ–ˆβ•‘ β–ˆβ–ˆβ•‘β–ˆβ–ˆβ•‘ β•šβ•β• β–ˆβ–ˆβ•‘β–ˆβ–ˆβ•‘β–ˆβ–ˆβ•‘ β•šβ–ˆβ–ˆβ–ˆβ–ˆβ•‘β–ˆβ–ˆβ•‘ β–ˆβ–ˆβ•‘ β–ˆβ–ˆβ•‘ β•šβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ•”β•β–ˆβ–ˆβ•‘ β–ˆβ–ˆβ•‘",
    r" β•šβ•β• β•šβ•β•β•β•β•β•β•β•šβ•β• β•šβ•β•β•šβ•β• β•šβ•β•β•šβ•β•β•šβ•β• β•šβ•β•β•β•β•šβ•β• β•šβ•β• β•šβ•β• β•šβ•β•β•β•β•β• β•šβ•β• β•šβ•β•",
]

# Dark red -> light red gradient (one color per row)
_GRADIENT_RGB = [
    (140, 0, 0),
    (165, 15, 15),
    (190, 35, 35),
    (215, 55, 55),
    (235, 70, 70),
    (255, 90, 90),
]
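
# print_banner() uses 24-bit ANSI color: ESC[38;2;<r>;<g>;<b>m sets the
# foreground to an RGB triple. Terminals without truecolor support may
# approximate or ignore these codes.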
def print_banner() -> None:
    for line, (r, g, b) in zip(BANNER_LINES, _GRADIENT_RGB):
        print(f"\033[38;2;{r};{g};{b}m{line}{RESET}")


def detect_model(client: OpenAI) -> str:
    """Auto-detect the served model name from the server."""
    try:
        models = client.models.list()
        if not models.data:
            print("ERROR: No models available on the server.", file=sys.stderr)
            sys.exit(1)
        return models.data[0].id
    except Exception as e:
        print(f"ERROR: Could not connect to server: {e}", file=sys.stderr)
        sys.exit(1)


def stream_response(
    client: OpenAI,
    model: str,
    messages: list[dict],
    max_tokens: int | None,
    temperature: float,
) -> str:
    """Stream a chat completion response.

    Thinking content is printed in dim text, answer content in normal text.
    Returns the assistant's answer content (for conversation history).
    """
    kwargs = dict(
        model=model,
        messages=messages,
        temperature=temperature,
        stream=True,
        extra_body={"chat_template_kwargs": {"enable_thinking": True}},
    )
    if max_tokens is not None:
        kwargs["max_tokens"] = max_tokens

    stream = client.chat.completions.create(**kwargs)
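
    # Two flags track which part of the response we are in, so the
    # "Thinking..." and "Answer:" headers are each printed exactly once.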
    in_thinking = False
    in_answer = False
    full_content = ""

    try:
        for chunk in stream:
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta
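
            # Reasoning tokens arrive on a non-standard delta field; getattr
            # keeps this safe when the server emits none. (Some reasoning
            # parsers name this field reasoning_content instead.)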
            reasoning = getattr(delta, "reasoning", None)
            if reasoning:
                if not in_thinking:
                    sys.stdout.write(f"\n{DIM}Thinking...\n")
                    in_thinking = True
                sys.stdout.write(reasoning)
                sys.stdout.flush()

            if delta.content:
                if not in_answer:
                    if in_thinking:
                        sys.stdout.write(RESET)
                    sys.stdout.write(f"\n{BOLD}Answer:{RESET}\n")
                    in_answer = True
                sys.stdout.write(delta.content)
                sys.stdout.flush()
                full_content += delta.content
    except KeyboardInterrupt:
        pass
    finally:
        sys.stdout.write(RESET)
        sys.stdout.flush()
        print()

    return full_content


def run_single(client, model, prompt, max_tokens, temperature):
    """Run a single prompt and exit."""
    messages = [{"role": "user", "content": prompt}]
    stream_response(client, model, messages, max_tokens, temperature)


def run_interactive(client, model, max_tokens, temperature):
    """Interactive multi-turn conversation loop."""
    messages = []

    print()
    print_banner()
    print()
    print(f" Connected to {BOLD}{model}{RESET}")
    print(f" Type your message and press Enter. Type {BOLD}quit{RESET} or Ctrl+C to exit.")
    print(f" {DIM}Note: interactive mode is single-line only. For multiline prompts,{RESET}")
    print(f" {DIM} either flatten it to a single line or use:{RESET}")
    print(f" {DIM} python client.py --prompt 'line one{RESET}")
    print(f" {DIM} line two{RESET}")
    print(f" {DIM} line three'{RESET}")
    print()

    while True:
        try:
            user_input = input(f"{BOLD}>>>{RESET} ")
        except (KeyboardInterrupt, EOFError):
            print("\nGoodbye!")
            break
        if user_input.strip().lower() in ("quit", "exit", "q"):
            print("Goodbye!")
            break
        if not user_input.strip():
            continue
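
        # Only the final answer (not the thinking content) is stored in the
        # history; Qwen3-style chat templates expect prior turns without
        # reasoning traces.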
        messages.append({"role": "user", "content": user_input})
        content = stream_response(client, model, messages, max_tokens, temperature)
        messages.append({"role": "assistant", "content": content})
        print()


def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    mode = parser.add_mutually_exclusive_group(required=True)
    mode.add_argument("--prompt", type=str, help="Single prompt to send")
    mode.add_argument(
        "--interactive", action="store_true",
        help="Start an interactive multi-turn conversation",
    )
    parser.add_argument(
        "--base-url", default="http://localhost:8000/v1",
        help="vLLM server URL (default: http://localhost:8000/v1)",
    )
    parser.add_argument(
        "--max-tokens", type=int, default=None,
        help="Maximum tokens to generate (default: server decides based on context length)",
    )
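    # 0.6 matches the sampling temperature the Qwen3 model card recommends
    # for thinking mode (assumption: that guidance carries over to this
    # fine-tune).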
    parser.add_argument(
        "--temperature", type=float, default=0.6,
        help="Sampling temperature (default: 0.6)",
    )
    args = parser.parse_args()
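
    # The OpenAI client requires a non-empty api_key even though a default
    # vLLM deployment does not check it; "EMPTY" is the conventional placeholder.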
    client = OpenAI(base_url=args.base_url, api_key="EMPTY")
    model = detect_model(client)

    if args.prompt:
        run_single(client, model, args.prompt, args.max_tokens, args.temperature)
    else:
        run_interactive(client, model, args.max_tokens, args.temperature)


if __name__ == "__main__":
    main()