Spaces:

build-small-hackathon
/

open-cortex

Sleeping

App Files Files Community

open-cortex / scripts /runtime_snapshot.py

peninsula123

feat(main): stream metrics fetch

828386c 11 days ago

Raw

History Blame Contribute Delete

2.48 kB

	import json
	import sys
	import time

	import httpx

	from open_cortex.runtime.metrics import fetch_runtime_snapshot

	# Endpoints on the local server at 127.0.0.1:8080.
	# URL receives the streaming chat-completion POST issued by this script;
	# METRICS_URL and SLOTS_URL are handed to fetch_runtime_snapshot when the
	# first content token arrives.
	URL = "http://127.0.0.1:8080/v1/chat/completions"
	METRICS_URL = "http://127.0.0.1:8080/metrics"
	SLOTS_URL = "http://127.0.0.1:8080/slots"

	def parse_prometheus_value(text: str, name: str) -> float \| None:
	prefix = f"{name}"
	for line in text.splitlines():
	if line.startswith(prefix):
	return float(line.removeprefix(prefix))

	return None

	# Streaming chat-completion request sent to URL. "stream_options" asks for a
	# usage-bearing event and "timings_per_token" asks for a "timings" object in
	# the streamed events; both are read after the stream ends.
	request_body = {
	    "messages": [
	        {
	            "role": "user",
	            "content": "用三句话解释大语言模型的推理过程。",
	        }
	    ],
	    "temperature": 0.2,
	    "max_tokens": 100,
	    "stream": True,
	    "stream_options": {"include_usage": True},
	    "timings_per_token": True,
	}

	request_started = time.perf_counter()
	first_token_at = None  # perf_counter() reading at the first content token
	final_stats = None  # last streamed event that carried a "usage" object

	with httpx.Client(timeout=120.0) as client:
	    with client.stream("POST", URL, json=request_body) as response:
	        response.raise_for_status()

	        for line in response.iter_lines():
	            # Only "data: ..." lines carry events; skip everything else.
	            if not line.startswith("data: "):
	                continue

	            data = line.removeprefix("data: ")
	            if data == "[DONE]":
	                break

	            event = json.loads(data)

	            # Tolerate "choices"/"delta" being absent or null.
	            choices = event.get("choices") or []
	            content = None
	            if choices:
	                content = (choices[0].get("delta") or {}).get("content")

	            if content:
	                if first_token_at is None:
	                    # Timestamp first so the snapshot fetch below is not
	                    # counted towards the time-to-first-token.
	                    first_token_at = time.perf_counter()
	                    snapshot = fetch_runtime_snapshot(
	                        client, METRICS_URL, SLOTS_URL
	                    )
	                    print(
	                        f"\n\nFIRST TOKEN SNAPSHOT: {snapshot}\n",
	                        file=sys.stderr,
	                    )
	                print(content, end="", flush=True)

	            # Checked independently of "choices": an event may carry usage
	            # without any content delta.
	            if event.get("usage"):
	                final_stats = event

	# Terminate the streamed completion text on stdout.
	print()

	if first_token_at is not None:
	    ttft_ms = (first_token_at - request_started) * 1000
	    print(f"TTFT: {ttft_ms:.1f} ms", file=sys.stderr)

	if final_stats is not None:
	    usage = final_stats["usage"]
	    # The usage-bearing event is not guaranteed to include "timings";
	    # report whatever is present instead of failing with KeyError.
	    timings = final_stats.get("timings") or {}

	    token_counts = (
	        ("Prompt tokens", "prompt_tokens"),
	        ("Output tokens", "completion_tokens"),
	    )
	    for label, key in token_counts:
	        if key in usage:
	            print(f"{label}: {usage[key]}", file=sys.stderr)

	    rates = (
	        ("Prefill", "prompt_per_second"),
	        ("Decode", "predicted_per_second"),
	    )
	    for label, key in rates:
	        rate = timings.get(key)
	        if rate is not None:
	            print(f"{label}: {rate:.1f} tok/s", file=sys.stderr)