Spaces:
Sleeping
Sleeping
| import json | |
| import sys | |
| import time | |
| import httpx | |
| from open_cortex.runtime.metrics import fetch_runtime_snapshot | |
# Endpoints of the inference server under test; all three point at the same
# loopback host and port.

# Chat-completions endpoint that receives the streaming POST request.
URL = "http://127.0.0.1:8080/v1/chat/completions"

# Passed to fetch_runtime_snapshot() when the first token arrives.
METRICS_URL = "http://127.0.0.1:8080/metrics"
SLOTS_URL = "http://127.0.0.1:8080/slots"
| def parse_prometheus_value(text: str, name: str) -> float | None: | |
| prefix = f"{name}" | |
| for line in text.splitlines(): | |
| if line.startswith(prefix): | |
| return float(line.removeprefix(prefix)) | |
| return None | |
# JSON payload for the chat-completions request sent to URL.
request_body = {
    # Single user turn. The prompt asks (in Chinese) for a three-sentence
    # explanation of how large language models perform inference.
    "messages": [
        {"role": "user", "content": "用三句话解释大语言模型的推理过程。"},
    ],
    "temperature": 0.2,
    "max_tokens": 100,
    # The response is consumed line by line as "data: ..." events.
    "stream": True,
    # Requests a usage object in the stream; the reader looks for "usage".
    "stream_options": {"include_usage": True},
    # The reader later looks up "timings" on the final event; presumably this
    # flag is what makes the server emit it -- confirm against the server docs.
    "timings_per_token": True,
}
# Stream one chat completion, echo the generated text to stdout, and report
# time-to-first-token plus token/throughput statistics on stderr.
request_started = time.perf_counter()
first_token_at = None  # perf_counter() reading when the first content arrived
final_stats = None  # last streamed event that carried a non-empty "usage"

with httpx.Client(timeout=120.0) as client:
    with client.stream("POST", URL, json=request_body) as response:
        response.raise_for_status()
        for line in response.iter_lines():
            # Only "data: ..." lines carry events; skip everything else.
            if not line.startswith("data: "):
                continue
            data = line.removeprefix("data: ")
            if data == "[DONE]":
                break

            event = json.loads(data)
            choices = event.get("choices") or []
            if choices:
                # `or {}` also covers an explicit `"delta": null`.
                content = (choices[0].get("delta") or {}).get("content")
                if content:
                    if first_token_at is None:
                        # Take the timestamp before the snapshot request so
                        # the extra round trip does not inflate TTFT.
                        first_token_at = time.perf_counter()
                        snapshot = fetch_runtime_snapshot(
                            client, METRICS_URL, SLOTS_URL
                        )
                        print(
                            f"\n\nFIRST TOKEN SNAPSHOT: {snapshot}\n",
                            file=sys.stderr,
                        )
                    print(content, end="", flush=True)

            # Events without usage data (missing or null) are ignored here.
            if event.get("usage"):
                final_stats = event

# Terminate the streamed text, which was printed without newlines.
print()

if first_token_at is not None:
    ttft_ms = (first_token_at - request_started) * 1000
    print(f"TTFT: {ttft_ms:.1f} ms", file=sys.stderr)

if final_stats is not None:
    usage = final_stats["usage"]
    # "timings" is not guaranteed to accompany "usage"; tolerate its absence
    # (or null rates) instead of crashing after the whole stream was read.
    timings = final_stats.get("timings") or {}
    print(f"Prompt tokens: {usage['prompt_tokens']}", file=sys.stderr)
    print(f"Output tokens: {usage['completion_tokens']}", file=sys.stderr)

    prefill_rate = timings.get("prompt_per_second")
    if prefill_rate is not None:
        print(f"Prefill: {prefill_rate:.1f} tok/s", file=sys.stderr)

    decode_rate = timings.get("predicted_per_second")
    if decode_rate is not None:
        print(f"Decode: {decode_rate:.1f} tok/s", file=sys.stderr)