fixflow/backend/llm_client.py
"""
LLM Client for GLM 5.1 via Z.ai API (OpenAI-compatible endpoint).
Includes automatic retry with exponential backoff for rate-limit (429) errors.
"""
import logging
import random
import time
from typing import Dict, Iterator, List, Optional

import openai

from backend.config import GLM_API_KEY, GLM_BASE_URL, GLM_MODEL, LOG_LLM_CALLS
logger = logging.getLogger(__name__)

MAX_RETRIES = 5
INITIAL_BACKOFF = 5  # seconds


class GLMClient:
"""OpenAI-compatible wrapper for Z.ai's GLM models."""
def __init__(
self,
api_key: Optional[str] = None,
base_url: str = GLM_BASE_URL,
model: str = GLM_MODEL,
):
self.api_key = api_key or GLM_API_KEY
self.base_url = base_url
self.model = model
self._client: Optional[openai.OpenAI] = None

    def _get_client(self) -> openai.OpenAI:
if self._client is None:
if not self.api_key:
raise ValueError(
"GLM API key is not set. Please provide it in the sidebar or .env file."
)
self._client = openai.OpenAI(
api_key=self.api_key,
base_url=self.base_url,
)
return self._client

    def _backoff_wait(self, attempt: int) -> None:
"""Exponential backoff with jitter. Waits and logs the wait time."""
wait = INITIAL_BACKOFF * (2 ** attempt) + random.uniform(0, 2)
        logger.warning(
            "[GLM] Rate limited (429). Retrying in %.1fs (attempt %d/%d)...",
            wait, attempt + 1, MAX_RETRIES,
        )
time.sleep(wait)
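
    # With INITIAL_BACKOFF = 5, the waits grow roughly as 5 s, 10 s, 20 s and
    # 40 s across attempts 0-3, each plus 0-2 s of random jitter so concurrent
    # clients do not retry in lockstep; the final attempt raises instead of
    # waiting.
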
def chat(
self,
messages: List[Dict[str, str]],
temperature: float = 0.3,
max_tokens: int = 4096,
) -> str:
"""Synchronous chat completion with automatic retry on 429."""
client = self._get_client()
start = time.time()
if LOG_LLM_CALLS:
logger.info(
"[GLM] chat() | model=%s | messages=%d | temp=%.1f",
self.model, len(messages), temperature,
)
last_error = None
for attempt in range(MAX_RETRIES):
try:
response = client.chat.completions.create(
model=self.model,
messages=messages,
temperature=temperature,
max_tokens=max_tokens,
)
content = response.choices[0].message.content or ""
elapsed = time.time() - start
if LOG_LLM_CALLS:
logger.info("[GLM] completed in %.2fs | output_chars=%d", elapsed, len(content))
return content
except openai.RateLimitError as e:
last_error = e
if attempt < MAX_RETRIES - 1:
self._backoff_wait(attempt)
else:
raise RuntimeError(
f"GLM API rate limit exceeded after {MAX_RETRIES} retries. "
f"Please wait a moment and try again. Detail: {e}"
) from e
except openai.APIError as e:
raise RuntimeError(f"GLM API error: {e}") from e
raise RuntimeError(f"GLM request failed after {MAX_RETRIES} attempts: {last_error}")
def chat_stream(
self,
messages: List[Dict[str, str]],
temperature: float = 0.3,
max_tokens: int = 4096,
) -> Iterator[str]:
"""Streaming chat completion with automatic retry on 429."""
client = self._get_client()
if LOG_LLM_CALLS:
logger.info(
"[GLM] chat_stream() | model=%s | messages=%d",
self.model, len(messages),
)
last_error = None
for attempt in range(MAX_RETRIES):
try:
response = client.chat.completions.create(
model=self.model,
messages=messages,
temperature=temperature,
max_tokens=max_tokens,
stream=True,
)
                for chunk in response:
                    # Some providers emit chunks with an empty choices list
                    # (e.g. a trailing usage chunk); guard before indexing.
                    if not chunk.choices:
                        continue
                    delta = chunk.choices[0].delta
                    if delta and delta.content:
                        yield delta.content
                return  # Completed successfully
except openai.RateLimitError as e:
last_error = e
if attempt < MAX_RETRIES - 1:
self._backoff_wait(attempt)
else:
raise RuntimeError(
f"GLM API rate limit exceeded after {MAX_RETRIES} retries. "
f"Please wait a moment and try again. Detail: {e}"
) from e
except openai.APIError as e:
raise RuntimeError(f"GLM API error: {e}") from e
raise RuntimeError(f"GLM stream failed after {MAX_RETRIES} attempts: {last_error}")
def update_api_key(self, api_key: str) -> None:
"""Allow hot-swapping the API key (e.g. from Streamlit sidebar)."""
self.api_key = api_key
self._client = None # Force re-initialization
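

# Minimal smoke test (an illustrative addition, not part of the original file).
# It assumes GLM_API_KEY is set in backend/config.py or the environment; run
# from the project root with `python -m backend.llm_client`.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    _client = GLMClient()
    print(_client.chat([{"role": "user", "content": "Reply with a short greeting."}]))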