Rox-Turbo committed on
Commit
aa4f314
·
verified ·
1 Parent(s): 6800ca4

Update server.py

Browse files
Files changed (1) hide show
  1. server.py +114 -57
server.py CHANGED
@@ -1,27 +1,28 @@
1
  import logging
2
  import os
3
  import sys
4
- from typing import List, Optional, AsyncGenerator
 
 
5
  from contextlib import asynccontextmanager
6
 
7
  from dotenv import load_dotenv
8
- from fastapi import FastAPI, HTTPException, Request
9
  from fastapi.middleware.cors import CORSMiddleware
10
  from fastapi.middleware.gzip import GZipMiddleware
11
  from fastapi.responses import JSONResponse, StreamingResponse
12
  from pydantic import BaseModel, Field
13
- from openai import OpenAI
 
14
  import json
15
 
16
 
17
  # Load environment variables
18
  load_dotenv()
19
 
20
- # Configure minimal logging for production speed
21
- logging.basicConfig(
22
- level=logging.WARNING,
23
- format='%(levelname)s - %(message)s'
24
- )
25
  logger = logging.getLogger("rox_ai")
26
 
27
  # Check for API key
@@ -30,6 +31,19 @@ NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")
30
  if not NVIDIA_API_KEY:
31
  raise RuntimeError("NVIDIA_API_KEY not set")
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  # Model configurations
34
  ROX_CORE_MODEL = "minimaxai/minimax-m2.5"
35
  ROX_TURBO_MODEL = "meta/llama-3.1-8b-instruct" # Changed to a more reliable model
@@ -50,18 +64,33 @@ ROX_DYNO_IDENTITY = "You are Rox 6 Dyno, an AI model created by Rox AI. Your cre
50
  ROX_CODER_7_IDENTITY = "You are Rox 7 Coder, an AI model created by Rox AI. Your creator and owner is Mohammad Faiz, the founder of Rox AI company. You are the most advanced coding specialist."
51
  ROX_VISION_IDENTITY = "You are Rox Vision Max, an AI model created by Rox AI. Your creator and owner is Mohammad Faiz, the founder of Rox AI company. You specialize in visual understanding and multimodal tasks."
52
 
53
- # Initialize OpenAI client with timeout optimization
54
- client = OpenAI(
55
- base_url="https://integrate.api.nvidia.com/v1",
56
- api_key=NVIDIA_API_KEY,
57
- timeout=60.0,
58
- max_retries=2
59
- )
60
-
61
  @asynccontextmanager
62
  async def lifespan(app: FastAPI):
63
  """Lifespan context manager"""
64
- yield
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
 
67
  # Initialize FastAPI app - optimized for speed
@@ -75,38 +104,64 @@ app = FastAPI(
75
  )
76
 
77
  # GZip compression for faster transfers
78
- app.add_middleware(GZipMiddleware, minimum_size=500)
79
 
80
- # CORS - unlimited access
81
  app.add_middleware(
82
  CORSMiddleware,
83
- allow_origins=["*"],
84
- allow_credentials=True,
85
  allow_methods=["*"],
86
  allow_headers=["*"],
87
  )
88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
  # Minimal exception handler
91
  @app.exception_handler(Exception)
92
  async def global_exception_handler(request: Request, exc: Exception):
 
93
  return JSONResponse(
94
  status_code=500,
95
  content={"error": "Internal server error"}
96
  )
97
 
98
 
99
- @app.get("/health")
100
- def health_check():
101
- """Health check endpoint"""
102
- return {"status": "healthy", "service": "Rox AI", "version": "2.0"}
 
103
 
104
 
105
  # Helper function for streaming responses
106
- async def stream_response(model: str, messages: list, temperature: float, top_p: float, max_tokens: int, extra_body: dict = None):
 
 
 
 
 
 
 
 
107
  """Stream responses from OpenAI API"""
108
  try:
109
- stream = client.chat.completions.create(
110
  model=model,
111
  messages=messages,
112
  temperature=temperature,
@@ -116,9 +171,11 @@ async def stream_response(model: str, messages: list, temperature: float, top_p:
116
  extra_body=extra_body
117
  )
118
 
119
- for chunk in stream:
120
- if chunk.choices[0].delta.content:
121
- yield f"data: {json.dumps({'content': chunk.choices[0].delta.content})}\n\n"
 
 
122
 
123
  yield "data: [DONE]\n\n"
124
  except Exception as e:
@@ -243,21 +300,21 @@ class HFResponseItem(BaseModel):
243
  async def chat(req: ChatRequest):
244
  """Rox Core - Main conversational model with streaming support"""
245
  messages = [{"role": "system", "content": ROX_CORE_IDENTITY}]
246
- messages.extend([m.dict() for m in req.messages])
247
 
248
  if req.stream:
249
  return StreamingResponse(
250
- stream_response(ROX_CORE_MODEL, messages, req.temperature, req.top_p, req.max_tokens),
251
  media_type="text/event-stream"
252
  )
253
 
254
  try:
255
- completion = client.chat.completions.create(
256
  model=ROX_CORE_MODEL,
257
  messages=messages,
258
  temperature=req.temperature,
259
  top_p=req.top_p,
260
- max_tokens=req.max_tokens,
261
  stream=False
262
  )
263
  return {"content": completion.choices[0].message.content or ""}
@@ -269,21 +326,21 @@ async def chat(req: ChatRequest):
269
  async def turbo(req: ChatRequest):
270
  """Rox 2.1 Turbo - Fast and efficient with streaming"""
271
  messages = [{"role": "system", "content": ROX_TURBO_IDENTITY}]
272
- messages.extend([m.dict() for m in req.messages])
273
 
274
  if req.stream:
275
  return StreamingResponse(
276
- stream_response(ROX_TURBO_MODEL, messages, req.temperature, req.top_p, req.max_tokens),
277
  media_type="text/event-stream"
278
  )
279
 
280
  try:
281
- completion = client.chat.completions.create(
282
  model=ROX_TURBO_MODEL,
283
  messages=messages,
284
  temperature=req.temperature,
285
  top_p=req.top_p,
286
- max_tokens=req.max_tokens,
287
  stream=False
288
  )
289
  return {"content": completion.choices[0].message.content or ""}
@@ -295,7 +352,7 @@ async def turbo(req: ChatRequest):
295
  async def coder(req: ChatRequest):
296
  """Rox 3.5 Coder - Specialized coding with streaming"""
297
  messages = [{"role": "system", "content": ROX_CODER_IDENTITY}]
298
- messages.extend([m.dict() for m in req.messages])
299
 
300
  extra_body = {
301
  "top_k": 20,
@@ -306,12 +363,12 @@ async def coder(req: ChatRequest):
306
 
307
  if req.stream:
308
  return StreamingResponse(
309
- stream_response(ROX_CODER_MODEL, messages, req.temperature, req.top_p, min(req.max_tokens, 16384), extra_body),
310
  media_type="text/event-stream"
311
  )
312
 
313
  try:
314
- completion = client.chat.completions.create(
315
  model=ROX_CODER_MODEL,
316
  messages=messages,
317
  temperature=req.temperature,
@@ -329,18 +386,18 @@ async def coder(req: ChatRequest):
329
  async def turbo45(req: ChatRequest):
330
  """Rox 4.5 Turbo - Advanced reasoning with streaming"""
331
  messages = [{"role": "system", "content": ROX_TURBO_45_IDENTITY}]
332
- messages.extend([m.dict() for m in req.messages])
333
 
334
  extra_body = {"chat_template_kwargs": {"thinking": True}}
335
 
336
  if req.stream:
337
  return StreamingResponse(
338
- stream_response(ROX_TURBO_45_MODEL, messages, req.temperature, req.top_p, min(req.max_tokens, 8192), extra_body),
339
  media_type="text/event-stream"
340
  )
341
 
342
  try:
343
- completion = client.chat.completions.create(
344
  model=ROX_TURBO_45_MODEL,
345
  messages=messages,
346
  temperature=req.temperature,
@@ -358,18 +415,18 @@ async def turbo45(req: ChatRequest):
358
  async def ultra(req: ChatRequest):
359
  """Rox 5 Ultra - Most advanced with streaming"""
360
  messages = [{"role": "system", "content": ROX_ULTRA_IDENTITY}]
361
- messages.extend([m.dict() for m in req.messages])
362
 
363
  extra_body = {"chat_template_kwargs": {"thinking": True}}
364
 
365
  if req.stream:
366
  return StreamingResponse(
367
- stream_response(ROX_ULTRA_MODEL, messages, req.temperature, req.top_p, min(req.max_tokens, 8192), extra_body),
368
  media_type="text/event-stream"
369
  )
370
 
371
  try:
372
- completion = client.chat.completions.create(
373
  model=ROX_ULTRA_MODEL,
374
  messages=messages,
375
  temperature=req.temperature,
@@ -387,18 +444,18 @@ async def ultra(req: ChatRequest):
387
  async def dyno(req: ChatRequest):
388
  """Rox 6 Dyno - Extended context with streaming"""
389
  messages = [{"role": "system", "content": ROX_DYNO_IDENTITY}]
390
- messages.extend([m.dict() for m in req.messages])
391
 
392
  extra_body = {"chat_template_kwargs": {"thinking": True}}
393
 
394
  if req.stream:
395
  return StreamingResponse(
396
- stream_response(ROX_DYNO_MODEL, messages, req.temperature, req.top_p, min(req.max_tokens, 16384), extra_body),
397
  media_type="text/event-stream"
398
  )
399
 
400
  try:
401
- completion = client.chat.completions.create(
402
  model=ROX_DYNO_MODEL,
403
  messages=messages,
404
  temperature=req.temperature,
@@ -416,7 +473,7 @@ async def dyno(req: ChatRequest):
416
  async def coder7(req: ChatRequest):
417
  """Rox 7 Coder - Most advanced coding with streaming"""
418
  messages = [{"role": "system", "content": ROX_CODER_7_IDENTITY}]
419
- messages.extend([m.dict() for m in req.messages])
420
 
421
  extra_body = {
422
  "chat_template_kwargs": {
@@ -427,12 +484,12 @@ async def coder7(req: ChatRequest):
427
 
428
  if req.stream:
429
  return StreamingResponse(
430
- stream_response(ROX_CODER_7_MODEL, messages, req.temperature, req.top_p, min(req.max_tokens, 16384), extra_body),
431
  media_type="text/event-stream"
432
  )
433
 
434
  try:
435
- completion = client.chat.completions.create(
436
  model=ROX_CODER_7_MODEL,
437
  messages=messages,
438
  temperature=req.temperature,
@@ -450,16 +507,16 @@ async def coder7(req: ChatRequest):
450
  async def vision(req: ChatRequest):
451
  """Rox Vision Max - Visual understanding with streaming"""
452
  messages = [{"role": "system", "content": ROX_VISION_IDENTITY}]
453
- messages.extend([m.dict() for m in req.messages])
454
 
455
  if req.stream:
456
  return StreamingResponse(
457
- stream_response(ROX_VISION_MODEL, messages, req.temperature, req.top_p, min(req.max_tokens, 8192)),
458
  media_type="text/event-stream"
459
  )
460
 
461
  try:
462
- completion = client.chat.completions.create(
463
  model=ROX_VISION_MODEL,
464
  messages=messages,
465
  temperature=req.temperature,
@@ -482,7 +539,7 @@ async def hf_generate(req: HFRequest):
482
  ]
483
 
484
  try:
485
- completion = client.chat.completions.create(
486
  model=ROX_CORE_MODEL,
487
  messages=messages,
488
  temperature=params.temperature or 0.7,
 
1
  import logging
2
  import os
3
  import sys
4
+ import time
5
+ import uuid
6
+ from typing import List, Optional, AsyncGenerator, Iterable
7
  from contextlib import asynccontextmanager
8
 
9
  from dotenv import load_dotenv
10
+ from fastapi import FastAPI, HTTPException, Request, Response
11
  from fastapi.middleware.cors import CORSMiddleware
12
  from fastapi.middleware.gzip import GZipMiddleware
13
  from fastapi.responses import JSONResponse, StreamingResponse
14
  from pydantic import BaseModel, Field
15
+ from openai import AsyncOpenAI
16
+ import httpx
17
  import json
18
 
19
 
20
  # Load environment variables
21
  load_dotenv()
22
 
23
+ # Configure logging (env-controlled)
24
+ LOG_LEVEL = os.getenv("LOG_LEVEL", "WARNING").upper()
25
+ logging.basicConfig(level=LOG_LEVEL, format="%(levelname)s - %(message)s")
 
 
26
  logger = logging.getLogger("rox_ai")
27
 
28
  # Check for API key
 
31
  if not NVIDIA_API_KEY:
32
  raise RuntimeError("NVIDIA_API_KEY not set")
33
 
34
+ API_BASE_URL = os.getenv("NVIDIA_BASE_URL", "https://integrate.api.nvidia.com/v1")
35
+
36
+ def _parse_cors_origins(value: str) -> List[str]:
37
+ v = (value or "").strip()
38
+ if not v:
39
+ return []
40
+ if v == "*":
41
+ return ["*"]
42
+ return [o.strip() for o in v.split(",") if o.strip()]
43
+
44
+ CORS_ORIGINS = _parse_cors_origins(os.getenv("CORS_ORIGINS", "*"))
45
+ GZIP_MIN_SIZE = int(os.getenv("GZIP_MIN_SIZE", "500"))
46
+
47
  # Model configurations
48
  ROX_CORE_MODEL = "minimaxai/minimax-m2.5"
49
  ROX_TURBO_MODEL = "meta/llama-3.1-8b-instruct" # Changed to a more reliable model
 
64
  ROX_CODER_7_IDENTITY = "You are Rox 7 Coder, an AI model created by Rox AI. Your creator and owner is Mohammad Faiz, the founder of Rox AI company. You are the most advanced coding specialist."
65
  ROX_VISION_IDENTITY = "You are Rox Vision Max, an AI model created by Rox AI. Your creator and owner is Mohammad Faiz, the founder of Rox AI company. You specialize in visual understanding and multimodal tasks."
66
 
 
 
 
 
 
 
 
 
67
  @asynccontextmanager
68
  async def lifespan(app: FastAPI):
69
  """Lifespan context manager"""
70
+ # One pooled async HTTP client for all requests (keep-alive, limits, timeouts)
71
+ timeout_s = float(os.getenv("UPSTREAM_TIMEOUT_SECONDS", "60"))
72
+ max_retries = int(os.getenv("UPSTREAM_MAX_RETRIES", "2"))
73
+ max_connections = int(os.getenv("UPSTREAM_MAX_CONNECTIONS", "200"))
74
+ max_keepalive = int(os.getenv("UPSTREAM_MAX_KEEPALIVE_CONNECTIONS", "50"))
75
+
76
+ http_client = httpx.AsyncClient(
77
+ timeout=httpx.Timeout(timeout_s),
78
+ limits=httpx.Limits(max_connections=max_connections, max_keepalive_connections=max_keepalive),
79
+ headers={"User-Agent": "Rox-AI-API/2.0"},
80
+ )
81
+ app.state.http_client = http_client
82
+ app.state.client = AsyncOpenAI(
83
+ base_url=API_BASE_URL,
84
+ api_key=NVIDIA_API_KEY,
85
+ timeout=timeout_s,
86
+ max_retries=max_retries,
87
+ http_client=http_client,
88
+ )
89
+
90
+ try:
91
+ yield
92
+ finally:
93
+ await http_client.aclose()
94
 
95
 
96
  # Initialize FastAPI app - optimized for speed
 
104
  )
105
 
106
  # GZip compression for faster transfers
107
+ app.add_middleware(GZipMiddleware, minimum_size=GZIP_MIN_SIZE)
108
 
109
+ # CORS - env controlled (default "*")
110
  app.add_middleware(
111
  CORSMiddleware,
112
+ allow_origins=CORS_ORIGINS,
113
+ allow_credentials=(CORS_ORIGINS != ["*"]),
114
  allow_methods=["*"],
115
  allow_headers=["*"],
116
  )
117
 
118
+ @app.middleware("http")
119
+ async def add_request_context(request: Request, call_next):
120
+ request_id = request.headers.get("x-request-id") or str(uuid.uuid4())
121
+ start = time.perf_counter()
122
+ try:
123
+ response: Response = await call_next(request)
124
+ finally:
125
+ elapsed_ms = (time.perf_counter() - start) * 1000.0
126
+ # Keep logs lightweight; only emit at INFO+ if enabled
127
+ if logger.isEnabledFor(logging.INFO):
128
+ logger.info("%s %s -> %.2fms id=%s", request.method, request.url.path, elapsed_ms, request_id)
129
+
130
+ response.headers["X-Request-Id"] = request_id
131
+ response.headers["X-Process-Time-Ms"] = f"{elapsed_ms:.2f}"
132
+ return response
133
+
134
 
135
  # Minimal exception handler
136
  @app.exception_handler(Exception)
137
  async def global_exception_handler(request: Request, exc: Exception):
138
+ logger.exception("Unhandled error on %s %s", request.method, request.url.path)
139
  return JSONResponse(
140
  status_code=500,
141
  content={"error": "Internal server error"}
142
  )
143
 
144
 
145
+ def _client(app_: FastAPI) -> AsyncOpenAI:
146
+ c = getattr(app_.state, "client", None)
147
+ if c is None:
148
+ raise RuntimeError("Client not initialized")
149
+ return c
150
 
151
 
152
  # Helper function for streaming responses
153
+ async def stream_response(
154
+ app_: FastAPI,
155
+ model: str,
156
+ messages: list,
157
+ temperature: float,
158
+ top_p: float,
159
+ max_tokens: int,
160
+ extra_body: dict | None = None,
161
+ ) -> AsyncGenerator[str, None]:
162
  """Stream responses from OpenAI API"""
163
  try:
164
+ stream = await _client(app_).chat.completions.create(
165
  model=model,
166
  messages=messages,
167
  temperature=temperature,
 
171
  extra_body=extra_body
172
  )
173
 
174
+ async for chunk in stream:
175
+ delta = chunk.choices[0].delta
176
+ content = getattr(delta, "content", None)
177
+ if content:
178
+ yield f"data: {json.dumps({'content': content})}\n\n"
179
 
180
  yield "data: [DONE]\n\n"
181
  except Exception as e:
 
300
  async def chat(req: ChatRequest):
301
  """Rox Core - Main conversational model with streaming support"""
302
  messages = [{"role": "system", "content": ROX_CORE_IDENTITY}]
303
+ messages.extend([m.model_dump() for m in req.messages])
304
 
305
  if req.stream:
306
  return StreamingResponse(
307
+ stream_response(app, ROX_CORE_MODEL, messages, req.temperature, req.top_p, min(req.max_tokens, 8192)),
308
  media_type="text/event-stream"
309
  )
310
 
311
  try:
312
+ completion = await _client(app).chat.completions.create(
313
  model=ROX_CORE_MODEL,
314
  messages=messages,
315
  temperature=req.temperature,
316
  top_p=req.top_p,
317
+ max_tokens=min(req.max_tokens, 8192),
318
  stream=False
319
  )
320
  return {"content": completion.choices[0].message.content or ""}
 
326
  async def turbo(req: ChatRequest):
327
  """Rox 2.1 Turbo - Fast and efficient with streaming"""
328
  messages = [{"role": "system", "content": ROX_TURBO_IDENTITY}]
329
+ messages.extend([m.model_dump() for m in req.messages])
330
 
331
  if req.stream:
332
  return StreamingResponse(
333
+ stream_response(app, ROX_TURBO_MODEL, messages, req.temperature, req.top_p, min(req.max_tokens, 8192)),
334
  media_type="text/event-stream"
335
  )
336
 
337
  try:
338
+ completion = await _client(app).chat.completions.create(
339
  model=ROX_TURBO_MODEL,
340
  messages=messages,
341
  temperature=req.temperature,
342
  top_p=req.top_p,
343
+ max_tokens=min(req.max_tokens, 8192),
344
  stream=False
345
  )
346
  return {"content": completion.choices[0].message.content or ""}
 
352
  async def coder(req: ChatRequest):
353
  """Rox 3.5 Coder - Specialized coding with streaming"""
354
  messages = [{"role": "system", "content": ROX_CODER_IDENTITY}]
355
+ messages.extend([m.model_dump() for m in req.messages])
356
 
357
  extra_body = {
358
  "top_k": 20,
 
363
 
364
  if req.stream:
365
  return StreamingResponse(
366
+ stream_response(app, ROX_CODER_MODEL, messages, req.temperature, req.top_p, min(req.max_tokens, 16384), extra_body),
367
  media_type="text/event-stream"
368
  )
369
 
370
  try:
371
+ completion = await _client(app).chat.completions.create(
372
  model=ROX_CODER_MODEL,
373
  messages=messages,
374
  temperature=req.temperature,
 
386
  async def turbo45(req: ChatRequest):
387
  """Rox 4.5 Turbo - Advanced reasoning with streaming"""
388
  messages = [{"role": "system", "content": ROX_TURBO_45_IDENTITY}]
389
+ messages.extend([m.model_dump() for m in req.messages])
390
 
391
  extra_body = {"chat_template_kwargs": {"thinking": True}}
392
 
393
  if req.stream:
394
  return StreamingResponse(
395
+ stream_response(app, ROX_TURBO_45_MODEL, messages, req.temperature, req.top_p, min(req.max_tokens, 8192), extra_body),
396
  media_type="text/event-stream"
397
  )
398
 
399
  try:
400
+ completion = await _client(app).chat.completions.create(
401
  model=ROX_TURBO_45_MODEL,
402
  messages=messages,
403
  temperature=req.temperature,
 
415
  async def ultra(req: ChatRequest):
416
  """Rox 5 Ultra - Most advanced with streaming"""
417
  messages = [{"role": "system", "content": ROX_ULTRA_IDENTITY}]
418
+ messages.extend([m.model_dump() for m in req.messages])
419
 
420
  extra_body = {"chat_template_kwargs": {"thinking": True}}
421
 
422
  if req.stream:
423
  return StreamingResponse(
424
+ stream_response(app, ROX_ULTRA_MODEL, messages, req.temperature, req.top_p, min(req.max_tokens, 8192), extra_body),
425
  media_type="text/event-stream"
426
  )
427
 
428
  try:
429
+ completion = await _client(app).chat.completions.create(
430
  model=ROX_ULTRA_MODEL,
431
  messages=messages,
432
  temperature=req.temperature,
 
444
  async def dyno(req: ChatRequest):
445
  """Rox 6 Dyno - Extended context with streaming"""
446
  messages = [{"role": "system", "content": ROX_DYNO_IDENTITY}]
447
+ messages.extend([m.model_dump() for m in req.messages])
448
 
449
  extra_body = {"chat_template_kwargs": {"thinking": True}}
450
 
451
  if req.stream:
452
  return StreamingResponse(
453
+ stream_response(app, ROX_DYNO_MODEL, messages, req.temperature, req.top_p, min(req.max_tokens, 16384), extra_body),
454
  media_type="text/event-stream"
455
  )
456
 
457
  try:
458
+ completion = await _client(app).chat.completions.create(
459
  model=ROX_DYNO_MODEL,
460
  messages=messages,
461
  temperature=req.temperature,
 
473
  async def coder7(req: ChatRequest):
474
  """Rox 7 Coder - Most advanced coding with streaming"""
475
  messages = [{"role": "system", "content": ROX_CODER_7_IDENTITY}]
476
+ messages.extend([m.model_dump() for m in req.messages])
477
 
478
  extra_body = {
479
  "chat_template_kwargs": {
 
484
 
485
  if req.stream:
486
  return StreamingResponse(
487
+ stream_response(app, ROX_CODER_7_MODEL, messages, req.temperature, req.top_p, min(req.max_tokens, 16384), extra_body),
488
  media_type="text/event-stream"
489
  )
490
 
491
  try:
492
+ completion = await _client(app).chat.completions.create(
493
  model=ROX_CODER_7_MODEL,
494
  messages=messages,
495
  temperature=req.temperature,
 
507
  async def vision(req: ChatRequest):
508
  """Rox Vision Max - Visual understanding with streaming"""
509
  messages = [{"role": "system", "content": ROX_VISION_IDENTITY}]
510
+ messages.extend([m.model_dump() for m in req.messages])
511
 
512
  if req.stream:
513
  return StreamingResponse(
514
+ stream_response(app, ROX_VISION_MODEL, messages, req.temperature, req.top_p, min(req.max_tokens, 8192)),
515
  media_type="text/event-stream"
516
  )
517
 
518
  try:
519
+ completion = await _client(app).chat.completions.create(
520
  model=ROX_VISION_MODEL,
521
  messages=messages,
522
  temperature=req.temperature,
 
539
  ]
540
 
541
  try:
542
+ completion = await _client(app).chat.completions.create(
543
  model=ROX_CORE_MODEL,
544
  messages=messages,
545
  temperature=params.temperature or 0.7,