Pomilon committed on
Commit
b249d06
·
verified ·
1 Parent(s): 0842f0d

Update aetheris/api/server.py

Browse files
Files changed (1) hide show
  1. aetheris/api/server.py +57 -23
aetheris/api/server.py CHANGED
@@ -38,6 +38,10 @@ def get_engine():
38
async def startup_event():
    """Warm the inference engine when the application starts.

    Calling get_engine() here forces the (presumably lazy) engine
    singleton to be constructed up front, so the first request does
    not pay the model-loading cost.
    """
    get_engine()
40
 
 
 
 
 
41
@app.get("/v1/models", response_model=ModelList)
async def list_models():
    """List the models served by this API.

    Returns a ModelList containing the single hybrid Mamba-MoE model
    this server exposes.
    """
    card = ModelCard(id="aetheris-hybrid-mamba-moe")
    return ModelList(data=[card])
@@ -68,37 +72,67 @@ async def chat_completions(request: ChatCompletionRequest):
68
  )]
69
  ).model_dump())
70
 
71
- for token in engine.generate(
72
- prompt=prompt,
73
- max_new_tokens=request.max_tokens or 100,
74
- temperature=request.temperature,
75
- top_p=request.top_p,
76
- repetition_penalty=1.0 + request.frequency_penalty, # Approximating
77
- stream=True
78
- ):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  yield json.dumps(ChatCompletionChunk(
80
  id=request_id,
81
  created=created_time,
82
  model=request.model,
83
  choices=[ChatCompletionChunkChoice(
84
  index=0,
85
- delta=ChatCompletionChunkDelta(content=token),
86
- finish_reason=None
87
  )]
88
  ).model_dump())
89
-
90
- yield json.dumps(ChatCompletionChunk(
91
- id=request_id,
92
- created=created_time,
93
- model=request.model,
94
- choices=[ChatCompletionChunkChoice(
95
- index=0,
96
- delta=ChatCompletionChunkDelta(),
97
- finish_reason="stop"
98
- )]
99
- ).model_dump())
100
-
101
- yield "[DONE]"
102
 
103
  return EventSourceResponse(event_generator())
104
 
 
38
  async def startup_event():
39
  get_engine()
40
 
41
@app.get("/")
async def root():
    """Simple liveness/landing endpoint.

    Confirms the service is up and points callers at the chat
    completions route.
    """
    payload = {
        "status": "running",
        "message": "Aetheris API is active. Use /v1/chat/completions for inference.",
    }
    return payload
44
+
45
  @app.get("/v1/models", response_model=ModelList)
46
  async def list_models():
47
  return ModelList(data=[ModelCard(id="aetheris-hybrid-mamba-moe")])
 
72
  )]
73
  ).model_dump())
74
 
75
+ # Offload synchronous generation to a thread to avoid blocking the event loop
76
+ queue = asyncio.Queue()
77
+ loop = asyncio.get_running_loop()
78
+ import threading
79
+ stop_event = threading.Event()
80
+
81
+ def producer():
82
+ try:
83
+ # Run the synchronous generator
84
+ for token in engine.generate(
85
+ prompt=prompt,
86
+ max_new_tokens=request.max_tokens or 100,
87
+ temperature=request.temperature,
88
+ top_p=request.top_p,
89
+ repetition_penalty=1.0 + request.frequency_penalty,
90
+ stream=True
91
+ ):
92
+ if stop_event.is_set():
93
+ break
94
+ # Schedule the put() coroutine on the main loop
95
+ asyncio.run_coroutine_threadsafe(queue.put(token), loop)
96
+ except Exception as e:
97
+ print(f"Generation error: {e}")
98
+ finally:
99
+ # Signal done
100
+ asyncio.run_coroutine_threadsafe(queue.put(None), loop)
101
+
102
+ thread = threading.Thread(target=producer, daemon=True)
103
+ thread.start()
104
+
105
+ try:
106
+ while True:
107
+ token = await queue.get()
108
+ if token is None:
109
+ break
110
+
111
+ yield json.dumps(ChatCompletionChunk(
112
+ id=request_id,
113
+ created=created_time,
114
+ model=request.model,
115
+ choices=[ChatCompletionChunkChoice(
116
+ index=0,
117
+ delta=ChatCompletionChunkDelta(content=token),
118
+ finish_reason=None
119
+ )]
120
+ ).model_dump())
121
+
122
  yield json.dumps(ChatCompletionChunk(
123
  id=request_id,
124
  created=created_time,
125
  model=request.model,
126
  choices=[ChatCompletionChunkChoice(
127
  index=0,
128
+ delta=ChatCompletionChunkDelta(),
129
+ finish_reason="stop"
130
  )]
131
  ).model_dump())
132
+
133
+ yield "[DONE]"
134
+ finally:
135
+ stop_event.set()
 
 
 
 
 
 
 
 
 
136
 
137
  return EventSourceResponse(event_generator())
138