Commit ab616bd · Parent: 0cbf645

Fixed issue with the server not managing request lifecycle correctly

main/api.py CHANGED (+35 -24)
@@ -63,18 +63,26 @@ class InferenceApi(LitAPI):
         stream: bool = False
     ) -> Any:
         """Make an authenticated request to the LLM Server."""
+        base_url = self.llm_config.get('host', 'http://localhost:8001')
+        full_endpoint = f"{base_url.rstrip('/')}/{self._get_endpoint(endpoint).lstrip('/')}"
+
         try:
-
-
-
-
-
-
-
-
-            )
-
-
+            self.logger.info(f"Making {method} request to: {full_endpoint}")
+            # Create client outside the with block for streaming
+            client = await self._get_client()
+
+            if stream:
+                # For streaming, return both client and response context managers
+                return client, client.stream(
+                    method,
+                    self._get_endpoint(endpoint),
+                    params=params,
+                    json=json
+                )
+            else:
+                # For non-streaming, use context manager
+                async with client as c:
+                    response = await c.request(
                         method,
                         self._get_endpoint(endpoint),
                         params=params,
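The `rstrip('/')` / `lstrip('/')` pair in `full_endpoint` makes the URL join robust whether or not the configured host ends with a slash and whether or not the endpoint path starts with one. A quick illustration (plain Python; the values are hypothetical):

    base_url = "http://localhost:8001/"   # host configured with a trailing slash
    endpoint = "/generate_stream"         # endpoint path with a leading slash
    full_endpoint = f"{base_url.rstrip('/')}/{endpoint.lstrip('/')}"
    # full_endpoint == "http://localhost:8001/generate_stream" (exactly one slash)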
@@ -84,7 +92,7 @@ class InferenceApi(LitAPI):
                     return response
 
         except Exception as e:
-            self.logger.error(f"Error in request to {
+            self.logger.error(f"Error in request to {full_endpoint}: {str(e)}")
             raise
 
     def predict(self, x: str, **kwargs) -> Iterator[str]:
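The heart of the fix is the streaming branch: in httpx, `client.stream()` returns an async context manager whose response body can only be read while the owning `AsyncClient` is still open, so the old pattern of opening and closing the client inside `_make_request` tore the connection down before the caller could consume the stream. Returning the client together with the not-yet-entered stream context manager hands lifecycle control to the caller. A minimal self-contained sketch of that pattern, assuming (as the `aiter_text` call below suggests) that `_get_client()` produces an `httpx.AsyncClient`; the function name and URL here are illustrative:

    import httpx

    async def make_request(url: str, payload: dict, stream: bool = False):
        client = httpx.AsyncClient()
        if stream:
            # client.stream() builds the context manager but sends nothing yet;
            # the caller opens it, reads the body, then closes both objects.
            return client, client.stream("POST", url, json=payload)
        # Non-streaming: the body is fully read before the client closes,
        # so the response remains usable after this block exits.
        async with client as c:
            return await c.request("POST", url, json=payload)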
@@ -235,18 +243,21 @@ class InferenceApi(LitAPI):
         self.logger.debug(f"Forwarding streaming request for prompt: {prompt[:50]}...")
 
         try:
-
-
-
-
-
-
-
-
-
-            )
-
-
+            client, stream_cm = await self._make_request(
+                "POST",
+                "generate_stream",
+                json={
+                    "prompt": prompt,
+                    "system_message": system_message,
+                    "max_new_tokens": max_new_tokens
+                },
+                stream=True
+            )
+
+            async with client:
+                async with stream_cm as response:
+                    async for chunk in response.aiter_text():
+                        yield chunk
 
         except Exception as e:
             self.logger.error(f"Error in generate_stream: {str(e)}")
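On the consuming side, note the nesting order: the client context encloses the stream context, so the connection outlives the response object and both are released in reverse order even if iteration stops early. A hypothetical caller just drives the async generator; the constructor call and the exact `generate_stream` signature are assumptions inferred from the JSON payload above:

    import asyncio

    async def main() -> None:
        api = InferenceApi()  # hypothetical wiring; real setup may differ
        # Chunks arrive as the LLM Server produces them; the client and
        # stream contexts are closed when the generator finishes.
        async for chunk in api.generate_stream(
            prompt="Hello, world",
            system_message=None,
            max_new_tokens=64,
        ):
            print(chunk, end="", flush=True)

    asyncio.run(main())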