Commit ab616bd · Parent: 0cbf645

Fixed issue with the server not managing request lifecycle correctly

main/api.py CHANGED (+35 -24)
@@ -63,18 +63,26 @@ class InferenceApi(LitAPI):
         stream: bool = False
     ) -> Any:
         """Make an authenticated request to the LLM Server."""
+        base_url = self.llm_config.get('host', 'http://localhost:8001')
+        full_endpoint = f"{base_url.rstrip('/')}/{self._get_endpoint(endpoint).lstrip('/')}"
+
         try:
-
-
-
-
-
-
-
-
-            )
-
-
+            self.logger.info(f"Making {method} request to: {full_endpoint}")
+            # Create client outside the with block for streaming
+            client = await self._get_client()
+
+            if stream:
+                # For streaming, return both client and response context managers
+                return client, client.stream(
+                    method,
+                    self._get_endpoint(endpoint),
+                    params=params,
+                    json=json
+                )
+            else:
+                # For non-streaming, use context manager
+                async with client as c:
+                    response = await c.request(
                         method,
                         self._get_endpoint(endpoint),
                         params=params,
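The `rstrip('/')` / `lstrip('/')` pair in `full_endpoint` makes the URL join robust whether or not the configured host ends with a slash and whether or not the endpoint path starts with one. A quick illustration (plain Python; the values are hypothetical):

    base_url = "http://localhost:8001/"   # host configured with a trailing slash
    endpoint = "/generate_stream"         # endpoint path with a leading slash
    full_endpoint = f"{base_url.rstrip('/')}/{endpoint.lstrip('/')}"
    # full_endpoint == "http://localhost:8001/generate_stream" (exactly one slash)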
@@ -84,7 +92,7 @@ class InferenceApi(LitAPI):
                     return response
 
         except Exception as e:
-            self.logger.error(f"Error in request to {
+            self.logger.error(f"Error in request to {full_endpoint}: {str(e)}")
             raise
 
     def predict(self, x: str, **kwargs) -> Iterator[str]:
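The heart of the fix is the streaming branch: in httpx, `client.stream()` returns an async context manager whose response body can only be read while the owning `AsyncClient` is still open, so the old pattern of opening and closing the client inside `_make_request` tore the connection down before the caller could consume the stream. Returning the client together with the not-yet-entered stream context manager hands lifecycle control to the caller. A minimal self-contained sketch of that pattern, assuming (as the `aiter_text` call below suggests) that `_get_client()` produces an `httpx.AsyncClient`; the function name and URL here are illustrative:

    import httpx

    async def make_request(url: str, payload: dict, stream: bool = False):
        client = httpx.AsyncClient()
        if stream:
            # client.stream() builds the context manager but sends nothing yet;
            # the caller opens it, reads the body, then closes both objects.
            return client, client.stream("POST", url, json=payload)
        # Non-streaming: the body is fully read before the client closes,
        # so the response remains usable after this block exits.
        async with client as c:
            return await c.request("POST", url, json=payload)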
@@ -235,18 +243,21 @@ class InferenceApi(LitAPI):
         self.logger.debug(f"Forwarding streaming request for prompt: {prompt[:50]}...")
 
         try:
-
-
-
-
-
-
-
-
-
-            )
-
-
+            client, stream_cm = await self._make_request(
+                "POST",
+                "generate_stream",
+                json={
+                    "prompt": prompt,
+                    "system_message": system_message,
+                    "max_new_tokens": max_new_tokens
+                },
+                stream=True
+            )
+
+            async with client:
+                async with stream_cm as response:
+                    async for chunk in response.aiter_text():
+                        yield chunk
 
         except Exception as e:
             self.logger.error(f"Error in generate_stream: {str(e)}")
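On the consuming side, note the nesting order: the client context encloses the stream context, so the connection outlives the response object and both are released in reverse order even if iteration stops early. A hypothetical caller just drives the async generator; the constructor call and the exact `generate_stream` signature are assumptions inferred from the JSON payload above:

    import asyncio

    async def main() -> None:
        api = InferenceApi()  # hypothetical wiring; real setup may differ
        # Chunks arrive as the LLM Server produces them; the client and
        # stream contexts are closed when the generator finishes.
        async for chunk in api.generate_stream(
            prompt="Hello, world",
            system_message=None,
            max_new_tokens=64,
        ):
            print(chunk, end="", flush=True)

    asyncio.run(main())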