Update apis/chat_api.py
apis/chat_api.py (+61 -2)
--- a/apis/chat_api.py
+++ b/apis/chat_api.py
@@ -3,13 +3,15 @@ import os
 import sys
 import time
 import uvicorn
+import requests
+import asyncio
 
 from pathlib import Path
 from fastapi import FastAPI, Depends
 from fastapi.responses import HTMLResponse
 from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
 from pydantic import BaseModel, Field
-from typing import Union
+from typing import Union, List
 from sse_starlette.sse import EventSourceResponse, ServerSentEvent
 from utils.logger import logger
 from networks.message_streamer import MessageStreamer
@@ -18,7 +20,6 @@ from mocks.stream_chat_mocker import stream_chat_mock
 
 from fastapi.middleware.cors import CORSMiddleware
 
-
 class ChatAPIApp:
     def __init__(self):
         self.app = FastAPI(
@@ -79,6 +80,13 @@
                     "created": current_time,
                     "owned_by": "codellama",
                 },
+                {
+                    "id": "bert-base-uncased",
+                    "description": "[google-bert/bert-base-uncased]: https://huggingface.co/google-bert/bert-base-uncased",
+                    "object": "embedding",
+                    "created": current_time,
+                    "owned_by": "google",
+                },
             ],
         }
         return self.available_models
@@ -103,6 +111,23 @@
             logger.warn("Not provide HF Token!")
             return None
 
+    class QueryRequest(BaseModel):
+        texts: List[str]
+        model_name: str = Field(..., example="bert-base-uncased")
+        api_key: str = Field(..., example="your_hf_api_key_here")
+
+    async def send_request_to_hugging_face(texts, model_name, api_key):
+        api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_name}"
+        headers = {"Authorization": f"Bearer {api_key}"}
+        response = requests.post(api_url, headers=headers, json={"inputs": texts})
+        result = response.json()
+        if isinstance(result, list) and len(result) > 0 and isinstance(result[0], list):
+            return result
+        elif "error" in result:
+            raise RuntimeError("The model is currently loading, please re-run the query.")
+        else:
+            raise RuntimeError("Unexpected response format.")
+
     class ChatCompletionsPostItem(BaseModel):
         model: str = Field(
             default="mixtral-8x7b",
@@ -161,6 +186,28 @@
             data_response = streamer.chat_return_dict(stream_response)
             return data_response
 
+    async def embedding(request: QueryRequest):
+        try:
+            for attempt in range(3): # Retry logic
+                try:
+                    embeddings = await send_request_to_hugging_face(request.texts, request.model_name, request.api_key)
+                    data = [
+                        {"object": "embedding", "index": i, "embedding": embedding}
+                        for i, embedding in enumerate(embeddings)
+                    ]
+                    return {
+                        "object": "list",
+                        "data": data,
+                        "model": request.model_name,
+                        "usage": {"prompt_tokens": len(request.texts), "total_tokens": len(request.texts)}
+                    }
+                except RuntimeError as e:
+                    if attempt < 2: # Don't sleep on the last attempt
+                        await asyncio.sleep(10) # Delay for the retry
+            raise HTTPException(status_code=503, detail="The model is currently loading, please try again later.")
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=str(e))
+
     def setup_routes(self):
         for prefix in ["", "/v1", "/api", "/api/v1"]:
             if prefix in ["/api/v1"]:
@@ -180,6 +227,18 @@
                 include_in_schema=include_in_schema,
             )(self.chat_completions)
 
+            if prefix in ["/v1"]:
+                include_in_schema = True
+            else:
+                include_in_schema = False
+
+            self.app.post(
+                prefix + "/embedding", # Use the specific prefix for this route
+                summary="Generate embeddings for the given texts",
+                include_in_schema=include_in_schema,
+                response_model=List # Adapt based on your actual response model
+            )(self.embedding)
+
 
 class ArgParser(argparse.ArgumentParser):
     def __init__(self, *args, **kwargs):
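Several problems in the added code would surface at runtime. `embedding` raises `HTTPException`, but the file imports only `FastAPI` and `Depends` from fastapi, so that name is undefined. Both new callables sit inside `ChatAPIApp` (alongside the nested `ChatCompletionsPostItem`) yet take no `self` parameter: the bare call to `send_request_to_hugging_face(...)` raises a `NameError`, and registering the bound method `self.embedding` binds the instance itself to the `request` parameter. The blocking `requests.post` inside an `async def` also stalls the event loop, and the outer `except Exception` catches the `HTTPException(503)` and rewraps it as a 500. A sketch of the two methods with those issues addressed follows; this is one reading of the intent, not part of the commit, and `httpx` is swapped in for `requests` as one way to keep the call non-blocking:

import asyncio

import httpx  # assumption: replaces requests so the HTTP call does not block the event loop
from fastapi import HTTPException  # missing from the committed imports
from pydantic import BaseModel, Field
from typing import List


class ChatAPIApp:
    class QueryRequest(BaseModel):
        texts: List[str]
        model_name: str = Field(..., example="bert-base-uncased")
        api_key: str = Field(..., example="your_hf_api_key_here")

    async def send_request_to_hugging_face(self, texts, model_name, api_key):
        # `self` added: without it, the bare call below would be a NameError
        api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_name}"
        headers = {"Authorization": f"Bearer {api_key}"}
        async with httpx.AsyncClient() as client:
            response = await client.post(api_url, headers=headers, json={"inputs": texts})
        result = response.json()
        # a successful feature-extraction call returns one list per input text
        if isinstance(result, list) and result and isinstance(result[0], list):
            return result
        if isinstance(result, dict) and "error" in result:
            raise RuntimeError("The model is currently loading, please re-run the query.")
        raise RuntimeError("Unexpected response format.")

    async def embedding(self, request: QueryRequest):
        for attempt in range(3):  # retry while the HF model warms up
            try:
                embeddings = await self.send_request_to_hugging_face(
                    request.texts, request.model_name, request.api_key
                )
                break
            except RuntimeError:
                if attempt < 2:  # don't sleep on the last attempt
                    await asyncio.sleep(10)
        else:
            # raised outside any blanket except, so the 503 reaches the client intact
            raise HTTPException(status_code=503, detail="The model is currently loading, please try again later.")
        data = [
            {"object": "embedding", "index": i, "embedding": vector}
            for i, vector in enumerate(embeddings)
        ]
        return {
            "object": "list",
            "data": data,
            "model": request.model_name,
            # these counts are numbers of input texts, not tokens
            "usage": {"prompt_tokens": len(request.texts), "total_tokens": len(request.texts)},
        }

The `for ... else` makes the retry explicit: the 503 is raised only after all three attempts fail, while unexpected errors propagate to FastAPI's default 500 handling instead of being stringified by hand.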
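A separate wrinkle in `setup_routes`: the new route is registered with `response_model=List`, but `embedding` returns a dict of the form `{"object": "list", "data": [...], "model": ..., "usage": ...}`, so FastAPI's response validation would likely reject every successful response. Note also the asymmetry with the existing route: chat completions are documented under `/api/v1`, while the embedding route is documented under `/v1`. A typed response model would keep the OpenAPI schema honest; the model names below are illustrative, not from the commit:

from typing import List
from pydantic import BaseModel


class EmbeddingItem(BaseModel):
    object: str = "embedding"
    index: int
    # raw bert-base-uncased feature extraction yields one vector per token;
    # a pooled/sentence model would return a flat List[float] instead
    embedding: List[List[float]]


class EmbeddingUsage(BaseModel):
    prompt_tokens: int
    total_tokens: int


class EmbeddingResponse(BaseModel):
    object: str = "list"
    data: List[EmbeddingItem]
    model: str
    usage: EmbeddingUsage


# in setup_routes, replacing the bare List:
#     self.app.post(
#         prefix + "/embedding",
#         summary="Generate embeddings for the given texts",
#         include_in_schema=include_in_schema,
#         response_model=EmbeddingResponse,
#     )(self.embedding)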
def setup_routes(self):
|
| 212 |
for prefix in ["", "/v1", "/api", "/api/v1"]:
|
| 213 |
if prefix in ["/api/v1"]:
|
|
|
|
| 227 |
include_in_schema=include_in_schema,
|
| 228 |
)(self.chat_completions)
|
| 229 |
|
| 230 |
+
if prefix in ["/v1"]:
|
| 231 |
+
include_in_schema = True
|
| 232 |
+
else:
|
| 233 |
+
include_in_schema = False
|
| 234 |
+
|
| 235 |
+
self.app.post(
|
| 236 |
+
prefix + "/embedding", # Use the specific prefix for this route
|
| 237 |
+
summary="Generate embeddings for the given texts",
|
| 238 |
+
include_in_schema=include_in_schema,
|
| 239 |
+
response_model=List # Adapt based on your actual response model
|
| 240 |
+
)(self.embedding)
|
| 241 |
+
|
| 242 |
|
| 243 |
class ArgParser(argparse.ArgumentParser):
|
| 244 |
def __init__(self, *args, **kwargs):
|
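Once the routes are up, the endpoint can be exercised with a short client script. Host and port are placeholders here; match them to however `uvicorn` is launched via this app's `ArgParser`:

import requests

payload = {
    "texts": ["The quick brown fox", "jumps over the lazy dog"],
    "model_name": "bert-base-uncased",
    "api_key": "your_hf_api_key_here",  # a real HF token goes here
}

# placeholder host/port -- adjust to the server's actual bind address
resp = requests.post("http://127.0.0.1:8000/v1/embedding", json=payload)
resp.raise_for_status()
for item in resp.json()["data"]:
    # for raw bert-base-uncased, each "embedding" is token-level: a list of per-token vectors
    print(item["index"], len(item["embedding"]))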