jiminaa committed on
Commit
6482d6b
·
1 Parent(s): 32fdadf

OpenAI-compatible endpoint

Browse files
Files changed (2) hide show
  1. Dockerfile +0 -1
  2. main.py +75 -2
Dockerfile CHANGED
@@ -12,6 +12,5 @@ RUN pip uninstall -y gradio gradio-client || true \
12
 
13
  USER user
14
 
15
-
16
  # Start the FastAPI app on port 7860, the default port expected by Spaces
17
  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
 
12
 
13
  USER user
14
 
 
15
  # Start the FastAPI app on port 7860, the default port expected by Spaces
16
  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py CHANGED
@@ -7,8 +7,10 @@ from fastapi import FastAPI
7
  from fastapi.responses import StreamingResponse, RedirectResponse
8
  from pydantic import BaseModel
9
  import json
10
- from typing import List, Literal
11
  import os
 
 
12
 
13
 
14
  HF_TOKEN = os.getenv("HF_TOKEN")
@@ -123,6 +125,14 @@ class GenerateRequest(BaseModel):
123
  max_length: int = 256
124
  temperature: float = 0.7
125
 
 
 
 
 
 
 
 
 
126
  # fastAPI endpoints
127
 
128
  # return information about the API
@@ -177,7 +187,70 @@ async def generate_stream_api(request: GenerateRequest):
177
  headers={
178
  "Cache-Control": "no-cache", # Don't cache streaming responses
179
  "Connection": "keep-alive", # Keep connection open
180
- "X-Accel-Buffering": "no",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  }
182
  )
183
 
 
7
  from fastapi.responses import StreamingResponse, RedirectResponse
8
  from pydantic import BaseModel
9
  import json
10
+ from typing import List, Literal, Optional
11
  import os
12
+ import uuid
13
+ import time
14
 
15
 
16
  HF_TOKEN = os.getenv("HF_TOKEN")
 
125
  max_length: int = 256
126
  temperature: float = 0.7
127
 
128
# OpenAI-compatible request payload for the /v1/chat/completions endpoint,
# mirroring the fields an OpenAI-style client (e.g. HuggingFace
# InferenceClient) sends.
class ChatCompletionRequest(BaseModel):
    # NOTE(review): downstream code repurposes `model` as a language
    # selector (e.g. "English", "Spanish") — confirm against callers.
    model: str = "default"
    messages: List[Message]            # conversation turns, role + content
    max_tokens: Optional[int] = 256    # generation length cap
    temperature: Optional[float] = 0.7 # sampling temperature
    stream: Optional[bool] = True      # SSE streaming requested by client
136
  # fastAPI endpoints
137
 
138
  # return information about the API
 
187
  headers={
188
  "Cache-Control": "no-cache", # Don't cache streaming responses
189
  "Connection": "keep-alive", # Keep connection open
190
+ "X-Accel-Buffering": "no",
191
+ }
192
+ )
193
+
194
# OpenAI-compatible endpoint for HuggingFace InferenceClient.
# The `model` field doubles as a language selector (e.g. "English",
# "Spanish", "Korean"); unknown values fall back to "English".
@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    """Serve chat completions in the OpenAI wire format.

    When ``request.stream`` is true (the default), emits Server-Sent
    Events of ``chat.completion.chunk`` objects followed by a terminal
    ``data: [DONE]`` sentinel. When false, collects the full generation
    and returns a single ``chat.completion`` JSON object — previously the
    flag was ignored and every client was streamed at.
    """
    messages_dicts = [{"role": msg.role, "content": msg.content} for msg in request.messages]

    # Use the model field as a language selector, defaulting to English
    # when the requested value has no registered adapter.
    language = request.model if request.model in adapter_paths else "English"

    chat_id = f"chatcmpl-{uuid.uuid4().hex[:8]}"
    created = int(time.time())
    max_tokens = request.max_tokens or 256
    temperature = request.temperature or 0.7

    # Non-streaming mode: honor stream=False with a complete response
    # (FastAPI serializes the dict to JSON).
    if not request.stream:
        text = "".join(
            generate_text_stream(messages_dicts, language, max_tokens, temperature)
        )
        return {
            "id": chat_id,
            "object": "chat.completion",
            "created": created,
            "model": language,
            "choices": [{
                "index": 0,
                "message": {"role": "assistant", "content": text},
                "finish_reason": "stop",
            }],
        }

    def event_generator():
        # Build one OpenAI-shaped streaming chunk.
        def make_chunk(delta, finish_reason):
            return {
                "id": chat_id,
                "object": "chat.completion.chunk",
                "created": created,
                "model": language,
                "choices": [{
                    "index": 0,
                    "delta": delta,
                    "finish_reason": finish_reason,
                }],
            }

        try:
            for token in generate_text_stream(
                messages_dicts, language, max_tokens, temperature
            ):
                yield f"data: {json.dumps(make_chunk({'content': token}, None))}\n\n"
            # Final chunk carries finish_reason so clients close cleanly.
            yield f"data: {json.dumps(make_chunk({}, 'stop'))}\n\n"
        except Exception as e:
            # Surface the failure in-band: once streaming has begun the
            # HTTP status can no longer be changed.
            error_chunk = {"error": {"message": str(e), "type": "server_error"}}
            yield f"data: {json.dumps(error_chunk)}\n\n"
        finally:
            # Always terminate the stream — previously the error path
            # skipped [DONE], leaving SSE clients hanging.
            yield "data: [DONE]\n\n"

    return StreamingResponse(
        event_generator(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",  # don't cache streaming responses
            "Connection": "keep-alive",   # keep connection open
            "X-Accel-Buffering": "no",    # disable reverse-proxy buffering
        },
    )
256