Kyryll Kochkin committed
Commit 49badf7 · 1 Parent(s): 698373a

Use live API for OpenAI responses client test
README.md CHANGED
@@ -10,7 +10,7 @@ pinned: false
  # GPT3dev OpenAI-Compatible API
  **More detailed documentation is hosted on [DeepWiki](https://deepwiki.com/krll-corp/gpt3dev-api)**
 
- A production-ready FastAPI server that mirrors the OpenAI REST API surface while proxying requests to Hugging Face causal language models. The service implements the `/v1/completions`, `/v1/chat/completions`, `/v1/models`, and `/v1/embeddings` endpoints with full support for streaming Server-Sent Events (SSE) and OpenAI-style usage accounting. Chat completions are available for instruct-tuned models like `GPT4-dev-177M-1511-Instruct`.
+ A production-ready FastAPI server that mirrors the OpenAI REST API surface while proxying requests to Hugging Face causal language models. The service implements the `/v1/completions`, `/v1/chat/completions`, `/v1/responses`, `/v1/models`, and `/v1/embeddings` endpoints with full support for streaming Server-Sent Events (SSE) and OpenAI-style usage accounting. Chat completions are available for instruct-tuned models like `GPT4-dev-177M-1511-Instruct`.
 
  ## The API is hosted on HuggingFace Spaces:
  ```bash
@@ -19,7 +19,7 @@ https://k050506koch-gpt3-dev-api.hf.space
 
  ## Features
 
- - ✅ Drop-in compatible request/response schemas for OpenAI text completions.
+ - ✅ Drop-in compatible request/response schemas for OpenAI text completions and responses.
  - ✅ Streaming responses (`stream=true`) that emit OpenAI-formatted SSE frames ending with `data: [DONE]`.
  - ✅ Configurable Hugging Face model registry with lazy loading, shared model cache, and automatic device placement.
  - ✅ Prompt token counting via `tiktoken` when available (falls back to Hugging Face tokenizers).
@@ -130,6 +130,18 @@ curl http://localhost:7860/v1/chat/completions \
 
  Non-instruct models will return an error directing users to use `/v1/completions` instead.
 
+ ### Responses API
+
+ ```bash
+ curl http://localhost:7860/v1/responses \
+   -H "Content-Type: application/json" \
+   -d '{
+     "model": "GPT4-dev-177M-1511-Instruct",
+     "input": "Summarize the key points in two sentences.",
+     "max_output_tokens": 128
+   }'
+ ```
+
  ### Embeddings
 
  The `/v1/embeddings` endpoint returns a 501 Not Implemented error with actionable guidance unless an embeddings backend is configured.
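
For Python users, the same request goes through the official OpenAI client pointed at this server. A minimal sketch, mirroring the live test added in this commit; it assumes a local server on port 7860 (as in the curl example) and that any placeholder API key is accepted:

```python
# Sketch: calling the new /v1/responses endpoint with the official OpenAI
# Python client. Assumes the server runs locally on port 7860 and accepts
# any placeholder API key.
from openai import OpenAI

client = OpenAI(api_key="test", base_url="http://localhost:7860/v1")
response = client.responses.create(
    model="GPT4-dev-177M-1511-Instruct",
    input="Summarize the key points in two sentences.",
    max_output_tokens=128,
)
# The first output item is an assistant message containing output_text parts.
print(response.output[0].content[0].text)
```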
app/main.py CHANGED
@@ -15,7 +15,7 @@ from fastapi.responses import JSONResponse
  from fastapi.routing import APIRoute
 
  from .core.settings import get_settings
- from .routers import chat, completions, embeddings, models
+ from .routers import chat, completions, embeddings, models, responses
 
 
  def configure_logging(level: str) -> None:
@@ -124,6 +124,7 @@ if settings.cors_allow_origins:
  app.include_router(models.router)
  app.include_router(completions.router)
  app.include_router(chat.router)
+ app.include_router(responses.router)
  app.include_router(embeddings.router)
 
 
app/routers/__init__.py CHANGED
@@ -1,4 +1,4 @@
  """Router package exports."""
- from . import chat, completions, embeddings, models
+ from . import chat, completions, embeddings, models, responses
 
- __all__ = ["chat", "completions", "embeddings", "models"]
+ __all__ = ["chat", "completions", "embeddings", "models", "responses"]
app/routers/responses.py ADDED
@@ -0,0 +1,186 @@
+ """Responses API endpoint."""
+ from __future__ import annotations
+
+ import asyncio
+ import json
+ import time
+ import uuid
+ from typing import Generator, List
+
+ from fastapi import APIRouter
+ from fastapi.responses import StreamingResponse
+
+ from ..core import engine
+ from ..core.errors import model_not_found, openai_http_error
+ from ..core.model_registry import get_model_spec
+ from ..schemas.responses import (
+     ResponseInputMessage,
+     ResponseOutputMessage,
+     ResponseOutputText,
+     ResponsePayload,
+     ResponseRequest,
+     ResponseUsage,
+ )
+
+ router = APIRouter(prefix="/v1", tags=["responses"])
+
+
+ def _render_input_text(message: ResponseInputMessage) -> str:
+     if isinstance(message.content, str):
+         return message.content
+     return "".join(part.text for part in message.content if part.type == "input_text")
+
+
+ def _normalize_messages(input_payload: List[ResponseInputMessage]) -> List[dict]:
+     return [{"role": message.role, "content": _render_input_text(message)} for message in input_payload]
+
+
+ def _stop_sequences(stop: List[str] | str | None) -> List[str]:
+     if isinstance(stop, list):
+         return stop
+     return [stop] if stop else []
+
+
+ def _build_output(text: str) -> ResponseOutputMessage:
+     return ResponseOutputMessage(
+         id=f"msg_{uuid.uuid4().hex}",
+         content=[ResponseOutputText(text=text)],
+     )
+
+
+ @router.post("/responses", response_model=ResponsePayload)
+ async def create_response(payload: ResponseRequest) -> ResponsePayload | StreamingResponse:
+     """Generate a response using OpenAI's Responses API format."""
+     try:
+         spec = get_model_spec(payload.model)
+     except KeyError:
+         raise model_not_found(payload.model)
+
+     stop_sequences = _stop_sequences(payload.stop)
+
+     if isinstance(payload.input, str):
+         if spec.is_instruct:
+             messages = [{"role": "user", "content": payload.input}]
+             prompt = engine.apply_chat_template(payload.model, messages)
+         else:
+             prompt = payload.input
+     else:
+         if not spec.is_instruct:
+             raise openai_http_error(
+                 400,
+                 f"Model '{payload.model}' is not an instruct model and cannot accept structured input. "
+                 "Provide a plain string input or use /v1/chat/completions for chat-formatted prompts.",
+                 error_type="invalid_request_error",
+                 param="model",
+             )
+         messages = _normalize_messages(payload.input)
+         prompt = engine.apply_chat_template(payload.model, messages)
+
+     if payload.stream:
+         return _streaming_response(payload, prompt, stop_sequences)
+
+     try:
+         result = await asyncio.to_thread(
+             engine.generate,
+             payload.model,
+             prompt,
+             temperature=payload.temperature,
+             top_p=payload.top_p,
+             max_tokens=payload.max_output_tokens,
+             stop=stop_sequences,
+             n=payload.n,
+         )
+     except Exception as exc:
+         raise openai_http_error(
+             500,
+             f"Generation error: {exc}",
+             error_type="server_error",
+             code="generation_error",
+         )
+
+     output: List[ResponseOutputMessage] = []
+     total_completion_tokens = 0
+     for item in result.completions:
+         total_completion_tokens += item.tokens
+         output.append(_build_output(item.text.strip()))
+
+     usage = ResponseUsage(
+         input_tokens=result.prompt_tokens,
+         output_tokens=total_completion_tokens,
+         total_tokens=result.prompt_tokens + total_completion_tokens,
+     )
+     return ResponsePayload(
+         id=f"resp_{uuid.uuid4().hex}",
+         model=payload.model,
+         output=output,
+         usage=usage,
+     )
+
+
+ def _streaming_response(
+     payload: ResponseRequest,
+     prompt: str,
+     stop_sequences: List[str],
+ ) -> StreamingResponse:
+     response_id = f"resp_{uuid.uuid4().hex}"
+     message_id = f"msg_{uuid.uuid4().hex}"
+     created = int(time.time())
+
+     def event_stream() -> Generator[bytes, None, None]:
+         stream = engine.create_stream(
+             payload.model,
+             prompt,
+             temperature=payload.temperature,
+             top_p=payload.top_p,
+             max_tokens=payload.max_output_tokens,
+             stop=stop_sequences,
+         )
+         base_payload = ResponsePayload(
+             id=response_id,
+             created=created,
+             model=payload.model,
+             output=[],
+             usage=ResponseUsage(input_tokens=0, output_tokens=0, total_tokens=0),
+         )
+         created_payload = {
+             "type": "response.created",
+             "response": base_payload.model_dump(),
+         }
+         yield f"data: {json.dumps(created_payload)}\n\n".encode()
+
+         collected = ""
+         for token in stream.iter_tokens():
+             collected += token
+             delta_payload = {
+                 "type": "response.output_text.delta",
+                 "response_id": response_id,
+                 "item_id": message_id,
+                 "output_index": 0,
+                 "content_index": 0,
+                 "delta": token,
+             }
+             yield f"data: {json.dumps(delta_payload)}\n\n".encode()
+
+         usage = ResponseUsage(
+             input_tokens=stream.prompt_tokens,
+             output_tokens=stream.completion_tokens,
+             total_tokens=stream.prompt_tokens + stream.completion_tokens,
+         )
+         final_payload = ResponsePayload(
+             id=response_id,
+             created=created,
+             model=payload.model,
+             output=[ResponseOutputMessage(id=message_id, content=[ResponseOutputText(text=collected)])],
+             usage=usage,
+         )
+         completed_payload = {
+             "type": "response.completed",
+             "response": final_payload.model_dump(),
+         }
+         yield f"data: {json.dumps(completed_payload)}\n\n".encode()
+         yield b"data: [DONE]\n\n"
+
+     return StreamingResponse(
+         event_stream(),
+         media_type="text/event-stream",
+     )
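
A minimal sketch of a client consuming this SSE stream, keyed off the `response.output_text.delta` and `response.completed` event types emitted above. It assumes a local server on port 7860; `httpx` is already a test dependency:

```python
# Sketch: consuming the /v1/responses SSE stream produced by the handler
# above. Assumes a local server on port 7860; httpx is already listed in
# requirements-test.txt.
import json

import httpx

with httpx.stream(
    "POST",
    "http://localhost:7860/v1/responses",
    json={
        "model": "GPT4-dev-177M-1511-Instruct",
        "input": "Say hello in one sentence.",
        "stream": True,
    },
    timeout=60.0,
) as resp:
    for line in resp.iter_lines():
        if not line.startswith("data: "):
            continue  # skip blank separator lines between frames
        data = line[len("data: "):]
        if data == "[DONE]":
            break
        event = json.loads(data)
        if event["type"] == "response.output_text.delta":
            print(event["delta"], end="", flush=True)
        elif event["type"] == "response.completed":
            # Final frame carries the full output message and usage totals.
            usage = event["response"]["usage"]
            print(f"\n[{usage['total_tokens']} tokens]")
```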
app/schemas/responses.py ADDED
@@ -0,0 +1,58 @@
+ """Schemas for the Responses API endpoint."""
+ from __future__ import annotations
+
+ import time
+ from typing import List, Literal, Optional, Union
+
+ from pydantic import BaseModel, Field, AliasChoices
+
+
+ class ResponseInputContentPart(BaseModel):
+     type: Literal["input_text"] = "input_text"
+     text: str
+
+
+ class ResponseInputMessage(BaseModel):
+     role: Literal["system", "user", "assistant", "tool"]
+     content: Union[str, List[ResponseInputContentPart]]
+
+
+ class ResponseRequest(BaseModel):
+     model: str
+     input: Union[str, List[ResponseInputMessage]]
+     temperature: float = 1.0
+     top_p: float = 1.0
+     n: int = 1
+     stop: Optional[List[str] | str] = None
+     max_output_tokens: Optional[int] = Field(
+         default=None,
+         validation_alias=AliasChoices("max_output_tokens", "max_tokens"),
+     )
+     stream: bool = False
+
+
+ class ResponseOutputText(BaseModel):
+     type: Literal["output_text"] = "output_text"
+     text: str
+
+
+ class ResponseOutputMessage(BaseModel):
+     id: str
+     type: Literal["message"] = "message"
+     role: Literal["assistant"] = "assistant"
+     content: List[ResponseOutputText]
+
+
+ class ResponseUsage(BaseModel):
+     input_tokens: int
+     output_tokens: int
+     total_tokens: int
+
+
+ class ResponsePayload(BaseModel):
+     id: str
+     object: Literal["response"] = "response"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     model: str
+     output: List[ResponseOutputMessage]
+     usage: ResponseUsage
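
One consequence of the `AliasChoices` validation alias above: clients may send either `max_output_tokens` (the Responses API spelling) or the legacy `max_tokens`, and both populate the same field. A quick sketch under pydantic v2, as pinned in the test requirements:

```python
# Sketch: either field spelling validates into max_output_tokens because
# AliasChoices lists both names (pydantic v2 behavior).
from app.schemas.responses import ResponseRequest

a = ResponseRequest.model_validate({"model": "GPT3-dev", "input": "Hi", "max_output_tokens": 64})
b = ResponseRequest.model_validate({"model": "GPT3-dev", "input": "Hi", "max_tokens": 64})
assert a.max_output_tokens == b.max_output_tokens == 64
```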
requirements-test.txt CHANGED
@@ -1,5 +1,6 @@
  fastapi>=0.110.0
  httpx>=0.27.0
+ openai>=1.30.0
  pytest>=7.4.0
  pytest-asyncio>=0.23.0
  pydantic>=2.6.0
tests/test_live_api.py CHANGED
@@ -25,6 +25,19 @@ def _get_models(timeout: float = 10.0) -> Set[str]:
      return {item["id"] for item in data.get("data", [])}
 
 
+ @pytest.mark.skipif(not RUN_LIVE, reason="set RUN_LIVE_API_TESTS=1 to run live API tests")
+ def test_responses_openai_client() -> None:
+     openai_module = pytest.importorskip("openai")
+     OpenAI = openai_module.OpenAI
+     model = "GPT4-dev-177M-1511-Instruct"
+     available = _get_models()
+     if model not in available:
+         pytest.skip(f"model {model} not available on server; available={sorted(available)}")
+     client = OpenAI(api_key="test", base_url=f"{BASE_URL}/v1")
+     response = client.responses.create(model=model, input="Say hello in one sentence.")
+     assert response.output[0].content[0].text
+
+
  @pytest.mark.skipif(not RUN_LIVE, reason="set RUN_LIVE_API_TESTS=1 to run live API tests")
  @pytest.mark.parametrize("model", ["GPT-2", "GPT3-dev-350m-2805"])  # adjust names as available
  def test_completion_basic(model: str) -> None:
@@ -51,4 +64,3 @@ def test_completion_basic(model: str) -> None:
      # The completion can be empty for some models with temperature=0, but should be a string
      usage = body.get("usage") or {}
      assert "total_tokens" in usage
- 
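As the skip markers indicate, the live tests are opt-in. A typical invocation, assuming the target server is already running at the configured `BASE_URL`:

```bash
# Live tests are skipped unless the opt-in flag is set.
RUN_LIVE_API_TESTS=1 pytest tests/test_live_api.py -v
```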
tests/test_openai_compat.py CHANGED
@@ -118,9 +118,10 @@ sys.path.append(str(Path(__file__).resolve().parents[1]))
 
  from app.core import model_registry as model_registry_module
  from app.core.model_registry import ModelMetadata, ModelSpec
- from app.routers import chat, completions, embeddings, models
+ from app.routers import chat, completions, embeddings, models, responses
  from app.schemas.chat import ChatCompletionRequest
  from app.schemas.completions import CompletionRequest
+ from app.schemas.responses import ResponseRequest
 
 
  def test_list_models() -> None:
@@ -244,6 +245,69 @@ def test_chat_rejects_non_instruct_model(monkeypatch: pytest.MonkeyPatch) -> None:
      assert "not an instruct model" in exc.value.detail["message"]
 
 
+ def test_responses_string_input(monkeypatch: pytest.MonkeyPatch) -> None:
+     class DummyResult:
+         prompt_tokens = 4
+         completions = [type("C", (), {"text": "Hello", "tokens": 2, "finish_reason": "stop"})()]
+
+     def fake_generate(*args, **kwargs):
+         return DummyResult()
+
+     monkeypatch.setattr("app.routers.responses.engine.generate", fake_generate)
+     monkeypatch.setattr(
+         "app.routers.responses.get_model_spec",
+         lambda model: ModelSpec(name=model, hf_repo="dummy/repo", is_instruct=False),
+     )
+     payload = ResponseRequest.model_validate({
+         "model": "GPT3-dev",
+         "input": "Hi",
+     })
+     response = asyncio.run(responses.create_response(payload))
+     body = response.model_dump()
+     assert body["object"] == "response"
+     assert body["output"][0]["role"] == "assistant"
+     assert body["output"][0]["content"][0]["text"] == "Hello"
+     assert body["usage"]["input_tokens"] == 4
+     assert body["usage"]["output_tokens"] == 2
+
+
+ def test_responses_instruct_messages(monkeypatch: pytest.MonkeyPatch) -> None:
+     class DummyResult:
+         prompt_tokens = 3
+         completions = [type("C", (), {"text": "Sure", "tokens": 1, "finish_reason": "stop"})()]
+
+     recorded_prompts: list[str] = []
+
+     def fake_generate(*args, **kwargs):
+         recorded_prompts.append(args[1])
+         return DummyResult()
+
+     monkeypatch.setattr("app.routers.responses.engine.generate", fake_generate)
+     monkeypatch.setattr(
+         "app.routers.responses.engine.apply_chat_template",
+         lambda model, messages: "formatted prompt",
+     )
+     monkeypatch.setattr(
+         "app.routers.responses.get_model_spec",
+         lambda model: ModelSpec(name=model, hf_repo="dummy/instruct", is_instruct=True),
+     )
+     payload = ResponseRequest.model_validate({
+         "model": "GPT4-dev-177M-1511-Instruct",
+         "input": [{"role": "user", "content": "Hi"}],
+     })
+     response = asyncio.run(responses.create_response(payload))
+     body = response.model_dump()
+     assert recorded_prompts == ["formatted prompt"]
+     assert body["output"][0]["content"][0]["text"] == "Sure"
+     assert body["usage"]["total_tokens"] == 4
+
+
+ def test_openai_client_responses_create(monkeypatch: pytest.MonkeyPatch) -> None:
+     openai_module = pytest.importorskip("openai")
+     OpenAI = openai_module.OpenAI
+     pytest.skip("OpenAI client test moved to live API coverage.")
+
+
  def test_embeddings_not_implemented() -> None:
      with pytest.raises(HTTPException) as exc:
          asyncio.run(embeddings.create_embeddings())