Kyryll Kochkin committed
Commit 698373a · Parent: 2ad2929

added GPT4-dev-177M-1511-Instruct

README.md CHANGED
@@ -10,7 +10,7 @@ pinned: false
 # GPT3dev OpenAI-Compatible API
 **more detailed documentation is hosted on [DeepWiki](https://deepwiki.com/krll-corp/gpt3dev-api)**
 
-A production-ready FastAPI server that mirrors the OpenAI REST API surface while proxying requests to Hugging Face causal language models. The service implements the `/v1/completions`, `/v1/models`, and `/v1/embeddings` endpoints with full support for streaming Server-Sent Events (SSE) and OpenAI-style usage accounting. A `/v1/chat/completions` stub is included but currently returns a structured 501 error because the available models are completion-only.
+A production-ready FastAPI server that mirrors the OpenAI REST API surface while proxying requests to Hugging Face causal language models. The service implements the `/v1/completions`, `/v1/chat/completions`, `/v1/models`, and `/v1/embeddings` endpoints with full support for streaming Server-Sent Events (SSE) and OpenAI-style usage accounting. Chat completions are available for instruct-tuned models such as `GPT4-dev-177M-1511-Instruct`.
 
 ## The API is hosted on HuggingFace Spaces:
 ```bash
@@ -112,7 +112,23 @@ curl http://localhost:7860/v1/completions \
 
 ### Chat Completions
 
-The `/v1/chat/completions` endpoint is currently disabled and returns a 501 Not Implemented error instructing clients to use `/v1/completions` instead. I don't have any chat-tuned models yet, but I plan to enable this endpoint later with OpenAI Harmony-tuned models.
+The `/v1/chat/completions` endpoint is available for instruct-tuned models. Currently supported instruct models:
+
+- `GPT4-dev-177M-1511-Instruct` - instruction-tuned GPT-4-style model fine-tuned on HuggingFaceH4/no_robots
+
+```bash
+curl http://localhost:7860/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "GPT4-dev-177M-1511-Instruct",
+    "messages": [
+      {"role": "user", "content": "Write a short welcome message for new contributors."}
+    ],
+    "max_tokens": 128
+  }'
+```
+
+Non-instruct models will return an error directing clients to `/v1/completions` instead.
 
 ### Embeddings
 
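Since the server mirrors the OpenAI REST surface, the new endpoint should also work through the official `openai` Python client by pointing `base_url` at the server. A minimal sketch, assuming a local server on port 7860 that accepts a placeholder API key:

```python
# Sketch: calling the new chat endpoint via the openai client (>= 1.0).
# Assumes the server runs on localhost:7860 and ignores the API key.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:7860/v1", api_key="not-needed")

response = client.chat.completions.create(
    model="GPT4-dev-177M-1511-Instruct",
    messages=[
        {"role": "user", "content": "Write a short welcome message for new contributors."}
    ],
    max_tokens=128,
)
print(response.choices[0].message.content)
```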
app/core/engine.py CHANGED
@@ -140,14 +140,14 @@ class _ModelHandle:
         logger.info("Loading tokenizer for %s", spec.hf_repo)
         tokenizer = AutoTokenizer.from_pretrained(
             spec.hf_repo,
-            use_auth_token=token,
+            token=token,
             trust_remote_code=True,
         )
         if tokenizer.pad_token_id is None:
             tokenizer.pad_token = tokenizer.eos_token
         logger.info("Tokenizer ready in %.2fs", time.perf_counter() - t0)
         model_kwargs = {
-            "use_auth_token": token,
+            "token": token,
             "trust_remote_code": True,
         }
         # Resolve preferred device early so we can adjust dtype if needed
@@ -183,11 +183,30 @@ class _ModelHandle:
             device_pref,
             " (device_map=auto)" if device_map else "",
         )
-        model = AutoModelForCausalLM.from_pretrained(
-            spec.hf_repo,
-            device_map=device_map,
-            **model_kwargs,
-        )
+        # Patch _load_pretrained_model to fix tie_weights incompatibility
+        # with newer transformers that pass the missing_keys keyword argument
+        from transformers import modeling_utils
+        _orig_load_pretrained_func = modeling_utils.PreTrainedModel._load_pretrained_model.__func__
+
+        def _patched_load_pretrained_func(cls, model, *args, **kwargs):
+            # Patch model.tie_weights to accept and ignore unexpected kwargs
+            orig_tie_weights = model.tie_weights
+            def _compat_tie_weights(*tw_args, **tw_kwargs):
+                tw_kwargs.pop("missing_keys", None)
+                tw_kwargs.pop("recompute_mapping", None)
+                return orig_tie_weights(*tw_args, **tw_kwargs)
+            model.tie_weights = _compat_tie_weights
+            return _orig_load_pretrained_func(cls, model, *args, **kwargs)
+
+        modeling_utils.PreTrainedModel._load_pretrained_model = classmethod(_patched_load_pretrained_func)
+        try:
+            model = AutoModelForCausalLM.from_pretrained(
+                spec.hf_repo,
+                device_map=device_map,
+                **model_kwargs,
+            )
+        finally:
+            modeling_utils.PreTrainedModel._load_pretrained_model = classmethod(_orig_load_pretrained_func)
         logger.info("Model ready in %.2fs", time.perf_counter() - t1)
         if device_map is None:
             model = model.to(device_pref)
@@ -212,11 +231,12 @@ class _ModelHandle:
             kwargs.pop("cache_position", None)
             kwargs.pop("encoder_attention_mask", None)
             kwargs.pop("attention_mask", None)
+            kwargs.pop("head_mask", None)
             return _orig_forward(*args, **kwargs)
 
         model.forward = MethodType(_forward_compat, model)
         # Also patch submodules whose forward signatures include
-        # encoder_attention_mask to avoid duplicate passing (positional+kw)
+        # encoder_attention_mask or head_mask to avoid duplicate passing (positional+kw)
         for _, module in model.named_modules():
             fwd = getattr(module, "forward", None)
             if not callable(fwd):
@@ -225,7 +245,12 @@ class _ModelHandle:
                 sig = inspect.signature(fwd)
             except Exception:
                 continue
-            if "encoder_attention_mask" not in sig.parameters:
+            # Patch modules that have problematic parameters
+            needs_patch = any(
+                p in sig.parameters
+                for p in ("encoder_attention_mask", "head_mask")
+            )
+            if not needs_patch:
                 continue
             orig_fwd = fwd
 
@@ -234,6 +259,7 @@ class _ModelHandle:
                 kwargs.pop("encoder_attention_mask", None)
                 kwargs.pop("attention_mask", None)
                 kwargs.pop("cache_position", None)
+                kwargs.pop("head_mask", None)
                 return orig(*args, **kwargs)
 
             return _sub_forward_compat
@@ -292,6 +318,27 @@ def _apply_stop_sequences(text: str, stop_sequences: Sequence[str]) -> tuple[str, str]:
     return text, "length"
 
 
+def apply_chat_template(
+    model_name: str,
+    messages: List[dict],
+    add_generation_prompt: bool = True,
+) -> str:
+    """Apply the tokenizer's chat template to format messages for instruct models."""
+    handle = _get_handle(model_name)
+    tokenizer = handle.tokenizer
+    if hasattr(tokenizer, "apply_chat_template"):
+        return tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=add_generation_prompt,
+        )
+    # Fallback for tokenizers without chat_template
+    from .prompting import render_chat_prompt
+    from ..schemas.chat import ChatMessage
+    chat_messages = [ChatMessage(role=m["role"], content=m["content"]) for m in messages]
+    return render_chat_prompt(chat_messages)
+
+
 def _prepare_inputs(
     handle: _ModelHandle,
     prompt: str,
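The forward patches in this file follow one compatibility pattern: wrap the bound `forward`, drop keyword arguments that older model code cannot accept, and rebind with `MethodType`. A standalone sketch of the same idea, with illustrative names not taken from the repo:

```python
# Sketch of the kwargs-stripping pattern used in engine.py.
# `unsupported` lists kwargs newer transformers versions may pass
# that an older custom model's forward() does not declare.
from types import MethodType

def strip_unsupported_kwargs(model, unsupported=("cache_position", "head_mask")):
    orig_forward = model.forward  # capture the original bound method

    def _forward_compat(self, *args, **kwargs):
        for key in unsupported:
            kwargs.pop(key, None)  # silently drop what forward() can't take
        return orig_forward(*args, **kwargs)

    # Rebind so the wrapper receives the model as `self`.
    model.forward = MethodType(_forward_compat, model)
    return model
```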
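The new `apply_chat_template` helper delegates to the tokenizer when a chat template is available. Outside the server, the same call looks like this; a hedged sketch using the plain Hugging Face API (the repo name is the one added in this commit, but any chat-templated tokenizer works):

```python
# Sketch: the tokenizer call that engine.apply_chat_template wraps.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "k050506koch/GPT4-dev-177M-1511-Instruct",
    trust_remote_code=True,
)
messages = [{"role": "user", "content": "Hello!"}]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,              # return a string, not token ids
    add_generation_prompt=True,  # append the assistant turn header
)
print(prompt)  # the formatted prompt that gets fed to generation
```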
app/core/model_registry.py CHANGED
@@ -55,9 +55,29 @@ class ModelSpec:
     device: Optional[str] = None
     max_context_tokens: Optional[int] = None
     metadata: Optional[ModelMetadata] = None
+    is_instruct: bool = False
 
 
 _DEFAULT_MODELS: List[ModelSpec] = [
+    ModelSpec(
+        name="GPT4-dev-177M-1511-Instruct",
+        hf_repo="k050506koch/GPT4-dev-177M-1511-Instruct",
+        dtype="float16",
+        device="auto",
+        max_context_tokens=512,
+        is_instruct=True,
+        metadata=ModelMetadata(
+            description="Instruction-tuned GPT-4-style model fine-tuned on the HuggingFaceH4/no_robots conversational dataset.",
+            parameter_count="177M",
+            training_datasets="HuggingFaceH4/no_robots",
+            training_steps="1,200 SFT steps · AdamW optimizer · cosine LR schedule · assistant-only loss masking",
+            evaluation="25.75% MMLU, 34.20% HellaSwag (author reported)",
+            notes="First instruct model. Uses Harmony-style chat formatting with apply_chat_template. Requires trust_remote_code.",
+            sources=(
+                "https://huggingface.co/k050506koch/GPT4-dev-177M-1511-Instruct",
+            ),
+        ),
+    ),
     ModelSpec(
         name="GPT4-dev-177M-1511",
         hf_repo="k050506koch/GPT4-dev-177M-1511",
@@ -230,6 +250,7 @@ def _load_registry_from_file(path: Path) -> Iterable[ModelSpec]:
                 device=entry.get("device"),
                 max_context_tokens=entry.get("max_context_tokens"),
                 metadata=metadata,
+                is_instruct=entry.get("is_instruct", False),
             )
         )
     return specs
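The `is_instruct` flag is what the chat router checks before generating, and file-based registry entries default to `False` when the key is absent. A small sketch of the gate, based on the specs defined in this commit:

```python
# Sketch: how the new flag gates /v1/chat/completions (see app/routers/chat.py).
from app.core.model_registry import get_model_spec

assert get_model_spec("GPT4-dev-177M-1511-Instruct").is_instruct  # chat allowed
assert not get_model_spec("GPT4-dev-177M-1511").is_instruct       # completions only
```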
app/routers/chat.py CHANGED
@@ -1,19 +1,165 @@
 """Chat completions endpoint."""
 from __future__ import annotations
 
+import asyncio
+import json
+import time
+import uuid
+from typing import Generator, List
+
 from fastapi import APIRouter
+from fastapi.responses import StreamingResponse
 
-from ..core.errors import feature_not_available
-from ..schemas.chat import ChatCompletionRequest, ChatCompletionResponse
+from ..core import engine
+from ..core.errors import model_not_found, openai_http_error
+from ..core.model_registry import get_model_spec
+from ..schemas.chat import (
+    ChatCompletionChoice,
+    ChatCompletionChunk,
+    ChatCompletionChunkChoice,
+    ChatCompletionChunkChoiceDelta,
+    ChatCompletionRequest,
+    ChatCompletionResponse,
+    ChatMessage,
+)
+from ..schemas.common import UsageInfo
 
 router = APIRouter(prefix="/v1", tags=["chat"])
 
 
 @router.post("/chat/completions", response_model=ChatCompletionResponse)
 async def create_chat_completion(payload: ChatCompletionRequest) -> ChatCompletionResponse:
-    """Return a structured error while chat completions are disabled."""
+    """Generate chat completions using instruct-tuned models."""
+    try:
+        spec = get_model_spec(payload.model)
+    except KeyError:
+        raise model_not_found(payload.model)
+
+    if not spec.is_instruct:
+        raise openai_http_error(
+            400,
+            f"Model '{payload.model}' is not an instruct model and cannot be used with chat completions. "
+            "Please use /v1/completions instead, or choose an instruct model like 'GPT4-dev-177M-1511-Instruct'.",
+            error_type="invalid_request_error",
+            param="model",
+        )
+
+    # Convert messages to dict format for apply_chat_template
+    messages_dict = [{"role": m.role, "content": m.content} for m in payload.messages]
+
+    # Apply chat template using tokenizer
+    prompt = engine.apply_chat_template(payload.model, messages_dict)
+
+    stop_sequences = payload.stop if isinstance(payload.stop, list) else (
+        [payload.stop] if payload.stop else []
+    )
+
+    if payload.stream:
+        return _streaming_chat_completion(payload, prompt, stop_sequences)
+
+    try:
+        result = await asyncio.to_thread(
+            engine.generate,
+            payload.model,
+            prompt,
+            temperature=payload.temperature,
+            top_p=payload.top_p,
+            max_tokens=payload.max_tokens,
+            stop=stop_sequences,
+            n=payload.n,
+        )
+    except Exception as exc:
+        raise openai_http_error(
+            500,
+            f"Generation error: {exc}",
+            error_type="server_error",
+            code="generation_error",
+        )
+
+    choices: List[ChatCompletionChoice] = []
+    total_completion_tokens = 0
+    for idx, item in enumerate(result.completions):
+        total_completion_tokens += item.tokens
+        choices.append(
+            ChatCompletionChoice(
+                index=idx,
+                message=ChatMessage(role="assistant", content=item.text.strip()),
+                finish_reason=item.finish_reason,
+            )
+        )
+
+    usage = UsageInfo(
+        prompt_tokens=result.prompt_tokens,
+        completion_tokens=total_completion_tokens,
+        total_tokens=result.prompt_tokens + total_completion_tokens,
+    )
+    return ChatCompletionResponse(model=payload.model, choices=choices, usage=usage)
+
+
+def _streaming_chat_completion(
+    payload: ChatCompletionRequest,
+    prompt: str,
+    stop_sequences: List[str],
+) -> StreamingResponse:
+    completion_id = f"chatcmpl-{uuid.uuid4().hex}"
+
+    def event_stream() -> Generator[bytes, None, None]:
+        stream = engine.create_stream(
+            payload.model,
+            prompt,
+            temperature=payload.temperature,
+            top_p=payload.top_p,
+            max_tokens=payload.max_tokens,
+            stop=stop_sequences,
+        )
+
+        # Send initial role delta
+        initial_chunk = ChatCompletionChunk(
+            id=completion_id,
+            created=int(time.time()),
+            model=payload.model,
+            choices=[
+                ChatCompletionChunkChoice(
+                    index=0,
+                    delta=ChatCompletionChunkChoiceDelta(role="assistant"),
+                    finish_reason=None,
+                )
+            ],
+        )
+        yield f"data: {initial_chunk.model_dump_json()}\n\n".encode()
+
+        for token in stream.iter_tokens():
+            chunk = ChatCompletionChunk(
+                id=completion_id,
+                created=int(time.time()),
+                model=payload.model,
+                choices=[
+                    ChatCompletionChunkChoice(
+                        index=0,
+                        delta=ChatCompletionChunkChoiceDelta(content=token),
+                        finish_reason=None,
+                    )
+                ],
+            )
+            yield f"data: {chunk.model_dump_json()}\n\n".encode()
+
+        # Send final chunk with finish_reason
+        final_chunk = ChatCompletionChunk(
+            id=completion_id,
+            created=int(time.time()),
+            model=payload.model,
+            choices=[
+                ChatCompletionChunkChoice(
+                    index=0,
+                    delta=ChatCompletionChunkChoiceDelta(),
+                    finish_reason=stream.finish_reason,
+                )
+            ],
+        )
+        yield f"data: {final_chunk.model_dump_json()}\n\n".encode()
+        yield b"data: [DONE]\n\n"
 
-    raise feature_not_available(
-        "chat_completions",
-        "Chat completions are currently disabled; please use /v1/completions instead.",
+    return StreamingResponse(
+        event_stream(),
+        media_type="text/event-stream",
     )
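The streaming branch emits OpenAI-style SSE chunks (an initial role delta, per-token content deltas, then a final chunk carrying `finish_reason`) terminated by `data: [DONE]`. A minimal consumer sketch, assuming the `requests` library and a locally running server:

```python
# Sketch: reading the SSE stream produced by _streaming_chat_completion.
import json
import requests

with requests.post(
    "http://localhost:7860/v1/chat/completions",
    json={
        "model": "GPT4-dev-177M-1511-Instruct",
        "messages": [{"role": "user", "content": "Hello!"}],
        "stream": True,
    },
    stream=True,
) as resp:
    for line in resp.iter_lines():
        # SSE frames look like: b"data: {...json...}"
        if not line or not line.startswith(b"data: "):
            continue
        data = line[len(b"data: "):]
        if data == b"[DONE]":
            break
        delta = json.loads(data)["choices"][0]["delta"]
        print(delta.get("content", ""), end="", flush=True)
```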
tests/test_openai_compat.py CHANGED
@@ -220,7 +220,17 @@ def test_completions_handles_prompt_list(monkeypatch: pytest.MonkeyPatch) -> None:
     assert body["usage"]["prompt_tokens"] == len("Hello") + len("World")
 
 
-def test_chat_disabled() -> None:
+def test_chat_rejects_non_instruct_model(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Chat completions should reject non-instruct models with a 400 error."""
+    from app.core import model_registry
+
+    # Register a non-instruct model
+    monkeypatch.setattr(
+        model_registry,
+        "_registry",
+        {"GPT3-dev": ModelSpec(name="GPT3-dev", hf_repo="k050506koch/GPT3-dev", is_instruct=False)},
+    )
+
     payload = ChatCompletionRequest.model_validate({
         "model": "GPT3-dev",
         "messages": [
@@ -230,8 +240,8 @@ def test_chat_disabled() -> None:
 
     with pytest.raises(HTTPException) as exc:
         asyncio.run(chat.create_chat_completion(payload))
-    assert exc.value.status_code == 501
-    assert exc.value.detail["code"] == "chat_completions_not_available"
+    assert exc.value.status_code == 400
+    assert "not an instruct model" in exc.value.detail["message"]
 
 
 def test_embeddings_not_implemented() -> None:
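The new test covers the rejection path; a happy-path companion could monkeypatch `engine.apply_chat_template` and `engine.generate` so no model is loaded. A hedged sketch (the fake result only carries the attributes chat.py reads; names beyond those in this diff are illustrative):

```python
# Sketch: happy-path chat test with a stubbed engine.
# Assumes _registry is a plain dict, as the test above implies.
from types import SimpleNamespace

def test_chat_instruct_model(monkeypatch: pytest.MonkeyPatch) -> None:
    from app.core import engine, model_registry

    monkeypatch.setattr(
        model_registry,
        "_registry",
        {"Instruct": ModelSpec(name="Instruct", hf_repo="user/repo", is_instruct=True)},
    )
    monkeypatch.setattr(engine, "apply_chat_template", lambda model, msgs: "prompt")
    fake = SimpleNamespace(
        prompt_tokens=3,
        completions=[SimpleNamespace(text="hi", tokens=1, finish_reason="stop")],
    )
    monkeypatch.setattr(engine, "generate", lambda *args, **kwargs: fake)

    payload = ChatCompletionRequest.model_validate({
        "model": "Instruct",
        "messages": [{"role": "user", "content": "Hello"}],
    })
    response = asyncio.run(chat.create_chat_completion(payload))
    assert response.choices[0].message.content == "hi"
    assert response.usage.total_tokens == 4
```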