# OpenAPI document version this file is written against.
openapi: 3.1.0
# Metadata displayed by documentation tooling.
info:
  version: '1.0.0'
  title: 'Multi-Agent Land — Model Serving API'
  # Folded scalar: single line breaks become spaces, the blank line separates
  # the two paragraphs.
  description: >
    OpenAI-compatible inference API served by the Modal apps in this
    repo (`nvidia-llms`, `openbmb-llms`, `google-llms`). Each model is
    exposed as its own endpoint and speaks the OpenAI REST protocol, so
    any OpenAI-compatible client works by pointing `base_url` at the
    endpoint URL.

    Every running endpoint also serves a live, model-specific spec at
    `/openapi.json` and an interactive Swagger UI at `/docs`. This
    checked-in spec documents the shared, stable surface across all
    endpoints.
# A single templated server entry; substituting both variables yields the
# concrete base URL of one model endpoint.
servers:
  - description: Per-model Modal endpoint (one server per model).
    url: 'https://{workspace}--{endpoint}.modal.run/v1'
    variables:
      workspace:
        description: Your Modal workspace slug.
        default: your-workspace
      endpoint:
        description: >
          The Modal function label: <app-name>-<endpoint_name>, where
          app-name is the modal.App (nvidia-llms / openbmb-llms /
          google-llms) and endpoint_name is the per-model slug from
          registry.py.
        # Allowed endpoint labels; the default below is one of them.
        enum:
          - nvidia-llms-nemotron-3-nano-4b
          - openbmb-llms-minicpm-o-4-5
          - openbmb-llms-minicpm-4-1-8b
          - google-llms-gemma-4-26b
          - google-llms-gemma-4-12b
        default: google-llms-gemma-4-12b
# Global security requirement, applied to every operation.
# Bearer auth is only enforced on auth-enabled deploys (see the bearerAuth
# scheme description under components), so the empty requirement `{}` is
# listed as an alternative: a request satisfies the spec either with no
# credentials or with a bearer token.
security:
  - {}
  - bearerAuth: []
paths:
  # GET /models — lists the model(s) behind this endpoint.
  /models:
    get:
      operationId: listModels
      summary: List the model(s) served by this endpoint.
      responses:
        "200":
          description: Available models.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ModelList"
        # This operation falls under the same global bearerAuth requirement as
        # the completion operations, so it declares the same 401 response.
        "401":
          $ref: "#/components/responses/Unauthorized"
  # POST /chat/completions — OpenAI-compatible chat completion.
  /chat/completions:
    post:
      operationId: createChatCompletion
      summary: Create a chat completion (OpenAI-compatible).
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/ChatCompletionRequest"
      responses:
        "200":
          description: Chat completion. A stream of SSE chunks when `stream` is true.
          content:
            # Returned when `stream` is false (the default).
            application/json:
              schema:
                $ref: "#/components/schemas/ChatCompletionResponse"
            # Returned when `stream` is true. Declared so clients and code
            # generators know the response is not a single JSON document.
            text/event-stream:
              schema:
                type: string
                description: >-
                  Server-sent events stream. Each event's `data:` line carries
                  one JSON-encoded completion chunk; OpenAI-compatible servers
                  terminate the stream with `data: [DONE]`.
        "401":
          $ref: "#/components/responses/Unauthorized"
  # POST /completions — OpenAI-compatible text completion.
  /completions:
    post:
      operationId: createCompletion
      summary: Create a text completion (OpenAI-compatible).
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/CompletionRequest"
      responses:
        "200":
          description: Text completion. A stream of SSE chunks when `stream` is true.
          content:
            # Returned when `stream` is false (the default).
            application/json:
              schema:
                $ref: "#/components/schemas/CompletionResponse"
            # CompletionRequest accepts `stream`, so the streamed media type
            # is documented here as well.
            text/event-stream:
              schema:
                type: string
                description: >-
                  Server-sent events stream. Each event's `data:` line carries
                  one JSON-encoded completion chunk; OpenAI-compatible servers
                  terminate the stream with `data: [DONE]`.
        "401":
          $ref: "#/components/responses/Unauthorized"
components:
  # Auth scheme referenced by the top-level `security` key.
  securitySchemes:
    bearerAuth:
      description: >
        Required only when the apps are deployed with
        MODAL_LLM_REQUIRE_AUTH=1. The token is the value of the
        `llm-api-key` Modal Secret (VLLM_API_KEY). Without auth the
        endpoints are public and any token is accepted.
      # HTTP "Authorization: Bearer <token>" authentication.
      type: http
      scheme: bearer
  # Shared response objects, pulled into operations via $ref.
  responses:
    Unauthorized:
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/Error'
      description: Missing or invalid bearer token (auth-enabled deploys only).
  schemas:
    # One entry of the model list.
    Model:
      type: object
      properties:
        id:
          description: 'Served model id (the Hugging Face repo id), e.g. "google/gemma-4-12B".'
          type: string
        # Fixed discriminator value for this object kind.
        object:
          const: model
          type: string
        created:
          type: integer
        owned_by:
          type: string
    # Envelope returned by GET /models: a fixed `object` tag plus the entries.
    ModelList:
      type: object
      properties:
        object:
          const: list
          type: string
        data:
          items:
            $ref: '#/components/schemas/Model'
          type: array
    # A single conversation turn. Used for the request `messages` array and for
    # the message inside each response choice, so it must also describe
    # assistant turns that carry tool calls instead of text.
    ChatMessage:
      type: object
      required: [role, content]
      properties:
        role:
          type: string
          enum: [system, user, assistant, tool]
        content:
          description: >
            Plain text, or an array of content parts for multimodal models
            (e.g. MiniCPM-o) with `type` of text / image_url / input_audio.
            May be null on assistant messages that carry `tool_calls`
            instead of text.
          oneOf:
            - type: string
            - type: array
              items:
                type: object
            # Quoted on purpose: the JSON Schema type name is the string
            # "null"; an unquoted null would parse as a YAML null value.
            - type: "null"
        tool_calls:
          type: array
          description: Tool invocations requested by the model (assistant messages only).
          items:
            type: object
        tool_call_id:
          type: string
          description: Id of the tool call this message responds to (tool messages only).
    # Request body for POST /chat/completions.
    ChatCompletionRequest:
      type: object
      required:
        - model
        - messages
      properties:
        model:
          description: Served model id (must match the endpoint's model).
          type: string
        messages:
          items:
            $ref: '#/components/schemas/ChatMessage'
          type: array
        max_tokens:
          type: integer
        # Sampling controls.
        temperature:
          default: 1.0
          type: number
        top_p:
          default: 1.0
          type: number
        # When true the response is delivered as SSE chunks.
        stream:
          default: false
          type: boolean
        # Tool calling.
        tools:
          description: Tool/function definitions (models with a tool_call_parser).
          items:
            type: object
          type: array
        tool_choice:
          description: 'auto | none | required | a specific tool.'
          oneOf:
            - type: string
            - type: object
    # JSON response body of POST /chat/completions.
    ChatCompletionResponse:
      type: object
      properties:
        id:
          type: string
        # Fixed discriminator value for this object kind.
        object:
          const: chat.completion
          type: string
        created:
          type: integer
        model:
          type: string
        # One element per generated alternative.
        choices:
          items:
            type: object
            properties:
              index:
                type: integer
              message:
                $ref: '#/components/schemas/ChatMessage'
              finish_reason:
                type: string
          type: array
        usage:
          $ref: '#/components/schemas/Usage'
    # Request body for POST /completions.
    CompletionRequest:
      type: object
      required:
        - model
        - prompt
      properties:
        model:
          type: string
        # Either a single prompt string or a list of prompt strings.
        prompt:
          oneOf:
            - type: string
            - items:
                type: string
              type: array
        max_tokens:
          type: integer
        temperature:
          type: number
        stream:
          default: false
          type: boolean
    # JSON response body of POST /completions.
    CompletionResponse:
      type: object
      properties:
        id:
          type: string
        # Fixed discriminator value for this object kind.
        object:
          type: string
          const: text_completion
        # Documented for parity with ChatCompletionResponse; the OpenAI
        # text-completion object carries the same field.
        created:
          type: integer
          description: Unix timestamp (seconds) of when the completion was created.
        model:
          type: string
        # One element per generated alternative.
        choices:
          type: array
          items:
            type: object
            properties:
              index:
                type: integer
              text:
                type: string
              finish_reason:
                type: string
        usage:
          $ref: "#/components/schemas/Usage"
    # Token accounting attached to completion responses; all three counters
    # are integers.
    Usage:
      type: object
      properties:
        prompt_tokens: {type: integer}
        completion_tokens: {type: integer}
        total_tokens: {type: integer}
    # Error envelope referenced by the shared Unauthorized response.
    # NOTE(review): confirm the body the server actually emits on 401 matches
    # this nested shape (in particular that `code` is a string).
    Error:
      type: object
      properties:
        error:
          type: object
          properties:
            message: {type: string}
            type: {type: string}
            code: {type: string}