comicx / comic /modal_backend.py
ASTRALK's picture
Upload comic/modal_backend.py with huggingface_hub
be67303 verified
Raw
History Blame Contribute Delete
3.14 kB
"""Real Modal backends: Gemma (vLLM OpenAI endpoint) + FLUX (resident pipeline).
Config via env (set after deploying the two Modal apps in serve/):
COMIC_GEMMA_URL - base URL of the vLLM OpenAI server, ending in /v1
COMIC_GEMMA_MODEL - served model name (default "gemma-comic")
COMIC_GEMMA_KEY - token if the endpoint uses proxy-auth (else "EMPTY")
COMIC_FLUX_APP - Modal app name for FLUX (default "comic-flux")
COMIC_FLUX_CLS - Modal class name (default "FluxRenderer")
Pattern mirrors the wisdom2 reference deployment: an OpenAI client to the vLLM /v1
endpoint for the writer, and a modal.Cls handle for the artist. The generous timeout
lets the first call after scale-to-zero ride through the GPU cold boot.
"""
from __future__ import annotations
import os
from .backends import WriterBackend, ArtistBackend
DEFAULT_GEMMA_URL = "https://keshav-public07--comic-gemma-serve.modal.run/v1"
DEFAULT_GEMMA_MODEL = "gemma-comic"
class ModalWriter(WriterBackend):
    """Writer backend that calls the Gemma vLLM server via the OpenAI client.

    Connection settings are resolved once, at construction time, from the
    constructor arguments, then the ``COMIC_GEMMA_*`` environment variables,
    then built-in defaults.
    """

    def __init__(
        self,
        base_url: str | None = None,
        model: str | None = None,
        api_key: str | None = None,
        temperature: float = 0.9,
        max_tokens: int = 16384,
        timeout: float = 900,
    ) -> None:
        """Create the OpenAI client pointed at the vLLM ``/v1`` endpoint.

        Args:
            base_url: Server base URL. Falls back to ``COMIC_GEMMA_URL``, then
                ``DEFAULT_GEMMA_URL``.
            model: Served model name. Falls back to ``COMIC_GEMMA_MODEL``, then
                ``DEFAULT_GEMMA_MODEL``.
            api_key: Token sent to the endpoint. Falls back to
                ``COMIC_GEMMA_KEY``, then the placeholder ``"EMPTY"``.
            temperature: Sampling temperature sent with every chat request.
            max_tokens: Output-token cap sent with every chat request.
            timeout: Client request timeout, passed to ``OpenAI(timeout=...)``.
        """
        # Imported here rather than at module level, so importing this module
        # does not by itself require the openai package.
        from openai import OpenAI

        # `get(...) or default` instead of `get(..., default)`: a variable that
        # is set but blank must fall through to the default rather than hand
        # the client an empty URL / model / key.
        resolved_url = (
            base_url or os.environ.get("COMIC_GEMMA_URL") or DEFAULT_GEMMA_URL
        )
        resolved_key = api_key or os.environ.get("COMIC_GEMMA_KEY") or "EMPTY"

        self.model = (
            model or os.environ.get("COMIC_GEMMA_MODEL") or DEFAULT_GEMMA_MODEL
        )
        self.temperature = temperature
        # Bibles/panel batches are large JSON; give plenty of output room.
        self.max_tokens = max_tokens
        # Default timeout is 15 min: absorbs the ~850s first-deploy cold boot
        # without timing out.
        self._client = OpenAI(
            base_url=resolved_url,
            api_key=resolved_key,
            timeout=timeout,
        )

    def chat(self, messages: list) -> str:
        """Send one non-streaming chat completion and return its text.

        Args:
            messages: Chat messages, forwarded unchanged as the ``messages``
                argument of ``chat.completions.create``.

        Returns:
            The content of the first choice, or ``""`` when that content is
            ``None`` or empty.
        """
        resp = self._client.chat.completions.create(
            model=self.model,
            messages=messages,
            stream=False,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
            # Ask vLLM for a JSON object directly when the server supports it;
            # the prompts also demand strict JSON, so this is belt-and-braces.
            response_format={"type": "json_object"},
        )
        return resp.choices[0].message.content or ""

    def warm(self) -> bool:
        """Probe the endpoint by listing its models.

        Returns:
            ``True`` if the listing call completes, ``False`` if it raises.
        """
        # Deliberately best-effort: any failure is reported as "not warm"
        # instead of propagating to the caller.
        try:
            self._client.models.list()
        except Exception:
            return False
        return True
class ModalArtist(ArtistBackend):
    """Artist backend that calls a deployed Modal class for FLUX rendering.

    The Modal app and class names are resolved once, at construction time,
    from the constructor arguments, then the ``COMIC_FLUX_*`` environment
    variables, then built-in defaults.
    """

    def __init__(self, app: str | None = None, cls: str | None = None) -> None:
        """Look up the Modal class by name and keep an instance handle.

        Args:
            app: Modal app name. Falls back to ``COMIC_FLUX_APP``, then
                ``"comic-flux"``.
            cls: Modal class name. Falls back to ``COMIC_FLUX_CLS``, then
                ``"FluxRenderer"``.
        """
        # Imported here rather than at module level, so importing this module
        # does not by itself require the modal package.
        import modal

        # `get(...) or default` instead of `get(..., default)`: a variable that
        # is set but blank must fall through to the default rather than be
        # used as an empty app / class name.
        app_name = app or os.environ.get("COMIC_FLUX_APP") or "comic-flux"
        cls_name = cls or os.environ.get("COMIC_FLUX_CLS") or "FluxRenderer"

        renderer_cls = modal.Cls.from_name(app_name, cls_name)
        self._obj = renderer_cls()

    def render(self, prompt: str, seed: int = 0) -> bytes:
        """Render one prompt remotely and return the renderer's result.

        Args:
            prompt: Prompt forwarded to the remote ``render`` method.
            seed: Seed forwarded to the remote ``render`` method.

        Returns:
            Whatever the remote ``render`` returns; annotated as ``bytes``
            (presumably encoded image data; confirm against the deployed
            class).
        """
        return self._obj.render.remote(prompt, seed)

    def render_batch(self, prompts: list, seeds: list) -> list:
        """Render several prompts in one remote call.

        Args:
            prompts: Prompts for the batch.
            seeds: Seeds for the batch (presumably one per prompt; the lengths
                are not checked here).

        Returns:
            Whatever the remote ``render_batch`` returns; annotated as a list.
        """
        # Both arguments are copied into plain lists before being sent.
        return self._obj.render_batch.remote(list(prompts), list(seeds))

    def warm(self) -> bool:
        """Call the remote ``warm`` method as a readiness probe.

        Returns:
            The truthiness of the remote result, or ``False`` if the call
            raises.
        """
        # Deliberately best-effort: any failure is reported as "not warm"
        # instead of propagating to the caller.
        try:
            return bool(self._obj.warm.remote())
        except Exception:
            return False