Spaces:

build-small-hackathon
/

comicx

Running on Zero

App Files Files Community

comicx / comic /modal_backend.py

ASTRALK

Upload comic/modal_backend.py with huggingface_hub

be67303 verified 20 days ago

Raw

History Blame Contribute Delete

3.14 kB

	"""Real Modal backends: Gemma (vLLM OpenAI endpoint) + FLUX (resident pipeline).

	Config via env (set after deploying the two Modal apps in serve/):
	COMIC_GEMMA_URL - base URL of the vLLM OpenAI server, ending in /v1
	COMIC_GEMMA_MODEL - served model name (default "gemma-comic")
	COMIC_GEMMA_KEY - token if the endpoint uses proxy-auth (else "EMPTY")
	COMIC_FLUX_APP - Modal app name for FLUX (default "comic-flux")
	COMIC_FLUX_CLS - Modal class name for FLUX (default "FluxRenderer")

	Pattern mirrors the wisdom2 reference deployment: an OpenAI client to the vLLM /v1
	endpoint for the writer, and a modal.Cls handle for the artist. The generous timeout
	lets the first call after scale-to-zero ride through the GPU cold boot.
	"""

	from __future__ import annotations

	import os

	from .backends import WriterBackend, ArtistBackend

	# Fallback base URL of the vLLM OpenAI-compatible server (ends in /v1); used by
	# ModalWriter when neither a base_url argument nor COMIC_GEMMA_URL is provided.
	DEFAULT_GEMMA_URL = "https://keshav-public07--comic-gemma-serve.modal.run/v1"
	# Fallback served-model name; used by ModalWriter when neither a model argument
	# nor COMIC_GEMMA_MODEL is provided.
	DEFAULT_GEMMA_MODEL = "gemma-comic"


	class ModalWriter(WriterBackend):
	    """Writer backend that calls the Gemma vLLM server through its OpenAI-compatible API.

	    Each setting is resolved in order: explicit constructor argument, then the
	    matching ``COMIC_GEMMA_*`` environment variable, then a built-in default.
	    An environment variable that is set but empty is treated as unset, so a
	    blank variable slot cannot silently produce an empty URL, model or key.
	    """

	    def __init__(self, base_url: str | None = None, model: str | None = None,
	                 api_key: str | None = None, temperature: float = 0.9,
	                 max_tokens: int = 16384, timeout: float = 900):
	        """Build the OpenAI client pointed at the vLLM endpoint.

	        Args:
	            base_url: Base URL of the server; falls back to ``COMIC_GEMMA_URL``,
	                then ``DEFAULT_GEMMA_URL``.
	            model: Served model name; falls back to ``COMIC_GEMMA_MODEL``,
	                then ``DEFAULT_GEMMA_MODEL``.
	            api_key: Token sent to the endpoint; falls back to
	                ``COMIC_GEMMA_KEY``, then the placeholder ``"EMPTY"``.
	            temperature: Sampling temperature forwarded on every chat call.
	            max_tokens: Output token cap forwarded on every chat call.
	                Bibles/panel batches are large JSON, so the default is generous.
	            timeout: Client timeout in seconds. The 900 s (15 min) default
	                absorbs the ~850 s first-deploy cold boot without timing out.
	        """
	        # Imported here rather than at module level, so importing this module
	        # does not itself require the openai package.
	        from openai import OpenAI

	        # `get(...) or default` (not `get(..., default)`): a variable that is
	        # present but empty must fall through to the default as well.
	        resolved_url = (
	            base_url or os.environ.get("COMIC_GEMMA_URL") or DEFAULT_GEMMA_URL
	        )
	        resolved_key = api_key or os.environ.get("COMIC_GEMMA_KEY") or "EMPTY"

	        self.model = (
	            model or os.environ.get("COMIC_GEMMA_MODEL") or DEFAULT_GEMMA_MODEL
	        )
	        self.temperature = temperature
	        self.max_tokens = max_tokens
	        self._client = OpenAI(
	            base_url=resolved_url,
	            api_key=resolved_key,
	            timeout=timeout,
	        )

	    def chat(self, messages: list) -> str:
	        """Send ``messages`` as one non-streaming chat completion.

	        Returns:
	            The content of the first choice, or ``""`` when that content is
	            empty or ``None``.
	        """
	        resp = self._client.chat.completions.create(
	            model=self.model,
	            messages=messages,
	            stream=False,
	            temperature=self.temperature,
	            max_tokens=self.max_tokens,
	            # Ask vLLM for a JSON object directly when the server supports it; the
	            # prompts also demand strict JSON, so this is belt-and-braces.
	            response_format={"type": "json_object"},
	        )
	        return resp.choices[0].message.content or ""

	    def warm(self) -> bool:
	        """Probe the endpoint by listing its models.

	        Returns:
	            ``True`` if the listing call succeeds, ``False`` if it raises.
	        """
	        try:
	            self._client.models.list()
	        except Exception:
	            # Deliberate best-effort probe: any failure just means "not warm".
	            return False
	        return True


	class ModalArtist(ArtistBackend):
	    """Artist backend that forwards render calls to a deployed Modal class.

	    The app and class names are resolved in order: explicit constructor
	    argument, then ``COMIC_FLUX_APP`` / ``COMIC_FLUX_CLS``, then the built-in
	    defaults. An environment variable that is set but empty is treated as
	    unset, so a blank variable slot cannot produce an empty lookup name.
	    """

	    def __init__(self, app: str | None = None, cls: str | None = None):
	        """Look up the Modal class by name and create a handle to it.

	        Args:
	            app: Modal app name; falls back to ``COMIC_FLUX_APP``, then
	                ``"comic-flux"``.
	            cls: Modal class name; falls back to ``COMIC_FLUX_CLS``, then
	                ``"FluxRenderer"``.
	        """
	        # Imported here rather than at module level, so importing this module
	        # does not itself require the modal package.
	        import modal

	        # `get(...) or default` (not `get(..., default)`): a variable that is
	        # present but empty must fall through to the default as well.
	        app_name = app or os.environ.get("COMIC_FLUX_APP") or "comic-flux"
	        cls_name = cls or os.environ.get("COMIC_FLUX_CLS") or "FluxRenderer"

	        renderer_cls = modal.Cls.from_name(app_name, cls_name)
	        self._obj = renderer_cls()

	    def render(self, prompt: str, seed: int = 0) -> bytes:
	        """Call the remote ``render`` method with ``prompt`` and ``seed``.

	        Returns:
	            Whatever the remote method returns (annotated here as bytes).
	        """
	        return self._obj.render.remote(prompt, seed)

	    def render_batch(self, prompts: list, seeds: list) -> list:
	        """Call the remote ``render_batch`` method with the prompts and seeds.

	        Both arguments are copied into plain lists before being sent.
	        """
	        return self._obj.render_batch.remote(list(prompts), list(seeds))

	    def warm(self) -> bool:
	        """Call the remote ``warm`` method as a readiness probe.

	        Returns:
	            The truthiness of the remote result, or ``False`` if the call raises.
	        """
	        try:
	            return bool(self._obj.warm.remote())
	        except Exception:
	            # Deliberate best-effort probe: any failure just means "not warm".
	            return False