"""Modal GPU endpoints for Tabras: MiniCPM card authoring, Nemotron boss play, and SDXL-Turbo art. The Gradio Space calls these over HTTP, so all heavy compute runs on dedicated, autoscaled Modal GPUs instead of the Space. Deploy: modal deploy modal_app.py Then set the printed URLs as Space variables: TABRAS_CARD_ENDPOINT -> CardModel.chat URL TABRAS_BOSS_ENDPOINT -> BossModel.chat URL TABRAS_ART_ENDPOINT -> ArtModel.generate URL """ import modal CACHE = "/cache" hf_cache = modal.Volume.from_name("tabras-hf-cache", create_if_missing=True) MINICPM = "openbmb/MiniCPM-V-4" NEMOTRON = "nvidia/Nemotron-Mini-4B-Instruct" SDXL = "stabilityai/sdxl-turbo" llm_image = ( modal.Image.debian_slim(python_version="3.11") .pip_install( "torch", "transformers==4.49.0", "accelerate", "sentencepiece", "torchvision", "einops", "pillow", "fastapi[standard]", ) .env({"HF_HOME": CACHE}) ) art_image = ( modal.Image.debian_slim(python_version="3.11") .pip_install( # diffusers 0.31 supports SDXL-Turbo and is compatible with transformers # 4.49; newer diffusers (0.35+) imports flux2 which needs Qwen3ForCausalLM. "torch", "diffusers==0.31.0", "transformers==4.49.0", "accelerate", "safetensors", "pillow", "fastapi[standard]", ) .env({"HF_HOME": CACHE}) ) app = modal.App("tabras-models") # ---- MiniCPM: card authoring (OpenAI-compatible chat) ---- @app.cls(image=llm_image, gpu="A10G", volumes={CACHE: hf_cache}, min_containers=3, max_containers=4, scaledown_window=600, timeout=600) class CardModel: @modal.enter() def load(self) -> None: import torch from transformers import AutoModel, AutoTokenizer self.tok = AutoTokenizer.from_pretrained(MINICPM, trust_remote_code=True) self.model = ( AutoModel.from_pretrained(MINICPM, trust_remote_code=True, attn_implementation="sdpa", torch_dtype=torch.float16) .eval() .cuda() ) @modal.fastapi_endpoint(method="POST") def chat(self, item: dict) -> dict: msgs = item.get("messages", []) system = " ".join(m["content"] for m in msgs if m.get("role") == "system") user = " ".join(m["content"] for m in msgs if m.get("role") == "user") temp = float(item.get("temperature", 0.7)) text = str( self.model.chat( msgs=[{"role": "user", "content": user}], image=None, tokenizer=self.tok, system_prompt=system, sampling=temp > 0, temperature=max(temp, 0.01), max_new_tokens=int(item.get("max_tokens", 128)), ) ) return {"choices": [{"message": {"role": "assistant", "content": text}}]} # ---- Nemotron: boss play (OpenAI-compatible chat) ---- @app.cls(image=llm_image, gpu="A10G", volumes={CACHE: hf_cache}, min_containers=1, max_containers=2, scaledown_window=600, timeout=600) class BossModel: @modal.enter() def load(self) -> None: import torch from transformers import AutoModelForCausalLM, AutoTokenizer self.tok = AutoTokenizer.from_pretrained(NEMOTRON) self.model = AutoModelForCausalLM.from_pretrained(NEMOTRON, torch_dtype=torch.float16).eval().cuda() @modal.fastapi_endpoint(method="POST") def chat(self, item: dict) -> dict: import torch msgs = item.get("messages", []) inputs = self.tok.apply_chat_template(msgs, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda") temp = float(item.get("temperature", 0.2)) with torch.no_grad(): out = self.model.generate( inputs, max_new_tokens=int(item.get("max_tokens", 96)), do_sample=temp > 0, temperature=max(temp, 0.01), ) text = self.tok.decode(out[0][inputs.shape[-1]:], skip_special_tokens=True) return {"choices": [{"message": {"role": "assistant", "content": text}}]} # ---- SDXL-Turbo: card art (returns a JPEG data URI) ---- @app.cls(image=art_image, gpu="A10G", volumes={CACHE: hf_cache}, min_containers=3, max_containers=4, scaledown_window=600, timeout=600) class ArtModel: @modal.enter() def load(self) -> None: import torch from diffusers import AutoPipelineForText2Image self.pipe = AutoPipelineForText2Image.from_pretrained(SDXL, torch_dtype=torch.float16).to("cuda") self.pipe.set_progress_bar_config(disable=True) @modal.fastapi_endpoint(method="POST") def generate(self, item: dict) -> dict: import base64 from io import BytesIO result = self.pipe( prompt=item["prompt"], num_inference_steps=int(item.get("steps", 4)), guidance_scale=float(item.get("guidance", 0.0)), width=int(item.get("width", 512)), height=int(item.get("height", 320)), negative_prompt=item.get("negative_prompt"), ) buffer = BytesIO() result.images[0].save(buffer, format="JPEG", quality=85) return {"image": "data:image/jpeg;base64," + base64.b64encode(buffer.getvalue()).decode("ascii")}