"""Modal GPU endpoints for Tabras: MiniCPM card authoring, Nemotron boss play,
and SDXL-Turbo art. The Gradio Space calls these over HTTP, so all heavy compute
runs on dedicated, autoscaled Modal GPUs instead of the Space.

Deploy:
    modal deploy modal_app.py

Then set the printed URLs as Space variables:
    TABRAS_CARD_ENDPOINT -> CardModel.chat URL
    TABRAS_BOSS_ENDPOINT -> BossModel.chat URL
    TABRAS_ART_ENDPOINT  -> ArtModel.generate URL
"""
import modal

# Directory exported as HF_HOME inside both images.
CACHE = "/cache"

# Named Modal volume, created on first use.
# NOTE(review): nothing visible in this file mounts `hf_cache` at CACHE; confirm
# the class definitions attach it, otherwise weights are not persisted.
hf_cache = modal.Volume.from_name("tabras-hf-cache", create_if_missing=True)

# Hugging Face repo ids of the three models.
MINICPM = "openbmb/MiniCPM-V-4"
NEMOTRON = "nvidia/Nemotron-Mini-4B-Instruct"
SDXL = "stabilityai/sdxl-turbo"

_LLM_PACKAGES = (
    "torch", "transformers==4.49.0", "accelerate", "sentencepiece",
    "torchvision", "einops", "pillow", "fastapi[standard]",
)

# diffusers 0.31 supports SDXL-Turbo and is compatible with transformers
# 4.49; newer diffusers (0.35+) imports flux2 which needs Qwen3ForCausalLM.
_ART_PACKAGES = (
    "torch", "diffusers==0.31.0", "transformers==4.49.0", "accelerate",
    "safetensors", "pillow", "fastapi[standard]",
)


def _build_image(*packages: str) -> modal.Image:
    """Return a Debian-slim Python 3.11 image with *packages* and HF_HOME=CACHE."""
    base = modal.Image.debian_slim(python_version="3.11")
    return base.pip_install(*packages).env({"HF_HOME": CACHE})


# Image for the two language models (MiniCPM and Nemotron).
llm_image = _build_image(*_LLM_PACKAGES)
# Image for the SDXL-Turbo art pipeline.
art_image = _build_image(*_ART_PACKAGES)

app = modal.App("tabras-models")
# ---- MiniCPM: card authoring (OpenAI-compatible chat) ----
class CardModel:
    """Answer OpenAI-style chat requests with MiniCPM for card authoring.

    NOTE(review): no Modal decorators (app registration, container-start hook,
    web endpoint) are visible on this class in this view. Confirm they exist;
    as written nothing calls ``load`` before ``chat``.
    """

    def load(self) -> None:
        """Load the MiniCPM tokenizer and the fp16 model, and move the model to CUDA."""
        import torch
        from transformers import AutoModel, AutoTokenizer

        self.tok = AutoTokenizer.from_pretrained(MINICPM, trust_remote_code=True)
        self.model = (
            AutoModel.from_pretrained(
                MINICPM,
                trust_remote_code=True,
                attn_implementation="sdpa",
                torch_dtype=torch.float16,
            )
            .eval()
            .cuda()
        )

    @staticmethod
    def _content_text(content: object) -> str:
        """Flatten one message ``content`` value to plain text.

        Strings pass through unchanged. A list of parts (plain strings or
        dicts carrying a string ``"text"`` field) is joined with spaces; parts
        without text, such as image parts, are ignored.
        """
        if isinstance(content, str):
            return content
        if isinstance(content, (list, tuple)):
            texts = []
            for part in content:
                if isinstance(part, str):
                    texts.append(part)
                elif isinstance(part, dict) and isinstance(part.get("text"), str):
                    texts.append(part["text"])
            return " ".join(texts)
        return str(content)

    @classmethod
    def _joined(cls, msgs: list, role: str) -> str:
        """Space-join the text of every message in *msgs* that has the given *role*."""
        texts = []
        for msg in msgs:
            if msg.get("role") != role:
                continue
            content = msg.get("content")
            # A missing or null content contributes nothing, instead of raising.
            if content is None:
                continue
            texts.append(cls._content_text(content))
        return " ".join(texts)

    def chat(self, item: dict) -> dict:
        """Run one chat completion.

        Args:
            item: Request body. Reads ``messages`` (list of ``{"role", "content"}``
                dicts), ``temperature`` (default 0.7) and ``max_tokens``
                (default 128). A missing or ``None`` value falls back to the
                default. Only ``system`` and ``user`` messages are used; all
                system texts are merged into one system prompt and all user
                texts into a single user turn.

        Returns:
            ``{"choices": [{"message": {"role": "assistant", "content": <text>}}]}``.
        """
        msgs = item.get("messages") or []
        system = self._joined(msgs, "system")
        user = self._joined(msgs, "user")

        temp = item.get("temperature")
        temp = 0.7 if temp is None else float(temp)
        max_tokens = item.get("max_tokens")
        max_new_tokens = 128 if max_tokens is None else int(max_tokens)

        text = str(
            self.model.chat(
                msgs=[{"role": "user", "content": user}],
                image=None,
                tokenizer=self.tok,
                system_prompt=system,
                # A temperature of 0 or below selects non-sampling decoding.
                sampling=temp > 0,
                # The value passed on is floored at 0.01.
                temperature=max(temp, 0.01),
                max_new_tokens=max_new_tokens,
            )
        )
        return {"choices": [{"message": {"role": "assistant", "content": text}}]}
# ---- Nemotron: boss play (OpenAI-compatible chat) ----
class BossModel:
    """Answer OpenAI-style chat requests with Nemotron for boss play.

    NOTE(review): no Modal decorators (app registration, container-start hook,
    web endpoint) are visible on this class in this view. Confirm they exist;
    as written nothing calls ``load`` before ``chat``.
    """

    def load(self) -> None:
        """Load the Nemotron tokenizer and the fp16 causal LM, and move the model to CUDA."""
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer

        self.tok = AutoTokenizer.from_pretrained(NEMOTRON)
        self.model = (
            AutoModelForCausalLM.from_pretrained(NEMOTRON, torch_dtype=torch.float16)
            .eval()
            .cuda()
        )

    def chat(self, item: dict) -> dict:
        """Run one chat completion.

        Args:
            item: Request body. Reads ``messages`` (passed to the tokenizer's
                chat template), ``temperature`` (default 0.2) and
                ``max_tokens`` (default 96). A missing or ``None`` value falls
                back to the default.

        Returns:
            ``{"choices": [{"message": {"role": "assistant", "content": <text>}}]}``
            where ``<text>`` is the decoded continuation only, without the prompt.
        """
        import torch

        msgs = item.get("messages") or []
        temp = item.get("temperature")
        temp = 0.2 if temp is None else float(temp)
        max_tokens = item.get("max_tokens")
        max_new_tokens = 96 if max_tokens is None else int(max_tokens)

        # return_dict=True also yields the attention mask, so generate() does
        # not have to guess it from the pad token.
        enc = self.tok.apply_chat_template(
            msgs,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to("cuda")

        sample = temp > 0
        gen_kwargs = {"max_new_tokens": max_new_tokens, "do_sample": sample}
        if sample:
            # Temperature is only sent when sampling; passing it together with
            # do_sample=False makes transformers warn on every greedy call.
            gen_kwargs["temperature"] = max(temp, 0.01)

        with torch.no_grad():
            out = self.model.generate(**enc, **gen_kwargs)

        # Drop the prompt tokens so only the newly generated text is returned.
        prompt_len = enc["input_ids"].shape[-1]
        text = self.tok.decode(out[0][prompt_len:], skip_special_tokens=True)
        return {"choices": [{"message": {"role": "assistant", "content": text}}]}
# ---- SDXL-Turbo: card art (returns a JPEG data URI) ----
class ArtModel:
    """Render card art with SDXL-Turbo and return it as a JPEG data URI.

    NOTE(review): no Modal decorators (app registration, container-start hook,
    web endpoint) are visible on this class in this view. Confirm they exist;
    as written nothing calls ``load`` before ``generate``.
    """

    def load(self) -> None:
        """Load the fp16 text-to-image pipeline onto CUDA and silence its progress bar."""
        import torch
        from diffusers import AutoPipelineForText2Image

        self.pipe = AutoPipelineForText2Image.from_pretrained(
            SDXL, torch_dtype=torch.float16
        ).to("cuda")
        self.pipe.set_progress_bar_config(disable=True)

    @staticmethod
    def _snap8(value: object, default: int) -> int:
        """Return *value* (or *default* when ``None``) rounded down to a multiple of 8, minimum 8."""
        pixels = default if value is None else int(value)
        return max(8, (pixels // 8) * 8)

    def generate(self, item: dict) -> dict:
        """Generate one image.

        Args:
            item: Request body. ``prompt`` is required. Optional: ``steps``
                (default 4, floored at 1), ``guidance`` (default 0.0),
                ``width`` (default 512), ``height`` (default 320) and
                ``negative_prompt``. ``None`` for a numeric field falls back
                to its default. Width and height are rounded down to a
                multiple of 8, because the SDXL pipeline rejects other sizes.

        Returns:
            ``{"image": "data:image/jpeg;base64,<...>"}``.

        Raises:
            KeyError: if ``prompt`` is missing.
        """
        import base64
        from io import BytesIO

        steps = item.get("steps")
        guidance = item.get("guidance")

        result = self.pipe(
            prompt=item["prompt"],
            num_inference_steps=max(1, 4 if steps is None else int(steps)),
            guidance_scale=0.0 if guidance is None else float(guidance),
            width=self._snap8(item.get("width"), 512),
            height=self._snap8(item.get("height"), 320),
            negative_prompt=item.get("negative_prompt"),
        )

        buffer = BytesIO()
        result.images[0].save(buffer, format="JPEG", quality=85)
        encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
        return {"image": "data:image/jpeg;base64," + encoded}