illuma

+"""
+Illuma (BLIP3o-NEXT-GRPO-TexT-3B) - Custom Handler for Hugging Face Inference Endpoints
+This handler enables running the illuma image generation model as a production API
+on Hugging Face Inference Endpoints with a dedicated GPU.
+Architecture: Qwen2.5 VL AR (3B) + SANA 1.5 Diffusion Decoder
+License: Apache 2.0
+"""
+import os
+import base64
+import io
+import torch
+from typing import Any, Dict
+from PIL import Image
+from dataclasses import dataclass
+from transformers import AutoTokenizer
+from blip3o.model import *
+@dataclass
+class T2IConfig:
+    model_path: str = ""
+    device: str = "cuda:0"
+    dtype: torch.dtype = torch.bfloat16
+    scale: int = 0
+    seq_len: int = 729
+    top_p: float = 0.95
+    top_k: int = 1200
+class EndpointHandler:
+    """Custom inference handler for Illuma (BLIP3o-NEXT) image generation."""
+    def __init__(self, model_dir: str, **kwargs: Any) -> None:
+        """Load the model and tokenizer on startup."""
+        self.config = T2IConfig(model_path=model_dir)
+        self.device = torch.device(self.config.device if torch.cuda.is_available() else "cpu")
+        print(f"[Illuma] Loading model from: {model_dir}")
+        print(f"[Illuma] Device: {self.device}")
+        self.model = blip3oQwenForInferenceLM.from_pretrained(
+            self.config.model_path,
+            torch_dtype=self.config.dtype
+        ).to(self.device)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_path)
+        print("[Illuma] Model loaded successfully!")
+    def __call__(self, data: Dict[str, Any]) -> Any:
+        """
+        Generate an image from a text prompt.
+        Input (JSON):
+        {
+            "inputs": "A neon sign that says HELLO",
+            "parameters": {
+                "seq_len": 729,
+                "top_p": 0.95,
+                "top_k": 1200,
+                "guidance_scale": 3.0
+            }
+        }
+        Output:
+        - Returns base64-encoded PNG image
+        - Or raw PNG bytes if Content-Type is set
+        """
+        # Extract prompt
+        prompt = data.get("inputs", "")
+        if not prompt:
+            return {"error": "No prompt provided. Send {'inputs': 'your prompt here'}"}
+        # Extract optional parameters
+        parameters = data.get("parameters", {})
+        seq_len = parameters.get("seq_len", self.config.seq_len)
+        top_p = parameters.get("top_p", self.config.top_p)
+        top_k = parameters.get("top_k", self.config.top_k)
+        print(f"[Illuma] Generating image for: {prompt[:100]}...")
+        try:
+            image = self._generate(prompt, seq_len, top_p, top_k)
+            return self._encode_image(image)
+        except Exception as e:
+            print(f"[Illuma] Error generating image: {e}")
+            return {"error": str(e)}
+    def _generate(self, prompt: str, seq_len: int, top_p: float, top_k: float) -> Image.Image:
+        """Generate image using the BLIP3o-NEXT inference pipeline."""
+        messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": f"Please generate image based on the following caption: {prompt}"}
+        ]
+        input_text = self.tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        input_text += "\n"
+        inputs = self.tokenizer(
+            [input_text],
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            padding_side="left"
+        )
+        gen_ids, output_image = self.model.generate_images(
+            inputs.input_ids.to(self.device),
+            inputs.attention_mask.to(self.device),
+            max_new_tokens=seq_len,
+            do_sample=True,
+            top_p=top_p,
+            top_k=top_k
+        )
+        return output_image[0]
+    def _encode_image(self, image: Image.Image) -> Dict[str, str]:
+        """Encode PIL Image to base64 for API response."""
+        buffered = io.BytesIO()
+        image.save(buffered, format="PNG")
+        img_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
+        return {"image": img_b64}
+# For local testing
+if __name__ == "__main__":
+    handler = EndpointHandler(model_dir="Salesforce/BLIP3o-NEXT-GRPO-TexT-3B")
+    result = handler({"inputs": "A neon sign that says ILLUMA"})
+    print(f"Generated image, base64 length: {len(result.get('image', ''))}")