samwell
/

cosmos-predict2-endpoint

Model card Files Files and versions

xet

Community

samwell committed on Jan 16

Commit

dd3f963

verified ·

1 Parent(s): 01565f9

FastAPI handler for custom container

Browse files

Files changed (1) hide show

handler.py +87 -53

handler.py CHANGED Viewed

@@ -1,71 +1,105 @@
 import torch
 import base64
 import io
-from typing import Dict, Any
 from PIL import Image
-class EndpointHandler:
-    def __init__(self, path: str = ""):
-        from diffusers import Cosmos2VideoToWorldPipeline
-        from diffusers.utils import export_to_video
-        self.export_to_video = export_to_video
-        model_id = "nvidia/Cosmos-Predict2-2B-Video2World"
-        self.pipe = Cosmos2VideoToWorldPipeline.from_pretrained(
-            model_id,
-            torch_dtype=torch.bfloat16,
-        )
-        self.pipe.to("cuda")
-    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
-        inputs = data.get("inputs", data)
-        image_data = inputs.get("image")
-        if not image_data:
-            return {"error": "No image provided"}
-        try:
-            if image_data.startswith("http"):
-                from diffusers.utils import load_image
-                image = load_image(image_data)
-            else:
-                image_bytes = base64.b64decode(image_data)
-                image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
-        except Exception as e:
-            return {"error": f"Failed to load image: {str(e)}"}
-        prompt = inputs.get("prompt", "")
-        if not prompt:
-            return {"error": "No prompt provided"}
-        negative_prompt = inputs.get("negative_prompt", "ugly, static, blurry, low quality")
-        num_frames = inputs.get("num_frames", 93)
-        num_inference_steps = inputs.get("num_inference_steps", 35)
-        guidance_scale = inputs.get("guidance_scale", 7.0)
-        seed = inputs.get("seed")
-        generator = torch.Generator(device="cuda").manual_seed(int(seed)) if seed else None
-        try:
-            output = self.pipe(
-                image=image,
-                prompt=prompt,
-                negative_prompt=negative_prompt,
-                num_frames=num_frames,
-                num_inference_steps=num_inference_steps,
-                guidance_scale=guidance_scale,
-                generator=generator,
-            )
-            video_path = "/tmp/output.mp4"
-            self.export_to_video(output.frames[0], video_path, fps=16)
-            with open(video_path, "rb") as f:
-                video_b64 = base64.b64encode(f.read()).decode("utf-8")
-            return {"video": video_b64, "content_type": "video/mp4"}
-        except Exception as e:
-            return {"error": f"Inference failed: {str(e)}"}

 import torch
 import base64
 import io
+import os
+from typing import Dict, Any, Optional
 from PIL import Image
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+# ASGI application object; the startup hook and the routes below register on it.
+app = FastAPI()
+# Global pipeline and video-export helper. Both start as None; load_model()
+# assigns them at startup and the POST handler reads them.
+pipe = None
+export_to_video = None
+class InferenceRequest(BaseModel):
+    """Parameters of a single video-generation request.
+
+    The defaults mirror the fallbacks that the POST handler applies when a
+    key is missing. NOTE(review): the handler visible in this file accepts
+    a plain ``dict`` and does not reference this model.
+    """
+    image: str  # base64 or URL
+    prompt: str
+    negative_prompt: str = "ugly, static, blurry, low quality"
+    num_frames: int = 93
+    num_inference_steps: int = 35
+    guidance_scale: float = 7.0
+    seed: Optional[int] = None  # None means no explicit seed
+class InferenceInputs(BaseModel):
+    """Envelope that nests an ``InferenceRequest`` under the ``inputs`` key."""
+    inputs: InferenceRequest
+@app.on_event("startup")
+async def load_model():
+    """Load the Cosmos video-to-world pipeline into the module globals.
+
+    Registered as a startup hook on ``app``. Stores the diffusers
+    ``export_to_video`` helper in the ``export_to_video`` global and the
+    bfloat16 pipeline, moved to CUDA, in ``pipe``. The Hub token is read
+    from the ``HF_TOKEN`` environment variable and is ``None`` when unset.
+    """
+    global pipe, export_to_video
+    # diffusers is imported inside the hook rather than at module import time.
+    from diffusers import Cosmos2VideoToWorldPipeline
+    from diffusers.utils import export_to_video as _export_helper
+
+    export_to_video = _export_helper
+    hub_token = os.environ.get("HF_TOKEN")
+    pipe = Cosmos2VideoToWorldPipeline.from_pretrained(
+        "nvidia/Cosmos-Predict2-2B-Video2World",
+        token=hub_token,
+        torch_dtype=torch.bfloat16,
+    )
+    pipe.to("cuda")
+    print("Model loaded successfully!")
+@app.post("/")
+async def predict(request: dict):
+    """Generate a video from an input image and a text prompt.
+
+    The parameters may sit at the top level of the JSON body or be nested
+    under an ``inputs`` key. ``image`` (a string starting with ``http`` is
+    treated as a URL, anything else as base64 image data) and ``prompt``
+    are required; ``negative_prompt``, ``num_frames``,
+    ``num_inference_steps``, ``guidance_scale`` and ``seed`` are optional.
+
+    Returns:
+        A dict with the base64-encoded MP4 under ``video`` and
+        ``content_type`` set to ``video/mp4``.
+
+    Raises:
+        HTTPException: 503 if the pipeline has not been loaded; 400 for a
+            non-object ``inputs`` value, a missing image or prompt, an
+            image that cannot be loaded, or a non-integer seed; 500 if
+            inference or video export fails.
+    """
+    import tempfile
+
+    if pipe is None or export_to_video is None:
+        raise HTTPException(status_code=503, detail="Model is not loaded")
+
+    # Handle both direct and nested input formats
+    inputs = request.get("inputs", request)
+    if not isinstance(inputs, dict):
+        # Without this check a string or list under "inputs" would raise
+        # AttributeError below and surface as an opaque 500.
+        raise HTTPException(status_code=400, detail="'inputs' must be a JSON object")
+
+    image_data = inputs.get("image")
+    if not image_data:
+        raise HTTPException(status_code=400, detail="No image provided")
+    prompt = inputs.get("prompt", "")
+    if not prompt:
+        raise HTTPException(status_code=400, detail="No prompt provided")
+
+    # Load image
+    try:
+        if image_data.startswith("http"):
+            from diffusers.utils import load_image
+            image = load_image(image_data)
+        else:
+            image_bytes = base64.b64decode(image_data)
+            image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Failed to load image: {str(e)}") from e
+
+    negative_prompt = inputs.get("negative_prompt", "ugly, static, blurry, low quality")
+    num_frames = inputs.get("num_frames", 93)
+    num_inference_steps = inputs.get("num_inference_steps", 35)
+    guidance_scale = inputs.get("guidance_scale", 7.0)
+
+    seed = inputs.get("seed")
+    generator = None
+    if seed is not None:  # 0 is a valid seed, so compare against None
+        try:
+            seed_value = int(seed)
+        except (TypeError, ValueError) as e:
+            # A malformed seed is a client error, not a server failure.
+            raise HTTPException(status_code=400, detail=f"Invalid seed: {seed!r}") from e
+        generator = torch.Generator(device="cuda").manual_seed(seed_value)
+
+    # NOTE(review): the pipeline call is synchronous inside an async handler,
+    # so other requests (including /health) presumably wait until it
+    # finishes - confirm this serialization is intended.
+    try:
+        output = pipe(
+            image=image,
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            num_frames=num_frames,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            generator=generator,
+        )
+        # A per-request temporary directory avoids collisions on a shared
+        # fixed path and removes the file once it has been read.
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            video_path = os.path.join(tmp_dir, "output.mp4")
+            export_to_video(output.frames[0], video_path, fps=16)
+            with open(video_path, "rb") as f:
+                video_b64 = base64.b64encode(f.read()).decode("utf-8")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Inference failed: {str(e)}") from e
+    return {"video": video_b64, "content_type": "video/mp4"}
+@app.get("/health")
+async def health():
+    """Health endpoint: unconditionally reports a healthy status."""
+    status_payload = dict(status="healthy")
+    return status_payload