Fix: ensure_on_device before each inference to prevent text_encoder drift
handler.py CHANGED (+18 -10)
```diff
--- a/handler.py
+++ b/handler.py
@@ -6,6 +6,7 @@ from typing import Optional
 from PIL import Image
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
+import gc
 
 app = FastAPI()
 
@@ -39,18 +40,19 @@ async def load_model():
         token=os.environ.get("HF_TOKEN"),
     )
     pipe = pipe.to(DEVICE)
-
-    # Ensure all components are on the same device
-    if hasattr(pipe, 'text_encoder') and pipe.text_encoder is not None:
-        pipe.text_encoder = pipe.text_encoder.to(DEVICE)
-    if hasattr(pipe, 'vae') and pipe.vae is not None:
-        pipe.vae = pipe.vae.to(DEVICE)
-    if hasattr(pipe, 'transformer') and pipe.transformer is not None:
-        pipe.transformer = pipe.transformer.to(DEVICE)
-
     print("Model loaded successfully!")
     print(f"Pipeline device: {pipe.device}")
 
+def ensure_on_device():
+    """Ensure all pipeline components are on CUDA before inference"""
+    global pipe, DEVICE
+    pipe = pipe.to(DEVICE)
+    # Force text_encoder to CUDA (this is the problematic component)
+    if hasattr(pipe, 'text_encoder') and pipe.text_encoder is not None:
+        pipe.text_encoder = pipe.text_encoder.to(DEVICE)
+    torch.cuda.empty_cache()
+    gc.collect()
+
 @app.post("/predict")
 @app.post("/")
 async def predict(request: dict):
@@ -88,7 +90,9 @@ async def predict(request: dict):
     guidance_scale = inputs.get("guidance_scale", 7.0)
 
     try:
-        #
+        # Ensure all components on CUDA before each inference
+        ensure_on_device()
+
         with torch.inference_mode():
             output = pipe(
                 image=image,
@@ -105,6 +109,10 @@ async def predict(request: dict):
         with open(video_path, "rb") as f:
             video_b64 = base64.b64encode(f.read()).decode("utf-8")
 
+        # Clean up after inference
+        torch.cuda.empty_cache()
+        gc.collect()
+
         return {"video": video_b64, "content_type": "video/mp4"}
 
     except Exception as e:
```
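Calling `ensure_on_device()` on every request is cheap when nothing has drifted: `Module.to()` returns the module unchanged if its parameters already live on the target device, so the steady-state cost is a handful of device checks. For reference, a minimal client sketch for exercising the endpoint; the host/port and the `image` field name are assumptions, since only `inputs.get("guidance_scale", 7.0)` and the response shape `{"video", "content_type"}` appear in the handler above:

```python
import base64
import requests  # client-side dependency, not part of handler.py

# Payload shape inferred from the handler: it reads an "inputs" dict and
# calls inputs.get("guidance_scale", 7.0). The "image" field is an assumption.
with open("input.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {"inputs": {"image": image_b64, "guidance_scale": 7.0}}

# Host and port are assumptions; adjust to wherever the FastAPI app is served.
resp = requests.post("http://localhost:8000/predict", json=payload, timeout=600)
resp.raise_for_status()

# The handler returns {"video": <base64>, "content_type": "video/mp4"}.
body = resp.json()
with open("output.mp4", "wb") as f:
    f.write(base64.b64decode(body["video"]))
```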
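If drift recurs, a small helper like the sketch below (not part of this commit) can log each component's device per request so a stray CPU tensor shows up in the logs. It assumes a diffusers-style pipeline exposing a `.components` dict, which not every pipeline does:

```python
def log_component_devices(pipe) -> None:
    """Print the device of each pipeline component to spot drift in logs.

    Assumes a diffusers-style pipeline with a .components dict; schedulers
    and tokenizers have no device attribute, so those entries are skipped.
    """
    for name, component in pipe.components.items():
        device = getattr(component, "device", None)
        if device is not None:
            print(f"{name}: {device}")
```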