Spaces:

TaliDror
/

AAS2F

Running on Zero

App Files Community

TaliDror committed 12 days ago

Commit

39db2c4

1 Parent(s): e66529d

adaptation to enable ZeroGPU

Browse files

Files changed (1) hide show

app.py +40 -18

app.py CHANGED Viewed

@@ -362,46 +362,68 @@ def select_best_image(images: list, method: str) -> Image.Image:
 def generate(audio_path, num_samples, guidance_scale, num_inference_steps, base_seed, select_best, best_selection="pairwise"):
     global pipeline, speaker_encoder, facenet_model, facenet_classify_model, device
-    if pipeline is None:
-        return None, "Model not loaded. Check Space configuration."
     if audio_path is None:
         return None, "Please provide an audio file."
     try:
         waveform = load_and_process_audio(audio_path, device, max_seconds=5.0)
     except Exception as e:
         return None, f"Audio loading failed: {e}"
     with torch.no_grad():
-        speech_z = speaker_encoder(waveform, normalize=True, apply_shared_projection=False)
-        dtype = torch.float16 if device == "cuda" else torch.float32
         id_emb = speech_z.to(dtype)
         id_emb_projected = project_face_embs(pipeline, id_emb)
-    images = []
-    for i in range(int(num_samples)):
-        seed = int(base_seed) + i
-        generator = torch.Generator(device=device).manual_seed(seed)
-        img = pipeline(
-            prompt_embeds=id_emb_projected,
-            num_inference_steps=int(num_inference_steps),
-            guidance_scale=float(guidance_scale),
-            num_images_per_prompt=1,
-            generator=generator,
-        ).images[0]
-        images.append(img)
     if select_best:
-        model_ready = facenet_model is not None if best_selection in ("mean", "pairwise") else facenet_classify_model is not None
         if model_ready:
             best = select_best_image(images, best_selection)
         else:
             best = images[0]
         return [best], ""
     return images, ""
 # ---------------------------------------------------------------------------
 # Model loading
 # ---------------------------------------------------------------------------

 def generate(audio_path, num_samples, guidance_scale, num_inference_steps, base_seed, select_best, best_selection="pairwise"):
     global pipeline, speaker_encoder, facenet_model, facenet_classify_model, device
     if audio_path is None:
         return None, "Please provide an audio file."
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"[generate] device = {device}")
+    if pipeline is None or speaker_encoder is None:
+        print("[generate] Loading models lazily...")
+        load_models()
+        print("[generate] Models loaded.")
+    if pipeline is None or speaker_encoder is None:
+        return None, "Model loading failed. Check logs."
     try:
         waveform = load_and_process_audio(audio_path, device, max_seconds=5.0)
     except Exception as e:
         return None, f"Audio loading failed: {e}"
+    dtype = torch.float16 if device == "cuda" else torch.float32
     with torch.no_grad():
+        speech_z = speaker_encoder(
+            waveform,
+            normalize=True,
+            apply_shared_projection=False,
+        )
         id_emb = speech_z.to(dtype)
         id_emb_projected = project_face_embs(pipeline, id_emb)
+        images = []
+        for i in range(int(num_samples)):
+            seed = int(base_seed) + i
+            generator = torch.Generator(device=device).manual_seed(seed)
+            img = pipeline(
+                prompt_embeds=id_emb_projected,
+                num_inference_steps=int(num_inference_steps),
+                guidance_scale=float(guidance_scale),
+                num_images_per_prompt=1,
+                generator=generator,
+            ).images[0]
+            images.append(img)
     if select_best:
+        model_ready = (
+            facenet_model is not None
+            if best_selection in ("mean", "pairwise")
+            else facenet_classify_model is not None
+        )
         if model_ready:
             best = select_best_image(images, best_selection)
         else:
             best = images[0]
         return [best], ""
     return images, ""
 # ---------------------------------------------------------------------------
 # Model loading
 # ---------------------------------------------------------------------------