Spaces:

TaliDror
/

AAS2F

Running on Zero

App Files Community

TaliDror committed on 14 days ago

Commit

deb433b

1 Parent(s): 626735d

adaptation to enable ZeroGPU

Browse files

Files changed (1) hide show

app.py +13 -10

app.py CHANGED Viewed

@@ -25,6 +25,7 @@ from PIL import Image
 from diffusers import StableDiffusionPipeline, UNet2DConditionModel, DPMSolverMultistepScheduler
 from huggingface_hub import snapshot_download, hf_hub_download
 import gradio as gr
 from external.arc2face import CLIPTextModelWrapper, project_face_embs
 from core.models.encoder.speech_face_encoder import SpeechFaceXVectorEncoder
@@ -357,7 +358,7 @@ def select_best_image(images: list, method: str) -> Image.Image:
 # ---------------------------------------------------------------------------
 # Generation
 # ---------------------------------------------------------------------------
 def generate(audio_path, num_samples, guidance_scale, num_inference_steps, base_seed, select_best, best_selection="pairwise"):
     global pipeline, speaker_encoder, facenet_model, facenet_classify_model, device
@@ -373,7 +374,8 @@ def generate(audio_path, num_samples, guidance_scale, num_inference_steps, base_
     with torch.no_grad():
         speech_z = speaker_encoder(waveform, normalize=True, apply_shared_projection=False)
-        id_emb = speech_z.to(torch.float16)
         id_emb_projected = project_face_embs(pipeline, id_emb)
     images = []
@@ -406,6 +408,7 @@ def generate(audio_path, num_samples, guidance_scale, num_inference_steps, base_
 def load_models():
     global pipeline, speaker_encoder, facenet_model, facenet_classify_model, device
     device = "cuda" if torch.cuda.is_available() else "cpu"
     print(f"Using device: {device}")
@@ -432,21 +435,21 @@ def load_models():
     # Diffusion pipeline
     print("Loading diffusion pipeline...")
     if SKIP_LORA:
-        encoder = CLIPTextModelWrapper.from_pretrained(ARC2FACE_REPO, subfolder='encoder', torch_dtype=torch.float16)
-        unet = UNet2DConditionModel.from_pretrained(ARC2FACE_REPO, subfolder='arc2face', torch_dtype=torch.float16)
         print("  Using base Arc2Face (no LoRA)")
     else:
         checkpoint_dir = snapshot_download(CHECKPOINT_REPO)
         checkpoint = resolve_checkpoint_path(checkpoint_dir)
         print(f"  Checkpoint: {checkpoint}")
-        encoder = load_encoder_with_lora(checkpoint).to(dtype=torch.float16)
-        unet = load_unet_with_lora(checkpoint).to(dtype=torch.float16)
     pipeline = StableDiffusionPipeline.from_pretrained(
         BASE_MODEL,
         text_encoder=encoder,
         unet=unet,
-        torch_dtype=torch.float16,
         safety_checker=None,
     )
     pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
@@ -471,7 +474,8 @@ def load_models():
 # ---------------------------------------------------------------------------
 def build_demo():
-    facenet_available = facenet_model is not None and facenet_classify_model is not None
     with gr.Blocks(title="Speech-to-Face Generation") as demo:
         gr.Markdown("# Speech-to-Face Generation")
@@ -526,7 +530,6 @@ def build_demo():
 # Entry point
 # ---------------------------------------------------------------------------
-load_models()
 demo = build_demo()
 demo.launch()

 from diffusers import StableDiffusionPipeline, UNet2DConditionModel, DPMSolverMultistepScheduler
 from huggingface_hub import snapshot_download, hf_hub_download
 import gradio as gr
+import spaces
 from external.arc2face import CLIPTextModelWrapper, project_face_embs
 from core.models.encoder.speech_face_encoder import SpeechFaceXVectorEncoder
 # ---------------------------------------------------------------------------
 # Generation
 # ---------------------------------------------------------------------------
+@spaces.GPU(duration=120)
 def generate(audio_path, num_samples, guidance_scale, num_inference_steps, base_seed, select_best, best_selection="pairwise"):
     global pipeline, speaker_encoder, facenet_model, facenet_classify_model, device
     with torch.no_grad():
         speech_z = speaker_encoder(waveform, normalize=True, apply_shared_projection=False)
+        dtype = torch.float16 if device == "cuda" else torch.float32
+        id_emb = speech_z.to(dtype)
         id_emb_projected = project_face_embs(pipeline, id_emb)
     images = []
 def load_models():
     global pipeline, speaker_encoder, facenet_model, facenet_classify_model, device
+    dtype = torch.float16 if device == "cuda" else torch.float32
     device = "cuda" if torch.cuda.is_available() else "cpu"
     print(f"Using device: {device}")
     # Diffusion pipeline
     print("Loading diffusion pipeline...")
     if SKIP_LORA:
+        encoder = CLIPTextModelWrapper.from_pretrained(ARC2FACE_REPO, subfolder='encoder', torch_dtype=dtype)
+        unet = UNet2DConditionModel.from_pretrained(ARC2FACE_REPO, subfolder='arc2face', torch_dtype=dtype)
         print("  Using base Arc2Face (no LoRA)")
     else:
         checkpoint_dir = snapshot_download(CHECKPOINT_REPO)
         checkpoint = resolve_checkpoint_path(checkpoint_dir)
         print(f"  Checkpoint: {checkpoint}")
+        encoder = load_encoder_with_lora(checkpoint).to(dtype=dtype)
+        unet = load_unet_with_lora(checkpoint).to(dtype=dtype)
     pipeline = StableDiffusionPipeline.from_pretrained(
         BASE_MODEL,
         text_encoder=encoder,
         unet=unet,
+        torch_dtype=dtype,
         safety_checker=None,
     )
     pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
 # ---------------------------------------------------------------------------
 def build_demo():
+    #facenet_available = facenet_model is not None and facenet_classify_model is not None
+    facenet_available = True
     with gr.Blocks(title="Speech-to-Face Generation") as demo:
         gr.Markdown("# Speech-to-Face Generation")
 # Entry point
 # ---------------------------------------------------------------------------
 demo = build_demo()
+demo.queue()
 demo.launch()