Spaces:

TaliDror
/

AAS2F

Running on Zero

File size: 24,505 Bytes

import os
# NOTE(review): `spaces` is imported ahead of torch (imported further down);
# presumably the ZeroGPU runtime needs that order — confirm before reordering.
import spaces
import sys
# These thread-related variables are set before numpy/torch are imported further
# down — presumably so the numeric libraries pick them up when they initialise
# (TODO confirm).
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["MKL_THREADING_LAYER"] = "GNU"

# ---------------------------------------------------------------------------
# Configuration — set CHECKPOINT_REPO as a HuggingFace Space secret to load
# fine-tuned models. If left empty, the demo uses base Arc2Face with a raw
# WavLM x-vector encoder (useful for testing that the Space works).
# ---------------------------------------------------------------------------
# Hub repo id holding the fine-tuned weights; the empty string means "not configured".
CHECKPOINT_REPO = os.environ.get("CHECKPOINT_REPO", "")
# File name of the speaker-encoder checkpoint fetched from CHECKPOINT_REPO.
ENCODER_FILENAME = os.environ.get("ENCODER_FILENAME", "speaker_encoder.pt")
# Hub repo the base Arc2Face encoder and UNet are loaded from.
ARC2FACE_REPO = "FoivosPar/Arc2Face"
# Base Stable Diffusion pipeline into which the encoder and UNet are plugged.
BASE_MODEL = "stable-diffusion-v1-5/stable-diffusion-v1-5"
# Both flags are derived from CHECKPOINT_REPO: with no repo configured,
# load_models() uses the base Arc2Face weights and the raw WavLM encoder.
SKIP_LORA = not bool(CHECKPOINT_REPO)
SKIP_SPEAKER_ENCODER = not bool(CHECKPOINT_REPO)

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from PIL import Image
from diffusers import StableDiffusionPipeline, UNet2DConditionModel, DPMSolverMultistepScheduler
from huggingface_hub import snapshot_download, hf_hub_download
import gradio as gr

from external.arc2face import CLIPTextModelWrapper, project_face_embs
from core.models.encoder.speech_face_encoder import SpeechFaceXVectorEncoder

# ---------------------------------------------------------------------------
# Globals populated by load_models(), which generate() calls lazily on first use
# (nothing in this file loads the models at import time)
# ---------------------------------------------------------------------------
pipeline = None  # diffusion pipeline assembled in load_models()
speaker_encoder = None  # fine-tuned encoder or RawWavLMEncoder fallback
facenet_model = None  # FaceNet in embedding mode (classify=False)
facenet_classify_model = None  # FaceNet in classification mode (classify=True)
mtcnn_model = None  # face detector used by the combined ranking
# Initial guess; generate() and load_models() both re-evaluate this.
device = "cuda" if torch.cuda.is_available() else "cpu"


# ---------------------------------------------------------------------------
# PEFT-compatible attention processors (inlined from core/factories/lora_factory.py)
# These fix "Linear.forward() takes 2 positional arguments but 3 were given"
# when using LoRA-wrapped UNet attention layers.
# ---------------------------------------------------------------------------

class PeftCompatibleAttnProcessor:
    """Attention processor that calls each projection layer with a single argument.

    Selected by ``_set_attn_processor_for_lora`` when
    ``torch.nn.functional.scaled_dot_product_attention`` is not available.
    Attention weights come from ``attn.get_attention_scores`` and are applied
    with ``torch.bmm``.
    """

    def __call__(
        self,
        attn,
        hidden_states: torch.Tensor,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
        *args,
        **kwargs,
    ) -> torch.Tensor:
        """Run one attention layer.

        Args:
            attn: Attention module supplying the projections (``to_q``,
                ``to_k``, ``to_v``, ``to_out``), the optional norms and the
                head-reshaping helpers used below.
            hidden_states: Query-side input, either 3-D or 4-D. A 4-D input is
                flattened to a sequence and restored before returning.
            encoder_hidden_states: Optional key/value source. When ``None`` the
                layer attends over ``hidden_states`` itself.
            attention_mask: Optional mask, passed through
                ``attn.prepare_attention_mask``.
            temb: Forwarded to ``attn.spatial_norm`` when that norm is present.
            *args: Accepted and ignored.
            **kwargs: Accepted and ignored.

        Returns:
            The attended tensor, in the same layout as the input
            ``hidden_states``.
        """
        # Kept for the optional residual connection at the end.
        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            # Flatten (batch, channel, height, width) into (batch, height*width, channel).
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        # The sequence length is taken from the key/value source when one is given.
        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )
        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)

        if attn.group_norm is not None:
            # The norm is applied with the channel axis in position 1, then swapped back.
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        # Single positional argument only; see the section comment above this class.
        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            # No separate key/value source: attend over the input itself.
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        query = attn.head_to_batch_dim(query)
        key = attn.head_to_batch_dim(key)
        value = attn.head_to_batch_dim(value)

        attention_probs = attn.get_attention_scores(query, key, attention_mask)
        hidden_states = torch.bmm(attention_probs, value)
        hidden_states = attn.batch_to_head_dim(hidden_states)

        # NOTE(review): presumably the output projection followed by dropout,
        # as in diffusers' Attention module — confirm.
        hidden_states = attn.to_out[0](hidden_states)
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            # Restore the (batch, channel, height, width) layout flattened above.
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor
        return hidden_states


class PeftCompatibleAttnProcessor2_0:
    """Attention processor built on ``scaled_dot_product_attention``.

    Like ``PeftCompatibleAttnProcessor`` it calls each projection layer with a
    single argument, but the attention itself is computed by
    ``torch.nn.functional.scaled_dot_product_attention``.
    """

    def __init__(self):
        """Check that ``scaled_dot_product_attention`` is available.

        Raises:
            ImportError: If ``torch.nn.functional`` lacks
                ``scaled_dot_product_attention``.
        """
        if not hasattr(torch.nn.functional, "scaled_dot_product_attention"):
            raise ImportError("PeftCompatibleAttnProcessor2_0 requires PyTorch 2.0+.")

    def __call__(
        self,
        attn,
        hidden_states: torch.Tensor,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
        *args,
        **kwargs,
    ) -> torch.Tensor:
        """Run one attention layer.

        Args:
            attn: Attention module supplying the projections (``to_q``,
                ``to_k``, ``to_v``, ``to_out``), the optional norms and the
                ``heads`` count.
            hidden_states: Query-side input, either 3-D or 4-D. A 4-D input is
                flattened to a sequence and restored before returning.
            encoder_hidden_states: Optional key/value source. When ``None`` the
                layer attends over ``hidden_states`` itself.
            attention_mask: Optional mask; prepared and reshaped only when given.
            temb: Forwarded to ``attn.spatial_norm`` when that norm is present.
            *args: Accepted and ignored.
            **kwargs: Accepted and ignored.

        Returns:
            The attended tensor, in the same layout as the input
            ``hidden_states``.
        """
        # Kept for the optional residual connection at the end.
        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            # Flatten (batch, channel, height, width) into (batch, height*width, channel).
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        # The sequence length is taken from the key/value source when one is given.
        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )

        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
            # Reshape the prepared mask to (batch, heads, -1, last_dim) for the call below.
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

        if attn.group_norm is not None:
            # The norm is applied with the channel axis in position 1, then swapped back.
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        # Single positional argument only; see the section comment above these classes.
        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            # No separate key/value source: attend over the input itself.
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        # The per-head width is derived from the key projection's last dimension.
        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        # Split the feature axis into heads: (batch, heads, seq, head_dim).
        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        hidden_states = torch.nn.functional.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )

        # Merge the heads back into one feature axis and match the query dtype.
        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

        # NOTE(review): presumably the output projection followed by dropout,
        # as in diffusers' Attention module — confirm.
        hidden_states = attn.to_out[0](hidden_states)
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            # Restore the (batch, channel, height, width) layout flattened above.
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor
        return hidden_states


def _set_attn_processor_for_lora(unet: nn.Module) -> None:
    """Install a PEFT-compatible attention processor on every attention layer.

    ``PeftCompatibleAttnProcessor2_0`` is used when
    ``torch.nn.functional.scaled_dot_product_attention`` exists, otherwise
    ``PeftCompatibleAttnProcessor``. The call is best-effort: any exception is
    reported with a printed warning and not re-raised.

    Args:
        unet: Model exposing ``attn_processors`` and ``set_attn_processor``.
    """
    try:
        # The capability check does not depend on the layer, so decide once.
        if hasattr(torch.nn.functional, 'scaled_dot_product_attention'):
            processor_cls = PeftCompatibleAttnProcessor2_0
        else:
            processor_cls = PeftCompatibleAttnProcessor

        # One fresh processor instance per attention layer name.
        replacements = {
            layer_name: processor_cls()
            for layer_name in unet.attn_processors.keys()
        }
        unet.set_attn_processor(replacements)
        print("  Set PEFT-compatible attention processors")
    except Exception as e:
        print(f"  Warning: Could not set attention processors for LoRA: {e}")


# ---------------------------------------------------------------------------
# Utilities
# ---------------------------------------------------------------------------

MIN_AUDIO_SECONDS = 5.0

def load_and_process_audio(audio_file: str, dev: str, max_seconds: float = 6.0):
    """Load an audio file as a fixed-length mono 16 kHz waveform on ``dev``.

    The file is read with torchaudio, falling back to soundfile if that fails.
    It is then resampled to 16 kHz when needed, averaged to one channel when it
    has several, and truncated or zero-padded on the right to
    ``max_seconds`` worth of samples.

    Args:
        audio_file: Path of the audio file to read.
        dev: Device the returned tensor is moved to.
        max_seconds: Length of the returned waveform in seconds.

    Returns:
        Tensor of shape ``(1, int(max_seconds * 16000))`` on ``dev``.

    Raises:
        ValueError: If the audio is shorter than ``MIN_AUDIO_SECONDS``.
    """
    target_rate = 16000

    try:
        waveform, sample_rate = torchaudio.load(audio_file)
    except Exception:
        # Fallback reader; soundfile yields (frames, channels), hence the transpose.
        import soundfile as sf
        samples, sample_rate = sf.read(audio_file, always_2d=True)
        waveform = torch.from_numpy(samples.T.astype(np.float32))

    if sample_rate != target_rate:
        to_target_rate = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)
        waveform = to_target_rate(waveform)

    # Down-mix multi-channel audio by averaging the channels.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    duration = waveform.shape[1] / target_rate
    if duration < MIN_AUDIO_SECONDS:
        raise ValueError(f"Audio is too short ({duration:.1f}s). Please provide at least {MIN_AUDIO_SECONDS:.0f} seconds of speech.")

    max_samples = int(max_seconds * target_rate)
    n_samples = waveform.shape[1]
    if n_samples > max_samples:
        waveform = waveform[:, :max_samples]
    elif n_samples < max_samples:
        # Right-pad with zeros up to the fixed length.
        waveform = F.pad(waveform, (0, max_samples - n_samples))

    return waveform.squeeze(0).unsqueeze(0).to(dev)


def is_lora_checkpoint(checkpoint_path: str, subfolder: str) -> bool:
    """Return True when ``<checkpoint_path>/<subfolder>/adapter_config.json`` exists."""
    adapter_config = os.path.join(checkpoint_path, subfolder, "adapter_config.json")
    return os.path.exists(adapter_config)


def resolve_checkpoint_path(checkpoint_path: str) -> str:
    """Resolve a user-supplied path to the directory that holds the weights.

    Resolution rules, in order:

    1. A path that is not a directory is returned unchanged (after ``~``
       expansion).
    2. A directory containing both ``encoder`` and ``unet`` entries is returned
       as is.
    3. Otherwise the ``checkpoint-<N>`` sub-directory with the largest ``N`` is
       returned. Names whose suffix is not an integer rank below all numeric
       ones.
    4. If there is no such sub-directory, the directory itself is returned.

    Args:
        checkpoint_path: File or directory path; ``~`` is expanded.

    Returns:
        The resolved path.

    Raises:
        FileNotFoundError: If the expanded path does not exist.
    """
    checkpoint_path = os.path.expanduser(checkpoint_path)
    if not os.path.exists(checkpoint_path):
        raise FileNotFoundError(f"Checkpoint path does not exist: {checkpoint_path}")
    if not os.path.isdir(checkpoint_path):
        return checkpoint_path

    # List the directory once so both checks below see the same snapshot.
    entries = os.listdir(checkpoint_path)
    if {"encoder", "unet"}.issubset(entries):
        return checkpoint_path

    candidates = [
        entry for entry in entries
        if entry.startswith("checkpoint-")
        and os.path.isdir(os.path.join(checkpoint_path, entry))
    ]
    if not candidates:
        return checkpoint_path

    def ckpt_rank(name):
        # int() on a string raises only ValueError, so catch exactly that.
        try:
            step = int(name.split("checkpoint-")[-1])
        except ValueError:
            step = -1
        # The name breaks ties so the result does not depend on listing order.
        return (step, name)

    return os.path.join(checkpoint_path, max(candidates, key=ckpt_rank))


# ---------------------------------------------------------------------------
# LoRA checkpoint loading
# ---------------------------------------------------------------------------

def load_encoder_with_lora(checkpoint_path: str):
    """Load the text-encoder wrapper, merging a LoRA adapter when one is present.

    If ``<checkpoint_path>/lora/encoder/adapter_config.json`` exists, the base
    encoder is loaded from ``ARC2FACE_REPO``, the adapter is applied and merged
    into it, and the merged model is returned. Otherwise the encoder is loaded
    directly from ``<checkpoint_path>`` (subfolder ``encoder``).

    Args:
        checkpoint_path: Resolved checkpoint directory.

    Returns:
        The loaded encoder.
    """
    lora_subdir = os.path.join("lora", "encoder")
    if not is_lora_checkpoint(checkpoint_path, lora_subdir):
        return CLIPTextModelWrapper.from_pretrained(checkpoint_path, subfolder="encoder")

    from peft import PeftModel

    adapter_dir = os.path.join(checkpoint_path, lora_subdir)
    base = CLIPTextModelWrapper.from_pretrained(ARC2FACE_REPO, subfolder='encoder')
    merged = PeftModel.from_pretrained(base, adapter_dir).merge_and_unload()
    # The merged model is given the base wrapper's bound ``forward``.
    merged.forward = base.forward
    return merged


def load_unet_with_lora(checkpoint_path: str):
    """Load the UNet, merging a LoRA adapter when one is present.

    If ``<checkpoint_path>/lora/unet/adapter_config.json`` exists, the base
    UNet is loaded from ``ARC2FACE_REPO`` (subfolder ``arc2face``), the adapter
    is applied and merged, and PEFT-compatible attention processors are
    installed. Otherwise the UNet is loaded directly from
    ``<checkpoint_path>`` (subfolder ``unet``).

    Args:
        checkpoint_path: Resolved checkpoint directory.

    Returns:
        The loaded UNet.
    """
    lora_subdir = os.path.join("lora", "unet")
    if not is_lora_checkpoint(checkpoint_path, lora_subdir):
        return UNet2DConditionModel.from_pretrained(checkpoint_path, subfolder="unet")

    from peft import PeftModel

    adapter_dir = os.path.join(checkpoint_path, lora_subdir)
    base = UNet2DConditionModel.from_pretrained(ARC2FACE_REPO, subfolder='arc2face')
    merged = PeftModel.from_pretrained(base, adapter_dir).merge_and_unload()
    # The merged model is given the base UNet's bound ``forward``.
    merged.forward = base.forward
    _set_attn_processor_for_lora(merged)
    return merged


# ---------------------------------------------------------------------------
# Raw WavLM encoder (fallback when no fine-tuned checkpoint is provided)
# ---------------------------------------------------------------------------

class RawWavLMEncoder:
    """Thin callable wrapper around a pretrained ``WavLMForXVector`` model.

    It accepts the same keyword arguments that ``generate`` passes to the
    speaker encoder, so it can stand in when no fine-tuned checkpoint is
    configured.
    """

    def __init__(self, pretrained_path: str, dev: str):
        """Load the x-vector model from ``pretrained_path`` onto ``dev`` in eval mode."""
        from transformers import WavLMForXVector

        model = WavLMForXVector.from_pretrained(pretrained_path)
        self.wavlm_xvector = model.to(dev)
        self.wavlm_xvector.eval()

    def __call__(self, waveform, normalize=True, apply_shared_projection=False):
        """Return the x-vector embeddings for ``waveform``.

        Args:
            waveform: Input passed to the model as ``input_values``.
            normalize: When true, L2-normalise the embeddings along dim 1.
            apply_shared_projection: Accepted but not used by this encoder.
        """
        outputs = self.wavlm_xvector(input_values=waveform, return_dict=True)
        if not normalize:
            return outputs.embeddings
        return F.normalize(outputs.embeddings, p=2, dim=1)

    def eval(self):
        """Put the wrapped model in eval mode and return ``self``."""
        self.wavlm_xvector.eval()
        return self

    def to(self, dev):
        """Move the wrapped model to ``dev`` and return ``self``."""
        moved = self.wavlm_xvector.to(dev)
        self.wavlm_xvector = moved
        return self


# ---------------------------------------------------------------------------
# FaceNet best-sample selection
# ---------------------------------------------------------------------------

def _facenet_transform():
    """Build the preprocessing applied to images before they go into FaceNet.

    The pipeline resizes to 160x160, converts to a tensor, and normalises each
    of the three channels with mean 0.5 and std 0.5.
    """
    from torchvision import transforms as T

    steps = [
        T.Resize((160, 160)),
        T.ToTensor(),
        T.Normalize([0.5] * 3, [0.5] * 3),
    ]
    return T.Compose(steps)


def _extract_facenet_emb(img: Image.Image, model) -> torch.Tensor:
    """Return the L2-normalised output of ``model`` for a single image.

    Args:
        img: Image to embed; converted to RGB before preprocessing.
        model: Callable taking a batch of one preprocessed image.
    """
    preprocess = _facenet_transform()
    batch = preprocess(img.convert("RGB")).unsqueeze(0)
    with torch.no_grad():
        raw = model(batch)
    # Drop the batch axis, then scale to unit length.
    return F.normalize(raw.squeeze(0), p=2, dim=0)


def _extract_facenet_logits(img: Image.Image, model) -> torch.Tensor:
    """Return the raw output of ``model`` for a single image, batch axis removed.

    Args:
        img: Image to score; converted to RGB before preprocessing.
        model: Callable taking a batch of one preprocessed image.
    """
    preprocess = _facenet_transform()
    batch = preprocess(img.convert("RGB")).unsqueeze(0)
    with torch.no_grad():
        raw = model(batch)
    return raw.squeeze(0)


def select_best_images(pairs: list, n: int) -> list:
    """Pick the ``n`` candidates that agree most with the rest of the batch.

    Each image is embedded with the global ``facenet_model``. A candidate's
    score is its mean cosine similarity to every other candidate, and the
    ``n`` highest-scoring pairs are returned, best first.

    Args:
        pairs: List of ``(image, seed)`` tuples.
        n: Number of pairs wanted. Values above ``len(pairs)`` are clamped,
            and values below zero are treated as zero.

    Returns:
        A list of at most ``n`` ``(image, seed)`` pairs. When there is nothing
        to rank (empty input, ``n <= 0``, a single candidate, or FaceNet not
        loaded) the first ``n`` pairs are returned in their original order.
    """
    n = max(0, min(n, len(pairs)))
    if n == 0:
        return []
    if len(pairs) == 1:
        # A single candidate has no peers; the mean below would divide by zero.
        return pairs[:n]
    if facenet_model is None:
        return pairs[:n]

    images = [p[0] for p in pairs]
    embeddings = torch.stack([_extract_facenet_emb(img, facenet_model) for img in images])
    sim_matrix = F.cosine_similarity(embeddings.unsqueeze(1), embeddings.unsqueeze(0), dim=2)
    # Subtract the self-similarity (1.0) before averaging over the other images.
    avg_sims = (sim_matrix.sum(dim=1) - 1) / (len(images) - 1)
    top_indices = avg_sims.argsort(descending=True)[:n].tolist()
    print(f"[select_best:pairwise] top {n} indices={top_indices} avg_sims={avg_sims[top_indices].tolist()}")
    return [pairs[i] for i in top_indices]


def select_best_images_combined(pairs: list, n: int) -> list:
    """Pick the ``n`` candidates with the highest detection x classification score.

    For every image the score is the product of the MTCNN detection
    probability of the first detected face (0.0 when none is reported) and the
    maximum softmax probability of the FaceNet classifier. When either model
    is unavailable the function delegates to ``select_best_images``.

    Args:
        pairs: List of ``(image, seed)`` tuples.
        n: Number of pairs wanted. Values above ``len(pairs)`` are clamped,
            and values below zero are treated as zero.

    Returns:
        A list of at most ``n`` ``(image, seed)`` pairs, best first.
    """
    n = max(0, min(n, len(pairs)))
    if n == 0:
        return []

    if mtcnn_model is None or facenet_classify_model is None:
        print("[select_best:combined] models unavailable, falling back to pairwise")
        return select_best_images(pairs, n)

    scores = []
    for idx, (img, _) in enumerate(pairs):
        _, probs = mtcnn_model.detect(img)
        has_detection = probs is not None and len(probs) > 0 and probs[0] is not None
        det_conf = float(probs[0]) if has_detection else 0.0

        # Reuse the shared helper; it returns the logits without the batch axis.
        logits = _extract_facenet_logits(img, facenet_classify_model)
        classify_conf = float(F.softmax(logits, dim=0).max())

        combined = det_conf * classify_conf
        scores.append(combined)
        print(f"  [combined] idx={idx} det={det_conf:.3f} classify={classify_conf:.3f} combined={combined:.3f}")

    top_indices = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)[:n]
    print(f"[select_best:combined] top {n} indices={top_indices} scores={[scores[i] for i in top_indices]}")
    return [pairs[i] for i in top_indices]


# Labels of the available best-sample ranking strategies. generate() compares
# its selection_method argument against the second label; any other value
# selects the pairwise ranking.
SELECTION_METHODS = ["Pairwise similarity", "Detection + Classify confidence"]
DEFAULT_SELECTION_METHOD = SELECTION_METHODS[0]


# ---------------------------------------------------------------------------
# Generation
# ---------------------------------------------------------------------------
# Earlier, shorter seed list kept for reference: [42, 48, 56, 63, 74, 84, 86]
# (107 and 119 were noted alongside it).
# Fixed pool of seeds that generate() draws from.
GENERATION_SEEDS = [42, 48, 56, 63, 74, 84, 86, 107, 119, 124, 125, 127, 128, 129]
# Size of the seed pool; not referenced elsewhere in the visible part of this file.
INTERNAL_SAMPLES = len(GENERATION_SEEDS)
# When True, generate() returns (image, "Seed: N") tuples instead of bare images.
SHOW_SEED_CAPTIONS = False
RANDOM_SEED_SELECTION = True  # If True, randomly pick `num_display` seeds from the pool and generate only those (faster). If False, generate all seeds and rank by quality.

@spaces.GPU(duration=120)
def generate(audio_path, num_display, guidance_scale, num_inference_steps, base_seed, selection_method=DEFAULT_SELECTION_METHOD):
    """Generate face images conditioned on a speech clip.

    Models are loaded on first use. The clip is encoded by the speaker encoder,
    projected with ``project_face_embs`` and fed to the diffusion pipeline
    once per seed.

    Args:
        audio_path: Path of the audio file, or ``None`` when nothing was given.
        num_display: Number of images to return (converted with ``int``;
            negative values are treated as 0).
        guidance_scale: Guidance scale forwarded to the pipeline.
        num_inference_steps: Step count forwarded to the pipeline.
        base_seed: Accepted for interface compatibility but not used; seeds
            come from ``GENERATION_SEEDS``.
        selection_method: One of ``SELECTION_METHODS``. Only consulted when
            ``RANDOM_SEED_SELECTION`` is False.

    Returns:
        ``(images, message)``. On a handled failure ``images`` is ``None`` and
        ``message`` describes the problem. On success ``message`` is ``""`` and
        ``images`` is a list of images, or of ``(image, caption)`` tuples when
        ``SHOW_SEED_CAPTIONS`` is set.
    """
    global pipeline, speaker_encoder, facenet_model, device

    if audio_path is None:
        return None, "Please provide an audio file."

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"[generate] device = {device}")

    # NOTE(review): on ZeroGPU the decorated function may run in a separate
    # worker process, in which case lazily loaded globals would not persist
    # between calls — confirm against the `spaces` documentation.
    if pipeline is None or speaker_encoder is None:
        print("[generate] Loading models lazily...")
        try:
            load_models()
        except Exception:
            # load_models() signals failure by raising. Log the traceback and
            # fall through to the check below so the caller gets a message
            # instead of an unhandled exception.
            import traceback
            print("[generate] Model loading raised an exception:")
            traceback.print_exc()
        else:
            print("[generate] Models loaded.")

    if pipeline is None or speaker_encoder is None:
        return None, "Model loading failed. Check logs."

    try:
        waveform = load_and_process_audio(audio_path, device, max_seconds=5.0)
    except Exception as e:
        return None, f"Audio loading failed: {e}"

    dtype = torch.float16 if device == "cuda" else torch.float32
    # Clamp so a negative request cannot reach np.random.choice as a size.
    n = max(0, int(num_display))

    with torch.no_grad():
        speech_z = speaker_encoder(
            waveform,
            normalize=True,
            apply_shared_projection=False,
        )

        id_emb = speech_z.to(dtype)
        id_emb_projected = project_face_embs(pipeline, id_emb)

        if RANDOM_SEED_SELECTION:
            # Draw n distinct seeds from the pool and generate only those.
            seeds_to_run = np.random.choice(
                GENERATION_SEEDS, size=min(n, len(GENERATION_SEEDS)), replace=False
            ).tolist()
        else:
            seeds_to_run = GENERATION_SEEDS
        print(f"[generate] seeds_to_run={seeds_to_run}")

        pairs = []
        for seed in seeds_to_run:
            generator = torch.Generator(device=device).manual_seed(seed)

            img = pipeline(
                prompt_embeds=id_emb_projected,
                num_inference_steps=int(num_inference_steps),
                guidance_scale=float(guidance_scale),
                num_images_per_prompt=1,
                generator=generator,
            ).images[0]

            pairs.append((img, seed))

    if RANDOM_SEED_SELECTION:
        # Exactly the requested number was generated; nothing to rank.
        best = pairs
    elif selection_method == "Detection + Classify confidence":
        best = select_best_images_combined(pairs, n)
    else:
        best = select_best_images(pairs, n)

    return [(img, f"Seed: {seed}") if SHOW_SEED_CAPTIONS else img for img, seed in best], ""

# ---------------------------------------------------------------------------
# Model loading
# ---------------------------------------------------------------------------

def load_models():
    """Load every model used by the demo into the module-level globals.

    Sets ``device``, ``speaker_encoder`` and ``pipeline``, and makes a
    best-effort attempt at ``facenet_model``, ``facenet_classify_model`` and
    ``mtcnn_model``. Those three are set to ``None`` when loading them fails.
    Failures while loading the speaker encoder or the diffusion pipeline
    propagate to the caller.
    """
    global pipeline, speaker_encoder, facenet_model, facenet_classify_model, mtcnn_model, device

    # Refresh the device first: dtype must be derived from the current device,
    # not from whatever the global held before this call.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.float16 if device == "cuda" else torch.float32
    print(f"Using device: {device}")

    # Speaker encoder
    print("Loading speaker encoder...")
    if SKIP_SPEAKER_ENCODER:
        speaker_encoder = RawWavLMEncoder("microsoft/wavlm-base-sv", device)
        print("  Using raw WavLM x-vector encoder (no fine-tuned checkpoint)")
    else:
        enc = SpeechFaceXVectorEncoder(
            pretrained_path="microsoft/wavlm-base-sv",
            face_emb_dim=512,
            dropout=0.0,
            use_projection=True,
            freeze_feature_encoder=True,
        )
        encoder_pt = hf_hub_download(CHECKPOINT_REPO, ENCODER_FILENAME)
        # SECURITY NOTE: weights_only=False unpickles arbitrary objects from the
        # downloaded file, so CHECKPOINT_REPO must be a trusted repository.
        # Kept as is because the checkpoint format is not visible here.
        ckpt = torch.load(encoder_pt, map_location=device, weights_only=False)
        load_result = enc.load_state_dict(ckpt["model"], strict=False)
        # strict=False tolerates mismatched keys; report them so a wrong
        # checkpoint does not go unnoticed.
        missing = list(getattr(load_result, "missing_keys", None) or [])
        unexpected = list(getattr(load_result, "unexpected_keys", None) or [])
        if missing or unexpected:
            print(f"  Warning: speaker encoder state dict mismatch (missing={missing}, unexpected={unexpected})")
        speaker_encoder = enc.to(device).eval()
        print(f"  Loaded from {CHECKPOINT_REPO}/{ENCODER_FILENAME}")

    # Diffusion pipeline
    print("Loading diffusion pipeline...")
    if SKIP_LORA:
        encoder = CLIPTextModelWrapper.from_pretrained(ARC2FACE_REPO, subfolder='encoder', torch_dtype=dtype)
        unet = UNet2DConditionModel.from_pretrained(ARC2FACE_REPO, subfolder='arc2face', torch_dtype=dtype)
        print("  Using base Arc2Face (no LoRA)")
    else:
        checkpoint_dir = snapshot_download(CHECKPOINT_REPO)
        checkpoint = resolve_checkpoint_path(checkpoint_dir)
        print(f"  Checkpoint: {checkpoint}")
        encoder = load_encoder_with_lora(checkpoint).to(dtype=dtype)
        unet = load_unet_with_lora(checkpoint).to(dtype=dtype)

    pipeline = StableDiffusionPipeline.from_pretrained(
        BASE_MODEL,
        text_encoder=encoder,
        unet=unet,
        torch_dtype=dtype,
        safety_checker=None,
    )
    pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
    pipeline = pipeline.to(device)
    print("  Pipeline ready")

    # FaceNet + MTCNN for best-sample selection (optional; the selection
    # helpers fall back to the first images when these are unavailable)
    print("Loading FaceNet + MTCNN for best-sample selection...")
    try:
        from facenet_pytorch import InceptionResnetV1, MTCNN
        facenet_model = InceptionResnetV1(pretrained='vggface2', classify=False).eval()
        facenet_classify_model = InceptionResnetV1(pretrained='vggface2', classify=True).eval()
        mtcnn_model = MTCNN(keep_all=False, device='cpu')
        print("  FaceNet + MTCNN ready")
    except Exception as e:
        print(f"  FaceNet/MTCNN unavailable ({e}); select-best will fall back to first image")
        facenet_model = None
        facenet_classify_model = None
        mtcnn_model = None


# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------

def build_demo():
    """Build the Gradio Blocks UI for the demo and return it.

    The layout has two audio inputs (upload and microphone), a "Generate"
    button that is enabled only while at least one input holds audio, an image
    gallery, and a hidden HTML element used to display error messages.
    """
    # Local import: the error message is escaped before being embedded in HTML.
    import html

    with gr.Blocks(title="AAS2F: Ambiguity-Aware Speech-to-Face Synthesis with Speaker-Conditioned Diffusion Models") as demo:
        gr.Markdown("# AAS2F: Ambiguity-Aware Speech-to-Face Synthesis with Speaker-Conditioned Diffusion Models")
        gr.Markdown(
            "**Steps to use the demo:**\n\n"
            "1. Upload or record a speech audio clip. **Please provide at least 5 seconds of speech.**\n"
            "2. Note that it works best with **English**, but should work with other languages as well.\n"
            "3. After you are done recording/uploading the audio, click the 'Generate' button to start the generation process.\n"
            "4. After a few seconds, the generated images will be displayed on the right."
        )

        # Fixed generation settings; the UI exposes no controls for them.
        DEFAULT_NUM_DISPLAY = 3
        DEFAULT_GUIDANCE_SCALE = 2.5
        DEFAULT_NUM_STEPS = 50
        DEFAULT_BASE_SEED = 42

        with gr.Row():
            with gr.Column():
                with gr.Row():
                    audio_upload = gr.Audio(
                        sources=["upload"],
                        type="filepath",
                        label="Upload Audio",
                    )
                    audio_mic = gr.Audio(
                        sources=["microphone"],
                        type="filepath",
                        label="Record Audio",
                    )
                generate_btn = gr.Button("Generate", variant="primary", interactive=False)

            with gr.Column():
                gallery = gr.Gallery(label="Generated Images")
                status = gr.HTML(visible=False)

        def _update_btn(upload, mic):
            # Enable the button only while at least one audio input is set.
            return gr.update(interactive=(upload is not None or mic is not None))

        audio_upload.change(fn=_update_btn, inputs=[audio_upload, audio_mic], outputs=generate_btn)
        audio_mic.change(fn=_update_btn, inputs=[audio_upload, audio_mic], outputs=generate_btn)

        def _generate(upload, mic):
            # The uploaded file takes precedence over the microphone recording.
            audio = upload if upload is not None else mic
            imgs, msg = generate(audio, DEFAULT_NUM_DISPLAY, DEFAULT_GUIDANCE_SCALE, DEFAULT_NUM_STEPS, DEFAULT_BASE_SEED)
            if msg:
                # msg may contain exception text, so escape it before it is
                # rendered as raw HTML.
                safe_msg = html.escape(str(msg))
                error_html = f'<div style="background:#fee2e2;border:1px solid #f87171;border-radius:8px;padding:12px 16px;color:#b91c1c;font-size:0.95em;">⚠️ {safe_msg}</div>'
                return imgs, gr.update(value=error_html, visible=True)
            return imgs, gr.update(value="", visible=False)

        # Three chained steps: show the busy state and clear any old error,
        # run generation, then restore the button.
        generate_btn.click(
            fn=lambda u, m: (gr.update(value="Generating...", interactive=False, variant="secondary"), gr.update(value="", visible=False)),
            inputs=[audio_upload, audio_mic],
            outputs=[generate_btn, status],
        ).then(
            fn=_generate,
            inputs=[audio_upload, audio_mic],
            outputs=[gallery, status],
        ).then(
            fn=lambda u, m: gr.update(value="Generate", interactive=(u is not None or m is not None), variant="primary"),
            inputs=[audio_upload, audio_mic],
            outputs=generate_btn,
        )

    return demo


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

# Build the UI at import time so `demo` is available as a module attribute.
demo = build_demo()
# queue() returns the same Blocks object, so launch() is chained onto it.
demo.queue().launch()