File size: 4,120 Bytes
6f2e51a
50b13cb
6f2e51a
 
9aa9151
 
 
5da1c64
819778a
9aa9151
 
50b13cb
9aa9151
 
5da1c64
50b13cb
5da1c64
50b13cb
 
9aa9151
 
819778a
50b13cb
a02db7d
9aa9151
50b13cb
9aa9151
a02db7d
5da1c64
6f2e51a
 
a02db7d
9aa9151
5da1c64
 
 
 
 
50b13cb
 
5da1c64
 
50b13cb
6f2e51a
9aa9151
819778a
 
6f2e51a
 
 
 
50b13cb
6f2e51a
50b13cb
6f2e51a
 
 
 
50b13cb
6f2e51a
819778a
50b13cb
 
 
9aa9151
 
50b13cb
 
9aa9151
 
 
 
50b13cb
9aa9151
5da1c64
 
 
 
 
 
 
50b13cb
5da1c64
9aa9151
 
 
 
 
 
 
50b13cb
9aa9151
a02db7d
9aa9151
50b13cb
9aa9151
a02db7d
50b13cb
 
 
a02db7d
9aa9151
 
50b13cb
 
 
9aa9151
 
50b13cb
9aa9151
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import os
# allow loading the CMU Arctic xvectors dataset script on HF
os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = "1"

import gradio as gr
import torch
import numpy as np
from transformers import pipeline, AutoProcessor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset

# ---------------------------
# Lazily-initialised model handles (CPU only)
# ---------------------------
# Every handle starts out as None and is filled in by load_models_cpu()
# the first time it is needed.
_captioner = None  # image-to-text pipeline
_tts_processor = _tts_model = _tts_vocoder = None  # SpeechT5 processor / model / vocoder
_speaker_embeddings = None  # voice embedding; SpeechT5 cannot synthesise without it


def _default_speaker_embedding():
    """Return the voice embedding handed to SpeechT5, shaped for a batch of one.

    The embedding is read from the CMU Arctic x-vectors dataset. If that fails
    for any reason, the failure is printed and a random embedding is returned
    instead so that speech synthesis can still run.
    """
    try:
        xvectors = load_dataset(
            "Matthijs/cmu-arctic-xvectors",
            split="validation",
            trust_remote_code=True,  # needed with modern datasets
        )
        # Row 7306 is the one HF examples commonly use (speaker "slt").
        voice = torch.tensor(xvectors[7306]["xvector"], dtype=torch.float32)
        return voice.unsqueeze(0)
    except Exception as exc:
        print(f"Speaker embedding load failed: {exc}. Using a random voice embedding.")
        # SpeechT5 expects shape (1, 512)
        return torch.randn(1, 512, dtype=torch.float32)


def load_models_cpu():
    """Populate the module-level model handles, loading only what is missing.

    Loads, in order: the BLIP-2 captioning pipeline, the SpeechT5 processor,
    the SpeechT5 text-to-speech model, the HiFiGAN vocoder and the default
    speaker embedding. Handles that are already set are left untouched, so
    repeated calls are cheap. Everything is placed on the CPU.
    """
    global _captioner, _tts_processor, _tts_model, _tts_vocoder, _speaker_embeddings

    if _captioner is None:
        print("Loading BLIP-2 image captioning model (CPU)...")
        captioner_options = {
            "task": "image-to-text",
            "model": "Salesforce/blip2-flan-t5-xl",
            "dtype": torch.float32,  # CPU dtype (alias of torch_dtype)
            "device_map": None,      # ensure CPU
        }
        _captioner = pipeline(**captioner_options)

    if _tts_processor is None:
        print("Loading SpeechT5 processor...")
        _tts_processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")

    if _tts_model is None:
        print("Loading SpeechT5 TTS model (CPU)...")
        acoustic_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        _tts_model = acoustic_model.to("cpu").eval()

    if _tts_vocoder is None:
        print("Loading SpeechT5 HiFiGAN vocoder (CPU)...")
        vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        _tts_vocoder = vocoder.to("cpu").eval()

    if _speaker_embeddings is None:
        print("Loading default speaker embeddings for SpeechT5...")
        _speaker_embeddings = _default_speaker_embedding()


def describe_and_speak(image):
    """Generate an English caption for the image and speak it aloud (defaults only).

    Args:
        image: The image to describe, or None when nothing has been uploaded.

    Returns:
        A ``(caption, audio)`` pair. ``audio`` is a ``(sample_rate, samples)``
        tuple holding float32 samples. When no image is supplied the caption
        is a prompt to upload one and ``audio`` is None. When speech synthesis
        fails, the error is appended to the caption and ``audio`` is one
        second of silence.
    """
    # Guard before load_models_cpu(): without an image there is nothing to
    # caption, so do not pay for loading the models only to fail afterwards.
    if image is None:
        return "Please upload an image first.", None

    load_models_cpu()

    # --- 1) Caption (defaults; no beams/tokens passed) ---
    result = _captioner(image)
    caption = (result[0].get("generated_text", "") if result else "").strip()
    if not caption:
        caption = "A description could not be generated for this image."

    # --- 2) Text → Speech (SpeechT5) ---
    # SpeechT5 HiFiGAN outputs 16 kHz mono; the silent fallback uses the same
    # rate so the audio output always carries one sample rate.
    sr = 16000
    try:
        inputs = _tts_processor(text=caption, return_tensors="pt")
        with torch.no_grad():
            speech = _tts_model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings=_speaker_embeddings,
                vocoder=_tts_vocoder,
            )
        audio = np.asarray(speech.numpy(), dtype=np.float32)
    except Exception as e:
        # Best effort: keep the caption, report the failure next to it and
        # return one second of silence instead of failing the whole request.
        caption += f"\n\n[TTS error: {e}]"
        audio = np.zeros(sr, dtype=np.float32)

    return caption, (sr, audio)


# ---------------------------
# Gradio UI
# ---------------------------
# One image input, one button, and two outputs (caption text and spoken audio).
with gr.Blocks(title="Image → Speech (HF models, CPU)") as demo:
    gr.Markdown(
        """
        # 🖼️ Image → 🎙️ Speech (CPU)
        1) Caption with **BLIP-2** → 2) Speak with **SpeechT5** (HiFiGAN vocoder).  
        *First run downloads models and speaker embeddings — please wait.*
        """
    )

    inp_image = gr.Image(type="pil", label="Upload an image (JPG/PNG)")
    out_text = gr.Textbox(label="Generated Caption", lines=3)
    out_audio = gr.Audio(label="Spoken Caption", type="numpy")

    # The button runs the whole caption-then-speak pipeline in one call.
    btn = gr.Button("Generate")
    btn.click(describe_and_speak, [inp_image], [out_text, out_audio])

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()