File size: 4,120 Bytes
6f2e51a 50b13cb 6f2e51a 9aa9151 5da1c64 819778a 9aa9151 50b13cb 9aa9151 5da1c64 50b13cb 5da1c64 50b13cb 9aa9151 819778a 50b13cb a02db7d 9aa9151 50b13cb 9aa9151 a02db7d 5da1c64 6f2e51a a02db7d 9aa9151 5da1c64 50b13cb 5da1c64 50b13cb 6f2e51a 9aa9151 819778a 6f2e51a 50b13cb 6f2e51a 50b13cb 6f2e51a 50b13cb 6f2e51a 819778a 50b13cb 9aa9151 50b13cb 9aa9151 50b13cb 9aa9151 5da1c64 50b13cb 5da1c64 9aa9151 50b13cb 9aa9151 a02db7d 9aa9151 50b13cb 9aa9151 a02db7d 50b13cb a02db7d 9aa9151 50b13cb 9aa9151 50b13cb 9aa9151 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 | import os
# allow loading the CMU Arctic xvectors dataset script on HF
os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = "1"
import gradio as gr
import torch
import numpy as np
from transformers import pipeline, AutoProcessor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
# ---------------------------
# CPU-only global handles
# ---------------------------
# Every handle starts as None and is filled in by load_models_cpu() the first
# time it runs, so importing this module does not load any model.

# Image-to-text pipeline used to caption the uploaded image.
_captioner = None
# Processor that turns caption text into SpeechT5 model inputs.
_tts_processor = None
# SpeechT5 text-to-speech model.
_tts_model = None
# HiFiGAN vocoder handed to the TTS model when generating speech.
_tts_vocoder = None
# Speaker x-vector tensor with a leading batch dimension (required by SpeechT5).
_speaker_embeddings = None
def load_models_cpu():
    """Fill the module-level model handles on first use, keeping everything on CPU.

    Each handle is loaded only while it is still ``None``, so repeated calls
    are cheap once the first call has completed. Loads, in order: the BLIP-2
    captioning pipeline, the SpeechT5 processor, the SpeechT5 TTS model, the
    HiFiGAN vocoder and a speaker embedding.

    If the speaker-embedding dataset cannot be loaded for any reason, a random
    embedding is used instead so that speech synthesis can still run.

    Returns:
        None. The results are stored in the module-level handles.
    """
    global _captioner, _tts_processor, _tts_model, _tts_vocoder, _speaker_embeddings

    if _captioner is None:
        print("Loading BLIP-2 image captioning model (CPU)...")
        captioner_options = {
            "task": "image-to-text",
            "model": "Salesforce/blip2-flan-t5-xl",
            "dtype": torch.float32,  # CPU dtype (alias of torch_dtype)
            "device_map": None,  # ensure CPU
        }
        _captioner = pipeline(**captioner_options)

    # The processor and the TTS model come from the same checkpoint.
    tts_checkpoint = "microsoft/speecht5_tts"

    if _tts_processor is None:
        print("Loading SpeechT5 processor...")
        _tts_processor = AutoProcessor.from_pretrained(tts_checkpoint)

    if _tts_model is None:
        print("Loading SpeechT5 TTS model (CPU)...")
        tts_model = SpeechT5ForTextToSpeech.from_pretrained(tts_checkpoint)
        _tts_model = tts_model.to("cpu").eval()

    if _tts_vocoder is None:
        print("Loading SpeechT5 HiFiGAN vocoder (CPU)...")
        vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        _tts_vocoder = vocoder.to("cpu").eval()

    if _speaker_embeddings is not None:
        return

    print("Loading default speaker embeddings for SpeechT5...")
    try:
        xvector_ds = load_dataset(
            "Matthijs/cmu-arctic-xvectors",
            split="validation",
            trust_remote_code=True,  # needed with modern datasets
        )
        # HF examples commonly use index 7306 (speaker "slt")
        xvector = xvector_ds[7306]["xvector"]
        # unsqueeze(0) adds the leading batch dimension.
        _speaker_embeddings = torch.tensor(xvector, dtype=torch.float32).unsqueeze(0)
    except Exception as err:
        # Deliberate best effort: any failure degrades to a random voice.
        print(f"Speaker embedding load failed: {err}. Using a random voice embedding.")
        # SpeechT5 expects shape (1, 512)
        _speaker_embeddings = torch.randn(1, 512, dtype=torch.float32)
def describe_and_speak(image):
    """Generate an English caption for the image and speak it aloud (defaults only).

    Args:
        image: The image to describe, as delivered by the Gradio image input,
            or ``None`` when nothing has been uploaded.

    Returns:
        A ``(caption, audio)`` pair. ``caption`` is the generated text (or a
        placeholder/hint message). ``audio`` is a ``(sample_rate, samples)``
        tuple with float32 samples, or ``None`` when no image was supplied.
        If speech synthesis fails, the error is appended to the caption and
        one second of silence is returned as the audio.
    """
    # Gradio passes None when "Generate" is clicked without an upload. Answer
    # with a hint before loading any model instead of handing None to the
    # captioning pipeline.
    if image is None:
        return "Please upload an image first.", None

    load_models_cpu()

    # --- 1) Caption (defaults; no beams/tokens passed) ---
    result = _captioner(image)
    caption = (result[0].get("generated_text", "") if result else "").strip()
    if not caption:
        caption = "A description could not be generated for this image."

    # --- 2) Text → Speech (SpeechT5) ---
    # One sample rate for both the real audio and the silent fallback.
    sr = 16000  # SpeechT5 HiFiGAN outputs 16 kHz mono
    try:
        inputs = _tts_processor(text=caption, return_tensors="pt")
        with torch.no_grad():
            speech = _tts_model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings=_speaker_embeddings,
                vocoder=_tts_vocoder,
            )
        audio = np.asarray(speech.numpy(), dtype=np.float32)
    except Exception as e:
        # Best effort: keep the caption, surface the error to the user and
        # return one second of silence so the audio output stays valid.
        caption += f"\n\n[TTS error: {e}]"
        audio = np.zeros(sr, dtype=np.float32)
    return caption, (sr, audio)
# ---------------------------
# Gradio UI
# ---------------------------
# One image input, a "Generate" button, and two outputs (caption text and
# spoken audio), all wired to describe_and_speak.
with gr.Blocks(title="Image → Speech (HF models, CPU)") as demo:
    gr.Markdown(
        """
        # 🖼️ Image → 🎙️ Speech (CPU)
        1) Caption with **BLIP-2** → 2) Speak with **SpeechT5** (HiFiGAN vocoder).
        *First run downloads models and speaker embeddings — please wait.*
        """
    )
    # type="pil" makes Gradio hand the upload to the callback as a PIL image.
    inp_image = gr.Image(type="pil", label="Upload an image (JPG/PNG)")
    out_text = gr.Textbox(label="Generated Caption", lines=3)
    # type="numpy" matches the (sample_rate, samples) tuple the callback returns.
    out_audio = gr.Audio(label="Spoken Caption", type="numpy")
    btn = gr.Button("Generate")
    btn.click(describe_and_speak, [inp_image], [out_text, out_audio])

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
|