Maria604 committed on
Commit
6f2e51a
·
1 Parent(s): 5da1c64
Files changed (1) hide show
  1. app.py +21 -7
app.py CHANGED
@@ -1,3 +1,7 @@
 
 
 
 
1
  import gradio as gr
2
  import torch
3
  import numpy as np
@@ -22,8 +26,8 @@ def load_models_cpu():
22
  _captioner = pipeline(
23
  task="image-to-text",
24
  model="Salesforce/blip2-flan-t5-xl",
25
- dtype=torch.float32, # CPU
26
- device_map=None, # CPU
27
  )
28
 
29
  # --- Load SpeechT5 stack explicitly (no pipeline) ---
@@ -34,17 +38,27 @@ def load_models_cpu():
34
  if _tts_model is None:
35
  print("Loading SpeechT5 TTS model...")
36
  _tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
37
- _tts_model = _tts_model.to("cpu")
38
 
39
  if _tts_vocoder is None:
40
  print("Loading SpeechT5 HiFiGAN vocoder...")
41
- _tts_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to("cpu")
42
 
43
  if _speaker_embeddings is None:
44
  print("Loading default speaker embeddings for SpeechT5...")
45
- emb_ds = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
46
- # “slt” female speaker used in HF examples
47
- _speaker_embeddings = torch.tensor(emb_ds[7306]["xvector"]).unsqueeze(0)
 
 
 
 
 
 
 
 
 
 
48
 
49
  def describe_and_speak(image, beams, max_tokens):
50
  """Generate an English caption for the image and read it aloud."""
 
1
+ import os
2
+ # Allow loading the CMU Arctic xvectors dataset script on HF
3
+ os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = "1"
4
+
5
  import gradio as gr
6
  import torch
7
  import numpy as np
 
26
  _captioner = pipeline(
27
  task="image-to-text",
28
  model="Salesforce/blip2-flan-t5-xl",
29
+ dtype=torch.float32, # CPU dtype (alias of torch_dtype)
30
+ device_map=None, # ensure CPU
31
  )
32
 
33
  # --- Load SpeechT5 stack explicitly (no pipeline) ---
 
38
  if _tts_model is None:
39
  print("Loading SpeechT5 TTS model...")
40
  _tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
41
+ _tts_model = _tts_model.to("cpu").eval()
42
 
43
  if _tts_vocoder is None:
44
  print("Loading SpeechT5 HiFiGAN vocoder...")
45
+ _tts_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to("cpu").eval()
46
 
47
  if _speaker_embeddings is None:
48
  print("Loading default speaker embeddings for SpeechT5...")
49
+ try:
50
+ emb_ds = load_dataset(
51
+ "Matthijs/cmu-arctic-xvectors",
52
+ split="validation",
53
+ trust_remote_code=True, # required with modern datasets
54
+ )
55
+ # “slt” female speaker used in HF examples
56
+ emb = emb_ds[7306]["xvector"]
57
+ _speaker_embeddings = torch.tensor(emb, dtype=torch.float32).unsqueeze(0)
58
+ except Exception as e:
59
+ print(f"Speaker embedding load failed: {e}. Using a random voice embedding.")
60
+ # SpeechT5 expects a (1, 512) tensor; random fallback gives a generic voice
61
+ _speaker_embeddings = torch.randn(1, 512, dtype=torch.float32)
62
 
63
  def describe_and_speak(image, beams, max_tokens):
64
  """Generate an English caption for the image and read it aloud."""