Spaces:

Maria604
/

Trial

Sleeping

App Files Community

Maria604 committed on Oct 27, 2025

Commit

6f2e51a

1 Parent(s): 5da1c64

fix

Browse files

Files changed (1) hide show

app.py +21 -7

app.py CHANGED Viewed

@@ -1,3 +1,7 @@
 import gradio as gr
 import torch
 import numpy as np
@@ -22,8 +26,8 @@ def load_models_cpu():
         _captioner = pipeline(
             task="image-to-text",
             model="Salesforce/blip2-flan-t5-xl",
-            dtype=torch.float32,   # CPU
-            device_map=None,       # CPU
         )
     # --- Load SpeechT5 stack explicitly (no pipeline) ---
@@ -34,17 +38,27 @@ def load_models_cpu():
     if _tts_model is None:
         print("Loading SpeechT5 TTS model...")
         _tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
-        _tts_model = _tts_model.to("cpu")
     if _tts_vocoder is None:
         print("Loading SpeechT5 HiFiGAN vocoder...")
-        _tts_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to("cpu")
     if _speaker_embeddings is None:
         print("Loading default speaker embeddings for SpeechT5...")
-        emb_ds = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-        # “slt” female speaker used in HF examples
-        _speaker_embeddings = torch.tensor(emb_ds[7306]["xvector"]).unsqueeze(0)
 def describe_and_speak(image, beams, max_tokens):
     """Generate an English caption for the image and read it aloud."""

+import os
+# Allow loading the CMU Arctic xvectors dataset script on HF
+os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = "1"
 import gradio as gr
 import torch
 import numpy as np
         _captioner = pipeline(
             task="image-to-text",
             model="Salesforce/blip2-flan-t5-xl",
+            dtype=torch.float32,   # CPU dtype (alias of torch_dtype)
+            device_map=None,       # ensure CPU
         )
     # --- Load SpeechT5 stack explicitly (no pipeline) ---
     if _tts_model is None:
         print("Loading SpeechT5 TTS model...")
         _tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+        _tts_model = _tts_model.to("cpu").eval()
     if _tts_vocoder is None:
         print("Loading SpeechT5 HiFiGAN vocoder...")
+        _tts_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to("cpu").eval()
     if _speaker_embeddings is None:
         print("Loading default speaker embeddings for SpeechT5...")
+        try:
+            emb_ds = load_dataset(
+                "Matthijs/cmu-arctic-xvectors",
+                split="validation",
+                trust_remote_code=True,  # required with modern datasets
+            )
+            # “slt” female speaker used in HF examples
+            emb = emb_ds[7306]["xvector"]
+            _speaker_embeddings = torch.tensor(emb, dtype=torch.float32).unsqueeze(0)
+        except Exception as e:
+            print(f"Speaker embedding load failed: {e}. Using a random voice embedding.")
+            # SpeechT5 expects a (1, 512) tensor; random fallback gives a generic voice
+            _speaker_embeddings = torch.randn(1, 512, dtype=torch.float32)
 def describe_and_speak(image, beams, max_tokens):
     """Generate an English caption for the image and read it aloud."""