Spaces:

Maria604
/

Trial

Sleeping

App Files Files Community

Maria604 committed on Oct 27, 2025

Commit

a02db7d

1 Parent(s): 0ea639b

fix hopefully

Browse files

Files changed (2) hide show

app.py +37 -48
requirements.txt +7 -5

app.py CHANGED Viewed

@@ -2,60 +2,50 @@ import gradio as gr
 import torch
 import numpy as np
 from transformers import pipeline
-from TTS.api import TTS
 # ---------------------------
-# CPU-only, lazy-loaded models
 # ---------------------------
 _captioner = None
 _tts = None
 def load_models_cpu():
     global _captioner, _tts
     if _captioner is None:
-        # BLIP-2 (smaller/CPU-friendlier checkpoint)
-        # You can switch to "Salesforce/blip2-flan-t5-xl" if you prefer (slower on CPU).
         _captioner = pipeline(
-        task="image-to-text",
-        model="Salesforce/blip2-flan-t5-xl",
-        torch_dtype=torch.float32,  # CPU
-        device_map=None,            # CPU
-)
     if _tts is None:
-        # Multilingual XTTS-v2 (runs on CPU; first load may take a bit)
-        _tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
-        # Do NOT move to CUDA; we keep CPU-only for Spaces CPU Basic
-        # (_tts manages device internally; CPU is default)
 def describe_and_speak(image, beams, max_tokens):
-    """
-    1) Caption the image in English with BLIP-2
-    2) Speak the caption in English with XTTS-v2
-    Returns: (caption_text, (sample_rate, audio_numpy))
-    """
     load_models_cpu()
-    # -------- 1) Caption (English) --------
-    gen_kwargs = {
-        "num_beams": int(beams),
-        "max_new_tokens": int(max_tokens),
-    }
     result = _captioner(image, **gen_kwargs)
     caption = (result[0].get("generated_text", "") if result else "").strip()
     if not caption:
         caption = "A description could not be generated for this image."
-    # -------- 2) TTS (English) --------
     try:
-        # XTTS expects language code like "en"
-        audio = _tts.tts(text=caption, language="en")
-        # XTTS returns a float32 numpy array; default samplerate is 22050 Hz
-        sr = 22050
-        audio = np.asarray(audio, dtype=np.float32)
     except Exception as e:
-        # On any TTS error, return silence and append the error in text
         caption += f"\n\n[TTS error: {e}]"
         sr = 22050
         audio = np.zeros(sr, dtype=np.float32)
@@ -63,34 +53,33 @@ def describe_and_speak(image, beams, max_tokens):
     return caption, (sr, audio)
 # ---------------------------
-# Gradio UI (simple & clean)
 # ---------------------------
-with gr.Blocks(title="Image → English Audio (CPU-only)") as demo:
     gr.Markdown(
-        "# Image → Audio Description (CPU-only)\n"
-        "Upload an image. The app will generate an **English caption (BLIP-2)** and "
-        "read it aloud using **XTTS-v2**.\n\n"
-        "Tip: On CPU, first run can be slow while models download."
     )
     with gr.Row():
-        inp_image = gr.Image(type="pil", label="Upload image (PNG/JPG)")
         with gr.Column():
-            beams = gr.Slider(1, 4, value=2, step=1, label="Caption beams (quality vs speed)")
             max_tokens = gr.Slider(10, 60, value=30, step=5, label="Max caption tokens")
     with gr.Row():
-        out_text = gr.Textbox(label="Caption", lines=3)
-        out_audio = gr.Audio(label="Spoken caption", type="numpy")
     btn = gr.Button("Generate")
-    btn.click(
-        fn=describe_and_speak,
-        inputs=[inp_image, beams, max_tokens],
-        outputs=[out_text, out_audio],
-        api_name="describe_and_speak",
-    )
 if __name__ == "__main__":
-    # On Spaces, Gradio handles serving. Locally, this starts the app.
     demo.launch()

 import torch
 import numpy as np
 from transformers import pipeline
 # ---------------------------
+# CPU-only model loaders
 # ---------------------------
 # Module-level caches so each model is built at most once per process.
 _captioner = None
 _tts = None


 def load_models_cpu():
     """Create the captioning and text-to-speech pipelines if not yet loaded.

     Populates the module-level ``_captioner`` and ``_tts`` globals. Each
     pipeline is created only while its global is still ``None``, so calling
     this function repeatedly is cheap after the first successful load.

     The captioner is the BLIP-2 checkpoint ``Salesforce/blip2-flan-t5-xl``
     loaded in float32 with no device map. The speech model is the
     transformers-native VITS checkpoint ``facebook/mms-tts-eng``; ESPnet
     checkpoints such as ``espnet/kan-bayashi_ljspeech_vits`` are not in a
     format the transformers ``text-to-speech`` pipeline can load.

     Returns:
         None. Results are stored in the module globals.
     """
     global _captioner, _tts

     if _captioner is None:
         print("Loading BLIP-2 image captioning model...")
         _captioner = pipeline(
             task="image-to-text",
             model="Salesforce/blip2-flan-t5-xl",
             torch_dtype=torch.float32,
             device_map=None,  # no accelerate device placement
         )

     if _tts is None:
         print("Loading VITS text-to-speech model...")
         _tts = pipeline(
             task="text-to-speech",
             # English VITS checkpoint published in transformers format.
             model="facebook/mms-tts-eng",
         )
 def describe_and_speak(image, beams, max_tokens):
     """Caption an image and synthesise the caption as speech.

     Args:
         image: The uploaded image (the UI supplies a PIL image), or ``None``
             when nothing has been uploaded.
         beams: Beam count for caption generation; coerced with ``int``.
         max_tokens: Maximum number of new caption tokens; coerced with
             ``int``.

     Returns:
         A ``(caption, (sample_rate, audio))`` tuple where ``audio`` is a
         float32 NumPy array. When no image is given, or when speech
         synthesis fails, ``audio`` is one second of silence at 22050 Hz; in
         the failure case the error text is appended to the caption.
     """
     fallback_sr = 22050

     # Guard first so an empty submission neither triggers the (slow) model
     # load nor crashes inside the captioning pipeline.
     if image is None:
         silence = np.zeros(fallback_sr, dtype=np.float32)
         return "Please upload an image before generating a caption.", (fallback_sr, silence)

     load_models_cpu()

     # --- Step 1: Caption the image ---
     # The image-to-text pipeline only accepts a fixed set of call keywords;
     # generation options such as num_beams must travel in generate_kwargs.
     result = _captioner(
         image,
         max_new_tokens=int(max_tokens),
         generate_kwargs={"num_beams": int(beams)},
     )
     caption = (result[0].get("generated_text", "") if result else "").strip()
     if not caption:
         caption = "A description could not be generated for this image."

     # --- Step 2: Convert text to speech ---
     try:
         tts_output = _tts(caption)
         # Drop any leading batch axis so a (1, N) waveform becomes (N,);
         # otherwise it would be read as one sample with N channels.
         audio = np.asarray(tts_output["audio"], dtype=np.float32).squeeze()
         sr = int(tts_output["sampling_rate"])
     except Exception as e:
         # Best effort: keep the caption usable and surface the TTS failure.
         caption += f"\n\n[TTS error: {e}]"
         sr = fallback_sr
         audio = np.zeros(sr, dtype=np.float32)

     return caption, (sr, audio)
 # ---------------------------
+# Gradio UI
 # ---------------------------
 # Build the page inside a gr.Blocks context; the resulting app object is
 # bound to `demo` and launched at the bottom of the file.
+with gr.Blocks(title="Image → Speech (Hugging Face models, CPU)") as demo:
     # Intro text shown at the top of the page.
     gr.Markdown(
+        """
+        # 🖼️ Image → 🎙️ Speech
+        Upload an image, and the app will:
+        1. Generate a caption using **BLIP-2**
+        2. Read it aloud using **ESPnet VITS**
+        *Runs fully on CPU (Hugging Face public models).
+        First run may take a few minutes while models download.*
+        """
     )
     # Input row: the image upload next to the two generation sliders.
     with gr.Row():
+        inp_image = gr.Image(type="pil", label="Upload an image (JPG or PNG)")
         with gr.Column():
             # Both slider values are passed to describe_and_speak, which
             # converts them with int().
+            beams = gr.Slider(1, 4, value=2, step=1, label="Caption beams (quality vs. speed)")
             max_tokens = gr.Slider(10, 60, value=30, step=5, label="Max caption tokens")
     # Output row: caption text and the synthesised audio.
     with gr.Row():
+        out_text = gr.Textbox(label="Generated Caption", lines=3)
+        out_audio = gr.Audio(label="Spoken Caption", type="numpy")
     btn = gr.Button("Generate")
     # Clicking the button calls describe_and_speak(image, beams, max_tokens)
     # and routes its two return values to the textbox and the audio player.
+    btn.click(fn=describe_and_speak, inputs=[inp_image, beams, max_tokens], outputs=[out_text, out_audio])
 # Start the app only when this file is executed as a script.
 if __name__ == "__main__":
     demo.launch()

requirements.txt CHANGED Viewed

@@ -1,9 +1,11 @@
 gradio
-transformers>=4.41.0
 torch
-sentencepiece
 accelerate
-numpy
-soundfile
 Pillow
-TTS>=0.22.0

 gradio
+transformers>=4.44.2
 torch
 accelerate
+sentencepiece
 Pillow
+soundfile
+safetensors
+timm
+scipy
+numpy<2.0