Maria604 committed on
Commit
a02db7d
·
1 Parent(s): 0ea639b

fix hopefully

Browse files
Files changed (2) hide show
  1. app.py +37 -48
  2. requirements.txt +7 -5
app.py CHANGED
@@ -2,60 +2,50 @@ import gradio as gr
2
  import torch
3
  import numpy as np
4
  from transformers import pipeline
5
- from TTS.api import TTS
6
 
7
  # ---------------------------
8
- # CPU-only, lazy-loaded models
9
  # ---------------------------
10
  _captioner = None
11
  _tts = None
12
 
13
  def load_models_cpu():
 
14
  global _captioner, _tts
 
15
  if _captioner is None:
16
- # BLIP-2 (smaller/CPU-friendlier checkpoint)
17
- # You can switch to "Salesforce/blip2-flan-t5-xl" if you prefer (slower on CPU).
18
  _captioner = pipeline(
19
- task="image-to-text",
20
- model="Salesforce/blip2-flan-t5-xl",
21
- torch_dtype=torch.float32, # CPU
22
- device_map=None, # CPU
23
- )
24
-
25
 
26
  if _tts is None:
27
- # Multilingual XTTS-v2 (runs on CPU; first load may take a bit)
28
- _tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
29
- # Do NOT move to CUDA; we keep CPU-only for Spaces CPU Basic
30
- # (_tts manages device internally; CPU is default)
 
31
 
32
  def describe_and_speak(image, beams, max_tokens):
33
- """
34
- 1) Caption the image in English with BLIP-2
35
- 2) Speak the caption in English with XTTS-v2
36
- Returns: (caption_text, (sample_rate, audio_numpy))
37
- """
38
  load_models_cpu()
39
 
40
- # -------- 1) Caption (English) --------
41
- gen_kwargs = {
42
- "num_beams": int(beams),
43
- "max_new_tokens": int(max_tokens),
44
- }
45
  result = _captioner(image, **gen_kwargs)
46
  caption = (result[0].get("generated_text", "") if result else "").strip()
47
  if not caption:
48
  caption = "A description could not be generated for this image."
49
 
50
- # -------- 2) TTS (English) --------
51
  try:
52
- # XTTS expects language code like "en"
53
- audio = _tts.tts(text=caption, language="en")
54
- # XTTS returns a float32 numpy array; default samplerate is 22050 Hz
55
- sr = 22050
56
- audio = np.asarray(audio, dtype=np.float32)
57
  except Exception as e:
58
- # On any TTS error, return silence and append the error in text
59
  caption += f"\n\n[TTS error: {e}]"
60
  sr = 22050
61
  audio = np.zeros(sr, dtype=np.float32)
@@ -63,34 +53,33 @@ def describe_and_speak(image, beams, max_tokens):
63
  return caption, (sr, audio)
64
 
65
  # ---------------------------
66
- # Gradio UI (simple & clean)
67
  # ---------------------------
68
- with gr.Blocks(title="Image → English Audio (CPU-only)") as demo:
69
  gr.Markdown(
70
- "# Image → Audio Description (CPU-only)\n"
71
- "Upload an image. The app will generate an **English caption (BLIP-2)** and "
72
- "read it aloud using **XTTS-v2**.\n\n"
73
- "Tip: On CPU, first run can be slow while models download."
 
 
 
 
 
74
  )
75
 
76
  with gr.Row():
77
- inp_image = gr.Image(type="pil", label="Upload image (PNG/JPG)")
78
  with gr.Column():
79
- beams = gr.Slider(1, 4, value=2, step=1, label="Caption beams (quality vs speed)")
80
  max_tokens = gr.Slider(10, 60, value=30, step=5, label="Max caption tokens")
81
 
82
  with gr.Row():
83
- out_text = gr.Textbox(label="Caption", lines=3)
84
- out_audio = gr.Audio(label="Spoken caption", type="numpy")
85
 
86
  btn = gr.Button("Generate")
87
- btn.click(
88
- fn=describe_and_speak,
89
- inputs=[inp_image, beams, max_tokens],
90
- outputs=[out_text, out_audio],
91
- api_name="describe_and_speak",
92
- )
93
 
94
  if __name__ == "__main__":
95
- # On Spaces, Gradio handles serving. Locally, this starts the app.
96
  demo.launch()
 
2
  import torch
3
  import numpy as np
4
  from transformers import pipeline
 
5
 
6
  # ---------------------------
7
+ # CPU-only model loaders
8
  # ---------------------------
9
  _captioner = None
10
  _tts = None
11
 
12
  def load_models_cpu():
13
+ """Load BLIP-2 (image captioning) and ESPnet VITS (text-to-speech) on CPU."""
14
  global _captioner, _tts
15
+
16
  if _captioner is None:
17
+ print("Loading BLIP-2 image captioning model...")
 
18
  _captioner = pipeline(
19
+ task="image-to-text",
20
+ model="Salesforce/blip2-flan-t5-xl", # high-quality public model
21
+ torch_dtype=torch.float32,
22
+ device_map=None, # CPU only
23
+ )
 
24
 
25
  if _tts is None:
26
+ print("Loading ESPnet VITS text-to-speech model...")
27
+ _tts = pipeline(
28
+ task="text-to-speech",
29
+ model="espnet/kan-bayashi_ljspeech_vits", # English-only TTS
30
+ )
31
 
32
  def describe_and_speak(image, beams, max_tokens):
33
+ """Generate an English caption for the image and read it aloud."""
 
 
 
 
34
  load_models_cpu()
35
 
36
+ # --- Step 1: Caption the image ---
37
+ gen_kwargs = {"num_beams": int(beams), "max_new_tokens": int(max_tokens)}
 
 
 
38
  result = _captioner(image, **gen_kwargs)
39
  caption = (result[0].get("generated_text", "") if result else "").strip()
40
  if not caption:
41
  caption = "A description could not be generated for this image."
42
 
43
+ # --- Step 2: Convert text to speech ---
44
  try:
45
+ tts_output = _tts(caption)
46
+ audio = np.array(tts_output["audio"], dtype=np.float32)
47
+ sr = tts_output["sampling_rate"]
 
 
48
  except Exception as e:
 
49
  caption += f"\n\n[TTS error: {e}]"
50
  sr = 22050
51
  audio = np.zeros(sr, dtype=np.float32)
 
53
  return caption, (sr, audio)
54
 
55
  # ---------------------------
56
+ # Gradio UI
57
  # ---------------------------
58
+ with gr.Blocks(title="Image → Speech (Hugging Face models, CPU)") as demo:
59
  gr.Markdown(
60
+ """
61
+ # 🖼️ Image → 🎙️ Speech
62
+ Upload an image, and the app will:
63
+ 1. Generate a caption using **BLIP-2**
64
+ 2. Read it aloud using **ESPnet VITS**
65
+
66
+ *Runs fully on CPU (Hugging Face public models).
67
+ First run may take a few minutes while models download.*
68
+ """
69
  )
70
 
71
  with gr.Row():
72
+ inp_image = gr.Image(type="pil", label="Upload an image (JPG or PNG)")
73
  with gr.Column():
74
+ beams = gr.Slider(1, 4, value=2, step=1, label="Caption beams (quality vs. speed)")
75
  max_tokens = gr.Slider(10, 60, value=30, step=5, label="Max caption tokens")
76
 
77
  with gr.Row():
78
+ out_text = gr.Textbox(label="Generated Caption", lines=3)
79
+ out_audio = gr.Audio(label="Spoken Caption", type="numpy")
80
 
81
  btn = gr.Button("Generate")
82
+ btn.click(fn=describe_and_speak, inputs=[inp_image, beams, max_tokens], outputs=[out_text, out_audio])
 
 
 
 
 
83
 
84
  if __name__ == "__main__":
 
85
  demo.launch()
requirements.txt CHANGED
@@ -1,9 +1,11 @@
1
  gradio
2
- transformers>=4.41.0
3
  torch
4
- sentencepiece
5
  accelerate
6
- numpy
7
- soundfile
8
  Pillow
9
- TTS>=0.22.0
 
 
 
 
 
1
  gradio
2
+ transformers>=4.44.2
3
  torch
 
4
  accelerate
5
+ sentencepiece
 
6
  Pillow
7
+ soundfile
8
+ safetensors
9
+ timm
10
+ scipy
11
+ numpy<2.0