Fix transformers 5.0.0 compatibility

#4
Files changed (2) hide show
  1. app.py +57 -26
  2. requirements.txt +1 -1
app.py CHANGED
@@ -19,34 +19,61 @@ def load_audio_from_url(url):
19
 
20
  @spaces.GPU
21
  def synthesize_speech(text, ref_audio, ref_text):
22
- if ref_audio is None or ref_text.strip() == "":
23
- return "Error: Please provide a reference audio and its corresponding text."
24
-
25
- # Ensure valid reference audio input
26
- if isinstance(ref_audio, tuple) and len(ref_audio) == 2:
27
- sample_rate, audio_data = ref_audio
28
- else:
29
- return "Error: Invalid reference audio input."
30
-
31
- # Save reference audio directly without resampling
32
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
33
- sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format='WAV')
34
- temp_audio.flush()
35
-
36
- audio = model(text, ref_audio_path=temp_audio.name, ref_text=ref_text)
37
-
38
- # Normalize output and save
39
- if audio.dtype == np.int16:
40
- audio = audio.astype(np.float32) / 32768.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
- return 24000, audio
 
 
 
 
 
43
 
44
 
45
- # Load TTS model
46
  repo_id = "ai4bharat/IndicF5"
47
- model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
48
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
49
  print("Device", device)
 
50
  model = model.to(device)
51
 
52
  # Example Data (Multiple Examples)
@@ -87,6 +114,13 @@ EXAMPLES = [
87
  # Preload all example audios
88
  for example in EXAMPLES:
89
  sample_rate, audio_data = load_audio_from_url(example["audio_url"])
 
 
 
 
 
 
 
90
  example["sample_rate"] = sample_rate
91
  example["audio_data"] = audio_data
92
 
@@ -96,11 +130,8 @@ with gr.Blocks() as iface:
96
  gr.Markdown(
97
  """
98
  # **IndicF5: High-Quality Text-to-Speech for Indian Languages**
99
-
100
  [![Hugging Face](https://img.shields.io/badge/HuggingFace-Model-orange)](https://huggingface.co/ai4bharat/IndicF5)
101
-
102
  We release **IndicF5**, a **near-human polyglot** **Text-to-Speech (TTS)** model trained on **1417 hours** of high-quality speech from **[Rasa](https://huggingface.co/datasets/ai4bharat/Rasa), [IndicTTS](https://www.iitm.ac.in/donlab/indictts/database), [LIMMITS](https://sites.google.com/view/limmits24/), and [IndicVoices-R](https://huggingface.co/datasets/ai4bharat/indicvoices_r)**.
103
-
104
  IndicF5 supports **11 Indian languages**:
105
  **Assamese, Bengali, Gujarati, Hindi, Kannada, Malayalam, Marathi, Odia, Punjabi, Tamil, Telugu.**
106
 
@@ -111,7 +142,7 @@ with gr.Blocks() as iface:
111
  with gr.Row():
112
  with gr.Column():
113
  text_input = gr.Textbox(label="Text to Synthesize", placeholder="Enter the text to convert to speech...", lines=3)
114
- ref_audio_input = gr.Audio(type="numpy", label="Reference Prompt Audio")
115
  ref_text_input = gr.Textbox(label="Text in Reference Prompt Audio", placeholder="Enter the transcript of the reference audio...", lines=2)
116
  submit_btn = gr.Button("🎤 Generate Speech", variant="primary")
117
 
 
19
 
20
@spaces.GPU
def synthesize_speech(text, ref_audio, ref_text):
    """Generate speech for *text* conditioned on a reference prompt.

    Parameters
    ----------
    text : str
        Text to synthesize.
    ref_audio : tuple[int, np.ndarray] | None
        (sample_rate, samples) pair as produced by a gradio ``type="numpy"``
        Audio input.
    ref_text : str
        Transcript of the reference audio.

    Returns
    -------
    tuple[int, np.ndarray] | str | None
        ``(24000, float32 samples in [-1, 1])`` on success, an error string
        for invalid user input, or ``None`` when synthesis fails.
    """
    import os  # local import: only needed for temp-file cleanup

    try:
        if ref_audio is None or ref_text.strip() == "":
            return "Error: Please provide a reference audio and its corresponding text."

        # Ensure valid reference audio input.
        if isinstance(ref_audio, tuple) and len(ref_audio) == 2:
            sample_rate, audio_data = ref_audio
        else:
            return "Error: Invalid reference audio input."

        # Save reference audio directly without resampling.  delete=False is
        # required so the model can reopen the file by path, but the original
        # code never removed it, leaking one WAV per call — clean up in
        # `finally` once synthesis is done.
        temp_path = None
        try:
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
                temp_path = temp_audio.name
                sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format='WAV')
                temp_audio.flush()

            audio = model(text, ref_audio_path=temp_path, ref_text=ref_text)
        finally:
            if temp_path is not None:
                try:
                    os.remove(temp_path)
                except OSError:
                    pass  # best-effort cleanup; never mask the real error

        # Validate audio output.
        if audio is None or (isinstance(audio, np.ndarray) and audio.size == 0):
            print("Error: Model returned empty audio")
            return None

        # Normalize output to float32.  int16 is rescaled to [-1, 1]; every
        # other dtype (float64 included) is cast directly — the previous
        # separate float64 branch was a dead duplicate of the generic cast.
        if audio.dtype == np.int16:
            audio = audio.astype(np.float32) / 32768.0
        elif audio.dtype != np.float32:
            audio = audio.astype(np.float32)

        # Peak-normalize and clamp so gradio receives values in [-1.0, 1.0].
        max_val = np.abs(audio).max()
        if max_val > 0:
            audio = audio / max_val
        audio = np.clip(audio, -1.0, 1.0)

        return 24000, audio
    except Exception as e:
        print(f"Error in synthesize_speech: {str(e)}")
        import traceback
        traceback.print_exc()
        return None
70
 
71
 
72
# Load the TTS model (patched to work with transformers 5.0.0).
repo_id = "ai4bharat/IndicF5"

# Pick GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device", device)

# trust_remote_code is required: IndicF5 ships its own modeling code.
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True).to(device)
78
 
79
  # Example Data (Multiple Examples)
 
114
# Preload all example audios, converting each to float32 in [-1, 1] so the
# gradio Audio components get data they accept without warnings.
for example in EXAMPLES:
    sample_rate, audio_data = load_audio_from_url(example["audio_url"])
    if audio_data is not None:
        if np.issubdtype(audio_data.dtype, np.signedinteger):
            # Scale integer PCM by its full-scale value (int16 -> /32768.0,
            # matching the previous hard-coded int16 branch; the old code
            # left e.g. int32 unconverted and then clipped it to +/-1,
            # destroying the samples).
            scale = float(np.iinfo(audio_data.dtype).max) + 1.0
            audio_data = audio_data.astype(np.float32) / scale
        elif audio_data.dtype != np.float32:
            # float64 (and any other float) is cast straight down.
            audio_data = audio_data.astype(np.float32)
        audio_data = np.clip(audio_data, -1.0, 1.0)
    example["sample_rate"] = sample_rate
    example["audio_data"] = audio_data
126
 
 
130
  gr.Markdown(
131
  """
132
  # **IndicF5: High-Quality Text-to-Speech for Indian Languages**
 
133
  [![Hugging Face](https://img.shields.io/badge/HuggingFace-Model-orange)](https://huggingface.co/ai4bharat/IndicF5)
 
134
  We release **IndicF5**, a **near-human polyglot** **Text-to-Speech (TTS)** model trained on **1417 hours** of high-quality speech from **[Rasa](https://huggingface.co/datasets/ai4bharat/Rasa), [IndicTTS](https://www.iitm.ac.in/donlab/indictts/database), [LIMMITS](https://sites.google.com/view/limmits24/), and [IndicVoices-R](https://huggingface.co/datasets/ai4bharat/indicvoices_r)**.
 
135
  IndicF5 supports **11 Indian languages**:
136
  **Assamese, Bengali, Gujarati, Hindi, Kannada, Malayalam, Marathi, Odia, Punjabi, Tamil, Telugu.**
137
 
 
142
  with gr.Row():
143
  with gr.Column():
144
  text_input = gr.Textbox(label="Text to Synthesize", placeholder="Enter the text to convert to speech...", lines=3)
145
+ ref_audio_input = gr.Audio(type="numpy", label="Reference Prompt Audio", sources=["microphone", "upload"])
146
  ref_text_input = gr.Textbox(label="Text in Reference Prompt Audio", placeholder="Enter the transcript of the reference audio...", lines=2)
147
  submit_btn = gr.Button("🎤 Generate Speech", variant="primary")
148
 
requirements.txt CHANGED
@@ -18,7 +18,7 @@ git+https://github.com/ai4bharat/IndicF5.git
18
  # torchaudio>=2.0.0
19
  # torchdiffeq
20
  # tqdm>=4.65.0
21
- transformers<4.50
22
  # transformers_stream_generator
23
  # vocos
24
  # wandb
 
18
  # torchaudio>=2.0.0
19
  # torchdiffeq
20
  # tqdm>=4.65.0
21
+ transformers>=5.0.0
22
  # transformers_stream_generator
23
  # vocos
24
  # wandb