Commit 9daf1d6 by pkanda (verified) · Parent(s): ac0cd94

Upload AgentF5TTSChunk.py


I made some modifications to the original chunk file.

Differences Between the Original and Improved Gradio Interface in AgentF5TTSChunk.py
1. Improved Error Handling

Original: Did not check if model_path or output_audio_folder existed before processing.
Improved: Now verifies that model_path exists before initializing the model, and that output_audio_folder exists before proceeding. If either is missing, it logs an error and stops execution to avoid a crash.
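A minimal sketch of the guard logic, assuming the script's existing logging setup (the `validate_paths` helper is illustrative; in the actual file the checks are inlined in `gradio_interface`):

```python
import os
import logging

logging.basicConfig(level=logging.INFO)

def validate_paths(model_path, output_audio_folder):
    # Refuse to run when either path is missing, mirroring the
    # inline checks that gradio_interface performs in the diff below.
    if not os.path.exists(model_path):
        logging.error(f"Model path does not exist: {model_path}")
        return False
    if not os.path.exists(output_audio_folder):
        logging.error(f"Output audio folder does not exist: {output_audio_folder}")
        return False
    return True
```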

2. Output Audio Handling

Original: Expected a file path for output audio.
Improved: Now asks for an output folder instead of a specific file, dynamically creating "generated_audio.wav" inside the chosen folder.
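The idea in isolation (`build_output_path` is a hypothetical name; `generate_emotion_speech` does this inline):

```python
import os

def build_output_path(output_audio_folder):
    # Create the folder if it does not exist yet, then fix the
    # output filename inside it rather than asking the user for one.
    os.makedirs(output_audio_folder, exist_ok=True)
    return os.path.join(output_audio_folder, "generated_audio.wav")
```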

3. File Validation Before Returning Output

Original: Directly returned the generated file path.
Improved: Now checks if the generated file exists before returning it to Gradio. If it doesn’t exist, logs an error and returns None.
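Roughly this pattern (`return_if_exists` is a hypothetical helper; the real check is a one-liner at the end of `gradio_interface`):

```python
import os
import logging

def return_if_exists(path):
    # Hand the path to Gradio only if FFmpeg actually wrote the file.
    if path and os.path.exists(path):
        return path
    logging.error(f"Generated file is missing: {path}")
    return None
```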

4. More Robust Logging

Original: Limited logging for errors.
Improved: Added logging for missing files and incorrect paths to help with debugging.
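All diagnostics go through the standard logging module rather than bare print calls, e.g.:

```python
import logging

logging.basicConfig(level=logging.INFO)
logging.error("Reference audio not found for speaker 'speaker1', emotion 'happy'.")
```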

5. Gradio Input Adjustments

Original: Accepted a file path for the model as a string.
Improved: Now uses gr.File for model_path and gr.Textbox for output_audio_folder to ensure the correct types are received.
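A minimal sketch of the distinction, assuming Gradio 3.x semantics, where gr.File passes the function a temp-file object whose .name attribute holds the local path and gr.Textbox passes a plain string:

```python
import gradio as gr

def show_paths(model_file, output_folder):
    # model_file is a file wrapper (use .name for its path);
    # output_folder arrives as an ordinary string.
    return f"model: {model_file.name}, folder: {output_folder}"

demo = gr.Interface(
    fn=show_paths,
    inputs=[gr.File(label="Model Path"), gr.Textbox(label="Output Audio Folder")],
    outputs="text",
)

if __name__ == "__main__":
    demo.launch()
```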

Summary of Key Enhancements
| Feature | Original | Improved |
| --- | --- | --- |
| Model Path Handling | Directly used path | Checks if path exists before loading |
| Output File Handling | Expected file path | Uses folder and generates the filename |
| Error Handling | Limited | Logs errors for missing paths/files |
| Gradio Inputs | File path as string | Uses proper gr.File and gr.Textbox inputs |
| File Validation | Returned file blindly | Ensures file exists before returning |

Files changed (1)
  1. AgentF5TTSChunk.py +145 -190
AgentF5TTSChunk.py CHANGED
@@ -1,190 +1,145 @@
- import os
- import re
- import time
- import logging
- import subprocess
- from f5_tts.api import F5TTS
-
-
-
- logging.basicConfig(level=logging.INFO)
-
-
- class AgentF5TTS:
-     def __init__(self, ckpt_file, vocoder_name="vocos", delay=0, device="mps"):
-         """
-         Initialize the F5-TTS Agent.
-
-         :param ckpt_file: Path to the safetensors model checkpoint.
-         :param vocoder_name: Name of the vocoder to use ("vocos" or "bigvgan").
-         :param delay: Delay in seconds between audio generations.
-         :param device: Device to use ("cpu", "cuda", "mps").
-         """
-         self.model = F5TTS(ckpt_file=ckpt_file, vocoder_name=vocoder_name, device=device)
-         self.delay = delay # Delay in seconds
-
-     def generate_emotion_speech(self, text_file, output_audio_file, speaker_emotion_refs, convert_to_mp3=False):
-         """
-         Generate speech using the F5-TTS model.
-
-         :param text_file: Path to the input text file.
-         :param output_audio_file: Path to save the combined audio output.
-         :param speaker_emotion_refs: Dictionary mapping (speaker, emotion) tuples to reference audio paths.
-         :param convert_to_mp3: Boolean flag to convert the output to MP3.
-         """
-         try:
-             with open(text_file, "r", encoding="utf-8") as file:
-                 lines = [line.strip() for line in file if line.strip()]
-         except FileNotFoundError:
-             logging.error(f"Text file not found: {text_file}")
-             return
-
-         if not lines:
-             logging.error("Input text file is empty.")
-             return
-
-         temp_files = []
-         os.makedirs(os.path.dirname(output_audio_file), exist_ok=True)
-
-         for i, line in enumerate(lines):
-
-             speaker, emotion = self._determine_speaker_emotion(line)
-             ref_audio = speaker_emotion_refs.get((speaker, emotion))
-             line = re.sub(r'\[speaker:.*?\]\s*', '', line)
-             if not ref_audio or not os.path.exists(ref_audio):
-                 logging.error(f"Reference audio not found for speaker '{speaker}', emotion '{emotion}'.")
-                 continue
-
-             ref_text = "" # Placeholder or load corresponding text
-             temp_file = f"{output_audio_file}_line{i + 1}.wav"
-
-             try:
-                 logging.info(f"Generating speech for line {i + 1}: '{line}' with speaker '{speaker}', emotion '{emotion}'")
-                 self.model.infer(
-                     ref_file=ref_audio,
-                     ref_text=ref_text,
-                     gen_text=line,
-                     file_wave=temp_file,
-                     remove_silence=True,
-                 )
-                 temp_files.append(temp_file)
-                 time.sleep(self.delay)
-             except Exception as e:
-                 logging.error(f"Error generating speech for line {i + 1}: {e}")
-
-         self._combine_audio_files(temp_files, output_audio_file, convert_to_mp3)
-
-
-
-     def generate_speech(self, text_file, output_audio_file, ref_audio, convert_to_mp3=False):
-         try:
-             with open(text_file, 'r', encoding='utf-8') as file:
-                 lines = [line.strip() for line in file if line.strip()]
-         except FileNotFoundError:
-             logging.error(f"Text file not found: {text_file}")
-             return
-
-         if not lines:
-             logging.error("Input text file is empty.")
-             return
-
-         temp_files = []
-         os.makedirs(os.path.dirname(output_audio_file), exist_ok=True)
-
-         for i, line in enumerate(lines):
-
-             if not ref_audio or not os.path.exists(ref_audio):
-                 logging.error(f"Reference audio not found for speaker.")
-                 continue
-             temp_file = f"{output_audio_file}_line{i + 1}.wav"
-
-             try:
-                 logging.info(f"Generating speech for line {i + 1}: '{line}'")
-                 self.model.infer(
-                     ref_file=ref_audio, # No reference audio
-                     ref_text="", # No reference text
-                     gen_text=line,
-                     file_wave=temp_file,
-                 )
-                 temp_files.append(temp_file)
-             except Exception as e:
-                 logging.error(f"Error generating speech for line {i + 1}: {e}")
-
-         # Combine temp_files into output_audio_file if needed
-         self._combine_audio_files(temp_files, output_audio_file, convert_to_mp3)
-
-
-
-
-     def _determine_speaker_emotion(self, text):
-         """
-         Extract speaker and emotion from the text using regex.
-         Default to "speaker1" and "neutral" if not specified.
-         """
-         speaker, emotion = "speaker1", "neutral" # Default values
-
-         # Use regex to find [speaker:speaker_name, emotion:emotion_name]
-         match = re.search(r"\[speaker:(.*?), emotion:(.*?)\]", text)
-         if match:
-             speaker = match.group(1).strip()
-             emotion = match.group(2).strip()
-
-         logging.info(f"Determined speaker: '{speaker}', emotion: '{emotion}'")
-         return speaker, emotion
-
-     def _combine_audio_files(self, temp_files, output_audio_file, convert_to_mp3):
-         """Combine multiple audio files into a single file using FFmpeg."""
-         if not temp_files:
-             logging.error("No audio files to combine.")
-             return
-
-         list_file = "file_list.txt"
-         with open(list_file, "w") as f:
-             for temp in temp_files:
-                 f.write(f"file '{temp}'\n")
-
-         try:
-             subprocess.run(["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", list_file, "-c", "copy", output_audio_file], check=True)
-             if convert_to_mp3:
-                 mp3_output = output_audio_file.replace(".wav", ".mp3")
-                 subprocess.run(["ffmpeg", "-y", "-i", output_audio_file, "-codec:a", "libmp3lame", "-qscale:a", "2", mp3_output], check=True)
-                 logging.info(f"Converted to MP3: {mp3_output}")
-             for temp in temp_files:
-                 os.remove(temp)
-             os.remove(list_file)
-         except Exception as e:
-             logging.error(f"Error combining audio files: {e}")
-
-
- # Example usage, remove from this line on to import into other agents.
- # make sure to adjust the paths to yourr files.
- if __name__ == "__main__":
-
-     env = os.environ.copy()
-     env["PYTHONUNBUFFERED"] = "1"
-
-     model_path = "./F5-TTS/ckpts/pt-br/model_last.safetensors"
-     speaker_emotion_refs = {
-         ("speaker1", "happy"): "ref_audios/speaker1_happy.wav",
-         ("speaker1", "sad"): "ref_audios/speaker1_sad.wav",
-         ("speaker1", "angry"): "ref_audios/speaker1_angry.wav",
-     }
-     agent = AgentF5TTS(ckpt_file=model_path, vocoder_name="vocos", delay=6)
-
-     agent.generate_emotion_speech(
-         text_file="input_text.txt",
-         output_audio_file="output/final_output_emo.wav",
-         speaker_emotion_refs=speaker_emotion_refs,
-         convert_to_mp3=True,
-     )
-
-     agent.generate_speech(
-         text_file="input_text2.txt",
-         output_audio_file="output/final_output.wav",
-         ref_audio="ref_audios/refaudio.mp3",
-         convert_to_mp3=True,
-     )
-
-
-
-
 
+ import os
+ import re
+ import time
+ import logging
+ import json
+ import subprocess
+ import gradio as gr
+ from f5_tts.api import F5TTS
+
+ # Constants
+ CONFIG_FILE = "last_inputs.json"
+
+ # Initialize logging
+ logging.basicConfig(level=logging.INFO)
+
+ class AgentF5TTS:
+     def __init__(self, ckpt_file, vocoder_name="vocos", delay=0, device="cuda"):
+         self.model = F5TTS(ckpt_file=ckpt_file, vocoder_name=vocoder_name, device=device)
+         self.delay = delay
+
+     def generate_emotion_speech(self, text, output_audio_folder, speaker_emotion_refs, convert_to_mp3=False):
+         lines = [line.strip() for line in text.split("\n") if line.strip()]
+
+         if not lines:
+             logging.error("Input text is empty.")
+             return
+
+         if not output_audio_folder:
+             logging.error("Output audio folder is not specified.")
+             return None
+         if not os.path.exists(output_audio_folder):
+             os.makedirs(output_audio_folder, exist_ok=True)
+         output_audio_file = os.path.join(output_audio_folder, "generated_audio.wav")
+         temp_files = []
+
+         for i, line in enumerate(lines):
+             speaker, emotion = self._determine_speaker_emotion(line)
+             ref_audio = speaker_emotion_refs.get((speaker, emotion))
+             line = re.sub(r'\[speaker:.*?\]\s*', '', line)
+             if not ref_audio or not os.path.exists(ref_audio):
+                 logging.error(f"Reference audio not found for speaker '{speaker}', emotion '{emotion}'.")
+                 continue
+
+             temp_file = os.path.join(output_audio_folder, f"line_{i + 1}.wav")
+
+             try:
+                 logging.info(f"Generating speech for line {i + 1}: '{line}' with speaker '{speaker}', emotion '{emotion}'")
+                 self.model.infer(
+                     ref_file=ref_audio,
+                     ref_text="", # Placeholder or load corresponding text
+                     gen_text=line,
+                     file_wave=temp_file,
+                     remove_silence=True
+                 )
+                 temp_files.append(temp_file)
+                 time.sleep(self.delay)
+             except Exception as e:
+                 logging.error(f"Error generating speech for line {i + 1}: {e}")
+
+         self.combine_audio_files(temp_files, output_audio_file, convert_to_mp3)
+         return output_audio_file
+
+     def _determine_speaker_emotion(self, text):
+         speaker, emotion = "speaker1", "neutral"
+         match = re.search(r"\[speaker:(.*?), emotion:(.*?)\]", text)
+         if match:
+             speaker = match.group(1).strip()
+             emotion = match.group(2).strip()
+         return speaker, emotion
+
+     def combine_audio_files(self, temp_files, output_audio_file, convert_to_mp3):
+         if not temp_files:
+             logging.error("No audio files to combine.")
+             return
+
+         list_file = os.path.join(os.path.dirname(output_audio_file), "file_list.txt")
+         with open(list_file, "w") as f:
+             for temp in temp_files:
+                 f.write(f"file '{temp}'\n")
+
+         try:
+             subprocess.run(["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", list_file, "-c", "copy", output_audio_file], check=True)
+             if convert_to_mp3:
+                 mp3_output = output_audio_file.replace(".wav", ".mp3")
+                 subprocess.run(["ffmpeg", "-y", "-i", output_audio_file, "-codec:a", "libmp3lame", "-qscale:a", "2", mp3_output], check=True)
+                 logging.info(f"Converted to MP3: {mp3_output}")
+             for temp in temp_files:
+                 os.remove(temp)
+             os.remove(list_file)
+         except Exception as e:
+             logging.error(f"Error combining audio files: {e}")
+
+ # Load last inputs from JSON file
+ def load_last_inputs():
+     if os.path.exists(CONFIG_FILE):
+         with open(CONFIG_FILE, "r") as f:
+             return json.load(f)
+     return {}
+
+ # Save inputs to JSON file
+ def save_last_inputs(inputs):
+     with open(CONFIG_FILE, "w") as f:
+         json.dump(inputs, f, indent=4)
+
+ # Gradio Interface
+ def gradio_interface(model_path, vocoder_name, delay, device, text, output_audio_folder, ref_audio, convert_to_mp3, speaker1_happy, speaker1_sad, speaker1_angry, speaker1_neutral):
+     if not os.path.exists(model_path.name):
+         logging.error(f"Model path does not exist: {model_path.name}")
+         return None
+     agent = AgentF5TTS(ckpt_file=model_path.name, vocoder_name=vocoder_name, delay=delay, device=device)
+     speaker_emotion_refs = {
+         ("speaker1", "happy"): speaker1_happy.name,
+         ("speaker1", "sad"): speaker1_sad.name,
+         ("speaker1", "angry"): speaker1_angry.name,
+         ("speaker1", "neutral"): speaker1_neutral.name,
+     }
+     if not os.path.exists(output_audio_folder):
+         logging.error(f"Output audio folder does not exist: {output_audio_folder}")
+         return None
+     output_file = agent.generate_emotion_speech(text, output_audio_folder, speaker_emotion_refs, convert_to_mp3)
+     return output_file if os.path.exists(output_file) else None
+
+ # Launch Gradio App
+ iface = gr.Interface(
+     fn=gradio_interface,
+     inputs=[
+         gr.File(label="Model Path"),
+         gr.Dropdown(label="Vocoder", choices=["vocos", "bigvgan"]),
+         gr.Number(label="Delay (seconds)"),
+         gr.Dropdown(label="Device", choices=["cpu", "cuda"]),
+         gr.Textbox(label="Input Text"),
+         gr.Textbox(label="Output Audio Folder"),
+         gr.File(label="Reference Audio File"),
+         gr.Checkbox(label="Convert to MP3"),
+         gr.File(label="Speaker1 Happy Reference Audio"),
+         gr.File(label="Speaker1 Sad Reference Audio"),
+         gr.File(label="Speaker1 Angry Reference Audio"),
+         gr.File(label="Speaker1 Neutral Reference Audio")
+     ],
+     outputs=gr.Audio(label="Generated Audio"),
+     title="F5-TTS Text-to-Speech Generator",
+     description="Generate speech from text using the F5-TTS model."
+ )
+
+ iface.launch()