Upload AgentF5TTSChunk.py

#11
by pkanda - opened
Files changed (1) hide show
  1. AgentF5TTSChunk.py +145 -190
AgentF5TTSChunk.py CHANGED
@@ -1,190 +1,145 @@
1
- import os
2
- import re
3
- import time
4
- import logging
5
- import subprocess
6
- from f5_tts.api import F5TTS
7
-
8
-
9
-
10
- logging.basicConfig(level=logging.INFO)
11
-
12
-
13
class AgentF5TTS:
    """Generate speech line by line with an F5-TTS model and merge the pieces with FFmpeg."""

    def __init__(self, ckpt_file, vocoder_name="vocos", delay=0, device="mps"):
        """
        Initialize the F5-TTS Agent.

        :param ckpt_file: Path to the safetensors model checkpoint.
        :param vocoder_name: Name of the vocoder to use ("vocos" or "bigvgan").
        :param delay: Delay in seconds between audio generations.
        :param device: Device to use ("cpu", "cuda", "mps").
        """
        self.model = F5TTS(ckpt_file=ckpt_file, vocoder_name=vocoder_name, device=device)
        self.delay = delay  # Delay in seconds

    def generate_emotion_speech(self, text_file, output_audio_file, speaker_emotion_refs, convert_to_mp3=False):
        """
        Generate speech where each line may select a speaker and an emotion.

        A line may start with ``[speaker:NAME, emotion:NAME]``; the tag is removed
        before synthesis. Lines whose (speaker, emotion) pair has no existing
        reference audio are logged and skipped.

        :param text_file: Path to the input text file.
        :param output_audio_file: Path to save the combined audio output.
        :param speaker_emotion_refs: Dictionary mapping (speaker, emotion) tuples to reference audio paths.
        :param convert_to_mp3: Boolean flag to convert the output to MP3.
        :return: None. Problems are reported through ``logging``.
        """
        lines = self._read_lines(text_file)
        if not lines:
            return

        self._ensure_parent_dir(output_audio_file)
        temp_files = []

        for i, line in enumerate(lines):
            speaker, emotion = self._determine_speaker_emotion(line)
            ref_audio = speaker_emotion_refs.get((speaker, emotion))
            line = re.sub(r'\[speaker:.*?\]\s*', '', line)
            if not ref_audio or not os.path.exists(ref_audio):
                logging.error(f"Reference audio not found for speaker '{speaker}', emotion '{emotion}'.")
                continue
            if not line:
                # The line consisted of a tag only; there is nothing to speak.
                logging.warning(f"Line {i + 1} has no text after its speaker tag; skipping.")
                continue

            temp_file = f"{output_audio_file}_line{i + 1}.wav"
            logging.info(f"Generating speech for line {i + 1}: '{line}' with speaker '{speaker}', emotion '{emotion}'")
            if self._render_line(i + 1, line, ref_audio, temp_file, remove_silence=True):
                temp_files.append(temp_file)

        self._combine_audio_files(temp_files, output_audio_file, convert_to_mp3)

    def generate_speech(self, text_file, output_audio_file, ref_audio, convert_to_mp3=False):
        """
        Generate speech for every line of a text file with one reference voice.

        :param text_file: Path to the input text file.
        :param output_audio_file: Path to save the combined audio output.
        :param ref_audio: Path to the reference audio used for every line.
        :param convert_to_mp3: Boolean flag to convert the output to MP3.
        :return: None. Problems are reported through ``logging``.
        """
        lines = self._read_lines(text_file)
        if not lines:
            return

        self._ensure_parent_dir(output_audio_file)

        # The reference is the same for every line, so validate it once
        # instead of logging the same error per line.
        if not ref_audio or not os.path.exists(ref_audio):
            logging.error(f"Reference audio not found for speaker.")
            return

        temp_files = []
        for i, line in enumerate(lines):
            temp_file = f"{output_audio_file}_line{i + 1}.wav"
            logging.info(f"Generating speech for line {i + 1}: '{line}'")
            if self._render_line(i + 1, line, ref_audio, temp_file):
                temp_files.append(temp_file)

        self._combine_audio_files(temp_files, output_audio_file, convert_to_mp3)

    def _read_lines(self, text_file):
        """Return the non-blank, stripped lines of ``text_file``; log and return None if missing or empty."""
        try:
            with open(text_file, "r", encoding="utf-8") as file:
                lines = [line.strip() for line in file if line.strip()]
        except FileNotFoundError:
            logging.error(f"Text file not found: {text_file}")
            return None

        if not lines:
            logging.error("Input text file is empty.")
            return None
        return lines

    @staticmethod
    def _ensure_parent_dir(path):
        """Create the directory that will contain ``path`` when it has one."""
        parent = os.path.dirname(path)
        if parent:  # os.makedirs("") raises for a bare filename such as "out.wav"
            os.makedirs(parent, exist_ok=True)

    def _render_line(self, line_number, text, ref_audio, temp_file, **infer_kwargs):
        """Synthesize one line into ``temp_file``; return True on success, False (logged) on failure."""
        try:
            self.model.infer(
                ref_file=ref_audio,
                ref_text="",  # Placeholder: no transcript of the reference audio is supplied
                gen_text=text,
                file_wave=temp_file,
                **infer_kwargs,
            )
        except Exception as e:  # model failures must not abort the remaining lines
            logging.error(f"Error generating speech for line {line_number}: {e}")
            return False
        if self.delay:
            time.sleep(self.delay)
        return True

    def _determine_speaker_emotion(self, text):
        """
        Extract speaker and emotion from the text using regex.
        Default to "speaker1" and "neutral" if not specified.
        """
        speaker, emotion = "speaker1", "neutral"  # Default values

        # Use regex to find [speaker:speaker_name, emotion:emotion_name]
        match = re.search(r"\[speaker:(.*?), emotion:(.*?)\]", text)
        if match:
            speaker = match.group(1).strip()
            emotion = match.group(2).strip()

        logging.info(f"Determined speaker: '{speaker}', emotion: '{emotion}'")
        return speaker, emotion

    @staticmethod
    def _concat_list_entry(path):
        """Return one ``file '...'`` line for an FFmpeg concat list, absolute and quote-escaped."""
        # Inside single quotes FFmpeg cannot escape a quote, so close the quoted
        # run, emit an escaped quote, and reopen it.
        escaped = os.path.abspath(path).replace("'", "'\\''")
        return f"file '{escaped}'\n"

    @staticmethod
    def _mp3_path(audio_path):
        """Return ``audio_path`` with its extension (only) replaced by ``.mp3``."""
        return os.path.splitext(audio_path)[0] + ".mp3"

    def _combine_audio_files(self, temp_files, output_audio_file, convert_to_mp3):
        """
        Combine multiple audio files into a single file using FFmpeg.

        The per-line files are deleted only after a successful run; the concat
        list file is always deleted.

        :param temp_files: Paths of the per-line audio files, in playback order.
        :param output_audio_file: Path of the combined output.
        :param convert_to_mp3: Also write an MP3 copy next to the output.
        """
        if not temp_files:
            logging.error("No audio files to combine.")
            return

        list_file = "file_list.txt"
        try:
            with open(list_file, "w", encoding="utf-8") as f:
                f.writelines(self._concat_list_entry(temp) for temp in temp_files)

            subprocess.run(["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", list_file, "-c", "copy", output_audio_file], check=True)
            if convert_to_mp3:
                mp3_output = self._mp3_path(output_audio_file)
                if mp3_output == output_audio_file:
                    # Converting a file onto itself would clobber the input.
                    logging.warning(f"Output is already an MP3 path, skipping conversion: {mp3_output}")
                else:
                    subprocess.run(["ffmpeg", "-y", "-i", output_audio_file, "-codec:a", "libmp3lame", "-qscale:a", "2", mp3_output], check=True)
                    logging.info(f"Converted to MP3: {mp3_output}")
            for temp in temp_files:
                os.remove(temp)
        except (subprocess.CalledProcessError, OSError) as e:
            logging.error(f"Error combining audio files: {e}")
        finally:
            if os.path.exists(list_file):
                os.remove(list_file)
157
-
158
-
159
# Example usage, remove from this line on to import into other agents.
# Make sure to adjust the paths to your files.
if __name__ == "__main__":
    model_path = "./F5-TTS/ckpts/pt-br/model_last.safetensors"

    # Lines without a [speaker:..., emotion:...] tag resolve to
    # ("speaker1", "neutral"), so that pair needs a reference as well;
    # without it every untagged line is skipped.
    speaker_emotion_refs = {
        ("speaker1", "neutral"): "ref_audios/speaker1_neutral.wav",
        ("speaker1", "happy"): "ref_audios/speaker1_happy.wav",
        ("speaker1", "sad"): "ref_audios/speaker1_sad.wav",
        ("speaker1", "angry"): "ref_audios/speaker1_angry.wav",
    }
    agent = AgentF5TTS(ckpt_file=model_path, vocoder_name="vocos", delay=6)

    # Tagged input: each line picks its reference by speaker and emotion.
    agent.generate_emotion_speech(
        text_file="input_text.txt",
        output_audio_file="output/final_output_emo.wav",
        speaker_emotion_refs=speaker_emotion_refs,
        convert_to_mp3=True,
    )

    # Plain input: a single reference voice for the whole file.
    agent.generate_speech(
        text_file="input_text2.txt",
        output_audio_file="output/final_output.wav",
        ref_audio="ref_audios/refaudio.mp3",
        convert_to_mp3=True,
    )
187
-
188
-
189
-
190
-
 
1
+ import os
2
+ import re
3
+ import time
4
+ import logging
5
+ import json
6
+ import subprocess
7
+ import gradio as gr
8
+ from f5_tts.api import F5TTS
9
+
10
# JSON file (relative to the working directory) holding the last-used inputs.
CONFIG_FILE = "last_inputs.json"

# Report INFO-level records and above through the root logger.
logging.basicConfig(level=logging.INFO)
15
+
16
class AgentF5TTS:
    """Generate speech line by line with an F5-TTS model and merge the pieces with FFmpeg."""

    def __init__(self, ckpt_file, vocoder_name="vocos", delay=0, device="cuda"):
        """
        Load the F5-TTS model.

        :param ckpt_file: Path to the model checkpoint.
        :param vocoder_name: Name of the vocoder passed to ``F5TTS``.
        :param delay: Delay in seconds after each generated line.
        :param device: Device name passed to ``F5TTS``.
        """
        self.model = F5TTS(ckpt_file=ckpt_file, vocoder_name=vocoder_name, device=device)
        self.delay = delay

    def generate_emotion_speech(self, text, output_audio_folder, speaker_emotion_refs, convert_to_mp3=False):
        """
        Synthesize ``text`` line by line and merge the result into one file.

        A line may start with ``[speaker:NAME, emotion:NAME]``; the tag is removed
        before synthesis. Lines whose (speaker, emotion) pair has no existing
        reference audio are logged and skipped.

        :param text: Input text; every non-blank line is synthesized separately.
        :param output_audio_folder: Folder for the per-line files and the result (created if missing).
        :param speaker_emotion_refs: Dictionary mapping (speaker, emotion) tuples to reference audio paths.
        :param convert_to_mp3: Also write an MP3 copy of the result.
        :return: Path of ``generated_audio.wav`` inside the folder, or None when the
            text is empty or no folder is given. The path is returned even if
            synthesis or merging failed, so callers must check that it exists.
        """
        lines = [line.strip() for line in (text or "").split("\n") if line.strip()]

        if not lines:
            logging.error("Input text is empty.")
            return None

        if not output_audio_folder:
            logging.error("Output audio folder is not specified.")
            return None
        os.makedirs(output_audio_folder, exist_ok=True)
        output_audio_file = os.path.join(output_audio_folder, "generated_audio.wav")
        temp_files = []

        for i, line in enumerate(lines):
            speaker, emotion = self._determine_speaker_emotion(line)
            ref_audio = speaker_emotion_refs.get((speaker, emotion))
            line = re.sub(r'\[speaker:.*?\]\s*', '', line)
            if not ref_audio or not os.path.exists(ref_audio):
                logging.error(f"Reference audio not found for speaker '{speaker}', emotion '{emotion}'.")
                continue
            if not line:
                # The line consisted of a tag only; there is nothing to speak.
                logging.warning(f"Line {i + 1} has no text after its speaker tag; skipping.")
                continue

            temp_file = os.path.join(output_audio_folder, f"line_{i + 1}.wav")

            try:
                logging.info(f"Generating speech for line {i + 1}: '{line}' with speaker '{speaker}', emotion '{emotion}'")
                self.model.infer(
                    ref_file=ref_audio,
                    ref_text="",  # Placeholder or load corresponding text
                    gen_text=line,
                    file_wave=temp_file,
                    remove_silence=True,
                )
            except Exception as e:  # model failures must not abort the remaining lines
                logging.error(f"Error generating speech for line {i + 1}: {e}")
            else:
                temp_files.append(temp_file)
                if self.delay:
                    time.sleep(self.delay)

        self.combine_audio_files(temp_files, output_audio_file, convert_to_mp3)
        return output_audio_file

    def _determine_speaker_emotion(self, text):
        """Return the (speaker, emotion) tagged in ``text``, defaulting to ("speaker1", "neutral")."""
        speaker, emotion = "speaker1", "neutral"
        match = re.search(r"\[speaker:(.*?), emotion:(.*?)\]", text)
        if match:
            speaker = match.group(1).strip()
            emotion = match.group(2).strip()
        return speaker, emotion

    @staticmethod
    def _concat_list_entry(path):
        """Return one ``file '...'`` line for an FFmpeg concat list, absolute and quote-escaped."""
        # FFmpeg resolves relative entries against the list file's directory,
        # not the working directory, so always write absolute paths. Inside
        # single quotes a quote cannot be escaped: close the run, emit an
        # escaped quote, reopen.
        escaped = os.path.abspath(path).replace("'", "'\\''")
        return f"file '{escaped}'\n"

    @staticmethod
    def _mp3_path(audio_path):
        """Return ``audio_path`` with its extension (only) replaced by ``.mp3``."""
        return os.path.splitext(audio_path)[0] + ".mp3"

    def combine_audio_files(self, temp_files, output_audio_file, convert_to_mp3):
        """
        Concatenate ``temp_files`` into ``output_audio_file`` with FFmpeg.

        The per-line files are deleted only after a successful run; the concat
        list file is always deleted. Failures are logged, not raised.

        :param temp_files: Paths of the per-line audio files, in playback order.
        :param output_audio_file: Path of the combined output.
        :param convert_to_mp3: Also write an MP3 copy next to the output.
        """
        if not temp_files:
            logging.error("No audio files to combine.")
            return

        list_file = os.path.join(os.path.dirname(output_audio_file), "file_list.txt")
        try:
            with open(list_file, "w", encoding="utf-8") as f:
                f.writelines(self._concat_list_entry(temp) for temp in temp_files)

            subprocess.run(["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", list_file, "-c", "copy", output_audio_file], check=True)
            if convert_to_mp3:
                mp3_output = self._mp3_path(output_audio_file)
                if mp3_output == output_audio_file:
                    # Converting a file onto itself would clobber the input.
                    logging.warning(f"Output is already an MP3 path, skipping conversion: {mp3_output}")
                else:
                    subprocess.run(["ffmpeg", "-y", "-i", output_audio_file, "-codec:a", "libmp3lame", "-qscale:a", "2", mp3_output], check=True)
                    logging.info(f"Converted to MP3: {mp3_output}")
            for temp in temp_files:
                os.remove(temp)
        except (subprocess.CalledProcessError, OSError) as e:
            logging.error(f"Error combining audio files: {e}")
        finally:
            if os.path.exists(list_file):
                os.remove(list_file)
92
+
93
# Load last inputs from JSON file
def load_last_inputs(path=None):
    """
    Return the inputs stored by a previous run.

    :param path: JSON file to read; defaults to ``CONFIG_FILE``.
    :return: The decoded JSON content, or ``{}`` when the file is missing or
        cannot be decoded.
    """
    source = CONFIG_FILE if path is None else path
    try:
        with open(source, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        # First run: nothing has been saved yet.
        return {}
    except ValueError as e:
        # Covers json.JSONDecodeError and UnicodeDecodeError: a damaged file
        # should fall back to defaults rather than crash the caller.
        logging.warning(f"Ignoring unreadable inputs file {source}: {e}")
        return {}
99
+
100
# Save inputs to JSON file
def save_last_inputs(inputs, path=None):
    """
    Persist ``inputs`` as indented JSON.

    :param inputs: JSON-serializable object to store.
    :param path: JSON file to write; defaults to ``CONFIG_FILE``.
    :raises TypeError: If ``inputs`` is not JSON-serializable; the existing
        file is left untouched in that case.
    """
    target = CONFIG_FILE if path is None else path
    # Serialize before opening: opening with "w" truncates the file, so a
    # serialization error afterwards would destroy the previous contents.
    payload = json.dumps(inputs, indent=4)
    with open(target, "w", encoding="utf-8") as f:
        f.write(payload)
104
+
105
+ # Gradio Interface
106
+ def gradio_interface(model_path, vocoder_name, delay, device, text, output_audio_folder, ref_audio, convert_to_mp3, speaker1_happy, speaker1_sad, speaker1_angry, speaker1_neutral):
107
+ if not os.path.exists(model_path.name):
108
+ logging.error(f"Model path does not exist: {model_path.name}")
109
+ return None
110
+ agent = AgentF5TTS(ckpt_file=model_path.name, vocoder_name=vocoder_name, delay=delay, device=device)
111
+ speaker_emotion_refs = {
112
+ ("speaker1", "happy"): speaker1_happy.name,
113
+ ("speaker1", "sad"): speaker1_sad.name,
114
+ ("speaker1", "angry"): speaker1_angry.name,
115
+ ("speaker1", "neutral"): speaker1_neutral.name,
116
+ }
117
+ if not os.path.exists(output_audio_folder):
118
+ logging.error(f"Output audio folder does not exist: {output_audio_folder}")
119
+ return None
120
+ output_file = agent.generate_emotion_speech(text, output_audio_folder, speaker_emotion_refs, convert_to_mp3)
121
+ return output_file if os.path.exists(output_file) else None
122
+
123
# Launch Gradio App
# The input order below must match the parameter order of gradio_interface.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(label="Model Path"),
        # Explicit initial values: untouched controls would otherwise submit None.
        gr.Dropdown(label="Vocoder", choices=["vocos", "bigvgan"], value="vocos"),
        gr.Number(label="Delay (seconds)", value=0),
        gr.Dropdown(label="Device", choices=["cpu", "cuda", "mps"], value="cuda"),
        # The text is split on newlines, so offer a multi-line box.
        gr.Textbox(label="Input Text", lines=5),
        gr.Textbox(label="Output Audio Folder"),
        gr.File(label="Reference Audio File"),
        gr.Checkbox(label="Convert to MP3"),
        gr.File(label="Speaker1 Happy Reference Audio"),
        gr.File(label="Speaker1 Sad Reference Audio"),
        gr.File(label="Speaker1 Angry Reference Audio"),
        gr.File(label="Speaker1 Neutral Reference Audio"),
    ],
    outputs=gr.Audio(label="Generated Audio"),
    title="F5-TTS Text-to-Speech Generator",
    description="Generate speech from text using the F5-TTS model.",
)

# NOTE(review): launching at import time starts the web server whenever this
# module is imported; consider an `if __name__ == "__main__":` guard if the
# class is meant to be reused from other modules.
iface.launch()