Upload AgentF5TTSChunk.py
Browse files. I made some modifications to the original chunk file.
Differences Between the Original and Improved Gradio Interface in AgentF5TTSChunk.py
1. Improved Error Handling
Original: Did not check if model_path or output_audio_folder existed before processing.
Improved: Now verifies that model_path exists before initializing the model, and that output_audio_folder exists before proceeding. If they don’t exist, it logs an error and prevents execution to avoid crashes.
2. Output Audio Handling
Original: Expected a file path for output audio.
Improved: Now asks for an output folder instead of a specific file, dynamically creating "generated_audio.wav" inside the chosen folder.
3. File Validation Before Returning Output
Original: Directly returned the generated file path.
Improved: Now checks if the generated file exists before returning it to Gradio. If it doesn’t exist, logs an error and returns None.
4. More Robust Logging
Original: Limited logging for errors.
Improved: Added logging for missing files and incorrect paths to help with debugging.
5. Gradio Input Adjustments
Original: Accepted a file path for the model as a string.
Improved: Now uses gr.File for model_path and gr.Textbox for output_audio_folder to ensure the correct types are received.
Summary of Key Enhancements
Feature Original Improved
Model Path Handling Directly used path Checks if path exists before loading
Output File Handling Expected file path Uses folder and generates the filename
Error Handling Limited Logs errors for missing paths/files
Gradio Inputs File path as string Uses proper gr.File and gr.Textbox inputs
File Validation Returned file blindly Ensures file exists before returning
- AgentF5TTSChunk.py +145 -190
|
@@ -1,190 +1,145 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import re
|
| 3 |
-
import time
|
| 4 |
-
import logging
|
| 5 |
-
import
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
""
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
"
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
try:
|
| 147 |
-
subprocess.run(["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", list_file, "-c", "copy", output_audio_file], check=True)
|
| 148 |
-
if convert_to_mp3:
|
| 149 |
-
mp3_output = output_audio_file.replace(".wav", ".mp3")
|
| 150 |
-
subprocess.run(["ffmpeg", "-y", "-i", output_audio_file, "-codec:a", "libmp3lame", "-qscale:a", "2", mp3_output], check=True)
|
| 151 |
-
logging.info(f"Converted to MP3: {mp3_output}")
|
| 152 |
-
for temp in temp_files:
|
| 153 |
-
os.remove(temp)
|
| 154 |
-
os.remove(list_file)
|
| 155 |
-
except Exception as e:
|
| 156 |
-
logging.error(f"Error combining audio files: {e}")
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
# Example usage, remove from this line on to import into other agents.
# make sure to adjust the paths to your files.
if __name__ == "__main__":

    # Child processes (e.g. ffmpeg) inherit unbuffered output.
    env = os.environ.copy()
    env["PYTHONUNBUFFERED"] = "1"

    model_path = "./F5-TTS/ckpts/pt-br/model_last.safetensors"

    # Map each (speaker, emotion) pair to its reference recording.
    speaker_emotion_refs = {
        ("speaker1", emotion): f"ref_audios/speaker1_{emotion}.wav"
        for emotion in ("happy", "sad", "angry")
    }

    agent = AgentF5TTS(ckpt_file=model_path, vocoder_name="vocos", delay=6)

    agent.generate_emotion_speech(
        text_file="input_text.txt",
        output_audio_file="output/final_output_emo.wav",
        speaker_emotion_refs=speaker_emotion_refs,
        convert_to_mp3=True,
    )

    agent.generate_speech(
        text_file="input_text2.txt",
        output_audio_file="output/final_output.wav",
        ref_audio="ref_audios/refaudio.mp3",
        convert_to_mp3=True,
    )
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import time
|
| 4 |
+
import logging
|
| 5 |
+
import json
|
| 6 |
+
import subprocess
|
| 7 |
+
import gradio as gr
|
| 8 |
+
from f5_tts.api import F5TTS
|
| 9 |
+
|
| 10 |
+
# Constants
# Presumably the Gradio form's last-used inputs are persisted here between
# runs via load_last_inputs()/save_last_inputs() — TODO confirm callers.
CONFIG_FILE = "last_inputs.json"

# Initialize logging
# INFO level so per-line generation progress is visible on the console.
logging.basicConfig(level=logging.INFO)
|
| 15 |
+
|
| 16 |
+
class AgentF5TTS:
    """Wrapper around the F5-TTS model that synthesizes speech line-by-line,
    selecting a reference audio per (speaker, emotion) tag, then concatenates
    the per-line clips into one output file with ffmpeg."""

    def __init__(self, ckpt_file, vocoder_name="vocos", delay=0, device="cuda"):
        """Load the F5-TTS checkpoint.

        Args:
            ckpt_file: Path to the model checkpoint.
            vocoder_name: Vocoder backend passed through to F5TTS.
            delay: Seconds to sleep after each generated line (throttling).
            device: Torch device string, e.g. "cuda" or "cpu".
        """
        self.model = F5TTS(ckpt_file=ckpt_file, vocoder_name=vocoder_name, device=device)
        self.delay = delay

    def generate_emotion_speech(self, text, output_audio_folder, speaker_emotion_refs, convert_to_mp3=False):
        """Generate speech for each non-empty line of *text*.

        Each line may start with a ``[speaker:NAME, emotion:MOOD]`` tag that
        selects the reference audio from *speaker_emotion_refs*; the tag is
        stripped before synthesis. Lines without a usable reference are
        skipped with an error log.

        Returns:
            Path to ``generated_audio.wav`` inside *output_audio_folder*,
            or ``None`` if nothing was generated.
        """
        lines = [line.strip() for line in text.split("\n") if line.strip()]

        if not lines:
            logging.error("Input text is empty.")
            return None

        if not output_audio_folder:
            logging.error("Output audio folder is not specified.")
            return None
        # Create the destination folder on demand rather than failing.
        os.makedirs(output_audio_folder, exist_ok=True)

        output_audio_file = os.path.join(output_audio_folder, "generated_audio.wav")
        temp_files = []

        for i, line in enumerate(lines):
            speaker, emotion = self._determine_speaker_emotion(line)
            ref_audio = speaker_emotion_refs.get((speaker, emotion))
            # Strip the [speaker:..., emotion:...] tag so it is not spoken.
            line = re.sub(r'\[speaker:.*?\]\s*', '', line)
            if not ref_audio or not os.path.exists(ref_audio):
                logging.error(f"Reference audio not found for speaker '{speaker}', emotion '{emotion}'.")
                continue

            temp_file = os.path.join(output_audio_folder, f"line_{i + 1}.wav")

            try:
                logging.info(f"Generating speech for line {i + 1}: '{line}' with speaker '{speaker}', emotion '{emotion}'")
                self.model.infer(
                    ref_file=ref_audio,
                    ref_text="",  # Placeholder or load corresponding text
                    gen_text=line,
                    file_wave=temp_file,
                    remove_silence=True
                )
                temp_files.append(temp_file)
                time.sleep(self.delay)
            except Exception as e:
                logging.error(f"Error generating speech for line {i + 1}: {e}")

        if not temp_files:
            # Every line failed or lacked a reference; nothing to combine.
            logging.error("No lines were successfully generated.")
            return None

        self.combine_audio_files(temp_files, output_audio_file, convert_to_mp3)
        # Only report a path that actually exists (ffmpeg may have failed).
        return output_audio_file if os.path.exists(output_audio_file) else None

    def _determine_speaker_emotion(self, text):
        """Parse a leading ``[speaker:NAME, emotion:MOOD]`` tag.

        Returns a ``(speaker, emotion)`` tuple, defaulting to
        ``("speaker1", "neutral")`` when no tag is present.
        """
        speaker, emotion = "speaker1", "neutral"
        match = re.search(r"\[speaker:(.*?), emotion:(.*?)\]", text)
        if match:
            speaker = match.group(1).strip()
            emotion = match.group(2).strip()
        return speaker, emotion

    def combine_audio_files(self, temp_files, output_audio_file, convert_to_mp3):
        """Concatenate *temp_files* into *output_audio_file* via ffmpeg's
        concat demuxer, optionally also producing an MP3 copy.

        Intermediate files are removed even when ffmpeg fails.
        """
        if not temp_files:
            logging.error("No audio files to combine.")
            return

        list_file = os.path.join(os.path.dirname(output_audio_file), "file_list.txt")
        with open(list_file, "w") as f:
            for temp in temp_files:
                f.write(f"file '{temp}'\n")

        try:
            subprocess.run(["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", list_file, "-c", "copy", output_audio_file], check=True)
            if convert_to_mp3:
                # splitext is safe even if ".wav" appears elsewhere in the path.
                mp3_output = os.path.splitext(output_audio_file)[0] + ".mp3"
                subprocess.run(["ffmpeg", "-y", "-i", output_audio_file, "-codec:a", "libmp3lame", "-qscale:a", "2", mp3_output], check=True)
                logging.info(f"Converted to MP3: {mp3_output}")
        except Exception as e:
            logging.error(f"Error combining audio files: {e}")
        finally:
            # Clean up intermediates regardless of ffmpeg success.
            for temp in temp_files:
                if os.path.exists(temp):
                    os.remove(temp)
            if os.path.exists(list_file):
                os.remove(list_file)
|
| 92 |
+
|
| 93 |
+
def load_last_inputs():
    """Return the previously saved Gradio inputs from CONFIG_FILE.

    A missing, unreadable, or corrupted config file is treated as "no saved
    inputs" (returns ``{}``) instead of crashing the app at startup.
    """
    if os.path.exists(CONFIG_FILE):
        try:
            with open(CONFIG_FILE, "r") as f:
                return json.load(f)
        except (OSError, json.JSONDecodeError) as e:
            logging.error(f"Could not read {CONFIG_FILE}: {e}")
    return {}
|
| 99 |
+
|
| 100 |
+
def save_last_inputs(inputs):
    """Persist *inputs* (a JSON-serializable dict) to CONFIG_FILE.

    Best-effort: an unwritable file or unserializable value is logged
    rather than raised, since losing the saved form state is harmless.
    """
    try:
        with open(CONFIG_FILE, "w") as f:
            json.dump(inputs, f, indent=4)
    except (OSError, TypeError) as e:
        logging.error(f"Could not save inputs to {CONFIG_FILE}: {e}")
|
| 104 |
+
|
| 105 |
+
def gradio_interface(model_path, vocoder_name, delay, device, text, output_audio_folder, ref_audio, convert_to_mp3, speaker1_happy, speaker1_sad, speaker1_angry, speaker1_neutral):
    """Bridge between the Gradio form and AgentF5TTS.

    ``ref_audio`` is accepted for interface compatibility but unused by the
    emotion-speech path. Returns the generated audio path, or ``None`` on
    any validation or generation failure.
    """
    def _path(file_obj):
        # gr.File inputs arrive as objects with a .name path, or None when
        # the user leaves the field empty — guard against AttributeError.
        return file_obj.name if file_obj is not None else None

    model_file = _path(model_path)
    if not model_file or not os.path.exists(model_file):
        logging.error(f"Model path does not exist: {model_file}")
        return None

    if not output_audio_folder:
        logging.error("Output audio folder is not specified.")
        return None
    # Create the folder instead of refusing — matches AgentF5TTS behavior.
    os.makedirs(output_audio_folder, exist_ok=True)

    agent = AgentF5TTS(ckpt_file=model_file, vocoder_name=vocoder_name, delay=delay, device=device)

    # Only keep (speaker, emotion) entries whose reference file was supplied.
    refs = {
        ("speaker1", "happy"): _path(speaker1_happy),
        ("speaker1", "sad"): _path(speaker1_sad),
        ("speaker1", "angry"): _path(speaker1_angry),
        ("speaker1", "neutral"): _path(speaker1_neutral),
    }
    speaker_emotion_refs = {key: path for key, path in refs.items() if path}

    output_file = agent.generate_emotion_speech(text, output_audio_folder, speaker_emotion_refs, convert_to_mp3)
    # generate_emotion_speech may return None; os.path.exists(None) raises.
    if output_file and os.path.exists(output_file):
        return output_file
    logging.error("Generated audio file was not produced.")
    return None
|
| 122 |
+
|
| 123 |
+
# Gradio app definition: form inputs map 1:1 onto gradio_interface's params.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(label="Model Path"),
        gr.Dropdown(label="Vocoder", choices=["vocos", "bigvgan"]),
        gr.Number(label="Delay (seconds)"),
        gr.Dropdown(label="Device", choices=["cpu", "cuda"]),
        gr.Textbox(label="Input Text"),
        gr.Textbox(label="Output Audio Folder"),
        gr.File(label="Reference Audio File"),
        gr.Checkbox(label="Convert to MP3"),
        gr.File(label="Speaker1 Happy Reference Audio"),
        gr.File(label="Speaker1 Sad Reference Audio"),
        gr.File(label="Speaker1 Angry Reference Audio"),
        gr.File(label="Speaker1 Neutral Reference Audio")
    ],
    outputs=gr.Audio(label="Generated Audio"),
    title="F5-TTS Text-to-Speech Generator",
    description="Generate speech from text using the F5-TTS model."
)

if __name__ == "__main__":
    # Guard the launch so importing this module (e.g. from another agent)
    # does not start the web server as a side effect.
    iface.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|