Commit 9daf1d6 by pkanda (verified) · Parent(s): ac0cd94

Upload AgentF5TTSChunk.py


I made some modifications to the original chunk file.

Differences Between the Original and Improved Gradio Interface in AgentF5TTSChunk.py
1. Improved Error Handling

Original: Did not check if model_path or output_audio_folder existed before processing.
Improved: Now verifies that model_path exists before initializing the model, and that output_audio_folder exists before proceeding. If either is missing, it logs an error and stops execution to avoid a crash.
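A minimal sketch of the guard logic, assuming the script's existing logging setup (the `validate_paths` helper is illustrative; in the actual file the checks are inlined in `gradio_interface`):

```python
import os
import logging

logging.basicConfig(level=logging.INFO)

def validate_paths(model_path, output_audio_folder):
    # Refuse to run when either path is missing, mirroring the
    # inline checks that gradio_interface performs in the diff below.
    if not os.path.exists(model_path):
        logging.error(f"Model path does not exist: {model_path}")
        return False
    if not os.path.exists(output_audio_folder):
        logging.error(f"Output audio folder does not exist: {output_audio_folder}")
        return False
    return True
```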

2. Output Audio Handling

Original: Expected a file path for output audio.
Improved: Now asks for an output folder instead of a specific file, dynamically creating "generated_audio.wav" inside the chosen folder.
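The idea in isolation (`build_output_path` is a hypothetical name; `generate_emotion_speech` does this inline):

```python
import os

def build_output_path(output_audio_folder):
    # Create the folder if it does not exist yet, then fix the
    # output filename inside it rather than asking the user for one.
    os.makedirs(output_audio_folder, exist_ok=True)
    return os.path.join(output_audio_folder, "generated_audio.wav")
```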

3. File Validation Before Returning Output

Original: Directly returned the generated file path.
Improved: Now checks if the generated file exists before returning it to Gradio. If it doesn’t exist, logs an error and returns None.
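Roughly this pattern (`return_if_exists` is a hypothetical helper; the real check is a one-liner at the end of `gradio_interface`):

```python
import os
import logging

def return_if_exists(path):
    # Hand the path to Gradio only if FFmpeg actually wrote the file.
    if path and os.path.exists(path):
        return path
    logging.error(f"Generated file is missing: {path}")
    return None
```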

4. More Robust Logging

Original: Limited logging for errors.
Improved: Added logging for missing files and incorrect paths to help with debugging.
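All diagnostics go through the standard logging module rather than bare print calls, e.g.:

```python
import logging

logging.basicConfig(level=logging.INFO)
logging.error("Reference audio not found for speaker 'speaker1', emotion 'happy'.")
```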

5. Gradio Input Adjustments

Original: Accepted a file path for the model as a string.
Improved: Now uses gr.File for model_path and gr.Textbox for output_audio_folder to ensure the correct types are received.
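A minimal sketch of the distinction, assuming Gradio 3.x semantics, where gr.File passes the function a temp-file object whose .name attribute holds the local path and gr.Textbox passes a plain string:

```python
import gradio as gr

def show_paths(model_file, output_folder):
    # model_file is a file wrapper (use .name for its path);
    # output_folder arrives as an ordinary string.
    return f"model: {model_file.name}, folder: {output_folder}"

demo = gr.Interface(
    fn=show_paths,
    inputs=[gr.File(label="Model Path"), gr.Textbox(label="Output Audio Folder")],
    outputs="text",
)

if __name__ == "__main__":
    demo.launch()
```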

Summary of Key Enhancements
| Feature | Original | Improved |
| --- | --- | --- |
| Model Path Handling | Directly used path | Checks if path exists before loading |
| Output File Handling | Expected file path | Uses folder and generates the filename |
| Error Handling | Limited | Logs errors for missing paths/files |
| Gradio Inputs | File path as string | Uses proper gr.File and gr.Textbox inputs |
| File Validation | Returned file blindly | Ensures file exists before returning |

Files changed (1)
  1. AgentF5TTSChunk.py +145 -190
AgentF5TTSChunk.py CHANGED
@@ -1,190 +1,145 @@
- import os
- import re
- import time
- import logging
- import subprocess
- from f5_tts.api import F5TTS
-
-
-
- logging.basicConfig(level=logging.INFO)
-
-
- class AgentF5TTS:
-     def __init__(self, ckpt_file, vocoder_name="vocos", delay=0, device="mps"):
-         """
-         Initialize the F5-TTS Agent.
-
-         :param ckpt_file: Path to the safetensors model checkpoint.
-         :param vocoder_name: Name of the vocoder to use ("vocos" or "bigvgan").
-         :param delay: Delay in seconds between audio generations.
-         :param device: Device to use ("cpu", "cuda", "mps").
-         """
-         self.model = F5TTS(ckpt_file=ckpt_file, vocoder_name=vocoder_name, device=device)
-         self.delay = delay # Delay in seconds
-
-     def generate_emotion_speech(self, text_file, output_audio_file, speaker_emotion_refs, convert_to_mp3=False):
-         """
-         Generate speech using the F5-TTS model.
-
-         :param text_file: Path to the input text file.
-         :param output_audio_file: Path to save the combined audio output.
-         :param speaker_emotion_refs: Dictionary mapping (speaker, emotion) tuples to reference audio paths.
-         :param convert_to_mp3: Boolean flag to convert the output to MP3.
-         """
-         try:
-             with open(text_file, "r", encoding="utf-8") as file:
-                 lines = [line.strip() for line in file if line.strip()]
-         except FileNotFoundError:
-             logging.error(f"Text file not found: {text_file}")
-             return
-
-         if not lines:
-             logging.error("Input text file is empty.")
-             return
-
-         temp_files = []
-         os.makedirs(os.path.dirname(output_audio_file), exist_ok=True)
-
-         for i, line in enumerate(lines):
-
-             speaker, emotion = self._determine_speaker_emotion(line)
-             ref_audio = speaker_emotion_refs.get((speaker, emotion))
-             line = re.sub(r'\[speaker:.*?\]\s*', '', line)
-             if not ref_audio or not os.path.exists(ref_audio):
-                 logging.error(f"Reference audio not found for speaker '{speaker}', emotion '{emotion}'.")
-                 continue
-
-             ref_text = "" # Placeholder or load corresponding text
-             temp_file = f"{output_audio_file}_line{i + 1}.wav"
-
-             try:
-                 logging.info(f"Generating speech for line {i + 1}: '{line}' with speaker '{speaker}', emotion '{emotion}'")
-                 self.model.infer(
-                     ref_file=ref_audio,
-                     ref_text=ref_text,
-                     gen_text=line,
-                     file_wave=temp_file,
-                     remove_silence=True,
-                 )
-                 temp_files.append(temp_file)
-                 time.sleep(self.delay)
-             except Exception as e:
-                 logging.error(f"Error generating speech for line {i + 1}: {e}")
-
-         self._combine_audio_files(temp_files, output_audio_file, convert_to_mp3)
-
-
-
-     def generate_speech(self, text_file, output_audio_file, ref_audio, convert_to_mp3=False):
-         try:
-             with open(text_file, 'r', encoding='utf-8') as file:
-                 lines = [line.strip() for line in file if line.strip()]
-         except FileNotFoundError:
-             logging.error(f"Text file not found: {text_file}")
-             return
-
-         if not lines:
-             logging.error("Input text file is empty.")
-             return
-
-         temp_files = []
-         os.makedirs(os.path.dirname(output_audio_file), exist_ok=True)
-
-         for i, line in enumerate(lines):
-
-             if not ref_audio or not os.path.exists(ref_audio):
-                 logging.error(f"Reference audio not found for speaker.")
-                 continue
-             temp_file = f"{output_audio_file}_line{i + 1}.wav"
-
-             try:
-                 logging.info(f"Generating speech for line {i + 1}: '{line}'")
-                 self.model.infer(
-                     ref_file=ref_audio, # No reference audio
-                     ref_text="", # No reference text
-                     gen_text=line,
-                     file_wave=temp_file,
-                 )
-                 temp_files.append(temp_file)
-             except Exception as e:
-                 logging.error(f"Error generating speech for line {i + 1}: {e}")
-
-         # Combine temp_files into output_audio_file if needed
-         self._combine_audio_files(temp_files, output_audio_file, convert_to_mp3)
-
-
-
-
-     def _determine_speaker_emotion(self, text):
-         """
-         Extract speaker and emotion from the text using regex.
-         Default to "speaker1" and "neutral" if not specified.
-         """
-         speaker, emotion = "speaker1", "neutral" # Default values
-
-         # Use regex to find [speaker:speaker_name, emotion:emotion_name]
-         match = re.search(r"\[speaker:(.*?), emotion:(.*?)\]", text)
-         if match:
-             speaker = match.group(1).strip()
-             emotion = match.group(2).strip()
-
-         logging.info(f"Determined speaker: '{speaker}', emotion: '{emotion}'")
-         return speaker, emotion
-
-     def _combine_audio_files(self, temp_files, output_audio_file, convert_to_mp3):
-         """Combine multiple audio files into a single file using FFmpeg."""
-         if not temp_files:
-             logging.error("No audio files to combine.")
-             return
-
-         list_file = "file_list.txt"
-         with open(list_file, "w") as f:
-             for temp in temp_files:
-                 f.write(f"file '{temp}'\n")
-
-         try:
-             subprocess.run(["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", list_file, "-c", "copy", output_audio_file], check=True)
-             if convert_to_mp3:
-                 mp3_output = output_audio_file.replace(".wav", ".mp3")
-                 subprocess.run(["ffmpeg", "-y", "-i", output_audio_file, "-codec:a", "libmp3lame", "-qscale:a", "2", mp3_output], check=True)
-                 logging.info(f"Converted to MP3: {mp3_output}")
-             for temp in temp_files:
-                 os.remove(temp)
-             os.remove(list_file)
-         except Exception as e:
-             logging.error(f"Error combining audio files: {e}")
-
-
- # Example usage, remove from this line on to import into other agents.
- # make sure to adjust the paths to yourr files.
- if __name__ == "__main__":
-
-     env = os.environ.copy()
-     env["PYTHONUNBUFFERED"] = "1"
-
-     model_path = "./F5-TTS/ckpts/pt-br/model_last.safetensors"
-     speaker_emotion_refs = {
-         ("speaker1", "happy"): "ref_audios/speaker1_happy.wav",
-         ("speaker1", "sad"): "ref_audios/speaker1_sad.wav",
-         ("speaker1", "angry"): "ref_audios/speaker1_angry.wav",
-     }
-     agent = AgentF5TTS(ckpt_file=model_path, vocoder_name="vocos", delay=6)
-
-     agent.generate_emotion_speech(
-         text_file="input_text.txt",
-         output_audio_file="output/final_output_emo.wav",
-         speaker_emotion_refs=speaker_emotion_refs,
-         convert_to_mp3=True,
-     )
-
-     agent.generate_speech(
-         text_file="input_text2.txt",
-         output_audio_file="output/final_output.wav",
-         ref_audio="ref_audios/refaudio.mp3",
-         convert_to_mp3=True,
-     )
-
-
-
-
 
+ import os
+ import re
+ import time
+ import logging
+ import json
+ import subprocess
+ import gradio as gr
+ from f5_tts.api import F5TTS
+
+ # Constants
+ CONFIG_FILE = "last_inputs.json"
+
+ # Initialize logging
+ logging.basicConfig(level=logging.INFO)
+
+ class AgentF5TTS:
+     def __init__(self, ckpt_file, vocoder_name="vocos", delay=0, device="cuda"):
+         self.model = F5TTS(ckpt_file=ckpt_file, vocoder_name=vocoder_name, device=device)
+         self.delay = delay
+
+     def generate_emotion_speech(self, text, output_audio_folder, speaker_emotion_refs, convert_to_mp3=False):
+         lines = [line.strip() for line in text.split("\n") if line.strip()]
+
+         if not lines:
+             logging.error("Input text is empty.")
+             return
+
+         if not output_audio_folder:
+             logging.error("Output audio folder is not specified.")
+             return None
+         if not os.path.exists(output_audio_folder):
+             os.makedirs(output_audio_folder, exist_ok=True)
+         output_audio_file = os.path.join(output_audio_folder, "generated_audio.wav")
+         temp_files = []
+
+         for i, line in enumerate(lines):
+             speaker, emotion = self._determine_speaker_emotion(line)
+             ref_audio = speaker_emotion_refs.get((speaker, emotion))
+             line = re.sub(r'\[speaker:.*?\]\s*', '', line)
+             if not ref_audio or not os.path.exists(ref_audio):
+                 logging.error(f"Reference audio not found for speaker '{speaker}', emotion '{emotion}'.")
+                 continue
+
+             temp_file = os.path.join(output_audio_folder, f"line_{i + 1}.wav")
+
+             try:
+                 logging.info(f"Generating speech for line {i + 1}: '{line}' with speaker '{speaker}', emotion '{emotion}'")
+                 self.model.infer(
+                     ref_file=ref_audio,
+                     ref_text="", # Placeholder or load corresponding text
+                     gen_text=line,
+                     file_wave=temp_file,
+                     remove_silence=True
+                 )
+                 temp_files.append(temp_file)
+                 time.sleep(self.delay)
+             except Exception as e:
+                 logging.error(f"Error generating speech for line {i + 1}: {e}")
+
+         self.combine_audio_files(temp_files, output_audio_file, convert_to_mp3)
+         return output_audio_file
+
+     def _determine_speaker_emotion(self, text):
+         speaker, emotion = "speaker1", "neutral"
+         match = re.search(r"\[speaker:(.*?), emotion:(.*?)\]", text)
+         if match:
+             speaker = match.group(1).strip()
+             emotion = match.group(2).strip()
+         return speaker, emotion
+
+     def combine_audio_files(self, temp_files, output_audio_file, convert_to_mp3):
+         if not temp_files:
+             logging.error("No audio files to combine.")
+             return
+
+         list_file = os.path.join(os.path.dirname(output_audio_file), "file_list.txt")
+         with open(list_file, "w") as f:
+             for temp in temp_files:
+                 f.write(f"file '{temp}'\n")
+
+         try:
+             subprocess.run(["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", list_file, "-c", "copy", output_audio_file], check=True)
+             if convert_to_mp3:
+                 mp3_output = output_audio_file.replace(".wav", ".mp3")
+                 subprocess.run(["ffmpeg", "-y", "-i", output_audio_file, "-codec:a", "libmp3lame", "-qscale:a", "2", mp3_output], check=True)
+                 logging.info(f"Converted to MP3: {mp3_output}")
+             for temp in temp_files:
+                 os.remove(temp)
+             os.remove(list_file)
+         except Exception as e:
+             logging.error(f"Error combining audio files: {e}")
+
+ # Load last inputs from JSON file
+ def load_last_inputs():
+     if os.path.exists(CONFIG_FILE):
+         with open(CONFIG_FILE, "r") as f:
+             return json.load(f)
+     return {}
+
+ # Save inputs to JSON file
+ def save_last_inputs(inputs):
+     with open(CONFIG_FILE, "w") as f:
+         json.dump(inputs, f, indent=4)
+
+ # Gradio Interface
+ def gradio_interface(model_path, vocoder_name, delay, device, text, output_audio_folder, ref_audio, convert_to_mp3, speaker1_happy, speaker1_sad, speaker1_angry, speaker1_neutral):
+     if not os.path.exists(model_path.name):
+         logging.error(f"Model path does not exist: {model_path.name}")
+         return None
+     agent = AgentF5TTS(ckpt_file=model_path.name, vocoder_name=vocoder_name, delay=delay, device=device)
+     speaker_emotion_refs = {
+         ("speaker1", "happy"): speaker1_happy.name,
+         ("speaker1", "sad"): speaker1_sad.name,
+         ("speaker1", "angry"): speaker1_angry.name,
+         ("speaker1", "neutral"): speaker1_neutral.name,
+     }
+     if not os.path.exists(output_audio_folder):
+         logging.error(f"Output audio folder does not exist: {output_audio_folder}")
+         return None
+     output_file = agent.generate_emotion_speech(text, output_audio_folder, speaker_emotion_refs, convert_to_mp3)
+     return output_file if os.path.exists(output_file) else None
+
+ # Launch Gradio App
+ iface = gr.Interface(
+     fn=gradio_interface,
+     inputs=[
+         gr.File(label="Model Path"),
+         gr.Dropdown(label="Vocoder", choices=["vocos", "bigvgan"]),
+         gr.Number(label="Delay (seconds)"),
+         gr.Dropdown(label="Device", choices=["cpu", "cuda"]),
+         gr.Textbox(label="Input Text"),
+         gr.Textbox(label="Output Audio Folder"),
+         gr.File(label="Reference Audio File"),
+         gr.Checkbox(label="Convert to MP3"),
+         gr.File(label="Speaker1 Happy Reference Audio"),
+         gr.File(label="Speaker1 Sad Reference Audio"),
+         gr.File(label="Speaker1 Angry Reference Audio"),
+         gr.File(label="Speaker1 Neutral Reference Audio")
+     ],
+     outputs=gr.Audio(label="Generated Audio"),
+     title="F5-TTS Text-to-Speech Generator",
+     description="Generate speech from text using the F5-TTS model."
+ )
+
+ iface.launch()