Spaces:

sablab
/

F5

Sleeping

App Files Files Community

sablab committed on Jul 16, 2025

Commit

950142f

verified ·

1 Parent(s): 5cefbe6

Create app.py

Browse files

Files changed (1) hide show

app.py +60 -0

app.py ADDED Viewed

	@@ -0,0 +1,60 @@

+import gradio as gr
+import torch
+from transformers import VitsModel, VitsTokenizer
+# --- 1. Load Model and Tokenizer ---
+# Load the pretrained model and tokenizer from Hugging Face.
+# This is done once when the app starts, not for every prediction.
+print("Loading F5-TTS model and tokenizer...")
+model = VitsModel.from_pretrained("SWivid/F5-TTS")
+tokenizer = VitsTokenizer.from_pretrained("SWivid/F5-TTS")
+print("Model and tokenizer loaded successfully.")
+# --- 2. Define the Speech Synthesis Function ---
+def synthesize_speech(text):
+    """
+    Converts text to speech using the F5-TTS model.
+    """
+    # Tokenize the input text. The `return_tensors="pt"` part formats it for PyTorch.
+    inputs = tokenizer(text, return_tensors="pt")
+    # Generate the audio waveform.
+    # We use torch.no_grad() to speed up inference as we aren't training the model.
+    with torch.no_grad():
+        waveform = model(**inputs).waveform
+    # The output is a PyTorch tensor. Convert it to a NumPy array.
+    # .squeeze() removes any extra single dimensions.
+    waveform_numpy = waveform.cpu().numpy().squeeze()
+    # Get the sampling rate from the model's configuration.
+    sampling_rate = model.config.sampling_rate
+    # Return the sampling rate and waveform as a tuple for the Gradio Audio component.
+    return (sampling_rate, waveform_numpy)
+# --- 3. Build the Gradio Interface ---
+demo = gr.Interface(
+    fn=synthesize_speech,
+    inputs=gr.Textbox(
+        label="Text to Synthesize",
+        info="Enter the text you want to convert to speech.",
+        value="Hello, this is a demonstration of the F5 text to speech model."
+    ),
+    outputs=gr.Audio(
+        label="Synthesized Audio",
+        type="numpy"  # The function returns a NumPy array
+    ),
+    title="🗣️ F5-TTS Text-to-Speech",
+    description="A simple Gradio app to run the `SWivid/F5-TTS` model for text-to-speech conversion. Built by Gemini.",
+    examples=[
+        ["The quick brown fox jumps over the lazy dog."],
+        ["To be, or not to be, that is the question."],
+        ["Artificial intelligence will shape our future in profound ways."]
+    ],
+    cache_examples=True # Cache results for faster demo
+)
+# --- 4. Launch the App ---
+if __name__ == "__main__":
+    demo.launch()