File size: 3,089 Bytes
6c20eaa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import numpy as np
import soundfile as sf
import gradio as gr
from g2p_en import G2p
from tensorflow.keras.models import load_model
import os

# Load the pre-trained Keras model once, at import time. The path is relative,
# so it is resolved against the process's current working directory.
model_path = './model.h5'
model = load_model(model_path)

# Create the g2p_en converter that text_to_phonemes() calls to turn input text
# into a phoneme sequence. It is built once here and reused for every call.
g2p = G2p()

def text_to_phonemes(text):
    """Return the phoneme sequence the module-level ``g2p`` converter yields for ``text``."""
    return g2p(text)

def generate_audio_features(phonemes, duration, sample_rate=22050):
    """Run the model on an encoded phoneme sequence and fit the output to a duration.

    Args:
        phonemes: Iterable of phoneme strings, as returned by
            ``text_to_phonemes``.
        duration: Requested clip length in seconds.
        sample_rate: Samples per second used to convert ``duration`` into a
            sample count. Defaults to 22050.

    Returns:
        A 1-D NumPy array of ``int(duration * sample_rate)`` values, obtained
        by flattening the model prediction and truncating or repeating it
        (``np.resize``) to that length.

    Raises:
        ValueError: If ``phonemes`` contains no characters to encode.
    """
    # Phoneme symbols are usually several characters long (e.g. "HH", "AH0"),
    # and ord() accepts only a single character, so each symbol is encoded
    # character by character. Single-character symbols keep the same code
    # they had with a plain ord() call.
    # NOTE(review): this numeric encoding is a placeholder; it must match
    # whatever the model in model.h5 was trained on -- confirm.
    codes = [ord(ch) for symbol in phonemes for ch in symbol]
    if not codes:
        raise ValueError("No phonemes to synthesise; the input text is empty.")

    # Add a leading batch axis so the input has shape (1, sequence_length).
    batch = np.expand_dims(np.array(codes), axis=0)
    predicted = model.predict(batch)

    # np.resize flattens the prediction and cuts or tiles it to the target size.
    num_samples = int(duration * sample_rate)
    return np.resize(predicted, (num_samples,))

def features_to_audio(features):
    """Rescale generated features linearly into the audio range [-1, 1].

    The smallest feature value maps to -1 and the largest to 1.

    Args:
        features: Array-like of numeric feature values.

    Returns:
        A float NumPy array with the same shape as ``features``. Empty input
        is returned as an empty array, and input whose values are all equal
        is returned as silence (all zeros).
    """
    features = np.asarray(features, dtype=float)

    # min()/max() raise on an empty array; there is nothing to rescale anyway.
    if features.size == 0:
        return features

    lowest, highest = features.min(), features.max()

    # With a zero-width input range np.interp is given non-increasing
    # x-coordinates and its result is undefined, so a constant signal is
    # mapped to silence explicitly.
    if highest == lowest:
        return np.zeros_like(features)

    return np.interp(features, (lowest, highest), (-1, 1))

def generate_audio(text, duration):
    """Synthesise ``text`` into a WAV file and return the file's path.

    The text is converted to phonemes, the phonemes to model features of the
    requested ``duration``, and the features to a normalised waveform, which
    is written to ``generated_audio.wav`` at 22050 samples per second.
    """
    output_path = 'generated_audio.wav'
    rate_hz = 22050

    # Text -> phonemes -> model features -> waveform.
    waveform = features_to_audio(
        generate_audio_features(text_to_phonemes(text), duration)
    )

    sf.write(output_path, waveform, rate_hz)
    return output_path

# Gradio interface
def text_to_audio_interface(text, duration):
    """Gradio callback: synthesise ``text`` for ``duration`` and return the audio file path."""
    return generate_audio(text, duration)

# Build the input widgets and the audio output, wire them to the callback,
# and start the Gradio server.
# NOTE(review): gr.inputs / gr.outputs are the legacy Gradio namespaces; newer
# releases expose gr.Textbox / gr.Slider / gr.Audio directly -- confirm
# against the Gradio version this app is pinned to.
_word_box = gr.inputs.Textbox(
    label="Enter a Word",
    placeholder="Write a Word To Convert it into Sfx Sound",
)
_duration_slider = gr.inputs.Slider(
    minimum=1,
    maximum=20,
    default=5,
    step=1,
    label="Audio Duration (seconds)",
)
_audio_preview = gr.outputs.Audio(label="Generated Audio Preview")

demo = gr.Interface(
    fn=text_to_audio_interface,
    inputs=[_word_box, _duration_slider],
    outputs=_audio_preview,
    title="Text-to-Audio Generator",
    description="Write a Word, set the duration, and press 'Generate' to convert the word into an audio effect!",
    live=True,
)
demo.launch()