File size: 3,625 Bytes
6c20eaa
38b530f
 
 
 
3ac0359
6c20eaa
38b530f
 
 
6c20eaa
38b530f
 
6c20eaa
38b530f
 
 
 
 
 
 
 
6c20eaa
38b530f
 
 
 
 
6c20eaa
38b530f
 
6c20eaa
809a47f
26107f3
 
809a47f
26107f3
 
809a47f
26107f3
 
6c20eaa
3ac0359
c3f5e81
3ac0359
 
 
809a47f
 
 
 
 
3ac0359
c3f5e81
 
 
3ac0359
c3f5e81
 
38b530f
 
c3f5e81
38b530f
 
6c20eaa
c3f5e81
 
 
 
6c20eaa
c3f5e81
 
 
809a47f
c3f5e81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38b530f
 
 
 
c3f5e81
 
 
 
809a47f
38b530f
 
c3f5e81
38b530f
6c20eaa
38b530f
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import gradio as gr
import tensorflow as tf
import numpy as np
import nltk
from nltk.corpus import cmudict
from scipy.io.wavfile import write

# Download required NLTK data
# NOTE(review): only cmudict is actually used below (in preprocess_text);
# the POS-tagger download looks unused here — confirm it is still needed.
nltk.download('averaged_perceptron_tagger')
nltk.download('cmudict')

# Load your model
# Loaded once at import time; 'audio_model.h5' must be present in the
# working directory or this script fails on startup.
model = tf.keras.models.load_model('audio_model.h5')

# Preprocess input text
def preprocess_text(text):
    """
    Convert input text into a model-ready feature array.

    Args:
        text: Input word or sentence.

    Returns:
        np.ndarray of shape (1, num_phonemes, 13): a single-item batch of
        per-phoneme feature vectors.  NOTE: the feature values are random
        placeholders; only the sequence length depends on the text.
    """
    # cmudict.dict() parses the entire CMU pronouncing dictionary and is
    # expensive -- build it once and cache it on the function object.
    d = getattr(preprocess_text, "_cmudict", None)
    if d is None:
        d = cmudict.dict()
        preprocess_text._cmudict = d

    words = text.lower().split()
    phonemes = []
    for word in words:
        if word in d:
            phonemes.append(d[word][0])  # first pronunciation variant
        else:
            phonemes.append(['UNKNOWN'])  # out-of-vocabulary marker

    # Flatten the list of phoneme lists into one sequence
    flattened_phonemes = [p for sublist in phonemes for p in sublist]

    # Empty input would produce a zero-length sequence, which the model
    # cannot predict on -- fall back to a single unknown token.
    if not flattened_phonemes:
        flattened_phonemes = ['UNKNOWN']

    # Create dummy feature vectors (this should be replaced with actual feature extraction)
    num_features = 13
    sequence_length = len(flattened_phonemes)
    input_data = np.random.rand(sequence_length, num_features)  # Placeholder

    # Add batch dimension -> (1, sequence_length, num_features)
    input_data = np.expand_dims(input_data, axis=0)

    return input_data

# Convert model output to audio samples
def convert_to_audio(model_output, sample_rate=22050):
    """
    Normalize raw model output into audio samples in [-1.0, 1.0].

    Args:
        model_output: Array-like of raw sample values.
        sample_rate: Unused here; kept for interface compatibility.

    Returns:
        np.ndarray (float32) of samples rescaled to [-1, 1]; a constant
        input maps to silence (all zeros).

    Raises:
        ValueError: If model_output is None or empty.
    """
    # Check if model_output is empty
    if model_output is None or len(model_output) == 0:
        raise ValueError("Model output is empty.")

    model_output = np.asarray(model_output, dtype=np.float64)
    lo, hi = model_output.min(), model_output.max()

    # np.interp requires an increasing xp pair; a constant signal makes
    # (lo, hi) degenerate, so map it to silence instead of interpolating.
    if lo == hi:
        return np.zeros(model_output.shape, dtype=np.float32)

    # Rescale to the [-1, 1] range expected for floating-point audio.
    normalized_output = np.interp(model_output, (lo, hi), (-1.0, 1.0))

    # float32 is the conventional dtype for floating-point WAV data and is
    # far more widely supported by players than 64-bit float.
    return normalized_output.astype(np.float32)

# Generate sound effect with specified duration
def generate_sfx(text, duration=30, sample_rate=22050):
    """
    Take input text, run it through the model, and write a WAV file of the
    requested duration.

    Args:
        text: Word or sentence to convert.
        duration: Desired clip length in seconds.  May be a float -- the
            Gradio slider does not guarantee an integer.
        sample_rate: Output sample rate in Hz (default 22050).

    Returns:
        Path of the written WAV file ("output.wav").

    Raises:
        ValueError: If the model produces no samples (via convert_to_audio).
    """
    input_data = preprocess_text(text)

    # Sliders can deliver floats; a float here would break the slice below
    # with a TypeError, so coerce the sample count to an int up front.
    total_samples = int(duration * sample_rate)

    # Accumulate generated segments until we have enough samples
    audio_segments = []
    generated_samples = 0
    while generated_samples < total_samples:
        prediction = model.predict(input_data)

        # Flatten any multi-dimensional output (batch/time/feature axes)
        # into a 1-D sample stream; the original ndim==2 check missed
        # higher-rank outputs.
        prediction = np.ravel(prediction)

        # Normalize the prediction into [-1, 1] float32 audio samples.
        # Raises ValueError on an empty prediction, which also prevents
        # this loop from spinning forever without progress.
        audio_segment = convert_to_audio(prediction, sample_rate)

        audio_segments.append(audio_segment)
        generated_samples += len(audio_segment)

    # Concatenate and trim to exactly the requested length
    final_audio = np.concatenate(audio_segments)[:total_samples]

    # Write as float32 PCM so common players handle the WAV file.
    output_filename = "output.wav"
    write(output_filename, sample_rate, final_audio.astype(np.float32))

    return output_filename

# Build the Gradio UI: a text box plus a duration slider feeding
# generate_sfx, with the resulting WAV exposed as a playable file.
text_input = gr.Textbox(
    label="Enter a Word",
    placeholder="Write a Word To Convert it into SFX Sound",
)
duration_slider = gr.Slider(
    label="Duration (seconds)",
    minimum=1,
    maximum=60,
    value=30,
)

interface = gr.Interface(
    fn=generate_sfx,
    inputs=[text_input, duration_slider],
    outputs=gr.Audio(label="Generated SFX", type="filepath"),
    live=False,
    title="SFX Generator from Text",
    description="Enter a word or sentence, and the model will generate an SFX sound for the specified duration.",
)

# Launch the app only when executed as a script, not on import.
if __name__ == "__main__":
    interface.launch()