"""Gradio app that turns input text into a downloadable SFX .wav via a Keras model."""

import gradio as gr
import nltk
import numpy as np
import tensorflow as tf
from nltk.corpus import cmudict
from scipy.io.wavfile import write

# Download required NLTK data (no-ops if already present).
nltk.download('averaged_perceptron_tagger')
nltk.download('cmudict')

# Load the trained audio model once at startup.
model = tf.keras.models.load_model('audio_model.h5')

# Cache for the CMU pronouncing dictionary — building it is expensive,
# so do it once instead of on every request.
_CMU_DICT = None


def _get_cmudict():
    """Lazily load and cache the CMU pronouncing dictionary."""
    global _CMU_DICT
    if _CMU_DICT is None:
        _CMU_DICT = cmudict.dict()
    return _CMU_DICT


def preprocess_text(text):
    """
    Process the input text to prepare it for the model.

    Each word is mapped to its first CMU phoneme sequence, with 'UNKNOWN'
    substituted for out-of-vocabulary words. Feature extraction is a
    placeholder: random (sequence_length, 13) vectors stand in for real
    acoustic features — TODO replace with actual feature extraction.

    Args:
        text: Input word or sentence.

    Returns:
        Float array of shape (1, sequence_length, 13) — batch dimension added.
    """
    d = _get_cmudict()
    words = text.lower().split()
    phonemes = [d[word][0] if word in d else ['UNKNOWN'] for word in words]
    # Flatten the per-word phoneme lists into one sequence.
    flattened_phonemes = [p for sublist in phonemes for p in sublist]

    # Guard: empty/whitespace-only input would otherwise produce a
    # zero-length sequence, which the model cannot predict on.
    sequence_length = max(len(flattened_phonemes), 1)
    num_features = 13
    input_data = np.random.rand(sequence_length, num_features)  # Placeholder features

    # Add batch dimension expected by Keras.
    return np.expand_dims(input_data, axis=0)


def convert_to_audio(model_output, sample_rate=22050):
    """
    Normalize model output into the [-1, 1] range used for audio samples.

    Args:
        model_output: 1-D array-like of raw model predictions.
        sample_rate: Unused here; kept for interface compatibility.

    Returns:
        Float array scaled to [-1, 1]; constant input yields silence.

    Raises:
        ValueError: If the model output is None or empty.
    """
    if model_output is None or len(model_output) == 0:
        raise ValueError("Model output is empty.")

    model_output = np.asarray(model_output, dtype=np.float64)
    lo, hi = model_output.min(), model_output.max()
    if lo == hi:
        # np.interp with identical endpoints is ill-defined (zero range);
        # a constant signal carries no audio content, so emit silence.
        return np.zeros_like(model_output)
    return np.interp(model_output, (lo, hi), (-1.0, 1.0))


def generate_sfx(text, duration=30):
    """
    Generate a .wav file of approximately `duration` seconds from `text`.

    Repeatedly runs the model and concatenates normalized segments until
    enough samples are accumulated, then trims to the exact length and
    writes 'output.wav'.

    Args:
        text: Input word or sentence.
        duration: Target length in seconds (Gradio may pass a float).

    Returns:
        Path of the written .wav file.

    Raises:
        ValueError: If the model ever yields an empty segment (would
            otherwise loop forever).
    """
    sample_rate = 22050
    input_data = preprocess_text(text)

    # Slider values can be floats; slicing needs an integer sample count.
    total_samples = int(duration * sample_rate)

    audio_segments = []
    generated_samples = 0
    while generated_samples < total_samples:
        prediction = model.predict(input_data)
        # Flatten 2-D (batch, samples) predictions to a 1-D signal.
        if prediction.ndim == 2 and prediction.shape[1] > 1:
            prediction = prediction.flatten()

        audio_segment = convert_to_audio(prediction)
        if len(audio_segment) == 0:
            raise ValueError("Model produced an empty audio segment.")

        audio_segments.append(audio_segment)
        generated_samples += len(audio_segment)

    # Trim the concatenated signal to the exact requested length.
    final_audio = np.concatenate(audio_segments)[:total_samples]

    # scipy writes float64 as a 64-bit WAV, which most players reject;
    # float32 in [-1, 1] is the widely supported floating-point format.
    output_filename = "output.wav"
    write(output_filename, sample_rate, final_audio.astype(np.float32))
    return output_filename


# Define the Gradio interface
interface = gr.Interface(
    fn=generate_sfx,
    inputs=[
        gr.Textbox(label="Enter a Word", placeholder="Write a Word To Convert it into SFX Sound"),
        gr.Slider(label="Duration (seconds)", minimum=1, maximum=60, value=30),  # Added duration slider
    ],
    outputs=gr.Audio(label="Generated SFX", type="filepath"),
    live=False,
    title="SFX Generator from Text",
    description="Enter a word or sentence, and the model will generate an SFX sound for the specified duration.",
)

# Run the interface
if __name__ == "__main__":
    interface.launch()