Spaces: Runtime error
| import gradio as gr | |
| import tensorflow as tf | |
| import numpy as np | |
| import nltk | |
| from nltk.corpus import cmudict | |
| from scipy.io.wavfile import write | |
# Download the NLTK data needed at runtime: the POS tagger model and the
# CMU pronouncing dictionary used by preprocess_text().
nltk.download('averaged_perceptron_tagger')
nltk.download('cmudict')
# Load the pre-trained Keras audio model from disk.
# NOTE(review): 'audio_model.h5' must exist next to this script — the Space's
# "Runtime error" banner suggests this load (or a download above) is failing.
model = tf.keras.models.load_model('audio_model.h5')
# Cache for the CMU pronouncing dictionary: building it is expensive
# (~130k entries), so load it once on first use instead of on every call.
_CMUDICT = None


def preprocess_text(text):
    """Convert input text into a (1, seq_len, 13) feature array for the model.

    Each word is mapped to its first CMU-dictionary pronunciation (or the
    placeholder ['UNKNOWN'] when the word is not in the dictionary), and one
    feature vector is produced per phoneme.

    NOTE(review): the feature vectors are random placeholders — replace with
    real phoneme/acoustic feature extraction before relying on the output.

    Args:
        text: Input string; lower-cased and split on whitespace.

    Returns:
        np.ndarray of shape (1, num_phonemes, 13).

    Raises:
        ValueError: If *text* contains no words (the model cannot consume a
            zero-length sequence).
    """
    global _CMUDICT
    if _CMUDICT is None:
        _CMUDICT = cmudict.dict()  # built once, reused across calls

    words = text.lower().split()
    if not words:
        raise ValueError("Input text is empty.")

    phonemes = []
    for word in words:
        if word in _CMUDICT:
            phonemes.append(_CMUDICT[word][0])  # first pronunciation variant
        else:
            phonemes.append(['UNKNOWN'])

    # Flatten the per-word phoneme lists into one sequence.
    flattened_phonemes = [p for sublist in phonemes for p in sublist]

    # Dummy feature vectors (this should be replaced with actual feature
    # extraction): one 13-dim vector per phoneme.
    num_features = 13
    sequence_length = len(flattened_phonemes)
    input_data = np.random.rand(sequence_length, num_features)  # Placeholder

    # Add batch dimension -> (1, seq_len, num_features).
    return np.expand_dims(input_data, axis=0)
def convert_to_audio(model_output, sample_rate=22050):
    """Rescale raw model output to audio samples in [-1.0, 1.0].

    Args:
        model_output: Array-like of raw model values (1-D stream of samples).
        sample_rate: Kept for interface compatibility; unused here — the
            caller supplies the rate when writing the .wav file.

    Returns:
        np.ndarray with the same shape as *model_output*, rescaled to
        [-1, 1]. A constant signal maps to silence (all zeros).

    Raises:
        ValueError: If *model_output* is None or empty.
    """
    if model_output is None or len(model_output) == 0:
        raise ValueError("Model output is empty.")

    model_output = np.asarray(model_output, dtype=np.float64)
    lo, hi = model_output.min(), model_output.max()
    # np.interp requires an increasing xp; a constant signal would hand it
    # (lo, lo) and produce undefined results, so return silence instead.
    if hi == lo:
        return np.zeros_like(model_output)
    return np.interp(model_output, (lo, hi), (-1.0, 1.0))
def generate_sfx(text, duration=30):
    """Generate a .wav sound effect of roughly *duration* seconds from *text*.

    The text is preprocessed into model features, the model is sampled
    repeatedly until enough audio has accumulated, then the stream is
    trimmed to the requested length and written to "output.wav".

    Args:
        text: Input word or sentence to convert.
        duration: Target length in seconds. May arrive as a float from the
            Gradio slider.

    Returns:
        Path of the written .wav file ("output.wav").

    Raises:
        ValueError: If the text is empty or the model yields no samples
            (propagated from preprocess_text / convert_to_audio).
    """
    sample_rate = 22050  # single source of truth for the output rate
    input_data = preprocess_text(text)

    # int(): the Gradio slider can deliver a float, and a float sample count
    # would break the final slice below with a TypeError.
    total_samples = int(duration * sample_rate)

    audio_segments = []
    generated_samples = 0
    while generated_samples < total_samples:
        prediction = model.predict(input_data)
        # Flatten a (batch, n) prediction into a 1-D sample stream.
        if prediction.ndim == 2 and prediction.shape[1] > 1:
            prediction = prediction.flatten()
        # convert_to_audio raises ValueError on an empty prediction, which
        # also guards this loop against never advancing.
        audio_segment = convert_to_audio(prediction)
        audio_segments.append(audio_segment)
        generated_samples += len(audio_segment)

    # Trim the concatenated stream to the exact requested length.
    final_audio = np.concatenate(audio_segments)[:total_samples]

    # Write as float32: 64-bit float WAVs are poorly supported by players.
    output_filename = "output.wav"
    write(output_filename, sample_rate, final_audio.astype(np.float32))
    return output_filename
# Define the Gradio interface: one textbox + duration slider in,
# a downloadable audio file path out.
interface = gr.Interface(
    fn=generate_sfx,
    inputs=[
        gr.Textbox(label="Enter a Word", placeholder="Write a Word To Convert it into SFX Sound"),
        gr.Slider(label="Duration (seconds)", minimum=1, maximum=60, value=30)  # Added duration slider
    ],
    # type="filepath" matches generate_sfx returning a .wav path.
    outputs=gr.Audio(label="Generated SFX", type="filepath"),
    live=False,
    title="SFX Generator from Text",
    description="Enter a word or sentence, and the model will generate an SFX sound for the specified duration.",
)
# Run the interface only when executed as a script (not on import).
if __name__ == "__main__":
    interface.launch()