Spaces: Build error
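A Space only installs what requirements.txt lists, so a build failure with this code usually means a missing or incomplete requirements file rather than a bug in the Python itself. Assuming that is the problem here, a minimal requirements.txt covering every import in the script would be:

```
gradio
librosa
numpy
tensorflow
scikit-learn
```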
The full app script:

```python
import gradio as gr
import librosa
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

def extract_features(file_path):
    try:
        # Load the audio, keeping the original sampling rate
        audio, sr = librosa.load(file_path, sr=None)
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
        return mfccs.T  # transpose to (time_steps, n_mfcc)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

def create_model(vocab_size):
    model = Sequential()
    # Embedding maps each integer frame token to a dense 16-dim vector.
    # (input_length is omitted: it is optional in Keras 2 and removed in Keras 3.)
    model.add(Embedding(input_dim=vocab_size, output_dim=16))
    model.add(LSTM(64))  # single LSTM layer with 64 units
    model.add(Dense(vocab_size, activation='softmax'))  # next-token distribution
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss=CategoricalCrossentropy(),
        metrics=['accuracy'],
    )
    return model

def prepare_data(mfccs_list, seq_length=10):
    all_mfccs = np.concatenate(mfccs_list, axis=0)  # (time_steps, n_mfcc)
    # Treat each MFCC frame as one "token": join its values into a single
    # string so LabelEncoder (which requires 1-D input) can map whole
    # frames to integers.
    frame_strings = [' '.join(map(str, frame)) for frame in all_mfccs]
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(frame_strings)
    vocab_size = len(label_encoder.classes_)
    # Build (input sequence, next token) training pairs
    dataX, dataY = [], []
    for i in range(len(integer_encoded) - seq_length):
        dataX.append(integer_encoded[i:i + seq_length])
        dataY.append(integer_encoded[i + seq_length])
    dataX = np.array(dataX)  # (n_patterns, seq_length) integer tokens for the Embedding
    dataY = to_categorical(np.array(dataY), num_classes=vocab_size)
    return dataX, dataY, vocab_size, label_encoder

def train_model(model, dataX, dataY):
    model.fit(dataX, dataY, epochs=10, batch_size=64, verbose=0)

def generate_rap(model, start_seq, label_encoder, seq_length, num_frames=50):
    token_seq = list(start_seq)
    generated_frames = []
    for _ in range(num_frames):
        # Feed the most recent seq_length tokens to the model
        x_input = np.array(token_seq[-seq_length:]).reshape(1, seq_length)
        predicted_probabilities = model.predict(x_input, verbose=0)[0]
        predicted_token = int(np.argmax(predicted_probabilities))
        token_seq.append(predicted_token)
        # Decode the token back into an MFCC frame (each class is the
        # space-joined frame string created in prepare_data)
        frame = np.array(label_encoder.classes_[predicted_token].split(), dtype=float)
        generated_frames.append(frame)
    return np.array(generated_frames)  # (num_frames, n_mfcc)

# Train the model on the uploaded file and generate new frames
def train_and_generate(file_path):
    # Check the file extension
    if file_path is None or not file_path.lower().endswith(('.mp3', '.wav')):
        return "Invalid file type: please upload an MP3 or WAV file"
    # Extract features and prepare data
    features = extract_features(file_path)
    if features is None:
        return "Error extracting audio features, check the input file"
    seq_length = 10
    dataX, dataY, vocab_size, label_encoder = prepare_data([features], seq_length)
    if len(dataX) == 0:
        return "Audio is too short to build training sequences"
    # Create and train the model
    model = create_model(vocab_size)
    train_model(model, dataX, dataY)
    # Generate from a random seed sequence
    start_seq = dataX[np.random.randint(len(dataX))]
    generated = generate_rap(model, start_seq, label_encoder, seq_length)
    return np.array2string(generated, precision=2, suppress_small=True)

# Gradio interface. In Gradio 4.x the Audio keyword is `sources`
# (a list); the old `source` keyword raises a TypeError.
iface = gr.Interface(
    fn=train_and_generate,
    inputs=gr.Audio(sources=["upload"], type="filepath", label="Upload MP3 or WAV File"),
    outputs=gr.Textbox(label="Generated MFCC frames"),
    title="AI Rapper",
    description="Upload a rap song to train the model and generate a new sequence of MFCC frames",
)

if __name__ == "__main__":
    iface.launch()
```
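The Textbox output is a matrix of MFCC values rather than words, so nothing in the app is audible. If audible output is the goal, librosa can approximately invert MFCC frames back to a waveform. A minimal sketch, assuming librosa's default 22050 Hz rate (pass the real rate from librosa.load if wiring this in) and using soundfile, an extra dependency, to write the WAV:

```python
import librosa
import numpy as np
import soundfile as sf  # extra dependency; would also go in requirements.txt

def mfcc_frames_to_wav(frames, sr=22050, out_path="generated.wav"):
    # generate_rap returns (time_steps, n_mfcc); librosa expects (n_mfcc, time_steps)
    mfcc = np.asarray(frames, dtype=float).T
    # Approximate inversion: MFCC -> mel spectrogram -> Griffin-Lim waveform
    audio = librosa.feature.inverse.mfcc_to_audio(mfcc, sr=sr)
    sf.write(out_path, audio, sr)
    return out_path
```

The reconstruction is lossy (Griffin-Lim only estimates phase), so expect noisy output rather than a clean verse.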