| | import gradio as gr |
| | import spaces |
| | import librosa |
| | import soundfile as sf |
| | import wavio |
| | import os |
| | import subprocess |
| | import pickle |
| | import torch |
| | import torch.nn as nn |
| | from transformers import T5Tokenizer |
| | from transformer_model import Transformer |
| | from miditok import REMI, TokenizerConfig |
| | from pathlib import Path |
| | from huggingface_hub import hf_hub_download |
| |
|
# Hugging Face Hub repository that hosts every asset this app needs.
repo_id = "amaai-lab/text2midi"

# Fetch, in order: the weights file later given to torch.load, the pickled
# tokenizer read by generate_midi, and the soundfont handed to FluidSynth.
# hf_hub_download returns the local path of each downloaded file.
model_path, tokenizer_path, soundfont_path = (
    hf_hub_download(repo_id=repo_id, filename=asset_name)
    for asset_name in ("pytorch_model.bin", "vocab_remi.pkl", "soundfont.sf2")
)
| |
|
| |
|
def save_wav(filepath):
    """Render the MIDI file named after *filepath* to a WAV file.

    Only the directory and the stem of *filepath* are used: the input is
    ``<dir>/<stem>.mid`` and the output is ``<dir>/<stem>.wav``. Rendering is
    delegated to the ``fluidsynth`` command-line tool using the module-level
    ``soundfont_path`` and a 16000 Hz sample rate.

    Args:
        filepath: Path whose directory and stem identify the MIDI file.

    Returns:
        The path of the WAV file FluidSynth was asked to write. The exit
        status of FluidSynth is not checked, so the file is not guaranteed
        to exist.

    Raises:
        OSError: If the ``fluidsynth`` executable cannot be started
            (for example ``FileNotFoundError`` when it is not installed).
    """
    directory = os.path.dirname(filepath)
    stem = os.path.splitext(os.path.basename(filepath))[0]

    midi_filepath = os.path.join(directory, f"{stem}.mid")
    wav_filepath = os.path.join(directory, f"{stem}.wav")

    # Pass the arguments as a list instead of a shell string so that paths
    # containing spaces or shell metacharacters are handed to FluidSynth
    # verbatim and are never interpreted by a shell.
    command = [
        "fluidsynth",
        "-r", "16000",
        soundfont_path,
        "-g", "1.0",
        "--quiet",
        "--no-shell",
        midi_filepath,
        "-T", "wav",
        "-F", wav_filepath,
    ]
    # stdout is discarded, matching the former "> /dev/null" redirection.
    subprocess.run(command, stdout=subprocess.DEVNULL, check=False)

    return wav_filepath
| |
|
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| |
|
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| |
|
| |
|
def generate_midi(caption, temperature=0.9, max_len=500, output_path="output.mid"):
    """Generate a MIDI file from a text caption and write it to disk.

    The caption is tokenized with the ``google/flan-t5-base`` tokenizer, fed
    to the text2midi ``Transformer`` and the generated token ids are decoded
    with the pickled tokenizer stored at ``tokenizer_path``.

    Args:
        caption: Text description of the music to generate.
        temperature: Forwarded to ``model.generate`` as ``temperature``.
        max_len: Forwarded to ``model.generate`` as ``max_len`` after
            conversion to ``int``.
        output_path: Where the MIDI file is written. Defaults to
            ``"output.mid"``, the location used before this parameter existed.

    Returns:
        ``output_path``, the path the MIDI file was written to.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # SECURITY: pickle.load can execute arbitrary code. The file comes from
    # the Hugging Face Hub download at module level; only load it from a
    # repository you trust.
    with open(tokenizer_path, "rb") as f:
        r_tokenizer = pickle.load(f)

    vocab_size = len(r_tokenizer)
    print("Vocab size: ", vocab_size)

    # NOTE(review): the model and both tokenizers are rebuilt on every call;
    # consider loading them once if request latency matters.
    # The positional hyperparameters are defined by transformer_model.Transformer,
    # which is not visible here.
    model = Transformer(vocab_size, 768, 8, 2048, 18, 1024, False, 8, device=device)
    # NOTE(review): torch.load unpickles the checkpoint; same trust caveat as above.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")

    inputs = tokenizer(caption, return_tensors='pt', padding=True, truncation=True)
    input_ids = nn.utils.rnn.pad_sequence(inputs.input_ids, batch_first=True, padding_value=0)
    input_ids = input_ids.to(device)
    attention_mask = nn.utils.rnn.pad_sequence(inputs.attention_mask, batch_first=True, padding_value=0)
    attention_mask = attention_mask.to(device)

    # Inference only: disable autograd so no graph is recorded while sampling.
    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask,
            # UI number widgets may deliver a float; a token budget must be an int.
            max_len=int(max_len),
            temperature=temperature,
        )

    output_list = output[0].tolist()
    generated_midi = r_tokenizer.decode(output_list)
    generated_midi.dump_midi(output_path)
    return output_path
| | |
| |
|
| |
|
@spaces.GPU(duration=120)
def gradio_generate(prompt, temperature, max_length):
    """Gradio callback: generate MIDI from *prompt* and render it to audio.

    Args:
        prompt: Text caption forwarded to ``generate_midi``.
        temperature: Sampling temperature forwarded to ``generate_midi``.
        max_length: Maximum generation length forwarded to ``generate_midi``.

    Returns:
        A ``(wav_path, midi_path)`` tuple of file paths, matching the audio
        and file outputs of the interface.
    """
    # generate_midi writes its result to "output.mid" in the working directory.
    # NOTE(review): the file names below are fixed, so overlapping requests
    # would overwrite each other's files.
    generate_midi(prompt, temperature, max_length)

    midi_filename = "output.mid"
    # save_wav returns the path it rendered to; use it rather than
    # re-deriving the name from the MIDI file name.
    wav_filename = save_wav(midi_filename)

    # Re-encode the rendered audio with 2-byte samples, keeping the sample
    # rate reported by the file instead of repeating a hard-coded constant.
    output_wave, samplerate = sf.read(wav_filename, dtype='float32')
    temp_wav_filename = "temp.wav"
    wavio.write(temp_wav_filename, output_wave, rate=samplerate, sampwidth=2)

    return temp_wav_filename, midi_filename
| |
|
| |
|
# Heading text shown at the top of the page.
title = "Text2midi: Generating Symbolic Music from Captions"

# HTML blurb used as the Interface description. Line breaks use the valid
# <br/> form and the paragraph is closed with </p>.
description_text = """
<p><a href="https://huggingface.co/spaces/amaai-lab/text2midi/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
Generate midi music using Text2midi by providing a text prompt.
<br/><br/> This is the demo for Text2midi. This model is the first end-to-end model for generating MIDI files from textual descriptions. By leveraging pretrained large language models and a powerful autoregressive transformer decoder, text2midi allows users to create symbolic music that aligns with detailed textual prompts, including musical attributes like chords, tempo, and style. <br/> Read the <a href="https://www.arxiv.org/abs/2412.16526">full paper here.</a> <br/> <a href="https://github.com/AMAAI-Lab/text2midi">View code on Github</a>. <br/><br/>***The model was optimized for 1500-2000 as Max token Length. Due to Huggingface GPU restraints we have set the default to 500 tokens, this will generate shorter midi files that may be sub-optimal.
</p>
"""
| | |
| | |
# Widgets wired into the Interface: the prompt box, the two result widgets
# (rendered audio and downloadable MIDI), and the two generation controls.
input_text = gr.Textbox(
    label="Prompt",
    lines=2,
)
output_audio = gr.Audio(
    type="filepath",
    label="Generated Music",
)
output_midi = gr.File(
    label="Download MIDI File",
)
temperature = gr.Slider(
    label="Temperature",
    value=0.9,
    minimum=0.8,
    maximum=1.1,
    step=0.1,
    interactive=True,
)
max_length = gr.Number(
    label="Max Length",
    value=500,
    minimum=500,
    maximum=2000,
    step=100,
)
| |
|
| | |
# Stylesheet given to the outer gr.Blocks page. Every rule is top-level:
# the .example-caption rule is closed before the table-cell rule starts.
# NOTE(review): "svelte-1viwdyg" looks like a generated class name and may
# change between Gradio releases - confirm it still matches.
css = '''
#duplicate-button {
    margin: auto;
    color: white;
    background: #1565c0;
    border-radius: 100vh;
}
.example {
    text-align: left; /* Centers the examples */
    margin: auto; /* Ensures the examples are centered in their container */
}

.example-caption {
    text-align: left; /* Centers the captions under each example */
}

td.svelte-1viwdyg{
    text-align: left;
}
'''
| |
|
| | |
# Interface that connects the input widgets to gradio_generate and shows the
# rendered audio plus the downloadable MIDI file.
gr_interface = gr.Interface(
    fn=gradio_generate,
    inputs=[input_text, temperature, max_length],
    outputs=[output_audio, output_midi],
    description=description_text,
    # Use the documented string value rather than the deprecated boolean.
    allow_flagging="never",
    # Each row supplies one value per input: prompt, temperature, max length.
    examples=[
        ["A haunting electronic ambient piece that evokes a sense of darkness and space, perfect for a film soundtrack. The string ensemble, trumpet, piano, timpani, and synth pad weave together to create a meditative atmosphere. Set in F minor with a 4/4 time signature, the song progresses at an Andante tempo, with the chords F, Fdim, and F/C recurring throughout.", 1, 1500],
        ["A slow and emotional classical piece, likely used in a film soundtrack, featuring a church organ as the sole instrument. Written in the key of Eb major with a 3/4 time signature, it evokes a sense of drama and romance. The chord progression of Bb7, Eb, and Ab contributes to the relaxing atmosphere throughout the song.", 1, 1500],
        ["An energetic and melodic electronic trance track with a space and retro vibe, featuring drums, distortion guitar, flute, synth bass, and slap bass. Set in A minor with a fast tempo of 138 BPM, the song maintains a 4/4 time signature throughout its duration.", 1, 1500],
        ["This short electronic song in C minor features a brass section, string ensemble, tenor saxophone, clean electric guitar, and slap bass, creating a melodic and slightly dark atmosphere. With a tempo of 124 BPM (Allegro) and a 4/4 time signature, the track incorporates a chord progression of C7/E, Eb6, and Bbm6, adding a touch of corporate and motivational vibes to the overall composition.", 1, 1500],
        # NOTE(review): this row repeats the trance example two rows above.
        ["An energetic and melodic electronic trance track with a space and retro vibe, featuring drums, distortion guitar, flute, synth bass, and slap bass. Set in A minor with a fast tempo of 138 BPM, the song maintains a 4/4 time signature throughout its duration.", 1, 1500],
        # This row previously held only the prompt; it now carries the same
        # temperature and max length as the other rows.
        ["A short but energetic rock fragment in C minor, featuring overdriven guitars, electric bass, and drums, with a vivacious tempo of 155 BPM and a 4/4 time signature, evoking a blend of dark and melodic tones.", 1, 1500],
    ],
    cache_examples="lazy",
    css="td.svelte-1viwdyg { text-align: left; }",
)
| |
|
# Page layout: heading, the "duplicate this Space" button, then the
# prebuilt Interface rendered inside the same Blocks container.
with gr.Blocks(css=css) as demo:
    # The right-hand side is evaluated first, so the heading still uses the
    # title string before the name is rebound to the HTML component.
    title = gr.HTML("<h1><center>" + title + "</center></h1>")
    dupe = gr.DuplicateButton(
        elem_id="duplicate-button",
        value="Duplicate Space for private use",
    )
    gr_interface.render()

# Turn on request queuing, then start serving the app.
demo.queue()
demo.launch()
| |
|