kmaes commited on
Commit
b148e11
·
verified ·
1 Parent(s): 102df92

Upload 27 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ text2midi_repo/captions/captions.json filter=lfs diff=lfs merge=lfs -text
37
+ text2midi_repo/text2midi_architecture.jpg filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,14 +1,118 @@
1
  ---
2
- title: TextToAudio
3
- emoji: 📊
4
- colorFrom: pink
5
- colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 6.3.0
8
  app_file: app.py
 
9
  pinned: false
10
- license: mit
11
- short_description: TextToAudio
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: VR Game Music Generator
3
+ emoji: 🎵
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 4.44.0
8
  app_file: app.py
9
+ python_version: 3.11
10
  pinned: false
 
 
11
  ---
12
 
13
+ # VR Game Music Generator
14
+
15
+ Generate music from text descriptions using the text2midi AI model. Designed for integration with Unity and other game engines via the Gradio API.
16
+
17
+ ## Features
18
+
19
+ - Text-to-music generation using AI
20
+ - Real-time audio streaming (no file persistence)
21
+ - RESTful API for game engine integration
22
+ - Supports various music styles and instruments
23
+
24
+ ## API Usage
25
+
26
+ ### Endpoint
27
+ ```
28
+ POST https://YOUR-SPACE.hf.space/api/generate
29
+ ```
30
+
31
+ ### Request
32
+ ```json
33
+ {
34
+ "data": ["A cheerful pop song with piano and drums", 512, 0.9]
35
+ }
36
+ ```
37
+
38
+ Parameters:
39
+ - `data[0]`: Music prompt (string)
40
+ - `data[1]`: Max length in tokens (256-2048, default: 512)
41
+ - `data[2]`: Temperature (0.1-1.5, default: 0.9)
42
+
43
+ ### Response
44
+ ```json
45
+ {
46
+ "data": [
47
+ {"path": "/file=...", "url": "https://...", "orig_name": "audio.wav"},
48
+ "AI-generated audio for: 'A cheerful pop song...'"
49
+ ]
50
+ }
51
+ ```
52
+
53
+ ## Unity Integration
54
+
55
+ ```csharp
56
+ using UnityEngine;
57
+ using UnityEngine.Networking;
58
+ using System.Collections;
59
+
60
+ public class MusicGenerator : MonoBehaviour
61
+ {
62
+ private const string API_URL = "https://YOUR-SPACE.hf.space/api/generate";
63
+
64
+ public IEnumerator GenerateMusic(string prompt, System.Action<AudioClip> callback)
65
+ {
66
+ string json = $"{{\"data\": [\"{prompt}\", 512, 0.9]}}";
67
+
68
+ using (UnityWebRequest request = new UnityWebRequest(API_URL, "POST"))
69
+ {
70
+ byte[] bodyRaw = System.Text.Encoding.UTF8.GetBytes(json);
71
+ request.uploadHandler = new UploadHandlerRaw(bodyRaw);
72
+ request.downloadHandler = new DownloadHandlerBuffer();
73
+ request.SetRequestHeader("Content-Type", "application/json");
74
+
75
+ yield return request.SendWebRequest();
76
+
77
+ if (request.result == UnityWebRequest.Result.Success)
78
+ {
79
+ // Parse response and download audio from returned URL
80
+ var response = JsonUtility.FromJson<GradioResponse>(request.downloadHandler.text);
81
+ yield return DownloadAudio(response.data[0].url, callback);
82
+ }
83
+ }
84
+ }
85
+
86
+ private IEnumerator DownloadAudio(string url, System.Action<AudioClip> callback)
87
+ {
88
+ using (UnityWebRequest www = UnityWebRequestMultimedia.GetAudioClip(url, AudioType.WAV))
89
+ {
90
+ yield return www.SendWebRequest();
91
+ if (www.result == UnityWebRequest.Result.Success)
92
+ {
93
+ callback(DownloadHandlerAudioClip.GetContent(www));
94
+ }
95
+ }
96
+ }
97
+ }
98
+ ```
99
+
100
+ ## Example Prompts
101
+
102
+ - A cheerful and melodic pop Christmas song featuring piano, acoustic guitar, and drums
103
+ - An energetic electronic trance track with synth bass and drums at 138 BPM
104
+ - A slow and emotional classical piece featuring cello and violin in C minor
105
+ - A cinematic electronic soundtrack with an epic and dark atmosphere
106
+ - Happy medieval tavern music with lute and flute
107
+
108
+ ## Local Development
109
+
110
+ ```bash
111
+ pip install -r requirements.txt
112
+ python app.py
113
+ ```
114
+
115
+ ## Credits
116
+
117
+ - Model: [amaai-lab/text2midi](https://huggingface.co/amaai-lab/text2midi)
118
+ - Audio synthesis: FluidSynth with FluidR3 GM SoundFont
app.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
VR Music Generator - HuggingFace Spaces Version
Generates music from text descriptions using the text2midi AI model.
Exposes a Gradio API for Unity integration.
Audio is streamed directly - no files are persisted.
"""
import gradio as gr
import torch
import torch.nn as nn
import subprocess
import os
import sys
import pickle
import tempfile
import io
import numpy as np
from scipy.io import wavfile
from huggingface_hub import hf_hub_download

# Make the vendored text2midi sources importable (model.transformer_model).
sys.path.insert(0, "text2midi_repo")

# HuggingFace Hub repo hosting the pretrained weights and REMI vocabulary.
repo_id = "amaai-lab/text2midi"

# Prefer GPU when available; Spaces free tier typically runs on CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Global model handles, populated by load_text2midi_model().
text2midi_model = None  # Transformer decoder (text2midi checkpoint)
midi_tokenizer = None   # REMI tokenizer: decodes token ids -> MIDI object
text_tokenizer = None   # T5 tokenizer: encodes the text prompt
def load_text2midi_model():
    """Download and initialise the text2midi model and both tokenizers.

    Populates the module-level ``text2midi_model``, ``midi_tokenizer`` and
    ``text_tokenizer`` globals.

    Returns:
        True when the model is ready, False on any failure (missing
        dependency, download error, ...) so the app can fall back to
        simple placeholder MIDI generation.
    """
    global text2midi_model, midi_tokenizer, text_tokenizer

    try:
        from model.transformer_model import Transformer
        from transformers import T5Tokenizer

        print("Loading text2midi model...")

        # Fetch the checkpoint and the REMI vocabulary from the Hub cache.
        weights_file = hf_hub_download(repo_id=repo_id, filename="pytorch_model.bin")
        vocab_file = hf_hub_download(repo_id=repo_id, filename="vocab_remi.pkl")
        print(f"Model path: {weights_file}")
        print(f"Tokenizer path: {vocab_file}")

        # The MIDI tokenizer ships as a pickled REMI vocabulary object.
        with open(vocab_file, "rb") as fh:
            midi_tokenizer = pickle.load(fh)
        vocab_size = len(midi_tokenizer)
        print(f"Vocab size: {vocab_size}")

        # Architecture hyper-parameters must match the released checkpoint.
        text2midi_model = Transformer(
            vocab_size, 768, 8, 2048, 18, 1024, False, 8, device=device
        )
        text2midi_model.load_state_dict(
            torch.load(weights_file, map_location=device, weights_only=True)
        )
        text2midi_model.to(device)
        text2midi_model.eval()

        # Prompts are encoded with FLAN-T5's tokenizer.
        text_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")

        print("Text2midi model loaded successfully!")
        return True

    except Exception as e:
        print(f"Warning: Could not load text2midi model: {e}")
        import traceback
        traceback.print_exc()
        print("Falling back to simple MIDI generation...")
        return False
# Load the model eagerly at import time so the first request is fast;
# MODEL_LOADED gates the AI path vs. the simple-MIDI fallback.
MODEL_LOADED = load_text2midi_model()
def find_soundfont():
    """Return the path of the first General MIDI SoundFont found, else None.

    Checks the locations used by Debian/Ubuntu packages (packages.txt
    installs fluid-soundfont-gm) plus the current working directory.
    """
    candidates = (
        "/usr/share/sounds/sf2/FluidR3_GM.sf2",
        "/usr/share/soundfonts/FluidR3_GM.sf2",
        "/usr/share/sounds/sf2/default-GM.sf2",
        "FluidR3_GM.sf2",
    )
    return next((sf2 for sf2 in candidates if os.path.exists(sf2)), None)
# Resolved once at startup; None disables audio rendering entirely.
SOUNDFONT_PATH = find_soundfont()
print(f"SoundFont: {SOUNDFONT_PATH or 'Not found'}")
def generate_midi_with_model(prompt: str, output_path: str, max_len: int = 512, temperature: float = 0.9):
    """Generate a MIDI file from a text prompt with the text2midi model.

    Requires load_text2midi_model() to have succeeded (the three module
    globals must be populated).

    Args:
        prompt: Natural-language description of the desired music.
        output_path: Destination path for the generated .mid file.
        max_len: Maximum number of MIDI tokens to generate.
        temperature: Sampling temperature for the decoder.

    Returns:
        output_path, for caller convenience.
    """
    global text2midi_model, midi_tokenizer, text_tokenizer

    # Encode the prompt and move tensors to the compute device.
    encoded = text_tokenizer(prompt, return_tensors='pt', padding=True, truncation=True)
    token_ids = encoded.input_ids.to(device)
    mask = encoded.attention_mask.to(device)

    # Autoregressive sampling; no gradients needed at inference time.
    with torch.no_grad():
        generated = text2midi_model.generate(token_ids, mask, max_len=max_len, temperature=temperature)

    # Decode the token sequence back into a MIDI object and write it out.
    midi_obj = midi_tokenizer.decode(generated[0].tolist())
    midi_obj.dump_midi(output_path)

    return output_path
def midi_to_audio_bytes(midi_path: str, sample_rate: int = 44100) -> tuple:
    """Render a MIDI file to audio with FluidSynth, entirely in memory.

    FluidSynth writes raw interleaved 16-bit stereo PCM to stdout, so no
    intermediate audio file is ever created on disk.

    Args:
        midi_path: Path to the .mid file to render.
        sample_rate: Output sample rate in Hz.

    Returns:
        (sample_rate, float32 ndarray of shape (frames, 2) in [-1, 1]),
        suitable for a Gradio Audio component, or None on any failure
        (no SoundFont, FluidSynth missing/error/timeout, empty output).
    """
    if not SOUNDFONT_PATH:
        return None

    # -ni: non-interactive; -T raw: headerless PCM; -F -: write to stdout.
    cmd = [
        "fluidsynth",
        "-ni",
        "-T", "raw",
        "-F", "-",
        "-r", str(sample_rate),
        SOUNDFONT_PATH,
        midi_path,
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, timeout=120)
    except subprocess.TimeoutExpired:
        print("FluidSynth error: rendering timed out after 120s")
        return None
    except FileNotFoundError:
        print("FluidSynth error: 'fluidsynth' binary not found")
        return None

    if result.returncode != 0:
        print(f"FluidSynth error: {result.stderr.decode()}")
        return None

    # Raw output is interleaved signed 16-bit stereo: L R L R ...
    samples = np.frombuffer(result.stdout, dtype=np.int16)
    if len(samples) == 0:
        return None

    # Drop a trailing odd sample (if any) and split into (frames, 2).
    # Gradio interprets a 2-D array as stereo; the previous flat interleaved
    # array was misread as mono at twice the real duration.
    frames = samples[: len(samples) // 2 * 2].reshape(-1, 2)

    # Normalize to float32 in [-1, 1] for Gradio.
    return (sample_rate, frames.astype(np.float32) / 32768.0)
def generate_music(prompt: str, max_length: int = 512, temperature: float = 0.9):
    """Generate music from a text prompt and return it as in-memory audio.

    Args:
        prompt: Text description of the music to generate.
        max_length: Maximum length in tokens (256-2048).
        temperature: Generation temperature (0.1-1.5).

    Returns:
        Tuple of (audio_data, status_message); audio_data is a
        (sample_rate, numpy_array) tuple for Gradio, or None on failure.
    """
    if not prompt or not prompt.strip():
        return None, "Please enter a music prompt"

    try:
        # delete=False: the file must outlive this 'with' so FluidSynth can
        # read it afterwards; we remove it ourselves in the finally below.
        with tempfile.NamedTemporaryFile(suffix='.mid', delete=False) as midi_file:
            midi_path = midi_file.name

        try:
            if MODEL_LOADED:
                status_prefix = "AI-generated"
                generate_midi_with_model(prompt, midi_path, max_len=int(max_length), temperature=temperature)
            else:
                # Fallback: write a trivial scale so the pipeline stays
                # usable even when the AI model failed to load.
                status_prefix = "Simple"
                from midiutil import MIDIFile
                midi = MIDIFile(1)
                midi.addTempo(0, 0, 120)
                notes = [60, 62, 64, 65, 67, 69, 71, 72]
                for i, note in enumerate(notes[:min(len(prompt.split()), 8)]):
                    midi.addNote(0, 0, note, i, 1, 100)
                with open(midi_path, "wb") as f:
                    midi.writeFile(f)

            # Convert MIDI to audio.
            if not SOUNDFONT_PATH:
                return None, "Error: FluidSynth/SoundFont not available"

            audio_result = midi_to_audio_bytes(midi_path)
            if audio_result:
                shown = f"'{prompt[:50]}...'" if len(prompt) > 50 else f"'{prompt}'"
                return audio_result, f"{status_prefix} audio for: {shown}"
            return None, "Error: FluidSynth conversion failed"

        finally:
            # Best-effort cleanup of the temporary MIDI file.
            try:
                os.unlink(midi_path)
            except OSError:
                pass

    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"Error: {str(e)}"
# Build the Gradio UI. api_name="generate" also exposes the click handler
# as a REST endpoint (/api/generate) for Unity and other game engines.
with gr.Blocks(title="VR Music Generator") as demo:
    gr.Markdown("# VR Game Music Generator")
    gr.Markdown("Generate music from text descriptions using the text2midi AI model")

    # Surface degraded-mode warnings directly in the UI.
    if not MODEL_LOADED:
        gr.Markdown("**Warning:** AI model not loaded - using simple placeholder MIDI")
    if not SOUNDFONT_PATH:
        gr.Markdown("**Note:** FluidSynth not configured - audio generation disabled")

    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(
                label="Music Prompt",
                placeholder="A cheerful pop song with piano and drums in C major at 120 BPM",
                lines=3
            )
            with gr.Row():
                # Slider bounds mirror generate_music's documented ranges.
                max_length = gr.Slider(
                    minimum=256,
                    maximum=2048,
                    value=512,
                    step=256,
                    label="Max Length (tokens)"
                )
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.5,
                    value=0.9,
                    step=0.1,
                    label="Temperature"
                )
            generate_btn = gr.Button("Generate Music", variant="primary")

        with gr.Column():
            # type="numpy" matches generate_music's (sample_rate, array) return.
            audio_output = gr.Audio(label="Generated Music", type="numpy")
            status_output = gr.Textbox(label="Status", lines=2)

    generate_btn.click(
        fn=generate_music,
        inputs=[prompt_input, max_length, temperature],
        outputs=[audio_output, status_output],
        api_name="generate"  # Exposes as /api/generate endpoint
    )

    gr.Markdown("---")
    gr.Markdown("""
**Example prompts:**
- A cheerful and melodic pop Christmas song featuring piano, acoustic guitar, and drums
- An energetic electronic trance track with synth bass and drums at 138 BPM
- A slow and emotional classical piece featuring cello and violin in C minor
- A cinematic electronic soundtrack with an epic and dark atmosphere

**API Usage (for Unity):**
```csharp
// POST to: https://YOUR-SPACE.hf.space/api/generate
// Body: {"data": ["your music prompt", 512, 0.9]}
// Response: {"data": [{"path": "audio_url", ...}, "status"]}
```
""")

# For HuggingFace Spaces - launch() is called automatically
# For local testing, uncomment below:
# if __name__ == "__main__":
#     demo.launch()
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ fluidsynth
2
+ fluid-soundfont-gm
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ torch>=2.0.0
3
+ transformers>=4.30.0
4
+ huggingface-hub>=0.20.0
5
+ midiutil>=1.2.1
6
+ miditok>=3.0.0
7
+ scipy>=1.10.0
8
+ numpy>=1.24.0
9
+ tqdm>=4.65.0
text2midi_repo/.gitignore ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+
143
+ # mypy
144
+ .mypy_cache/
145
+ .dmypy.json
146
+ dmypy.json
147
+
148
+ # Pyre type checker
149
+ .pyre/
150
+
151
+ # pytype static type analyzer
152
+ .pytype/
153
+
154
+ # Cython debug symbols
155
+ cython_debug/
156
+
157
+ # PyCharm
158
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
161
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
+ #.idea/
text2midi_repo/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2024 AMAAI Lab
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
text2midi_repo/README.md ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Text2midi: Generating Symbolic Music from Captions
2
+
3
+ [Demo](https://huggingface.co/spaces/amaai-lab/text2midi) | [Model](https://huggingface.co/amaai-lab/text2midi) | [Examples](https://amaai-lab.github.io/Text2midi/) | [Paper](https://arxiv.org/abs/2412.16526) | [Dataset](https://huggingface.co/datasets/amaai-lab/MidiCaps)
4
+
5
+ [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/amaai-lab/text2midi)
6
+ </div>
7
+
8
+ **text2midi** is the first end-to-end model for generating MIDI files from textual descriptions. By leveraging pretrained large language models and a powerful autoregressive transformer decoder, **text2midi** allows users to create symbolic music that aligns with detailed textual prompts, including musical attributes like chords, tempo, and style. The details of the model are described in [this paper](https://arxiv.org/abs/2412.16526).
9
+
10
+ 🔥 Live demo available on [HuggingFace Spaces](https://huggingface.co/spaces/amaai-lab/text2midi).
11
+
12
+ 🔥 Update: Text2midi has been accepted at AAAI!
13
+
14
+ <div align="center">
15
+ <img src="text2midi_architecture.jpg" width="500"/>
16
+ </div>
17
+
18
+ ## Quickstart Guide
19
+
20
+ Generate symbolic music from a text prompt:
21
+
22
+ ```python
23
+ import pickle
24
+ import torch
25
+ import torch.nn as nn
26
+ from transformers import T5Tokenizer
27
+ from model.transformer_model import Transformer
28
+ from huggingface_hub import hf_hub_download
29
+
30
+ repo_id = "amaai-lab/text2midi"
31
+ # Download the model.bin file
32
+ model_path = hf_hub_download(repo_id=repo_id, filename="pytorch_model.bin")
33
+ # Download the vocab_remi.pkl file
34
+ tokenizer_path = hf_hub_download(repo_id=repo_id, filename="vocab_remi.pkl")
35
+
36
+ if torch.cuda.is_available():
37
+ device = 'cuda'
38
+ elif torch.backends.mps.is_available():
39
+ device = 'mps'
40
+ else:
41
+ device = 'cpu'
42
+
43
+ print(f"Using device: {device}")
44
+
45
+ # Load the tokenizer dictionary
46
+ with open(tokenizer_path, "rb") as f:
47
+ r_tokenizer = pickle.load(f)
48
+
49
+ # Get the vocab size
50
+ vocab_size = len(r_tokenizer)
51
+ print("Vocab size: ", vocab_size)
52
+ model = Transformer(vocab_size, 768, 8, 2048, 18, 1024, False, 8, device=device)
53
+ model.load_state_dict(torch.load(model_path, map_location=device))
54
+ model.eval()
55
+ tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
56
+
57
+ print('Model loaded.')
58
+
59
+
60
+ # Enter the text prompt and tokenize it
61
+ src = "A melodic electronic song with ambient elements, featuring piano, acoustic guitar, alto saxophone, string ensemble, and electric bass. Set in G minor with a 4/4 time signature, it moves at a lively Presto tempo. The composition evokes a blend of relaxation and darkness, with hints of happiness and a meditative quality."
62
+ print('Generating for prompt: ' + src)
63
+
64
+ inputs = tokenizer(src, return_tensors='pt', padding=True, truncation=True)
65
+ input_ids = nn.utils.rnn.pad_sequence(inputs.input_ids, batch_first=True, padding_value=0)
66
+ input_ids = input_ids.to(device)
67
+ attention_mask = nn.utils.rnn.pad_sequence(inputs.attention_mask, batch_first=True, padding_value=0)
68
+ attention_mask = attention_mask.to(device)
69
+
70
+ # Generate the midi
71
+ output = model.generate(input_ids, attention_mask, max_len=2000,temperature = 1.0)
72
+ output_list = output[0].tolist()
73
+ generated_midi = r_tokenizer.decode(output_list)
74
+ generated_midi.dump_midi("output.mid")
75
+ ```
76
+
77
+ ## Installation
78
+
79
+ If you have CUDA supported machine:
80
+ ```bash
81
+ git clone https://github.com/AMAAI-Lab/text2midi
82
+ cd text2midi
83
+ pip install -r requirements.txt
84
+ ```
85
+ Alternatively, if you have MPS supported machine:
86
+ ```bash
87
+ git clone https://github.com/AMAAI-Lab/text2midi
88
+ cd text2midi
89
+ pip install -r requirements-mac.txt
90
+ ```
91
+
92
+ ## Datasets
93
+
94
+ The model was trained using two datasets: [SymphonyNet](https://symphonynet.github.io/) for semi-supervised pretraining and MidiCaps for finetuning towards MIDI generation from captions.
95
+ The [MidiCaps dataset](https://huggingface.co/datasets/amaai-lab/MidiCaps) is a large-scale dataset of 168k MIDI files paired with rich text captions. These captions contain musical attributes such as key, tempo, style, and mood, making it ideal for text-to-MIDI generation tasks as described in [this paper](https://arxiv.org/abs/2406.02255).
96
+
97
+
98
+ ## Citation
99
+ If you use text2midi in your research, please cite:
100
+ ```
101
+ @inproceedings{bhandari2025text2midi,
102
+ title={text2midi: Generating Symbolic Music from Captions},
103
+ author={Keshav Bhandari and Abhinaba Roy and Kyra Wang and Geeta Puri and Simon Colton and Dorien Herremans},
104
+ booktitle={Proceedings of the 39th AAAI Conference on Artificial Intelligence (AAAI 2025)},
105
+ year={2025}
106
+ }
107
+ ```
108
+
109
+ ## Results of the Listening Study
110
+
111
+ Each question is rated on a Likert scale from 1 (very bad) to 7 (very good). The table shows the average ratings per question for each group of participants.
112
+
113
+ | Question | MidiCaps | text2midi | MuseCoco |
114
+ |---------------------|----------|-----------|----------|
115
+ | Musical Quality | 5.79 | 4.62 | 4.40 |
116
+ | Overall Matching | 5.42 | 4.67 | 4.07 |
117
+ | Genre Matching | 5.54 | 4.98 | 4.40 |
118
+ | Mood Matching | 5.70 | 5.00 | 4.32 |
119
+ | Key Matching | 4.61 | 3.64 | 3.36 |
120
+ | Chord Matching | 3.20 | 2.50 | 2.00 |
121
+ | Tempo Matching | 5.89 | 5.42 | 4.94 |
122
+
123
+
124
+ ## Objective Evaluations
125
+ Results of objective evaluation for *all* of the MidiCaps test set. Please note we have improved on all the numbers reported in the paper (the numbers in the paper are on a small subset of the MidiCaps test set).
126
+
127
+ | Metric | text2midi | MidiCaps | MuseCoco |
128
+ |---------------------|-----------|----------|----------|
129
+ | CR ↑ | 2.31 | 3.43 | 2.12 |
130
+ | CLAP ↑ | 0.22 | 0.26 | 0.21 |
131
+ | TB (%) ↑ | 39.70 | - | 21.71 |
132
+ | TBT (%) ↑ | 65.80 | - | 54.63 |
133
+ | CK (%) ↑ | 33.60 | - | 13.70 |
134
+ | CKD (%) ↑ | 35.60 | - | 14.59 |
135
+
136
+ **Note**:
137
+ CR = Compression ratio
138
+ CLAP = CLAP score
139
+ TB = Tempo Bin
140
+ TBT = Tempo Bin with Tolerance
141
+ CK = Correct Key
142
+ CKD = Correct Key with Duplicates
143
+ ↑ = Higher score is better.
144
+
145
+ ## Training
146
+ To train text2midi, we recommend using accelerate for multi-GPU support. First, configure accelerate by running:
147
+ ```bash
148
+ accelerate config
149
+ ```
150
+
151
+ Then, use the following command to start training:
152
+ ```bash
153
+ accelerate launch --multi_gpu --num_processes=4 train_accelerate.py --config ../config.yaml
154
+ ```
155
+
156
+ ## Inference
157
+ We support inference on CUDA, MPS and CPU. Please make sure you have pip installed the correct requirements file (requirements.txt for CUDA, requirements-mac.txt for MPS).
158
+ ```bash
159
+ python model/transformer_model.py --caption <your intended descriptions>
160
+ ```
161
+
162
+
text2midi_repo/artifacts/vocab.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b15330f5ab9c2cd32d359bcc64a1de320a7dc1227180a7658fd0b8f2d35e12c
3
+ size 239637
text2midi_repo/artifacts/vocab_remi.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:877d4511d6b9d5eea1c706199fe13a0de3d984c8f5d09c75d727ffe7f6f54ee6
3
+ size 27256
text2midi_repo/captions/captions.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c25b36b6196618ff79f111e24c52b97d0bde9e1b47d2c596650944ebe6dcac5
3
+ size 69068459
text2midi_repo/configs/config.yaml ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ text2midi_model:
3
+ decoder_max_sequence_length: 2048
4
+ decoder_num_layers: 18
5
+ decoder_num_heads: 8
6
+ decoder_d_model: 768
7
+ decoder_intermediate_size: 1024
8
+ use_moe: False
9
+ num_experts: 4
10
+ use_deepspeed: False
11
+ use_accelerate: True
12
+
13
+
14
+ training:
15
+ text2midi_model:
16
+ epochs: 140
17
+ batch_size: 1
18
+ learning_rate: 0.000001
19
+ weight_decay: 0.01
20
+ gradient_accumulation_steps: 4
21
+ with_tracking: True
22
+ checkpointing_steps: epoch
23
+ report_to: wandb
24
+ output_dir: /root/output_test_new
25
+ per_device_train_batch_size: 32
26
+ use_scheduler: True
27
+ lr_scheduler_type: cosine #choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"]
28
+ num_warmup_steps: 100
29
+ save_every: 5
30
+ max_train_steps: None
31
+ scheduled_sampling: False
32
+ epsilon: 0
33
+ c: -0.0161
34
+ k: -0.312
35
+
36
+ raw_data:
37
+ caption_dataset_path: /root/captions/train.json
38
+ raw_data_folders:
39
+ lmd:
40
+ folder_path: /import/c4dm-datasets-ext/lakhmidi
41
+ file_extension: midi
42
+ symphonynet:
43
+ folder_path: /root/text2midi/data/symphonynet/data/SymphonyNet_Dataset
44
+ file_extension: mid
45
+ maestro:
46
+ folder_path: /import/c4dm-datasets/maestro-v3.0.0
47
+ file_extension: midi
48
+ pop909:
49
+ folder_path: /import/c4dm-datasets-ext/POP909
50
+ file_extension: mid
51
+ pijama:
52
+ folder_path: /import/c4dm-datasets/PiJAMA/data/midi
53
+ file_extension: midi
54
+ midicaps:
55
+ folder_path: /root/data
56
+ file_extension: mid
57
+
58
+
59
+ deepspeed_config:
60
+ deepspeed_config_path: /root/test/text2midi/configs/ds_config.json
61
+
62
+ artifact_folder: ../artifacts
text2midi_repo/configs/ds_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train_micro_batch_size_per_gpu": 1,
3
+ "gradient_accumulation_steps": 1,
4
+ "optimizer": {
5
+ "type": "Adam",
6
+ "params": {
7
+ "lr": 1e-4
8
+ }
9
+ },
10
+ "bf16": {
11
+ "enabled": true
12
+ },
13
+ "zero_optimization": {
14
+ "stage": 1,
15
+ "offload_optimizer": {
16
+ "device": "cpu",
17
+ "pin_memory": true
18
+ },
19
+ "offload_param": {
20
+ "device": "cpu",
21
+ "pin_memory": true
22
+ },
23
+ "overlap_comm": true,
24
+ "contiguous_gradients": true,
25
+ "sub_group_size": 1e9
26
+ },
27
+ "activation_checkpointing": {
28
+ "partition_activations": true,
29
+ "number_checkpoints": null,
30
+ "contiguous_memory_optimization":true,
31
+ "cpu_checkpointing": true
32
+ }
33
+ }
text2midi_repo/model/__pycache__/transformer_model.cpython-314.pyc ADDED
Binary file (77.4 kB). View file
 
text2midi_repo/model/build_vocab.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import yaml
import os
import argparse
import pickle

# Parse command line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, default=os.path.normpath("configs/config.yaml"),
                    help="Path to the config file")
args = parser.parse_args()

# Load config file
with open(args.config, 'r') as f:
    configs = yaml.safe_load(f)

artifact_folder = configs["artifact_folder"]
raw_data_folders = configs["raw_data"]["raw_data_folders"]


# Build the vocabulary. Token ids start at 1 so that id 0 stays free as the
# padding value (the data loaders pad label tensors with 0).
vocab = {}

instruments = ['piano', 'chromatic', 'organ', 'guitar', 'bass', 'strings', 'ensemble', 'brass', 'reed', 'pipe', 'synth_lead', 'synth_pad', 'synth_effect', 'ethnic', 'percussive', 'sfx', 'drum']

# Instrument prefix tokens
for i in instruments:
    vocab[('prefix', 'instrument', i)] = len(vocab) + 1

# MIDI velocity bins (full range is 0 to 127)
velocity = [0, 15, 30, 45, 60, 75, 90, 105, 120, 127]
# MIDI pitch range from 0 to 127
midi_pitch = list(range(0, 128))
# Onsets/durations are quantized in 10 millisecond steps up to 5 seconds
onset = list(range(0, 5001, 10))
duration = list(range(0, 5001, 10))

# Note tokens: (instrument, pitch, velocity) for every pitched instrument.
# Drums carry no velocity and get dedicated (drum, pitch) tokens below.
for v in velocity:
    for i in instruments:
        if i == "drum":
            continue
        for p in midi_pitch:
            vocab[(i, p, v)] = len(vocab) + 1

for p in midi_pitch:
    vocab[("drum", p)] = len(vocab) + 1

for o in onset:
    vocab[("onset", o)] = len(vocab) + 1
for d in duration:
    vocab[("dur", d)] = len(vocab) + 1

# Special / control tokens
vocab["<T>"] = len(vocab) + 1
vocab["<D>"] = len(vocab) + 1
vocab["<U>"] = len(vocab) + 1
vocab["<SS>"] = len(vocab) + 1
print('vocab[<SS>]', vocab['<SS>'])
vocab["<S>"] = len(vocab) + 1
vocab["<E>"] = len(vocab) + 1
vocab["SEP"] = len(vocab) + 1  # NOTE: no angle brackets, unlike the other specials

# Print the vocabulary length
print(f"Vocabulary length: {len(vocab)}")

# Save the vocabulary (create the artifact folder if it does not exist yet)
os.makedirs(artifact_folder, exist_ok=True)
vocab_path = os.path.join(artifact_folder, "vocab.pkl")
with open(vocab_path, 'wb') as f:
    pickle.dump(vocab, f)

print(f"Vocabulary saved to {vocab_path}")
text2midi_repo/model/build_vocab_remi.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import yaml
import os
import argparse
import pickle
from miditok import REMI, TokenizerConfig  # here we choose to use REMI
import jsonlines

# Parse command line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, default=os.path.normpath("configs/config.yaml"),
                    help="Path to the config file")
args = parser.parse_args()

# Load config file
with open(args.config, 'r') as f:
    configs = yaml.safe_load(f)

artifact_folder = configs["artifact_folder"]
raw_data_folders = configs["raw_data"]["raw_data_folders"]
caption_dataset_path = configs["raw_data"]["caption_dataset_path"]
dataset_path = configs["raw_data"]["raw_data_folders"]["lmd"]["folder_path"]

# Our tokenizer parameters
BEAT_RES = {(0, 1): 12, (1, 2): 4, (2, 4): 2, (4, 8): 1}
TOKENIZER_PARAMS = {
    "pitch_range": (21, 109),
    "beat_res": BEAT_RES,
    "num_velocities": 32,
    "special_tokens": ["PAD", "BOS", "EOS", "MASK"],
    "use_chords": False,
    "use_rests": False,
    "use_tempos": True,
    "use_time_signatures": True,
    "use_programs": True,
    "num_tempos": 32,  # number of tempo bins
    "tempo_range": (40, 250),  # (min, max)
}
config = TokenizerConfig(**TOKENIZER_PARAMS)

# Creates the tokenizer
tokenizer = REMI(config)

# Load the caption dataset
with jsonlines.open(caption_dataset_path) as reader:
    captions = list(reader)

# MIDI files referenced by the captions; only consumed by the (currently
# disabled) BPE training step below.
midi_paths = [os.path.join(dataset_path, captions[i]['location']) for i in range(len(captions))][0:30000]

# Optionally train BPE merges on top of the base vocabulary:
# vocab_size = 30000
# tokenizer.train(vocab_size=vocab_size, files_paths=midi_paths)

# Print the vocabulary length
print(f"Vocabulary length: {tokenizer.vocab_size}")

# Save the tokenizer (create the artifact folder if it does not exist yet)
os.makedirs(artifact_folder, exist_ok=True)
vocab_path = os.path.join(artifact_folder, "vocab_remi.pkl")
with open(vocab_path, 'wb') as f:
    pickle.dump(tokenizer, f)

print(f"Vocabulary saved to {vocab_path}")
text2midi_repo/model/data_loader.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from aria.data.midi import MidiDict
2
+ # from aria.tokenizer import AbsTokenizer
3
+ # aria_tokenizer = AbsTokenizer()
4
+ import yaml
5
+ import jsonlines
6
+ import glob
7
+ import random
8
+ import os
9
+ import sys
10
+ import pickle
11
+ import json
12
+ import argparse
13
+ import numpy as np
14
+ from copy import deepcopy
15
+ from torch.utils.data import Dataset
16
+ import torch
17
+ from torch.nn import functional as F
18
+ from transformers import T5Tokenizer
19
+ from spacy.lang.en import English
20
+
21
+ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
22
+ sys.path.append(os.path.dirname(SCRIPT_DIR))
23
+
24
+
25
class Text2MusicDataset(Dataset):
    """Pairs natural-language captions with aria-tokenized MIDI targets.

    Each item is ``(input_ids, attention_mask, labels)``: the FLAN-T5
    encoding of the (possibly sentence-dropped) caption plus the tokenized
    MIDI sequence padded/truncated to the decoder's maximum length.
    """

    def __init__(self, configs, captions, aria_tokenizer, mode="train", shuffle=False):
        self.mode = mode
        # Copy before shuffling so the caller's list is not mutated in place
        # (the original shuffled the shared list passed in by the caller).
        self.captions = list(captions)
        if shuffle:
            random.shuffle(self.captions)

        # Path to the MidiCaps dataset root
        self.dataset_path = configs['raw_data']['raw_data_folders']['midicaps']['folder_path']

        # Artifact folder holding the pickled vocabulary
        self.artifact_folder = configs['artifact_folder']
        tokenizer_filepath = os.path.join(self.artifact_folder, "vocab.pkl")
        # MIDI event tokenizer, e.g. aria.tokenizer.AbsTokenizer
        self.aria_tokenizer = aria_tokenizer
        # token -> integer-id mapping produced by build_vocab.py
        with open(tokenizer_filepath, 'rb') as f:
            self.tokenizer = pickle.load(f)

        # Sentencizer used for random sentence dropping (caption augmentation)
        self.nlp = English()
        self.nlp.add_pipe('sentencizer')

        # FLAN-T5 tokenizer for the text encoder
        self.t5_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")

        # Maximum decoder (MIDI) sequence length
        self.decoder_max_sequence_length = configs['model']['text2midi_model']['decoder_max_sequence_length']

        # Print length of dataset
        print("Length of dataset: ", len(self.captions))

    def __len__(self):
        return len(self.captions)

    def __getitem__(self, idx):
        caption = self.captions[idx]['caption']
        midi_filepath = os.path.join(self.dataset_path, self.captions[idx]['location'])

        # Read and tokenize the MIDI file; empty files degenerate to the
        # start/end tokens only.
        midi = MidiDict.from_midi(midi_filepath)
        if len(midi.note_msgs) == 0:
            aria_tokenized_midi = ["<SS>", "<E>"]
        else:
            aria_tokenized_midi = ["<SS>"] + self.aria_tokenizer.tokenize(midi)

        # Caption augmentation: with probability 0.5, drop between 20 and 50
        # percent of the caption's sentences.
        do_drop = random.random() > 0.5
        if do_drop:
            sentences = list(self.nlp(caption).sents)
            sent_length = len(sentences)
            if sent_length < 4:
                # floor: very short captions may end up losing nothing
                how_many_to_drop = int(np.floor((20 + random.random() * 30) / 100 * sent_length))
            else:
                how_many_to_drop = int(np.ceil((20 + random.random() * 30) / 100 * sent_length))
            which_to_drop = np.random.choice(sent_length, how_many_to_drop, replace=False)
            kept = [sentences[i] for i in range(sent_length) if i not in which_to_drop.tolist()]
            # combine the surviving sentences back with a space
            new_sentences = " ".join(kept[i].text for i in range(len(kept)))
        else:
            new_sentences = caption

        # Tokenize the caption for the text encoder
        inputs = self.t5_tokenizer(new_sentences, return_tensors='pt', padding=True, truncation=True)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']

        # Map MIDI tokens to ids, silently skipping out-of-vocabulary tokens
        tokenized_midi = [self.tokenizer[token] for token in aria_tokenized_midi if token in self.tokenizer]

        # Pad with 0 up to the max length, or keep the LAST
        # decoder_max_sequence_length ids (truncation keeps the end of the piece)
        if len(tokenized_midi) < self.decoder_max_sequence_length:
            labels = F.pad(torch.tensor(tokenized_midi), (0, self.decoder_max_sequence_length - len(tokenized_midi))).to(torch.int64)
        else:
            labels = torch.tensor(tokenized_midi[-self.decoder_max_sequence_length:]).to(torch.int64)

        return input_ids, attention_mask, labels
104
+
105
+ if __name__ == "__main__":
106
+ # Parse command line arguments
107
+ parser = argparse.ArgumentParser()
108
+ parser.add_argument("--config", type=str, default=os.path.normpath("../configs/config.yaml"),
109
+ help="Path to the config file")
110
+ args = parser.parse_args()
111
+
112
+ # Load config file
113
+ with open(args.config, 'r') as f:
114
+ configs = yaml.safe_load(f)
115
+
116
+ caption_dataset_path = configs['raw_data']['caption_dataset_path']
117
+ # Load the caption dataset
118
+ with jsonlines.open(caption_dataset_path) as reader:
119
+ captions = list(reader)
120
+
121
+ # Load the dataset
122
+ dataset = Text2MusicDataset(configs, captions, mode="train", shuffle = True)
123
+ a,b,c = dataset[0]
124
+ print(c.shape)
text2midi_repo/model/data_loader_remi.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import jsonlines
3
+ import glob
4
+ import random
5
+ import os
6
+ import sys
7
+ import pickle
8
+ import json
9
+ import argparse
10
+ import numpy as np
11
+ from copy import deepcopy
12
+ from torch.utils.data import Dataset
13
+ import torch
14
+ from torch.nn import functional as F
15
+ from transformers import T5Tokenizer
16
+ from spacy.lang.en import English
17
+
18
+ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
19
+ sys.path.append(os.path.dirname(SCRIPT_DIR))
20
+
21
+
22
class Text2MusicDataset(Dataset):
    """Caption/MIDI dataset using a miditok REMI tokenizer for the targets.

    Each item is ``(input_ids, attention_mask, labels)``: the FLAN-T5
    encoding of the (possibly sentence-dropped) caption plus the REMI token
    ids wrapped in BOS/EOS and padded/truncated to the decoder's maximum
    length.
    """

    def __init__(self, configs, captions, remi_tokenizer, mode="train", shuffle=False):
        self.mode = mode
        # Copy before shuffling so the caller's list is not mutated in place
        # (the original shuffled the shared list passed in by the caller).
        self.captions = list(captions)
        if shuffle:
            random.shuffle(self.captions)

        # Path to the MidiCaps dataset root
        self.dataset_path = configs['raw_data']['raw_data_folders']['midicaps']['folder_path']

        # Artifact folder (kept for parity with the aria-based loader)
        self.artifact_folder = configs['artifact_folder']

        # miditok REMI tokenizer built by build_vocab_remi.py
        self.remi_tokenizer = remi_tokenizer

        # Sentencizer used for random sentence dropping (caption augmentation)
        self.nlp = English()
        self.nlp.add_pipe('sentencizer')

        # FLAN-T5 tokenizer for the text encoder
        self.t5_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")

        # Maximum decoder (MIDI) sequence length
        self.decoder_max_sequence_length = configs['model']['text2midi_model']['decoder_max_sequence_length']

        # Print length of dataset
        print("Length of dataset: ", len(self.captions))

    def __len__(self):
        return len(self.captions)

    def __getitem__(self, idx):
        caption = self.captions[idx]['caption']
        midi_filepath = os.path.join(self.dataset_path, self.captions[idx]['location'])

        # Tokenize the MIDI file with REMI and wrap the ids in BOS/EOS;
        # empty files degenerate to the BOS/EOS pair only.
        tokens = self.remi_tokenizer(midi_filepath)
        if len(tokens.ids) == 0:
            tokenized_midi = [self.remi_tokenizer["BOS_None"], self.remi_tokenizer["EOS_None"]]
        else:
            tokenized_midi = [self.remi_tokenizer["BOS_None"]] + tokens.ids + [self.remi_tokenizer["EOS_None"]]

        # Caption augmentation: with probability 0.5, drop between 20 and 50
        # percent of the caption's sentences.
        do_drop = random.random() > 0.5
        if do_drop:
            sentences = list(self.nlp(caption).sents)
            sent_length = len(sentences)
            if sent_length < 4:
                # floor: very short captions may end up losing nothing
                how_many_to_drop = int(np.floor((20 + random.random() * 30) / 100 * sent_length))
            else:
                how_many_to_drop = int(np.ceil((20 + random.random() * 30) / 100 * sent_length))
            which_to_drop = np.random.choice(sent_length, how_many_to_drop, replace=False)
            kept = [sentences[i] for i in range(sent_length) if i not in which_to_drop.tolist()]
            # combine the surviving sentences back with a space
            new_sentences = " ".join(kept[i].text for i in range(len(kept)))
        else:
            new_sentences = caption

        # Tokenize the caption for the text encoder
        inputs = self.t5_tokenizer(new_sentences, return_tensors='pt', padding=True, truncation=True)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']

        # Pad with 0 up to the max length, or keep the FIRST
        # decoder_max_sequence_length ids (note: the aria loader keeps the
        # END of the piece instead).
        if len(tokenized_midi) < self.decoder_max_sequence_length:
            labels = F.pad(torch.tensor(tokenized_midi), (0, self.decoder_max_sequence_length - len(tokenized_midi))).to(torch.int64)
        else:
            labels = torch.tensor(tokenized_midi[0:self.decoder_max_sequence_length]).to(torch.int64)

        return input_ids, attention_mask, labels
97
+
98
+ if __name__ == "__main__":
99
+ # Parse command line arguments
100
+ parser = argparse.ArgumentParser()
101
+ parser.add_argument("--config", type=str, default=os.path.normpath("../configs/config.yaml"),
102
+ help="Path to the config file")
103
+ args = parser.parse_args()
104
+
105
+ tokenizer_filepath = "../artifacts/vocab_remi.pkl"
106
+ # Load the tokenizer dictionary
107
+ with open(tokenizer_filepath, "rb") as f:
108
+ tokenizer = pickle.load(f)
109
+ bos_token_number = tokenizer["PAD_None"]
110
+ print(f"bos_token_number: {bos_token_number}")
111
+
112
+ # Load config file
113
+ with open(args.config, 'r') as f:
114
+ configs = yaml.safe_load(f)
115
+ caption_dataset_path = configs['raw_data']['caption_dataset_path']
116
+ # Load the caption dataset
117
+ with jsonlines.open(caption_dataset_path) as reader:
118
+ captions = list(reader)
119
+
120
+ # Load the dataset
121
+ dataset = Text2MusicDataset(configs, captions, remi_tokenizer=tokenizer, mode="train", shuffle = True)
122
+ a,b,c = dataset[0]
123
+ print(type(a))
124
+ generated_midi = tokenizer.decode(c)
125
+ print(type(generated_midi))
126
+ generated_midi.dump_midi("decoded_midi.mid")
text2midi_repo/model/dict_output.txt ADDED
The diff for this file is too large to render. See raw diff
 
text2midi_repo/model/train.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import argparse
import pickle

import yaml
import jsonlines
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from torch.utils.data import DataLoader

from data_loader import Text2MusicDataset
from transformer_model import Transformer

# Parse command line arguments (restores the previously commented-out CLI;
# parse_known_args tolerates extra launcher flags such as --local_rank,
# and the default preserves the old hardcoded path).
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, default=os.path.normpath("../configs/config.yaml"),
                    help="Path to the config file")
args, _ = parser.parse_known_args()

# Load config file
with open(args.config, 'r') as f:
    configs = yaml.safe_load(f)

batch_size = configs['training']['text2midi_model']['batch_size']
learning_rate = configs['training']['text2midi_model']['learning_rate']
epochs = configs['training']['text2midi_model']['epochs']

# Artifact folder holding the vocabulary built by build_vocab.py
artifact_folder = configs['artifact_folder']
tokenizer_filepath = os.path.join(artifact_folder, "vocab.pkl")
# Load the tokenizer dictionary (token -> id)
with open(tokenizer_filepath, "rb") as f:
    tokenizer = pickle.load(f)

# +1 because token ids start at 1; id 0 is reserved for padding
vocab_size = len(tokenizer) + 1
print("Vocab size: ", vocab_size)

caption_dataset_path = configs['raw_data']['caption_dataset_path']
# Load the caption dataset (one JSON record per line)
with jsonlines.open(caption_dataset_path) as reader:
    captions = list(reader)
60
+
61
+
62
def collate_fn(batch):
    """Collate (input_ids, attention_mask, labels) tuples into one batch.

    Each element arrives with a leading singleton batch dimension on its
    tensors; squeeze it off, then right-pad every stream with 0 so the whole
    batch shares a single length per stream.
    """
    def _pad(sequences):
        return nn.utils.rnn.pad_sequence(list(sequences), batch_first=True, padding_value=0)

    ids, masks, labels = zip(*((i.squeeze(0), m.squeeze(0), l.squeeze(0)) for i, m, l in batch))
    return _pad(ids), _pad(masks), _pad(labels)
78
+
79
+
80
# Load the dataset. Text2MusicDataset requires an aria tokenizer instance as
# its third positional argument (the original call omitted it -> TypeError).
from aria.tokenizer import AbsTokenizer
dataset = Text2MusicDataset(configs, captions, AbsTokenizer(), mode="train", shuffle=True)
data_length = len(dataset)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=4, collate_fn=collate_fn)


# Model hyperparameters (d_model must match the FLAN-T5 encoder output dim)
d_model = configs['model']['text2midi_model']['decoder_d_model']
nhead = configs['model']['text2midi_model']['decoder_num_heads']
num_layers = configs['model']['text2midi_model']['decoder_num_layers']
max_len = configs['model']['text2midi_model']['decoder_max_sequence_length']
use_moe = configs['model']['text2midi_model']['use_moe']
num_experts = configs['model']['text2midi_model']['num_experts']
dim_feedforward = configs['model']['text2midi_model']['decoder_intermediate_size']
use_deepspeed = configs['model']['text2midi_model']['use_deepspeed']
if use_deepspeed:
    ds_config = configs['deepspeed_config']['deepspeed_config_path']
    import deepspeed
    from deepspeed.accelerator import get_accelerator
    local_rank = int(os.environ['LOCAL_RANK'])
    device = (torch.device(get_accelerator().device_name(), local_rank) if (local_rank > -1)
              and get_accelerator().is_available() else torch.device("cpu"))
    deepspeed.init_distributed(dist_backend='nccl')
    # Disable SDPA kernels that conflict with this deepspeed setup
    torch.backends.cuda.enable_mem_efficient_sdp(False)
    torch.backends.cuda.enable_flash_sdp(False)
else:
    # NOTE(review): GPU index 3 is hardcoded — adjust for your machine
    device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

print_every = 10  # refresh the progress-bar loss every N steps
model = Transformer(vocab_size, d_model, nhead, max_len, num_layers, dim_feedforward, use_moe, num_experts, device=device)
# Print number of parameters
num_params = sum(p.numel() for p in model.parameters())
print(f"Number of parameters: {num_params}")
# Print number of trainable parameters
num_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of trainable parameters: {num_trainable_params}")
if not use_deepspeed:
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()
torch.cuda.empty_cache()


def train_model(model, dataloader, criterion, num_epochs, optimizer=None, data_length=1000):
    """Train `model` on `dataloader` for `num_epochs` epochs.

    With deepspeed enabled, `optimizer` may be None (it then comes from the
    deepspeed JSON config); otherwise it is required. `data_length` only
    sizes the progress bar.
    """
    if use_deepspeed:
        model, optimizer, _, _ = deepspeed.initialize(model=model,
                                                      optimizer=optimizer,
                                                      model_parameters=model.parameters(),
                                                      config=ds_config)
    else:
        model = model.to(device)
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        with tqdm(total=int(data_length / batch_size), desc=f"Epoch {epoch + 1}/{num_epochs}") as pbar:
            for step, batch in enumerate(dataloader):
                if use_deepspeed:
                    model.zero_grad()
                else:
                    optimizer.zero_grad()

                # Get the batch
                encoder_input, attention_mask, tgt = batch
                encoder_input = encoder_input.to(device)
                attention_mask = attention_mask.to(device)
                tgt = tgt.to(device)

                # Teacher forcing: predict token t+1 from tokens <= t
                tgt_input = tgt[:, :-1]
                tgt_output = tgt[:, 1:]

                if use_moe:
                    outputs, aux_loss = model(encoder_input, attention_mask, tgt_input)
                else:
                    outputs = model(encoder_input, attention_mask, tgt_input)
                    aux_loss = 0

                loss = criterion(outputs.view(-1, outputs.size(-1)), tgt_output.reshape(-1))
                loss += aux_loss
                if use_deepspeed:
                    model.backward(loss)
                    model.step()
                else:
                    loss.backward()
                    optimizer.step()

                total_loss += loss.item()
                if step % print_every == 0:
                    pbar.set_postfix({"Loss": loss.item()})
                pbar.update(1)

            pbar.set_postfix({"Loss": total_loss / len(dataloader)})
            pbar.update(1)

        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(dataloader)}")


# Train the model (pass data_length in both branches so the deepspeed path's
# progress-bar total is correct too; the original left it at the 1000 default)
if use_deepspeed:
    train_model(model, dataloader, criterion, num_epochs=epochs, data_length=data_length)
else:
    train_model(model, dataloader, criterion, num_epochs=epochs, optimizer=optimizer, data_length=data_length)

# Save the trained model. NOTE(review): under deepspeed, `model` is the
# engine — prefer engine.save_checkpoint for sharded/offloaded states.
torch.save(model.state_dict(), "transformer_decoder_remi_plus.pth")
print("Model saved as transformer_decoder_remi_plus.pth")
text2midi_repo/model/train_accelerate.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import json
import math
import time
import pickle
import logging

import yaml
import wandb
import jsonlines
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from transformers import get_scheduler
from accelerate import DistributedDataParallelKwargs, Accelerator
from accelerate.logging import get_logger
from torch.utils.data import DataLoader

from data_loader_remi import Text2MusicDataset
from transformer_model import Transformer

logger = get_logger(__name__)

# Read the training configuration.
config_file = "../configs/config.yaml"
with open(config_file, 'r') as f:
    configs = yaml.safe_load(f)

# Shorthand for the section read repeatedly below.
_train_cfg = configs['training']['text2midi_model']

batch_size = _train_cfg['batch_size']
learning_rate = _train_cfg['learning_rate']
epochs = _train_cfg['epochs']
artifact_folder = configs['artifact_folder']

# REMI tokenizer pickled by build_vocab_remi.py.
tokenizer_filepath = os.path.join(artifact_folder, "vocab_remi.pkl")
with open(tokenizer_filepath, "rb") as f:
    tokenizer = pickle.load(f)
vocab_size = len(tokenizer)

# Caption records: one JSON object per line with 'caption' and 'location'.
caption_dataset_path = configs['raw_data']['caption_dataset_path']
with jsonlines.open(caption_dataset_path) as reader:
    captions = list(reader)
43
+
44
def collate_fn(batch):
    """Right-pad each of the three tensor streams in `batch` with zeros.

    Items are (input_ids, attention_mask, labels) tuples whose tensors carry
    a leading singleton batch dimension that is squeezed away before padding.
    """
    columns = ([item[k].squeeze(0) for item in batch] for k in range(3))
    padded = [nn.utils.rnn.pad_sequence(col, batch_first=True, padding_value=0) for col in columns]
    return padded[0], padded[1], padded[2]
52
+
53
# Model hyperparameters and training options from the config.
d_model = configs['model']['text2midi_model']['decoder_d_model']
nhead = configs['model']['text2midi_model']['decoder_num_heads']
num_layers = configs['model']['text2midi_model']['decoder_num_layers']
max_len = configs['model']['text2midi_model']['decoder_max_sequence_length']
use_moe = configs['model']['text2midi_model']['use_moe']
num_experts = configs['model']['text2midi_model']['num_experts']
dim_feedforward = configs['model']['text2midi_model']['decoder_intermediate_size']
gradient_accumulation_steps = configs['training']['text2midi_model']['gradient_accumulation_steps']
use_scheduler = configs['training']['text2midi_model']['use_scheduler']
checkpointing_steps = configs['training']['text2midi_model']['checkpointing_steps']
lr_scheduler_type = configs['training']['text2midi_model']['lr_scheduler_type']
num_warmup_steps = configs['training']['text2midi_model']['num_warmup_steps']
max_train_steps = configs['training']['text2midi_model']['max_train_steps']
with_tracking = configs['training']['text2midi_model']['with_tracking']
report_to = configs['training']['text2midi_model']['report_to']
output_dir = configs['training']['text2midi_model']['output_dir']
per_device_train_batch_size = configs['training']['text2midi_model']['per_device_train_batch_size']
save_every = configs['training']['text2midi_model']['save_every']

accelerator_log_kwargs = {}
if with_tracking:
    accelerator_log_kwargs["log_with"] = report_to
    # NOTE(review): `logging_dir` was removed in newer accelerate releases;
    # remove this argument (or switch to `project_dir`) if Accelerator()
    # raises a TypeError.
    accelerator_log_kwargs["logging_dir"] = output_dir
accelerator = Accelerator(gradient_accumulation_steps=gradient_accumulation_steps, mixed_precision='fp16', kwargs_handlers=[DistributedDataParallelKwargs(find_unused_parameters=True)], **accelerator_log_kwargs)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger.info(accelerator.state, main_process_only=False)

# Only the main process creates directories and talks to wandb — running
# wandb.init on every rank would open one run per process.
if accelerator.is_main_process:
    if output_dir is None or output_dir == "":
        output_dir = "saved/" + str(int(time.time()))
        if not os.path.exists("saved"):
            os.makedirs("saved")
        os.makedirs(output_dir, exist_ok=True)
    elif output_dir is not None:
        os.makedirs(output_dir, exist_ok=True)
    os.makedirs("{}/{}".format(output_dir, "outputs"), exist_ok=True)
    accelerator.project_configuration.automatic_checkpoint_naming = False
    wandb.login()
    wandb.init(project="Text-2-Midi", settings=wandb.Settings(init_timeout=120))
accelerator.wait_for_everyone()
device = accelerator.device

# Build the dataset on the main process first so cached downloads (e.g. the
# T5 tokenizer) happen only once.
with accelerator.main_process_first():
    dataset = Text2MusicDataset(configs, captions, remi_tokenizer=tokenizer, mode="train", shuffle=True)
    dataloader = DataLoader(dataset, batch_size=per_device_train_batch_size, shuffle=True, num_workers=4, collate_fn=collate_fn, drop_last=True)

model = Transformer(vocab_size, d_model, nhead, max_len, num_layers, dim_feedforward, use_moe, num_experts, device=device)
# NOTE(review): hardcoded resume checkpoint — this pairs with the
# starting_epoch default in train_model_accelerate; both should move into
# the config.
model.load_state_dict(torch.load('/root/output_test_new/epoch_68/pytorch_model.bin', map_location=device))


def count_parameters(model):
    """Return the number of trainable parameters in `model`."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


total_params = count_parameters(model)
print(f"Total number of trainable parameters: {total_params}")

# NOTE(review): the Adam lr is hardcoded to 1e-4 — the `learning_rate`
# value read from the config is ignored here.
optimizer = optim.Adam(model.parameters(), lr=1e-4)
overrode_max_train_steps = False
num_update_steps_per_epoch = math.ceil(len(dataloader) / gradient_accumulation_steps)
print("num_update_steps_per_epoch", num_update_steps_per_epoch)
print("max_train_steps", max_train_steps)
# The YAML stores the literal string "None" when no step limit is set.
if max_train_steps == 'None':
    max_train_steps = epochs * num_update_steps_per_epoch
    print("max_train_steps", max_train_steps)
    overrode_max_train_steps = True
    num_warmup_steps = 20000  # overrides the config value in this branch
elif isinstance(max_train_steps, str):
    max_train_steps = int(max_train_steps)
lr_scheduler = get_scheduler(
    name=lr_scheduler_type,
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=max_train_steps,
)
model, optimizer, lr_scheduler = accelerator.prepare(model, optimizer, lr_scheduler)
dataloader = accelerator.prepare(dataloader)
if overrode_max_train_steps:
    max_train_steps = epochs * num_update_steps_per_epoch
epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)
total_batch_size = per_device_train_batch_size * accelerator.num_processes * gradient_accumulation_steps
logger.info("***** Running training *****")
logger.info(f" Num examples = {len(dataset)}")
logger.info(f" Num Epochs = {epochs}")
logger.info(f" Instantaneous batch size per device = {per_device_train_batch_size}")
logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
logger.info(f" Gradient Accumulation steps = {gradient_accumulation_steps}")
logger.info(f" Total optimization steps = {max_train_steps}")

criterion = nn.CrossEntropyLoss()
145
+
146
def train_model_accelerate(model, dataloader, criterion, num_epochs, max_train_steps, optimizer=None, out_dir=None, checkpointing_steps='epoch', with_tracking=False, save_every=5, device='cpu', starting_epoch=68):
    """Accelerate-based training loop with wandb logging and checkpointing.

    Args:
        model / dataloader / optimizer: objects already wrapped by
            ``accelerator.prepare``.
        criterion: loss applied to (flattened logits, target ids).
        num_epochs: upper bound on epochs; training also stops once
            ``max_train_steps`` optimizer updates have completed.
        checkpointing_steps: int (save every N steps), "best", or "epoch".
        starting_epoch: epoch to resume from. The default of 68 preserves the
            original hardcoded value, matching the epoch-68 checkpoint loaded
            at module level — pass 0 for a fresh run.
    """
    progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)
    completed_steps = 0
    model = model.to(device)
    model.train()
    best_loss = np.inf
    for epoch in range(starting_epoch, num_epochs):
        total_loss = 0
        for step, batch in enumerate(dataloader):
            with accelerator.accumulate(model):
                encoder_input, attention_mask, tgt = batch
                encoder_input = encoder_input.to(device)
                attention_mask = attention_mask.to(device)
                tgt = tgt.to(device)
                # Teacher forcing: predict token t+1 from tokens <= t
                tgt_input = tgt[:, :-1]
                tgt_output = tgt[:, 1:]
                if use_moe:
                    outputs, aux_loss = model(encoder_input, attention_mask, tgt_input)
                else:
                    outputs = model(encoder_input, attention_mask, tgt_input)
                    aux_loss = 0
                loss = criterion(outputs.view(-1, outputs.size(-1)), tgt_output.reshape(-1))
                loss += aux_loss
                total_loss += loss.detach().float()
                accelerator.backward(loss)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
            # One "completed step" per real optimizer update (i.e. once the
            # gradient-accumulation window has synced).
            if accelerator.sync_gradients:
                progress_bar.set_postfix({"Loss": loss.item()})
                progress_bar.update(1)
                completed_steps += 1
                if accelerator.is_main_process:
                    result = {}
                    result["epoch"] = epoch + 1
                    result["step"] = completed_steps
                    result["train_loss"] = round(total_loss.item() / (gradient_accumulation_steps * completed_steps), 4)
                    wandb.log(result)
                if isinstance(checkpointing_steps, int):
                    if completed_steps % checkpointing_steps == 0:
                        output_dir = f"step_{completed_steps}"
                        if out_dir is not None:
                            output_dir = os.path.join(out_dir, output_dir)
                        accelerator.save_state(output_dir)
            if completed_steps >= max_train_steps:
                break
        # Per-epoch summary: console, summary.jsonl, and wandb-side log record.
        if accelerator.is_main_process:
            result = {}
            result["epoch"] = epoch + 1
            result["step"] = completed_steps
            result["train_loss"] = round(total_loss.item() / len(dataloader), 4)
            result_string = "Epoch: {}, Loss Train: {}\n".format(epoch, result["train_loss"])
            accelerator.print(result_string)
            with open("{}/summary.jsonl".format(out_dir), "a") as f:
                f.write(json.dumps(result) + "\n\n")
            logger.info(result)
        # Track the best epoch loss (main process only).
        if accelerator.is_main_process:
            if total_loss < best_loss:
                best_loss = total_loss
                save_checkpoint = True
            else:
                save_checkpoint = False
        accelerator.wait_for_everyone()
        if accelerator.is_main_process and checkpointing_steps == "best":
            if save_checkpoint:
                accelerator.save_state("{}/{}".format(out_dir, "best"))
            if (epoch + 1) % save_every == 0:
                logger.info("Saving checkpoint at epoch {}".format(epoch + 1))
                accelerator.save_state("{}/{}".format(out_dir, "epoch_" + str(epoch + 1)))
        if accelerator.is_main_process and checkpointing_steps == "epoch":
            accelerator.save_state("{}/{}".format(out_dir, "epoch_" + str(epoch + 1)))
218
+
219
# Kick off training with everything assembled above; checkpoints are written
# by the training loop itself.
train_model_accelerate(
    model,
    dataloader,
    criterion,
    num_epochs=epochs,
    max_train_steps=max_train_steps,
    optimizer=optimizer,
    out_dir=output_dir,
    checkpointing_steps=checkpointing_steps,
    with_tracking=with_tracking,
    save_every=save_every,
    device=device,
)
text2midi_repo/model/train_hf.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from torch.cuda import is_available as cuda_available, is_bf16_supported
3
+ from torch.backends.mps import is_available as mps_available
4
+ import torch.nn as nn
5
+ import torch.optim as optim
6
+ import yaml
7
+ import json
8
+ import pickle
9
+ import os
10
+ import random
11
+ from tqdm import tqdm
12
+ from transformers import T5EncoderModel, BertModel, BertConfig, Trainer, TrainingArguments, PreTrainedModel, T5Config, T5EncoderModel, BertLMHeadModel
13
+ import torch
14
+ from torch import Tensor, argmax
15
+ from evaluate import load as load_metric
16
+ import sys
17
+ import argparse
18
+ import jsonlines
19
+ from data_loader_remi import Text2MusicDataset
20
+ from transformer_model import Transformer
21
+ from torch.utils.data import DataLoader
22
+
23
+
24
# ---- Script setup: CLI arguments, YAML config, tokenizer and captions ----
parser = argparse.ArgumentParser()
parser.add_argument(
    "--config",
    type=str,
    default=os.path.normpath("configs/config.yaml"),
    help="Path to the config file",
)
args = parser.parse_args()

# Load the YAML training configuration.
with open(args.config, 'r') as f:
    configs = yaml.safe_load(f)

t2m_train_cfg = configs['training']['text2midi_model']
batch_size = t2m_train_cfg['batch_size']
learning_rate = t2m_train_cfg['learning_rate']
epochs = t2m_train_cfg['epochs']

# Folder that holds trained artifacts (tokenizer vocab, checkpoints, ...).
artifact_folder = configs['artifact_folder']

# Load the pickled REMI tokenizer dictionary.
tokenizer_filepath = os.path.join(artifact_folder, "vocab_remi.pkl")
with open(tokenizer_filepath, "rb") as f:
    tokenizer = pickle.load(f)

# +1 over the tokenizer's range — presumably to reserve one extra id
# (e.g. padding); confirm against how the decoder vocab is used.
vocab_size = tokenizer.vocab_size + 1
print("Vocab size: ", vocab_size)

# Read the caption dataset: one JSON object per line.
caption_dataset_path = configs['raw_data']['caption_dataset_path']
with jsonlines.open(caption_dataset_path) as reader:
    captions = list(reader)
53
+
54
+
55
def collate_fn(batch):
    """
    Collate function for the DataLoader.

    Each dataset item is a (input_ids, attention_mask, labels) triple of
    tensors carrying a leading singleton batch dimension.  All three are
    right-padded with 0 to the longest sequence in the batch, and the padded
    label sequence is shifted by one to build teacher-forcing decoder inputs.

    :param batch: list of (input_ids, attention_mask, labels) tensor triples
    :return: dict with 'input_ids', 'attention_mask', 'decoder_input_ids'
             and 'labels'
    """
    def _pad(tensors):
        # Drop the singleton batch dim, then right-pad with zeros.
        return nn.utils.rnn.pad_sequence(
            [t.squeeze(0) for t in tensors], batch_first=True, padding_value=0
        )

    input_ids = _pad([item[0] for item in batch])
    attention_mask = _pad([item[1] for item in batch])
    padded_labels = _pad([item[2] for item in batch])

    # Teacher forcing: decoder sees labels[:-1] and predicts labels[1:].
    # NOTE(review): padding keeps label value 0 (not -100), so the LM loss
    # presumably also covers padded positions — confirm this is intended.
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'decoder_input_ids': padded_labels[:, :-1].contiguous(),
        'labels': padded_labels[:, 1:].contiguous(),
    }
79
+
80
# Deterministic shuffle, then an 80/20 train/test split of the captions.
random.seed(444)
random.shuffle(captions)
split_point = int(0.8 * len(captions))
train_captions, test_captions = captions[:split_point], captions[split_point:]

# Build the train and eval datasets from the split captions.
train_dataset = Text2MusicDataset(configs, train_captions, tokenizer, mode="train", shuffle=True)
print(f"Train Data length: {len(train_dataset)}")
test_dataset = Text2MusicDataset(configs, test_captions, tokenizer, mode="eval", shuffle=False)
print(f"Test Data length: {len(test_dataset)}")
92
+
93
+ # Dataloader
94
+ # train_dataset = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, num_workers=5)
95
+ # test_dataset = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn, num_workers=5)
96
+
97
+ # Create the encoder-decoder model
98
class CustomEncoderDecoderModel(PreTrainedModel):
    """Glue a frozen text encoder to a causal-LM decoder via cross-attention.

    The encoder's last hidden states are handed to the decoder as
    ``encoder_hidden_states``; when ``labels`` is supplied the decoder
    computes the LM loss itself.
    """

    def __init__(self, encoder, decoder, encoder_config, decoder_config):
        # PreTrainedModel requires a config at construction; use the encoder's.
        super().__init__(encoder_config)
        self.encoder = encoder
        self.decoder = decoder
        self.encoder_config = encoder_config
        self.decoder_config = decoder_config

    def forward(self, input_ids, decoder_input_ids, attention_mask=None,
                decoder_attention_mask=None, labels=None, **kwargs):
        """Encode the text, then decode with cross-attention.

        :return: dict with 'loss' (None unless labels given) and 'logits'
        """
        hidden_states = self.encoder(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state

        # The decoder attends over the encoder states (cross-attention).
        decoder_out = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            labels=labels,
        )

        return {'loss': decoder_out.loss, 'logits': decoder_out.logits}
124
+
125
# Frozen FLAN-T5 encoder: only the decoder's weights are trained.
flan_t5_encoder = T5EncoderModel.from_pretrained('google/flan-t5-small')
for param in flan_t5_encoder.parameters():
    param.requires_grad = False

encoder_config = T5Config.from_pretrained('google/flan-t5-small')

# Configure a BERT-style decoder from the YAML model settings.
t2m_model_cfg = configs['model']['text2midi_model']
config_decoder = BertConfig()
config_decoder.vocab_size = vocab_size
config_decoder.max_position_embeddings = t2m_model_cfg['decoder_max_sequence_length']
config_decoder.max_length = t2m_model_cfg['decoder_max_sequence_length']
config_decoder.bos_token_id = tokenizer["BOS_None"]
config_decoder.eos_token_id = tokenizer["EOS_None"]
config_decoder.pad_token_id = 0
config_decoder.num_hidden_layers = t2m_model_cfg['decoder_num_layers']
config_decoder.num_attention_heads = t2m_model_cfg['decoder_num_heads']
config_decoder.hidden_size = t2m_model_cfg['decoder_d_model']
config_decoder.intermediate_size = t2m_model_cfg['decoder_intermediate_size']

# Causal-LM decoder with cross-attention onto the encoder outputs.
config_decoder.is_decoder = True
config_decoder.add_cross_attention = True
config_decoder.tie_encoder_decoder = False
config_decoder.tie_word_embeddings = False

custom_decoder = BertLMHeadModel(config_decoder)

# Assemble the full encoder-decoder model.
model = CustomEncoderDecoderModel(
    encoder=flan_t5_encoder,
    decoder=custom_decoder,
    encoder_config=encoder_config,
    decoder_config=config_decoder,
)

# Report the total parameter count (frozen encoder included).
num_params = sum(p.numel() for p in model.parameters())
print(f"Number of parameters in the model: {num_params}")
166
+
167
# Pick mixed-precision flags for the Trainer based on the hardware:
# bf16 on GPUs that support it, fp16 on other GPUs, full precision otherwise.
USE_CUDA = cuda_available()
print(f"USE_CUDA: {USE_CUDA}")
if not USE_CUDA:
    FP16 = FP16_EVAL = BF16 = BF16_EVAL = False
elif is_bf16_supported():
    BF16 = BF16_EVAL = True
    FP16 = FP16_EVAL = False
else:
    BF16 = BF16_EVAL = False
    FP16 = FP16_EVAL = True
USE_MPS = not USE_CUDA and mps_available()

# Evaluation metrics, keyed by name.
metrics = {metric: load_metric(metric) for metric in ["accuracy"]}
181
+
182
def compute_metrics(eval_pred):
    """
    Compute evaluation metrics (accuracy) over non-padding positions.

    Requires a preprocess_logits function that has already converted logits
    to predicted token ids (argmax or sampling).

    :param eval_pred: EvalPrediction containing predictions and labels
    :return: metrics dict
    """
    predictions, labels = eval_pred
    keep = labels != 0  # 0 is the padding id; exclude those positions
    return metrics["accuracy"].compute(
        predictions=predictions[keep].flatten(),
        references=labels[keep].flatten(),
    )
195
+
196
def preprocess_logits(logits: Tensor, _: Tensor) -> Tensor:
    """
    Reduce logits to predicted token ids (long dtype) before accumulation
    during evaluation.

    Collapsing the vocab axis here significantly reduces memory usage and
    keeps evaluation tractable.
    """
    return argmax(logits, dim=-1)
204
+
205
run_name = configs['training']['text2midi_model']['run_name']
model_dir = os.path.join(artifact_folder, run_name)
log_dir = os.path.join(model_dir, "logs")
# Clear any stale logs before training.  shutil.rmtree replaces the previous
# `os.system(f"rm -rf {log_dir}")`: it is portable (works on Windows), does
# not spawn a shell, and is immune to shell metacharacters in the path.
import shutil
shutil.rmtree(log_dir, ignore_errors=True)

# Hugging Face Trainer arguments; precision flags come from the hardware
# detection above, everything else from the YAML config.
training_args = TrainingArguments(
    output_dir=model_dir,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_strategy="epoch",  # "steps" or "epoch"
    save_total_limit=1,
    learning_rate=learning_rate,
    lr_scheduler_type="cosine_with_restarts",
    warmup_ratio=0.3,
    max_grad_norm=3.0,
    weight_decay=configs['training']['text2midi_model']['weight_decay'],
    num_train_epochs=epochs,
    evaluation_strategy="epoch",
    gradient_accumulation_steps=configs['training']['text2midi_model']['gradient_accumulation_steps'],
    # gradient_checkpointing=True,
    optim="adafactor",
    seed=444,
    logging_strategy="steps",
    logging_steps=10,
    logging_dir=log_dir,
    no_cuda=not USE_CUDA,
    fp16=FP16,
    fp16_full_eval=FP16_EVAL,
    bf16=BF16,
    bf16_full_eval=BF16_EVAL,
    load_best_model_at_end=True,
    # metric_for_best_model="loss",
    greater_is_better=False,
    report_to="tensorboard",
    run_name=run_name,
    push_to_hub=False,
    dataloader_num_workers=5
)
245
+
246
+ # # Define the Trainer
247
+ # trainer = Trainer(
248
+ # model=model,
249
+ # args=training_args,
250
+ # train_dataset=train_dataset,
251
+ # eval_dataset=test_dataset,
252
+ # compute_metrics=compute_metrics,
253
+ # preprocess_logits_for_metrics=preprocess_logits,
254
+ # # callbacks=[EarlyStoppingCallback(early_stopping_patience=30)]
255
+ # )
256
+
257
class CustomTrainer(Trainer):
    """Trainer whose DataLoaders use the module-level collate_fn and a
    fixed worker count for train, eval and test alike."""

    def _build_loader(self, dataset):
        # Shared loader construction for all three phases.
        return DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn, num_workers=5)

    def get_train_dataloader(self):
        return self._build_loader(self.train_dataset)

    def get_eval_dataloader(self, eval_dataset):
        return self._build_loader(eval_dataset)

    def get_test_dataloader(self, test_dataset):
        return self._build_loader(test_dataset)
266
+
267
# Build the Trainer, run training, then persist model, metrics and state.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits,
)

train_result = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_result.metrics)
trainer.save_metrics("train", train_result.metrics)
trainer.save_state()
text2midi_repo/model/transformer_model.py ADDED
@@ -0,0 +1,1509 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from aria.tokenizer import AbsTokenizer
2
+ # aria_tokenizer = AbsTokenizer()
3
+ import copy
4
+ import json
5
+ from typing import Optional, Any, Union, Callable
6
+ import torch.multiprocessing as mp
7
+ from torch.nn import DataParallel
8
+ import jsonlines
9
+ import math
10
+ import time
11
+ import torch
12
+ import os
13
+ import warnings
14
+ from tqdm import tqdm
15
+ from torch import Tensor
16
+ # from aria.tokenizer import AbsTokenizer
17
+ import pickle
18
+ from torch.nn import Module, LayerNorm, Dropout, Linear
19
+ from torch.nn.modules.container import ModuleList
20
+ from torch.nn.modules.activation import MultiheadAttention
21
+ from torch.multiprocessing import Process, set_start_method
22
+ from torch.nn.init import xavier_uniform_
23
+ import torch.nn.functional as F
24
+ import torch.nn as nn
25
+
26
+ from st_moe_pytorch import MoE
27
+ from st_moe_pytorch import SparseMoEBlock
28
+
29
+ from einops import rearrange
30
+
31
+ from transformers import T5Tokenizer, T5EncoderModel
32
+
33
+ import sys
34
+ import torch.distributed as dist
35
+ from torch.nn.parallel import DistributedDataParallel as DDP
36
+ from torch.utils.data import DataLoader, Dataset
37
+
38
+ import torch.profiler
39
+
40
+ from accelerate import Accelerator
41
+ import argparse # Add this import
42
+
43
class CaptionDataset(Dataset):
    """Minimal map-style Dataset over an in-memory list of caption records."""

    def __init__(self, captions):
        # Records are kept as-is; __getitem__ returns them unchanged.
        self.captions = captions

    def __len__(self):
        return len(self.captions)

    def __getitem__(self, idx):
        return self.captions[idx]
52
+
53
def custom_collate_fn(batch):
    """Split a batch of {'caption', 'location'} dicts into two parallel lists.

    :param batch: list of dicts, each with 'caption' and 'location' keys
    :return: (captions, locations) tuple of lists, order preserved
    """
    caption_list = [record['caption'] for record in batch]
    location_list = [record['location'] for record in batch]
    return caption_list, location_list
57
+
58
def ensure_log_dir_exists(log_dir):
    """Create *log_dir* (including parents) if it does not already exist.

    Uses ``os.makedirs(..., exist_ok=True)`` instead of the previous
    check-then-create sequence, which had a race: another process could
    create the directory between the ``os.path.exists`` check and
    ``os.makedirs``, making the latter raise ``FileExistsError``.
    """
    os.makedirs(log_dir, exist_ok=True)
61
+
62
+ __all__ = ['Transformer', 'TransformerEncoder', 'TransformerDecoder', 'TransformerEncoderLayer', 'TransformerDecoderLayer']
63
+
64
+ def _generate_square_subsequent_mask(
65
+ sz: int,
66
+ device: Optional[torch.device] = None,
67
+ dtype: Optional[torch.dtype] = None,
68
+ ) -> Tensor:
69
+ r"""Generate a square causal mask for the sequence.
70
+
71
+ The masked positions are filled with float('-inf'). Unmasked positions are filled with float(0.0).
72
+ """
73
+ if device is None:
74
+ device = torch.device('cpu')
75
+ if dtype is None:
76
+ dtype = torch.float32
77
+ return torch.triu(
78
+ torch.full((sz, sz), float('-inf'), dtype=dtype, device=device),
79
+ diagonal=1,
80
+ )
81
+
82
+
83
+ def _get_seq_len(
84
+ src: Tensor,
85
+ batch_first: bool
86
+ ) -> Optional[int]:
87
+
88
+ if src.is_nested:
89
+ return None
90
+ else:
91
+ src_size = src.size()
92
+ if len(src_size) == 2:
93
+ # unbatched: S, E
94
+ return src_size[0]
95
+ else:
96
+ # batched: B, S, E if batch_first else S, B, E
97
+ seq_len_pos = 1 if batch_first else 0
98
+ return src_size[seq_len_pos]
99
+
100
+
101
class PositionalEncoding(nn.Module):
    r"""Sinusoidal positional encoding added to token embeddings.

    The encodings share the embedding dimension so the two can be summed:
    .. math:
        \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model))
        \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model))
    where pos is the position and i the embedding index.

    Args:
        d_model: the embed dim (required).
        dropout: the dropout value (default=0.1).
        max_len: the max. length of the incoming sequence (default=5000).
    Examples:
        >>> pos_encoder = PositionalEncoding(d_model)
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(positions * div_term)
        table[:, 1::2] = torch.cos(positions * div_term)
        table = table.unsqueeze(0).transpose(0, 1)  # -> (max_len, 1, d_model)
        # Stored as a frozen Parameter rather than a buffer so it shows up in
        # .parameters() / state dicts while never receiving gradients.
        self.register_parameter('pe', nn.Parameter(table, requires_grad=False))

    def forward(self, x):
        r"""Add positional encodings to the input and apply dropout.

        Shape:
            x: [sequence length, batch size, embed dim]
            output: [sequence length, batch size, embed dim]
        Examples:
            >>> output = pos_encoder(x)
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
142
+
143
+
144
def precompute_freqs_cis(
    seq_len: int,
    n_elem: int,
    base: int = 10000,
    dtype: torch.dtype = torch.bfloat16,
):
    """Precompute the rotary-embedding (RoPE) cos/sin cache.

    Returns a tensor of shape (seq_len, n_elem // 2, 2) whose last axis
    holds (cos, sin) for every (position, frequency) pair.
    """
    half = n_elem // 2
    # Inverse frequencies: base^(-2i/n_elem) for i in [0, half).
    inv_freqs = 1.0 / (
        base ** (torch.arange(0, n_elem, 2)[:half].float() / n_elem)
    )
    positions = torch.arange(seq_len, device=inv_freqs.device)
    angles = torch.outer(positions, inv_freqs)
    # Unit complex numbers e^{i*angle}; split into real/imag = cos/sin.
    unit = torch.polar(torch.ones_like(angles), angles)
    cache = torch.stack([unit.real, unit.imag], dim=-1)

    return cache.to(dtype=dtype)
159
+
160
+
161
@torch.jit.script
def apply_rotary_emb(x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
    """
    In-place RoPE. Credits to Katherine Crowson:
    x shape (b_sz, n_head, s_len, d_head).
    cos, sin shape (s_len, d_head // 2).
    """

    # Move heads after the sequence axis so positions broadcast cleanly.
    x = x.permute(0, 2, 1, 3)
    d = x.shape[-1] // 2
    # freqs_cis[..., 0] is cos and [..., 1] is sin; insert batch/head dims.
    cos = freqs_cis[..., 0][None, :, None]
    sin = freqs_cis[..., 1][None, :, None]
    x1, x2 = x[..., :d], x[..., d : d * 2]
    tmp = x1.clone()
    # x1.mul_(cos).addcmul_(x2, sin, value=-1)
    # x2.mul_(cos).addcmul_(tmp, sin, value=1) ##was throwing some error: RuntimeError: Output 0 of SliceBackward0 is a view and is being modified inplace. This view is the output of a function that returns multiple views. Such functions do not allow the output views to be modified inplace. You should replace the inplace operation by an out-of-place one.
    # Out-of-place rotation: (x1, x2) -> (x1*cos - x2*sin, x2*cos + x1*sin).
    x1_new = x1.mul(cos) - x2.mul(sin)
    x2_new = x2.mul(cos) + tmp.mul(sin)
    x = torch.cat((x1_new, x2_new), dim=-1)
    # Restore the original (batch, heads, seq, d_head) layout.
    x = x.permute(0, 2, 1, 3)

    return x
183
+
184
+
185
class MultiHeadSelfAttention(nn.Module):
    r"""Multi-head self-attention with a fused QKV projection.

    Args:
        embed_dim (int): The input embedding dimension.
        num_heads (int, optional): The number of attention heads (default: 4).
        dropout (float, optional): The dropout probability (default: 0.1).
        batch_first (bool, optional): If False, inputs are (seq, batch, dim).
        device (torch.device, optional): The device to use (default: None).
        dtype (torch.dtype, optional): The data type to use (default: None).

    Attributes:
        dim_head (int): Dimension of each attention head.
        scale (float): Scaling factor for attention scores.
        heads (int): Number of attention heads.
        to_qkv (nn.Linear): Projects input to query, key and value at once.
        to_out (nn.Linear): Projects attention output back to embed_dim.
        dropout (nn.Dropout): Dropout layer applied to the attention output.
    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int = 4,
        dropout: float = 0.1,
        batch_first: bool = True,
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ):
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        self.embed_dim = embed_dim
        self.batch_first = batch_first
        self.dim_head = embed_dim // num_heads
        self.scale = self.dim_head ** -0.5
        self.heads = num_heads
        inner_dim = self.dim_head * num_heads
        # One matmul yields Q, K and V together.
        self.to_qkv = nn.Linear(embed_dim, inner_dim * 3, bias=False, **factory_kwargs)
        self.to_out = nn.Linear(inner_dim, embed_dim, bias=False, **factory_kwargs)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor, is_causal: bool = True) -> torch.Tensor:
        r"""Run self-attention over *x*.

        Args:
            x (torch.Tensor): (batch, seq, embed_dim) when batch_first,
                otherwise (seq, batch, embed_dim).
            is_causal (bool): apply a causal mask inside SDPA.

        Returns:
            torch.Tensor: attention output of shape (batch, seq, embed_dim).
        """
        if not self.batch_first:
            x = x.transpose(0, 1)
        bsz, seq, _ = x.size()
        fused = self.to_qkv(x)
        q, k, v = (
            t.contiguous().view(bsz, self.heads, seq, -1)
            for t in torch.chunk(fused, chunks=3, dim=-1)
        )

        # RoPE cache is rebuilt on every call; the rotary application below
        # is currently disabled, so this only refreshes self.freqs_cis.
        self.freqs_cis = precompute_freqs_cis(
            seq_len=seq,
            n_elem=self.embed_dim // self.heads,
            base=10000,
            dtype=x.dtype,
        ).to(x.device)
        freqs_cis = self.freqs_cis[: x.shape[1]]
        # q = apply_rotary_emb(q, freqs_cis)
        # k = apply_rotary_emb(k, freqs_cis)
        attn = torch.nn.functional.scaled_dot_product_attention(q, k, v, is_causal=is_causal)
        attn = attn.contiguous().view(bsz, seq, -1)
        attn = self.dropout(attn)
        return self.to_out(attn)
256
+
257
+
258
+ class Transformer(Module):
259
+ r"""A transformer model.
260
+
261
+ User is able to modify the attributes as needed. The architecture
262
+ is based on the paper "Attention Is All You Need". Ashish Vaswani, Noam Shazeer,
263
+ Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and
264
+ Illia Polosukhin. 2017. Attention is all you need. In Advances in Neural Information
265
+ Processing Systems, pages 6000-6010.
266
+
267
+ Args:
268
+ d_model: the number of expected features in the encoder/decoder inputs (default=512).
269
+ nhead: the number of heads in the multiheadattention models (default=8).
270
+ num_encoder_layers: the number of sub-encoder-layers in the encoder (default=6).
271
+ num_decoder_layers: the number of sub-decoder-layers in the decoder (default=6).
272
+ dim_feedforward: the dimension of the feedforward network model (default=2048).
273
+ use_moe: if True, use MoE instead of linear layer for feedforward network (default=False).
274
+ dropout: the dropout value (default=0.1).
275
+ activation: the activation function of encoder/decoder intermediate layer, can be a string
276
+ ("relu" or "gelu") or a unary callable. Default: relu
277
+ custom_encoder: custom encoder (default=None).
278
+ custom_decoder: custom decoder (default=None).
279
+ layer_norm_eps: the eps value in layer normalization components (default=1e-5).
280
+ batch_first: If ``True``, then the input and output tensors are provided
281
+ as (batch, seq, feature). Default: ``False`` (seq, batch, feature).
282
+ norm_first: if ``True``, encoder and decoder layers will perform LayerNorms before
283
+ other attention and feedforward operations, otherwise after. Default: ``False`` (after).
284
+ bias: If set to ``False``, ``Linear`` and ``LayerNorm`` layers will not learn an additive
285
+ bias. Default: ``True``.
286
+
287
+ Examples::
288
+ >>> transformer_model = nn.Transformer(nhead=16, num_encoder_layers=12)
289
+ >>> src = torch.rand((32, 512))
290
+ >>> tgt = torch.rand((32, 512, 30000))
291
+ >>> out = transformer_model(src, tgt)
292
+
293
+ Note: A full example to apply nn.Transformer module for the word language model is available in
294
+ https://github.com/pytorch/examples/tree/master/word_language_model
295
+ """
296
+
297
+ def __init__(self, n_vocab: int = 30000, d_model: int = 512, nhead: int = 8, max_len: int = 5000,
298
+ num_decoder_layers: int = 6, dim_feedforward: int = 2048, use_moe: bool = False,
299
+ num_experts: int = 16, dropout: float = 0.1,
300
+ activation: Union[str, Callable[[Tensor], Tensor]] = F.relu,
301
+ layer_norm_eps: float = 1e-5, batch_first: bool = True, norm_first: bool = False,
302
+ bias: bool = True, device=None, dtype=None) -> None:
303
+ factory_kwargs = {'device': device, 'dtype': dtype}
304
+ super().__init__()
305
+ torch._C._log_api_usage_once(f"torch.nn.modules.{self.__class__.__name__}")
306
+
307
+ self.use_moe = use_moe
308
+
309
+ self.input_emb = nn.Embedding(n_vocab, d_model, **factory_kwargs)
310
+ self.pos_encoder = PositionalEncoding(d_model, dropout, max_len).to(device)
311
+
312
+ # Load the FLAN-T5 encoder
313
+ self.encoder = T5EncoderModel.from_pretrained("google/flan-t5-base").to(device)
314
+ # Freeze the encoder
315
+ for param in self.encoder.parameters():
316
+ param.requires_grad = False
317
+
318
+ decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, use_moe, num_experts, dropout,
319
+ activation, layer_norm_eps, batch_first, norm_first,
320
+ bias, **factory_kwargs)
321
+ decoder_norm = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs)
322
+ self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, use_moe, decoder_norm)
323
+
324
+ self.projection = nn.Linear(d_model, n_vocab).to(device)
325
+
326
+ self._reset_parameters()
327
+
328
+ self.d_model = d_model
329
+ self.nhead = nhead
330
+
331
+ self.batch_first = batch_first
332
+
333
+ def forward(self, src: Tensor, src_mask: Tensor, tgt: Tensor, memory_mask: Optional[Tensor] = None,
334
+ memory_key_padding_mask: Optional[Tensor] = None, tgt_is_causal: bool = True,
335
+ memory_is_causal: bool = False) -> Tensor:
336
+ r"""Take in and process masked source/target sequences.
337
+
338
+ .. note::
339
+
340
+ If a boolean tensor is provided for any of the [src/tgt/memory]_mask arguments, positions with a ``True`` value are
341
+ not allowed to participate in the attention,
342
+ which is the opposite of the definition for :attr:`attn_mask`
343
+ in :func:`torch.nn.functional.scaled_dot_product_attention`.
344
+
345
+ Args:
346
+ src: the sequence to the encoder (required).
347
+ src_attn_mask: the attention mask for the src sequence (required).
348
+ tgt: the sequence to the decoder (required).
349
+ tgt_mask: the additive mask for the tgt sequence (optional).
350
+ memory_mask: the additive mask for the encoder output (optional).
351
+ tgt_key_padding_mask: the Tensor mask for tgt keys per batch (optional).
352
+ memory_key_padding_mask: the Tensor mask for memory keys per batch (optional).
353
+ tgt_is_causal: If specified, applies a causal mask as ``tgt_mask``.
354
+ Default: ``None``; try to detect a causal mask.
355
+ Warning:
356
+ ``tgt_is_causal`` provides a hint that ``tgt_mask`` is
357
+ the causal mask. Providing incorrect hints can result in
358
+ incorrect execution, including forward and backward
359
+ compatibility.
360
+ memory_is_causal: If specified, applies a causal mask as
361
+ ``memory_mask``.
362
+ Default: ``False``.
363
+ Warning:
364
+ ``memory_is_causal`` provides a hint that
365
+ ``memory_mask`` is the causal mask. Providing incorrect
366
+ hints can result in incorrect execution, including
367
+ forward and backward compatibility.
368
+
369
+ Shape:
370
+ - src: :math:`(S, S)` for unbatched input, :math:`(S, N)` if `batch_first=False` or
371
+ `(N, S)` if `batch_first=True`.
372
+ - src_mask: :math:`(S, S)` or :math:`(N\cdot\text{num\_heads}, S, S)`.
373
+ - tgt: :math:`(T, E)` for unbatched input, :math:`(T, N, E)` if `batch_first=False` or
374
+ `(N, T, E)` if `batch_first=True`.
375
+ - tgt_mask: :math:`(T, T)` or :math:`(N\cdot\text{num\_heads}, T, T)`.
376
+ - memory_mask: :math:`(T, S)`.
377
+ - src_key_padding_mask: :math:`(S)` for unbatched input otherwise :math:`(N, S)`.
378
+ - tgt_key_padding_mask: :math:`(T)` for unbatched input otherwise :math:`(N, T)`.
379
+ - memory_key_padding_mask: :math:`(S)` for unbatched input otherwise :math:`(N, S)`.
380
+
381
+ Note: [src/tgt/memory]_mask ensures that position :math:`i` is allowed to attend the unmasked
382
+ positions. If a BoolTensor is provided, positions with ``True``
383
+ are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
384
+ is provided, it will be added to the attention weight.
385
+ [src/tgt/memory]_key_padding_mask provides specified elements in the key to be ignored by
386
+ the attention. If a BoolTensor is provided, the positions with the
387
+ value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
388
+
389
+ - output: :math:`(T, E)` for unbatched input, :math:`(T, N, E)` if `batch_first=False` or
390
+ `(N, T, E)` if `batch_first=True`.
391
+
392
+ Note: Due to the multi-head attention architecture in the transformer model,
393
+ the output sequence length of a transformer is same as the input sequence
394
+ (i.e. target) length of the decoder.
395
+
396
+ where :math:`S` is the source sequence length, :math:`T` is the target sequence length, :math:`N` is the
397
+ batch size, :math:`E` is the feature number
398
+
399
+ Examples:
400
+ >>> # xdoctest: +SKIP
401
+ >>> output = transformer_model(src, tgt, src_mask=src_mask)
402
+ """
403
+ if src.dim() != tgt.dim():
404
+ raise RuntimeError("the number of dimensions in src and tgt must be equal")
405
+
406
+ memory = self.encoder(src, attention_mask=src_mask).last_hidden_state
407
+
408
+ tgt = self.input_emb(tgt) * math.sqrt(self.d_model)
409
+ tgt = self.pos_encoder(tgt)
410
+ # tgt = tgt + tgt_pos
411
+
412
+ if self.use_moe:
413
+ with torch.cuda.amp.autocast(enabled =False):
414
+ output, sum_total_aux_loss = self.decoder(tgt, memory, memory_mask=memory_mask,
415
+ memory_key_padding_mask=memory_key_padding_mask,
416
+ tgt_is_causal=tgt_is_causal, memory_is_causal=memory_is_causal)
417
+ else:
418
+ output = self.decoder(tgt, memory, memory_mask=memory_mask,
419
+ memory_key_padding_mask=memory_key_padding_mask,
420
+ tgt_is_causal=tgt_is_causal, memory_is_causal=memory_is_causal)
421
+
422
+ output = self.projection(output)
423
+ # output = F.log_softmax(output, dim=-1)
424
+
425
+ if self.use_moe:
426
+ return output, sum_total_aux_loss
427
+ else:
428
+ return output
429
+
430
+ def generate(self, src: Tensor, src_mask: Tensor, max_len: int = 100, temperature: float = 1.0):
431
+ ## ADD A START OF SEQUENCE TOKEN <SS> token to the src tensor
432
+ r"""Generate a sequence of tokens from the given inputs.
433
+
434
+ Args:
435
+ src: the sequence to the encoder (required).
436
+ src_mask: the attention mask for the src sequence (required).
437
+ max_len: the maximum length of the sequence to generate (default=100).
438
+ temperature: the temperature for the softmax (default=1.0).
439
+
440
+ Returns:
441
+ torch.Tensor: The generated sequence of tokens.
442
+
443
+ """
444
+ if src.dim() != 2:
445
+ raise RuntimeError("The src tensor should be 2-dimensional")
446
+ tgt_fin = torch.full((src.size(0), 1), 1, dtype=torch.long, device=src.device)
447
+ # values = [21631, 8, 10, 9, 6, 7, 17, 21632, 11474, 20626, 21151, 9426, 20627, 21143, 11476, 20640, 21143, 11477, 20655, 21145, 11476, 20669, 21145, 11477, 20683, 21145, 13527, 20697, 21146, 13529, 20712, 21145, 7013, 20769, 21143, 7006, 20769, 21143, 7006, 20769, 21141, 7009, 20769, 21143, 9426, 20797, 21144, 11474, 20797, 21173, 11476, 20812, 21144, 11477, 20826, 21145, 11476, 20840, 21145, 11477, 20855, 21145, 13527, 20869, 21144, 13529, 20883, 21143, 7006, 20940, 21139, 7013, 20940, 21140, 7006, 20940, 21147, 7009, 20940, 21147, 11474, 20969, 21144, 11474, 20969, 21170, 11476, 20983, 21144, 11477, 20997, 21145, 11476, 21012, 21144, 11477, 21026, 21144, 11479, 21040]
448
+ # values_tensor = torch.tensor(values, dtype=torch.long, device=src.device)
449
+ # tgt_fin = values_tensor.unsqueeze(0).repeat(src.size(0), 1)
450
+ for i in tqdm(range(max_len)):
451
+ max_index = tgt_fin.max()
452
+ # assert max_index < 21634, "tgt_fin contains index out of range. Adjust n_vocab or fix tgt_fin indices."
453
+ tgt = tgt_fin
454
+ if self.use_moe:
455
+ output, _ = self.froward(src, src_mask, tgt, memory_mask=None,
456
+ memory_key_padding_mask=None,
457
+ tgt_is_causal=True, memory_is_causal=False)
458
+ else:
459
+ output = self.forward(src, src_mask, tgt, memory_mask=None,
460
+ memory_key_padding_mask=None,
461
+ tgt_is_causal=True, memory_is_causal=False)
462
+ # logits = self.projection(output)
463
+ logits = output
464
+ output = F.log_softmax(logits/temperature, dim=-1)
465
+ output = output.view(-1, output.size(-1))
466
+ next_tokens = torch.multinomial(torch.exp(output), 1)[-1] # taking the last logit and adding to the sequence
467
+ tgt_fin = torch.cat((tgt_fin, next_tokens.unsqueeze(-1)), dim=1)
468
+ return tgt_fin[:, 1:]
469
+
470
+ @staticmethod
471
+ def generate_square_subsequent_mask(
472
+ sz: int,
473
+ device: Optional[torch.device] = None,
474
+ dtype: Optional[torch.dtype] = None,
475
+ ) -> Tensor:
476
+ r"""Generate a square causal mask for the sequence.
477
+
478
+ The masked positions are filled with float('-inf'). Unmasked positions are filled with float(0.0).
479
+ """
480
+ return _generate_square_subsequent_mask(sz, dtype=dtype, device=device)
481
+
482
+
483
+ def _reset_parameters(self):
484
+ r"""Initiate parameters in the transformer model."""
485
+ for p in self.parameters():
486
+ if p.dim() > 1:
487
+ xavier_uniform_(p)
488
+
489
+
490
+
491
+
492
class TransformerEncoder(Module):
    r"""TransformerEncoder is a stack of N encoder layers.

    Users can build the BERT(https://arxiv.org/abs/1810.04805) model with corresponding parameters.

    Args:
        encoder_layer: an instance of the TransformerEncoderLayer() class (required).
        num_layers: the number of sub-encoder-layers in the encoder (required).
        norm: the layer normalization component (optional).
        enable_nested_tensor: if True, input will automatically convert to nested tensor
            (and convert back on output). This will improve the overall performance of
            TransformerEncoder when padding rate is high. Default: ``True`` (enabled).
        mask_check: if True, checks at forward time that ``src_key_padding_mask`` is
            left-aligned before converting the input to a nested tensor.

    Examples::
        >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
        >>> transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
        >>> src = torch.rand(10, 32, 512)
        >>> out = transformer_encoder(src)
    """

    __constants__ = ['norm']

    def __init__(
        self,
        encoder_layer: "TransformerEncoderLayer",
        num_layers: int,
        norm: Optional[Module] = None,
        enable_nested_tensor: bool = True,
        mask_check: bool = True
    ) -> None:
        super().__init__()
        torch._C._log_api_usage_once(f"torch.nn.modules.{self.__class__.__name__}")
        # Deep copies of the prototype layer: parameters are NOT shared across layers.
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        # this attribute saves the value provided at object construction
        self.enable_nested_tensor = enable_nested_tensor
        # this attribute controls whether nested tensors are used
        self.use_nested_tensor = enable_nested_tensor
        self.mask_check = mask_check

        # Decide once at construction whether the nested-tensor "sparsity fast
        # path" could ever apply; record the first disqualifying reason found.
        enc_layer = "encoder_layer"
        why_not_sparsity_fast_path = ''
        if not isinstance(encoder_layer, torch.nn.TransformerEncoderLayer):
            why_not_sparsity_fast_path = f"{enc_layer} was not TransformerEncoderLayer"
        elif encoder_layer.norm_first:
            why_not_sparsity_fast_path = f"{enc_layer}.norm_first was True"
        elif not encoder_layer.self_attn.batch_first:
            why_not_sparsity_fast_path = (f"{enc_layer}.self_attn.batch_first was not True" +
                                          "(use batch_first for better inference performance)")
        elif not encoder_layer.self_attn._qkv_same_embed_dim:
            why_not_sparsity_fast_path = f"{enc_layer}.self_attn._qkv_same_embed_dim was not True"
        elif encoder_layer.self_attn.in_proj_bias is None:
            why_not_sparsity_fast_path = f"{enc_layer}.self_attn was passed bias=False"
        elif not encoder_layer.activation_relu_or_gelu:
            why_not_sparsity_fast_path = f"{enc_layer}.activation_relu_or_gelu was not True"
        elif not (encoder_layer.norm1.eps == encoder_layer.norm2.eps):
            why_not_sparsity_fast_path = f"{enc_layer}.norm1.eps was not equal to {enc_layer}.norm2.eps"
        elif encoder_layer.self_attn.num_heads % 2 == 1:
            why_not_sparsity_fast_path = f"{enc_layer}.self_attn.num_heads is odd"

        if enable_nested_tensor and why_not_sparsity_fast_path:
            warnings.warn(f"enable_nested_tensor is True, but self.use_nested_tensor is False because {why_not_sparsity_fast_path}")
            self.use_nested_tensor = False

    def forward(
        self,
        src: Tensor,
        mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        is_causal: Optional[bool] = None) -> Tensor:
        r"""Pass the input through the encoder layers in turn.

        Args:
            src: the sequence to the encoder (required).
            mask: the mask for the src sequence (optional).
            src_key_padding_mask: the mask for the src keys per batch (optional).
            is_causal: If specified, applies a causal mask as ``mask``.
                Default: ``None``; try to detect a causal mask.
            Warning:
            ``is_causal`` provides a hint that ``mask`` is the
            causal mask. Providing incorrect hints can result in
            incorrect execution, including forward and backward
            compatibility.

        Shape:
            see the docs in :class:`~torch.nn.Transformer`.
        """
        # Normalize both masks to a canonical dtype (bool masks become float
        # masks where appropriate) so downstream kernels see a consistent form.
        src_key_padding_mask = F._canonical_mask(
            mask=src_key_padding_mask,
            mask_name="src_key_padding_mask",
            other_type=F._none_or_dtype(mask),
            other_name="mask",
            target_type=src.dtype
        )

        mask = F._canonical_mask(
            mask=mask,
            mask_name="mask",
            other_type=None,
            other_name="",
            target_type=src.dtype,
            check_other=False,
        )

        output = src
        convert_to_nested = False
        first_layer = self.layers[0]
        src_key_padding_mask_for_layers = src_key_padding_mask
        why_not_sparsity_fast_path = ''
        str_first_layer = "self.layers[0]"
        batch_first = first_layer.self_attn.batch_first
        # is_fastpath_enabled = torch.backends.mha.get_fastpath_enabled()

        # Runtime eligibility checks for the nested-tensor fast path; the first
        # failing check records why and disables the conversion below.
        # if not is_fastpath_enabled:
        #     why_not_sparsity_fast_path = "torch.backends.mha.get_fastpath_enabled() was not True"
        if not hasattr(self, "use_nested_tensor"):
            why_not_sparsity_fast_path = "use_nested_tensor attribute not present"
        elif not self.use_nested_tensor:
            why_not_sparsity_fast_path = "self.use_nested_tensor (set in init) was not True"
        elif first_layer.training:
            why_not_sparsity_fast_path = f"{str_first_layer} was in training mode"
        elif not src.dim() == 3:
            why_not_sparsity_fast_path = f"input not batched; expected src.dim() of 3 but got {src.dim()}"
        elif src_key_padding_mask is None:
            why_not_sparsity_fast_path = "src_key_padding_mask was None"
        elif (((not hasattr(self, "mask_check")) or self.mask_check)
                and not torch._nested_tensor_from_mask_left_aligned(src, src_key_padding_mask.logical_not())):
            why_not_sparsity_fast_path = "mask_check enabled, and src and src_key_padding_mask was not left aligned"
        elif output.is_nested:
            why_not_sparsity_fast_path = "NestedTensor input is not supported"
        elif mask is not None:
            why_not_sparsity_fast_path = "src_key_padding_mask and mask were both supplied"
        elif torch.is_autocast_enabled():
            why_not_sparsity_fast_path = "autocast is enabled"

        if not why_not_sparsity_fast_path:
            # All tensors the fused kernel would touch; used for the final
            # device / torch_function / requires_grad eligibility checks.
            tensor_args = (
                src,
                first_layer.self_attn.in_proj_weight,
                first_layer.self_attn.in_proj_bias,
                first_layer.self_attn.out_proj.weight,
                first_layer.self_attn.out_proj.bias,
                first_layer.norm1.weight,
                first_layer.norm1.bias,
                first_layer.norm2.weight,
                first_layer.norm2.bias,
                first_layer.linear1.weight,
                first_layer.linear1.bias,
                first_layer.linear2.weight,
                first_layer.linear2.bias,
            )
            _supported_device_type = ["cpu", "cuda", torch.utils.backend_registration._privateuse1_backend_name]
            if torch.overrides.has_torch_function(tensor_args):
                why_not_sparsity_fast_path = "some Tensor argument has_torch_function"
            elif src.device.type not in _supported_device_type:
                why_not_sparsity_fast_path = f"src device is neither one of {_supported_device_type}"
            elif torch.is_grad_enabled() and any(x.requires_grad for x in tensor_args):
                why_not_sparsity_fast_path = ("grad is enabled and at least one of query or the "
                                              "input/output projection weights or biases requires_grad")

            if (not why_not_sparsity_fast_path) and (src_key_padding_mask is not None):
                # Fast path taken: pack padded input into a nested tensor and
                # drop the padding mask (padding is now implicit in the layout).
                convert_to_nested = True
                output = torch._nested_tensor_from_mask(output, src_key_padding_mask.logical_not(), mask_check=False)
                src_key_padding_mask_for_layers = None

        seq_len = _get_seq_len(src, batch_first)
        is_causal = _detect_is_causal_mask(mask, is_causal, seq_len)

        # Standard path: thread the activation through each encoder layer.
        for mod in self.layers:
            output = mod(output, src_mask=mask, is_causal=is_causal, src_key_padding_mask=src_key_padding_mask_for_layers)

        if convert_to_nested:
            # Restore the dense, padded layout expected by callers.
            output = output.to_padded_tensor(0., src.size())

        if self.norm is not None:
            output = self.norm(output)

        return output
673
+
674
+
675
+
676
+
677
class TransformerDecoder(Module):
    r"""A stack of ``num_layers`` identical decoder layers.

    Args:
        decoder_layer: an instance of the TransformerDecoderLayer() class (required).
        num_layers: the number of sub-decoder-layers in the decoder (required).
        use_moe: when ``True``, each layer is expected to return auxiliary
            mixture-of-experts losses alongside its output, and ``forward``
            returns ``(output, summed_aux_loss)``.
        norm: the layer normalization component (optional).

    Examples::
        >>> decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8)
        >>> transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
        >>> memory = torch.rand(10, 32, 512)
        >>> tgt = torch.rand(20, 32, 512)
        >>> out = transformer_decoder(tgt, memory)
    """

    __constants__ = ['norm']

    def __init__(
        self,
        decoder_layer: "TransformerDecoderLayer",
        num_layers: int,
        use_moe: bool = False,
        norm: Optional[Module] = None
    ) -> None:
        super().__init__()
        torch._C._log_api_usage_once(f"torch.nn.modules.{self.__class__.__name__}")
        # Independent deep copies of the prototype layer (no weight sharing).
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.use_moe = use_moe

    def forward(self, tgt: Tensor, memory: Tensor, tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None, tgt_is_causal: Optional[bool] = None,
                memory_is_causal: bool = False) -> Tensor:
        r"""Pass the inputs (and mask) through each decoder layer in turn.

        Args:
            tgt: the sequence to the decoder (required).
            memory: the sequence from the last layer of the encoder (required).
            tgt_mask: the mask for the tgt sequence (optional).
            memory_mask: the mask for the memory sequence (optional).
            memory_key_padding_mask: the mask for the memory keys per batch (optional).
            tgt_is_causal: if specified, applies a causal mask as ``tgt mask``;
                ``None`` (default) tries to detect a causal mask. An incorrect
                hint can produce incorrect results.
            memory_is_causal: if ``True``, applies a causal mask as the memory
                mask (default ``False``). An incorrect hint can produce
                incorrect results.

        Returns:
            The decoded sequence; when ``self.use_moe`` is set, a tuple of
            ``(output, summed auxiliary MoE loss)`` instead.

        Shape:
            see the docs in :class:`~torch.nn.Transformer`.
        """
        out = tgt

        # Resolve the causality hint from the mask shape when not given.
        first_attn = self.layers[0].self_attn
        seq_len = _get_seq_len(tgt, first_attn.batch_first)
        causal = _detect_is_causal_mask(tgt_mask, tgt_is_causal, seq_len)

        aux_loss_total = 0
        for layer in self.layers:
            if self.use_moe:
                # MoE layers report (output, total_aux, balance, router_z);
                # only the total auxiliary loss is accumulated here.
                out, layer_aux_loss, _balance_loss, _router_z_loss = layer(
                    out, memory,
                    memory_mask=memory_mask,
                    memory_key_padding_mask=memory_key_padding_mask,
                    tgt_is_causal=causal,
                    memory_is_causal=memory_is_causal)
                aux_loss_total += layer_aux_loss
            else:
                out = layer(out, memory,
                            memory_mask=memory_mask,
                            memory_key_padding_mask=memory_key_padding_mask,
                            tgt_is_causal=causal,
                            memory_is_causal=memory_is_causal)

        if self.norm is not None:
            out = self.norm(out)

        return (out, aux_loss_total) if self.use_moe else out
771
+
772
+
773
+
774
class TransformerEncoderLayer(Module):
    r"""TransformerEncoderLayer is made up of self-attn and feedforward network.

    This standard encoder layer is based on the paper "Attention Is All You Need".
    Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
    Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in
    Neural Information Processing Systems, pages 6000-6010. Users may modify or implement
    in a different way during application.

    TransformerEncoderLayer can handle either traditional torch.tensor inputs,
    or Nested Tensor inputs. Derived classes are expected to similarly accept
    both input formats. (Not all combinations of inputs are currently
    supported by TransformerEncoderLayer while Nested Tensor is in prototype
    state.)

    If you are implementing a custom layer, you may derive it either from
    the Module or TransformerEncoderLayer class. If your custom layer
    supports both torch.Tensors and Nested Tensors inputs, make its
    implementation a derived class of TransformerEncoderLayer. If your custom
    Layer supports only torch.Tensor inputs, derive its implementation from
    Module.

    Args:
        d_model: the number of expected features in the input (required).
        nhead: the number of heads in the multiheadattention models (required).
        dim_feedforward: the dimension of the feedforward network model (default=2048).
        dropout: the dropout value (default=0.1).
        activation: the activation function of the intermediate layer, can be a string
            ("relu" or "gelu") or a unary callable. Default: relu
        layer_norm_eps: the eps value in layer normalization components (default=1e-5).
        batch_first: If ``True``, then the input and output tensors are provided
            as (batch, seq, feature). Default: ``False`` (seq, batch, feature).
        norm_first: if ``True``, layer norm is done prior to attention and feedforward
            operations, respectively. Otherwise it's done after. Default: ``False`` (after).
        bias: If set to ``False``, ``Linear`` and ``LayerNorm`` layers will not learn an additive
            bias. Default: ``True``.

    Examples::
        >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
        >>> src = torch.rand(10, 32, 512)
        >>> out = encoder_layer(src)

    Alternatively, when ``batch_first`` is ``True``:
        >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True)
        >>> src = torch.rand(32, 10, 512)
        >>> out = encoder_layer(src)

    Fast path:
        forward() will use a special optimized implementation described in
        `FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness`_ if all of the following
        conditions are met:

        - Either autograd is disabled (using ``torch.inference_mode`` or ``torch.no_grad``) or no tensor
          argument ``requires_grad``
        - training is disabled (using ``.eval()``)
        - batch_first is ``True`` and the input is batched (i.e., ``src.dim() == 3``)
        - activation is one of: ``"relu"``, ``"gelu"``, ``torch.functional.relu``, or ``torch.functional.gelu``
        - at most one of ``src_mask`` and ``src_key_padding_mask`` is passed
        - if src is a `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_, neither ``src_mask``
          nor ``src_key_padding_mask`` is passed
        - the two ``LayerNorm`` instances have a consistent ``eps`` value (this will naturally be the case
          unless the caller has manually modified one without modifying the other)

        If the optimized implementation is in use, a
        `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_ can be
        passed for ``src`` to represent padding more efficiently than using a padding
        mask. In this case, a `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_ will be
        returned, and an additional speedup proportional to the fraction of the input that
        is padding can be expected.

    .. _`FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness`:
         https://arxiv.org/abs/2205.14135

    """

    __constants__ = ['norm_first']

    def __init__(self, d_model: int, nhead: int, dim_feedforward: int = 2048, dropout: float = 0.1,
                 activation: Union[str, Callable[[Tensor], Tensor]] = F.relu,
                 layer_norm_eps: float = 1e-5, batch_first: bool = False, norm_first: bool = False,
                 bias: bool = True, device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout,
                                            bias=bias, batch_first=batch_first,
                                            **factory_kwargs)
        # Implementation of Feedforward model
        self.linear1 = Linear(d_model, dim_feedforward, bias=bias, **factory_kwargs)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model, bias=bias, **factory_kwargs)

        self.norm_first = norm_first
        self.norm1 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs)
        self.norm2 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

        # Legacy string support for activation function.
        if isinstance(activation, str):
            activation = _get_activation_fn(activation)

        # We can't test self.activation in forward() in TorchScript,
        # so stash some information about it instead.
        # (1 = relu, 2 = gelu, 0 = other; used by the fast-path checks.)
        if activation is F.relu or isinstance(activation, torch.nn.ReLU):
            self.activation_relu_or_gelu = 1
        elif activation is F.gelu or isinstance(activation, torch.nn.GELU):
            self.activation_relu_or_gelu = 2
        else:
            self.activation_relu_or_gelu = 0
        self.activation = activation

    def __setstate__(self, state):
        # Backward compatibility: older checkpoints may lack `activation`.
        super().__setstate__(state)
        if not hasattr(self, 'activation'):
            self.activation = F.relu

    def forward(
        self,
        src: Tensor,
        src_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        is_causal: bool = False) -> Tensor:
        r"""Pass the input through the encoder layer.

        Args:
            src: the sequence to the encoder layer (required).
            src_mask: the mask for the src sequence (optional).
            src_key_padding_mask: the mask for the src keys per batch (optional).
            is_causal: If specified, applies a causal mask as ``src mask``.
                Default: ``False``.
            Warning:
            ``is_causal`` provides a hint that ``src_mask`` is the
            causal mask. Providing incorrect hints can result in
            incorrect execution, including forward and backward
            compatibility.

        Shape:
            see the docs in :class:`~torch.nn.Transformer`.
        """
        # Normalize masks to a canonical form before any dispatch decision.
        src_key_padding_mask = F._canonical_mask(
            mask=src_key_padding_mask,
            mask_name="src_key_padding_mask",
            other_type=F._none_or_dtype(src_mask),
            other_name="src_mask",
            target_type=src.dtype
        )

        src_mask = F._canonical_mask(
            mask=src_mask,
            mask_name="src_mask",
            other_type=None,
            other_name="",
            target_type=src.dtype,
            check_other=False,
        )

        # is_fastpath_enabled = torch.backends.mha.get_fastpath_enabled()

        # see Fig. 1 of https://arxiv.org/pdf/2002.04745v1.pdf
        # Runtime eligibility checks for the fused encoder-layer kernel; the
        # first failing check records why and forces the regular Python path.
        why_not_sparsity_fast_path = ''
        # if not is_fastpath_enabled:
        #     why_not_sparsity_fast_path = "torch.backends.mha.get_fastpath_enabled() was not True"
        if not src.dim() == 3:
            why_not_sparsity_fast_path = f"input not batched; expected src.dim() of 3 but got {src.dim()}"
        elif self.training:
            why_not_sparsity_fast_path = "training is enabled"
        elif not self.self_attn.batch_first:
            why_not_sparsity_fast_path = "self_attn.batch_first was not True"
        elif self.self_attn.in_proj_bias is None:
            why_not_sparsity_fast_path = "self_attn was passed bias=False"
        elif not self.self_attn._qkv_same_embed_dim:
            why_not_sparsity_fast_path = "self_attn._qkv_same_embed_dim was not True"
        elif not self.activation_relu_or_gelu:
            why_not_sparsity_fast_path = "activation_relu_or_gelu was not True"
        elif not (self.norm1.eps == self.norm2.eps):
            why_not_sparsity_fast_path = "norm1.eps is not equal to norm2.eps"
        elif src.is_nested and (src_key_padding_mask is not None or src_mask is not None):
            why_not_sparsity_fast_path = "neither src_key_padding_mask nor src_mask are not supported with NestedTensor input"
        elif self.self_attn.num_heads % 2 == 1:
            why_not_sparsity_fast_path = "num_head is odd"
        elif torch.is_autocast_enabled():
            why_not_sparsity_fast_path = "autocast is enabled"
        if not why_not_sparsity_fast_path:
            # Every tensor the fused kernel reads; used for device /
            # torch_function / requires_grad eligibility checks below.
            tensor_args = (
                src,
                self.self_attn.in_proj_weight,
                self.self_attn.in_proj_bias,
                self.self_attn.out_proj.weight,
                self.self_attn.out_proj.bias,
                self.norm1.weight,
                self.norm1.bias,
                self.norm2.weight,
                self.norm2.bias,
                self.linear1.weight,
                self.linear1.bias,
                self.linear2.weight,
                self.linear2.bias,
            )

            # We have to use list comprehensions below because TorchScript does not support
            # generator expressions.
            _supported_device_type = ["cpu", "cuda", torch.utils.backend_registration._privateuse1_backend_name]
            if torch.overrides.has_torch_function(tensor_args):
                why_not_sparsity_fast_path = "some Tensor argument has_torch_function"
            elif not all((x.device.type in _supported_device_type) for x in tensor_args):
                why_not_sparsity_fast_path = ("some Tensor argument's device is neither one of "
                                              f"{_supported_device_type}")
            elif torch.is_grad_enabled() and any(x.requires_grad for x in tensor_args):
                why_not_sparsity_fast_path = ("grad is enabled and at least one of query or the "
                                              "input/output projection weights or biases requires_grad")

            if not why_not_sparsity_fast_path:
                # Fast path: hand the whole layer to the fused C++ kernel and
                # return its result directly, skipping the Python blocks below.
                merged_mask, mask_type = self.self_attn.merge_masks(src_mask, src_key_padding_mask, src)
                return torch._transformer_encoder_layer_fwd(
                    src,
                    self.self_attn.embed_dim,
                    self.self_attn.num_heads,
                    self.self_attn.in_proj_weight,
                    self.self_attn.in_proj_bias,
                    self.self_attn.out_proj.weight,
                    self.self_attn.out_proj.bias,
                    self.activation_relu_or_gelu == 2,
                    self.norm_first,
                    self.norm1.eps,
                    self.norm1.weight,
                    self.norm1.bias,
                    self.norm2.weight,
                    self.norm2.bias,
                    self.linear1.weight,
                    self.linear1.bias,
                    self.linear2.weight,
                    self.linear2.bias,
                    merged_mask,
                    mask_type,
                )

        # Regular path: pre-norm or post-norm residual blocks.
        x = src
        if self.norm_first:
            x = x + self._sa_block(self.norm1(x), src_mask, src_key_padding_mask, is_causal=is_causal)
            x = x + self._ff_block(self.norm2(x))
        else:
            x = self.norm1(x + self._sa_block(x, src_mask, src_key_padding_mask, is_causal=is_causal))
            x = self.norm2(x + self._ff_block(x))

        return x

    # self-attention block
    def _sa_block(self, x: Tensor,
                  attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor], is_causal: bool = False) -> Tensor:
        x = self.self_attn(x, x, x,
                           attn_mask=attn_mask,
                           key_padding_mask=key_padding_mask,
                           need_weights=False, is_causal=is_causal)[0]
        return self.dropout1(x)

    # feed forward block
    def _ff_block(self, x: Tensor) -> Tensor:
        x = self.linear2(self.dropout(self.activation(self.linear1(x))))
        return self.dropout2(x)
1037
+
1038
+
1039
+
1040
+
1041
+ class TransformerDecoderLayer(Module):
1042
+ r"""TransformerDecoderLayer is made up of self-attn, multi-head-attn and feedforward network.
1043
+
1044
+ This standard decoder layer is based on the paper "Attention Is All You Need".
1045
+ Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
1046
+ Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in
1047
+ Neural Information Processing Systems, pages 6000-6010. Users may modify or implement
1048
+ in a different way during application.
1049
+
1050
+ Args:
1051
+ d_model: the number of expected features in the input (required).
1052
+ nhead: the number of heads in the multiheadattention models (required).
1053
+ dim_feedforward: the dimension of the feedforward network model (default=2048).
1054
+ dropout: the dropout value (default=0.1).
1055
+ activation: the activation function of the intermediate layer, can be a string
1056
+ ("relu" or "gelu") or a unary callable. Default: relu
1057
+ layer_norm_eps: the eps value in layer normalization components (default=1e-5).
1058
+ batch_first: If ``True``, then the input and output tensors are provided
1059
+ as (batch, seq, feature). Default: ``False`` (seq, batch, feature).
1060
+ norm_first: if ``True``, layer norm is done prior to self attention, multihead
1061
+ attention and feedforward operations, respectively. Otherwise it's done after.
1062
+ Default: ``False`` (after).
1063
+ bias: If set to ``False``, ``Linear`` and ``LayerNorm`` layers will not learn an additive
1064
+ bias. Default: ``True``.
1065
+
1066
+ Examples::
1067
+ >>> decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8)
1068
+ >>> memory = torch.rand(10, 32, 512)
1069
+ >>> tgt = torch.rand(20, 32, 512)
1070
+ >>> out = decoder_layer(tgt, memory)
1071
+
1072
+ Alternatively, when ``batch_first`` is ``True``:
1073
+ >>> decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8, batch_first=True)
1074
+ >>> memory = torch.rand(32, 10, 512)
1075
+ >>> tgt = torch.rand(32, 20, 512)
1076
+ >>> out = decoder_layer(tgt, memory)
1077
+ """
1078
+
1079
+ __constants__ = ['norm_first']
1080
+
1081
    def __init__(self, d_model: int, nhead: int, dim_feedforward: int = 2048, use_moe: bool = False, num_experts: int = 16,
                 dropout: float = 0.1, activation: Union[str, Callable[[Tensor], Tensor]] = F.relu,
                 layer_norm_eps: float = 1e-5, batch_first: bool = False, norm_first: bool = False,
                 bias: bool = True, device=None, dtype=None) -> None:
        """Build one decoder layer: self-attention, cross-attention, and either
        a dense feed-forward block or a sparse mixture-of-experts block.

        Args:
            d_model: expected feature size of the inputs.
            nhead: number of attention heads.
            dim_feedforward: hidden size of the dense feed-forward block
                (unused when ``use_moe`` is True).
            use_moe: replace the feed-forward block with a SparseMoEBlock.
            num_experts: number of experts in the MoE block.
            dropout / activation / layer_norm_eps / batch_first / norm_first /
            bias / device / dtype: as in ``nn.TransformerDecoderLayer``.
        """
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()

        # Decoder self-attention uses the project's MultiHeadSelfAttention;
        # cross-attention over encoder memory uses torch's MultiheadAttention.
        self.self_attn = MultiHeadSelfAttention(d_model, nhead, dropout=dropout, batch_first=batch_first, **factory_kwargs)
        self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first,
                                                 bias=bias, **factory_kwargs)
        self.use_moe = use_moe

        if use_moe:
            self.moe = MoE(
                dim = d_model,
                num_experts = num_experts,          # increase the experts (# parameters) of your model without increasing computation
                gating_top_n = 2,                   # default to top 2 gating, but can also be more (3 was tested in the paper with a lower threshold)
                threshold_train = 0.2,              # at what threshold to accept a token to be routed to second expert and beyond - 0.2 was optimal for 2 expert routing, and apparently should be lower for 3
                threshold_eval = 0.2,
                capacity_factor_train = 1.25,       # experts have fixed capacity per batch. we need some extra capacity in case gating is not perfectly balanced.
                capacity_factor_eval = 2.,          # capacity_factor_* should be set to a value >=1
                balance_loss_coef = 1e-2,           # multiplier on the auxiliary expert balancing auxiliary loss
                router_z_loss_coef = 1e-3,          # loss weight for router z-loss
            ).to(device)
            self.moe_block = SparseMoEBlock(
                self.moe,
                add_ff_before = True,
                add_ff_after = True
            ).to(device)
        else:
            # Implementation of Feedforward model
            self.linear1 = Linear(d_model, dim_feedforward, bias=bias, **factory_kwargs)
            self.dropout = Dropout(dropout)
            self.linear2 = Linear(dim_feedforward, d_model, bias=bias, **factory_kwargs)

        self.norm_first = norm_first
        self.norm1 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs)
        self.norm2 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs)
        self.norm3 = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)
        self.dropout3 = Dropout(dropout)

        # Legacy string support for activation function.
        if isinstance(activation, str):
            self.activation = _get_activation_fn(activation)
        else:
            self.activation = activation
1129
+
1130
+ def __setstate__(self, state):
1131
+ if 'activation' not in state:
1132
+ state['activation'] = F.relu
1133
+ super().__setstate__(state)
1134
+
1135
+
1136
+ def forward(
1137
+ self,
1138
+ tgt: Tensor,
1139
+ memory: Tensor,
1140
+ memory_mask: Optional[Tensor] = None,
1141
+ memory_key_padding_mask: Optional[Tensor] = None,
1142
+ tgt_is_causal: bool = False,
1143
+ memory_is_causal: bool = False,
1144
+ ) -> Tensor:
1145
+ r"""Pass the inputs (and mask) through the decoder layer.
1146
+
1147
+ Args:
1148
+ tgt: the sequence to the decoder layer (required).
1149
+ memory: the sequence from the last layer of the encoder (required).
1150
+ memory_mask: the mask for the memory sequence (optional).
1151
+ memory_key_padding_mask: the mask for the memory keys per batch (optional).
1152
+ tgt_is_causal: If specified, applies a causal mask as ``tgt mask``.
1153
+ Default: ``False``.
1154
+ Warning:
1155
+ ``tgt_is_causal`` provides a hint that ``tgt_mask`` is
1156
+ the causal mask. Providing incorrect hints can result in
1157
+ incorrect execution, including forward and backward
1158
+ compatibility.
1159
+ memory_is_causal: If specified, applies a causal mask as
1160
+ ``memory mask``.
1161
+ Default: ``False``.
1162
+ Warning:
1163
+ ``memory_is_causal`` provides a hint that
1164
+ ``memory_mask`` is the causal mask. Providing incorrect
1165
+ hints can result in incorrect execution, including
1166
+ forward and backward compatibility.
1167
+
1168
+ Shape:
1169
+ see the docs in :class:`~torch.nn.Transformer`.
1170
+ """
1171
+ # see Fig. 1 of https://arxiv.org/pdf/2002.04745v1.pdf
1172
+
1173
+ x = tgt
1174
+ # print(f'target is causal: {tgt_is_causal}')
1175
+ if self.norm_first:
1176
+ x = x + self._sa_block(self.norm1(x), tgt_is_causal)
1177
+ x = x + self._mha_block(self.norm2(x), memory, memory_mask, memory_key_padding_mask, memory_is_causal)
1178
+ if self.use_moe:
1179
+ m, total_aux_loss, balance_loss, router_z_loss = self.moe_block(x)
1180
+ x = x + m
1181
+ else:
1182
+ x = x + self._ff_block(self.norm3(x))
1183
+ else:
1184
+ x = self.norm1(x + self._sa_block(x, tgt_is_causal))
1185
+ x = self.norm2(x + self._mha_block(x, memory, memory_mask, memory_key_padding_mask, memory_is_causal))
1186
+ if self.use_moe:
1187
+ m, total_aux_loss, balance_loss, router_z_loss = self.moe_block(x)
1188
+ x = x + m
1189
+ else:
1190
+ x = self.norm3(x + self._ff_block(x))
1191
+
1192
+ if self.use_moe:
1193
+ return x, total_aux_loss, balance_loss, router_z_loss
1194
+ else:
1195
+ return x
1196
+
1197
+
1198
+ # self-attention block
1199
+ def _sa_block(self, x: Tensor,
1200
+ is_causal: bool = False) -> Tensor:
1201
+ x = self.self_attn(x, is_causal=is_causal)
1202
+ return self.dropout1(x)
1203
+
1204
+ # multihead attention block
1205
+ def _mha_block(self, x: Tensor, mem: Tensor,
1206
+ attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor], is_causal: bool = False) -> Tensor:
1207
+ x = self.multihead_attn(x, mem, mem,
1208
+ attn_mask=attn_mask,
1209
+ key_padding_mask=key_padding_mask,
1210
+ is_causal=is_causal,
1211
+ need_weights=False)[0]
1212
+ return self.dropout2(x)
1213
+
1214
+ # feed forward block
1215
+ def _ff_block(self, x: Tensor) -> Tensor:
1216
+ x = self.linear2(self.dropout(self.activation(self.linear1(x))))
1217
+ return self.dropout3(x)
1218
+
1219
+
1220
+
1221
+ def _get_clones(module, N):
1222
+ # FIXME: copy.deepcopy() is not defined on nn.module
1223
+ return ModuleList([copy.deepcopy(module) for i in range(N)])
1224
+
1225
+
1226
+ def _get_activation_fn(activation: str) -> Callable[[Tensor], Tensor]:
1227
+ if activation == "relu":
1228
+ return F.relu
1229
+ elif activation == "gelu":
1230
+ return F.gelu
1231
+
1232
+ raise RuntimeError(f"activation should be relu/gelu, not {activation}")
1233
+
1234
+
1235
+ def _detect_is_causal_mask(
1236
+ mask: Optional[Tensor],
1237
+ is_causal: Optional[bool] = None,
1238
+ size: Optional[int] = None,
1239
+ ) -> bool:
1240
+ """Return whether the given attention mask is causal.
1241
+
1242
+ Warning:
1243
+ If ``is_causal`` is not ``None``, its value will be returned as is. If a
1244
+ user supplies an incorrect ``is_causal`` hint,
1245
+
1246
+ ``is_causal=False`` when the mask is in fact a causal attention.mask
1247
+ may lead to reduced performance relative to what would be achievable
1248
+ with ``is_causal=True``;
1249
+ ``is_causal=True`` when the mask is in fact not a causal attention.mask
1250
+ may lead to incorrect and unpredictable execution - in some scenarios,
1251
+ a causal mask may be applied based on the hint, in other execution
1252
+ scenarios the specified mask may be used. The choice may not appear
1253
+ to be deterministic, in that a number of factors like alignment,
1254
+ hardware SKU, etc influence the decision whether to use a mask or
1255
+ rely on the hint.
1256
+ ``size`` if not None, check whether the mask is a causal mask of the provided size
1257
+ Otherwise, checks for any causal mask.
1258
+ """
1259
+ # Prevent type refinement
1260
+ make_causal = (is_causal is True)
1261
+
1262
+ if is_causal is None and mask is not None:
1263
+ sz = size if size is not None else mask.size(-2)
1264
+ causal_comparison = _generate_square_subsequent_mask(
1265
+ sz, device=mask.device, dtype=mask.dtype)
1266
+
1267
+ # Do not use `torch.equal` so we handle batched masks by
1268
+ # broadcasting the comparison.
1269
+ if mask.size() == causal_comparison.size():
1270
+ make_causal = bool((mask == causal_comparison).all())
1271
+ else:
1272
+ make_causal = False
1273
+
1274
+ return make_causal
1275
+
1276
def check_instruments(genereated_seq):
    """Prepend instrument-prefix tokens for each instrument found in a
    generated token sequence, and append an end token.

    Scans ``genereated_seq`` (parameter name kept misspelled for backward
    compatibility) for tokens whose instrument field matches a known
    instrument family, collects up to 15 distinct
    ``('prefix', 'instrument', name)`` tuples, and returns the sequence with
    those prefixes plus a ``'<S>'`` start token prepended and ``'<E>'``
    appended. If no instrument is found, only ``'<E>'`` is appended.
    """
    instrument_list = [
        "piano", "chromatic", "organ", "guitar", "bass", "strings", "ensemble",
        "brass", "reed", "drum", "pipe", "synth_lead", "synth_pad",
        "synth_effect", "ethnic", "percussive", "sfx",
    ]
    ins_present = []
    for token in genereated_seq:
        # Tokens may be (instrument, pitch, velocity), (instrument, pitch) or a
        # bare instrument token. Only unpack real tuples/lists: the previous
        # try/except unpacking would split a 2- or 3-character *string* token
        # into characters (e.g. "sfx" -> 's', 'f', 'x') and miss the instrument.
        if isinstance(token, (tuple, list)) and len(token) in (2, 3):
            ins = token[0]
        else:
            ins = token
        if str(ins) in instrument_list:
            prefix = ('prefix', 'instrument', str(ins))
            # Cap at 15 distinct instrument prefixes.
            if prefix not in ins_present and len(ins_present) < 15:
                print(f'adding instrument {ins}')
                ins_present.append(prefix)
    if ins_present:
        genereated_seq = ins_present + ['<S>'] + genereated_seq + ['<E>']
    else:
        genereated_seq = genereated_seq + ['<E>']
    print(genereated_seq)
    return genereated_seq
1302
+
1303
def process_caption(gpu_id, captions, model, tokenizer, r_tokenizer):
    """Generate a MIDI file for each caption dict on the best available device.

    Each entry of ``captions`` is expected to hold a text prompt under
    ``'caption'`` and an output path fragment under ``'location'``; the
    decoded MIDI is written to ``../res/<location>``.
    """
    # Device preference: CUDA (on the requested GPU id), then Apple MPS, then CPU.
    if torch.cuda.is_available():
        device = torch.device(f"cuda:{gpu_id}")
        torch.cuda.set_device(gpu_id)
        print(f"Using CUDA on GPU {gpu_id}")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
        print("Using MPS on macOS")
    else:
        device = torch.device("cpu")
        print("Using CPU")

    model.to(device)
    model.eval()

    for caption in captions:
        src = caption['caption']
        location = caption['location']

        # Tokenize the text prompt and move the padded batch to the device.
        inputs = tokenizer(src, return_tensors='pt', padding=True, truncation=True)
        input_ids = nn.utils.rnn.pad_sequence(
            inputs.input_ids, batch_first=True, padding_value=0).to(device)
        attention_mask = nn.utils.rnn.pad_sequence(
            inputs.attention_mask, batch_first=True, padding_value=0).to(device)

        # Autoregressively sample token ids, then decode them to MIDI and save.
        output = model.generate(input_ids, attention_mask, max_len=5000, temperature=0.9)
        token_ids = output[0].tolist()
        generated_midi = r_tokenizer.decode(token_ids)
        generated_midi.dump_midi(f"../res/{location}")
1338
+
1339
+ # def process_caption(gpu_id, captions, model, tokenizer, r_tokenizer):
1340
+ # device = gpu_id
1341
+ # torch.cuda.set_device(gpu_id)
1342
+ # model.to(gpu_id)
1343
+ # model.eval()
1344
+ # for caption in captions:
1345
+ # src = caption['caption']
1346
+ # location = caption['location']
1347
+ # #src = "A cinematic electronic soundtrack that evokes an epic and dark atmosphere, featuring cello, contrabass, and drums. The song is set in A minor with a moderate tempo and a 4/4 time signature, creating an emotional and action-packed ambiance suitable for film."
1348
+ # '''
1349
+ # example 1: "A cheerful and melodic pop Christmas song featuring piano, acoustic guitar, vibraphone, bass, and drums, set in the key of Eb minor with a fast tempo of 123 bpm and a 4/4 time signature, creating a joyful and relaxing atmosphere."lmd_full/1/1b9f5f325c2080d345d877f590aa3dbe.mid
1350
+ # example 2: "A melodic electronic song with ambient elements, featuring piano, acoustic guitar, alto saxophone, string ensemble, and electric bass. Set in G minor with a 4/4 time signature, it moves at a lively Presto tempo. The composition evokes a blend of relaxation and darkness, with hints of happiness and a meditative quality."lmd_full/1/152891ac63017b234c33e75e4a4a28c5.mid
1351
+ # example 3: "This motivational electronic and pop song features a clean electric guitar, rock organ, synth voice, acoustic guitar, and vibraphone, creating a melodic and uplifting atmosphere. Set in the key of G# minor with a 4/4 time signature, the track moves at an energetic Allegro tempo of 120 beats per minute. The chord progression of Bbm7 and F# adds to the song's inspiring and corporate feel." lmd_full/1/14347e50e9e8149a9da09f49b188180b.mid
1352
+ # example 4: "This short electronic song in C minor features a brass section, string ensemble, tenor saxophone, clean electric guitar, and slap bass, creating a melodic and slightly dark atmosphere. With a tempo of 124 BPM (Allegro) and a 4/4 time signature, the track incorporates a chord progression of C7/E, Eb6, and Bbm6, adding a touch of corporate and motivational vibes to the overall composition." lmd_full/1/1dc4cd50a5509d8042d27d80bc7e668e.mid
1353
+ # example 5: "An energetic and melodic electronic trance track with a space and retro vibe, featuring drums, distortion guitar, flute, synth bass, and slap bass. Set in A minor with a fast tempo of 138 BPM, the song maintains a 4/4 time signature throughout its duration." lmd_full/3/3328b854ebe7a2fc9a746ede74c410ae.mid
1354
+ # example 6: "A short but energetic rock fragment in C minor, featuring overdriven guitars, electric bass, and drums, with a vivacious tempo of 155 BPM and a 4/4 time signature, evoking a blend of dark and melodic tones." lmd_full/4/4c2232688c5f869b8470a408d197f5e3.mid
1355
+ # example 7: "A classical piece with a cinematic flair, this composition is characterized by its fast tempo and 4/4 time signature. The soprano saxophone and flute take turns leading the melody, supported by the lush tones of the string ensemble, acoustic bass, and pan flute. Set in the key of F minor, the harmonic landscape is painted with the chords Gm7b5, Cm7b5, Fm7, Eaug, and Ab/Eb. The overall mood evokes images of film, with hints of Christmas, drama, documentary, and adventure." lmd_full/9/95bce1b489a11829b4fef39200291f60.mid
1356
+ # exmaple 8: "A slow, dark, and emotional classical piece featuring cello, violin, and viola, likely to be used in a dramatic film soundtrack. The composition is in the key of C minor with a 4/4 time signature, and the main chord progression consists of Cm, G, Cm, and Fm." lmd_full/a/a22aad98ecfe4b3d8a353c2a72132834.mid
1357
+ # example 9: "A slow and emotional classical piece, likely used in a film soundtrack, featuring a church organ as the sole instrument. Written in the key of Eb major with a 3/4 time signature, it evokes a sense of drama and romance. The chord progression of Bb7, Eb, and Ab contributes to the relaxing atmosphere throughout the song." lmd_full/a/af4302a036c9df71e0435df9b08f8c4b.mid
1358
+ # example 10: "A cinematic electronic soundtrack that evokes an epic and dark atmosphere, featuring cello, contrabass, and drums. The song is set in A minor with a moderate tempo and a 4/4 time signature, creating an emotional and action-packed ambiance suitable for film." lmd_full/d/d920b6f451d7a72ae06f154e7c06c4c1.mid
1359
+ # '''
1360
+ # inputs = tokenizer(src, return_tensors='pt', padding=True, truncation=True)
1361
+ # input_ids = nn.utils.rnn.pad_sequence(inputs.input_ids, batch_first=True, padding_value=0)
1362
+ # input_ids = input_ids.to(device)
1363
+ # attention_mask =nn.utils.rnn.pad_sequence(inputs.attention_mask, batch_first=True, padding_value=0)
1364
+ # attention_mask = attention_mask.to(device)
1365
+ # output = model.generate(input_ids, attention_mask,max_len=5000,temperature = 0.9)
1366
+ # output_list = output[0].tolist()
1367
+ # print(type(output_list))
1368
+ # # generated_sequences = [dict_tokenizer[token] for token in output_list[0]]
1369
+ # # generated_sequences = check_instruments(generated_sequences)
1370
+ # # # generated_sequences = [('prefix', 'instrument', 'bass'), ('prefix', 'instrument', 'guitar'), ('prefix', 'instrument', 'piano'), ('prefix', 'instrument', 'guitar'), '<S>' ]+ generated_sequences +['<E>']
1371
+ # # generated_sequences = [token for token in generated_sequences]# if token not in ["<SS>", "<S>", "<E>", "<SEP>"]]
1372
+ # # # print("Generated sequences:", generated_sequences)
1373
+ # # with open('../../generated_seq.pkl', 'wb') as f:
1374
+ # # pickle.dump(generated_sequences, f)
1375
+ # # mid_dict = aria_tokenizer.detokenize(generated_sequences)
1376
+ # # mid = mid_dict.to_midi()
1377
+ # generated_midi = r_tokenizer.decode(output_list)
1378
+ # # print(type(generated_midi))
1379
+ # generated_midi.dump_midi(f"../res/{location}")
1380
+
1381
def test_generate(caption):
    """Generate a single MIDI file from ``caption``.

    Loads the REMI vocabulary, the trained Transformer checkpoint and the
    FLAN-T5 text tokenizer, samples up to 2000 tokens, and writes the decoded
    MIDI to ``../../output_christmas_2.mid``.
    """
    # Device preference: CUDA, then Apple MPS, then CPU.
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print("Using CUDA on NVIDIA GPU")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
        print("Using MPS on macOS")
    else:
        device = torch.device("cpu")
        print("Using CPU")

    artifact_folder = '../artifacts'
    tokenizer_filepath = os.path.join(artifact_folder, "vocab_remi.pkl")

    # Load the REMI tokenizer vocabulary (its length sizes the output layer).
    with open(tokenizer_filepath, "rb") as f:
        r_tokenizer = pickle.load(f)
    vocab_size = len(r_tokenizer)
    print("Vocab size: ", vocab_size)

    # Initialize the model and load the trained weights.
    # NOTE(review): the checkpoint path is hard-coded; consider parameterizing it.
    model = Transformer(vocab_size, 768, 8, 2048, 18, 1024, False, 8, device=device)
    model.load_state_dict(torch.load('/root/test/text2midi/output_new/epoch_30/pytorch_model.bin', map_location=device))
    model.to(device)
    model.eval()

    tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")

    # Tokenize the caption and move the padded batch to the device.
    inputs = tokenizer(caption, return_tensors='pt', padding=True, truncation=True)
    input_ids = nn.utils.rnn.pad_sequence(
        inputs.input_ids, batch_first=True, padding_value=0).to(device)
    attention_mask = nn.utils.rnn.pad_sequence(
        inputs.attention_mask, batch_first=True, padding_value=0).to(device)

    # Autoregressively sample token ids, then decode to MIDI and save.
    output = model.generate(input_ids, attention_mask, max_len=2000, temperature=0.9)
    output_list = output[0].tolist()
    generated_midi = r_tokenizer.decode(output_list)
    generated_midi.dump_midi("../../output_christmas_2.mid")
1447
+
1448
def load_model_and_tokenizer(accelerator, model_path, vocab_size, tokenizer_filepath):
    """Load the Transformer checkpoint plus both tokenizers.

    Returns ``(model, text_tokenizer, r_tokenizer)`` with the model placed on
    the accelerator's device and set to eval mode.
    """
    device = accelerator.device
    with open(tokenizer_filepath, "rb") as f:
        r_tokenizer = pickle.load(f)
    model = Transformer(vocab_size, 768, 8, 2048, 18, 1024, False, 8, device=device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()
    text_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
    return model, text_tokenizer, r_tokenizer
1458
+
1459
def process_example(accelerator, model, tokenizer, r_tokenizer, example, location, output_path):
    """Generate MIDI for a single caption ``example`` and write it to ``output_path``."""
    device = accelerator.device
    encoded = tokenizer(example, return_tensors='pt', padding=True, truncation=True).to(device)
    with torch.no_grad():
        # The model is wrapped by accelerate, so generate lives on model.module.
        output = model.module.generate(
            encoded['input_ids'], encoded['attention_mask'], max_len=2000, temperature=0.9)
    token_ids = output[0].tolist()
    r_tokenizer.decode(token_ids).dump_midi(output_path)
1469
+
1470
def run_accelerate_generation():
    """Batch-generate MIDI for all test-set captions using accelerate.

    Reads captions flagged ``test_set`` from the captions JSONL file, loads
    the model and tokenizers, and writes each decoded MIDI under
    ``/root/Text2midi/res_acc/<location>``.
    """
    accelerator = Accelerator()
    artifact_folder = '../artifacts'
    tokenizer_filepath = os.path.join(artifact_folder, "vocab_remi.pkl")
    model_path = '/root/output_test_new/epoch_30/pytorch_model.bin'
    captions_path = '/root/captions/train.json'

    with jsonlines.open(captions_path) as reader:
        selected_captions = [line for line in reader if line.get('test_set') is True]

    # Load the REMI vocabulary once just to size the model's output layer;
    # load_model_and_tokenizer re-reads the same pickle to build the tokenizer.
    with open(tokenizer_filepath, "rb") as f:
        remi_vocab = pickle.load(f)

    model, tokenizer, r_tokenizer = load_model_and_tokenizer(
        accelerator, model_path, len(remi_vocab), tokenizer_filepath)
    model = accelerator.prepare(model)

    dataset = CaptionDataset(selected_captions)
    dataloader = DataLoader(dataset, batch_size=8, num_workers=4, shuffle=False,
                            collate_fn=custom_collate_fn)
    dataloader = accelerator.prepare(dataloader)

    for captions, locations in dataloader:
        for example, location in zip(captions, locations):
            output_path = os.path.join('/root/Text2midi/res_acc', location)
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            process_example(accelerator, model, tokenizer, r_tokenizer,
                            example, location, output_path)
1496
+ # run_accelerate_generation() #uncomment this and comment __main__ to run accelerate generation
1497
+
1498
def main():
    """CLI entry point: parse ``--caption`` and generate a MIDI file from it."""
    parser = argparse.ArgumentParser(description="Generate MIDI from caption")
    parser.add_argument('--caption', type=str, required=True,
                        help='Caption to generate MIDI from')
    args = parser.parse_args()
    test_generate(args.caption)
1503
+
1504
+ '''
1505
+ comment out the next section function and uncomment the run_accelerate_generation() function to run the accelerate generation
1506
+ '''
1507
if __name__ == "__main__":
    # Script entry point; swap for run_accelerate_generation() for batch mode.
    main()
    print("Done")
text2midi_repo/requirements-mac.txt ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # This file is autogenerated by pip-compile with Python 3.10
3
+ # by the following command:
4
+ #
5
+ # pip-compile requirements.in
6
+ #
7
+ accelerate==0.18.0
8
+ # via -r requirements.in
9
+ aiohappyeyeballs==2.4.4
10
+ # via aiohttp
11
+ aiohttp==3.11.10
12
+ # via
13
+ # datasets
14
+ # fsspec
15
+ aiosignal==1.3.1
16
+ # via aiohttp
17
+ annotated-types==0.7.0
18
+ # via pydantic
19
+ async-timeout==5.0.1
20
+ # via aiohttp
21
+ attrs==24.2.0
22
+ # via
23
+ # aiohttp
24
+ # jsonlines
25
+ beartype==0.19.0
26
+ # via st-moe-pytorch
27
+ blis==1.0.1
28
+ # via thinc
29
+ catalogue==2.0.10
30
+ # via
31
+ # spacy
32
+ # srsly
33
+ # thinc
34
+ certifi==2024.8.30
35
+ # via
36
+ # requests
37
+ # sentry-sdk
38
+ charset-normalizer==3.4.0
39
+ # via requests
40
+ click==8.1.7
41
+ # via
42
+ # typer
43
+ # wandb
44
+ cloudpathlib==0.20.0
45
+ # via weasel
46
+ colt5-attention==0.11.1
47
+ # via st-moe-pytorch
48
+ confection==0.1.5
49
+ # via
50
+ # thinc
51
+ # weasel
52
+ cymem==2.0.10
53
+ # via
54
+ # preshed
55
+ # spacy
56
+ # thinc
57
+ datasets==3.1.0
58
+ # via evaluate
59
+ dill==0.3.8
60
+ # via
61
+ # datasets
62
+ # evaluate
63
+ # multiprocess
64
+ docker-pycreds==0.4.0
65
+ # via wandb
66
+ einops==0.8.0
67
+ # via
68
+ # -r requirements.in
69
+ # colt5-attention
70
+ # local-attention
71
+ # st-moe-pytorch
72
+ evaluate==0.4.3
73
+ # via -r requirements.in
74
+ filelock==3.16.1
75
+ # via
76
+ # datasets
77
+ # huggingface-hub
78
+ # torch
79
+ # transformers
80
+ # triton
81
+ frozenlist==1.5.0
82
+ # via
83
+ # aiohttp
84
+ # aiosignal
85
+ fsspec[http]==2024.9.0
86
+ # via
87
+ # datasets
88
+ # evaluate
89
+ # huggingface-hub
90
+ # torch
91
+ gitdb==4.0.11
92
+ # via gitpython
93
+ gitpython==3.1.43
94
+ # via wandb
95
+ huggingface-hub==0.26.3
96
+ # via
97
+ # accelerate
98
+ # datasets
99
+ # evaluate
100
+ # miditok
101
+ # tokenizers
102
+ # transformers
103
+ idna==3.10
104
+ # via
105
+ # requests
106
+ # yarl
107
+ jinja2==3.1.4
108
+ # via
109
+ # spacy
110
+ # torch
111
+ jsonlines==4.0.0
112
+ # via -r requirements.in
113
+ langcodes==3.5.0
114
+ # via spacy
115
+ language-data==1.3.0
116
+ # via langcodes
117
+ local-attention==1.9.15
118
+ # via colt5-attention
119
+ marisa-trie==1.2.1
120
+ # via language-data
121
+ markdown-it-py==3.0.0
122
+ # via rich
123
+ markupsafe==3.0.2
124
+ # via jinja2
125
+ mdurl==0.1.2
126
+ # via markdown-it-py
127
+ miditok==3.0.3
128
+ # via -r requirements.in
129
+ mpmath==1.3.0
130
+ # via sympy
131
+ multidict==6.1.0
132
+ # via
133
+ # aiohttp
134
+ # yarl
135
+ multiprocess==0.70.16
136
+ # via
137
+ # datasets
138
+ # evaluate
139
+ murmurhash==1.0.11
140
+ # via
141
+ # preshed
142
+ # spacy
143
+ # thinc
144
+ networkx==3.4.2
145
+ # via torch
146
+ numpy==2.0.2
147
+ # via
148
+ # -r requirements.in
149
+ # accelerate
150
+ # blis
151
+ # datasets
152
+ # evaluate
153
+ # miditok
154
+ # pandas
155
+ # spacy
156
+ # symusic
157
+ # thinc
158
+ # transformers
159
+ #nvidia-cublas-cu12==12.4.5.8
160
+ # via
161
+ # nvidia-cudnn-cu12
162
+ # nvidia-cusolver-cu12
163
+ # torch
164
+ #nvidia-cuda-cupti-cu12==12.4.127
165
+ # via torch
166
+ #nvidia-cuda-nvrtc-cu12==12.4.127
167
+ # via torch
168
+ #nvidia-cuda-runtime-cu12==12.4.127
169
+ # via torch
170
+ #nvidia-cudnn-cu12==9.1.0.70
171
+ # via torch
172
+ #nvidia-cufft-cu12==11.2.1.3
173
+ # via torch
174
+ #nvidia-curand-cu12==10.3.5.147
175
+ # via torch
176
+ #nvidia-cusolver-cu12==11.6.1.9
177
+ # via torch
178
+ #nvidia-cusparse-cu12==12.3.1.170
179
+ # via
180
+ # nvidia-cusolver-cu12
181
+ # torch
182
+ #nvidia-nccl-cu12==2.21.5
183
+ # via torch
184
+ #nvidia-nvjitlink-cu12==12.4.127
185
+ # via
186
+ # nvidia-cusolver-cu12
187
+ # nvidia-cusparse-cu12
188
+ # torch
189
+ #nvidia-nvtx-cu12==12.4.127
190
+ # via torch
191
+ packaging==24.2
192
+ # via
193
+ # accelerate
194
+ # colt5-attention
195
+ # datasets
196
+ # evaluate
197
+ # huggingface-hub
198
+ # spacy
199
+ # thinc
200
+ # transformers
201
+ # weasel
202
+ pandas==2.2.3
203
+ # via
204
+ # datasets
205
+ # evaluate
206
+ platformdirs==4.3.6
207
+ # via
208
+ # symusic
209
+ # wandb
210
+ preshed==3.0.9
211
+ # via
212
+ # spacy
213
+ # thinc
214
+ propcache==0.2.1
215
+ # via
216
+ # aiohttp
217
+ # yarl
218
+ protobuf==5.29.1
219
+ # via wandb
220
+ psutil==6.1.0
221
+ # via
222
+ # accelerate
223
+ # wandb
224
+ pyarrow==18.1.0
225
+ # via datasets
226
+ pydantic==2.10.3
227
+ # via
228
+ # confection
229
+ # spacy
230
+ # thinc
231
+ # wandb
232
+ # weasel
233
+ pydantic-core==2.27.1
234
+ # via pydantic
235
+ pygments==2.18.0
236
+ # via rich
237
+ pysmartdl==1.3.4
238
+ # via symusic
239
+ python-dateutil==2.9.0.post0
240
+ # via pandas
241
+ pytz==2024.2
242
+ # via pandas
243
+ pyyaml==6.0.2
244
+ # via
245
+ # -r requirements.in
246
+ # accelerate
247
+ # datasets
248
+ # huggingface-hub
249
+ # transformers
250
+ # wandb
251
+ regex==2024.11.6
252
+ # via transformers
253
+ requests==2.32.3
254
+ # via
255
+ # datasets
256
+ # evaluate
257
+ # huggingface-hub
258
+ # spacy
259
+ # transformers
260
+ # wandb
261
+ # weasel
262
+ rich==13.9.4
263
+ # via typer
264
+ safetensors==0.4.5
265
+ # via
266
+ # accelerate
267
+ # transformers
268
+ sentry-sdk==2.19.2
269
+ # via wandb
270
+ sentencepiece==0.2.0
271
+
272
+ setproctitle==1.3.4
273
+ # via wandb
274
+ shellingham==1.5.4
275
+ # via typer
276
+ six==1.17.0
277
+ # via
278
+ # docker-pycreds
279
+ # python-dateutil
280
+ smart-open==7.0.5
281
+ # via weasel
282
+ smmap==5.0.1
283
+ # via gitdb
284
+ spacy==3.8.2
285
+ # via -r requirements.in
286
+ spacy-legacy==3.0.12
287
+ # via spacy
288
+ spacy-loggers==1.0.5
289
+ # via spacy
290
+ srsly==2.4.8
291
+ # via
292
+ # confection
293
+ # spacy
294
+ # thinc
295
+ # weasel
296
+ st-moe-pytorch==0.1.8
297
+ # via -r requirements.in
298
+ sympy==1.13.1
299
+ # via torch
300
+ symusic==0.5.5
301
+ # via miditok
302
+ thinc==8.3.2
303
+ # via spacy
304
+ tokenizers==0.21.0
305
+ # via
306
+ # miditok
307
+ # transformers
308
+ torch==2.5.1
309
+ # via
310
+ # -r requirements.in
311
+ # accelerate
312
+ # colt5-attention
313
+ # local-attention
314
+ # st-moe-pytorch
315
+ tqdm==4.67.1
316
+ # via
317
+ # -r requirements.in
318
+ # datasets
319
+ # evaluate
320
+ # huggingface-hub
321
+ # miditok
322
+ # spacy
323
+ # transformers
324
+ transformers==4.47.0
325
+ # via -r requirements.in
326
+ #triton==3.1.0
327
+ # via torch
328
+ typer==0.15.1
329
+ # via
330
+ # spacy
331
+ # weasel
332
+ typing-extensions==4.12.2
333
+ # via
334
+ # cloudpathlib
335
+ # huggingface-hub
336
+ # multidict
337
+ # pydantic
338
+ # pydantic-core
339
+ # rich
340
+ # torch
341
+ # typer
342
+ # wandb
343
+ tzdata==2024.2
344
+ # via pandas
345
+ urllib3==2.2.3
346
+ # via
347
+ # requests
348
+ # sentry-sdk
349
+ wandb==0.19.0
350
+ # via -r requirements.in
351
+ wasabi==1.1.3
352
+ # via
353
+ # spacy
354
+ # thinc
355
+ # weasel
356
+ weasel==0.4.1
357
+ # via spacy
358
+ wrapt==1.17.0
359
+ # via smart-open
360
+ xxhash==3.5.0
361
+ # via
362
+ # datasets
363
+ # evaluate
364
+ yarl==1.18.3
365
+ # via aiohttp
366
+
367
+ # The following packages are considered to be unsafe in a requirements file:
368
+ # setuptools
text2midi_repo/requirements.txt ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # This file is autogenerated by pip-compile with Python 3.10
3
+ # by the following command:
4
+ #
5
+ # pip-compile requirements.in
6
+ #
7
+ accelerate==0.18.0
8
+ # via -r requirements.in
9
+ aiohappyeyeballs==2.4.4
10
+ # via aiohttp
11
+ aiohttp==3.11.10
12
+ # via
13
+ # datasets
14
+ # fsspec
15
+ aiosignal==1.3.1
16
+ # via aiohttp
17
+ annotated-types==0.7.0
18
+ # via pydantic
19
+ async-timeout==5.0.1
20
+ # via aiohttp
21
+ attrs==24.2.0
22
+ # via
23
+ # aiohttp
24
+ # jsonlines
25
+ beartype==0.19.0
26
+ # via st-moe-pytorch
27
+ blis==1.0.1
28
+ # via thinc
29
+ catalogue==2.0.10
30
+ # via
31
+ # spacy
32
+ # srsly
33
+ # thinc
34
+ certifi==2024.8.30
35
+ # via
36
+ # requests
37
+ # sentry-sdk
38
+ charset-normalizer==3.4.0
39
+ # via requests
40
+ click==8.1.7
41
+ # via
42
+ # typer
43
+ # wandb
44
+ cloudpathlib==0.20.0
45
+ # via weasel
46
+ colt5-attention==0.11.1
47
+ # via st-moe-pytorch
48
+ confection==0.1.5
49
+ # via
50
+ # thinc
51
+ # weasel
52
+ cymem==2.0.10
53
+ # via
54
+ # preshed
55
+ # spacy
56
+ # thinc
57
+ datasets==3.1.0
58
+ # via evaluate
59
+ dill==0.3.8
60
+ # via
61
+ # datasets
62
+ # evaluate
63
+ # multiprocess
64
+ docker-pycreds==0.4.0
65
+ # via wandb
66
+ einops==0.8.0
67
+ # via
68
+ # -r requirements.in
69
+ # colt5-attention
70
+ # local-attention
71
+ # st-moe-pytorch
72
+ evaluate==0.4.3
73
+ # via -r requirements.in
74
+ filelock==3.16.1
75
+ # via
76
+ # datasets
77
+ # huggingface-hub
78
+ # torch
79
+ # transformers
80
+ # triton
81
+ frozenlist==1.5.0
82
+ # via
83
+ # aiohttp
84
+ # aiosignal
85
+ fsspec[http]==2024.9.0
86
+ # via
87
+ # datasets
88
+ # evaluate
89
+ # huggingface-hub
90
+ # torch
91
+ gitdb==4.0.11
92
+ # via gitpython
93
+ gitpython==3.1.43
94
+ # via wandb
95
+ huggingface-hub==0.26.3
96
+ # via
97
+ # accelerate
98
+ # datasets
99
+ # evaluate
100
+ # miditok
101
+ # tokenizers
102
+ # transformers
103
+ idna==3.10
104
+ # via
105
+ # requests
106
+ # yarl
107
+ jinja2==3.1.4
108
+ # via
109
+ # spacy
110
+ # torch
111
+ jsonlines==4.0.0
112
+ # via -r requirements.in
113
+ langcodes==3.5.0
114
+ # via spacy
115
+ language-data==1.3.0
116
+ # via langcodes
117
+ local-attention==1.9.15
118
+ # via colt5-attention
119
+ marisa-trie==1.2.1
120
+ # via language-data
121
+ markdown-it-py==3.0.0
122
+ # via rich
123
+ markupsafe==3.0.2
124
+ # via jinja2
125
+ mdurl==0.1.2
126
+ # via markdown-it-py
127
+ miditok==3.0.3
128
+ # via -r requirements.in
129
+ mpmath==1.3.0
130
+ # via sympy
131
+ multidict==6.1.0
132
+ # via
133
+ # aiohttp
134
+ # yarl
135
+ multiprocess==0.70.16
136
+ # via
137
+ # datasets
138
+ # evaluate
139
+ murmurhash==1.0.11
140
+ # via
141
+ # preshed
142
+ # spacy
143
+ # thinc
144
+ networkx==3.4.2
145
+ # via torch
146
+ numpy==2.0.2
147
+ # via
148
+ # -r requirements.in
149
+ # accelerate
150
+ # blis
151
+ # datasets
152
+ # evaluate
153
+ # miditok
154
+ # pandas
155
+ # spacy
156
+ # symusic
157
+ # thinc
158
+ # transformers
159
+ nvidia-cublas-cu12==12.4.5.8
160
+ # via
161
+ # nvidia-cudnn-cu12
162
+ # nvidia-cusolver-cu12
163
+ # torch
164
+ nvidia-cuda-cupti-cu12==12.4.127
165
+ # via torch
166
+ nvidia-cuda-nvrtc-cu12==12.4.127
167
+ # via torch
168
+ nvidia-cuda-runtime-cu12==12.4.127
169
+ # via torch
170
+ nvidia-cudnn-cu12==9.1.0.70
171
+ # via torch
172
+ nvidia-cufft-cu12==11.2.1.3
173
+ # via torch
174
+ nvidia-curand-cu12==10.3.5.147
175
+ # via torch
176
+ nvidia-cusolver-cu12==11.6.1.9
177
+ # via torch
178
+ nvidia-cusparse-cu12==12.3.1.170
179
+ # via
180
+ # nvidia-cusolver-cu12
181
+ # torch
182
+ nvidia-nccl-cu12==2.21.5
183
+ # via torch
184
+ nvidia-nvjitlink-cu12==12.4.127
185
+ # via
186
+ # nvidia-cusolver-cu12
187
+ # nvidia-cusparse-cu12
188
+ # torch
189
+ nvidia-nvtx-cu12==12.4.127
190
+ # via torch
191
+ packaging==24.2
192
+ # via
193
+ # accelerate
194
+ # colt5-attention
195
+ # datasets
196
+ # evaluate
197
+ # huggingface-hub
198
+ # spacy
199
+ # thinc
200
+ # transformers
201
+ # weasel
202
+ pandas==2.2.3
203
+ # via
204
+ # datasets
205
+ # evaluate
206
+ platformdirs==4.3.6
207
+ # via
208
+ # symusic
209
+ # wandb
210
+ preshed==3.0.9
211
+ # via
212
+ # spacy
213
+ # thinc
214
+ propcache==0.2.1
215
+ # via
216
+ # aiohttp
217
+ # yarl
218
+ protobuf==5.29.1
219
+ # via wandb
220
+ psutil==6.1.0
221
+ # via
222
+ # accelerate
223
+ # wandb
224
+ pyarrow==18.1.0
225
+ # via datasets
226
+ pydantic==2.10.3
227
+ # via
228
+ # confection
229
+ # spacy
230
+ # thinc
231
+ # wandb
232
+ # weasel
233
+ pydantic-core==2.27.1
234
+ # via pydantic
235
+ pygments==2.18.0
236
+ # via rich
237
+ pysmartdl==1.3.4
238
+ # via symusic
239
+ python-dateutil==2.9.0.post0
240
+ # via pandas
241
+ pytz==2024.2
242
+ # via pandas
243
+ pyyaml==6.0.2
244
+ # via
245
+ # -r requirements.in
246
+ # accelerate
247
+ # datasets
248
+ # huggingface-hub
249
+ # transformers
250
+ # wandb
251
+ regex==2024.11.6
252
+ # via transformers
253
+ requests==2.32.3
254
+ # via
255
+ # datasets
256
+ # evaluate
257
+ # huggingface-hub
258
+ # spacy
259
+ # transformers
260
+ # wandb
261
+ # weasel
262
+ rich==13.9.4
263
+ # via typer
264
+ safetensors==0.4.5
265
+ # via
266
+ # accelerate
267
+ # transformers
268
+ sentry-sdk==2.19.2
269
+ # via wandb
270
+ sentencepiece==0.2.0
271
+
272
+ setproctitle==1.3.4
273
+ # via wandb
274
+ shellingham==1.5.4
275
+ # via typer
276
+ six==1.17.0
277
+ # via
278
+ # docker-pycreds
279
+ # python-dateutil
280
+ smart-open==7.0.5
281
+ # via weasel
282
+ smmap==5.0.1
283
+ # via gitdb
284
+ spacy==3.8.2
285
+ # via -r requirements.in
286
+ spacy-legacy==3.0.12
287
+ # via spacy
288
+ spacy-loggers==1.0.5
289
+ # via spacy
290
+ srsly==2.4.8
291
+ # via
292
+ # confection
293
+ # spacy
294
+ # thinc
295
+ # weasel
296
+ st-moe-pytorch==0.1.8
297
+ # via -r requirements.in
298
+ sympy==1.13.1
299
+ # via torch
300
+ symusic==0.5.5
301
+ # via miditok
302
+ thinc==8.3.2
303
+ # via spacy
304
+ tokenizers==0.21.0
305
+ # via
306
+ # miditok
307
+ # transformers
308
+ torch==2.5.1
309
+ # via
310
+ # -r requirements.in
311
+ # accelerate
312
+ # colt5-attention
313
+ # local-attention
314
+ # st-moe-pytorch
315
+ tqdm==4.67.1
316
+ # via
317
+ # -r requirements.in
318
+ # datasets
319
+ # evaluate
320
+ # huggingface-hub
321
+ # miditok
322
+ # spacy
323
+ # transformers
324
+ transformers==4.47.0
325
+ # via -r requirements.in
326
+ triton==3.1.0
327
+ # via torch
328
+ typer==0.15.1
329
+ # via
330
+ # spacy
331
+ # weasel
332
+ typing-extensions==4.12.2
333
+ # via
334
+ # cloudpathlib
335
+ # huggingface-hub
336
+ # multidict
337
+ # pydantic
338
+ # pydantic-core
339
+ # rich
340
+ # torch
341
+ # typer
342
+ # wandb
343
+ tzdata==2024.2
344
+ # via pandas
345
+ urllib3==2.2.3
346
+ # via
347
+ # requests
348
+ # sentry-sdk
349
+ wandb==0.19.0
350
+ # via -r requirements.in
351
+ wasabi==1.1.3
352
+ # via
353
+ # spacy
354
+ # thinc
355
+ # weasel
356
+ weasel==0.4.1
357
+ # via spacy
358
+ wrapt==1.17.0
359
+ # via smart-open
360
+ xxhash==3.5.0
361
+ # via
362
+ # datasets
363
+ # evaluate
364
+ yarl==1.18.3
365
+ # via aiohttp
366
+
367
+ # The following packages are considered to be unsafe in a requirements file:
368
+ # setuptools
text2midi_repo/text2midi_architecture.jpg ADDED

Git LFS Details

  • SHA256: 732af27208de46a6f0ab508605597b1dc1f569daa0e256ce695773fd2466eb6f
  • Pointer size: 131 Bytes
  • Size of remote file: 179 kB
text2midi_repo/utils/midi_to_wav.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ from multiprocessing import Pool, cpu_count
4
+ from tqdm import tqdm
5
+
6
# Path to the SoundFont fluidsynth uses for synthesis (host-specific; TODO make configurable).
soundfont_filepath = "/root/soundfont/soundfont.sf"
7
+
8
def save_wav(midi_filepath, wav_filepath):
    """Render a MIDI file to WAV using fluidsynth.

    Skips rendering when ``wav_filepath`` already exists.

    Args:
        midi_filepath: Path to the input ``.mid`` file.
        wav_filepath: Destination path for the ``.wav`` output.

    Returns:
        ``wav_filepath`` (whether newly created, pre-existing, or failed).
    """
    # Reuse an existing render instead of paying the synthesis cost again.
    if os.path.isfile(wav_filepath):
        print(f"{wav_filepath} already exists, skipping")
        return wav_filepath

    print(f"Creating {wav_filepath} from {midi_filepath}")

    # Build the command as an argument list and run with shell=False: the
    # original f-string + shell=True broke on paths containing spaces and
    # was vulnerable to shell injection via the file names.
    command = [
        "fluidsynth",
        "-r", "48000",
        soundfont_filepath,
        "-g", "1.0",
        "--quiet",
        "--no-shell",
        midi_filepath,
        "-T", "wav",
        "-F", wav_filepath,
    ]
    print(f"Running command: {' '.join(command)}")
    result = subprocess.run(command, capture_output=True)

    if result.returncode != 0:
        print(f"Error converting {midi_filepath} to {wav_filepath}: {result.stderr.decode('utf-8')}")
    else:
        print(f"Successfully created {wav_filepath}")

    return wav_filepath
28
+
29
def process_midi_file(midi_filepath):
    """Convert one MIDI file under the result tree into a mirrored WAV path.

    Mirrors the file's location relative to ``/root/Text2midi/res_acc`` into
    ``/root/wav`` and swaps the extension to ``.wav``.
    """
    relative_path = os.path.relpath(midi_filepath, "/root/Text2midi/res_acc")

    # splitext replaces only the final extension; the original
    # str.replace('.mid', '.wav') also rewrote any '.mid' occurring earlier
    # in the path (e.g. a directory named 'a.midi' became 'a.wavi').
    stem, _ = os.path.splitext(relative_path)
    wav_filepath = os.path.join("/root/wav", stem + ".wav")
    wav_directory = os.path.dirname(wav_filepath)

    # Ensure the mirrored directory exists before fluidsynth writes into it.
    os.makedirs(wav_directory, exist_ok=True)

    # Convert the MIDI file to WAV
    save_wav(midi_filepath, wav_filepath)
40
+
41
def main():
    """Collect every .mid file under the result tree and convert in parallel."""
    source_root = "/root/Text2midi/res_acc"

    # Gather all MIDI paths in one pass over the tree.
    midi_files = [
        os.path.join(dirpath, filename)
        for dirpath, _, filenames in os.walk(source_root)
        for filename in filenames
        if filename.endswith(".mid")
    ]

    # Use half of the available CPU cores for the worker pool.
    worker_count = cpu_count() // 2
    with Pool(worker_count) as worker_pool:
        conversions = worker_pool.imap(process_midi_file, midi_files)
        # Drain the iterator through tqdm so progress is shown as workers finish.
        list(tqdm(conversions, total=len(midi_files), desc="Processing MIDI files"))
53
+
54
# Script entry point: run the batch MIDI-to-WAV conversion.
if __name__ == "__main__":
    main()
text2midi_repo/utils/split_caption.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import random
4
+ import jsonlines
5
+
6
def select_and_split_captions(input_path, output_dir, num_splits=6):
    """Filter test-set captions from a JSON-lines file and write them in splits.

    Args:
        input_path: JSON-lines file; each record is a caption dict.
        output_dir: Directory receiving ``selected_captions_<i>.json`` files.
        num_splits: Number of roughly equal output files; the last split
            absorbs the remainder so no caption is dropped.

    Raises:
        ValueError: If ``num_splits`` is less than 1 (the original code
            raised ZeroDivisionError instead).
    """
    if num_splits < 1:
        raise ValueError("num_splits must be >= 1")

    with jsonlines.open(input_path) as reader:
        # Keep only records explicitly flagged as test-set members.
        selected_captions = [rec for rec in reader if rec.get('test_set') is True]

    # Create the destination directory so open() below cannot fail on it.
    os.makedirs(output_dir, exist_ok=True)

    # Split the selected captions into num_splits groups.
    split_size = len(selected_captions) // num_splits
    for i in range(num_splits):
        start_idx = i * split_size
        end_idx = (i + 1) * split_size if i != num_splits - 1 else len(selected_captions)
        split_captions = selected_captions[start_idx:end_idx]

        output_path = os.path.join(output_dir, f'selected_captions_{i}.json')
        with open(output_path, 'w') as f:
            json.dump(split_captions, f, indent=4)
        print(f'Saved {len(split_captions)} captions to {output_path}')
23
+
24
# Script entry point: split the training-caption file into per-worker chunks.
if __name__ == "__main__":
    input_path = '/root/captions/train.json'
    output_dir = '/root/captions/'
    select_and_split_captions(input_path, output_dir)