Upload 38 files
Browse files- .gitattributes +3 -0
- 1_batch_xml2abc.py +54 -0
- 2_data_preprocess.py +181 -0
- 3_batch_abc2xml.py +56 -0
- LICENSE.txt +21 -0
- README (1).md +31 -0
- README (2).md +293 -0
- README.md +44 -0
- abc2xml (1).py +0 -0
- abc2xml (2).py +0 -0
- abc2xml.py +0 -0
- config (1).py +67 -0
- config (2).py +38 -0
- config (3).py +15 -0
- config (4).py +18 -0
- config (5).py +39 -0
- config.py +35 -0
- data.py +136 -0
- demo.ipynb +821 -0
- demo.py +236 -0
- extract_clamp2.py +194 -0
- illustration.png +3 -0
- illustration_online.png +3 -0
- inference (1).py +271 -0
- inference.py +318 -0
- notagen.png +3 -0
- prompts.txt +112 -0
- requirements (6).txt +7 -0
- statistics.py +68 -0
- train-gen (1).py +325 -0
- train-gen.py +374 -0
- train.py +186 -0
- utils (1).py +483 -0
- utils (2).py +423 -0
- utils (3).py +423 -0
- utils (4).py +423 -0
- utils (5).py +421 -0
- utils.py +406 -0
- xml2abc.py +1609 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
illustration_online.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
illustration.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
notagen.png filter=lfs diff=lfs merge=lfs -text
|
1_batch_xml2abc.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ORI_FOLDER = "" # Replace with the path to your folder containing XML (.xml, .mxl, .musicxml) files
|
| 2 |
+
DES_FOLDER = "" # The script will convert the musicxml files and output standard abc notation files to this folder
|
| 3 |
+
|
| 4 |
+
import os
|
| 5 |
+
import math
|
| 6 |
+
import random
|
| 7 |
+
import subprocess
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
from multiprocessing import Pool
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def convert_xml2abc(file_list):
    """Convert a batch of MusicXML files (.xml/.mxl/.musicxml) to ABC notation.

    Each file is piped through the external ``xml2abc.py`` script (expected in
    the current working directory); the ABC text captured from stdout is
    written to DES_FOLDER with the same base name and an ``.abc`` extension.
    Files that yield empty output or raise an exception are appended to
    ``logs/xml2abc_error_log.txt``.

    Intended as a ``multiprocessing.Pool`` worker: takes a list of file paths,
    returns nothing.
    """
    # Hoisted out of the loop: the destination only needs creating once.
    os.makedirs(DES_FOLDER, exist_ok=True)
    # Pass the input path as a separate argv element (shell=False) so file
    # names containing spaces, quotes or shell metacharacters are handled
    # safely -- the original string-built shell command was injection-prone.
    base_cmd = ['python', 'xml2abc.py', '-d', '8', '-c', '6', '-x']
    for file in tqdm(file_list):
        filename = os.path.basename(file)
        try:
            p = subprocess.Popen(base_cmd + [file], stdout=subprocess.PIPE)
            output = p.communicate()[0].decode('utf-8')

            if output == '':
                # Empty stdout means the converter failed silently; log and skip.
                with open("logs/xml2abc_error_log.txt", "a", encoding="utf-8") as f:
                    f.write(file + '\n')
                continue

            out_path = os.path.join(DES_FOLDER, filename.rsplit('.', 1)[0] + '.abc')
            with open(out_path, 'w', encoding='utf-8') as f:
                f.write(output)
        except Exception as e:
            # Record the failing file together with the exception message.
            with open("logs/xml2abc_error_log.txt", "a", encoding="utf-8") as f:
                f.write(file + ' ' + str(e) + '\n')
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
if __name__ == '__main__':
    os.makedirs("logs", exist_ok=True)

    # Recursively collect every MusicXML-family file under ORI_FOLDER.
    file_list = []
    for root, dirs, files in os.walk(os.path.abspath(ORI_FOLDER)):
        for file in files:
            if file.endswith((".mxl", ".xml", ".musicxml")):
                # Normalize separators so paths are uniform across platforms.
                file_list.append(os.path.join(root, file).replace("\\", "/"))

    # Shuffle so each worker receives a roughly even mix of file sizes.
    random.shuffle(file_list)
    # os.cpu_count() may return None on some platforms; fall back to 1.
    num_processes = os.cpu_count() or 1
    # Round-robin split: one sub-list per worker process.
    file_lists = [file_list[i::num_processes] for i in range(num_processes)]

    # Context manager guarantees the pool is terminated on exit.
    with Pool(processes=num_processes) as pool:
        pool.map(convert_xml2abc, file_lists)
|
2_data_preprocess.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ORI_FOLDER = '' # Replace with the path to your folder containing standard ABC notation files
|
| 2 |
+
INTERLEAVED_FOLDER = '' # Output interleaved ABC notation files to this folder
|
| 3 |
+
AUGMENTED_FOLDER = '' # Output key-augmented and rest-omitted ABC notation files to this folder
|
| 4 |
+
EVAL_SPLIT = 0.1 # The ratio of eval data
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import re
|
| 8 |
+
import json
|
| 9 |
+
import shutil
|
| 10 |
+
import random
|
| 11 |
+
from tqdm import tqdm
|
| 12 |
+
from abctoolkit.utils import (
|
| 13 |
+
remove_information_field,
|
| 14 |
+
remove_bar_no_annotations,
|
| 15 |
+
Quote_re,
|
| 16 |
+
Barlines,
|
| 17 |
+
extract_metadata_and_parts,
|
| 18 |
+
extract_global_and_local_metadata,
|
| 19 |
+
extract_barline_and_bartext_dict)
|
| 20 |
+
from abctoolkit.convert import unidecode_abc_lines
|
| 21 |
+
from abctoolkit.rotate import rotate_abc
|
| 22 |
+
from abctoolkit.check import check_alignment_unrotated
|
| 23 |
+
from abctoolkit.transpose import Key2index, transpose_an_abc_text
|
| 24 |
+
|
| 25 |
+
os.makedirs(INTERLEAVED_FOLDER, exist_ok=True)
|
| 26 |
+
os.makedirs(AUGMENTED_FOLDER, exist_ok=True)
|
| 27 |
+
for key in Key2index.keys():
|
| 28 |
+
key_folder = os.path.join(AUGMENTED_FOLDER, key)
|
| 29 |
+
os.makedirs(key_folder, exist_ok=True)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def abc_preprocess_pipeline(abc_path):
    """Clean, validate, rotate, and key-augment one standard ABC notation file.

    Pipeline, in order: drop blank lines, transliterate to ASCII, strip
    unwanted information fields, remove bar-number annotations and quoted
    annotations containing barlines, verify bar alignment across voices,
    write an interleaved (rotated) copy to INTERLEAVED_FOLDER, then for every
    key in Key2index write a transposed, rest-reduced copy to
    AUGMENTED_FOLDER/<key>/.

    Parameters
    ----------
    abc_path : str
        Path to a standard (non-interleaved) ABC notation file.

    Returns
    -------
    tuple
        (abc_name, ori_key): the file's base name without extension, and the
        original key from the metadata ('C' substituted when 'K: none').

    Raises
    ------
    Exception
        If voices have unequal bar counts or the alignment check fails.
    """

    with open(abc_path, 'r', encoding='utf-8') as f:
        abc_lines = f.readlines()

    # delete blank lines
    abc_lines = [line for line in abc_lines if line.strip() != '']

    # unidecode: transliterate non-ASCII characters (abctoolkit helper)
    abc_lines = unidecode_abc_lines(abc_lines)

    # clean information fields: drop title/composer/lyrics/transcription/MIDI lines
    abc_lines = remove_information_field(abc_lines=abc_lines, info_fields=['X:', 'T:', 'C:', 'W:', 'w:', 'Z:', '%%MIDI'])

    # delete bar number annotations
    abc_lines = remove_bar_no_annotations(abc_lines)

    # delete escaped quotes (\") from music lines; header ("X:"-style) and
    # comment ("%") lines are left untouched
    for i, line in enumerate(abc_lines):
        if re.search(r'^[A-Za-z]:', line) or line.startswith('%'):
            continue
        else:
            if r'\"' in line:
                abc_lines[i] = abc_lines[i].replace(r'\"', '')

    # delete quoted text annotations that contain barline symbols
    # (they would confuse bar-based parsing downstream)
    for i, line in enumerate(abc_lines):
        quote_contents = re.findall(Quote_re, line)
        for quote_content in quote_contents:
            for barline in Barlines:
                if barline in quote_content:
                    line = line.replace(quote_content, '')
                    abc_lines[i] = line

    # check bar alignment: every voice must contain the same number of bars
    try:
        _, bar_no_equal_flag, _ = check_alignment_unrotated(abc_lines)
        if not bar_no_equal_flag:
            print(abc_path, 'Unequal bar number')
            raise Exception
    except:
        # NOTE(review): re-raising a bare Exception loses the original error
        # details; callers only learn that pre-processing failed.
        raise Exception

    # deal with text annotations: remove too long text annotations; remove consecutive non-alphabet/number characters
    for i, line in enumerate(abc_lines):
        quote_matches = re.findall(r'"[^"]*"', line)
        for match in quote_matches:
            if match == '""':
                line = line.replace(match, '')
            # '^'/'_' right after the opening quote marks a text annotation
            # (above/below the staff) rather than a chord symbol
            if match[1] in ['^', '_']:
                sub_string = match
                # collapse runs of repeated non-alphanumeric characters to one
                pattern = r'([^a-zA-Z0-9])\1+'
                sub_string = re.sub(pattern, r'\1', sub_string)
                if len(sub_string) <= 40:
                    line = line.replace(match, sub_string)
                else:
                    # over-long annotations are dropped entirely
                    line = line.replace(match, '')
        abc_lines[i] = line

    # base name of the source file, without directory or extension
    abc_name = os.path.splitext(os.path.split(abc_path)[-1])[0]

    # transpose: read the original key from the global metadata
    metadata_lines, part_text_dict = extract_metadata_and_parts(abc_lines)
    global_metadata_dict, local_metadata_dict = extract_global_and_local_metadata(metadata_lines)
    if global_metadata_dict['K'][0] == 'none':
        # treat 'K: none' as C so transposition has a defined starting key
        global_metadata_dict['K'][0] = 'C'
    ori_key = global_metadata_dict['K'][0]

    # write the interleaved (voice-rotated) version of the cleaned score
    interleaved_abc = rotate_abc(abc_lines)
    interleaved_path = os.path.join(INTERLEAVED_FOLDER, abc_name + '.abc')
    with open(interleaved_path, 'w') as w:
        w.writelines(interleaved_abc)

    # one augmented copy per target key
    for key in Key2index.keys():
        transposed_abc_text = transpose_an_abc_text(abc_lines, key)
        transposed_abc_lines = transposed_abc_text.split('\n')
        transposed_abc_lines = list(filter(None, transposed_abc_lines))
        transposed_abc_lines = [line + '\n' for line in transposed_abc_lines]

        # rest reduction: rebuild the score bar by bar, keeping only voices
        # that actually play notes in each bar
        metadata_lines, prefix_dict, left_barline_dict, bar_text_dict, right_barline_dict = \
            extract_barline_and_bartext_dict(transposed_abc_lines)
        reduced_abc_lines = metadata_lines
        # bar_text_dict['V:1'] gives the bar count; presumably all voices have
        # the same count after the alignment check above
        for i in range(len(bar_text_dict['V:1'])):
            line = ''
            for symbol in prefix_dict.keys():
                # a bar is "valid" if it contains any pitch letter, i.e. an
                # alphabetic char that is not a rest (Z/z) or spacer (X/x)
                valid_flag = False
                for char in bar_text_dict[symbol][i]:
                    if char.isalpha() and not char in ['Z', 'z', 'X', 'x']:
                        valid_flag = True
                        break
                if valid_flag:
                    if i == 0:
                        # first bar carries the voice prefix and left barline
                        part_patch = '[' + symbol + ']' + prefix_dict[symbol] + left_barline_dict[symbol][0] + bar_text_dict[symbol][0] + right_barline_dict[symbol][0]
                    else:
                        part_patch = '[' + symbol + ']' + bar_text_dict[symbol][i] + right_barline_dict[symbol][i]
                    line += part_patch
            line += '\n'
            reduced_abc_lines.append(line)

        reduced_abc_name = abc_name + '_' + key
        reduced_abc_path = os.path.join(AUGMENTED_FOLDER, key, reduced_abc_name + '.abc')

        with open(reduced_abc_path, 'w', encoding='utf-8') as w:
            w.writelines(reduced_abc_lines)

    return abc_name, ori_key
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
if __name__ == '__main__':

    # Run the pre-processing pipeline over every file in ORI_FOLDER, then
    # split the successfully processed entries into train/eval index files.
    data = []
    file_list = os.listdir(ORI_FOLDER)
    for file in tqdm(file_list):
        ori_abc_path = os.path.join(ORI_FOLDER, file)
        try:
            abc_name, ori_key = abc_preprocess_pipeline(ori_abc_path)
        except:
            # best-effort batch: a failing file is reported and skipped
            print(ori_abc_path, 'failed to pre-process.')
            continue

        # NOTE(review): 'path' records the augmented base path without a key
        # suffix or .abc extension -- presumably the training loader appends
        # these; confirm against the data loader.
        data.append({
            'path': os.path.join(AUGMENTED_FOLDER, abc_name),
            'key': ori_key
        })

    # Shuffle before splitting so eval is a random EVAL_SPLIT fraction.
    random.shuffle(data)
    eval_data = data[ : int(EVAL_SPLIT * len(data))]
    train_data = data[int(EVAL_SPLIT * len(data)) : ]

    # Index files are written next to (named after) the augmented folder.
    data_index_path = AUGMENTED_FOLDER + '.jsonl'
    eval_index_path = AUGMENTED_FOLDER + '_eval.jsonl'
    train_index_path = AUGMENTED_FOLDER + '_train.jsonl'


    # One JSON object per line (JSONL) for each of the three index files.
    with open(data_index_path, 'w', encoding='utf-8') as w:
        for d in data:
            w.write(json.dumps(d) + '\n')
    with open(eval_index_path, 'w', encoding='utf-8') as w:
        for d in eval_data:
            w.write(json.dumps(d) + '\n')
    with open(train_index_path, 'w', encoding='utf-8') as w:
        for d in train_data:
            w.write(json.dumps(d) + '\n')
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
|
3_batch_abc2xml.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ORI_FOLDER = "" # Replace with the path to your folder containing standard/interleaved abc files
|
| 2 |
+
DES_FOLDER = "" # The script will convert the abc files and output musicxml files to this folder
|
| 3 |
+
|
| 4 |
+
import os
|
| 5 |
+
import math
|
| 6 |
+
import random
|
| 7 |
+
import subprocess
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
from multiprocessing import Pool
|
| 10 |
+
|
| 11 |
+
def convert_abc2xml(file_list):
    """Convert a batch of ABC notation files to MusicXML.

    Each file is piped through the external ``abc2xml.py`` script (expected in
    the current working directory); the XML captured from stdout is written to
    DES_FOLDER with the same base name and an ``.xml`` extension. Files that
    yield empty output or raise an exception are appended to
    ``logs/abc2xml_error_log.txt``.

    Intended as a ``multiprocessing.Pool`` worker: takes a list of file paths,
    returns nothing.
    """
    # Hoisted out of the loop: the destination only needs creating once.
    os.makedirs(DES_FOLDER, exist_ok=True)
    # Pass the input path as a separate argv element (shell=False) so file
    # names containing spaces, quotes or shell metacharacters are handled
    # safely -- the original string-built shell command was injection-prone.
    base_cmd = ['python', 'abc2xml.py']
    for file in tqdm(file_list):
        # os.path.basename is robust to both '/' and platform separators,
        # unlike the previous file.split('/')[-1].
        filename = os.path.basename(file)
        try:
            p = subprocess.Popen(base_cmd + [file], stdout=subprocess.PIPE)
            output = p.communicate()[0].decode('utf-8')

            if output == '':
                # Empty stdout means the converter failed silently; log and skip.
                with open("logs/abc2xml_error_log.txt", "a", encoding="utf-8") as f:
                    f.write(file + '\n')
                continue

            # Inputs are filtered to *.abc, so a '.' is always present.
            output_path = os.path.join(DES_FOLDER, filename.rsplit('.', 1)[0] + '.xml')
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(output)
        except Exception as e:
            # Record the failing file together with the exception message.
            with open("logs/abc2xml_error_log.txt", "a", encoding="utf-8") as f:
                f.write(file + ' ' + str(e) + '\n')
|
| 34 |
+
|
| 35 |
+
if __name__ == '__main__':
    os.makedirs("logs", exist_ok=True)

    # Recursively collect every .abc file under ORI_FOLDER.
    file_list = []
    for root, dirs, files in os.walk(ORI_FOLDER):
        for file in files:
            if not file.endswith(".abc"):
                continue
            # Normalize separators so paths are uniform across platforms.
            file_list.append(os.path.join(root, file).replace("\\", "/"))

    # Shuffle so each worker receives a roughly even mix of file sizes.
    random.shuffle(file_list)
    # os.cpu_count() may return None on some platforms; fall back to 1.
    num_processes = os.cpu_count() or 1
    # Contiguous split into one chunk per worker (same partitioning as the
    # original floor-based arithmetic, expressed directly).
    file_lists = []
    for i in range(num_processes):
        start_idx = i * len(file_list) // num_processes
        end_idx = (i + 1) * len(file_list) // num_processes
        file_lists.append(file_list[start_idx:end_idx])

    # Context manager ensures the pool is properly terminated (the original
    # created a bare Pool and never closed/joined it).
    with Pool(processes=num_processes) as pool:
        pool.map(convert_abc2xml, file_lists)
|
LICENSE.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 Yashan Wang
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README (1).md
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## Local Gradio Demo
|
| 2 |
+
|
| 3 |
+
1. Set up the environment:
|
| 4 |
+
|
| 5 |
+
```
|
| 6 |
+
conda create --name notagen python=3.10
|
| 7 |
+
conda activate notagen
|
| 8 |
+
conda install pytorch==2.3.0 pytorch-cuda=11.8 -c pytorch -c nvidia
|
| 9 |
+
pip install accelerate
|
| 10 |
+
pip install optimum
|
| 11 |
+
pip install -r requirements.txt
|
| 12 |
+
```
|
| 13 |
+
|
| 14 |
+
2. Download [NotaGen-X](https://huggingface.co/ElectricAlexis/NotaGen/blob/main/weights_notagenx_p_size_16_p_length_1024_p_layers_20_h_size_1280.pth) and put it under ```gradio/```.
|
| 15 |
+
|
| 16 |
+
3. run ```demo.py```:
|
| 17 |
+
|
| 18 |
+
```
|
| 19 |
+
cd gradio/
|
| 20 |
+
python demo.py
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
4. You can then view the demo page at http://0.0.0.0:7861.
|
| 24 |
+
|
| 25 |
+
<p align="center">
|
| 26 |
+
<img src="illustration.png" alt="NotaGen Gradio Demo">
|
| 27 |
+
</p>
|
| 28 |
+
|
| 29 |
+
You can choose period, composer, and instrumentation as a prompt combination for NotaGen's conditional generation. After generation completes, you can save the ABC notation and MusicXML files locally.
|
| 30 |
+
|
| 31 |
+
Unfortunately, the available prompt combinations are currently limited to 112, constrained by the number of musical pieces under each prompt in the fine-tuning dataset. We hope to expand the range and forms of prompts in the future.
|
README (2).md
ADDED
|
@@ -0,0 +1,293 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🎵 NotaGen: Advancing Musicality in Symbolic Music Generation with Large Language Model Training Paradigms
|
| 2 |
+
|
| 3 |
+
<p align="center">
|
| 4 |
+
<!-- ArXiv -->
|
| 5 |
+
<a href="https://arxiv.org/abs/2502.18008">
|
| 6 |
+
<img src="https://img.shields.io/badge/NotaGen_Paper-ArXiv-%23B31B1B?logo=arxiv&logoColor=white" alt="Paper">
|
| 7 |
+
</a>
|
| 8 |
+
|
| 9 |
+
<!-- HuggingFace -->
|
| 10 |
+
<a href="https://huggingface.co/ElectricAlexis/NotaGen">
|
| 11 |
+
<img src="https://img.shields.io/badge/NotaGen_Weights-HuggingFace-%23FFD21F?logo=huggingface&logoColor=white" alt="Weights">
|
| 12 |
+
</a>
|
| 13 |
+
|
| 14 |
+
<!-- HuggingFace Space -->
|
| 15 |
+
<a href="https://huggingface.co/spaces/ElectricAlexis/NotaGen">
|
| 16 |
+
<img src="https://img.shields.io/badge/NotaGen_Space-Huggingface-✨️?logo=huggingface&logoColor=white" alt="Space">
|
| 17 |
+
</a>
|
| 18 |
+
|
| 19 |
+
<!-- Web Demo -->
|
| 20 |
+
<a href="https://electricalexis.github.io/notagen-demo/">
|
| 21 |
+
<img src="https://img.shields.io/badge/NotaGen_Demo-Web-%23007ACC?logo=google-chrome&logoColor=white" alt="Demo">
|
| 22 |
+
</a>
|
| 23 |
+
</p>
|
| 24 |
+
|
| 25 |
+
<p align="center">
|
| 26 |
+
<img src="notagen.png" alt="NotaGen" width="50%">
|
| 27 |
+
</p>
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
## 📖 Overview
|
| 31 |
+
**NotaGen** is a symbolic music generation model that explores the potential of producing **high-quality classical sheet music**. Inspired by the success of Large Language Models (LLMs), NotaGen adopts a three-stage training paradigm:
|
| 32 |
+
- 🧠 **Pre-training** on 1.6M musical pieces
|
| 33 |
+
- 🎯 **Fine-tuning** on ~9K classical compositions with `period-composer-instrumentation` prompts
|
| 34 |
+
- 🚀 **Reinforcement Learning** using our novel **CLaMP-DPO** method (no human annotations or pre-defined rewards required.)
|
| 35 |
+
|
| 36 |
+
Check our [demo page](https://electricalexis.github.io/notagen-demo/) and enjoy music composed by NotaGen!
|
| 37 |
+
|
| 38 |
+
## ⚙️ Environment Setup
|
| 39 |
+
|
| 40 |
+
```bash
|
| 41 |
+
conda create --name notagen python=3.10
|
| 42 |
+
conda activate notagen
|
| 43 |
+
conda install pytorch==2.3.0 pytorch-cuda=11.8 -c pytorch -c nvidia
|
| 44 |
+
pip install accelerate
|
| 45 |
+
pip install optimum
|
| 46 |
+
pip install -r requirements.txt
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
## 🏋️ NotaGen Model Weights
|
| 50 |
+
|
| 51 |
+
### Pre-training
|
| 52 |
+
We provide pre-trained weights of different scales:
|
| 53 |
+
| Models | Parameters | Patch-level Decoder Layers | Character-level Decoder Layers | Hidden Size | Patch Length (Context Length) |
|
| 54 |
+
| ---- | ---- | ---- | ---- | ---- | ---- |
|
| 55 |
+
| [NotaGen-small](https://huggingface.co/ElectricAlexis/NotaGen/blob/main/weights_notagen_pretrain_p_size_16_p_length_2048_p_layers_12_c_layers_3_h_size_768_lr_0.0002_batch_8.pth) | 110M | 12 | 3 | 768 | 2048 |
|
| 56 |
+
| [NotaGen-medium](https://huggingface.co/ElectricAlexis/NotaGen/blob/main/weights_notagen_pretrain_p_size_16_p_length_2048_p_layers_16_c_layers_3_h_size_1024_lr_0.0001_batch_4.pth) | 244M | 16 | 3 | 1024 | 2048 |
|
| 57 |
+
| [NotaGen-large](https://huggingface.co/ElectricAlexis/NotaGen/blob/main/weights_notagen_pretrain_p_size_16_p_length_1024_p_layers_20_c_layers_6_h_size_1280_lr_0.0001_batch_4.pth) | 516M | 20 | 6 | 1280 | 1024 |
|
| 58 |
+
|
| 59 |
+
**Notice**: The pre-trained weights cannot be used for conditional generation based on 'period-composer-instrumentation'.
|
| 60 |
+
|
| 61 |
+
### Fine-tuning
|
| 62 |
+
|
| 63 |
+
We fine-tuned NotaGen-large on a corpus of approximately 9k classical pieces. You can download the weights [here](https://huggingface.co/ElectricAlexis/NotaGen/blob/main/weights_notagen_pretrain-finetune_p_size_16_p_length_1024_p_layers_c_layers_6_20_h_size_1280_lr_1e-05_batch_1.pth).
|
| 64 |
+
|
| 65 |
+
### Reinforcement-Learning
|
| 66 |
+
|
| 67 |
+
After pre-training and fine-tuning, we optimized NotaGen-large with 3 iterations of CLaMP-DPO. You can download the weights [here](https://huggingface.co/ElectricAlexis/NotaGen/blob/main/weights_notagen_pretrain-finetune-RL3_beta_0.1_lambda_10_p_size_16_p_length_1024_p_layers_20_c_layers_6_h_size_1280_lr_1e-06_batch_1.pth).
|
| 68 |
+
|
| 69 |
+
### 🌟 NotaGen-X
|
| 70 |
+
|
| 71 |
+
Inspired by Deepseek-R1, we further optimized the training procedures of NotaGen and released a better version --- [NotaGen-X](https://huggingface.co/ElectricAlexis/NotaGen/blob/main/weights_notagenx_p_size_16_p_length_1024_p_layers_20_h_size_1280.pth). Compared to the version in the paper, NotaGen-X incorporates the following improvements:
|
| 72 |
+
|
| 73 |
+
- We introduced a post-training stage between pre-training and fine-tuning, refining the model with a classical-style subset of the pre-training dataset.
|
| 74 |
+
- We removed the key augmentation in the Fine-tune stage, making the instrument range of the generated compositions more reasonable.
|
| 75 |
+
- After RL, we utilized the resulting checkpoint to gather a new set of post-training data. Starting from the pre-trained checkpoint, we conducted another round of post-training, fine-tuning, and reinforcement learning.
|
| 76 |
+
|
| 77 |
+
If you want to add a new composer style to NotaGen-X, please refer to issue [#18](https://github.com/ElectricAlexis/NotaGen/issues/18) for more instructions :D
|
| 78 |
+
|
| 79 |
+
## 🎹 Demo
|
| 80 |
+
|
| 81 |
+
### Online Gradio Demo
|
| 82 |
+
|
| 83 |
+
We developed an [online gradio demo](https://huggingface.co/spaces/ElectricAlexis/NotaGen) on Huggingface Space for NotaGen-X. You can input **"Period-Composer-Instrumentation"** as the prompt to have NotaGen generate music, preview the audio / pdf scores, and download them :D
|
| 84 |
+
|
| 85 |
+
<p align="center">
|
| 86 |
+
<img src="gradio/illustration_online.png" alt="NotaGen Gradio Demo">
|
| 87 |
+
</p>
|
| 88 |
+
|
| 89 |
+
### Local Gradio Demo
|
| 90 |
+
|
| 91 |
+
We developed a local Gradio demo for NotaGen-X. You can input **"Period-Composer-Instrumentation"** as the prompt to have NotaGen generate music!
|
| 92 |
+
|
| 93 |
+
<p align="center">
|
| 94 |
+
<img src="gradio/illustration.png" alt="NotaGen Gradio Demo">
|
| 95 |
+
</p>
|
| 96 |
+
|
| 97 |
+
Deploying NotaGen-X inference locally may require 8GB of GPU memory. For implementation details, please view [gradio/README.md](https://github.com/ElectricAlexis/NotaGen/blob/main/gradio/README.md). We are also working on developing an online demo.
|
| 98 |
+
|
| 99 |
+
### Online Colab Notebook
|
| 100 |
+
|
| 101 |
+
Thanks for [@deeplearn-art](https://github.com/deeplearn-art/NotaGen)'s contribution of a [Google Colab notebook for NotaGen](https://colab.research.google.com/drive/1yJA1wG0fiwNeehdQxAUw56i4bTXzoVVv?usp=sharing)! You can run it and access to a Gradio public link to play with this demo. 🤩
|
| 102 |
+
|
| 103 |
+
### ComfyUI
|
| 104 |
+
|
| 105 |
+
Thanks for [@billwuhao](https://github.com/billwuhao/ComfyUI_NotaGen)'s contribution of [a ComfyUI node for NotaGen](https://github.com/billwuhao/ComfyUI_NotaGen)! It can automatically convert generated .abc to .xml, .mp3, and .png formats. You can listen to the generated music and see the sheet music too! Please visit the [repository page](https://github.com/billwuhao/ComfyUI_NotaGen) for more information. 🤩
|
| 106 |
+
|
| 107 |
+
<p align="center">
|
| 108 |
+
<img src="https://github.com/billwuhao/ComfyUI_NotaGen/blob/master/images/2025-03-10_06-24-03.png" alt="NotaGen ComfyUI">
|
| 109 |
+
</p>
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
## 🛠️ Data Pre-processing & Post-processing
|
| 113 |
+
|
| 114 |
+
For converting **ABC notation** files from / to **MusicXML** files, please view [data/README.md](https://github.com/ElectricAlexis/NotaGen/blob/main/data/README.md) for instructions.
|
| 115 |
+
|
| 116 |
+
To illustrate the specific data format, we provide a small dataset of **Schubert's lieder** compositions from the [OpenScore Lieder](https://github.com/OpenScore/Lieder), which includes:
|
| 117 |
+
- 🗂️ Interleaved ABC folders
|
| 118 |
+
- 🗂️ Augmented ABC folders
|
| 119 |
+
- 📄 Data index files for training and evaluation
|
| 120 |
+
|
| 121 |
+
You can download it [here](https://drive.google.com/drive/folders/1iVLkcywzXGcHFodce9nDQyEmK4UDmBtY?usp=sharing) and put it under ```data/```.
|
| 122 |
+
|
| 123 |
+
In the instructions of **Fine-tuning** and **Reinforcement Learning** below, we will use this dataset as an example of our implementation. **It won't include the "period-composer-instrumentation" conditioning**, just for showing how to adapt the pretrained NotaGen to a specific music style.
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
## 🧠 Pre-train
|
| 127 |
+
If you want to use your own data to pre-train a blank **NotaGen** model, please:
|
| 128 |
+
1. Preprocess the data and generate the data index files following the instructions in [data/README.md](https://github.com/ElectricAlexis/NotaGen/blob/main/data/README.md)
|
| 129 |
+
2. Modify the parameters in ```pretrain/config.py```
|
| 130 |
+
|
| 131 |
+
Use this command for pre-training:
|
| 132 |
+
```bash
|
| 133 |
+
cd pretrain/
|
| 134 |
+
accelerate launch --multi_gpu --mixed_precision fp16 train-gen.py
|
| 135 |
+
```
|
| 136 |
+
|
| 137 |
+
## 🎯 Fine-tune
|
| 138 |
+
|
| 139 |
+
Here we give an example on fine-tuning **NotaGen-large** with the **Schubert's lieder** data mentioned above.
|
| 140 |
+
|
| 141 |
+
**Notice:** The use of **NotaGen-large** requires at least **24GB of GPU memory** for training and inference. Alternatively, you may use **NotaGen-small** or **NotaGen-medium** and change the configuration of models in ```finetune/config.py```.
|
| 142 |
+
|
| 143 |
+
### Configuration
|
| 144 |
+
- In ```finetune/config.py```:
|
| 145 |
+
- Modify the ```DATA_TRAIN_INDEX_PATH``` and ```DATA_EVAL_INDEX_PATH```:
|
| 146 |
+
```python
|
| 147 |
+
# Configuration for the data
|
| 148 |
+
DATA_TRAIN_INDEX_PATH = "../data/schubert_augmented_train.jsonl"
|
| 149 |
+
DATA_EVAL_INDEX_PATH = "../data/schubert_augmented_eval.jsonl"
|
| 150 |
+
```
|
| 151 |
+
- Download pre-trained NotaGen weights, and modify the ```PRETRAINED_PATH```:
|
| 152 |
+
```python
|
| 153 |
+
PRETRAINED_PATH = "../pretrain/weights_notagen_pretrain_p_size_16_p_length_1024_p_layers_20_c_layers_6_h_size_1280_lr_0.0001_batch_4.pth" # Use NotaGen-large
|
| 154 |
+
```
|
| 155 |
+
- ```EXP_TAG``` is for differentiating the models. It will be integrated into the ckpt's name. Here we set it to ```schubert```.
|
| 156 |
+
- You can also modify other parameters like the learning rate.
|
| 157 |
+
|
| 158 |
+
### Execution
|
| 159 |
+
Use this command for fine-tuning:
|
| 160 |
+
```bash
|
| 161 |
+
cd finetune/
|
| 162 |
+
CUDA_VISIBLE_DEVICES=0 python train-gen.py
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
## 🚀 Reinforcement Learning (CLaMP-DPO)
|
| 166 |
+
|
| 167 |
+
Here we give an example on how to use **CLaMP-DPO** to enhance the model fine-tuned with **Schubert's lieder** data.
|
| 168 |
+
|
| 169 |
+
### ⚙️ [CLaMP 2](https://github.com/sanderwood/clamp2) Setup
|
| 170 |
+
|
| 171 |
+
Download model weights and put them under the ```clamp2/```folder:
|
| 172 |
+
- [CLaMP 2 Model Weights](https://huggingface.co/sander-wood/clamp2/blob/main/weights_clamp2_h_size_768_lr_5e-05_batch_128_scale_1_t_length_128_t_model_FacebookAI_xlm-roberta-base_t_dropout_True_m3_True.pth)
|
| 173 |
+
- [M3 Model Weights](https://huggingface.co/sander-wood/clamp2/blob/main/weights_m3_p_size_64_p_length_512_t_layers_3_p_layers_12_h_size_768_lr_0.0001_batch_16_mask_0.45.pth)
|
| 174 |
+
|
| 175 |
+
### 🔍 Extract Ground Truth Features
|
| 176 |
+
Modify ```input_dir``` and ```output_dir``` in ```clamp2/extract_clamp2.py```:
|
| 177 |
+
```python
|
| 178 |
+
input_dir = '../data/schubert_interleaved' # interleaved abc folder
|
| 179 |
+
output_dir = 'feature/schubert_interleaved' # feature folder
|
| 180 |
+
```
|
| 181 |
+
Extract the features:
|
| 182 |
+
```
|
| 183 |
+
cd clamp2/
|
| 184 |
+
python extract_clamp2.py
|
| 185 |
+
```
|
| 186 |
+
|
| 187 |
+
### 🔄 CLaMP-DPO
|
| 188 |
+
|
| 189 |
+
Here we give an example of an iteration of **CLaMP-DPO** from the initial model fine-tuned on **Schubert's lieder** data.
|
| 190 |
+
|
| 191 |
+
#### 1. Inference
|
| 192 |
+
- Modify the ```INFERENCE_WEIGHTS_PATH``` to path of the fine-tuned weights and ```NUM_SAMPLES``` to generate in ```inference/config.py```:
|
| 193 |
+
```python
|
| 194 |
+
INFERENCE_WEIGHTS_PATH = '../finetune/weights_notagen_schubert_p_size_16_p_length_1024_p_layers_20_c_layers_6_h_size_1280_lr_1e-05_batch_1.pth'
|
| 195 |
+
NUM_SAMPLES = 1000
|
| 196 |
+
```
|
| 197 |
+
- Inference:
|
| 198 |
+
```
|
| 199 |
+
cd inference/
|
| 200 |
+
python inference.py
|
| 201 |
+
```
|
| 202 |
+
This will generate an ```output/```folder with two subfolders: ```original``` and ```interleaved```. The ```original/``` subdirectory stores the raw inference outputs from the model, while the ```interleaved/``` subdirectory contains data post-processed with rest measure completion, compatible with CLaMP 2. Each of these subdirectories will contain a model-specific folder, named as a combination of the model's name and its sampling parameters.
|
| 203 |
+
|
| 204 |
+
#### 2. Extract Generated Data Features
|
| 205 |
+
|
| 206 |
+
Modify ```input_dir``` and ```output_dir``` in ```clamp2/extract_clamp2.py```:
|
| 207 |
+
```python
|
| 208 |
+
input_dir = '../output/interleaved/weights_notagen_schubert_p_size_16_p_length_1024_p_layers_20_c_layers_6_h_size_1280_lr_1e-05_batch_1_k_9_p_0.9_temp_1.2' # interleaved abc folder
|
| 209 |
+
output_dir = 'feature/weights_notagen_schubert_p_size_16_p_length_1024_p_layers_20_c_layers_6_h_size_1280_lr_1e-05_batch_1_k_9_p_0.9_temp_1.2' # feature folder
|
| 210 |
+
```
|
| 211 |
+
Extract the features:
|
| 212 |
+
```
|
| 213 |
+
cd clamp2/
|
| 214 |
+
python extract_clamp2.py
|
| 215 |
+
```
|
| 216 |
+
|
| 217 |
+
#### 3. Statistics on Average CLaMP 2 Score (Optional)
|
| 218 |
+
If you're interested in the **Average CLaMP 2 Score** of the current model, modify the parameters in ```clamp2/statistics.py```:
|
| 219 |
+
```python
|
| 220 |
+
gt_feature_folder = 'feature/schubert_interleaved'
|
| 221 |
+
output_feature_folder = 'feature/weights_notagen_schubert_p_size_16_p_length_1024_p_layers_20_c_layers_6_h_size_1280_lr_1e-05_batch_1_k_9_p_0.9_temp_1.2'
|
| 222 |
+
```
|
| 223 |
+
Then run this script:
|
| 224 |
+
```
|
| 225 |
+
cd clamp2/
|
| 226 |
+
python statistics.py
|
| 227 |
+
```
|
| 228 |
+
|
| 229 |
+
#### 4. Construct Preference Data
|
| 230 |
+
Modify the parameters in ```RL/data.py```:
|
| 231 |
+
```python
|
| 232 |
+
gt_feature_folder = '../clamp2/feature/schubert_interleaved'
|
| 233 |
+
output_feature_folder = '../clamp2/feature/weights_notagen_schubert_p_size_16_p_length_1024_p_layers_20_c_layers_6_h_size_1280_lr_1e-05_batch_1_k_9_p_0.9_temp_1.2'
|
| 234 |
+
output_original_abc_folder = '../output/original/weights_notagen_schubert_p_size_16_p_length_1024_p_layers_20_c_layers_6_h_size_1280_lr_1e-05_batch_1_k_9_p_0.9_temp_1.2'
|
| 235 |
+
output_interleaved_abc_folder = '../output/interleaved/weights_notagen_schubert_p_size_16_p_length_1024_p_layers_20_c_layers_6_h_size_1280_lr_1e-05_batch_1_k_9_p_0.9_temp_1.2'
|
| 236 |
+
data_index_path = 'schubert_RL1.json' # Data for the first iteration of RL
|
| 237 |
+
data_select_portion = 0.1
|
| 238 |
+
```
|
| 239 |
+
In this script, the **CLaMP 2 Score** of each generated piece will be calculated and sorted. The portion of data in the chosen and rejected sets is determined by ```data_select_portion```. Additionally, there are also three rules to exclude problematic sheets from the chosen set:
|
| 240 |
+
- Sheets with duration alignment problems are excluded;
|
| 241 |
+
- Sheets that may plagiarize from ground truth data (ld_sim>0.95) are excluded;
|
| 242 |
+
- Sheets where staves for the same instrument are not grouped together are excluded.
|
| 243 |
+
|
| 244 |
+
The preference data file will be named ```data_index_path```, and it records the file paths in the chosen and rejected sets.
|
| 245 |
+
|
| 246 |
+
Run this script:
|
| 247 |
+
```
|
| 248 |
+
cd RL/
|
| 249 |
+
python data.py
|
| 250 |
+
```
|
| 251 |
+
|
| 252 |
+
#### 5. DPO Training
|
| 253 |
+
|
| 254 |
+
Modify the parameters in ```RL/config.py```:
|
| 255 |
+
```python
|
| 256 |
+
DATA_INDEX_PATH = 'schubert_RL1.json' # Preference data path
|
| 257 |
+
PRETRAINED_PATH = '../finetune/weights_notagen_schubert_p_size_16_p_length_1024_p_layers_20_c_layers_6_h_size_1280_lr_1e-05_batch_1.pth' # The model to go through DPO optimization
|
| 258 |
+
EXP_TAG = 'schubert-RL1' # Model tag for differentiation
|
| 259 |
+
```
|
| 260 |
+
You can also modify other parameters like ```OPTIMIZATION_STEPS``` and DPO hyper-parameters.
|
| 261 |
+
|
| 262 |
+
Run this script:
|
| 263 |
+
```
|
| 264 |
+
cd RL/
|
| 265 |
+
CUDA_VISIBLE_DEVICES=0 python train.py
|
| 266 |
+
```
|
| 267 |
+
After training, a model named ```weights_notagen_schubert-RL1_beta_0.1_lambda_10_p_size_16_p_length_1024_p_layers_20_c_layers_6_h_size_1280_lr_1e-06.pth``` will be saved under ```RL/```. For the second round of CLaMP-DPO, please go back to the first inference stage and let the new model generate pieces.
|
| 268 |
+
|
| 269 |
+
For this small experiment on **Schubert's lieder** data, we post our **Average CLaMP 2 Score** here for the fine-tuned model and models after each iteration of CLaMP-DPO, as a reference:
|
| 270 |
+
|
| 271 |
+
| CLaMP-DPO Iteration (K) | Average CLaMP 2 Score |
|
| 272 |
+
| ---- | ---- |
|
| 273 |
+
| 0 (fine-tuned) | 0.324 |
|
| 274 |
+
| 1 | 0.579 |
|
| 275 |
+
| 2 | 0.778 |
|
| 276 |
+
|
| 277 |
+
If you are interested in this method, have a try on your own style-specific dataset :D
|
| 278 |
+
|
| 279 |
+
## 📚 Citation
|
| 280 |
+
|
| 281 |
+
If you find **NotaGen** or **CLaMP-DPO** useful in your work, please cite our paper.
|
| 282 |
+
|
| 283 |
+
```bibtex
|
| 284 |
+
@misc{wang2025notagenadvancingmusicalitysymbolic,
|
| 285 |
+
title={NotaGen: Advancing Musicality in Symbolic Music Generation with Large Language Model Training Paradigms},
|
| 286 |
+
author={Yashan Wang and Shangda Wu and Jianhuai Hu and Xingjian Du and Yueqi Peng and Yongxin Huang and Shuai Fan and Xiaobing Li and Feng Yu and Maosong Sun},
|
| 287 |
+
year={2025},
|
| 288 |
+
eprint={2502.18008},
|
| 289 |
+
archivePrefix={arXiv},
|
| 290 |
+
primaryClass={cs.SD},
|
| 291 |
+
url={https://arxiv.org/abs/2502.18008},
|
| 292 |
+
}
|
| 293 |
+
```
|
README.md
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## Data Pre-processing
|
| 2 |
+
|
| 3 |
+
### Convert from MusicXML
|
| 4 |
+
|
| 5 |
+
- Navigate to the data folder ```cd data/```
|
| 6 |
+
- Modify the ```ORI_FOLDER``` and ```DES_FOLDER``` in ```1_batch_xml2abc.py```, then run this script:
|
| 7 |
+
```
|
| 8 |
+
python 1_batch_xml2abc.py
|
| 9 |
+
```
|
| 10 |
+
This will convert the MusicXML files into standard ABC notation files.
|
| 11 |
+
- Modify the ```ORI_FOLDER```, ```INTERLEAVED_FOLDER```, ```AUGMENTED_FOLDER```, and ```EVAL_SPLIT``` in ```2_data_preprocess.py```:
|
| 12 |
+
|
| 13 |
+
```python
|
| 14 |
+
ORI_FOLDER = '' # Folder containing standard ABC notation files
|
| 15 |
+
INTERLEAVED_FOLDER = '' # Output interleaved ABC notation files that are compatible with CLaMP 2 to this folder
|
| 16 |
+
AUGMENTED_FOLDER = '' # On the basis of interleaved ABC, output key-augmented and rest-omitted files that are compatible with NotaGen to this folder
|
| 17 |
+
EVAL_SPLIT = 0.1 # Evaluation data ratio
|
| 18 |
+
```
|
| 19 |
+
then run this script:
|
| 20 |
+
```
|
| 21 |
+
python 2_data_preprocess.py
|
| 22 |
+
```
|
| 23 |
+
- The script will convert the standard ABC to interleaved ABC, which is compatible with CLaMP 2. The files will be under ```INTERLEAVED_FOLDER```.
|
| 24 |
+
|
| 25 |
+
- This script will make 15 key signature folders under the ```AUGMENTED_FOLDER```, and output interleaved ABC notation files with rest bars omitted. This is the data representation that NotaGen adopts.
|
| 26 |
+
|
| 27 |
+
- This script will also generate data index files for training NotaGen. It will randomly split train and eval sets according to the proportion ```EVAL_SPLIT``` defines. The index files will be named as ```{AUGMENTED_FOLDER}_train.jsonl``` and ```{AUGMENTED_FOLDER}_eval.jsonl```.
|
| 28 |
+
|
| 29 |
+
## Data Post-processing
|
| 30 |
+
|
| 31 |
+
### Preview Sheets in ABC Notation
|
| 32 |
+
|
| 33 |
+
We recommend [EasyABC](https://sourceforge.net/projects/easyabc/), a nice software for ABC Notation previewing, composing and editing.
|
| 34 |
+
|
| 35 |
+
It's needed to add a line "X:1" before each piece to present the score image in EasyABC :D
|
| 36 |
+
|
| 37 |
+
### Convert to MusicXML
|
| 38 |
+
|
| 39 |
+
- Go to the data folder ```cd data/```
|
| 40 |
+
- Modify the ```ORI_FOLDER``` and ```DES_FOLDER``` in ```3_batch_abc2xml.py```, then run this script:
|
| 41 |
+
```
|
| 42 |
+
python 3_batch_abc2xml.py
|
| 43 |
+
```
|
| 44 |
+
This will convert the standard/interleaved ABC notation files into MusicXML files.
|
abc2xml (1).py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
abc2xml (2).py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
abc2xml.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
config (1).py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
EVAL_SPLIT = 0.01 # Fraction of training data used for evaluation
|
| 2 |
+
WANDB_KEY = "<your_wandb_key>" # Set M3/CLaMP2_WANDB_LOG=False if no API key for Weights and Biases logging
|
| 3 |
+
|
| 4 |
+
# -------------------- Configuration for M3 Training --------------------
|
| 5 |
+
TRAIN_FOLDERS = [
|
| 6 |
+
"<path_to_training_data>" # Directory containing training data
|
| 7 |
+
]
|
| 8 |
+
|
| 9 |
+
EVAL_FOLDERS = [
|
| 10 |
+
"" # (Optional) Directory containing evaluation data
|
| 11 |
+
]
|
| 12 |
+
|
| 13 |
+
PATCH_SIZE = 64 # Size of each patch
|
| 14 |
+
PATCH_LENGTH = 512 # Length of the patches
|
| 15 |
+
PATCH_NUM_LAYERS = 12 # Number of layers in the encoder
|
| 16 |
+
TOKEN_NUM_LAYERS = 3 # Number of layers in the decoder
|
| 17 |
+
M3_HIDDEN_SIZE = 768 # Size of the hidden layer
|
| 18 |
+
|
| 19 |
+
M3_NUM_EPOCH = 100 # Maximum number of epochs for training
|
| 20 |
+
M3_LEARNING_RATE = 1e-4 # Learning rate for the optimizer
|
| 21 |
+
M3_BATCH_SIZE = 16 # Batch size per GPU (single card) during training
|
| 22 |
+
M3_MASK_RATIO = 0.45 # Ratio of masked elements during training
|
| 23 |
+
M3_DETERMINISTIC = True # Ensures deterministic results with random seeds
|
| 24 |
+
M3_WANDB_LOG = True # Enable logging to Weights and Biases
|
| 25 |
+
M3_LOAD_CKPT = True # Load model weights from a checkpoint if available
|
| 26 |
+
|
| 27 |
+
M3_WEIGHTS_PATH = (
|
| 28 |
+
"weights_m3_p_size_" + str(PATCH_SIZE) +
|
| 29 |
+
"_p_length_" + str(PATCH_LENGTH) +
|
| 30 |
+
"_t_layers_" + str(TOKEN_NUM_LAYERS) +
|
| 31 |
+
"_p_layers_" + str(PATCH_NUM_LAYERS) +
|
| 32 |
+
"_h_size_" + str(M3_HIDDEN_SIZE) +
|
| 33 |
+
"_lr_" + str(M3_LEARNING_RATE) +
|
| 34 |
+
"_batch_" + str(M3_BATCH_SIZE) +
|
| 35 |
+
"_mask_" + str(M3_MASK_RATIO) + ".pth"
|
| 36 |
+
) # Path to store the model weights
|
| 37 |
+
M3_LOGS_PATH = M3_WEIGHTS_PATH.replace("weights", "logs").replace("pth", "txt") # Path to save training logs
|
| 38 |
+
|
| 39 |
+
# -------------------- Configuration for CLaMP2 Training ----------------
|
| 40 |
+
TRAIN_JSONL = "<path_to_training_jsonl>" # Path to the JSONL file with training data
|
| 41 |
+
EVAL_JSONL = "" # (Optional) Path to the JSONL file with evaluation data
|
| 42 |
+
|
| 43 |
+
CLAMP2_HIDDEN_SIZE = 768 # Size of the hidden layer
|
| 44 |
+
TEXT_MODEL_NAME = "FacebookAI/xlm-roberta-base" # Name of the pre-trained text model
|
| 45 |
+
|
| 46 |
+
CLAMP2_NUM_EPOCH = 100 # Maximum number of epochs for training
|
| 47 |
+
CLAMP2_LEARNING_RATE = 5e-5 # Learning rate for the optimizer
|
| 48 |
+
CLAMP2_BATCH_SIZE = 128 # Batch size per GPU (single card) during training
|
| 49 |
+
LOGIT_SCALE = 1 # Scaling factor for contrastive loss
|
| 50 |
+
MAX_TEXT_LENGTH = 128 # Maximum allowed length for text input
|
| 51 |
+
TEXT_DROPOUT = True # Whether to apply dropout during text processing
|
| 52 |
+
CLAMP2_DETERMINISTIC = True # Ensures deterministic results with random seeds
|
| 53 |
+
CLAMP2_LOAD_M3 = True # Load weights from the M3 model
|
| 54 |
+
CLAMP2_WANDB_LOG = True # Enable logging to Weights and Biases
|
| 55 |
+
CLAMP2_LOAD_CKPT = True # Load weights from a checkpoint if available
|
| 56 |
+
|
| 57 |
+
CLAMP2_WEIGHTS_PATH = (
|
| 58 |
+
"weights_clamp2_h_size_" + str(CLAMP2_HIDDEN_SIZE) +
|
| 59 |
+
"_lr_" + str(CLAMP2_LEARNING_RATE) +
|
| 60 |
+
"_batch_" + str(CLAMP2_BATCH_SIZE) +
|
| 61 |
+
"_scale_" + str(LOGIT_SCALE) +
|
| 62 |
+
"_t_length_" + str(MAX_TEXT_LENGTH) +
|
| 63 |
+
"_t_model_" + TEXT_MODEL_NAME.replace("/", "_") +
|
| 64 |
+
"_t_dropout_" + str(TEXT_DROPOUT) +
|
| 65 |
+
"_m3_" + str(CLAMP2_LOAD_M3) + ".pth"
|
| 66 |
+
) # Path to store CLaMP2 model weights
|
| 67 |
+
CLAMP2_LOGS_PATH = CLAMP2_WEIGHTS_PATH.replace("weights", "logs").replace("pth", "txt") # Path to save training logs
|
config (2).py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
# Configuration for the data
|
| 4 |
+
DATA_TRAIN_INDEX_PATH = ""
|
| 5 |
+
DATA_EVAL_INDEX_PATH = ""
|
| 6 |
+
|
| 7 |
+
# Configuration for the model
|
| 8 |
+
PATCH_STREAM = True # Stream training / inference
|
| 9 |
+
PATCH_SIZE = 16 # Patch Size
|
| 10 |
+
PATCH_LENGTH = 1024 # Patch Length
|
| 11 |
+
CHAR_NUM_LAYERS = 6 # Number of layers in the decoder
|
| 12 |
+
PATCH_NUM_LAYERS = 20 # Number of layers in the encoder
|
| 13 |
+
HIDDEN_SIZE = 1280 # Hidden Size
|
| 14 |
+
|
| 15 |
+
# Configuration for the training
|
| 16 |
+
BATCH_SIZE = 1
|
| 17 |
+
LEARNING_RATE = 1e-5
|
| 18 |
+
NUM_EPOCHS = 64 # Number of epochs to train for (if early stopping doesn't intervene)
|
| 19 |
+
ACCUMULATION_STEPS = 1 # Accumulation steps to simulate large batch size
|
| 20 |
+
PATCH_SAMPLING_BATCH_SIZE = 0 # Batch size for patch during training, 0 for full context
|
| 21 |
+
LOAD_FROM_CHECKPOINT = False # Whether to load weights from a checkpoint
|
| 22 |
+
WANDB_LOGGING = False # Whether to log to wandb
|
| 23 |
+
WANDB_KEY = '<your_wandb_key>'
|
| 24 |
+
|
| 25 |
+
PRETRAINED_PATH = "" # Path of pretrained weights
|
| 26 |
+
EXP_TAG = '' # Experiment tag for name differentiation
|
| 27 |
+
NAME = EXP_TAG + \
|
| 28 |
+
"_p_size_" + str(PATCH_SIZE) + \
|
| 29 |
+
"_p_length_" + str(PATCH_LENGTH) + \
|
| 30 |
+
"_p_layers_" + str(PATCH_NUM_LAYERS) + \
|
| 31 |
+
"_c_layers_" + str(CHAR_NUM_LAYERS) + \
|
| 32 |
+
"_h_size_" + str(HIDDEN_SIZE) + \
|
| 33 |
+
"_lr_" + str(LEARNING_RATE) + \
|
| 34 |
+
"_batch_" + str(BATCH_SIZE)
|
| 35 |
+
|
| 36 |
+
WEIGHTS_PATH = "weights_notagen_" + NAME + ".pth" # Path to save weights
|
| 37 |
+
LOGS_PATH = "logs_notagen_" + NAME + ".txt" # Path to save logs
|
| 38 |
+
WANDB_NAME = NAME
|
config (3).py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
# Configurations for inference
|
| 4 |
+
INFERENCE_WEIGHTS_PATH = 'weights_notagenx_p_size_16_p_length_1024_p_layers_20_h_size_1280.pth' # Path to weights for inference
|
| 5 |
+
TOP_K = 9 # Top k for sampling
|
| 6 |
+
TOP_P = 0.9 # Top p for sampling
|
| 7 |
+
TEMPERATURE = 1.2 # Temperature for sampling
|
| 8 |
+
|
| 9 |
+
# Configurations for model
|
| 10 |
+
PATCH_STREAM = True # Stream training / inference
|
| 11 |
+
PATCH_SIZE = 16 # Patch Size
|
| 12 |
+
PATCH_LENGTH = 1024 # Patch Length
|
| 13 |
+
CHAR_NUM_LAYERS = 6 # Number of layers in the decoder
|
| 14 |
+
PATCH_NUM_LAYERS = 20 # Number of layers in the encoder
|
| 15 |
+
HIDDEN_SIZE = 1280 # Hidden Size
|
config (4).py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
# Configurations for inference
|
| 4 |
+
INFERENCE_WEIGHTS_PATH = '' # Path to weights for inference
|
| 5 |
+
NUM_SAMPLES = 1000 # Number of samples to generate (only for generate mode)
|
| 6 |
+
TOP_K = 9 # Top k for sampling
|
| 7 |
+
TOP_P = 0.9 # Top p for sampling
|
| 8 |
+
TEMPERATURE = 1.2 # Temperature for sampling
|
| 9 |
+
ORIGINAL_OUTPUT_FOLDER = os.path.join('../output/original', os.path.splitext(os.path.split(INFERENCE_WEIGHTS_PATH)[-1])[0] + '_k_' + str(TOP_K) + '_p_' + str(TOP_P) + '_temp_' + str(TEMPERATURE))
|
| 10 |
+
INTERLEAVED_OUTPUT_FOLDER = os.path.join('../output/interleaved', os.path.splitext(os.path.split(INFERENCE_WEIGHTS_PATH)[-1])[0] + '_k_' + str(TOP_K) + '_p_' + str(TOP_P) + '_temp_' + str(TEMPERATURE))
|
| 11 |
+
|
| 12 |
+
# Configurations for model
|
| 13 |
+
PATCH_STREAM = True # Stream training / inference
|
| 14 |
+
PATCH_SIZE = 16 # Patch Size
|
| 15 |
+
PATCH_LENGTH = 1024 # Patch Length
|
| 16 |
+
CHAR_NUM_LAYERS = 6 # Number of layers in the decoder
|
| 17 |
+
PATCH_NUM_LAYERS = 20 # Number of layers in the encoder
|
| 18 |
+
HIDDEN_SIZE = 1280 # Hidden Size
|
config (5).py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
# Configuration for the data
|
| 4 |
+
DATA_TRAIN_INDEX_PATH = ""
|
| 5 |
+
DATA_EVAL_INDEX_PATH = ""
|
| 6 |
+
|
| 7 |
+
# Configuration for the model
|
| 8 |
+
PATCH_STREAM = True
|
| 9 |
+
PATCH_SIZE = 16 # Patch Size
|
| 10 |
+
PATCH_LENGTH = 2048 # Patch Length
|
| 11 |
+
CHAR_NUM_LAYERS = 3 # Number of layers in the decoder
|
| 12 |
+
PATCH_NUM_LAYERS = 12 # Number of layers in the encoder
|
| 13 |
+
HIDDEN_SIZE = 768 # Hidden Size
|
| 14 |
+
|
| 15 |
+
# Configuration for the training
|
| 16 |
+
BATCH_SIZE = 4
|
| 17 |
+
LEARNING_RATE = 1e-4
|
| 18 |
+
NUM_EPOCHS = 128 # Number of epochs to train for (if early stopping doesn't intervene)
|
| 19 |
+
ACCUMULATION_STEPS = 1 # Accumulation steps to simulate large batch size
|
| 20 |
+
PATCH_SAMPLING_BATCH_SIZE = 0 # Batch size for patch during training, 0 for full context
|
| 21 |
+
LOAD_FROM_CHECKPOINT = False # Whether to load weights from a checkpoint
|
| 22 |
+
WANDB_LOGGING = False # Whether to log to wandb
|
| 23 |
+
WANDB_KEY = '<your_wandb_key>'
|
| 24 |
+
|
| 25 |
+
EXP_TAG = 'pretrain' # Experiment tag for differentiation
|
| 26 |
+
NAME = EXP_TAG + \
|
| 27 |
+
"_p_size_" + str(PATCH_SIZE) + \
|
| 28 |
+
"_p_length_" + str(PATCH_LENGTH) + \
|
| 29 |
+
"_p_layers_" + str(PATCH_NUM_LAYERS) + \
|
| 30 |
+
"_c_layers_" + str(CHAR_NUM_LAYERS) + \
|
| 31 |
+
"_h_size_" + str(HIDDEN_SIZE) + \
|
| 32 |
+
"_lr_" + str(LEARNING_RATE) + \
|
| 33 |
+
"_batch_" + str(BATCH_SIZE)
|
| 34 |
+
|
| 35 |
+
WEIGHTS_PATH = "weights_notagen_" + NAME + ".pth" # Path to save weights
|
| 36 |
+
LOGS_PATH = "logs_notagen_" + NAME + ".txt" # Path to save logs
|
| 37 |
+
WANDB_NAME = NAME
|
| 38 |
+
|
| 39 |
+
|
config.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
# Configuration for the data
|
| 4 |
+
DATA_INDEX_PATH = ''
|
| 5 |
+
|
| 6 |
+
# Configuration for the model
|
| 7 |
+
PATCH_STREAM = True
|
| 8 |
+
PATCH_SIZE = 16 # Patch Size
|
| 9 |
+
PATCH_LENGTH = 1024 # Patch Length
|
| 10 |
+
CHAR_NUM_LAYERS = 6 # Number of layers in the decoder
|
| 11 |
+
PATCH_NUM_LAYERS = 20 # Number of layers in the encoder
|
| 12 |
+
HIDDEN_SIZE = 1280 # Hidden Size
|
| 13 |
+
|
| 14 |
+
# Configuration for the training
|
| 15 |
+
BETA = 0.1 # beta in DPO's objective function
|
| 16 |
+
LAMBDA = 10 # lambda in DPOP's objective function
|
| 17 |
+
LEARNING_RATE = 1e-6
|
| 18 |
+
OPTIMIZATION_STEPS = 10000 # Optimization steps for DPO
|
| 19 |
+
WANDB_LOGGING = False # Whether to log to wandb
|
| 20 |
+
WANDB_KEY = '<your_wandb_key>'
|
| 21 |
+
|
| 22 |
+
PRETRAINED_PATH = ''
|
| 23 |
+
EXP_TAG = ''
|
| 24 |
+
NAME = EXP_TAG + \
|
| 25 |
+
"_beta_" + str(BETA) + \
|
| 26 |
+
"_lambda_" + str(LAMBDA) + \
|
| 27 |
+
"_p_size_" + str(PATCH_SIZE) + \
|
| 28 |
+
"_p_length_" + str(PATCH_LENGTH) + \
|
| 29 |
+
"_p_layers_" + str(PATCH_NUM_LAYERS) + \
|
| 30 |
+
"_c_layers_" + str(CHAR_NUM_LAYERS) + \
|
| 31 |
+
"_h_size_" + str(HIDDEN_SIZE) + \
|
| 32 |
+
"_lr_" + str(LEARNING_RATE)
|
| 33 |
+
|
| 34 |
+
WEIGHTS_PATH = "weights_notagen_" + NAME + ".pth" # Path to save weights
|
| 35 |
+
WANDB_NAME = NAME
|
data.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gt_feature_folder = '../clamp2/feature/schubert_interleaved'
|
| 2 |
+
output_feature_folder = '../clamp2/feature/weights_notagen_schubert-RL2_beta_0.1_lambda_10_p_size_16_p_length_1024_p_layers_20_h_size_1280_lr_1e-06_k_9_p_0.9_temp_1.2'
|
| 3 |
+
output_original_abc_folder = '../output/original/weights_notagen_schubert-RL2_beta_0.1_lambda_10_p_size_16_p_length_1024_p_layers_20_h_size_1280_lr_1e-06_k_9_p_0.9_temp_1.2'
|
| 4 |
+
output_interleaved_abc_folder = '../output/interleaved/weights_notagen_schubert-RL2_beta_0.1_lambda_10_p_size_16_p_length_1024_p_layers_20_h_size_1280_lr_1e-06_k_9_p_0.9_temp_1.2'
|
| 5 |
+
data_index_path = 'schubert_RL3.json'
|
| 6 |
+
data_select_portion = 0.1
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
import re
|
| 10 |
+
import json
|
| 11 |
+
import random
|
| 12 |
+
import numpy as np
|
| 13 |
+
from config import *
|
| 14 |
+
from abctoolkit.check import check_alignment_rotated, check_alignment_unrotated
|
| 15 |
+
from abctoolkit.rotate import unrotate_abc
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def load_npy_files(folder_path_list):
    """Load feature vectors from a list of .npy file paths.

    The original docstring said "from a specified folder", but the function
    actually receives a list of file paths. Each file stores an array whose
    first row is the feature vector (shape (1, dim) on disk); only that first
    row is kept. Paths not ending in '.npy' are silently skipped.

    Args:
        folder_path_list: Iterable of paths to .npy feature files.

    Returns:
        List of numpy arrays, one per loaded .npy file, in input order.
    """
    return [
        np.load(file_path)[0]
        for file_path in folder_path_list
        if file_path.endswith('.npy')
    ]
|
| 29 |
+
|
| 30 |
+
def average_npy(npy_list):
    """Return the element-wise average of a list of numpy arrays (mean over axis 0)."""
    return np.asarray(npy_list).mean(axis=0)
|
| 35 |
+
|
| 36 |
+
def cosine_similarity(vec1, vec2):
    """Cosine similarity between two numpy vectors.

    NOTE(review): no guard against zero-norm inputs — a zero vector produces a
    division-by-zero warning / nan, matching the original behavior.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return np.dot(vec1, vec2) / denominator
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def generate_preference_dict():
    """Build the DPO preference set from CLaMP 2 features and write it to JSON.

    Ranks every generated piece by cosine similarity between its CLaMP 2
    feature and the average ground-truth feature, then:
      - chosen set: the highest-ranked pieces that pass three quality filters
        (bar alignment, instrument-stave grouping, no near-plagiarism);
      - rejected set: the `threshold` lowest-ranked pieces, unfiltered.
    The resulting {'chosen': [...], 'rejected': [...]} dict of original-ABC
    file paths is dumped to `data_index_path` (module-level configuration).
    """

    # Average feature of the ground-truth corpus: the reference point for ranking.
    gt_feature_paths = []
    for gt_feature_file in os.listdir(gt_feature_folder):
        gt_feature_paths.append(os.path.join(gt_feature_folder, gt_feature_file))
    gt_features = load_npy_files(gt_feature_paths)
    gt_avg_feature = average_npy(gt_features)

    # Similarity of every generated piece to the ground-truth average feature.
    output_feature_sim_dict = {}
    for file in os.listdir(output_feature_folder):
        output_feature_path = os.path.join(output_feature_folder, file)
        output_feature = np.load(output_feature_path)[0]
        sim = cosine_similarity(gt_avg_feature, output_feature)
        output_feature_sim_dict[file[:-4]] = sim  # key on the stem: strip '.npy'

    # Size of each of the chosen / rejected sets.
    threshold = int(len(output_feature_sim_dict) * data_select_portion)
    # File stems sorted by similarity, best first.
    sorted_output_files = sorted(output_feature_sim_dict.keys(), key=lambda item: output_feature_sim_dict[item], reverse=True)

    chosen_index = 0
    i = 0
    chosen_abc_paths = []
    # Walk down the ranking until `threshold` pieces pass all filters (or the list ends).
    while chosen_index < threshold and i < len(sorted_output_files):

        chosen_flag = True

        file = sorted_output_files[i]
        output_interleaved_abc_path = os.path.join(output_interleaved_abc_folder, file + '.abc')

        with open(output_interleaved_abc_path, 'r') as f:
            abc_lines = f.readlines()

        # check alignment: pieces whose voices disagree on barlines / bar count /
        # bar durations — or that crash the checker — are excluded from the chosen set.
        try:
            abc_lines_unrotated = unrotate_abc(abc_lines)
            barline_equal_flag, bar_no_equal_flag, bar_dur_equal_flag = check_alignment_unrotated(abc_lines_unrotated)
            if not (barline_equal_flag and bar_no_equal_flag and bar_dur_equal_flag):
                raise Exception
        # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit — confirm intent.
        except:
            chosen_flag = False

        # check header: sheets where staves for the same instrument are not grouped together are excluded from the chosen set.
        appeared_inst = set()
        last_inst = ''
        for line in abc_lines:
            if line.startswith('V:') and 'nm=' in line:
                match = re.search(r'nm="([^"]+)"', line)
                if match:
                    inst = match.group(1)
                    # An instrument name re-appearing after a different one means its
                    # staves are split into non-adjacent groups -> reject.
                    if inst != last_inst and inst in appeared_inst:
                        chosen_flag = False
                        break
                    else:
                        last_inst = inst
                        appeared_inst.add(inst)

        # check plagiarism: sheets with sim > 0.95 to any single ground-truth piece are excluded
        # (each gt feature is re-loaded from disk per candidate).
        output_feature_path = os.path.join(output_feature_folder, file + '.npy')
        output_feature = np.load(output_feature_path)[0]
        for gt_feature_file in os.listdir(gt_feature_folder):
            gt_feature_path = os.path.join(gt_feature_folder, gt_feature_file)
            gt_feature = np.load(gt_feature_path)[0]
            sim = cosine_similarity(output_feature, gt_feature)
            if sim > 0.95:
                chosen_flag = False
                break

        if chosen_flag:
            # Preference pairs reference the *original* (raw model output) ABC files.
            original_abc_path = os.path.join(output_original_abc_folder, file + '.abc')
            chosen_abc_paths.append(original_abc_path)
            chosen_index += 1
        else:
            print(file, 'skipped')

        i += 1

    # Rejected set: the `threshold` lowest-similarity pieces (no quality filtering applied).
    rejected_abc_paths = [os.path.join(output_original_abc_folder, file + '.abc') for file in sorted_output_files[-threshold:]]
    preference_dict = {'chosen': chosen_abc_paths, 'rejected': rejected_abc_paths}

    with open(data_index_path, 'w') as w:
        json.dump(preference_dict, w, indent=4)
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
if __name__ == '__main__':
    # Entry point: build the chosen/rejected preference index and save it to data_index_path.
    generate_preference_dict()
|
| 135 |
+
|
| 136 |
+
|
demo.ipynb
ADDED
|
@@ -0,0 +1,821 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
+
"id": "6e5cf1e7-c275-4929-9c44-ec48e26a2d4d",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import os\n",
|
| 11 |
+
"import re\n",
|
| 12 |
+
"import time\n",
|
| 13 |
+
"import torch\n",
|
| 15 |
+
"import random\n",
|
| 16 |
+
"import bisect\n",
|
| 17 |
+
"import json\n",
|
| 18 |
+
"from pathlib import Path\n",
|
| 19 |
+
"from tokenizers import Tokenizer\n",
|
| 20 |
+
"from transformers import GPT2Model, GPT2LMHeadModel, GPT2Config, LlamaModel, LlamaForCausalLM, PreTrainedModel \n",
|
| 21 |
+
"from samplings import top_p_sampling, top_k_sampling, temperature_sampling\n",
|
| 22 |
+
"from abctoolkit.utils import Exclaim_re, Quote_re, SquareBracket_re, Barline_regexPattern\n",
|
| 23 |
+
"from abctoolkit.transpose import Note_list, Pitch_sign_list\n",
|
| 24 |
+
"from abctoolkit.duration import calculate_bartext_duration"
|
| 25 |
+
]
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"cell_type": "code",
|
| 29 |
+
"execution_count": null,
|
| 30 |
+
"id": "00fd2ebb-6e53-4038-af85-f9c5f02fde0e",
|
| 31 |
+
"metadata": {},
|
| 32 |
+
"outputs": [],
|
| 33 |
+
"source": [
|
| 34 |
+
"# Configurations for inference\n",
|
| 35 |
+
"INFERENCE_WEIGHTS_PATH = '../weights/weights_notagenx_p_size_16_p_length_1024_p_layers_20_h_size_1280.pth'   # Path to weights for inference\n",
|
| 36 |
+
"TOP_K = 9 # Top k for sampling\n",
|
| 37 |
+
"TOP_P = 0.9 # Top p for sampling\n",
|
| 38 |
+
"TEMPERATURE = 1.2 # Temperature for sampling\n",
|
| 39 |
+
"\n",
|
| 40 |
+
"# Configurations for model\n",
|
| 41 |
+
"PATCH_STREAM = True # Stream training / inference\n",
|
| 42 |
+
"PATCH_SIZE = 16 # Patch Size\n",
|
| 43 |
+
"PATCH_LENGTH = 1024 # Patch Length\n",
|
| 44 |
+
"CHAR_NUM_LAYERS = 6 # Number of layers in the decoder\n",
|
| 45 |
+
"PATCH_NUM_LAYERS = 20 # Number of layers in the encoder\n",
|
| 46 |
+
"HIDDEN_SIZE = 1280                              # Hidden Size\n",
|
| 46 |
+
"PATCH_SAMPLING_BATCH_SIZE = 0                   # Patch sampling batch size in CharLevelDecoder.forward; 0 = use all patches\n",
|
| 47 |
+
"\n",
|
| 48 |
+
"device = torch.device(\"cuda\")"
|
| 49 |
+
]
|
| 50 |
+
},
|
| 51 |
+
{
|
| 52 |
+
"cell_type": "code",
|
| 53 |
+
"execution_count": null,
|
| 54 |
+
"id": "fb70eb19-8b9c-4864-b711-7a0395b42c49",
|
| 55 |
+
"metadata": {},
|
| 56 |
+
"outputs": [],
|
| 57 |
+
"source": [
|
| 58 |
+
"class Patchilizer:\n",
|
| 59 |
+
" def __init__(self, stream=PATCH_STREAM):\n",
|
| 60 |
+
" self.stream = stream\n",
|
| 61 |
+
" self.delimiters = [\"|:\", \"::\", \":|\", \"[|\", \"||\", \"|]\", \"|\"]\n",
|
| 62 |
+
" self.regexPattern = '(' + '|'.join(map(re.escape, self.delimiters)) + ')'\n",
|
| 63 |
+
" self.bos_token_id = 1\n",
|
| 64 |
+
" self.eos_token_id = 2\n",
|
| 65 |
+
" self.special_token_id = 0\n",
|
| 66 |
+
"\n",
|
| 67 |
+
" def split_bars(self, body_lines):\n",
|
| 68 |
+
" \"\"\"\n",
|
| 69 |
+
" Split a body of music into individual bars.\n",
|
| 70 |
+
" \"\"\"\n",
|
| 71 |
+
" new_bars = []\n",
|
| 72 |
+
" try:\n",
|
| 73 |
+
" for line in body_lines:\n",
|
| 74 |
+
" line_bars = re.split(self.regexPattern, line)\n",
|
| 75 |
+
" line_bars = list(filter(None, line_bars))\n",
|
| 76 |
+
" new_line_bars = []\n",
|
| 77 |
+
"\n",
|
| 78 |
+
" if len(line_bars) == 1:\n",
|
| 79 |
+
" new_line_bars = line_bars\n",
|
| 80 |
+
" else:\n",
|
| 81 |
+
" if line_bars[0] in self.delimiters:\n",
|
| 82 |
+
" new_line_bars = [line_bars[i] + line_bars[i + 1] for i in range(0, len(line_bars), 2)]\n",
|
| 83 |
+
" else:\n",
|
| 84 |
+
" new_line_bars = [line_bars[0]] + [line_bars[i] + line_bars[i + 1] for i in range(1, len(line_bars), 2)]\n",
|
| 85 |
+
" if 'V' not in new_line_bars[-1]:\n",
|
| 86 |
+
"                    new_line_bars[-2] += new_line_bars[-1]  # absorb the trailing barline+'\\n' fragment into the preceding bar\n",
|
| 87 |
+
" new_line_bars = new_line_bars[:-1]\n",
|
| 88 |
+
" new_bars += new_line_bars\n",
|
| 89 |
+
" except:\n",
|
| 90 |
+
" pass\n",
|
| 91 |
+
"\n",
|
| 92 |
+
" return new_bars\n",
|
| 93 |
+
"\n",
|
| 94 |
+
" def split_patches(self, abc_text, patch_size=PATCH_SIZE, generate_last=False):\n",
|
| 95 |
+
" if not generate_last and len(abc_text) % patch_size != 0:\n",
|
| 96 |
+
" abc_text += chr(self.eos_token_id)\n",
|
| 97 |
+
" patches = [abc_text[i : i + patch_size] for i in range(0, len(abc_text), patch_size)]\n",
|
| 98 |
+
" return patches\n",
|
| 99 |
+
"\n",
|
| 100 |
+
" def patch2chars(self, patch):\n",
|
| 101 |
+
" \"\"\"\n",
|
| 102 |
+
" Convert a patch into a bar.\n",
|
| 103 |
+
" \"\"\"\n",
|
| 104 |
+
" bytes = ''\n",
|
| 105 |
+
" for idx in patch:\n",
|
| 106 |
+
" if idx == self.eos_token_id:\n",
|
| 107 |
+
" break\n",
|
| 108 |
+
" if idx < self.eos_token_id:\n",
|
| 109 |
+
" pass\n",
|
| 110 |
+
" bytes += chr(idx)\n",
|
| 111 |
+
" return bytes\n",
|
| 112 |
+
" \n",
|
| 113 |
+
"\n",
|
| 114 |
+
" def patchilize_metadata(self, metadata_lines):\n",
|
| 115 |
+
"\n",
|
| 116 |
+
" metadata_patches = []\n",
|
| 117 |
+
" for line in metadata_lines:\n",
|
| 118 |
+
" metadata_patches += self.split_patches(line)\n",
|
| 119 |
+
"\n",
|
| 120 |
+
" return metadata_patches\n",
|
| 121 |
+
" \n",
|
| 122 |
+
" def patchilize_tunebody(self, tunebody_lines, encode_mode='train'):\n",
|
| 123 |
+
"\n",
|
| 124 |
+
" tunebody_patches = []\n",
|
| 125 |
+
" bars = self.split_bars(tunebody_lines)\n",
|
| 126 |
+
" if encode_mode == 'train':\n",
|
| 127 |
+
" for bar in bars:\n",
|
| 128 |
+
" tunebody_patches += self.split_patches(bar)\n",
|
| 129 |
+
" elif encode_mode == 'generate':\n",
|
| 130 |
+
" for bar in bars[:-1]:\n",
|
| 131 |
+
" tunebody_patches += self.split_patches(bar)\n",
|
| 132 |
+
" tunebody_patches += self.split_patches(bars[-1], generate_last=True)\n",
|
| 133 |
+
" \n",
|
| 134 |
+
" return tunebody_patches\n",
|
| 135 |
+
"\n",
|
| 136 |
+
" def encode_train(self, abc_text, patch_length=PATCH_LENGTH, patch_size=PATCH_SIZE, add_special_patches=True, cut=True):\n",
|
| 137 |
+
"\n",
|
| 138 |
+
" lines = abc_text.split('\\n')\n",
|
| 139 |
+
" lines = list(filter(None, lines))\n",
|
| 140 |
+
" lines = [line + '\\n' for line in lines]\n",
|
| 141 |
+
"\n",
|
| 142 |
+
" tunebody_index = -1\n",
|
| 143 |
+
" for i, line in enumerate(lines):\n",
|
| 144 |
+
" if '[V:' in line:\n",
|
| 145 |
+
" tunebody_index = i\n",
|
| 146 |
+
" break\n",
|
| 147 |
+
"\n",
|
| 148 |
+
" metadata_lines = lines[ : tunebody_index]\n",
|
| 149 |
+
" tunebody_lines = lines[tunebody_index : ]\n",
|
| 150 |
+
"\n",
|
| 151 |
+
" if self.stream:\n",
|
| 152 |
+
" tunebody_lines = ['[r:' + str(line_index) + '/' + str(len(tunebody_lines) - line_index - 1) + ']' + line for line_index, line in\n",
|
| 153 |
+
" enumerate(tunebody_lines)] \n",
|
| 154 |
+
"\n",
|
| 155 |
+
" metadata_patches = self.patchilize_metadata(metadata_lines)\n",
|
| 156 |
+
" tunebody_patches = self.patchilize_tunebody(tunebody_lines, encode_mode='train')\n",
|
| 157 |
+
"\n",
|
| 158 |
+
" if add_special_patches:\n",
|
| 159 |
+
" bos_patch = chr(self.bos_token_id) * (patch_size - 1) + chr(self.eos_token_id)\n",
|
| 160 |
+
" eos_patch = chr(self.bos_token_id) + chr(self.eos_token_id) * (patch_size - 1)\n",
|
| 161 |
+
"\n",
|
| 162 |
+
" metadata_patches = [bos_patch] + metadata_patches\n",
|
| 163 |
+
" tunebody_patches = tunebody_patches + [eos_patch]\n",
|
| 164 |
+
"\n",
|
| 165 |
+
" if self.stream:\n",
|
| 166 |
+
" if len(metadata_patches) + len(tunebody_patches) > patch_length:\n",
|
| 167 |
+
" available_cut_indexes = [0] + [index + 1 for index, patch in enumerate(tunebody_patches) if '\\n' in patch]\n",
|
| 168 |
+
" line_index_for_cut_index = list(range(len(available_cut_indexes))) \n",
|
| 169 |
+
" end_index = len(metadata_patches) + len(tunebody_patches) - patch_length\n",
|
| 170 |
+
" biggest_index = bisect.bisect_left(available_cut_indexes, end_index) \n",
|
| 171 |
+
" available_cut_indexes = available_cut_indexes[:biggest_index + 1]\n",
|
| 172 |
+
"\n",
|
| 173 |
+
" if len(available_cut_indexes) == 1:\n",
|
| 174 |
+
" choices = ['head']\n",
|
| 175 |
+
" elif len(available_cut_indexes) == 2:\n",
|
| 176 |
+
" choices = ['head', 'tail']\n",
|
| 177 |
+
" else:\n",
|
| 178 |
+
" choices = ['head', 'tail', 'middle']\n",
|
| 179 |
+
" choice = random.choice(choices)\n",
|
| 180 |
+
" if choice == 'head':\n",
|
| 181 |
+
" patches = metadata_patches + tunebody_patches[0:]\n",
|
| 182 |
+
" else:\n",
|
| 183 |
+
" if choice == 'tail':\n",
|
| 184 |
+
" cut_index = len(available_cut_indexes) - 1\n",
|
| 185 |
+
" else:\n",
|
| 186 |
+
" cut_index = random.choice(range(1, len(available_cut_indexes) - 1))\n",
|
| 187 |
+
"\n",
|
| 188 |
+
" line_index = line_index_for_cut_index[cut_index] \n",
|
| 189 |
+
" stream_tunebody_lines = tunebody_lines[line_index : ]\n",
|
| 190 |
+
" \n",
|
| 191 |
+
" stream_tunebody_patches = self.patchilize_tunebody(stream_tunebody_lines, encode_mode='train')\n",
|
| 192 |
+
" if add_special_patches:\n",
|
| 193 |
+
" stream_tunebody_patches = stream_tunebody_patches + [eos_patch]\n",
|
| 194 |
+
" patches = metadata_patches + stream_tunebody_patches\n",
|
| 195 |
+
" else:\n",
|
| 196 |
+
" patches = metadata_patches + tunebody_patches\n",
|
| 197 |
+
" else:\n",
|
| 198 |
+
" patches = metadata_patches + tunebody_patches\n",
|
| 199 |
+
"\n",
|
| 200 |
+
" if cut: \n",
|
| 201 |
+
" patches = patches[ : patch_length]\n",
|
| 202 |
+
" else: \n",
|
| 203 |
+
" pass\n",
|
| 204 |
+
"\n",
|
| 205 |
+
" # encode to ids\n",
|
| 206 |
+
" id_patches = []\n",
|
| 207 |
+
" for patch in patches:\n",
|
| 208 |
+
" id_patch = [ord(c) for c in patch] + [self.special_token_id] * (patch_size - len(patch))\n",
|
| 209 |
+
" id_patches.append(id_patch)\n",
|
| 210 |
+
"\n",
|
| 211 |
+
" return id_patches\n",
|
| 212 |
+
"\n",
|
| 213 |
+
" def encode_generate(self, abc_code, patch_length=PATCH_LENGTH, patch_size=PATCH_SIZE, add_special_patches=True):\n",
|
| 214 |
+
"\n",
|
| 215 |
+
" lines = abc_code.split('\\n')\n",
|
| 216 |
+
" lines = list(filter(None, lines))\n",
|
| 217 |
+
" \n",
|
| 218 |
+
" tunebody_index = None\n",
|
| 219 |
+
" for i, line in enumerate(lines):\n",
|
| 220 |
+
" if line.startswith('[V:') or line.startswith('[r:'):\n",
|
| 221 |
+
" tunebody_index = i\n",
|
| 222 |
+
" break\n",
|
| 223 |
+
" \n",
|
| 224 |
+
" metadata_lines = lines[ : tunebody_index]\n",
|
| 225 |
+
" tunebody_lines = lines[tunebody_index : ] \n",
|
| 226 |
+
" \n",
|
| 227 |
+
" metadata_lines = [line + '\\n' for line in metadata_lines]\n",
|
| 228 |
+
" if self.stream:\n",
|
| 229 |
+
" if not abc_code.endswith('\\n'):\n",
|
| 230 |
+
" tunebody_lines = [tunebody_lines[i] + '\\n' for i in range(len(tunebody_lines) - 1)] + [tunebody_lines[-1]]\n",
|
| 231 |
+
" else:\n",
|
| 232 |
+
" tunebody_lines = [tunebody_lines[i] + '\\n' for i in range(len(tunebody_lines))]\n",
|
| 233 |
+
" else:\n",
|
| 234 |
+
" tunebody_lines = [line + '\\n' for line in tunebody_lines]\n",
|
| 235 |
+
" \n",
|
| 236 |
+
" metadata_patches = self.patchilize_metadata(metadata_lines)\n",
|
| 237 |
+
" tunebody_patches = self.patchilize_tunebody(tunebody_lines, encode_mode='generate')\n",
|
| 238 |
+
" \n",
|
| 239 |
+
" if add_special_patches:\n",
|
| 240 |
+
" bos_patch = chr(self.bos_token_id) * (patch_size - 1) + chr(self.eos_token_id)\n",
|
| 241 |
+
"\n",
|
| 242 |
+
" metadata_patches = [bos_patch] + metadata_patches\n",
|
| 243 |
+
" \n",
|
| 244 |
+
" patches = metadata_patches + tunebody_patches\n",
|
| 245 |
+
" patches = patches[ : patch_length]\n",
|
| 246 |
+
"\n",
|
| 247 |
+
" # encode to ids\n",
|
| 248 |
+
" id_patches = []\n",
|
| 249 |
+
" for patch in patches:\n",
|
| 250 |
+
" if len(patch) < PATCH_SIZE and patch[-1] != chr(self.eos_token_id):\n",
|
| 251 |
+
" id_patch = [ord(c) for c in patch]\n",
|
| 252 |
+
" else:\n",
|
| 253 |
+
" id_patch = [ord(c) for c in patch] + [self.special_token_id] * (patch_size - len(patch))\n",
|
| 254 |
+
" id_patches.append(id_patch)\n",
|
| 255 |
+
" \n",
|
| 256 |
+
" return id_patches\n",
|
| 257 |
+
"\n",
|
| 258 |
+
" def decode(self, patches):\n",
|
| 259 |
+
" \"\"\"\n",
|
| 260 |
+
" Decode patches into music.\n",
|
| 261 |
+
" \"\"\"\n",
|
| 262 |
+
" return ''.join(self.patch2chars(patch) for patch in patches)\n",
|
| 263 |
+
"\n",
|
| 264 |
+
"\n",
|
| 265 |
+
"class PatchLevelDecoder(PreTrainedModel):\n",
|
| 266 |
+
" \"\"\"\n",
|
| 267 |
+
" A Patch-level Decoder model for generating patch features in an auto-regressive manner. \n",
|
| 268 |
+
" It inherits PreTrainedModel from transformers.\n",
|
| 269 |
+
" \"\"\"\n",
|
| 270 |
+
" def __init__(self, config):\n",
|
| 271 |
+
" super().__init__(config)\n",
|
| 272 |
+
" self.patch_embedding = torch.nn.Linear(PATCH_SIZE * 128, config.n_embd)\n",
|
| 273 |
+
" torch.nn.init.normal_(self.patch_embedding.weight, std=0.02)\n",
|
| 274 |
+
" self.base = GPT2Model(config)\n",
|
| 275 |
+
"\n",
|
| 276 |
+
" def forward(self,\n",
|
| 277 |
+
" patches: torch.Tensor,\n",
|
| 278 |
+
" masks=None) -> torch.Tensor:\n",
|
| 279 |
+
" \"\"\"\n",
|
| 280 |
+
" The forward pass of the patch-level decoder model.\n",
|
| 281 |
+
" :param patches: the patches to be encoded\n",
|
| 282 |
+
" :param masks: the masks for the patches\n",
|
| 283 |
+
" :return: the encoded patches\n",
|
| 284 |
+
" \"\"\"\n",
|
| 285 |
+
" patches = torch.nn.functional.one_hot(patches, num_classes=128).to(self.dtype)\n",
|
| 286 |
+
" patches = patches.reshape(len(patches), -1, PATCH_SIZE * (128))\n",
|
| 287 |
+
" patches = self.patch_embedding(patches.to(self.device))\n",
|
| 288 |
+
"\n",
|
| 289 |
+
" if masks==None:\n",
|
| 290 |
+
" return self.base(inputs_embeds=patches)\n",
|
| 291 |
+
" else:\n",
|
| 292 |
+
" return self.base(inputs_embeds=patches,\n",
|
| 293 |
+
" attention_mask=masks)\n",
|
| 294 |
+
"\n",
|
| 295 |
+
"\n",
|
| 296 |
+
"class CharLevelDecoder(PreTrainedModel):\n",
|
| 297 |
+
" \"\"\"\n",
|
| 298 |
+
" A Char-level Decoder model for generating the chars within each patch in an auto-regressive manner\n",
|
| 299 |
+
" based on the encoded patch features. It inherits PreTrainedModel from transformers.\n",
|
| 300 |
+
" \"\"\"\n",
|
| 301 |
+
" def __init__(self, config):\n",
|
| 302 |
+
" super().__init__(config)\n",
|
| 303 |
+
" self.special_token_id = 0\n",
|
| 304 |
+
" self.bos_token_id = 1\n",
|
| 305 |
+
"\n",
|
| 306 |
+
" self.base = GPT2LMHeadModel(config)\n",
|
| 307 |
+
"\n",
|
| 308 |
+
" def forward(self,\n",
|
| 309 |
+
" encoded_patches: torch.Tensor,\n",
|
| 310 |
+
" target_patches: torch.Tensor):\n",
|
| 311 |
+
" \"\"\"\n",
|
| 312 |
+
" The forward pass of the char-level decoder model.\n",
|
| 313 |
+
" :param encoded_patches: the encoded patches\n",
|
| 314 |
+
" :param target_patches: the target patches\n",
|
| 315 |
+
" :return: the output of the model\n",
|
| 316 |
+
" \"\"\"\n",
|
| 317 |
+
" # preparing the labels for model training\n",
|
| 318 |
+
" target_patches = torch.cat((torch.ones_like(target_patches[:,0:1])*self.bos_token_id, target_patches), dim=1)\n",
|
| 319 |
+
" # print('target_patches shape:', target_patches.shape)\n",
|
| 320 |
+
"\n",
|
| 321 |
+
" target_masks = target_patches == self.special_token_id\n",
|
| 322 |
+
" labels = target_patches.clone().masked_fill_(target_masks, -100)\n",
|
| 323 |
+
"\n",
|
| 324 |
+
" # masking the labels for model training\n",
|
| 325 |
+
" target_masks = torch.ones_like(labels)\n",
|
| 326 |
+
" target_masks = target_masks.masked_fill_(labels == -100, 0)\n",
|
| 327 |
+
"\n",
|
| 328 |
+
" # select patches\n",
|
| 329 |
+
" if PATCH_SAMPLING_BATCH_SIZE!=0 and PATCH_SAMPLING_BATCH_SIZE<target_patches.shape[0]:\n",
|
| 330 |
+
" indices = list(range(len(target_patches)))\n",
|
| 331 |
+
" random.shuffle(indices)\n",
|
| 332 |
+
" selected_indices = sorted(indices[:PATCH_SAMPLING_BATCH_SIZE])\n",
|
| 333 |
+
"\n",
|
| 334 |
+
" target_patches = target_patches[selected_indices,:]\n",
|
| 335 |
+
" target_masks = target_masks[selected_indices,:]\n",
|
| 336 |
+
" encoded_patches = encoded_patches[selected_indices,:]\n",
|
| 337 |
+
"\n",
|
| 338 |
+
" # get input embeddings\n",
|
| 339 |
+
" inputs_embeds = torch.nn.functional.embedding(target_patches, self.base.transformer.wte.weight)\n",
|
| 340 |
+
"\n",
|
| 341 |
+
" # concatenate the encoded patches with the input embeddings\n",
|
| 342 |
+
" inputs_embeds = torch.cat((encoded_patches.unsqueeze(1), inputs_embeds[:,1:,:]), dim=1)\n",
|
| 343 |
+
"\n",
|
| 344 |
+
" output = self.base(inputs_embeds=inputs_embeds, \n",
|
| 345 |
+
" attention_mask=target_masks,\n",
|
| 346 |
+
" labels=labels)\n",
|
| 347 |
+
"                           # output_hidden_states=True)\n",
|
| 348 |
+
"\n",
|
| 349 |
+
" return output\n",
|
| 350 |
+
"\n",
|
| 351 |
+
" def generate(self,\n",
|
| 352 |
+
" encoded_patch: torch.Tensor, # [hidden_size]\n",
|
| 353 |
+
" tokens: torch.Tensor): # [1]\n",
|
| 354 |
+
" \"\"\"\n",
|
| 355 |
+
" The generate function for generating a patch based on the encoded patch and already generated tokens.\n",
|
| 356 |
+
" :param encoded_patch: the encoded patch\n",
|
| 357 |
+
" :param tokens: already generated tokens in the patch\n",
|
| 358 |
+
" :return: the probability distribution of next token\n",
|
| 359 |
+
" \"\"\"\n",
|
| 360 |
+
" encoded_patch = encoded_patch.reshape(1, 1, -1) # [1, 1, hidden_size]\n",
|
| 361 |
+
" tokens = tokens.reshape(1, -1)\n",
|
| 362 |
+
"\n",
|
| 363 |
+
" # Get input embeddings\n",
|
| 364 |
+
" tokens = torch.nn.functional.embedding(tokens, self.base.transformer.wte.weight)\n",
|
| 365 |
+
"\n",
|
| 366 |
+
" # Concatenate the encoded patch with the input embeddings\n",
|
| 367 |
+
" tokens = torch.cat((encoded_patch, tokens[:,1:,:]), dim=1)\n",
|
| 368 |
+
" \n",
|
| 369 |
+
" # Get output from model\n",
|
| 370 |
+
" outputs = self.base(inputs_embeds=tokens)\n",
|
| 371 |
+
" \n",
|
| 372 |
+
" # Get probabilities of next token\n",
|
| 373 |
+
" probs = torch.nn.functional.softmax(outputs.logits.squeeze(0)[-1], dim=-1)\n",
|
| 374 |
+
"\n",
|
| 375 |
+
" return probs\n",
|
| 376 |
+
"\n",
|
| 377 |
+
"class NotaGenLMHeadModel(PreTrainedModel):\n",
|
| 378 |
+
" \"\"\"\n",
|
| 379 |
+
" NotaGen is a language model with a hierarchical structure.\n",
|
| 380 |
+
" It includes a patch-level decoder and a char-level decoder.\n",
|
| 381 |
+
" The patch-level decoder is used to generate patch features in an auto-regressive manner.\n",
|
| 382 |
+
" The char-level decoder is used to generate the chars within each patch in an auto-regressive manner.\n",
|
| 383 |
+
" It inherits PreTrainedModel from transformers.\n",
|
| 384 |
+
" \"\"\"\n",
|
| 385 |
+
" def __init__(self, encoder_config, decoder_config):\n",
|
| 386 |
+
" super().__init__(encoder_config)\n",
|
| 387 |
+
" self.special_token_id = 0\n",
|
| 388 |
+
" self.bos_token_id = 1\n",
|
| 389 |
+
" self.eos_token_id = 2\n",
|
| 390 |
+
" self.patch_level_decoder = PatchLevelDecoder(encoder_config)\n",
|
| 391 |
+
" self.char_level_decoder = CharLevelDecoder(decoder_config)\n",
|
| 392 |
+
"\n",
|
| 393 |
+
" def forward(self,\n",
|
| 394 |
+
" patches: torch.Tensor,\n",
|
| 395 |
+
" masks: torch.Tensor):\n",
|
| 396 |
+
" \"\"\"\n",
|
| 397 |
+
" The forward pass of the bGPT model.\n",
|
| 398 |
+
" :param patches: the patches to be encoded\n",
|
| 399 |
+
" :param masks: the masks for the patches\n",
|
| 400 |
+
" :return: the decoded patches\n",
|
| 401 |
+
" \"\"\"\n",
|
| 402 |
+
" patches = patches.reshape(len(patches), -1, PATCH_SIZE)\n",
|
| 403 |
+
" encoded_patches = self.patch_level_decoder(patches, masks)[\"last_hidden_state\"]\n",
|
| 404 |
+
" \n",
|
| 405 |
+
" left_shift_masks = masks * (masks.flip(1).cumsum(1).flip(1) > 1)\n",
|
| 406 |
+
" masks[:, 0] = 0\n",
|
| 407 |
+
" \n",
|
| 408 |
+
" encoded_patches = encoded_patches[left_shift_masks == 1]\n",
|
| 409 |
+
" patches = patches[masks == 1] \n",
|
| 410 |
+
"\n",
|
| 411 |
+
" return self.char_level_decoder(encoded_patches, patches)\n",
|
| 412 |
+
" \n",
|
| 413 |
+
" def generate(self,\n",
|
| 414 |
+
" patches: torch.Tensor,\n",
|
| 415 |
+
" top_k=0,\n",
|
| 416 |
+
" top_p=1,\n",
|
| 417 |
+
" temperature=1.0):\n",
|
| 418 |
+
" \"\"\"\n",
|
| 419 |
+
" The generate function for generating patches based on patches.\n",
|
| 420 |
+
" :param patches: the patches to be encoded\n",
|
| 421 |
+
" :param top_k: the top k for sampling\n",
|
| 422 |
+
" :param top_p: the top p for sampling\n",
|
| 423 |
+
" :param temperature: the temperature for sampling\n",
|
| 424 |
+
" :return: the generated patches\n",
|
| 425 |
+
" \"\"\"\n",
|
| 426 |
+
" if patches.shape[-1] % PATCH_SIZE != 0:\n",
|
| 427 |
+
" tokens = patches[:,:,-(patches.shape[-1]%PATCH_SIZE):].squeeze(0, 1)\n",
|
| 428 |
+
" tokens = torch.cat((torch.tensor([self.bos_token_id], device=self.device), tokens), dim=-1)\n",
|
| 429 |
+
" patches = patches[:,:,:-(patches.shape[-1]%PATCH_SIZE)]\n",
|
| 430 |
+
" else:\n",
|
| 431 |
+
" tokens = torch.tensor([self.bos_token_id], device=self.device)\n",
|
| 432 |
+
"\n",
|
| 433 |
+
" patches = patches.reshape(len(patches), -1, PATCH_SIZE) # [bs, seq, patch_size]\n",
|
| 434 |
+
" encoded_patches = self.patch_level_decoder(patches)[\"last_hidden_state\"] # [bs, seq, hidden_size]\n",
|
| 435 |
+
" generated_patch = [] \n",
|
| 436 |
+
"\n",
|
| 437 |
+
" while True:\n",
|
| 438 |
+
" prob = self.char_level_decoder.generate(encoded_patches[0][-1], tokens).cpu().detach().numpy() # [128]\n",
|
| 439 |
+
" prob = top_k_sampling(prob, top_k=top_k, return_probs=True) # [128]\n",
|
| 440 |
+
" prob = top_p_sampling(prob, top_p=top_p, return_probs=True) # [128]\n",
|
| 441 |
+
" token = temperature_sampling(prob, temperature=temperature) # int\n",
|
| 442 |
+
" char = chr(token)\n",
|
| 443 |
+
" generated_patch.append(token)\n",
|
| 444 |
+
"\n",
|
| 445 |
+
" if len(tokens) >= PATCH_SIZE:# or token == self.eos_token_id:\n",
|
| 446 |
+
" break\n",
|
| 447 |
+
" else:\n",
|
| 448 |
+
" tokens = torch.cat((tokens, torch.tensor([token], device=self.device)), dim=0)\n",
|
| 449 |
+
" \n",
|
| 450 |
+
" return generated_patch\n",
|
| 451 |
+
"\n",
|
| 452 |
+
"def clean_to_abc(raw_text, unreduce=True, output_path='output.abc'):\n",
|
| 453 |
+
" # Remove [r:x/y] tags\n",
|
| 454 |
+
" cleaned = re.sub(r'\\[r:\\d+/\\d+\\]', '', raw_text)\n",
|
| 455 |
+
"\n",
|
| 456 |
+
" # Add required ABC headers\n",
|
| 457 |
+
" lines = cleaned.strip().splitlines()\n",
|
| 458 |
+
" header_inserted = False\n",
|
| 459 |
+
" abc_lines = []\n",
|
| 460 |
+
" for line in lines:\n",
|
| 461 |
+
" if not header_inserted and line.startswith('%%score'):\n",
|
| 462 |
+
" abc_lines.insert(0, 'T:Generated\\n')\n",
|
| 463 |
+
" abc_lines.insert(0, 'X:1\\n')\n",
|
| 464 |
+
" header_inserted = True\n",
|
| 465 |
+
" abc_lines.append(line if line.endswith('\\n') else line + '\\n')\n",
|
| 466 |
+
"\n",
|
| 467 |
+
" # Optional: fill missing rests\n",
|
| 468 |
+
" if unreduce:\n",
|
| 469 |
+
" try:\n",
|
| 470 |
+
" abc_lines = rest_unreduce(abc_lines)\n",
|
| 471 |
+
" except Exception as e:\n",
|
| 472 |
+
" print(\"Unreduce failed:\", e)\n",
|
| 473 |
+
"\n",
|
| 474 |
+
" # Save to .abc file\n",
|
| 475 |
+
" Path(output_path).write_text(''.join(abc_lines), encoding='utf-8')\n",
|
| 476 |
+
" print(f\"Saved cleaned ABC to {output_path}\")\n",
|
| 477 |
+
" return output_path"
|
| 478 |
+
]
|
| 479 |
+
},
|
| 480 |
+
{
|
| 481 |
+
"cell_type": "code",
|
| 482 |
+
"execution_count": null,
|
| 483 |
+
"id": "6d126533-a9a1-48a5-9b1b-be6da37a55ad",
|
| 484 |
+
"metadata": {},
|
| 485 |
+
"outputs": [],
|
| 486 |
+
"source": [
|
| 487 |
+
"Note_list = Note_list + ['z', 'x']\n",
|
| 488 |
+
"\n",
|
| 489 |
+
"patchilizer = Patchilizer()\n",
|
| 490 |
+
"\n",
|
| 491 |
+
"patch_config = GPT2Config(num_hidden_layers=PATCH_NUM_LAYERS,\n",
|
| 492 |
+
" max_length=PATCH_LENGTH,\n",
|
| 493 |
+
" max_position_embeddings=PATCH_LENGTH,\n",
|
| 494 |
+
" n_embd=HIDDEN_SIZE,\n",
|
| 495 |
+
" num_attention_heads=HIDDEN_SIZE // 64,\n",
|
| 496 |
+
" vocab_size=1)\n",
|
| 497 |
+
"byte_config = GPT2Config(num_hidden_layers=CHAR_NUM_LAYERS,\n",
|
| 498 |
+
" max_length=PATCH_SIZE + 1,\n",
|
| 499 |
+
" max_position_embeddings=PATCH_SIZE + 1,\n",
|
| 500 |
+
" hidden_size=HIDDEN_SIZE,\n",
|
| 501 |
+
" num_attention_heads=HIDDEN_SIZE // 64,\n",
|
| 502 |
+
" vocab_size=128)\n",
|
| 503 |
+
"\n",
|
| 504 |
+
"model = NotaGenLMHeadModel(encoder_config=patch_config, decoder_config=byte_config).to(device)\n",
|
| 505 |
+
"\n",
|
| 506 |
+
"def prepare_model_for_kbit_training(model, use_gradient_checkpointing=True):\n",
|
| 507 |
+
" \"\"\"\n",
|
| 508 |
+
" Prepare model for k-bit training.\n",
|
| 509 |
+
" Features include:\n",
|
| 510 |
+
" 1. Convert model to mixed precision (FP16).\n",
|
| 511 |
+
" 2. Disable unnecessary gradient computations.\n",
|
| 512 |
+
" 3. Enable gradient checkpointing (optional).\n",
|
| 513 |
+
" \"\"\"\n",
|
| 514 |
+
" # Convert model to mixed precision\n",
|
| 515 |
+
" model = model.to(dtype=torch.float16)\n",
|
| 516 |
+
"\n",
|
| 517 |
+
"    # Freeze any parameters still stored in float32 (no-op after the fp16 cast above)\n",
|
| 518 |
+
" for param in model.parameters():\n",
|
| 519 |
+
" if param.dtype == torch.float32:\n",
|
| 520 |
+
" param.requires_grad = False\n",
|
| 521 |
+
"\n",
|
| 522 |
+
" # Enable gradient checkpointing\n",
|
| 523 |
+
" if use_gradient_checkpointing:\n",
|
| 524 |
+
" model.gradient_checkpointing_enable()\n",
|
| 525 |
+
"\n",
|
| 526 |
+
" return model\n",
|
| 527 |
+
"\n",
|
| 528 |
+
"\n",
|
| 529 |
+
"model = prepare_model_for_kbit_training(\n",
|
| 530 |
+
" model,\n",
|
| 531 |
+
" use_gradient_checkpointing=False \n",
|
| 532 |
+
")\n",
|
| 533 |
+
"\n",
|
| 534 |
+
"print(\"Parameter Number: \" + str(sum(p.numel() for p in model.parameters() if p.requires_grad)))\n",
|
| 535 |
+
"\n",
|
| 536 |
+
"checkpoint = torch.load(INFERENCE_WEIGHTS_PATH, map_location=torch.device(device))\n",
|
| 537 |
+
"model.load_state_dict(checkpoint['model'])\n",
|
| 538 |
+
"model = model.to(device)\n",
|
| 539 |
+
"model.eval()\n",
|
| 540 |
+
"\n",
|
| 541 |
+
"def complete_brackets(s):\n",
|
| 542 |
+
" stack = []\n",
|
| 543 |
+
" bracket_map = {'{': '}', '[': ']', '(': ')'}\n",
|
| 544 |
+
" \n",
|
| 545 |
+
" # Iterate through each character, handle bracket matching\n",
|
| 546 |
+
" for char in s:\n",
|
| 547 |
+
" if char in bracket_map:\n",
|
| 548 |
+
" stack.append(char)\n",
|
| 549 |
+
" elif char in bracket_map.values():\n",
|
| 550 |
+
" # Find the corresponding left bracket\n",
|
| 551 |
+
" for key, value in bracket_map.items():\n",
|
| 552 |
+
" if value == char:\n",
|
| 553 |
+
" if stack and stack[-1] == key:\n",
|
| 554 |
+
" stack.pop()\n",
|
| 555 |
+
" break # Found matching right bracket, process next character\n",
|
| 556 |
+
" \n",
|
| 557 |
+
" # Complete missing right brackets (in reverse order of remaining left brackets in stack)\n",
|
| 558 |
+
" completion = ''.join(bracket_map[c] for c in reversed(stack))\n",
|
| 559 |
+
" return s + completion\n",
|
| 560 |
+
"\n",
|
| 561 |
+
"\n",
|
| 562 |
+
"def rest_unreduce(abc_lines):\n",
|
| 563 |
+
"\n",
|
| 564 |
+
" tunebody_index = None\n",
|
| 565 |
+
" for i in range(len(abc_lines)):\n",
|
| 566 |
+
" if abc_lines[i].startswith('%%score'):\n",
|
| 567 |
+
" abc_lines[i] = complete_brackets(abc_lines[i])\n",
|
| 568 |
+
" if '[V:' in abc_lines[i]:\n",
|
| 569 |
+
" tunebody_index = i\n",
|
| 570 |
+
" break\n",
|
| 571 |
+
"\n",
|
| 572 |
+
" metadata_lines = abc_lines[: tunebody_index]\n",
|
| 573 |
+
" tunebody_lines = abc_lines[tunebody_index:]\n",
|
| 574 |
+
"\n",
|
| 575 |
+
" part_symbol_list = []\n",
|
| 576 |
+
" voice_group_list = []\n",
|
| 577 |
+
" for line in metadata_lines:\n",
|
| 578 |
+
" if line.startswith('%%score'):\n",
|
| 579 |
+
" for round_bracket_match in re.findall(r'\\((.*?)\\)', line):\n",
|
| 580 |
+
" voice_group_list.append(round_bracket_match.split())\n",
|
| 581 |
+
" existed_voices = [item for sublist in voice_group_list for item in sublist]\n",
|
| 582 |
+
" if line.startswith('V:'):\n",
|
| 583 |
+
" symbol = line.split()[0]\n",
|
| 584 |
+
" part_symbol_list.append(symbol)\n",
|
| 585 |
+
" if symbol[2:] not in existed_voices:\n",
|
| 586 |
+
" voice_group_list.append([symbol[2:]])\n",
|
| 587 |
+
" z_symbol_list = [] # voices that use z as rest\n",
|
| 588 |
+
" x_symbol_list = [] # voices that use x as rest\n",
|
| 589 |
+
" for voice_group in voice_group_list:\n",
|
| 590 |
+
" z_symbol_list.append('V:' + voice_group[0])\n",
|
| 591 |
+
" for j in range(1, len(voice_group)):\n",
|
| 592 |
+
" x_symbol_list.append('V:' + voice_group[j])\n",
|
| 593 |
+
"\n",
|
| 594 |
+
" part_symbol_list.sort(key=lambda x: int(x[2:]))\n",
|
| 595 |
+
"\n",
|
| 596 |
+
" unreduced_tunebody_lines = []\n",
|
| 597 |
+
"\n",
|
| 598 |
+
" for i, line in enumerate(tunebody_lines):\n",
|
| 599 |
+
" unreduced_line = ''\n",
|
| 600 |
+
"\n",
|
| 601 |
+
" line = re.sub(r'^\\[r:[^\\]]*\\]', '', line)\n",
|
| 602 |
+
"\n",
|
| 603 |
+
" pattern = r'\\[V:(\\d+)\\](.*?)(?=\\[V:|$)'\n",
|
| 604 |
+
" matches = re.findall(pattern, line)\n",
|
| 605 |
+
"\n",
|
| 606 |
+
" line_bar_dict = {}\n",
|
| 607 |
+
" for match in matches:\n",
|
| 608 |
+
" key = f'V:{match[0]}'\n",
|
| 609 |
+
" value = match[1]\n",
|
| 610 |
+
" line_bar_dict[key] = value\n",
|
| 611 |
+
"\n",
|
| 612 |
+
" # calculate duration and collect barline\n",
|
| 613 |
+
" dur_dict = {} \n",
|
| 614 |
+
" for symbol, bartext in line_bar_dict.items():\n",
|
| 615 |
+
" right_barline = ''.join(re.split(Barline_regexPattern, bartext)[-2:])\n",
|
| 616 |
+
" bartext = bartext[:-len(right_barline)]\n",
|
| 617 |
+
" try:\n",
|
| 618 |
+
" bar_dur = calculate_bartext_duration(bartext)\n",
|
| 619 |
+
" except:\n",
|
| 620 |
+
" bar_dur = None\n",
|
| 621 |
+
" if bar_dur is not None:\n",
|
| 622 |
+
" if bar_dur not in dur_dict.keys():\n",
|
| 623 |
+
" dur_dict[bar_dur] = 1\n",
|
| 624 |
+
" else:\n",
|
| 625 |
+
" dur_dict[bar_dur] += 1\n",
|
| 626 |
+
"\n",
|
| 627 |
+
" try:\n",
|
| 628 |
+
" ref_dur = max(dur_dict, key=dur_dict.get)\n",
|
| 629 |
+
" except:\n",
|
| 630 |
+
" pass # use last ref_dur\n",
|
| 631 |
+
"\n",
|
| 632 |
+
" if i == 0:\n",
|
| 633 |
+
" prefix_left_barline = line.split('[V:')[0]\n",
|
| 634 |
+
" else:\n",
|
| 635 |
+
" prefix_left_barline = ''\n",
|
| 636 |
+
"\n",
|
| 637 |
+
" for symbol in part_symbol_list:\n",
|
| 638 |
+
" if symbol in line_bar_dict.keys():\n",
|
| 639 |
+
" symbol_bartext = line_bar_dict[symbol]\n",
|
| 640 |
+
" else:\n",
|
| 641 |
+
" if symbol in z_symbol_list:\n",
|
| 642 |
+
" symbol_bartext = prefix_left_barline + 'z' + str(ref_dur) + right_barline\n",
|
| 643 |
+
" elif symbol in x_symbol_list:\n",
|
| 644 |
+
" symbol_bartext = prefix_left_barline + 'x' + str(ref_dur) + right_barline\n",
|
| 645 |
+
" unreduced_line += '[' + symbol + ']' + symbol_bartext\n",
|
| 646 |
+
"\n",
|
| 647 |
+
" unreduced_tunebody_lines.append(unreduced_line + '\\n')\n",
|
| 648 |
+
"\n",
|
| 649 |
+
" unreduced_lines = metadata_lines + unreduced_tunebody_lines\n",
|
| 650 |
+
"\n",
|
| 651 |
+
" return unreduced_lines\n",
|
| 652 |
+
"\n",
|
| 653 |
+
"\n",
|
| 654 |
+
"def inference_patch(period, composer, instrumentation):\n",
|
| 655 |
+
"\n",
|
| 656 |
+
" prompt_lines=[\n",
|
| 657 |
+
" '%' + period + '\\n',\n",
|
| 658 |
+
" '%' + composer + '\\n',\n",
|
| 659 |
+
" '%' + instrumentation + '\\n']\n",
|
| 660 |
+
"\n",
|
| 661 |
+
" while True:\n",
|
| 662 |
+
"\n",
|
| 663 |
+
" failure_flag = False\n",
|
| 664 |
+
"\n",
|
| 665 |
+
" bos_patch = [patchilizer.bos_token_id] * (PATCH_SIZE - 1) + [patchilizer.eos_token_id]\n",
|
| 666 |
+
"\n",
|
| 667 |
+
" start_time = time.time()\n",
|
| 668 |
+
"\n",
|
| 669 |
+
" prompt_patches = patchilizer.patchilize_metadata(prompt_lines)\n",
|
| 670 |
+
" byte_list = list(''.join(prompt_lines))\n",
|
| 671 |
+
" context_tunebody_byte_list = []\n",
|
| 672 |
+
" metadata_byte_list = []\n",
|
| 673 |
+
"\n",
|
| 674 |
+
" print(''.join(byte_list), end='')\n",
|
| 675 |
+
"\n",
|
| 676 |
+
" prompt_patches = [[ord(c) for c in patch] + [patchilizer.special_token_id] * (PATCH_SIZE - len(patch)) for patch\n",
|
| 677 |
+
" in prompt_patches]\n",
|
| 678 |
+
" prompt_patches.insert(0, bos_patch)\n",
|
| 679 |
+
"\n",
|
| 680 |
+
" input_patches = torch.tensor(prompt_patches, device=device).reshape(1, -1)\n",
|
| 681 |
+
"\n",
|
| 682 |
+
" end_flag = False\n",
|
| 683 |
+
" cut_index = None\n",
|
| 684 |
+
"\n",
|
| 685 |
+
" tunebody_flag = False\n",
|
| 686 |
+
"\n",
|
| 687 |
+
" with torch.inference_mode():\n",
|
| 688 |
+
" \n",
|
| 689 |
+
" while True:\n",
|
| 690 |
+
" with torch.autocast(device_type='cuda', dtype=torch.float16):\n",
|
| 691 |
+
" predicted_patch = model.generate(input_patches.unsqueeze(0),\n",
|
| 692 |
+
" top_k=TOP_K,\n",
|
| 693 |
+
" top_p=TOP_P,\n",
|
| 694 |
+
" temperature=TEMPERATURE)\n",
|
| 695 |
+
" if not tunebody_flag and patchilizer.decode([predicted_patch]).startswith('[r:'): # 初次进入tunebody,必须以[r:0/开头\n",
|
| 696 |
+
" tunebody_flag = True\n",
|
| 697 |
+
" r0_patch = torch.tensor([ord(c) for c in '[r:0/']).unsqueeze(0).to(device)\n",
|
| 698 |
+
" temp_input_patches = torch.concat([input_patches, r0_patch], axis=-1)\n",
|
| 699 |
+
" predicted_patch = model.generate(temp_input_patches.unsqueeze(0),\n",
|
| 700 |
+
" top_k=TOP_K,\n",
|
| 701 |
+
" top_p=TOP_P,\n",
|
| 702 |
+
" temperature=TEMPERATURE)\n",
|
| 703 |
+
" predicted_patch = [ord(c) for c in '[r:0/'] + predicted_patch\n",
|
| 704 |
+
" if predicted_patch[0] == patchilizer.bos_token_id and predicted_patch[1] == patchilizer.eos_token_id:\n",
|
| 705 |
+
" end_flag = True\n",
|
| 706 |
+
" break\n",
|
| 707 |
+
" next_patch = patchilizer.decode([predicted_patch])\n",
|
| 708 |
+
"\n",
|
| 709 |
+
" for char in next_patch:\n",
|
| 710 |
+
" byte_list.append(char)\n",
|
| 711 |
+
" if tunebody_flag:\n",
|
| 712 |
+
" context_tunebody_byte_list.append(char)\n",
|
| 713 |
+
" else:\n",
|
| 714 |
+
" metadata_byte_list.append(char)\n",
|
| 715 |
+
" print(char, end='')\n",
|
| 716 |
+
"\n",
|
| 717 |
+
" patch_end_flag = False\n",
|
| 718 |
+
" for j in range(len(predicted_patch)):\n",
|
| 719 |
+
" if patch_end_flag:\n",
|
| 720 |
+
" predicted_patch[j] = patchilizer.special_token_id\n",
|
| 721 |
+
" if predicted_patch[j] == patchilizer.eos_token_id:\n",
|
| 722 |
+
" patch_end_flag = True\n",
|
| 723 |
+
"\n",
|
| 724 |
+
" predicted_patch = torch.tensor([predicted_patch], device=device) # (1, 16)\n",
|
| 725 |
+
" input_patches = torch.cat([input_patches, predicted_patch], dim=1) # (1, 16 * patch_len)\n",
|
| 726 |
+
"\n",
|
| 727 |
+
" if len(byte_list) > 102400:\n",
|
| 728 |
+
" failure_flag = True\n",
|
| 729 |
+
" break\n",
|
| 730 |
+
" if time.time() - start_time > 10 * 60: \n",
|
| 731 |
+
" failure_flag = True\n",
|
| 732 |
+
" break\n",
|
| 733 |
+
"\n",
|
| 734 |
+
" if input_patches.shape[1] >= PATCH_LENGTH * PATCH_SIZE and not end_flag:\n",
|
| 735 |
+
" print('Stream generating...')\n",
|
| 736 |
+
"\n",
|
| 737 |
+
" metadata = ''.join(metadata_byte_list)\n",
|
| 738 |
+
" context_tunebody = ''.join(context_tunebody_byte_list)\n",
|
| 739 |
+
"\n",
|
| 740 |
+
" if '\\n' not in context_tunebody:\n",
|
| 741 |
+
" break # Generated content is all metadata, abandon\n",
|
| 742 |
+
"\n",
|
| 743 |
+
" context_tunebody_liness = context_tunebody.split('\\n')\n",
|
| 744 |
+
" if not context_tunebody.endswith('\\n'):\n",
|
| 745 |
+
" context_tunebody_liness = [context_tunebody_liness[i] + '\\n' for i in range(len(context_tunebody_liness) - 1)] + [context_tunebody_liness[-1]]\n",
|
| 746 |
+
" else:\n",
|
| 747 |
+
" context_tunebody_liness = [context_tunebody_liness[i] + '\\n' for i in range(len(context_tunebody_liness))]\n",
|
| 748 |
+
"\n",
|
| 749 |
+
" cut_index = len(context_tunebody_liness) // 2\n",
|
| 750 |
+
" abc_code_slice = metadata + ''.join(context_tunebody_liness[-cut_index:])\n",
|
| 751 |
+
"\n",
|
| 752 |
+
" input_patches = patchilizer.encode_generate(abc_code_slice)\n",
|
| 753 |
+
"\n",
|
| 754 |
+
" input_patches = [item for sublist in input_patches for item in sublist]\n",
|
| 755 |
+
" input_patches = torch.tensor([input_patches], device=device)\n",
|
| 756 |
+
" input_patches = input_patches.reshape(1, -1)\n",
|
| 757 |
+
"\n",
|
| 758 |
+
" context_tunebody_byte_list = list(''.join(context_tunebody_liness[-cut_index:]))\n",
|
| 759 |
+
"\n",
|
| 760 |
+
" if not failure_flag:\n",
|
| 761 |
+
" abc_text = ''.join(byte_list)\n",
|
| 762 |
+
"\n",
|
| 763 |
+
" # unreduce\n",
|
| 764 |
+
" abc_lines = abc_text.split('\\n')\n",
|
| 765 |
+
" abc_lines = list(filter(None, abc_lines))\n",
|
| 766 |
+
" abc_lines = [line + '\\n' for line in abc_lines]\n",
|
| 767 |
+
" try:\n",
|
| 768 |
+
" unreduced_abc_lines = rest_unreduce(abc_lines)\n",
|
| 769 |
+
" except:\n",
|
| 770 |
+
" failure_flag = True\n",
|
| 771 |
+
" pass\n",
|
| 772 |
+
" else:\n",
|
| 773 |
+
" unreduced_abc_lines = [line for line in unreduced_abc_lines if not(line.startswith('%') and not line.startswith('%%'))]\n",
|
| 774 |
+
" unreduced_abc_lines = ['X:1\\n'] + unreduced_abc_lines\n",
|
| 775 |
+
" unreduced_abc_text = ''.join(unreduced_abc_lines)\n",
|
| 776 |
+
" return unreduced_abc_text"
|
| 777 |
+
]
|
| 778 |
+
},
|
| 779 |
+
{
|
| 780 |
+
"cell_type": "code",
|
| 781 |
+
"execution_count": null,
|
| 782 |
+
"id": "502c4420-533b-43cc-80ca-b2c94cd4be04",
|
| 783 |
+
"metadata": {},
|
| 784 |
+
"outputs": [],
|
| 785 |
+
"source": [
|
| 786 |
+
"result = inference_patch('Classical', 'Beethoven, Ludwig van', 'Art Song')\n",
|
| 787 |
+
"\n",
|
| 788 |
+
"abc_lines = result.splitlines()\n",
|
| 789 |
+
"abc_lines = [line + '\\n' for line in abc_lines if line.strip()] # Add newlines and remove empty lines\n",
|
| 790 |
+
"\n",
|
| 791 |
+
"abc_lines = rest_unreduce(abc_lines)\n",
|
| 792 |
+
"\n",
|
| 793 |
+
"with open(\"output.abc\", \"w\", encoding=\"utf-8\") as f:\n",
|
| 794 |
+
" f.writelines(abc_lines)\n",
|
| 795 |
+
"\n",
|
| 796 |
+
"!python abc2xml.py -o . output.abc"
|
| 797 |
+
]
|
| 798 |
+
}
|
| 799 |
+
],
|
| 800 |
+
"metadata": {
|
| 801 |
+
"kernelspec": {
|
| 802 |
+
"display_name": "Python 3 (ipykernel)",
|
| 803 |
+
"language": "python",
|
| 804 |
+
"name": "python3"
|
| 805 |
+
},
|
| 806 |
+
"language_info": {
|
| 807 |
+
"codemirror_mode": {
|
| 808 |
+
"name": "ipython",
|
| 809 |
+
"version": 3
|
| 810 |
+
},
|
| 811 |
+
"file_extension": ".py",
|
| 812 |
+
"mimetype": "text/x-python",
|
| 813 |
+
"name": "python",
|
| 814 |
+
"nbconvert_exporter": "python",
|
| 815 |
+
"pygments_lexer": "ipython3",
|
| 816 |
+
"version": "3.10.0"
|
| 817 |
+
}
|
| 818 |
+
},
|
| 819 |
+
"nbformat": 4,
|
| 820 |
+
"nbformat_minor": 5
|
| 821 |
+
}
|
demo.py
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import sys
|
| 3 |
+
import threading
|
| 4 |
+
import queue
|
| 5 |
+
from io import TextIOBase
|
| 6 |
+
from inference import inference_patch
|
| 7 |
+
import datetime
|
| 8 |
+
import subprocess
|
| 9 |
+
import os
|
| 10 |
+
|
| 11 |
+
# Predefined valid combinations set.
# Each non-empty line of prompts.txt is "<period>_<composer>_<instrumentation>".
with open('prompts.txt', 'r', encoding='utf-8') as f:
    prompts = f.readlines()
valid_combinations = set()
for prompt in prompts:
    prompt = prompt.strip()
    if not prompt:
        continue  # skip blank lines instead of crashing below
    parts = prompt.split('_')
    if len(parts) < 3:
        continue  # malformed line: a valid prompt has three '_'-separated fields
    valid_combinations.add((parts[0], parts[1], parts[2]))

# Generate available options for the three dropdowns.
periods = sorted({p for p, _, _ in valid_combinations})
composers = sorted({c for _, c, _ in valid_combinations})
instruments = sorted({i for _, _, i in valid_combinations})
|
| 24 |
+
|
| 25 |
+
# Dynamic component updates
|
| 26 |
+
def update_components(period, composer):
    """Refresh the composer and instrumentation dropdowns.

    Restricts both dependent dropdowns to the period/composer/instrumentation
    triples loaded from prompts.txt. With no period selected, both dropdowns
    are emptied and disabled.
    """
    if not period:
        # Nothing selected yet: disable both dependent dropdowns.
        return [
            gr.Dropdown(choices=[], value=None, interactive=False),
            gr.Dropdown(choices=[], value=None, interactive=False)
        ]

    composer_choices = sorted({c for p, c, _ in valid_combinations if p == period})
    if composer:
        instrument_choices = sorted(
            {i for p, c, i in valid_combinations if p == period and c == composer}
        )
    else:
        instrument_choices = []

    # Keep the current composer only if it is still valid for this period.
    composer_dd = gr.Dropdown(
        choices=composer_choices,
        value=composer if composer in composer_choices else None,
        interactive=True
    )
    instrument_dd = gr.Dropdown(
        choices=instrument_choices,
        value=None,
        interactive=bool(instrument_choices)
    )
    return [composer_dd, instrument_dd]
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class RealtimeStream(TextIOBase):
    """Write-only text stream that forwards every chunk to a queue.

    Installed as sys.stdout so the generation thread's prints can be
    streamed to the UI in real time.
    """

    def __init__(self, queue):
        # Consumer queue shared with the UI polling loop.
        self.queue = queue

    def write(self, text):
        """Push *text* onto the queue and report it fully written."""
        self.queue.put(text)
        written = len(text)
        return written
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def save_and_convert(abc_content, period, composer, instrumentation):
    """Write the generated ABC text to disk and convert it to MusicXML.

    Output files are named "<timestamp>_<period>_<composer>_<instrumentation>"
    with .abc and .xml extensions; conversion is delegated to abc2xml.py.

    Raises:
        gr.Error: if the prompt selection is incomplete, or the external
            conversion process exits non-zero.
    """
    if not (period and composer and instrumentation):
        raise gr.Error("Please complete a valid generation first before saving")

    stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    base_name = f"{stamp}_{period}_{composer}_{instrumentation}"
    abc_path = f"{base_name}.abc"
    xml_path = f"{base_name}.xml"

    with open(abc_path, "w", encoding="utf-8") as out:
        out.write(abc_content)

    try:
        # abc2xml.py writes the .xml next to the .abc (-o .).
        subprocess.run(
            ["python", "abc2xml.py", "-o", ".", abc_path],
            check=True,
            capture_output=True,
            text=True
        )
    except subprocess.CalledProcessError as exc:
        error_msg = f"Conversion failed: {exc.stderr}" if exc.stderr else "Unknown error"
        raise gr.Error(f"ABC to XML conversion failed: {error_msg}. Please try to generate another composition.")

    return f"Saved successfully: {abc_path} -> {xml_path}"
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def generate_music(period, composer, instrumentation):
    """Stream a generated composition for the given prompt triple.

    Generator used as a Gradio event handler. Yields tuples of
    (process_output, final_result): process_output grows as the model
    prints characters, and final_result stays None until generation ends.

    Raises:
        gr.Error: if (period, composer, instrumentation) is not one of the
            combinations loaded from prompts.txt.
    """
    if (period, composer, instrumentation) not in valid_combinations:
        raise gr.Error("Invalid prompt combination! Please re-select from the period options")

    # Redirect stdout so the inference thread's prints flow into a queue.
    # NOTE(review): sys.stdout is process-global, so concurrent generations
    # would interleave output — assumes one generation at a time; confirm.
    output_queue = queue.Queue()
    original_stdout = sys.stdout
    sys.stdout = RealtimeStream(output_queue)

    result_container = []
    def run_inference():
        try:
            result_container.append(inference_patch(period, composer, instrumentation))
        finally:
            # Always restore stdout, even if inference raises.
            sys.stdout = original_stdout

    thread = threading.Thread(target=run_inference)
    thread.start()

    # Poll the queue while the worker runs, yielding the growing log.
    process_output = ""
    while thread.is_alive():
        try:
            text = output_queue.get(timeout=0.1)
            process_output += text
            yield process_output, None
        except queue.Empty:
            continue

    # Drain anything printed between the last poll and thread exit.
    while not output_queue.empty():
        text = output_queue.get()
        process_output += text
        yield process_output, None

    # An empty container means the worker raised; surface an empty score.
    final_result = result_container[0] if result_container else ""
    yield process_output, final_result
|
| 121 |
+
|
| 122 |
+
with gr.Blocks() as demo:
    gr.Markdown("## NotaGen")

    with gr.Row():
        # Left column: prompt selection, generate button, live log.
        with gr.Column():
            period_dd = gr.Dropdown(
                choices=periods,
                value=None,
                label="Period",
                interactive=True
            )
            composer_dd = gr.Dropdown(
                choices=[],
                value=None,
                label="Composer",
                interactive=False
            )
            instrument_dd = gr.Dropdown(
                choices=[],
                value=None,
                label="Instrumentation",
                interactive=False
            )

            generate_btn = gr.Button("Generate!", variant="primary")

            process_output = gr.Textbox(
                label="Generation process",
                interactive=False,
                lines=15,
                max_lines=15,
                placeholder="Generation progress will be shown here...",
                elem_classes="process-output"
            )

        # Right column: final post-processed score and save controls.
        with gr.Column():
            final_output = gr.Textbox(
                label="Post-processed ABC notation scores",
                interactive=True,
                lines=23,
                placeholder="Post-processed ABC scores will be shown here...",
                elem_classes="final-output"
            )

            with gr.Row():
                save_btn = gr.Button("💾 Save as ABC & XML files", variant="secondary")

            save_status = gr.Textbox(
                label="Save Status",
                interactive=False,
                visible=True,
                max_lines=2
            )

    # Cascade the dropdowns: changing period or composer re-filters the
    # dependent choices via update_components.
    period_dd.change(
        update_components,
        inputs=[period_dd, composer_dd],
        outputs=[composer_dd, instrument_dd]
    )
    composer_dd.change(
        update_components,
        inputs=[period_dd, composer_dd],
        outputs=[composer_dd, instrument_dd]
    )

    # generate_music is a generator, so the two outputs stream live.
    generate_btn.click(
        generate_music,
        inputs=[period_dd, composer_dd, instrument_dd],
        outputs=[process_output, final_output]
    )

    save_btn.click(
        save_and_convert,
        inputs=[final_output, period_dd, composer_dd, instrument_dd],
        outputs=[save_status]
    )
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
# Styling for the two output panes (classes set via elem_classes above).
css = """
.process-output {
    background-color: #f0f0f0;
    font-family: monospace;
    padding: 10px;
    border-radius: 5px;
}
.final-output {
    background-color: #ffffff;
    font-family: sans-serif;
    padding: 10px;
    border-radius: 5px;
}

.process-output textarea {
    max-height: 500px !important;
    overflow-y: auto !important;
    white-space: pre-wrap;
}

"""
# NOTE(review): this selector targets a button id that is never assigned
# (the save button sets no elem_id), so the hover rule likely has no
# effect — confirm against the rendered DOM before relying on it.
css += """
button#💾-save-convert:hover {
    background-color: #ffe6e6;
}
"""

demo.css = css
|
| 230 |
+
|
| 231 |
+
if __name__ == "__main__":

    # Bind to all interfaces so the demo is reachable from other hosts.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7861
    )
|
extract_clamp2.py
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
input_dir = ''  # interleaved abc folder
output_dir = ''  # feature folder

import os
import json
import random
import torch
import numpy as np
from tqdm import tqdm
from config import *
from utils import *
from samplings import *
from accelerate import Accelerator
from transformers import BertConfig, AutoTokenizer
import argparse


# Whether extract_feature pools segments into one normalized vector.
normalize = True

# Start each run with fresh log/bookkeeping files.
os.makedirs("logs", exist_ok=True)
for file in ["logs/files_extract_clamp2.json",
             "logs/files_shuffle_extract_clamp2.json",
             "logs/log_extract_clamp2.txt",
             "logs/pass_extract_clamp2.txt",
             "logs/skip_extract_clamp2.txt"]:
    if os.path.exists(file):
        os.remove(file)

# Collect every .txt/.abc/.mtf file under the input directory.
files = []
for root, dirs, fs in os.walk(input_dir):
    for f in fs:
        if f.endswith(".txt") or f.endswith(".abc") or f.endswith(".mtf"):
            files.append(os.path.join(root, f))
print(f"Found {len(files)} files in total")
with open("logs/files_extract_clamp2.json", "w", encoding="utf-8") as f:
    json.dump(files, f)
# Shuffle so multi-process shards get a balanced mix of file sizes.
random.shuffle(files)
with open("logs/files_shuffle_extract_clamp2.json", "w", encoding="utf-8") as f:
    json.dump(files, f)
|
| 40 |
+
|
| 41 |
+
accelerator = Accelerator()
device = accelerator.device
print("Using device:", device)
# Log to the file managed by the startup cleanup above
# (was "logs/log_extract_clamp.txt", which was never removed between runs).
with open("logs/log_extract_clamp2.txt", "a", encoding="utf-8") as f:
    f.write("Using device: " + str(device) + "\n")

# M3 symbolic-music encoder configuration (BERT backbone).
m3_config = BertConfig(vocab_size=1,
                       hidden_size=M3_HIDDEN_SIZE,
                       num_hidden_layers=PATCH_NUM_LAYERS,
                       num_attention_heads=M3_HIDDEN_SIZE//64,
                       intermediate_size=M3_HIDDEN_SIZE*4,
                       max_position_embeddings=PATCH_LENGTH)
model = CLaMP2Model(m3_config,
                    text_model_name=TEXT_MODEL_NAME,
                    hidden_size=CLAMP2_HIDDEN_SIZE,
                    load_m3=CLAMP2_LOAD_M3)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL_NAME)
patchilizer = M3Patchilizer()

# print parameter number
print("Parameter Number: "+str(sum(p.numel() for p in model.parameters() if p.requires_grad)))

model.eval()
checkpoint = torch.load(CLAMP2_WEIGHTS_PATH, map_location='cpu', weights_only=True)
print(f"Successfully Loaded CLaMP 2 Checkpoint from Epoch {checkpoint['epoch']} with loss {checkpoint['min_eval_loss']}")
model.load_state_dict(checkpoint['model'])
|
| 68 |
+
|
| 69 |
+
def extract_feature(filename, get_normalized=normalize):
    """Encode one file with CLaMP 2 and return its feature representation.

    .txt files go through the text encoder; anything else (.abc/.mtf)
    through the music (M3) encoder. Long inputs are split into
    max-length segments. When get_normalized is True, segment features
    are length-weighted and averaged into a single vector; otherwise the
    per-position hidden states are concatenated.
    """
    with open(filename, "r", encoding="utf-8") as f:
        lines = f.readlines()

    # Drop single-'%' comment lines but keep '%%' directives.
    filtered_lines = []
    for line in lines:
        if line.startswith('%') and not line.startswith('%%'):
            pass
        else:
            filtered_lines.append(line)

    item = ''.join(filtered_lines)

    if filename.endswith(".txt"):
        # Text input: deduplicate lines and join with the tokenizer's
        # separator token.
        # NOTE(review): set() makes line order nondeterministic run-to-run.
        item = list(set(item.split("\n")))
        item = "\n".join(item)
        item = item.split("\n")
        item = [c for c in item if len(c) > 0]
        item = tokenizer.sep_token.join(item)
        input_data = tokenizer(item, return_tensors="pt")
        input_data = input_data['input_ids'].squeeze(0)
        max_input_length = MAX_TEXT_LENGTH
    else:
        # Music input: patchilize into fixed-size patches.
        input_data = patchilizer.encode(item, add_special_patches=True)
        input_data = torch.tensor(input_data)
        max_input_length = PATCH_LENGTH

    # Chunk into max-length segments; the final (short) chunk is replaced
    # by the last max_input_length elements so it is full-width (its
    # duplicated prefix is trimmed/re-weighted below).
    segment_list = []
    for i in range(0, len(input_data), max_input_length):
        segment_list.append(input_data[i:i+max_input_length])
    segment_list[-1] = input_data[-max_input_length:]

    last_hidden_states_list = []

    for input_segment in segment_list:
        # Mask covers the real positions; pad the segment to full length.
        input_masks = torch.tensor([1]*input_segment.size(0))
        if filename.endswith(".txt"):
            pad_indices = torch.ones(MAX_TEXT_LENGTH - input_segment.size(0)).long() * tokenizer.pad_token_id
        else:
            pad_indices = torch.ones((PATCH_LENGTH - input_segment.size(0), PATCH_SIZE)).long() * patchilizer.pad_token_id
        input_masks = torch.cat((input_masks, torch.zeros(max_input_length - input_segment.size(0))), 0)
        input_segment = torch.cat((input_segment, pad_indices), 0)

        if filename.endswith(".txt"):
            last_hidden_states = model.get_text_features(text_inputs=input_segment.unsqueeze(0).to(device),
                                                         text_masks=input_masks.unsqueeze(0).to(device),
                                                         get_normalized=get_normalized)
        else:
            last_hidden_states = model.get_music_features(music_inputs=input_segment.unsqueeze(0).to(device),
                                                          music_masks=input_masks.unsqueeze(0).to(device),
                                                          get_normalized=get_normalized)
        if not get_normalized:
            # Keep only the unpadded positions.
            last_hidden_states = last_hidden_states[:, :input_masks.sum().long().item(), :]
        last_hidden_states_list.append(last_hidden_states)

    if not get_normalized:
        # Concatenate per-position states; trim from the right-aligned
        # final segment the overlap it shares with the previous one
        # (a 0 remainder keeps the whole segment, since [-0:] == [0:]).
        last_hidden_states_list = [last_hidden_states[0] for last_hidden_states in last_hidden_states_list]
        last_hidden_states_list[-1] = last_hidden_states_list[-1][-(len(input_data)%max_input_length):]
        last_hidden_states_list = torch.concat(last_hidden_states_list, 0)
    else:
        # Weight each segment's pooled feature by its true length, then
        # average into one vector.
        full_chunk_cnt = len(input_data) // max_input_length
        remain_chunk_len = len(input_data) % max_input_length
        if remain_chunk_len == 0:
            feature_weights = torch.tensor([max_input_length] * full_chunk_cnt, device=device).view(-1, 1)
        else:
            feature_weights = torch.tensor([max_input_length] * full_chunk_cnt + [remain_chunk_len], device=device).view(-1, 1)

        last_hidden_states_list = torch.concat(last_hidden_states_list, 0)
        last_hidden_states_list = last_hidden_states_list * feature_weights
        last_hidden_states_list = last_hidden_states_list.sum(dim=0) / feature_weights.sum()

    return last_hidden_states_list
|
| 141 |
+
|
| 142 |
+
def process_directory(input_dir, output_dir, files):
    """Extract CLaMP 2 features for this process's shard of *files*.

    Each input file is mirrored under *output_dir* with a .npy extension;
    existing outputs are skipped. The file list is split contiguously
    across Accelerate processes, the last process taking the remainder.
    Failures are logged and do not stop the run.
    """
    print(f"Found {len(files)} files in total")
    # Log filename fixed to match the files cleaned up at startup
    # (was "logs/log_extract_clamp.txt").
    with open("logs/log_extract_clamp2.txt", "a", encoding="utf-8") as f:
        f.write("Found " + str(len(files)) + " files in total\n")

    # calculate the number of files to process per GPU
    num_files_per_gpu = len(files) // accelerator.num_processes

    # calculate the start and end index for the current GPU
    start_idx = accelerator.process_index * num_files_per_gpu
    end_idx = start_idx + num_files_per_gpu
    # The last process also handles the division remainder.
    if accelerator.process_index == accelerator.num_processes - 1:
        end_idx = len(files)

    files_to_process = files[start_idx:end_idx]

    # process the files
    for file in tqdm(files_to_process):
        # Mirror the input subdirectory structure under output_dir.
        output_subdir = output_dir + os.path.dirname(file)[len(input_dir):]
        try:
            os.makedirs(output_subdir, exist_ok=True)
        except Exception as e:
            print(output_subdir + " can not be created\n" + str(e))
            with open("logs/log_extract_clamp2.txt", "a", encoding="utf-8") as f:
                f.write(output_subdir + " can not be created\n" + str(e) + "\n")

        output_file = os.path.join(output_subdir, os.path.splitext(os.path.basename(file))[0] + ".npy")

        if os.path.exists(output_file):
            print(f"Skipping {file}, output already exists")
            with open("logs/skip_extract_clamp2.txt", "a", encoding="utf-8") as f:
                f.write(file + "\n")
            continue

        try:
            with torch.no_grad():
                features = extract_feature(file).unsqueeze(0)
            np.save(output_file, features.detach().cpu().numpy())
            with open("logs/pass_extract_clamp2.txt", "a", encoding="utf-8") as f:
                f.write(file + "\n")
        except Exception as e:
            # Best-effort batch job: record the failure and move on.
            print(f"Failed to process {file}: {e}")
            with open("logs/log_extract_clamp2.txt", "a", encoding="utf-8") as f:
                f.write("Failed to process " + file + ": " + str(e) + "\n")
|
| 186 |
+
|
| 187 |
+
# Re-load the shuffled list so every process sees the identical ordering.
with open("logs/files_shuffle_extract_clamp2.json", "r", encoding="utf-8") as f:
    files = json.load(f)

# process the files
process_directory(input_dir, output_dir, files)

# Log filename fixed to match the files cleaned up at startup
# (was "logs/log_extract_clamp.txt").
with open("logs/log_extract_clamp2.txt", "a", encoding="utf-8") as f:
    f.write("GPU ID: " + str(device) + "\n")
|
illustration.png
ADDED
|
Git LFS Details
|
illustration_online.png
ADDED
|
Git LFS Details
|
inference (1).py
ADDED
|
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import os
import time
import torch
from utils import *
from config import *
from transformers import GPT2Config, LlamaConfig
from abctoolkit.utils import Exclaim_re, Quote_re, SquareBracket_re, Barline_regexPattern
from abctoolkit.transpose import Note_list, Pitch_sign_list
from abctoolkit.duration import calculate_bartext_duration

# Treat 'z' (visible rest) and 'x' (invisible rest) as note symbols too.
Note_list = Note_list + ['z', 'x']

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

os.makedirs(ORIGINAL_OUTPUT_FOLDER, exist_ok=True)
os.makedirs(INTERLEAVED_OUTPUT_FOLDER, exist_ok=True)

patchilizer = Patchilizer()

# Patch-level decoder configuration (operates on bar/line patches).
patch_config = GPT2Config(num_hidden_layers=PATCH_NUM_LAYERS,
                          max_length=PATCH_LENGTH,
                          max_position_embeddings=PATCH_LENGTH,
                          n_embd=HIDDEN_SIZE,
                          num_attention_heads=HIDDEN_SIZE // 64,
                          vocab_size=1)
# Character-level decoder configuration (128 = ASCII vocabulary).
byte_config = GPT2Config(num_hidden_layers=CHAR_NUM_LAYERS,
                         max_length=PATCH_SIZE + 1,
                         max_position_embeddings=PATCH_SIZE + 1,
                         hidden_size=HIDDEN_SIZE,
                         num_attention_heads=HIDDEN_SIZE // 64,
                         vocab_size=128)

model = NotaGenLMHeadModel(encoder_config=patch_config, decoder_config=byte_config)

print("Parameter Number: " + str(sum(p.numel() for p in model.parameters() if p.requires_grad)))

checkpoint = torch.load(INFERENCE_WEIGHTS_PATH, map_location=torch.device(device))
model.load_state_dict(checkpoint['model'])
model = model.to(device)
model.eval()
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def rest_unreduce(abc_lines):
    """Expand a rest-reduced interleaved ABC piece back to its full form.

    In reduced form a tunebody line only lists the voices that actually play
    in that bar.  This function re-inserts the omitted voices as whole-bar
    rests: 'z' (visible rest) for the first voice of each %%score group and
    'x' (invisible rest) for the remaining voices of the group.

    Args:
        abc_lines: list of newline-terminated ABC lines, metadata first,
            followed by the interleaved tunebody.

    Returns:
        A new list of lines in which every declared voice appears in every
        tunebody line.

    Raises:
        ValueError: if no tunebody line (one containing '[V:') exists.
            (Previously this fell through and crashed later with a
            TypeError from ``abc_lines[None:]``.)
    """
    # Locate the first tunebody line; everything before it is metadata.
    tunebody_index = None
    for i in range(len(abc_lines)):
        if '[V:' in abc_lines[i]:
            tunebody_index = i
            break
    if tunebody_index is None:
        raise ValueError("no tunebody line containing '[V:' found")

    metadata_lines = abc_lines[: tunebody_index]
    tunebody_lines = abc_lines[tunebody_index:]

    # Parse the %%score directive and V: headers to learn, for each voice,
    # whether it should be padded with 'z' (group leader) or 'x' (member).
    part_symbol_list = []
    voice_group_list = []
    # Initialised up front: previously first bound only inside the %%score
    # branch, raising NameError when a V: header preceded any %%score line.
    existed_voices = []
    for line in metadata_lines:
        if line.startswith('%%score'):
            for round_bracket_match in re.findall(r'\((.*?)\)', line):
                voice_group_list.append(round_bracket_match.split())
            existed_voices = [item for sublist in voice_group_list for item in sublist]
        if line.startswith('V:'):
            symbol = line.split()[0]
            part_symbol_list.append(symbol)
            if symbol[2:] not in existed_voices:
                voice_group_list.append([symbol[2:]])
    z_symbol_list = []  # voices that use z as rest (first of each group)
    x_symbol_list = []  # voices that use x as rest (other group members)
    for voice_group in voice_group_list:
        z_symbol_list.append('V:' + voice_group[0])
        for j in range(1, len(voice_group)):
            x_symbol_list.append('V:' + voice_group[j])

    # Voice symbols are 'V:<number>'; sort numerically, not lexically.
    part_symbol_list.sort(key=lambda x: int(x[2:]))

    unreduced_tunebody_lines = []

    # Carried across lines: the most recent reference bar duration and the
    # most recent right barline.  Previously both were first bound inside
    # loops, so a first line with no parsable bars raised NameError.
    ref_dur = None
    right_barline = ''

    for i, line in enumerate(tunebody_lines):
        unreduced_line = ''

        # Strip the '[r:...]' reduction marker, if present.
        line = re.sub(r'^\[r:[^\]]*\]', '', line)

        # Split the line into per-voice bar texts.
        pattern = r'\[V:(\d+)\](.*?)(?=\[V:|$)'
        matches = re.findall(pattern, line)

        line_bar_dict = {}
        for match in matches:
            key = f'V:{match[0]}'
            value = match[1]
            line_bar_dict[key] = value

        # Vote on the bar duration: the most common duration among the
        # voices present in this line becomes the rest duration.
        dur_dict = {}
        for symbol, bartext in line_bar_dict.items():
            right_barline = ''.join(re.split(Barline_regexPattern, bartext)[-2:])
            bartext = bartext[:-len(right_barline)]
            try:
                bar_dur = calculate_bartext_duration(bartext)
            except Exception:
                bar_dur = None
            if bar_dur is not None:
                dur_dict[bar_dur] = dur_dict.get(bar_dur, 0) + 1

        if dur_dict:
            ref_dur = max(dur_dict, key=dur_dict.get)
        # else: keep the previous line's ref_dur ("use last ref_dur").

        # Only the very first tunebody line may carry a left barline prefix.
        if i == 0:
            prefix_left_barline = line.split('[V:')[0]
        else:
            prefix_left_barline = ''

        # Emit every declared voice, synthesising rest bars where missing.
        for symbol in part_symbol_list:
            if symbol in line_bar_dict.keys():
                symbol_bartext = line_bar_dict[symbol]
            else:
                if symbol in z_symbol_list:
                    symbol_bartext = prefix_left_barline + 'z' + str(ref_dur) + right_barline
                elif symbol in x_symbol_list:
                    symbol_bartext = prefix_left_barline + 'x' + str(ref_dur) + right_barline
            unreduced_line += '[' + symbol + ']' + symbol_bartext

        unreduced_tunebody_lines.append(unreduced_line + '\n')

    unreduced_lines = metadata_lines + unreduced_tunebody_lines

    return unreduced_lines
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def inference_patch(prompt_lines=[], pieces=NUM_SAMPLES):
    """Generate `pieces` ABC samples with the module-level model and save them.

    Args:
        prompt_lines: optional metadata prompt lines fed to the model before
            generation starts.  (The mutable default is safe here: the list
            is only read, never mutated.)
        pieces: number of successful samples to produce before returning.

    Side effects: on success, writes the rest-unreduced piece to
    INTERLEAVED_OUTPUT_FOLDER and (when unreduction succeeds) the raw text
    to ORIGINAL_OUTPUT_FOLDER; streams every generated character to stdout.
    """

    file_no = 1

    # Sentinel patch marking the beginning of a sequence.
    bos_patch = [patchilizer.bos_token_id] * (PATCH_SIZE - 1) + [patchilizer.eos_token_id]

    while file_no <= pieces:

        start_time = time.time()
        start_time_format = time.strftime("%Y%m%d-%H%M%S")

        # Encode the prompt into fixed-size patches, padded with the
        # special token, and prefix the BOS patch.
        prompt_patches = patchilizer.patchilize_metadata(prompt_lines)
        byte_list = list(''.join(prompt_lines))
        print(''.join(byte_list), end='')

        prompt_patches = [[ord(c) for c in patch] + [patchilizer.special_token_id] * (PATCH_SIZE - len(patch)) for patch
                          in prompt_patches]
        prompt_patches.insert(0, bos_patch)

        input_patches = torch.tensor(prompt_patches, device=device).reshape(1, -1)

        failure_flag = False
        end_flag = False
        cut_index = None

        tunebody_flag = False
        while True:
            predicted_patch = model.generate(input_patches.unsqueeze(0),
                                             top_k=TOP_K,
                                             top_p=TOP_P,
                                             temperature=TEMPERATURE)
            # On first entry into the tunebody, force it to start with '[r:0/'
            # by re-generating with that prefix appended to the context.
            if not tunebody_flag and patchilizer.decode([predicted_patch]).startswith('[r:'):  # start with [r:0/
                tunebody_flag = True
                r0_patch = torch.tensor([ord(c) for c in '[r:0/']).unsqueeze(0).to(device)
                temp_input_patches = torch.concat([input_patches, r0_patch], axis=-1)
                predicted_patch = model.generate(temp_input_patches.unsqueeze(0),
                                                 top_k=TOP_K,
                                                 top_p=TOP_P,
                                                 temperature=TEMPERATURE)
                predicted_patch = [ord(c) for c in '[r:0/'] + predicted_patch
            # BOS immediately followed by EOS signals end of piece.
            if predicted_patch[0] == patchilizer.bos_token_id and predicted_patch[1] == patchilizer.eos_token_id:
                end_flag = True
                break
            next_patch = patchilizer.decode([predicted_patch])

            for char in next_patch:
                byte_list.append(char)
                print(char, end='')

            # Blank out everything after the first EOS inside the patch with
            # the special (padding) token.
            patch_end_flag = False
            for j in range(len(predicted_patch)):
                if patch_end_flag:
                    predicted_patch[j] = patchilizer.special_token_id
                if predicted_patch[j] == patchilizer.eos_token_id:
                    patch_end_flag = True

            predicted_patch = torch.tensor([predicted_patch], device=device)  # (1, 16)
            input_patches = torch.cat([input_patches, predicted_patch], dim=1)  # (1, 16 * patch_len)

            # Abandon runaway generations (too long or too slow).
            if len(byte_list) > 102400:
                failure_flag = True
                break
            if time.time() - start_time > 20 * 60:
                failure_flag = True
                break

            # Context window full: keep metadata plus the newest half of the
            # tunebody and continue streaming from that shortened context.
            if input_patches.shape[1] >= PATCH_LENGTH * PATCH_SIZE and not end_flag:
                print('Stream generating...')
                abc_code = ''.join(byte_list)
                abc_lines = abc_code.split('\n')

                tunebody_index = None
                for i, line in enumerate(abc_lines):
                    if line.startswith('[r:') or line.startswith('[V:'):
                        tunebody_index = i
                        break
                if tunebody_index is None or tunebody_index == len(abc_lines) - 1:
                    break

                metadata_lines = abc_lines[:tunebody_index]
                tunebody_lines = abc_lines[tunebody_index:]

                # Re-append the newlines removed by split(); the final line
                # keeps its unterminated state if generation stopped mid-line.
                metadata_lines = [line + '\n' for line in metadata_lines]
                if not abc_code.endswith('\n'):
                    tunebody_lines = [tunebody_lines[i] + '\n' for i in range(len(tunebody_lines) - 1)] + [
                        tunebody_lines[-1]]
                else:
                    tunebody_lines = [tunebody_lines[i] + '\n' for i in range(len(tunebody_lines))]

                if cut_index is None:
                    cut_index = len(tunebody_lines) // 2

                abc_code_slice = ''.join(metadata_lines + tunebody_lines[-cut_index:])
                input_patches = patchilizer.encode_generate(abc_code_slice)

                input_patches = [item for sublist in input_patches for item in sublist]
                input_patches = torch.tensor([input_patches], device=device)
                input_patches = input_patches.reshape(1, -1)

        if not failure_flag:
            generation_time_cost = time.time() - start_time

            abc_text = ''.join(byte_list)
            # Filename encodes timestamp, generation time, and sample index.
            filename = time.strftime("%Y%m%d-%H%M%S") + \
                       "_" + format(generation_time_cost, '.2f') + '_' + str(file_no) + ".abc"

            # unreduce
            unreduced_output_path = os.path.join(INTERLEAVED_OUTPUT_FOLDER, filename)

            abc_lines = abc_text.split('\n')
            abc_lines = list(filter(None, abc_lines))
            abc_lines = [line + '\n' for line in abc_lines]
            try:
                abc_lines = rest_unreduce(abc_lines)

                with open(unreduced_output_path, 'w') as file:
                    file.writelines(abc_lines)
            except:
                # NOTE(review): unreduction failures are deliberately
                # swallowed; the raw text is then saved below instead.
                pass
            else:
                # original
                original_output_path = os.path.join(ORIGINAL_OUTPUT_FOLDER, filename)
                with open(original_output_path, 'w') as w:
                    w.write(abc_text)

            file_no += 1

        else:
            print('failed')
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
if __name__ == '__main__':

    # Default run: unconditional generation with an empty prompt.
    inference_patch()
|
inference.py
ADDED
|
@@ -0,0 +1,318 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Online inference module: builds the NotaGen model once at import time so
# that inference_patch() can be called repeatedly (e.g. from a demo UI).
import os
import time
import torch
from utils import *
from config import *
from transformers import GPT2Config
from abctoolkit.utils import Exclaim_re, Quote_re, SquareBracket_re, Barline_regexPattern
from abctoolkit.transpose import Note_list, Pitch_sign_list
from abctoolkit.duration import calculate_bartext_duration

# Extend the pitch alphabet with the two ABC rest symbols.
Note_list = Note_list + ['z', 'x']

# Device priority: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

patchilizer = Patchilizer()

# Patch-level decoder configuration.
patch_config = GPT2Config(num_hidden_layers=PATCH_NUM_LAYERS,
                          max_length=PATCH_LENGTH,
                          max_position_embeddings=PATCH_LENGTH,
                          n_embd=HIDDEN_SIZE,
                          num_attention_heads=HIDDEN_SIZE // 64,
                          vocab_size=1)
# Character-level decoder configuration (vocab = 128 ASCII byte values).
byte_config = GPT2Config(num_hidden_layers=CHAR_NUM_LAYERS,
                         max_length=PATCH_SIZE + 1,
                         max_position_embeddings=PATCH_SIZE + 1,
                         hidden_size=HIDDEN_SIZE,
                         num_attention_heads=HIDDEN_SIZE // 64,
                         vocab_size=128)

model = NotaGenLMHeadModel(encoder_config=patch_config, decoder_config=byte_config).to(device)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def prepare_model_for_kbit_training(model, use_gradient_checkpointing=True):
    """Cast *model* to FP16 and optionally enable gradient checkpointing.

    Steps:
      1. Convert all parameters to half precision (in place).
      2. Freeze any parameter still left in float32 — after the cast this
         loop is normally a no-op, but it guards modules that resist
         conversion.
      3. Enable gradient checkpointing when requested (the model must
         expose ``gradient_checkpointing_enable``, e.g. HF models).

    Returns the prepared model.
    """
    # Mixed-precision cast; nn.Module.to mutates and returns the module.
    half_model = model.to(dtype=torch.float16)

    # Any parameter that is still float32 is excluded from training.
    for parameter in half_model.parameters():
        if parameter.dtype == torch.float32:
            parameter.requires_grad = False

    if use_gradient_checkpointing:
        half_model.gradient_checkpointing_enable()

    return half_model
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# Cast the model to FP16 for inference (gradient checkpointing off — it is
# only useful during training).
model = prepare_model_for_kbit_training(
    model,
    use_gradient_checkpointing=False
)

print("Parameter Number: " + str(sum(p.numel() for p in model.parameters() if p.requires_grad)))

# Load the fine-tuned weights and switch to evaluation mode.
checkpoint = torch.load(INFERENCE_WEIGHTS_PATH, map_location=torch.device(device))
model.load_state_dict(checkpoint['model'])
model = model.to(device)
model.eval()
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def complete_brackets(s):
    """Return *s* with closing brackets appended for every unmatched opener.

    Tracks '{', '[' and '(' on a stack; a closing bracket pops the stack
    only when it matches the most recent opener (mismatched or stray
    closers are left alone).  Whatever openers remain are closed in
    reverse order and appended to the string.
    """
    closers = {'{': '}', '[': ']', '(': ')'}
    openers = {close: open_ for open_, close in closers.items()}

    pending = []
    for ch in s:
        if ch in closers:
            # Opening bracket: remember it.
            pending.append(ch)
        elif ch in openers:
            # Closing bracket: consume the matching opener if it is on top.
            if pending and pending[-1] == openers[ch]:
                pending.pop()

    # Close the still-open brackets, innermost first.
    return s + ''.join(closers[ch] for ch in reversed(pending))
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def rest_unreduce(abc_lines):
    """Expand a rest-reduced interleaved ABC piece back to its full form.

    Reduced tunebody lines only list the voices that actually play; this
    re-inserts the missing voices as whole-bar rests ('z' for the first
    voice of each %%score group, invisible 'x' for the other members).
    The %%score line is repaired with complete_brackets() before parsing.

    NOTE(review): ref_dur, right_barline and existed_voices are first bound
    inside loops — unusual inputs (a first tunebody line with no parsable
    bars, or a V: header before any %%score) would raise NameError; confirm
    whether the generator guarantees well-formed input.
    """

    # Find the first tunebody line, fixing unbalanced %%score brackets on
    # the way; everything before it is metadata.
    tunebody_index = None
    for i in range(len(abc_lines)):
        if abc_lines[i].startswith('%%score'):
            abc_lines[i] = complete_brackets(abc_lines[i])
        if '[V:' in abc_lines[i]:
            tunebody_index = i
            break

    metadata_lines = abc_lines[: tunebody_index]
    tunebody_lines = abc_lines[tunebody_index:]

    # Parse %%score groups and V: headers to decide which rest symbol each
    # voice gets when it is missing from a line.
    part_symbol_list = []
    voice_group_list = []
    for line in metadata_lines:
        if line.startswith('%%score'):
            for round_bracket_match in re.findall(r'\((.*?)\)', line):
                voice_group_list.append(round_bracket_match.split())
            existed_voices = [item for sublist in voice_group_list for item in sublist]
        if line.startswith('V:'):
            symbol = line.split()[0]
            part_symbol_list.append(symbol)
            if symbol[2:] not in existed_voices:
                voice_group_list.append([symbol[2:]])
    z_symbol_list = []  # voices that use z as rest
    x_symbol_list = []  # voices that use x as rest
    for voice_group in voice_group_list:
        z_symbol_list.append('V:' + voice_group[0])
        for j in range(1, len(voice_group)):
            x_symbol_list.append('V:' + voice_group[j])

    # Voice symbols are 'V:<number>'; sort numerically, not lexically.
    part_symbol_list.sort(key=lambda x: int(x[2:]))

    unreduced_tunebody_lines = []

    for i, line in enumerate(tunebody_lines):
        unreduced_line = ''

        # Strip the '[r:...]' reduction marker, if present.
        line = re.sub(r'^\[r:[^\]]*\]', '', line)

        # Split the line into per-voice bar texts.
        pattern = r'\[V:(\d+)\](.*?)(?=\[V:|$)'
        matches = re.findall(pattern, line)

        line_bar_dict = {}
        for match in matches:
            key = f'V:{match[0]}'
            value = match[1]
            line_bar_dict[key] = value

        # calculate duration and collect barline
        dur_dict = {}
        for symbol, bartext in line_bar_dict.items():
            right_barline = ''.join(re.split(Barline_regexPattern, bartext)[-2:])
            bartext = bartext[:-len(right_barline)]
            try:
                bar_dur = calculate_bartext_duration(bartext)
            except:
                bar_dur = None
            if bar_dur is not None:
                if bar_dur not in dur_dict.keys():
                    dur_dict[bar_dur] = 1
                else:
                    dur_dict[bar_dur] += 1

        # Majority vote on the bar duration used for synthesized rests.
        try:
            ref_dur = max(dur_dict, key=dur_dict.get)
        except:
            pass  # use last ref_dur

        # Only the very first tunebody line may carry a left-barline prefix.
        if i == 0:
            prefix_left_barline = line.split('[V:')[0]
        else:
            prefix_left_barline = ''

        # Emit every declared voice, synthesizing rest bars where missing.
        for symbol in part_symbol_list:
            if symbol in line_bar_dict.keys():
                symbol_bartext = line_bar_dict[symbol]
            else:
                if symbol in z_symbol_list:
                    symbol_bartext = prefix_left_barline + 'z' + str(ref_dur) + right_barline
                elif symbol in x_symbol_list:
                    symbol_bartext = prefix_left_barline + 'x' + str(ref_dur) + right_barline
            unreduced_line += '[' + symbol + ']' + symbol_bartext

        unreduced_tunebody_lines.append(unreduced_line + '\n')

    unreduced_lines = metadata_lines + unreduced_tunebody_lines

    return unreduced_lines
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def inference_patch(period, composer, instrumentation):
    """Generate one piece of ABC notation conditioned on a textual prompt.

    Args:
        period: stylistic period, e.g. 'Classical' (first '%' prompt line).
        composer: composer name, e.g. 'Beethoven, Ludwig van'.
        instrumentation: ensemble type, e.g. 'Keyboard'.

    Returns:
        The rest-unreduced ABC text (str) with an 'X:1' header prepended.
        Retries generation from scratch until one attempt succeeds, so this
        only returns on success.

    Side effects: streams the generated characters to stdout.
    """
    prompt_lines = [
        '%' + period + '\n',
        '%' + composer + '\n',
        '%' + instrumentation + '\n']

    while True:

        failure_flag = False

        # Sentinel patch marking the beginning of a sequence.
        bos_patch = [patchilizer.bos_token_id] * (PATCH_SIZE - 1) + [patchilizer.eos_token_id]

        start_time = time.time()

        prompt_patches = patchilizer.patchilize_metadata(prompt_lines)
        byte_list = list(''.join(prompt_lines))
        context_tunebody_byte_list = []
        metadata_byte_list = []

        print(''.join(byte_list), end='')

        # Pad each prompt patch to PATCH_SIZE and prefix the BOS patch.
        prompt_patches = [[ord(c) for c in patch] + [patchilizer.special_token_id] * (PATCH_SIZE - len(patch)) for patch
                          in prompt_patches]
        prompt_patches.insert(0, bos_patch)

        input_patches = torch.tensor(prompt_patches, device=device).reshape(1, -1)

        end_flag = False
        cut_index = None

        tunebody_flag = False

        with torch.inference_mode():

            while True:
                # NOTE(review): autocast is hard-wired to CUDA even though
                # the module may have selected 'mps' or 'cpu' above —
                # confirm behaviour on non-CUDA devices.
                with torch.autocast(device_type='cuda', dtype=torch.float16):
                    predicted_patch = model.generate(input_patches.unsqueeze(0),
                                                     top_k=TOP_K,
                                                     top_p=TOP_P,
                                                     temperature=TEMPERATURE)
                # On first entry into the tunebody, force it to start with
                # '[r:0/' by re-generating with that prefix appended.
                if not tunebody_flag and patchilizer.decode([predicted_patch]).startswith('[r:'):
                    tunebody_flag = True
                    r0_patch = torch.tensor([ord(c) for c in '[r:0/']).unsqueeze(0).to(device)
                    temp_input_patches = torch.concat([input_patches, r0_patch], axis=-1)
                    predicted_patch = model.generate(temp_input_patches.unsqueeze(0),
                                                     top_k=TOP_K,
                                                     top_p=TOP_P,
                                                     temperature=TEMPERATURE)
                    predicted_patch = [ord(c) for c in '[r:0/'] + predicted_patch
                # BOS immediately followed by EOS signals end of piece.
                if predicted_patch[0] == patchilizer.bos_token_id and predicted_patch[1] == patchilizer.eos_token_id:
                    end_flag = True
                    break
                next_patch = patchilizer.decode([predicted_patch])

                for char in next_patch:
                    byte_list.append(char)
                    if tunebody_flag:
                        context_tunebody_byte_list.append(char)
                    else:
                        metadata_byte_list.append(char)
                    print(char, end='')

                # Blank out everything after the first EOS inside the patch.
                patch_end_flag = False
                for j in range(len(predicted_patch)):
                    if patch_end_flag:
                        predicted_patch[j] = patchilizer.special_token_id
                    if predicted_patch[j] == patchilizer.eos_token_id:
                        patch_end_flag = True

                predicted_patch = torch.tensor([predicted_patch], device=device)  # (1, PATCH_SIZE)
                input_patches = torch.cat([input_patches, predicted_patch], dim=1)  # (1, PATCH_SIZE * patch_len)

                # Abandon runaway generations (too long or too slow).
                if len(byte_list) > 102400:
                    failure_flag = True
                    break
                if time.time() - start_time > 10 * 60:
                    failure_flag = True
                    break

                # Context window full: keep the metadata plus the second
                # half of the tunebody generated so far, and keep streaming.
                if input_patches.shape[1] >= PATCH_LENGTH * PATCH_SIZE and not end_flag:
                    print('Stream generating...')

                    metadata = ''.join(metadata_byte_list)
                    context_tunebody = ''.join(context_tunebody_byte_list)

                    if '\n' not in context_tunebody:
                        break  # Generated content is all metadata, abandon

                    # BUG FIX: this list was previously bound as
                    # 'context_tunebody_liness' but read back further down
                    # as 'context_tunebody_lines', raising NameError the
                    # first time stream generation kicked in.
                    context_tunebody_lines = context_tunebody.split('\n')
                    # Re-append the newlines removed by split(); keep the
                    # final line unterminated if generation stopped mid-line.
                    if not context_tunebody.endswith('\n'):
                        context_tunebody_lines = [context_tunebody_lines[i] + '\n' for i in range(len(context_tunebody_lines) - 1)] + [context_tunebody_lines[-1]]
                    else:
                        context_tunebody_lines = [context_tunebody_lines[i] + '\n' for i in range(len(context_tunebody_lines))]

                    cut_index = len(context_tunebody_lines) // 2
                    abc_code_slice = metadata + ''.join(context_tunebody_lines[-cut_index:])

                    input_patches = patchilizer.encode_generate(abc_code_slice)

                    input_patches = [item for sublist in input_patches for item in sublist]
                    input_patches = torch.tensor([input_patches], device=device)
                    input_patches = input_patches.reshape(1, -1)

                    context_tunebody_byte_list = list(''.join(context_tunebody_lines[-cut_index:]))

        if not failure_flag:
            abc_text = ''.join(byte_list)

            # unreduce
            abc_lines = abc_text.split('\n')
            abc_lines = list(filter(None, abc_lines))
            abc_lines = [line + '\n' for line in abc_lines]
            try:
                unreduced_abc_lines = rest_unreduce(abc_lines)
            except Exception:
                # Unreduction failed: retry the whole generation.
                failure_flag = True
            else:
                # Drop '%'-comment prompt lines (keeping '%%' directives)
                # and prepend the X:1 reference header required by ABC.
                unreduced_abc_lines = [line for line in unreduced_abc_lines if not (line.startswith('%') and not line.startswith('%%'))]
                unreduced_abc_lines = ['X:1\n'] + unreduced_abc_lines
                unreduced_abc_text = ''.join(unreduced_abc_lines)
                return unreduced_abc_text
|
| 311 |
+
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
if __name__ == '__main__':
    # Smoke test: generate one Classical keyboard piece in Beethoven's style.
    inference_patch('Classical', 'Beethoven, Ludwig van', 'Keyboard')
|
notagen.png
ADDED
|
Git LFS Details
|
prompts.txt
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Baroque_Bach, Johann Sebastian_Chamber
|
| 2 |
+
Baroque_Bach, Johann Sebastian_Choral
|
| 3 |
+
Baroque_Bach, Johann Sebastian_Keyboard
|
| 4 |
+
Baroque_Bach, Johann Sebastian_Orchestral
|
| 5 |
+
Baroque_Bach, Johann Sebastian_Vocal-Orchestral
|
| 6 |
+
Baroque_Corelli, Arcangelo_Chamber
|
| 7 |
+
Baroque_Corelli, Arcangelo_Orchestral
|
| 8 |
+
Baroque_Handel, George Frideric_Chamber
|
| 9 |
+
Baroque_Handel, George Frideric_Keyboard
|
| 10 |
+
Baroque_Handel, George Frideric_Orchestral
|
| 11 |
+
Baroque_Handel, George Frideric_Vocal-Orchestral
|
| 12 |
+
Baroque_Scarlatti, Domenico_Keyboard
|
| 13 |
+
Baroque_Vivaldi, Antonio_Chamber
|
| 14 |
+
Baroque_Vivaldi, Antonio_Orchestral
|
| 15 |
+
Baroque_Vivaldi, Antonio_Vocal-Orchestral
|
| 16 |
+
Classical_Beethoven, Ludwig van_Art Song
|
| 17 |
+
Classical_Beethoven, Ludwig van_Chamber
|
| 18 |
+
Classical_Beethoven, Ludwig van_Keyboard
|
| 19 |
+
Classical_Beethoven, Ludwig van_Orchestral
|
| 20 |
+
Classical_Haydn, Joseph_Chamber
|
| 21 |
+
Classical_Haydn, Joseph_Keyboard
|
| 22 |
+
Classical_Haydn, Joseph_Orchestral
|
| 23 |
+
Classical_Haydn, Joseph_Vocal-Orchestral
|
| 24 |
+
Classical_Mozart, Wolfgang Amadeus_Chamber
|
| 25 |
+
Classical_Mozart, Wolfgang Amadeus_Choral
|
| 26 |
+
Classical_Mozart, Wolfgang Amadeus_Keyboard
|
| 27 |
+
Classical_Mozart, Wolfgang Amadeus_Orchestral
|
| 28 |
+
Classical_Mozart, Wolfgang Amadeus_Vocal-Orchestral
|
| 29 |
+
Classical_Paradis, Maria Theresia von_Art Song
|
| 30 |
+
Classical_Reichardt, Louise_Art Song
|
| 31 |
+
Classical_Saint-Georges, Joseph Bologne_Chamber
|
| 32 |
+
Classical_Schroter, Corona_Art Song
|
| 33 |
+
Romantic_Bartok, Bela_Keyboard
|
| 34 |
+
Romantic_Berlioz, Hector_Choral
|
| 35 |
+
Romantic_Bizet, Georges_Art Song
|
| 36 |
+
Romantic_Boulanger, Lili_Art Song
|
| 37 |
+
Romantic_Boulton, Harold_Art Song
|
| 38 |
+
Romantic_Brahms, Johannes_Art Song
|
| 39 |
+
Romantic_Brahms, Johannes_Chamber
|
| 40 |
+
Romantic_Brahms, Johannes_Choral
|
| 41 |
+
Romantic_Brahms, Johannes_Keyboard
|
| 42 |
+
Romantic_Brahms, Johannes_Orchestral
|
| 43 |
+
Romantic_Burgmuller, Friedrich_Keyboard
|
| 44 |
+
Romantic_Butterworth, George_Art Song
|
| 45 |
+
Romantic_Chaminade, Cecile_Art Song
|
| 46 |
+
Romantic_Chausson, Ernest_Art Song
|
| 47 |
+
Romantic_Chopin, Frederic_Art Song
|
| 48 |
+
Romantic_Chopin, Frederic_Keyboard
|
| 49 |
+
Romantic_Cornelius, Peter_Art Song
|
| 50 |
+
Romantic_Debussy, Claude_Art Song
|
| 51 |
+
Romantic_Debussy, Claude_Keyboard
|
| 52 |
+
Romantic_Dvorak, Antonin_Chamber
|
| 53 |
+
Romantic_Dvorak, Antonin_Choral
|
| 54 |
+
Romantic_Dvorak, Antonin_Keyboard
|
| 55 |
+
Romantic_Dvorak, Antonin_Orchestral
|
| 56 |
+
Romantic_Faisst, Clara_Art Song
|
| 57 |
+
Romantic_Faure, Gabriel_Art Song
|
| 58 |
+
Romantic_Faure, Gabriel_Chamber
|
| 59 |
+
Romantic_Faure, Gabriel_Keyboard
|
| 60 |
+
Romantic_Franz, Robert_Art Song
|
| 61 |
+
Romantic_Gonzaga, Chiquinha_Art Song
|
| 62 |
+
Romantic_Grandval, Clemence de_Art Song
|
| 63 |
+
Romantic_Grieg, Edvard_Keyboard
|
| 64 |
+
Romantic_Grieg, Edvard_Orchestral
|
| 65 |
+
Romantic_Hensel, Fanny_Art Song
|
| 66 |
+
Romantic_Holmes, Augusta Mary Anne_Art Song
|
| 67 |
+
Romantic_Jaell, Marie_Art Song
|
| 68 |
+
Romantic_Kinkel, Johanna_Art Song
|
| 69 |
+
Romantic_Kralik, Mathilde_Art Song
|
| 70 |
+
Romantic_Lang, Josephine_Art Song
|
| 71 |
+
Romantic_Lehmann, Liza_Art Song
|
| 72 |
+
Romantic_Liszt, Franz_Keyboard
|
| 73 |
+
Romantic_Mayer, Emilie_Chamber
|
| 74 |
+
Romantic_Medtner, Nikolay_Keyboard
|
| 75 |
+
Romantic_Mendelssohn, Felix_Art Song
|
| 76 |
+
Romantic_Mendelssohn, Felix_Chamber
|
| 77 |
+
Romantic_Mendelssohn, Felix_Choral
|
| 78 |
+
Romantic_Mendelssohn, Felix_Keyboard
|
| 79 |
+
Romantic_Mendelssohn, Felix_Orchestral
|
| 80 |
+
Romantic_Munktell, Helena_Art Song
|
| 81 |
+
Romantic_Parratt, Walter_Choral
|
| 82 |
+
Romantic_Prokofiev, Sergey_Keyboard
|
| 83 |
+
Romantic_Rachmaninoff, Sergei_Choral
|
| 84 |
+
Romantic_Rachmaninoff, Sergei_Keyboard
|
| 85 |
+
Romantic_Ravel, Maurice_Art Song
|
| 86 |
+
Romantic_Ravel, Maurice_Chamber
|
| 87 |
+
Romantic_Ravel, Maurice_Keyboard
|
| 88 |
+
Romantic_Saint-Saens, Camille_Chamber
|
| 89 |
+
Romantic_Saint-Saens, Camille_Keyboard
|
| 90 |
+
Romantic_Saint-Saens, Camille_Orchestral
|
| 91 |
+
Romantic_Satie, Erik_Art Song
|
| 92 |
+
Romantic_Satie, Erik_Keyboard
|
| 93 |
+
Romantic_Schubert, Franz_Art Song
|
| 94 |
+
Romantic_Schubert, Franz_Chamber
|
| 95 |
+
Romantic_Schubert, Franz_Choral
|
| 96 |
+
Romantic_Schubert, Franz_Keyboard
|
| 97 |
+
Romantic_Schumann, Clara_Art Song
|
| 98 |
+
Romantic_Schumann, Robert_Art Song
|
| 99 |
+
Romantic_Schumann, Robert_Chamber
|
| 100 |
+
Romantic_Schumann, Robert_Choral
|
| 101 |
+
Romantic_Schumann, Robert_Keyboard
|
| 102 |
+
Romantic_Scriabin, Aleksandr_Keyboard
|
| 103 |
+
Romantic_Shostakovich, Dmitry_Chamber
|
| 104 |
+
Romantic_Shostakovich, Dmitry_Keyboard
|
| 105 |
+
Romantic_Sibelius, Jean_Keyboard
|
| 106 |
+
Romantic_Smetana, Bedrich_Keyboard
|
| 107 |
+
Romantic_Tchaikovsky, Pyotr_Keyboard
|
| 108 |
+
Romantic_Tchaikovsky, Pyotr_Orchestral
|
| 109 |
+
Romantic_Viardot, Pauline_Art Song
|
| 110 |
+
Romantic_Warlock, Peter_Art Song
|
| 111 |
+
Romantic_Wolf, Hugo_Art Song
|
| 112 |
+
Romantic_Zumsteeg, Emilie_Art Song
|
requirements (6).txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
transformers==4.40.0
|
| 2 |
+
numpy==1.26.4
|
| 3 |
+
wandb==0.17.2
|
| 4 |
+
abctoolkit==0.0.6
|
| 5 |
+
samplings==0.1.7
|
| 6 |
+
pyparsing==3.2.1
|
| 7 |
+
gradio==5.17.1
|
statistics.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Folders holding pre-extracted CLaMP2 feature vectors (.npy files);
# fill these in before running.
gt_feature_folder = ''
output_feature_folder = ''

import os
import json
import random
import re
import numpy as np
from config import *
|
| 10 |
+
|
| 11 |
+
def load_npy_files(folder_path_list):
    """Load the leading row of every '.npy' file in *folder_path_list*.

    Paths without a '.npy' suffix are silently skipped.  Each loaded array
    is indexed with [0], i.e. only its first row/vector is kept.

    Returns a list of numpy arrays, in input order.
    """
    return [np.load(path)[0] for path in folder_path_list if path.endswith('.npy')]
|
| 22 |
+
|
| 23 |
+
def average_npy(npy_list):
    """Element-wise mean of a list of equally-shaped numpy arrays."""
    stacked = np.asarray(npy_list)
    return stacked.mean(axis=0)
|
| 28 |
+
|
| 29 |
+
def cosine_similarity(vec1, vec2):
    """Cosine similarity between two 1-D numpy arrays.

    NOTE(review): an all-zero vector makes the denominator 0 and yields
    nan/inf with a numpy warning — same as the original implementation.
    """
    numerator = np.dot(vec1, vec2)
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return numerator / denominator
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def test_generated_results_similarity():
    """Print the average "CLaMP 2 score": mean cosine similarity between each
    generated feature and the averaged ground-truth feature."""
    gt_paths = [
        os.path.join(gt_feature_folder, fname)
        for fname in os.listdir(gt_feature_folder)
    ]
    gt_avg_feature = average_npy(load_npy_files(gt_paths))

    scores = []
    for fname in os.listdir(output_feature_folder):
        feature = np.load(os.path.join(output_feature_folder, fname))[0]
        scores.append(cosine_similarity(gt_avg_feature, feature))
    avg_clampscore = sum(scores) / len(scores)

    print('average clamp 2 score:', avg_clampscore)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
if __name__ == '__main__':
    # Entry point: compare generated features against the ground-truth set.
    test_generated_results_similarity()
|
| 68 |
+
|
train-gen (1).py
ADDED
|
@@ -0,0 +1,325 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import gc
|
| 3 |
+
import time
|
| 4 |
+
import math
|
| 5 |
+
import json
|
| 6 |
+
import wandb
|
| 7 |
+
import torch
|
| 8 |
+
import random
|
| 9 |
+
import numpy as np
|
| 10 |
+
from utils import *
|
| 11 |
+
from config import *
|
| 12 |
+
from tqdm import tqdm
|
| 13 |
+
from copy import deepcopy
|
| 14 |
+
from torch.cuda.amp import autocast, GradScaler
|
| 15 |
+
from torch.utils.data import Dataset, DataLoader
|
| 16 |
+
from transformers import GPT2Config, LlamaConfig, get_scheduler, get_constant_schedule_with_warmup
|
| 17 |
+
import torch.distributed as dist
|
| 18 |
+
from torch.nn.parallel import DistributedDataParallel as DDP
|
| 19 |
+
from torch.utils.data.distributed import DistributedSampler
|
| 20 |
+
|
| 21 |
+
# Set up distributed training (env vars are provided by torchrun / launch utility)
world_size = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1
global_rank = int(os.environ['RANK']) if 'RANK' in os.environ else 0
local_rank = int(os.environ['LOCAL_RANK']) if 'LOCAL_RANK' in os.environ else 0

if world_size > 1:
    torch.cuda.set_device(local_rank)
    device = torch.device("cuda", local_rank)
    dist.init_process_group(backend='nccl') if world_size > 1 else None
else:
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Set random seed, offset by rank so each process draws different augmentations
seed = 0 + global_rank
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

batch_size = BATCH_SIZE

# Byte/patch tokenizer for ABC notation (defined in utils).
patchilizer = Patchilizer()

# Patch-level model config.  vocab_size=1 — presumably the patch-level model
# consumes embedded patches rather than token ids; confirm in utils.
patch_config = GPT2Config(num_hidden_layers=PATCH_NUM_LAYERS,
                          max_length=PATCH_LENGTH,
                          max_position_embeddings=PATCH_LENGTH,
                          n_embd=HIDDEN_SIZE,
                          num_attention_heads=HIDDEN_SIZE//64,
                          vocab_size=1)
# Character-level decoder config; vocab_size=128 (byte values 0-127).
char_config = GPT2Config(num_hidden_layers=CHAR_NUM_LAYERS,
                         max_length=PATCH_SIZE+1,
                         max_position_embeddings=PATCH_SIZE+1,
                         hidden_size=HIDDEN_SIZE,
                         num_attention_heads=HIDDEN_SIZE//64,
                         vocab_size=128)

model = NotaGenLMHeadModel(encoder_config=patch_config, decoder_config=char_config)

model = model.to(device)

# print parameter number
print("Parameter Number: "+str(sum(p.numel() for p in model.parameters() if p.requires_grad)))

if world_size > 1:
    model = DDP(model, device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True)

scaler = GradScaler()  # AMP gradient scaler for mixed-precision training
is_autocast = True
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def clear_unused_tensors():
    """Best-effort release of stray CUDA tensors between training iterations.

    Detaches and drops any GC-tracked CUDA tensor that is neither a model
    parameter nor an optimizer state tensor, then empties the CUDA cache.
    GC is paused during the scan so the object list stays stable.

    BUGFIX: the original code used ``weakref`` without importing it anywhere
    in the file, so every call raised NameError which the bare ``except:``
    silently swallowed — the cleanup never actually ran.
    """
    import weakref  # local import: not present in the module's import block

    gc.disable()  # Temporarily disable garbage collection while scanning
    try:
        # Tensor ids owned by the model (unwrap DDP if present).
        if hasattr(model, "module"):
            model_tensors = {id(p) for p in model.module.parameters()}
        else:
            model_tensors = {id(p) for p in model.parameters()}

        # Tensor ids owned by the optimizer state (e.g. AdamW moments).
        optimizer_tensors = {
            id(state)
            for state_dict in optimizer.state.values()
            for state in state_dict.values()
            if isinstance(state, torch.Tensor)  # Ensure only tensors are considered
        }

        # All CUDA tensors currently tracked by the garbage collector.
        tensors = [obj for obj in gc.get_objects() if isinstance(obj, torch.Tensor) and obj.is_cuda]

        # Weak references so the scan itself does not keep tensors alive.
        tensor_refs = [weakref.ref(tensor) for tensor in tensors]

        for tensor_ref in tensor_refs:
            tensor = tensor_ref()  # Dereference the weak reference
            if tensor is not None and id(tensor) not in model_tensors and id(tensor) not in optimizer_tensors:
                tensor.detach_()  # Detach from computation graph
                del tensor  # Delete the tensor reference
    except Exception:
        # Cleanup is opportunistic; never let it break the training loop.
        pass
    finally:
        gc.enable()  # Re-enable garbage collection
        gc.collect()  # Force a garbage collection
        torch.cuda.empty_cache()  # Clear the CUDA cache
|
| 110 |
+
|
| 111 |
+
def collate_batch(input_batches):
    """Pad a list of (patches, masks) samples into batch tensors on `device`."""
    patch_seqs, mask_seqs = zip(*input_batches)
    pad = torch.nn.utils.rnn.pad_sequence
    padded_patches = pad(patch_seqs, batch_first=True, padding_value=0)
    padded_masks = pad(mask_seqs, batch_first=True, padding_value=0)
    return padded_patches.to(device), padded_masks.to(device)
|
| 118 |
+
|
| 119 |
+
def split_into_minibatches(input_patches, input_masks, minibatch_size):
    """Chunk a batch into consecutive (patches, masks) minibatches of the given size."""
    starts = range(0, len(input_patches), minibatch_size)
    return [
        (input_patches[s:s + minibatch_size], input_masks[s:s + minibatch_size])
        for s in starts
    ]
|
| 127 |
+
|
| 128 |
+
class NotaGenDataset(Dataset):
    """Dataset of key-transposed ABC notation files for NotaGen training.

    Each index entry provides the path of an original ABC file; __getitem__
    picks a random target key and loads the pre-transposed variant stored at
    <folder>/<key>/<name>_<key>.abc.  The transposed files are assumed to have
    been generated beforehand by the preprocessing step — TODO confirm.
    """

    def __init__(self, filenames):
        # List of dicts; each must contain a 'path' entry.
        self.filenames = filenames

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, idx):
        filepath = self.filenames[idx]['path']

        # Uniformly sample one of 15 notated keys (key augmentation).
        key = random.choice(['C#', 'F#', 'B', 'E', 'A', 'D', 'G', 'C', 'F', 'Bb', 'Eb', 'Ab', 'Db', 'Gb', 'Cb'])

        # The transposed variant lives in a per-key subfolder next to the original.
        folder = os.path.dirname(filepath)
        name = os.path.split(filepath)[-1]
        des_filepath = os.path.join(folder, key, name + '_' + key + '.abc')

        with open(des_filepath, 'r', encoding='utf-8') as f:
            abc_text = f.read()

        # Patch-encode the ABC text; mask is all ones (padding added in collate_batch).
        file_bytes = patchilizer.encode_train(abc_text)
        file_masks = [1] * len(file_bytes)

        file_bytes = torch.tensor(file_bytes, dtype=torch.long)
        file_masks = torch.tensor(file_masks, dtype=torch.long)

        return file_bytes, file_masks
|
| 155 |
+
|
| 156 |
+
def process_one_batch(batch):
    """Forward one (patches, masks) minibatch and return the loss.

    In multi-GPU runs the per-rank losses are reduced onto rank 0 (summed),
    averaged over `world_size`, and broadcast back so every rank reports the
    same value.
    """
    input_patches, input_masks = batch
    loss = model(input_patches, input_masks).loss

    # Reduce the loss on GPU 0, then share the averaged value with all ranks.
    if world_size > 1:
        loss = loss.unsqueeze(0)
        dist.reduce(loss, dst=0)
        loss = loss / world_size
        dist.broadcast(loss, src=0)

    return loss
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
# do one epoch for training
|
| 171 |
+
def train_epoch(epoch):
    """Run one training epoch over `train_set`; returns the mean train loss.

    Uses AMP autocast + GradScaler.  Each DataLoader batch is split into
    ACCUMULATION_STEPS minibatches whose losses are scaled by
    1/ACCUMULATION_STEPS before backward, then a single optimizer step is
    taken per accumulated batch.
    """
    tqdm_train_set = tqdm(train_set)
    total_train_loss = 0
    iter_idx = 1
    model.train()
    # Global step offset so wandb logging is continuous across epochs.
    train_steps = (epoch-1)*len(train_set)

    for batch in tqdm_train_set:
        minibatches = split_into_minibatches(batch[0], batch[1], BATCH_SIZE//ACCUMULATION_STEPS)
        for minibatch in minibatches:
            with autocast():
                loss = process_one_batch(minibatch) / ACCUMULATION_STEPS
            scaler.scale(loss).backward()
            total_train_loss += loss.item()
        scaler.step(optimizer)
        scaler.update()

        lr_scheduler.step()
        model.zero_grad(set_to_none=True)
        tqdm_train_set.set_postfix({str(global_rank)+'_train_loss': total_train_loss / iter_idx})
        train_steps += 1

        # Log the training loss to wandb (rank 0 only)
        if global_rank==0 and WANDB_LOGGING:
            wandb.log({"train_loss": total_train_loss / iter_idx}, step=train_steps)

        iter_idx += 1
        # Periodically sweep stray CUDA tensors to limit memory growth.
        if iter_idx % 1000 == 0:
            clear_unused_tensors()

    return total_train_loss / (iter_idx-1)
|
| 202 |
+
|
| 203 |
+
# do one epoch for eval
|
| 204 |
+
def eval_epoch():
    """Run one evaluation pass over `eval_set`; returns the mean eval loss.

    Loss is computed under torch.no_grad with the same minibatch split and
    1/ACCUMULATION_STEPS scaling as training, so train and eval losses are
    directly comparable.  (Removed the unused local `total_eval_bpb`.)
    """
    tqdm_eval_set = tqdm(eval_set)
    total_eval_loss = 0
    iter_idx = 1
    model.eval()

    # Evaluate data for one epoch
    for batch in tqdm_eval_set:
        minibatches = split_into_minibatches(batch[0], batch[1], BATCH_SIZE//ACCUMULATION_STEPS)
        for minibatch in minibatches:
            with torch.no_grad():
                loss = process_one_batch(minibatch) / ACCUMULATION_STEPS
            total_eval_loss += loss.item()
        tqdm_eval_set.set_postfix({str(global_rank)+'_eval_loss': total_eval_loss / iter_idx})
        iter_idx += 1
    return total_eval_loss / (iter_idx-1)
|
| 221 |
+
|
| 222 |
+
# train and eval
|
| 223 |
+
if __name__ == "__main__":

    # Initialize wandb (rank 0 only)
    if WANDB_LOGGING and global_rank==0:
        wandb.login(key=WANDB_KEY)
        wandb.init(project="notagen",
                   name=WANDB_NAME)

    # Load data: one JSON record per line (JSONL index files).
    with open(DATA_TRAIN_INDEX_PATH, "r", encoding="utf-8") as f:
        print("Loading Data...")
        train_files = [json.loads(line) for line in f]

    with open(DATA_EVAL_INDEX_PATH, "r", encoding="utf-8") as f:
        print("Loading Data...")
        eval_files = [json.loads(line) for line in f]

    # Truncate each split to a whole number of batches.
    train_batch_nums = int(len(train_files) / batch_size)
    eval_batch_nums = int(len(eval_files) / batch_size)

    random.shuffle(train_files)
    random.shuffle(eval_files)

    train_files = train_files[:train_batch_nums*batch_size]
    eval_files = eval_files[:eval_batch_nums*batch_size]

    train_set = NotaGenDataset(train_files)
    eval_set = NotaGenDataset(eval_files)

    train_sampler = DistributedSampler(train_set, num_replicas=world_size, rank=local_rank)
    eval_sampler = DistributedSampler(eval_set, num_replicas=world_size, rank=local_rank)

    # Samplers handle shuffling; shuffle kwarg stays False when a sampler is set.
    # (Fixed copy-paste: the eval loader previously checked train_sampler.)
    train_set = DataLoader(train_set, batch_size=batch_size, collate_fn=collate_batch, sampler=train_sampler, shuffle=(train_sampler is None))
    eval_set = DataLoader(eval_set, batch_size=batch_size, collate_fn=collate_batch, sampler=eval_sampler, shuffle=(eval_sampler is None))

    model = model.to(device)
    # BUGFIX: create the optimizer BEFORE the LR scheduler.  The original code
    # built the scheduler around the module-level optimizer and then rebound
    # `optimizer` to a fresh AdamW, so lr_scheduler.step() warmed up an
    # optimizer that was never used for training.
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    lr_scheduler = get_constant_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=1000)

    if LOAD_FROM_CHECKPOINT and os.path.exists(WEIGHTS_PATH):
        # Load checkpoint to CPU first to avoid a GPU memory spike.
        checkpoint = torch.load(WEIGHTS_PATH, map_location='cpu')

        if torch.cuda.device_count() > 1:
            # DDP-wrapped model: load the state dict into model.module.
            cpu_model = deepcopy(model.module)
            cpu_model.load_state_dict(checkpoint['model'])
            model.module.load_state_dict(cpu_model.state_dict())
        else:
            # Load into a CPU clone of the model, then copy back.
            cpu_model = deepcopy(model)
            cpu_model.load_state_dict(checkpoint['model'])
            model.load_state_dict(cpu_model.state_dict())
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_sched'])
        pre_epoch = checkpoint['epoch']
        best_epoch = checkpoint['best_epoch']
        min_eval_loss = checkpoint['min_eval_loss']
        print("Successfully Loaded Checkpoint from Epoch %d" % pre_epoch)
        checkpoint = None  # release checkpoint memory

    else:
        pre_epoch = 0
        best_epoch = 0
        min_eval_loss = 100

    for epoch in range(1+pre_epoch, NUM_EPOCHS+1):
        train_sampler.set_epoch(epoch)
        eval_sampler.set_epoch(epoch)
        print('-' * 21 + "Epoch " + str(epoch) + '-' * 21)
        train_loss = train_epoch(epoch)
        eval_loss = eval_epoch()
        if global_rank==0:
            with open(LOGS_PATH,'a') as f:
                f.write("Epoch " + str(epoch) + "\ntrain_loss: " + str(train_loss) + "\neval_loss: " +str(eval_loss) + "\ntime: " + time.asctime(time.localtime(time.time())) + "\n\n")
            # Keep only the best (lowest eval loss) checkpoint.
            if eval_loss < min_eval_loss:
                best_epoch = epoch
                min_eval_loss = eval_loss
                checkpoint = {
                    'model': model.module.state_dict() if hasattr(model, "module") else model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_sched': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'best_epoch': best_epoch,
                    'min_eval_loss': min_eval_loss
                }
                torch.save(checkpoint, WEIGHTS_PATH)

        if world_size > 1:
            dist.barrier()

    if global_rank==0:
        print("Best Eval Epoch : "+str(best_epoch))
        print("Min Eval Loss : "+str(min_eval_loss))
|
| 325 |
+
|
train-gen.py
ADDED
|
@@ -0,0 +1,374 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import gc
|
| 3 |
+
import time
|
| 4 |
+
import math
|
| 5 |
+
import json
|
| 6 |
+
import wandb
|
| 7 |
+
import torch
|
| 8 |
+
import random
|
| 9 |
+
import numpy as np
|
| 10 |
+
from abctoolkit.transpose import Key2index, Key2Mode
|
| 11 |
+
from utils import *
|
| 12 |
+
from config import *
|
| 13 |
+
from tqdm import tqdm
|
| 14 |
+
from copy import deepcopy
|
| 15 |
+
from torch.cuda.amp import autocast, GradScaler
|
| 16 |
+
from torch.utils.data import Dataset, DataLoader
|
| 17 |
+
from transformers import GPT2Config, LlamaConfig, get_scheduler, get_constant_schedule_with_warmup
|
| 18 |
+
import torch.distributed as dist
|
| 19 |
+
from torch.nn.parallel import DistributedDataParallel as DDP
|
| 20 |
+
from torch.utils.data.distributed import DistributedSampler
|
| 21 |
+
|
| 22 |
+
# Reverse lookup tables built from abctoolkit's Key2index / Key2Mode.
# Indices 1 and 11 are excluded from Index2Key — presumably because those
# pitch classes map to enharmonic pairs (Db/C#, B/Cb) that are resolved
# explicitly in NotaGenDataset.__getitem__; confirm against abctoolkit.
Index2Key = {index: key for key, index in Key2index.items() if index not in [1, 11]}
Mode2Key = {mode: key for key, mode_list in Key2Mode.items() for mode in mode_list }

# Set up distributed training (env vars are provided by torchrun / launch utility)
world_size = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1
global_rank = int(os.environ['RANK']) if 'RANK' in os.environ else 0
local_rank = int(os.environ['LOCAL_RANK']) if 'LOCAL_RANK' in os.environ else 0

if world_size > 1:
    torch.cuda.set_device(local_rank)
    device = torch.device("cuda", local_rank)
    dist.init_process_group(backend='nccl') if world_size > 1 else None
else:
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Set random seed, offset by rank so each process draws different augmentations
seed = 0 + global_rank
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

batch_size = BATCH_SIZE

# Byte/patch tokenizer for ABC notation (defined in utils).
patchilizer = Patchilizer()

# Patch-level model config.  vocab_size=1 — presumably the patch-level model
# consumes embedded patches rather than token ids; confirm in utils.
patch_config = GPT2Config(num_hidden_layers=PATCH_NUM_LAYERS,
                          max_length=PATCH_LENGTH,
                          max_position_embeddings=PATCH_LENGTH,
                          n_embd=HIDDEN_SIZE,
                          num_attention_heads=HIDDEN_SIZE//64,
                          vocab_size=1)
# Character-level decoder config; vocab_size=128 (byte values 0-127).
char_config = GPT2Config(num_hidden_layers=CHAR_NUM_LAYERS,
                         max_length=PATCH_SIZE+1,
                         max_position_embeddings=PATCH_SIZE+1,
                         hidden_size=HIDDEN_SIZE,
                         num_attention_heads=HIDDEN_SIZE//64,
                         vocab_size=128)

model = NotaGenLMHeadModel(encoder_config=patch_config, decoder_config=char_config)

model = model.to(device)

# print parameter number
print("Parameter Number: "+str(sum(p.numel() for p in model.parameters() if p.requires_grad)))

if world_size > 1:
    model = DDP(model, device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True)

scaler = GradScaler()  # AMP gradient scaler for mixed-precision training
is_autocast = True
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def clear_unused_tensors():
    """Best-effort release of stray CUDA tensors between training iterations.

    Detaches and drops any GC-tracked CUDA tensor that is neither a model
    parameter nor an optimizer state tensor, then empties the CUDA cache.
    GC is paused during the scan so the object list stays stable.

    BUGFIX: the original code used ``weakref`` without importing it anywhere
    in the file, so every call raised NameError which the bare ``except:``
    silently swallowed — the cleanup never actually ran.
    """
    import weakref  # local import: not present in the module's import block

    gc.disable()  # Temporarily disable garbage collection while scanning
    try:
        # Tensor ids owned by the model (unwrap DDP if present).
        if hasattr(model, "module"):
            model_tensors = {id(p) for p in model.module.parameters()}
        else:
            model_tensors = {id(p) for p in model.parameters()}

        # Tensor ids owned by the optimizer state (e.g. AdamW moments).
        optimizer_tensors = {
            id(state)
            for state_dict in optimizer.state.values()
            for state in state_dict.values()
            if isinstance(state, torch.Tensor)  # Ensure only tensors are considered
        }

        # All CUDA tensors currently tracked by the garbage collector.
        tensors = [obj for obj in gc.get_objects() if isinstance(obj, torch.Tensor) and obj.is_cuda]

        # Weak references so the scan itself does not keep tensors alive.
        tensor_refs = [weakref.ref(tensor) for tensor in tensors]

        for tensor_ref in tensor_refs:
            tensor = tensor_ref()  # Dereference the weak reference
            if tensor is not None and id(tensor) not in model_tensors and id(tensor) not in optimizer_tensors:
                tensor.detach_()  # Detach from computation graph
                del tensor  # Delete the tensor reference
    except Exception:
        # Cleanup is opportunistic; never let it break the training loop.
        pass
    finally:
        gc.enable()  # Re-enable garbage collection
        gc.collect()  # Force a garbage collection
        torch.cuda.empty_cache()  # Clear the CUDA cache
|
| 114 |
+
|
| 115 |
+
def collate_batch(input_batches):
    """Pad a list of (patches, masks) samples into batch tensors on `device`."""
    patch_seqs, mask_seqs = zip(*input_batches)
    pad = torch.nn.utils.rnn.pad_sequence
    padded_patches = pad(patch_seqs, batch_first=True, padding_value=0)
    padded_masks = pad(mask_seqs, batch_first=True, padding_value=0)
    return padded_patches.to(device), padded_masks.to(device)
|
| 122 |
+
|
| 123 |
+
def split_into_minibatches(input_patches, input_masks, minibatch_size):
    """Chunk a batch into consecutive (patches, masks) minibatches of the given size."""
    starts = range(0, len(input_patches), minibatch_size)
    return [
        (input_patches[s:s + minibatch_size], input_masks[s:s + minibatch_size])
        for s in starts
    ]
|
| 131 |
+
|
| 132 |
+
class NotaGenDataset(Dataset):
    """Dataset of ABC files with probabilistic key-transposition augmentation.

    __getitem__ samples a target key within +/-3 semitones of the piece's
    original key (weights 1,2,3,4,3,2,1 out of 16, peaking at the original
    key) and loads the pre-transposed ABC file stored at
    <folder>/<key>/<name>_<key>.abc.  The transposed files are assumed to
    exist already — TODO confirm against the preprocessing step.
    """

    def __init__(self, filenames):
        # List of dicts; each must contain 'path' and 'key' entries.
        self.filenames = filenames

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, idx):
        filepath = self.filenames[idx]['path']
        # Canonical key of the piece (mode name -> key via abctoolkit tables).
        ori_key = Mode2Key[self.filenames[idx]['key']]

        # Choose a key to transpose to, according to a probability distribution
        # over offsets -3..+3 semitones relative to the original key.
        ori_key_index = Key2index[ori_key]
        available_index = [(ori_key_index + offset) % 12 for offset in range(-3, 4)]
        index_prob = [1/16, 2/16, 3/16, 4/16, 3/16, 2/16, 1/16]
        # Cumulative distribution boundaries for inverse-CDF sampling.
        index_prob_range = [0] + [sum(index_prob[0 : i + 1]) for i in range(len(index_prob))]
        random_number = random.random()
        for i in range(len(index_prob_range) - 1):
            if index_prob_range[i] <= random_number < index_prob_range[i + 1]:
                des_key_index = available_index[i]
                if des_key_index == 1:
                    # Enharmonic choice: mostly Db, occasionally C#.
                    des_key = 'Db' if random.random() < 0.8 else 'C#'
                elif des_key_index == 11:
                    # Enharmonic choice: mostly B, occasionally Cb.
                    des_key = 'B' if random.random() < 0.8 else 'Cb'
                elif des_key_index == 6:
                    # F# and Gb equally likely.
                    des_key = 'F#' if random.random() < 0.5 else 'Gb'
                else:
                    des_key = Index2Key[des_key_index]

        # The transposed variant lives in a per-key subfolder next to the original.
        folder = os.path.dirname(filepath)
        name = os.path.split(filepath)[-1]
        des_filepath = os.path.join(folder, des_key, name + '_' + des_key + '.abc')

        with open(des_filepath, 'r', encoding='utf-8') as f:
            abc_text = f.read()

        # Patch-encode the ABC text; mask is all ones (padding added in collate_batch).
        file_bytes = patchilizer.encode_train(abc_text)
        file_masks = [1] * len(file_bytes)

        file_bytes = torch.tensor(file_bytes, dtype=torch.long)
        file_masks = torch.tensor(file_masks, dtype=torch.long)

        return file_bytes, file_masks
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def process_one_batch(batch):
    """Forward one (patches, masks) minibatch and return the loss.

    In multi-GPU runs the per-rank losses are reduced onto rank 0 (summed),
    averaged over `world_size`, and broadcast back so every rank reports the
    same value.
    """
    input_patches, input_masks = batch
    loss = model(input_patches, input_masks).loss

    # Reduce the loss on GPU 0, then share the averaged value with all ranks.
    if world_size > 1:
        loss = loss.unsqueeze(0)
        dist.reduce(loss, dst=0)
        loss = loss / world_size
        dist.broadcast(loss, src=0)

    return loss
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
# do one epoch for training
|
| 193 |
+
def train_epoch(epoch):
    """Run one training epoch over `train_set`; returns the mean train loss.

    Uses AMP autocast + GradScaler.  Each DataLoader batch is split into
    ACCUMULATION_STEPS minibatches whose losses are scaled by
    1/ACCUMULATION_STEPS before backward, then a single optimizer step is
    taken per accumulated batch.
    """
    tqdm_train_set = tqdm(train_set)
    total_train_loss = 0
    iter_idx = 1
    model.train()
    # Global step offset so wandb logging is continuous across epochs.
    train_steps = (epoch-1)*len(train_set)

    for batch in tqdm_train_set:
        minibatches = split_into_minibatches(batch[0], batch[1], BATCH_SIZE//ACCUMULATION_STEPS)
        for minibatch in minibatches:
            with autocast():
                loss = process_one_batch(minibatch) / ACCUMULATION_STEPS
            scaler.scale(loss).backward()
            total_train_loss += loss.item()
        scaler.step(optimizer)
        scaler.update()

        lr_scheduler.step()
        model.zero_grad(set_to_none=True)
        tqdm_train_set.set_postfix({str(global_rank)+'_train_loss': total_train_loss / iter_idx})
        train_steps += 1

        # Log the training loss to wandb (rank 0 only)
        if global_rank==0 and WANDB_LOGGING:
            wandb.log({"train_loss": total_train_loss / iter_idx}, step=train_steps)

        iter_idx += 1
        # Periodically sweep stray CUDA tensors to limit memory growth.
        if iter_idx % 1000 == 0:
            clear_unused_tensors()

    return total_train_loss / (iter_idx-1)
|
| 224 |
+
|
| 225 |
+
# do one epoch for eval
|
| 226 |
+
def eval_epoch():
    """Run one evaluation epoch without gradients and return the mean loss.

    Uses module-level globals: `model`, `eval_set` (DataLoader), and the
    same minibatch splitting as training so loss scaling matches.
    """
    tqdm_eval_set = tqdm(eval_set)
    total_eval_loss = 0
    # NOTE(review): total_eval_bpb is never accumulated or returned — looks
    # like a leftover from a bits-per-byte metric; confirm before removing.
    total_eval_bpb = 0
    iter_idx = 1
    model.eval()

    # Evaluate data for one epoch
    for batch in tqdm_eval_set:
        minibatches = split_into_minibatches(batch[0], batch[1], BATCH_SIZE//ACCUMULATION_STEPS)
        for minibatch in minibatches:
            with torch.no_grad():
                # Same ACCUMULATION_STEPS scaling as training so the numbers
                # are directly comparable.
                loss = process_one_batch(minibatch) / ACCUMULATION_STEPS
            total_eval_loss += loss.item()
        tqdm_eval_set.set_postfix({str(global_rank)+'_eval_loss': total_eval_loss / iter_idx})
        iter_idx += 1
    return total_eval_loss / (iter_idx-1)
|
| 243 |
+
|
| 244 |
+
# train and eval
|
| 245 |
+
# train and eval
if __name__ == "__main__":

    # Initialize wandb (main process only, to avoid duplicate runs)
    if WANDB_LOGGING and global_rank==0:
        wandb.login(key=WANDB_KEY)
        wandb.init(project="notagen",
                   name=WANDB_NAME)

    # load data: one JSON record per line (jsonl index files)
    with open(DATA_TRAIN_INDEX_PATH, "r", encoding="utf-8") as f:
        print("Loading Data...")
        train_files = []
        for line in f:
            train_files.append(json.loads(line))

    with open(DATA_EVAL_INDEX_PATH, "r", encoding="utf-8") as f:
        print("Loading Data...")
        eval_files = []
        for line in f:
            eval_files.append(json.loads(line))

    # Fall back to an automatic split when no eval index is provided.
    if len(eval_files) == 0:
        train_files, eval_files = split_data(train_files)

    # Keep only whole batches (drop the trailing partial batch).
    train_batch_nums = int(len(train_files) / batch_size)
    eval_batch_nums = int(len(eval_files) / batch_size)

    random.shuffle(train_files)
    random.shuffle(eval_files)

    train_files = train_files[:train_batch_nums*batch_size]
    eval_files = eval_files[:eval_batch_nums*batch_size]

    train_set = NotaGenDataset(train_files)
    eval_set = NotaGenDataset(eval_files)

    train_sampler = DistributedSampler(train_set, num_replicas=world_size, rank=local_rank)
    eval_sampler = DistributedSampler(eval_set, num_replicas=world_size, rank=local_rank)

    # shuffle must stay False when a sampler is supplied; also fixed the
    # eval loader to key on its own sampler rather than train_sampler.
    train_set = DataLoader(train_set, batch_size=batch_size, collate_fn=collate_batch, sampler=train_sampler, shuffle = (train_sampler is None))
    eval_set = DataLoader(eval_set, batch_size=batch_size, collate_fn=collate_batch, sampler=eval_sampler, shuffle = (eval_sampler is None))

    # FIX: move the model to the device and (re)create the optimizer BEFORE
    # building the LR scheduler. The original code built the scheduler first
    # and then rebound `optimizer` to a fresh AdamW instance, so the warmup
    # scheduler kept stepping the stale optimizer and the live optimizer's
    # learning rate never followed the schedule.
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    lr_scheduler = get_constant_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=1000)

    if not LOAD_FROM_CHECKPOINT:
        if os.path.exists(PRETRAINED_PATH):
            # Load pre-trained checkpoint to CPU
            checkpoint = torch.load(PRETRAINED_PATH, map_location='cpu')

            # Here, model is assumed to be on GPU
            # Load state dict to CPU model first, then move the model to GPU
            if torch.cuda.device_count() > 1:
                # If you have a DataParallel model, you need to load to model.module instead
                cpu_model = deepcopy(model.module)
                cpu_model.load_state_dict(checkpoint['model'])
                model.module.load_state_dict(cpu_model.state_dict())
            else:
                # Load to a CPU clone of the model, then load back
                cpu_model = deepcopy(model)
                cpu_model.load_state_dict(checkpoint['model'])
                model.load_state_dict(cpu_model.state_dict())

            print(f"Successfully Loaded Pretrained Checkpoint at Epoch {checkpoint['epoch']} with Loss {checkpoint['min_eval_loss']}")

            pre_epoch = 0
            best_epoch = 0
            min_eval_loss = 100
        else:
            raise Exception('Pre-trained Checkpoint not found. Please check your pre-trained ckpt path.')

    else:
        if os.path.exists(WEIGHTS_PATH):
            # Load checkpoint to CPU
            checkpoint = torch.load(WEIGHTS_PATH, map_location='cpu')

            # Here, model is assumed to be on GPU
            # Load state dict to CPU model first, then move the model to GPU
            if torch.cuda.device_count() > 1:
                # If you have a DataParallel model, you need to load to model.module instead
                cpu_model = deepcopy(model.module)
                cpu_model.load_state_dict(checkpoint['model'])
                model.module.load_state_dict(cpu_model.state_dict())
            else:
                # Load to a CPU clone of the model, then load back
                cpu_model = deepcopy(model)
                cpu_model.load_state_dict(checkpoint['model'])
                model.load_state_dict(cpu_model.state_dict())
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_sched'])
            pre_epoch = checkpoint['epoch']
            best_epoch = checkpoint['best_epoch']
            min_eval_loss = checkpoint['min_eval_loss']
            print("Successfully Loaded Checkpoint from Epoch %d" % pre_epoch)
            # Free the large checkpoint dict before training starts.
            checkpoint = None

        else:
            raise Exception('Checkpoint not found to continue training. Please check your parameter settings.')


    for epoch in range(1+pre_epoch, NUM_EPOCHS+1):
        # Re-seed the samplers per epoch so shards reshuffle consistently.
        train_sampler.set_epoch(epoch)
        eval_sampler.set_epoch(epoch)
        print('-' * 21 + "Epoch " + str(epoch) + '-' * 21)
        train_loss = train_epoch(epoch)
        eval_loss = eval_epoch()
        if global_rank==0:
            with open(LOGS_PATH,'a') as f:
                f.write("Epoch " + str(epoch) + "\ntrain_loss: " + str(train_loss) + "\neval_loss: " +str(eval_loss) + "\ntime: " + time.asctime(time.localtime(time.time())) + "\n\n")
            # Save only when eval improves (best-checkpoint policy).
            if eval_loss < min_eval_loss:
                best_epoch = epoch
                min_eval_loss = eval_loss
                checkpoint = {
                    'model': model.module.state_dict() if hasattr(model, "module") else model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_sched': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'best_epoch': best_epoch,
                    'min_eval_loss': min_eval_loss
                }
                torch.save(checkpoint, WEIGHTS_PATH)

        if world_size > 1:
            dist.barrier()

    if global_rank==0:
        print("Best Eval Epoch : "+str(best_epoch))
        print("Min Eval Loss : "+str(min_eval_loss))
|
train.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import gc
import time
import math
import json
import wandb
import torch
import random
import numpy as np
from abctoolkit.transpose import Key2index, Key2Mode
from utils import *
from config import *
from data import generate_preference_dict
from tqdm import tqdm
from copy import deepcopy
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Config, get_scheduler, get_constant_schedule_with_warmup


# Run on GPU when available; all models/tensors below are moved here.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Set random seed across python / numpy / torch / cuDNN for reproducibility.
seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Converts raw ABC text into fixed-size byte patches (defined in utils).
patchilizer = Patchilizer()

# Patch-level decoder config; vocab_size=1 because patch embeddings are fed
# in directly rather than looked up from a token vocabulary.
patch_config = GPT2Config(num_hidden_layers=PATCH_NUM_LAYERS,
                          max_length=PATCH_LENGTH,
                          max_position_embeddings=PATCH_LENGTH,
                          n_embd=HIDDEN_SIZE,
                          num_attention_heads=HIDDEN_SIZE//64,
                          vocab_size=1)
# Character-level decoder config; vocab_size=128 matches the ASCII byte range.
char_config = GPT2Config(num_hidden_layers=CHAR_NUM_LAYERS,
                         max_length=PATCH_SIZE+1,
                         max_position_embeddings=PATCH_SIZE+1,
                         hidden_size=HIDDEN_SIZE,
                         num_attention_heads=HIDDEN_SIZE//64,
                         vocab_size=128)

# Preference-optimization setup: `model` is the trained policy, `model_ref`
# the frozen reference; both are loaded from the same pretrained checkpoint
# in __main__ below.
model_ref = NotaGenLMHeadModel(encoder_config=patch_config, decoder_config=char_config)
model = NotaGenLMHeadModel(encoder_config=patch_config, decoder_config=char_config)


model_ref = model_ref.to(device)
model = model.to(device)


# print parameter number
print("Parameter Number: "+str(sum(p.numel() for p in model.parameters() if p.requires_grad)))

# Only the policy is optimized; model_ref has no optimizer.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
+
|
| 60 |
+
def collate_batch(input_batches):
    """Collate one (chosen, rejected) preference pair for the model.

    Takes the 4-tuple produced by NotaGenDataset.__getitem__ —
    (pos_patches, pos_masks, neg_patches, neg_masks) — adds a leading
    batch dimension of size 1 to each tensor, zero-pads via pad_sequence,
    and moves everything onto the module-level `device`. Returns the four
    tensors in the same order as received.
    """
    def _batch_pad_move(tensor):
        # Batch of one: (L, ...) -> (1, L, ...).
        with_batch_dim = tensor.unsqueeze(0)
        # pad_sequence over a single-element batch is effectively a no-op
        # pad, kept for shape consistency with multi-sample collation.
        padded = torch.nn.utils.rnn.pad_sequence(with_batch_dim, batch_first=True, padding_value=0)
        return padded.to(device)

    return tuple(_batch_pad_move(t) for t in input_batches)
| 72 |
+
|
| 73 |
+
|
| 74 |
+
class NotaGenDataset(Dataset):
    """Preference-pair dataset for DPO-style fine-tuning.

    Builds the full cross product of chosen x rejected file paths; each item
    yields patch-encoded byte tensors plus all-ones attention masks for both
    sides of the pair. Uses the module-level `patchilizer` for encoding.
    """
    def __init__(self, preference_dict):
        # preference_dict: {'chosen': [filepath, ...], 'rejected': [filepath, ...]}
        self.preference_dict = preference_dict
        self.pair_list = []
        # Cartesian product: every chosen file paired with every rejected file.
        for pos_filepath in self.preference_dict['chosen']:
            for neg_filepath in self.preference_dict['rejected']:
                self.pair_list.append({'chosen': pos_filepath, 'rejected': neg_filepath})

    def __len__(self):
        return len(self.pair_list)

    def __getitem__(self, idx):
        """Return (pos_bytes, pos_masks, neg_bytes, neg_masks) long tensors."""
        try:
            pair = self.pair_list[idx]
            pos_filepath = pair['chosen']
            neg_filepath = pair['rejected']

            with open(pos_filepath, 'r', encoding='utf-8') as f:
                pos_abc_text = f.read()
            with open(neg_filepath, 'r', encoding='utf-8') as f:
                neg_abc_text = f.read()

            pos_file_bytes = patchilizer.encode(pos_abc_text)
            # All positions are real tokens, so the mask is all ones.
            pos_file_masks = [1] * len(pos_file_bytes)
            neg_file_bytes = patchilizer.encode(neg_abc_text)
            neg_file_masks = [1] * len(neg_file_bytes)

            pos_file_bytes = torch.tensor(pos_file_bytes, dtype=torch.long)
            pos_file_masks = torch.tensor(pos_file_masks, dtype=torch.long)
            neg_file_bytes = torch.tensor(neg_file_bytes, dtype=torch.long)
            neg_file_masks = torch.tensor(neg_file_masks, dtype=torch.long)

            return pos_file_bytes, pos_file_masks, neg_file_bytes, neg_file_masks
        except Exception as e:
            # Best-effort fallback: on any read/encode failure, serve the next
            # pair instead. NOTE(review): if every file is unreadable this
            # recurses until RecursionError — confirm inputs are validated
            # upstream.
            print(e)
            return self.__getitem__((idx+1) % len(self.pair_list))
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def process_one_batch(batch):
    """Compute the preference (DPO-style) loss for one (chosen, rejected) pair.

    `model(...)` is expected to return the log-probability of the sequence
    under the policy; `model_ref` is the frozen reference policy — both are
    module-level globals, as are BETA and LAMBDA.

    Loss: -logsigmoid(BETA * (policy_margin - ref_margin
                              - LAMBDA * relu(ref_pos_logps - policy_pos_logps)))
    where the relu term penalizes the policy for losing likelihood on the
    chosen sample relative to the reference.
    """
    pos_input_patches, pos_input_masks, neg_input_patches, neg_input_masks = batch
    # Clone inputs for the reference pass so the policy pass cannot observe
    # any in-place modification.
    pos_input_patches_ref = pos_input_patches.clone()
    pos_input_masks_ref = pos_input_masks.clone()
    neg_input_patches_ref = neg_input_patches.clone()
    neg_input_masks_ref = neg_input_masks.clone()
    policy_pos_logps = model(pos_input_patches, pos_input_masks)
    policy_neg_logps = model(neg_input_patches, neg_input_masks)
    with torch.no_grad():
        ref_pos_logps = model_ref(pos_input_patches_ref, pos_input_masks_ref).detach()
        ref_neg_logps = model_ref(neg_input_patches_ref, neg_input_masks_ref).detach()
    logits = (policy_pos_logps - policy_neg_logps) - (ref_pos_logps - ref_neg_logps)
    # FIX: the original used Python's builtin max(0, tensor), which raises
    # "Boolean value of Tensor with more than one element is ambiguous" for
    # any non-scalar tensor and only worked for 0-dim log-probs.
    # torch.clamp(min=0) is the elementwise equivalent and identical for
    # scalars.
    penalty = torch.clamp(ref_pos_logps - policy_pos_logps, min=0)
    loss = - torch.nn.functional.logsigmoid(BETA * (logits - LAMBDA * penalty))
    return loss
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
# train
|
| 130 |
+
# train
if __name__ == "__main__":

    # Initialize wandb
    if WANDB_LOGGING:
        wandb.login(key=WANDB_KEY)
        wandb.init(project="notagen",
                   name=WANDB_NAME)

    # load data: {'chosen': [...], 'rejected': [...]} preference index
    with open(DATA_INDEX_PATH, 'r') as f:
        preference_dict = json.loads(f.read())

    train_set = NotaGenDataset(preference_dict)

    # Load the same pretrained weights into both the policy and the
    # frozen reference model (via CPU clones to avoid GPU OOM on load).
    if os.path.exists(PRETRAINED_PATH):
        checkpoint = torch.load(PRETRAINED_PATH, map_location='cpu')
        cpu_model = deepcopy(model)
        cpu_model.load_state_dict(checkpoint['model'])
        model.load_state_dict(cpu_model.state_dict())
        cpu_model_ref = deepcopy(model_ref)
        cpu_model_ref.load_state_dict(checkpoint['model'])
        model_ref.load_state_dict(cpu_model_ref.state_dict())
    else:
        raise Exception('No pre-trained model loaded.')

    model.train()
    # FIX: keep the reference policy in eval mode so its log-probs are
    # deterministic (no dropout noise in the DPO reference term).
    model_ref.eval()
    total_train_loss = 0
    iter_idx = 1

    tqdm_set = tqdm(range(OPTIMIZATION_STEPS))
    for i in tqdm_set:
        # Sample one preference pair uniformly per optimization step.
        idx = random.randint(0, len(train_set)-1)
        batch = train_set[idx]
        batch = collate_batch(batch)

        loss = process_one_batch(batch)
        total_train_loss += loss.item()

        loss.backward()
        # FIX: `torch.nn.utils.clip_grad_norm` (no underscore) was deprecated
        # and has been removed in recent PyTorch releases; use the in-place
        # `clip_grad_norm_`.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        model.zero_grad(set_to_none=True)
        tqdm_set.set_postfix({'train_loss': total_train_loss / (i + 1)})

        # Log the training loss to wandb
        if WANDB_LOGGING:
            wandb.log({"train_loss": total_train_loss / (i + 1)}, step=i+1)

    # Save only the final model weights (no optimizer state).
    checkpoint = {'model': model.module.state_dict() if hasattr(model, "module") else model.state_dict()}

    torch.save(checkpoint, WEIGHTS_PATH)
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
|
utils (1).py
ADDED
|
@@ -0,0 +1,483 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import os
|
| 3 |
+
import math
|
| 4 |
+
import torch
|
| 5 |
+
import random
|
| 6 |
+
from config import *
|
| 7 |
+
from unidecode import unidecode
|
| 8 |
+
from torch.nn import functional as F
|
| 9 |
+
from transformers import AutoModel, BertModel, GPT2LMHeadModel, PreTrainedModel, GPT2Config
|
| 10 |
+
|
| 11 |
+
try:
|
| 12 |
+
import torch.distributed.nn
|
| 13 |
+
from torch import distributed as dist
|
| 14 |
+
|
| 15 |
+
has_distributed = True
|
| 16 |
+
except ImportError:
|
| 17 |
+
has_distributed = False
|
| 18 |
+
|
| 19 |
+
try:
|
| 20 |
+
import horovod.torch as hvd
|
| 21 |
+
except ImportError:
|
| 22 |
+
hvd = None
|
| 23 |
+
|
| 24 |
+
class ClipLoss(torch.nn.Module):
    """CLIP-style symmetric InfoNCE loss (ported from OpenCLIP).

    Matched (image_features[i], text_features[i]) pairs — here music/text —
    are positives; all other in-batch combinations are negatives. Supports
    multi-GPU training by gathering features across ranks via
    torch.distributed or horovod.
    """

    def __init__(
            self,
            local_loss=False,
            gather_with_grad=False,
            cache_labels=False,
            rank=0,
            world_size=1,
            use_horovod=False,
    ):
        super().__init__()
        # local_loss: score only the local shard against the gathered global
        # set (len(local) x len(global) logits) instead of the full matrix.
        self.local_loss = local_loss
        # gather_with_grad: autograd-aware all_gather so gradients flow back
        # to features computed on other ranks.
        self.gather_with_grad = gather_with_grad
        self.cache_labels = cache_labels
        self.rank = rank
        self.world_size = world_size
        self.use_horovod = use_horovod

        # cache state for get_ground_truth()
        self.prev_num_logits = 0
        self.labels = {}

    def gather_features(
            self,
            image_features,
            text_features,
            local_loss=False,
            gather_with_grad=False,
            rank=0,
            world_size=1,
            use_horovod=False
    ):
        """All-gather both feature tensors across ranks; returns global tensors."""
        assert has_distributed, 'torch.distributed did not import correctly, please use a PyTorch version with support.'
        if use_horovod:
            assert hvd is not None, 'Please install horovod'
            if gather_with_grad:
                all_image_features = hvd.allgather(image_features)
                all_text_features = hvd.allgather(text_features)
            else:
                with torch.no_grad():
                    all_image_features = hvd.allgather(image_features)
                    all_text_features = hvd.allgather(text_features)
                if not local_loss:
                    # ensure grads for local rank when all_* features don't have a gradient
                    gathered_image_features = list(all_image_features.chunk(world_size, dim=0))
                    gathered_text_features = list(all_text_features.chunk(world_size, dim=0))
                    gathered_image_features[rank] = image_features
                    gathered_text_features[rank] = text_features
                    all_image_features = torch.cat(gathered_image_features, dim=0)
                    all_text_features = torch.cat(gathered_text_features, dim=0)
        else:
            # We gather tensors from all gpus
            if gather_with_grad:
                all_image_features = torch.cat(torch.distributed.nn.all_gather(image_features), dim=0)
                all_text_features = torch.cat(torch.distributed.nn.all_gather(text_features), dim=0)
            else:
                gathered_image_features = [torch.zeros_like(image_features) for _ in range(world_size)]
                gathered_text_features = [torch.zeros_like(text_features) for _ in range(world_size)]
                dist.all_gather(gathered_image_features, image_features)
                dist.all_gather(gathered_text_features, text_features)
                if not local_loss:
                    # ensure grads for local rank when all_* features don't have a gradient
                    gathered_image_features[rank] = image_features
                    gathered_text_features[rank] = text_features
                all_image_features = torch.cat(gathered_image_features, dim=0)
                all_text_features = torch.cat(gathered_text_features, dim=0)

        return all_image_features, all_text_features

    def get_ground_truth(self, device, num_logits) -> torch.Tensor:
        """Diagonal targets [0, num_logits), offset by rank when local_loss."""
        # calculated ground-truth and cache if enabled
        if self.prev_num_logits != num_logits or device not in self.labels:
            labels = torch.arange(num_logits, device=device, dtype=torch.long)
            if self.world_size > 1 and self.local_loss:
                # Local rows match the rank's own slice of the global columns.
                labels = labels + num_logits * self.rank
            if self.cache_labels:
                self.labels[device] = labels
                self.prev_num_logits = num_logits
        else:
            labels = self.labels[device]
        return labels

    def get_logits(self, image_features, text_features, logit_scale):
        """Scaled similarity logit matrices in both directions."""
        if self.world_size > 1:
            all_image_features, all_text_features = self.gather_features(
                image_features, text_features,
                self.local_loss, self.gather_with_grad, self.rank, self.world_size, self.use_horovod)

            if self.local_loss:
                logits_per_image = logit_scale * image_features @ all_text_features.T
                logits_per_text = logit_scale * text_features @ all_image_features.T
            else:
                logits_per_image = logit_scale * all_image_features @ all_text_features.T
                logits_per_text = logits_per_image.T
        else:
            logits_per_image = logit_scale * image_features @ text_features.T
            logits_per_text = logit_scale * text_features @ image_features.T

        return logits_per_image, logits_per_text

    def forward(self, image_features, text_features, logit_scale, output_dict=False):
        """Symmetric cross-entropy over both logit directions (mean of the two)."""
        device = image_features.device
        logits_per_image, logits_per_text = self.get_logits(image_features, text_features, logit_scale)

        labels = self.get_ground_truth(device, logits_per_image.shape[0])

        total_loss = (
            F.cross_entropy(logits_per_image, labels) +
            F.cross_entropy(logits_per_text, labels)
        ) / 2

        return {"contrastive_loss": total_loss} if output_dict else total_loss
|
| 137 |
+
|
| 138 |
+
class M3Patchilizer:
    """Splits ABC notation (or MTF 'ticks_per_beat' format) text into
    bar-level string patches and converts them to/from fixed-length byte-id
    sequences. Depends on config globals PATCH_SIZE and PATCH_LENGTH.
    """
    def __init__(self):
        # Bar-line delimiters; multi-char forms precede "|" so the regex
        # alternation matches e.g. "|:" before the bare bar line.
        self.delimiters = ["|:", "::", ":|", "[|", "||", "|]", "|"]
        # Capturing group so re.split keeps the delimiters in the output.
        self.regexPattern = '(' + '|'.join(map(re.escape, self.delimiters)) + ')'
        self.pad_token_id = 0
        self.bos_token_id = 1
        self.eos_token_id = 2
        self.mask_token_id = 3

    def split_bars(self, body):
        """Split a music line into bars, each ending with its bar-line token."""
        bars = re.split(self.regexPattern, ''.join(body))
        bars = list(filter(None, bars)) # remove empty strings
        if bars[0] in self.delimiters:
            # Line starts with a bar line: fold it into the first bar.
            bars[1] = bars[0] + bars[1]
            bars = bars[1:]
        # Re-pair (content, delimiter) into single bar strings.
        bars = [bars[i * 2] + bars[i * 2 + 1] for i in range(len(bars) // 2)]
        return bars

    def bar2patch(self, bar, patch_size=PATCH_SIZE):
        """Encode one bar as [bos, ord(c)..., eos], truncated/padded to patch_size."""
        patch = [self.bos_token_id] + [ord(c) for c in bar] + [self.eos_token_id]
        patch = patch[:patch_size]
        patch += [self.pad_token_id] * (patch_size - len(patch))
        return patch

    def patch2bar(self, patch):
        """Decode a patch back to text, dropping special ids (<= mask_token_id)."""
        return ''.join(chr(idx) if idx > self.mask_token_id else '' for idx in patch)

    def encode(self,
               item,
               patch_size=PATCH_SIZE,
               add_special_patches=False,
               truncate=False,
               random_truncate=False):
        """Convert a full piece of text into a list of byte-id patches.

        MTF input (first line begins with "ticks_per_beat"): consecutive
        lines sharing a prefix are tab-merged into one patch while they fit
        in patch_size-2 characters. ABC input: metadata lines (e.g. "K:",
        "%%score") become one patch each; music lines are split into bars.
        Optionally adds bos/eos sentinel patches and truncates to
        PATCH_LENGTH (head, or a random head/tail/middle window).
        """
        item = unidecode(item)  # fold non-ASCII to ASCII so ord(c) < 128
        lines = re.findall(r'.*?\n|.*$', item)
        lines = list(filter(None, lines)) # remove empty lines

        patches = []

        if lines[0].split(" ")[0] == "ticks_per_beat":
            # MTF branch: merge same-prefix event lines into shared patches.
            patch = ""
            for line in lines:
                if patch.startswith(line.split(" ")[0]) and (len(patch) + len(" ".join(line.split(" ")[1:])) <= patch_size-2):
                    # Drop trailing newline, tab-append the payload.
                    patch = patch[:-1] + "\t" + " ".join(line.split(" ")[1:])
                else:
                    if patch:
                        patches.append(patch)
                    patch = line
            if patch!="":
                patches.append(patch)
        else:
            # ABC branch.
            for line in lines:
                if len(line) > 1 and ((line[0].isalpha() and line[1] == ':') or line.startswith('%%')):
                    # Metadata/header line -> one patch.
                    patches.append(line)
                else:
                    bars = self.split_bars(line)
                    if bars:
                        bars[-1] += '\n'  # keep the line break on the last bar
                        patches.extend(bars)

        if add_special_patches:
            bos_patch = chr(self.bos_token_id) * patch_size
            eos_patch = chr(self.eos_token_id) * patch_size
            patches = [bos_patch] + patches + [eos_patch]

        if len(patches) > PATCH_LENGTH and truncate:
            choices = ["head", "tail", "middle"]
            choice = random.choice(choices)
            if choice=="head" or random_truncate==False:
                patches = patches[:PATCH_LENGTH]
            elif choice=="tail":
                patches = patches[-PATCH_LENGTH:]
            else:
                start = random.randint(1, len(patches)-PATCH_LENGTH)
                patches = patches[start:start+PATCH_LENGTH]

        patches = [self.bar2patch(patch) for patch in patches]

        return patches

    def decode(self, patches):
        """Inverse of encode: concatenate the decoded text of all patches."""
        return ''.join(self.patch2bar(patch) for patch in patches)
|
| 221 |
+
|
| 222 |
+
class M3PatchEncoder(PreTrainedModel):
    """BERT encoder over bar-level byte patches.

    Each patch is PATCH_SIZE byte ids in [0, 128); a patch is one-hot
    encoded, flattened, and linearly projected into the BERT hidden space
    before the transformer runs over the patch sequence.
    """
    def __init__(self, config):
        super(M3PatchEncoder, self).__init__(config)
        # Projects a flattened one-hot patch (PATCH_SIZE * 128) to M3_HIDDEN_SIZE.
        self.patch_embedding = torch.nn.Linear(PATCH_SIZE*128, M3_HIDDEN_SIZE)
        torch.nn.init.normal_(self.patch_embedding.weight, std=0.02)
        self.base = BertModel(config=config)
        # Special byte ids shared across the M3 components.
        self.pad_token_id = 0
        self.bos_token_id = 1
        self.eos_token_id = 2
        self.mask_token_id = 3

    def forward(self,
                input_patches, # [batch_size, seq_length, hidden_size]
                input_masks): # [batch_size, seq_length]
        # Transform input_patches into embeddings: one-hot over the byte
        # vocabulary, flatten per patch, then project.
        input_patches = torch.nn.functional.one_hot(input_patches, num_classes=128)
        input_patches = input_patches.reshape(len(input_patches), -1, PATCH_SIZE*128).type(torch.FloatTensor)
        input_patches = self.patch_embedding(input_patches.to(self.device))

        # Apply BERT model to input_patches and input_masks
        return self.base(inputs_embeds=input_patches, attention_mask=input_masks)
|
| 243 |
+
|
| 244 |
+
class M3TokenDecoder(PreTrainedModel):
    """GPT-2 character-level decoder that reconstructs a patch's bytes from
    its encoded patch feature (prepended as the first input embedding).
    """
    def __init__(self, config):
        super(M3TokenDecoder, self).__init__(config)
        self.base = GPT2LMHeadModel(config=config)
        # Special byte ids shared across the M3 components.
        self.pad_token_id = 0
        self.bos_token_id = 1
        self.eos_token_id = 2
        self.mask_token_id = 3

    def forward(self,
                patch_features, # [batch_size, hidden_size]
                target_patches): # [batch_size, seq_length]
        # get input embeddings from the GPT-2 token embedding table
        inputs_embeds = torch.nn.functional.embedding(target_patches, self.base.transformer.wte.weight)

        # concatenate the encoded patches with the input embeddings:
        # the patch feature replaces the embedding at position 0
        inputs_embeds = torch.cat((patch_features.unsqueeze(1), inputs_embeds[:,1:,:]), dim=1)

        # preparing the labels for model training: pad positions get the
        # conventional -100 label so the LM loss ignores them
        target_masks = target_patches == self.pad_token_id
        target_patches = target_patches.clone().masked_fill_(target_masks, -100)

        # get the attention mask (1 on real tokens, 0 on padding)
        target_masks = ~target_masks
        target_masks = target_masks.type(torch.int)

        return self.base(inputs_embeds=inputs_embeds,
                         attention_mask=target_masks,
                         labels=target_patches)

    def generate(self,
                 patch_feature,
                 tokens):
        """Return next-byte probabilities (numpy array over the byte vocab)
        given one patch feature and the bytes generated so far."""
        # reshape the patch_feature and tokens to a batch of one
        patch_feature = patch_feature.reshape(1, 1, -1)
        tokens = tokens.reshape(1, -1)

        # get input embeddings
        tokens = torch.nn.functional.embedding(tokens, self.base.transformer.wte.weight)

        # concatenate the encoded patches with the input embeddings
        tokens = torch.cat((patch_feature, tokens[:,1:,:]), dim=1)

        # get the outputs from the model
        outputs = self.base(inputs_embeds=tokens)

        # get the probabilities of the next token (last position's logits)
        probs = torch.nn.functional.softmax(outputs.logits.squeeze(0)[-1], dim=-1)

        return probs.detach().cpu().numpy()
|
| 294 |
+
|
| 295 |
+
class M3Model(PreTrainedModel):
    """M3 masked-patch model: BERT patch encoder + GPT-2 byte decoder.

    Only the positions flagged by `selected_indices` are reconstructed by
    the decoder (masked-patch prediction objective).
    """
    def __init__(self, encoder_config, decoder_config):
        super(M3Model, self).__init__(encoder_config)
        self.encoder = M3PatchEncoder(encoder_config)
        self.decoder = M3TokenDecoder(decoder_config)
        # Special byte ids shared across the M3 components.
        self.pad_token_id = 0
        self.bos_token_id = 1
        self.eos_token_id = 2
        self.mask_token_id = 3

    def forward(self,
                input_patches, # [batch_size, seq_length, hidden_size]
                input_masks, # [batch_size, seq_length]
                selected_indices, # [batch_size, seq_length]
                target_patches): # [batch_size, seq_length, hidden_size]
        input_patches = input_patches.reshape(len(input_patches), -1, PATCH_SIZE).to(self.device)
        input_masks = input_masks.to(self.device)
        selected_indices = selected_indices.to(self.device)
        target_patches = target_patches.reshape(len(target_patches), -1, PATCH_SIZE).to(self.device)

        # Pass the input_patches and input_masks through the encoder
        outputs = self.encoder(input_patches, input_masks)["last_hidden_state"]

        # Keep only the positions selected for reconstruction; boolean
        # indexing flattens batch and sequence into one axis.
        target_patches = target_patches[selected_indices.bool()]
        patch_features = outputs[selected_indices.bool()]

        # Pass patch_features and target_patches through the decoder
        return self.decoder(patch_features, target_patches)
|
| 324 |
+
|
| 325 |
+
class CLaMP2Model(PreTrainedModel):
    """CLIP-style contrastive model pairing a pretrained text encoder with the
    M3 patch-level music encoder, both projected into a shared embedding space
    and trained with a contrastive (CLIP) loss over (text, music) pairs."""
    def __init__(self,
                 music_config,
                 global_rank=None,
                 world_size=None,
                 text_model_name=TEXT_MODEL_NAME,
                 hidden_size=CLAMP2_HIDDEN_SIZE,
                 load_m3=CLAMP2_LOAD_M3):
        """
        :param music_config: transformer config for the M3 music encoder
        :param global_rank: distributed rank; defaults to 0 when unset
        :param world_size: distributed world size; defaults to 1 when unset
        :param text_model_name: Hugging Face model name for the text encoder
        :param hidden_size: dimensionality of the shared projection space
        :param load_m3: if True and M3_WEIGHTS_PATH exists, warm-start the
                        music encoder from a pretrained M3 checkpoint
        """
        super(CLaMP2Model, self).__init__(music_config)

        self.text_model = AutoModel.from_pretrained(text_model_name) # Load the text model
        self.text_proj = torch.nn.Linear(self.text_model.config.hidden_size, hidden_size) # Linear layer for text projections
        torch.nn.init.normal_(self.text_proj.weight, std=0.02) # Initialize weights with normal distribution

        self.music_model = M3PatchEncoder(music_config) # Initialize the music model
        self.music_proj = torch.nn.Linear(M3_HIDDEN_SIZE, hidden_size) # Linear layer for music projections
        torch.nn.init.normal_(self.music_proj.weight, std=0.02) # Initialize weights with normal distribution

        # Single-process fallback when distributed info is not provided.
        if global_rank==None or world_size==None:
            global_rank = 0
            world_size = 1

        self.loss_fn = ClipLoss(local_loss=False,
                                gather_with_grad=True,
                                cache_labels=False,
                                rank=global_rank,
                                world_size=world_size,
                                use_horovod=False)

        # Optionally warm-start the music encoder from an M3 checkpoint: build
        # a full M3Model, load the weights, and keep only its encoder half.
        if load_m3 and os.path.exists(M3_WEIGHTS_PATH):
            checkpoint = torch.load(M3_WEIGHTS_PATH, map_location='cpu', weights_only=True)
            decoder_config = GPT2Config(vocab_size=128,
                                        n_positions=PATCH_SIZE,
                                        n_embd=M3_HIDDEN_SIZE,
                                        n_layer=TOKEN_NUM_LAYERS,
                                        n_head=M3_HIDDEN_SIZE//64,
                                        n_inner=M3_HIDDEN_SIZE*4)
            model = M3Model(music_config, decoder_config)
            model.load_state_dict(checkpoint['model'])
            self.music_model = model.encoder
            model = None  # drop the decoder half; only the encoder is kept
            print(f"Successfully Loaded M3 Checkpoint from Epoch {checkpoint['epoch']} with loss {checkpoint['min_eval_loss']}")

    def avg_pooling(self, input_features, input_masks):
        """Masked average pooling over the sequence dimension (dim=1)."""
        input_masks = input_masks.unsqueeze(-1).to(self.device) # add a dimension to match the feature dimension
        input_features = input_features * input_masks # apply mask to input_features
        avg_pool = input_features.sum(dim=1) / input_masks.sum(dim=1) # calculate average pooling

        return avg_pool

    def get_text_features(self,
                          text_inputs,
                          text_masks,
                          get_normalized=False):
        """Encode text; with get_normalized=True, additionally mask-pool and
        project into the shared space."""
        text_features = self.text_model(text_inputs.to(self.device),
                                        attention_mask=text_masks.to(self.device))['last_hidden_state']

        if get_normalized:
            text_features = self.avg_pooling(text_features, text_masks)
            text_features = self.text_proj(text_features)

        return text_features

    def get_music_features(self,
                           music_inputs,
                           music_masks,
                           get_normalized=False):
        """Encode music patches; with get_normalized=True, additionally
        mask-pool and project into the shared space."""
        music_features = self.music_model(music_inputs.to(self.device),
                                          music_masks.to(self.device))['last_hidden_state']

        if get_normalized:
            music_features = self.avg_pooling(music_features, music_masks)
            music_features = self.music_proj(music_features)

        return music_features

    def forward(self,
                text_inputs, # [batch_size, seq_length]
                text_masks, # [batch_size, seq_length]
                music_inputs, # [batch_size, seq_length, hidden_size]
                music_masks): # [batch_size, seq_length]
        """Compute the contrastive loss between projected text and music
        features of a batch of (text, music) pairs."""
        # Compute the text features
        text_features = self.get_text_features(text_inputs, text_masks, get_normalized=True)

        # Compute the music features
        music_features = self.get_music_features(music_inputs, music_masks, get_normalized=True)

        return self.loss_fn(text_features,
                            music_features,
                            LOGIT_SCALE,
                            output_dict=False)
|
| 416 |
+
|
| 417 |
+
def split_data(data, eval_ratio=EVAL_SPLIT):
    """Shuffle *data* in place and split it into (train, eval) subsets.

    :param data: list of samples; NOTE: shuffled in place as a side effect
    :param eval_ratio: fraction of samples assigned to the evaluation split
    :return: (train_set, eval_set) tuple of lists
    """
    random.shuffle(data)
    n_eval = int(len(data) * eval_ratio)
    # The first n_eval shuffled items form the evaluation split.
    return data[n_eval:], data[:n_eval]
|
| 423 |
+
|
| 424 |
+
def mask_patches(target_patches, patchilizer, mode):
    """Corrupt a random subset of patches for M3 masked-modeling training.

    A fraction M3_MASK_RATIO of the patch positions is selected. In "eval"
    mode the selection is left untouched ("original"); otherwise one
    corruption is drawn for the whole batch: with probability 0.8 the
    selected patches are replaced by mask-token patches, 0.1 their content
    characters are shuffled, 0.1 they are left as-is.

    :param target_patches: sequence of patches (each a list of PATCH_SIZE char ids)
    :param patchilizer: provides ``mask_token_id`` and ``eos_token_id``
    :param mode: "eval" for no corruption; any other value enables training corruption
    :return: (input_patches, selected_indices) — the possibly-corrupted patch
             tensor and a float tensor with 1. at the selected positions
    """
    positions = list(range(len(target_patches)))
    random.shuffle(positions)
    chosen = positions[:math.ceil(M3_MASK_RATIO * len(positions))]
    sorted_indices = sorted(chosen)
    input_patches = torch.tensor(target_patches)

    if mode == "eval":
        choice = "original"
    else:
        choice = random.choices(["mask", "shuffle", "original"], weights=[0.8, 0.1, 0.1])[0]

    if choice == "mask":
        # Replace every selected patch with a full mask-token patch.
        input_patches[sorted_indices] = torch.tensor([patchilizer.mask_token_id] * PATCH_SIZE)
    elif choice == "shuffle":
        for idx in sorted_indices:
            patch = input_patches[idx]
            try:
                index_eos = (patch == patchilizer.eos_token_id).nonzero().item()
            except (RuntimeError, ValueError):
                # .item() fails when there is no (or more than one) EOS match;
                # treat the whole patch as content. (Narrowed from a bare
                # `except:` that also swallowed KeyboardInterrupt/SystemExit.)
                index_eos = len(patch)

            # Shuffle only the content chars: keep position 0 and everything
            # from the EOS token onward in place.
            perm = list(range(1, index_eos))
            random.shuffle(perm)
            perm = [0] + perm + list(range(index_eos, len(patch)))
            input_patches[idx] = patch[perm]

    selected_indices = torch.zeros(len(target_patches))
    selected_indices[sorted_indices] = 1.

    return input_patches, selected_indices
|
| 455 |
+
|
| 456 |
+
def remove_instrument_info(item):
    """Strip instrument names from a symbolic-music string.

    Detects the format from the first line: MTF data starts with
    ``ticks_per_beat``; anything else is treated as ABC notation. For ABC,
    ``nm=`` / ``snm=`` (name / short name) fields are cut off voice (``V:``)
    lines; for MTF, the program number of every ``program_change`` event is
    reset to 0.

    :param item: the full ABC or MTF text
    :return: the text with instrument information removed
    """
    # Split into lines while keeping their trailing newlines.
    lines = re.findall(r'.*?\n|.*$', item)
    lines = list(filter(None, lines))

    # `fmt` replaces the original local named `type`, which shadowed the builtin.
    if lines[0].split(" ")[0] == "ticks_per_beat":
        fmt = "mtf"
    else:
        fmt = "abc"

    cleaned_lines = []
    for line in lines:
        if fmt == "abc" and line.startswith("V:"):
            # Find the position of " nm=" or " snm=".
            nm_pos = line.find(" nm=")
            snm_pos = line.find(" snm=")
            # Keep only the part before the first name field and restore the
            # newline that the truncation removed.
            if nm_pos != -1:
                line = line[:nm_pos]
            elif snm_pos != -1:
                line = line[:snm_pos]
            if nm_pos != -1 or snm_pos != -1:
                line += "\n"
        elif fmt == "mtf" and line.startswith("program_change"):
            # Zero out the program number (last field) of the event.
            line = " ".join(line.split(" ")[:-1]) + " 0\n"

        cleaned_lines.append(line)

    return ''.join(cleaned_lines)
|
utils (2).py
ADDED
|
@@ -0,0 +1,423 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import random
|
| 3 |
+
import bisect
|
| 4 |
+
import json
|
| 5 |
+
import re
|
| 6 |
+
import numpy as np
|
| 7 |
+
from config import *
|
| 8 |
+
from transformers import GPT2Model, GPT2LMHeadModel, LlamaModel, LlamaForCausalLM, PreTrainedModel
|
| 9 |
+
from samplings import top_p_sampling, top_k_sampling, temperature_sampling
|
| 10 |
+
from tokenizers import Tokenizer
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class Patchilizer:
    """Converts ABC text to/from fixed-size character patches.

    Characters are encoded by their ordinal value; ids 0/1/2 are reserved for
    the special/BOS/EOS tokens. In ``stream`` mode, tunebody lines get an
    ``[r:i/j]`` prefix (line index / lines remaining) and long pieces may be
    cut to fit PATCH_LENGTH.
    """
    def __init__(self, stream=PATCH_STREAM):
        # Whether to use stream (line-indexed, cuttable) encoding.
        self.stream = stream
        # Bar-line delimiters, ordered so longer ones match first in the regex.
        self.delimiters = ["|:", "::", ":|", "[|", "||", "|]", "|"]
        self.regexPattern = '(' + '|'.join(map(re.escape, self.delimiters)) + ')'
        self.bos_token_id = 1
        self.eos_token_id = 2
        self.special_token_id = 0

    def split_bars(self, body_lines):
        """
        Split a body of music into individual bars.
        """
        new_bars = []
        try:
            for line in body_lines:
                # Split on bar lines; the capturing group keeps the delimiters.
                line_bars = re.split(self.regexPattern, line)
                line_bars = list(filter(None, line_bars))
                new_line_bars = []

                if len(line_bars) == 1:
                    new_line_bars = line_bars
                else:
                    # Re-attach each delimiter to the bar content that follows it.
                    if line_bars[0] in self.delimiters:
                        new_line_bars = [line_bars[i] + line_bars[i + 1] for i in range(0, len(line_bars), 2)]
                    else:
                        new_line_bars = [line_bars[0]] + [line_bars[i] + line_bars[i + 1] for i in range(1, len(line_bars), 2)]
                    if 'V' not in new_line_bars[-1]:
                        # Absorb a trailing "barline + newline" fragment into the
                        # previous bar instead of keeping it as its own bar.
                        new_line_bars[-2] += new_line_bars[-1]
                        new_line_bars = new_line_bars[:-1]
                new_bars += new_line_bars
        except:
            # NOTE(review): bare except silently drops lines on any error.
            pass

        return new_bars

    def split_patches(self, abc_text, patch_size=PATCH_SIZE, generate_last=False):
        """Chop *abc_text* into patch_size chunks; unless generating the last
        (open-ended) patch, terminate non-aligned text with an EOS char."""
        if not generate_last and len(abc_text) % patch_size != 0:
            abc_text += chr(self.eos_token_id)
        patches = [abc_text[i : i + patch_size] for i in range(0, len(abc_text), patch_size)]
        return patches

    def patch2chars(self, patch):
        """
        Convert a patch into a bar.
        """
        bytes = ''
        for idx in patch:
            if idx == self.eos_token_id:
                break
            if idx < self.eos_token_id:
                # NOTE(review): dead branch — special/BOS ids fall through and
                # are appended like any other char.
                pass
            bytes += chr(idx)
        return bytes


    def patchilize_metadata(self, metadata_lines):
        """Encode each metadata line into its own sequence of patches."""
        metadata_patches = []
        for line in metadata_lines:
            metadata_patches += self.split_patches(line)

        return metadata_patches

    def patchilize_tunebody(self, tunebody_lines, encode_mode='train'):
        """Encode tunebody bars into patches; in 'generate' mode the final bar
        is left open (no EOS padding) so generation can continue it."""
        tunebody_patches = []
        bars = self.split_bars(tunebody_lines)
        if encode_mode == 'train':
            for bar in bars:
                tunebody_patches += self.split_patches(bar)
        elif encode_mode == 'generate':
            for bar in bars[:-1]:
                tunebody_patches += self.split_patches(bar)
            tunebody_patches += self.split_patches(bars[-1], generate_last=True)

        return tunebody_patches

    def encode_train(self, abc_text, patch_length=PATCH_LENGTH, patch_size=PATCH_SIZE, add_special_patches=True, cut=True):
        """Encode a full ABC piece into id patches for training.

        Splits metadata from the tunebody (first line containing '[V:'), adds
        stream '[r:i/j]' prefixes when enabled, optionally wraps with BOS/EOS
        patches, and — if the piece exceeds patch_length in stream mode —
        randomly keeps the head, the tail, or a middle-to-end slice.

        :return: list of patches, each a list of patch_size char ids
                 (0-padded)
        """
        lines = abc_text.split('\n')
        lines = list(filter(None, lines))
        lines = [line + '\n' for line in lines]

        tunebody_index = -1
        for i, line in enumerate(lines):
            if '[V:' in line:
                tunebody_index = i
                break

        metadata_lines = lines[ : tunebody_index]
        tunebody_lines = lines[tunebody_index : ]

        if self.stream:
            # Prefix each tunebody line with its index and remaining-line count.
            tunebody_lines = ['[r:' + str(line_index) + '/' + str(len(tunebody_lines) - line_index - 1) + ']' + line for line_index, line in
                              enumerate(tunebody_lines)]

        metadata_patches = self.patchilize_metadata(metadata_lines)
        tunebody_patches = self.patchilize_tunebody(tunebody_lines, encode_mode='train')

        if add_special_patches:
            bos_patch = chr(self.bos_token_id) * (patch_size - 1) + chr(self.eos_token_id)
            eos_patch = chr(self.bos_token_id) + chr(self.eos_token_id) * (patch_size - 1)

            metadata_patches = [bos_patch] + metadata_patches
            tunebody_patches = tunebody_patches + [eos_patch]

        if self.stream:
            if len(metadata_patches) + len(tunebody_patches) > patch_length:
                # Cuts may only start at line boundaries (patches ending in '\n').
                available_cut_indexes = [0] + [index + 1 for index, patch in enumerate(tunebody_patches) if '\n' in patch]
                line_index_for_cut_index = list(range(len(available_cut_indexes)))
                # Earliest patch index from which a cut keeps the result within budget.
                end_index = len(metadata_patches) + len(tunebody_patches) - patch_length
                biggest_index = bisect.bisect_left(available_cut_indexes, end_index)
                available_cut_indexes = available_cut_indexes[:biggest_index + 1]

                if len(available_cut_indexes) == 1:
                    choices = ['head']
                elif len(available_cut_indexes) == 2:
                    choices = ['head', 'tail']
                else:
                    choices = ['head', 'tail', 'middle']
                choice = random.choice(choices)
                if choice == 'head':
                    # Keep the beginning; the trailing excess is trimmed below by `cut`.
                    patches = metadata_patches + tunebody_patches[0:]
                else:
                    if choice == 'tail':
                        cut_index = len(available_cut_indexes) - 1
                    else:
                        cut_index = random.choice(range(1, len(available_cut_indexes) - 1))

                    line_index = line_index_for_cut_index[cut_index]
                    stream_tunebody_lines = tunebody_lines[line_index : ]

                    # Re-patchilize from the cut line so patch boundaries stay valid.
                    stream_tunebody_patches = self.patchilize_tunebody(stream_tunebody_lines, encode_mode='train')
                    if add_special_patches:
                        stream_tunebody_patches = stream_tunebody_patches + [eos_patch]
                    patches = metadata_patches + stream_tunebody_patches
            else:
                patches = metadata_patches + tunebody_patches
        else:
            patches = metadata_patches + tunebody_patches

        if cut:
            patches = patches[ : patch_length]
        else:
            pass

        # encode to ids, right-padding each patch with the special token
        id_patches = []
        for patch in patches:
            id_patch = [ord(c) for c in patch] + [self.special_token_id] * (patch_size - len(patch))
            id_patches.append(id_patch)

        return id_patches

    def encode_generate(self, abc_code, patch_length=PATCH_LENGTH, patch_size=PATCH_SIZE, add_special_patches=True):
        """Encode a (possibly partial) ABC piece as the prompt for generation.

        Like encode_train, but the last bar is kept open and an unfinished
        final patch is left unpadded so the model can continue it.
        """
        lines = abc_code.split('\n')
        lines = list(filter(None, lines))

        tunebody_index = None
        for i, line in enumerate(lines):
            if line.startswith('[V:') or line.startswith('[r:'):
                tunebody_index = i
                break

        metadata_lines = lines[ : tunebody_index]
        tunebody_lines = lines[tunebody_index : ]

        metadata_lines = [line + '\n' for line in metadata_lines]
        if self.stream:
            # Keep the final line open (no '\n') when the prompt ends mid-line.
            if not abc_code.endswith('\n'):
                tunebody_lines = [tunebody_lines[i] + '\n' for i in range(len(tunebody_lines) - 1)] + [tunebody_lines[-1]]
            else:
                tunebody_lines = [tunebody_lines[i] + '\n' for i in range(len(tunebody_lines))]
        else:
            tunebody_lines = [line + '\n' for line in tunebody_lines]

        metadata_patches = self.patchilize_metadata(metadata_lines)
        tunebody_patches = self.patchilize_tunebody(tunebody_lines, encode_mode='generate')

        if add_special_patches:
            bos_patch = chr(self.bos_token_id) * (patch_size - 1) + chr(self.eos_token_id)

            metadata_patches = [bos_patch] + metadata_patches

        patches = metadata_patches + tunebody_patches
        patches = patches[ : patch_length]

        # encode to ids
        id_patches = []
        for patch in patches:
            if len(patch) < PATCH_SIZE and patch[-1] != chr(self.eos_token_id):
                # Unfinished final patch: leave it unpadded for continuation.
                id_patch = [ord(c) for c in patch]
            else:
                id_patch = [ord(c) for c in patch] + [self.special_token_id] * (patch_size - len(patch))
            id_patches.append(id_patch)

        return id_patches

    def decode(self, patches):
        """
        Decode patches into music.
        """
        return ''.join(self.patch2chars(patch) for patch in patches)
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
class PatchLevelDecoder(PreTrainedModel):
    """
    A Patch-level Decoder model for generating patch features in an auto-regressive manner.
    It inherits PreTrainedModel from transformers.
    """
    def __init__(self, config):
        super().__init__(config)
        # Project a flattened one-hot patch (PATCH_SIZE chars x 128 char ids)
        # into the transformer's embedding space.
        self.patch_embedding = torch.nn.Linear(PATCH_SIZE * 128, config.n_embd)
        torch.nn.init.normal_(self.patch_embedding.weight, std=0.02)
        self.base = GPT2Model(config)

    def forward(self,
                patches: torch.Tensor,
                masks=None) -> torch.Tensor:
        """
        The forward pass of the patch-level decoder model.
        :param patches: the patches to be encoded
        :param masks: the masks for the patches
        :return: the encoded patches
        """
        # One-hot encode the char ids, flatten each patch, and embed it.
        patches = torch.nn.functional.one_hot(patches, num_classes=128).to(self.dtype)
        patches = patches.reshape(len(patches), -1, PATCH_SIZE * (128))
        patches = self.patch_embedding(patches.to(self.device))

        # Fixed: `masks == None` invoked the tensor's overloaded equality;
        # identity comparison with `is None` is the correct (and unambiguous)
        # way to test for an absent mask.
        if masks is None:
            return self.base(inputs_embeds=patches)
        else:
            return self.base(inputs_embeds=patches,
                             attention_mask=masks)
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
class CharLevelDecoder(PreTrainedModel):
    """
    A Char-level Decoder model for generating the chars within each patch in an auto-regressive manner
    based on the encoded patch features. It inherits PreTrainedModel from transformers.
    """
    def __init__(self, config):
        super().__init__(config)
        # Reserved char ids: 0 = special/pad, 1 = BOS.
        self.special_token_id = 0
        self.bos_token_id = 1

        self.base = GPT2LMHeadModel(config)

    def forward(self,
                encoded_patches: torch.Tensor,
                target_patches: torch.Tensor):
        """
        The forward pass of the char-level decoder model.
        :param encoded_patches: the encoded patches
        :param target_patches: the target patches
        :return: the output of the model (includes the LM loss via `labels`)
        """
        # preparing the labels for model training: prepend a BOS column
        target_patches = torch.cat((torch.ones_like(target_patches[:,0:1])*self.bos_token_id, target_patches), dim=1)
        # print('target_patches shape:', target_patches.shape)

        # Pad positions (special token) are excluded from the loss via -100.
        target_masks = target_patches == self.special_token_id
        labels = target_patches.clone().masked_fill_(target_masks, -100)

        # masking the labels for model training: attention mask is 0 exactly
        # where the label is ignored
        target_masks = torch.ones_like(labels)
        target_masks = target_masks.masked_fill_(labels == -100, 0)

        # select patches: optionally subsample a random batch of patches to
        # bound memory (indices kept sorted to preserve order)
        if PATCH_SAMPLING_BATCH_SIZE!=0 and PATCH_SAMPLING_BATCH_SIZE<target_patches.shape[0]:
            indices = list(range(len(target_patches)))
            random.shuffle(indices)
            selected_indices = sorted(indices[:PATCH_SAMPLING_BATCH_SIZE])

            target_patches = target_patches[selected_indices,:]
            target_masks = target_masks[selected_indices,:]
            encoded_patches = encoded_patches[selected_indices,:]

        # get input embeddings from the LM's tied embedding table
        inputs_embeds = torch.nn.functional.embedding(target_patches, self.base.transformer.wte.weight)

        # concatenate the encoded patches with the input embeddings: the patch
        # feature replaces the BOS embedding at position 0
        inputs_embeds = torch.cat((encoded_patches.unsqueeze(1), inputs_embeds[:,1:,:]), dim=1)

        output = self.base(inputs_embeds=inputs_embeds,
                           attention_mask=target_masks,
                           labels=labels)
        # output_hidden_states=True=True)

        return output

    def generate(self,
                 encoded_patch: torch.Tensor,  # [hidden_size]
                 tokens: torch.Tensor):  # [1]
        """
        The generate function for generating a patch based on the encoded patch and already generated tokens.
        :param encoded_patch: the encoded patch
        :param tokens: already generated tokens in the patch
        :return: the probability distribution of next token
        """
        encoded_patch = encoded_patch.reshape(1, 1, -1)  # [1, 1, hidden_size]
        tokens = tokens.reshape(1, -1)

        # Get input embeddings
        tokens = torch.nn.functional.embedding(tokens, self.base.transformer.wte.weight)

        # Concatenate the encoded patch with the input embeddings (the patch
        # feature stands in for the first token)
        tokens = torch.cat((encoded_patch, tokens[:,1:,:]), dim=1)

        # Get output from model
        outputs = self.base(inputs_embeds=tokens)

        # Get probabilities of next token from the final position's logits
        probs = torch.nn.functional.softmax(outputs.logits.squeeze(0)[-1], dim=-1)

        return probs
|
| 333 |
+
|
| 334 |
+
def safe_normalize_probs(probs):
    """Coerce *probs* into a valid probability distribution.

    Invalid entries (NaN, +/-inf, negatives) are zeroed, a tiny epsilon keeps
    every entry strictly positive, and the result is renormalized to sum to 1.
    If nothing remains to normalize, all mass is placed on the first entry.

    :param probs: array-like of candidate probabilities
    :return: float64 numpy array summing to 1 (empty input returns empty)
    """
    epsilon = 1e-12
    probs = np.array(probs, dtype=np.float64)
    # Zero out entries that cannot be probabilities. Fixed: the original only
    # checked NaN/negative, so an inf entry made the sum inf and the division
    # produced an all-zero (invalid) "distribution".
    probs = np.where(~np.isfinite(probs) | (probs < 0), 0, probs)
    probs = probs + epsilon
    s = probs.sum()
    if s > 0:
        probs = probs / s
    else:
        # Degenerate case: fall back to a point mass on the first entry.
        # Fixed: guard against empty input, which previously raised IndexError.
        probs = np.zeros_like(probs)
        if probs.size:
            probs[0] = 1.0
    return probs
|
| 346 |
+
|
| 347 |
+
class NotaGenLMHeadModel(PreTrainedModel):
    """
    NotaGen is a language model with a hierarchical structure.
    It includes a patch-level decoder and a char-level decoder.
    The patch-level decoder is used to generate patch features in an auto-regressive manner.
    The char-level decoder is used to generate the chars within each patch in an auto-regressive manner.
    It inherits PreTrainedModel from transformers.
    """
    def __init__(self, encoder_config, decoder_config):
        super().__init__(encoder_config)
        # Reserved char ids: 0 = special/pad, 1 = BOS, 2 = EOS.
        self.special_token_id = 0
        self.bos_token_id = 1
        self.eos_token_id = 2
        self.patch_level_decoder = PatchLevelDecoder(encoder_config)
        self.char_level_decoder = CharLevelDecoder(decoder_config)

    def forward(self,
                patches: torch.Tensor,
                masks: torch.Tensor):
        """
        The forward pass of the bGPT model.
        :param patches: the patches to be encoded
        :param masks: the masks for the patches
        :return: the decoded patches
        """
        patches = patches.reshape(len(patches), -1, PATCH_SIZE)
        encoded_patches = self.patch_level_decoder(patches, masks)["last_hidden_state"]

        # left_shift_masks selects every valid position except the last one
        # (the reversed cumsum counts remaining valid positions); encoded
        # features at position i are paired with the target patch at i+1.
        # NOTE: `masks` is modified in place here.
        left_shift_masks = masks * (masks.flip(1).cumsum(1).flip(1) > 1)
        masks[:, 0] = 0

        encoded_patches = encoded_patches[left_shift_masks == 1]
        patches = patches[masks == 1]

        return self.char_level_decoder(encoded_patches, patches)

    def generate(self,
                 patches: torch.Tensor,
                 top_k=0,
                 top_p=1,
                 temperature=1.0):
        """
        The generate function for generating patches based on patches.
        :param patches: the patches to be encoded
        :param top_k: the top k for sampling
        :param top_p: the top p for sampling
        :param temperature: the temperature for sampling
        :return: the generated patch (list of char-id ints)
        """
        # If the prompt ends with an unfinished patch, peel off its chars as
        # already-generated tokens (BOS-prefixed) and encode only whole patches.
        if patches.shape[-1] % PATCH_SIZE != 0:
            tokens = patches[:,:,-(patches.shape[-1]%PATCH_SIZE):].squeeze(0, 1)
            tokens = torch.cat((torch.tensor([self.bos_token_id], device=self.device), tokens), dim=-1)
            patches = patches[:,:,:-(patches.shape[-1]%PATCH_SIZE)]
        else:
            tokens = torch.tensor([self.bos_token_id], device=self.device)

        patches = patches.reshape(len(patches), -1, PATCH_SIZE)  # [bs, seq, patch_size]
        encoded_patches = self.patch_level_decoder(patches)["last_hidden_state"]  # [bs, seq, hidden_size]
        generated_patch = []

        while True:
            # Condition on the last patch feature; re-normalize after every
            # sampling filter so the distribution stays valid.
            prob = self.char_level_decoder.generate(encoded_patches[0][-1], tokens).cpu().detach().numpy()  # [128]
            prob = safe_normalize_probs(prob)
            prob = top_k_sampling(prob, top_k=top_k, return_probs=True)  # [128]
            prob = safe_normalize_probs(prob)
            prob = top_p_sampling(prob, top_p=top_p, return_probs=True)  # [128]
            prob = safe_normalize_probs(prob)
            token = temperature_sampling(prob, temperature=temperature)  # int
            char = chr(token)  # NOTE(review): unused
            generated_patch.append(token)

            # Stop once the patch is full (EOS stop is intentionally disabled).
            if len(tokens) >= PATCH_SIZE:# or token == self.eos_token_id:
                break
            else:
                tokens = torch.cat((tokens, torch.tensor([token], device=self.device)), dim=0)

        return generated_patch
|
utils (3).py
ADDED
|
@@ -0,0 +1,423 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import random
|
| 3 |
+
import bisect
|
| 4 |
+
import json
|
| 5 |
+
import re
|
| 6 |
+
import numpy as np
|
| 7 |
+
from config import *
|
| 8 |
+
from transformers import GPT2Model, GPT2LMHeadModel, LlamaModel, LlamaForCausalLM, PreTrainedModel
|
| 9 |
+
from samplings import top_p_sampling, top_k_sampling, temperature_sampling
|
| 10 |
+
from tokenizers import Tokenizer
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class Patchilizer:
    """Converts ABC notation text to and from fixed-size character patches.

    Char ids are raw code points in [0, 128); ids 0/1/2 are reserved as
    special/BOS/EOS tokens respectively.
    """

    def __init__(self, stream=PATCH_STREAM):
        self.stream = stream
        self.delimiters = ["|:", "::", ":|", "[|", "||", "|]", "|"]
        self.regexPattern = '(' + '|'.join(map(re.escape, self.delimiters)) + ')'
        self.bos_token_id = 1
        self.eos_token_id = 2
        self.special_token_id = 0

    def split_bars(self, body_lines):
        """
        Split a body of music into individual bars (each bar keeps its
        leading barline delimiter). Best-effort: on malformed input the
        bars collected so far are returned.
        """
        new_bars = []
        try:
            for line in body_lines:
                line_bars = re.split(self.regexPattern, line)
                line_bars = list(filter(None, line_bars))

                if len(line_bars) == 1:
                    new_line_bars = line_bars
                else:
                    if line_bars[0] in self.delimiters:
                        new_line_bars = [line_bars[i] + line_bars[i + 1] for i in range(0, len(line_bars), 2)]
                    else:
                        new_line_bars = [line_bars[0]] + [line_bars[i] + line_bars[i + 1] for i in range(1, len(line_bars), 2)]
                    if 'V' not in new_line_bars[-1]:
                        # absorb the trailing "barline + '\n'" fragment into the previous bar
                        new_line_bars[-2] += new_line_bars[-1]
                        new_line_bars = new_line_bars[:-1]
                new_bars += new_line_bars
        except Exception:
            # deliberate best-effort: keep whatever bars were built before the failure
            pass

        return new_bars

    def split_patches(self, abc_text, patch_size=PATCH_SIZE, generate_last=False):
        """Chop text into patch_size chunks; EOS-terminate a ragged tail
        unless the last patch is still being generated."""
        if not generate_last and len(abc_text) % patch_size != 0:
            abc_text += chr(self.eos_token_id)
        return [abc_text[i: i + patch_size] for i in range(0, len(abc_text), patch_size)]

    def patch2chars(self, patch):
        """
        Convert a patch (sequence of char ids) back into text, stopping at EOS.
        """
        chars = ''
        for idx in patch:
            if idx == self.eos_token_id:
                break
            chars += chr(idx)
        return chars

    def patchilize_metadata(self, metadata_lines):
        """Patchilize the metadata (header) lines, one line at a time."""
        metadata_patches = []
        for line in metadata_lines:
            metadata_patches += self.split_patches(line)
        return metadata_patches

    def patchilize_tunebody(self, tunebody_lines, encode_mode='train'):
        """Patchilize the tune body bar-by-bar. In 'generate' mode the last
        bar is left open (no EOS termination)."""
        tunebody_patches = []
        bars = self.split_bars(tunebody_lines)
        if encode_mode == 'train':
            for bar in bars:
                tunebody_patches += self.split_patches(bar)
        elif encode_mode == 'generate':
            for bar in bars[:-1]:
                tunebody_patches += self.split_patches(bar)
            tunebody_patches += self.split_patches(bars[-1], generate_last=True)
        return tunebody_patches

    def encode_train(self, abc_text, patch_length=PATCH_LENGTH, patch_size=PATCH_SIZE, add_special_patches=True, cut=True):
        """
        Encode a full ABC piece into id patches for training.

        In stream mode, each tunebody line is prefixed with an
        ``[r:index/remaining]`` marker, and over-long pieces are randomly
        truncated at a line boundary (head / middle / tail) so the model
        sees different windows of long scores.

        :param abc_text: the full ABC text
        :param patch_length: maximum number of patches
        :param patch_size: characters per patch
        :param add_special_patches: prepend a BOS patch and append an EOS patch
        :param cut: hard-truncate the result to patch_length patches
        :return: list of id patches, each right-padded with the special token
        """
        lines = abc_text.split('\n')
        lines = list(filter(None, lines))
        lines = [line + '\n' for line in lines]

        tunebody_index = -1
        for i, line in enumerate(lines):
            if '[V:' in line:
                tunebody_index = i
                break

        metadata_lines = lines[: tunebody_index]
        tunebody_lines = lines[tunebody_index:]

        if self.stream:
            # [r:i/remaining] keeps positional info when the head is cut off
            tunebody_lines = ['[r:' + str(line_index) + '/' + str(len(tunebody_lines) - line_index - 1) + ']' + line
                              for line_index, line in enumerate(tunebody_lines)]

        metadata_patches = self.patchilize_metadata(metadata_lines)
        tunebody_patches = self.patchilize_tunebody(tunebody_lines, encode_mode='train')

        if add_special_patches:
            bos_patch = chr(self.bos_token_id) * (patch_size - 1) + chr(self.eos_token_id)
            eos_patch = chr(self.bos_token_id) + chr(self.eos_token_id) * (patch_size - 1)
            metadata_patches = [bos_patch] + metadata_patches
            tunebody_patches = tunebody_patches + [eos_patch]

        if self.stream:
            if len(metadata_patches) + len(tunebody_patches) > patch_length:
                # candidate cut points: start, plus every patch boundary after a newline
                available_cut_indexes = [0] + [index + 1 for index, patch in enumerate(tunebody_patches) if '\n' in patch]
                line_index_for_cut_index = list(range(len(available_cut_indexes)))
                end_index = len(metadata_patches) + len(tunebody_patches) - patch_length
                biggest_index = bisect.bisect_left(available_cut_indexes, end_index)
                available_cut_indexes = available_cut_indexes[:biggest_index + 1]

                if len(available_cut_indexes) == 1:
                    choices = ['head']
                elif len(available_cut_indexes) == 2:
                    choices = ['head', 'tail']
                else:
                    choices = ['head', 'tail', 'middle']
                choice = random.choice(choices)

                if choice == 'head':
                    patches = metadata_patches + tunebody_patches
                else:
                    if choice == 'tail':
                        cut_index = len(available_cut_indexes) - 1
                    else:
                        cut_index = random.choice(range(1, len(available_cut_indexes) - 1))

                    line_index = line_index_for_cut_index[cut_index]
                    stream_tunebody_lines = tunebody_lines[line_index:]

                    stream_tunebody_patches = self.patchilize_tunebody(stream_tunebody_lines, encode_mode='train')
                    if add_special_patches:
                        stream_tunebody_patches = stream_tunebody_patches + [eos_patch]
                    patches = metadata_patches + stream_tunebody_patches
            else:
                patches = metadata_patches + tunebody_patches
        else:
            patches = metadata_patches + tunebody_patches

        if cut:
            patches = patches[: patch_length]

        # encode to ids, right-padding each patch with the special token
        id_patches = []
        for patch in patches:
            id_patch = [ord(c) for c in patch] + [self.special_token_id] * (patch_size - len(patch))
            id_patches.append(id_patch)

        return id_patches

    def encode_generate(self, abc_code, patch_length=PATCH_LENGTH, patch_size=PATCH_SIZE, add_special_patches=True):
        """
        Encode a (possibly partial) ABC piece into id patches for generation.
        An unfinished final patch is left unpadded so generation can continue it.

        :param abc_code: the ABC text generated so far
        :param patch_length: maximum number of patches
        :param patch_size: characters per patch
        :param add_special_patches: prepend a BOS patch
        :return: list of id patches
        """
        lines = abc_code.split('\n')
        lines = list(filter(None, lines))

        tunebody_index = None
        for i, line in enumerate(lines):
            if line.startswith('[V:') or line.startswith('[r:'):
                tunebody_index = i
                break

        metadata_lines = lines[: tunebody_index]
        tunebody_lines = lines[tunebody_index:]

        metadata_lines = [line + '\n' for line in metadata_lines]
        if self.stream:
            if not abc_code.endswith('\n'):
                # last line is still being generated: keep it open (no trailing newline)
                tunebody_lines = [tunebody_lines[i] + '\n' for i in range(len(tunebody_lines) - 1)] + [tunebody_lines[-1]]
            else:
                tunebody_lines = [tunebody_lines[i] + '\n' for i in range(len(tunebody_lines))]
        else:
            tunebody_lines = [line + '\n' for line in tunebody_lines]

        metadata_patches = self.patchilize_metadata(metadata_lines)
        tunebody_patches = self.patchilize_tunebody(tunebody_lines, encode_mode='generate')

        if add_special_patches:
            bos_patch = chr(self.bos_token_id) * (patch_size - 1) + chr(self.eos_token_id)
            metadata_patches = [bos_patch] + metadata_patches

        patches = metadata_patches + tunebody_patches
        patches = patches[: patch_length]

        # encode to ids; fix: compare against the patch_size parameter, not the global
        id_patches = []
        for patch in patches:
            if len(patch) < patch_size and patch[-1] != chr(self.eos_token_id):
                # unfinished final patch: leave unpadded
                id_patch = [ord(c) for c in patch]
            else:
                id_patch = [ord(c) for c in patch] + [self.special_token_id] * (patch_size - len(patch))
            id_patches.append(id_patch)

        return id_patches

    def decode(self, patches):
        """
        Decode patches into music.
        """
        return ''.join(self.patch2chars(patch) for patch in patches)
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
class PatchLevelDecoder(PreTrainedModel):
    """
    A Patch-level Decoder model for generating patch features in an auto-regressive manner.
    It inherits PreTrainedModel from transformers.
    """

    def __init__(self, config):
        super().__init__(config)
        # project a one-hot encoded patch (PATCH_SIZE chars x 128 ids) to the model width
        self.patch_embedding = torch.nn.Linear(PATCH_SIZE * 128, config.n_embd)
        torch.nn.init.normal_(self.patch_embedding.weight, std=0.02)
        self.base = GPT2Model(config)

    def forward(self,
                patches: torch.Tensor,
                masks=None) -> torch.Tensor:
        """
        The forward pass of the patch-level decoder model.
        :param patches: the patches to be encoded (char ids in [0, 128))
        :param masks: optional attention mask over the patch sequence
        :return: the encoded patches (GPT2Model output)
        """
        patches = torch.nn.functional.one_hot(patches, num_classes=128).to(self.dtype)
        patches = patches.reshape(len(patches), -1, PATCH_SIZE * 128)
        patches = self.patch_embedding(patches.to(self.device))

        # fix: use `is None` — `masks == None` on a tensor triggers an
        # elementwise comparison instead of an identity check
        if masks is None:
            return self.base(inputs_embeds=patches)
        return self.base(inputs_embeds=patches,
                         attention_mask=masks)
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
class CharLevelDecoder(PreTrainedModel):
    """
    A char-level decoder that autoregressively produces the characters inside
    a patch, conditioned on that patch's encoded feature vector.
    It inherits PreTrainedModel from transformers.
    """

    def __init__(self, config):
        super().__init__(config)
        self.special_token_id = 0
        self.bos_token_id = 1

        self.base = GPT2LMHeadModel(config)

    def forward(self,
                encoded_patches: torch.Tensor,
                target_patches: torch.Tensor):
        """
        Teacher-forced forward pass of the char-level decoder.
        :param encoded_patches: per-patch feature vectors from the patch-level decoder
        :param target_patches: char ids of each target patch
        :return: the GPT2LMHeadModel output (includes the LM loss)
        """
        # prepend a BOS column so each patch's first char is predicted from
        # the patch feature slotted in below
        bos_column = torch.ones_like(target_patches[:, 0:1]) * self.bos_token_id
        target_patches = torch.cat((bos_column, target_patches), dim=1)

        # padding positions (special token) are excluded from the loss via -100
        pad_positions = target_patches == self.special_token_id
        labels = target_patches.clone().masked_fill_(pad_positions, -100)

        # attention mask: attend everywhere except the ignored positions
        target_masks = torch.ones_like(labels)
        target_masks = target_masks.masked_fill_(labels == -100, 0)

        # optionally sub-sample patches to bound the char-level batch size
        if PATCH_SAMPLING_BATCH_SIZE != 0 and PATCH_SAMPLING_BATCH_SIZE < target_patches.shape[0]:
            shuffled = list(range(len(target_patches)))
            random.shuffle(shuffled)
            selected = sorted(shuffled[:PATCH_SAMPLING_BATCH_SIZE])

            target_patches = target_patches[selected, :]
            target_masks = target_masks[selected, :]
            encoded_patches = encoded_patches[selected, :]

        # char embeddings for the (BOS-prefixed) targets
        inputs_embeds = torch.nn.functional.embedding(target_patches, self.base.transformer.wte.weight)

        # replace the BOS embedding with the encoded patch feature
        inputs_embeds = torch.cat((encoded_patches.unsqueeze(1), inputs_embeds[:, 1:, :]), dim=1)

        return self.base(inputs_embeds=inputs_embeds,
                         attention_mask=target_masks,
                         labels=labels)

    def generate(self,
                 encoded_patch: torch.Tensor,  # [hidden_size]
                 tokens: torch.Tensor):        # [seq]
        """
        Predict the next-char distribution for a partially generated patch.
        :param encoded_patch: feature vector of the current patch
        :param tokens: char ids generated so far within the patch
        :return: softmax probability distribution over the 128 char ids
        """
        encoded_patch = encoded_patch.reshape(1, 1, -1)  # [1, 1, hidden_size]
        tokens = tokens.reshape(1, -1)

        token_embeds = torch.nn.functional.embedding(tokens, self.base.transformer.wte.weight)

        # slot the encoded patch feature in place of the first (BOS) embedding
        token_embeds = torch.cat((encoded_patch, token_embeds[:, 1:, :]), dim=1)

        outputs = self.base(inputs_embeds=token_embeds)

        # distribution over the next char id
        return torch.nn.functional.softmax(outputs.logits.squeeze(0)[-1], dim=-1)
|
| 333 |
+
|
| 334 |
+
def safe_normalize_probs(probs):
    """Sanitize and renormalize a probability vector.

    NaNs and negative entries are zeroed, a tiny epsilon keeps every entry
    strictly positive, and the vector is rescaled to sum to 1.  If the total
    is still non-positive, all mass is assigned to index 0.
    """
    eps = 1e-12  # smallest value to avoid log(0) and maintain precision
    arr = np.array(probs, dtype=np.float64)
    arr = np.where(np.isnan(arr) | (arr < 0), 0.0, arr) + eps
    total = arr.sum()
    if total > 0:
        return arr / total
    fallback = np.zeros_like(arr)
    fallback[0] = 1.0
    return fallback
|
| 346 |
+
|
| 347 |
+
class NotaGenLMHeadModel(PreTrainedModel):
    """
    NotaGen is a language model with a hierarchical structure:
    a patch-level decoder generates patch features auto-regressively, and a
    char-level decoder generates the chars within each patch auto-regressively.
    It inherits PreTrainedModel from transformers.
    """

    def __init__(self, encoder_config, decoder_config):
        super().__init__(encoder_config)
        self.special_token_id = 0
        self.bos_token_id = 1
        self.eos_token_id = 2
        self.patch_level_decoder = PatchLevelDecoder(encoder_config)
        self.char_level_decoder = CharLevelDecoder(decoder_config)

    def forward(self,
                patches: torch.Tensor,
                masks: torch.Tensor):
        """
        Training forward pass of the hierarchical model.
        :param patches: the patches to be encoded
        :param masks: the masks for the patches
        :return: the char-level decoder output (includes the loss)
        """
        patches = patches.reshape(len(patches), -1, PATCH_SIZE)
        encoded_patches = self.patch_level_decoder(patches, masks)["last_hidden_state"]

        # shift so each patch is predicted from the PREVIOUS patch's encoding
        left_shift_masks = masks * (masks.flip(1).cumsum(1).flip(1) > 1)
        masks[:, 0] = 0  # NOTE(review): mutates the caller's mask tensor in place

        encoded_patches = encoded_patches[left_shift_masks == 1]
        patches = patches[masks == 1]

        return self.char_level_decoder(encoded_patches, patches)

    def generate(self,
                 patches: torch.Tensor,
                 top_k=0,
                 top_p=1,
                 temperature=1.0):
        """
        Generate the chars of the next patch, conditioned on the patches so far.
        :param patches: the context patches (last patch may be partial)
        :param top_k: top-k sampling cutoff (0 disables)
        :param top_p: nucleus sampling threshold
        :param temperature: sampling temperature
        :return: list of generated char ids for the next patch
        """
        remainder = patches.shape[-1] % PATCH_SIZE
        if remainder != 0:
            # the trailing partial patch becomes the char-level context
            tokens = patches[:, :, -remainder:].squeeze(0, 1)
            tokens = torch.cat((torch.tensor([self.bos_token_id], device=self.device), tokens), dim=-1)
            patches = patches[:, :, :-remainder]
        else:
            tokens = torch.tensor([self.bos_token_id], device=self.device)

        patches = patches.reshape(len(patches), -1, PATCH_SIZE)                   # [bs, seq, patch_size]
        encoded_patches = self.patch_level_decoder(patches)["last_hidden_state"]  # [bs, seq, hidden_size]
        generated_patch = []

        while True:
            # sample the next char id: re-normalize after every filtering step
            prob = self.char_level_decoder.generate(encoded_patches[0][-1], tokens).cpu().detach().numpy()  # [128]
            prob = safe_normalize_probs(prob)
            prob = top_k_sampling(prob, top_k=top_k, return_probs=True)
            prob = safe_normalize_probs(prob)
            prob = top_p_sampling(prob, top_p=top_p, return_probs=True)
            prob = safe_normalize_probs(prob)
            token = temperature_sampling(prob, temperature=temperature)
            generated_patch.append(token)

            if len(tokens) >= PATCH_SIZE:  # patch full: stop
                break
            tokens = torch.cat((tokens, torch.tensor([token], device=self.device)), dim=0)

        return generated_patch
|
utils (4).py
ADDED
|
@@ -0,0 +1,423 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import random
|
| 3 |
+
import bisect
|
| 4 |
+
import json
|
| 5 |
+
import re
|
| 6 |
+
import numpy as np
|
| 7 |
+
from config import *
|
| 8 |
+
from transformers import GPT2Model, GPT2LMHeadModel, LlamaModel, LlamaForCausalLM, PreTrainedModel
|
| 9 |
+
from samplings import top_p_sampling, top_k_sampling, temperature_sampling
|
| 10 |
+
from tokenizers import Tokenizer
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class Patchilizer:
    """Converts ABC notation text to and from fixed-size character patches.

    Char ids are raw code points in [0, 128); ids 0/1/2 are reserved as
    special/BOS/EOS tokens respectively.
    """

    def __init__(self, stream=PATCH_STREAM):
        self.stream = stream
        self.delimiters = ["|:", "::", ":|", "[|", "||", "|]", "|"]
        self.regexPattern = '(' + '|'.join(map(re.escape, self.delimiters)) + ')'
        self.bos_token_id = 1
        self.eos_token_id = 2
        self.special_token_id = 0

    def split_bars(self, body_lines):
        """
        Split a body of music into individual bars (each bar keeps its
        leading barline delimiter). Best-effort: on malformed input the
        bars collected so far are returned.
        """
        new_bars = []
        try:
            for line in body_lines:
                line_bars = re.split(self.regexPattern, line)
                line_bars = list(filter(None, line_bars))

                if len(line_bars) == 1:
                    new_line_bars = line_bars
                else:
                    if line_bars[0] in self.delimiters:
                        new_line_bars = [line_bars[i] + line_bars[i + 1] for i in range(0, len(line_bars), 2)]
                    else:
                        new_line_bars = [line_bars[0]] + [line_bars[i] + line_bars[i + 1] for i in range(1, len(line_bars), 2)]
                    if 'V' not in new_line_bars[-1]:
                        # absorb the trailing "barline + '\n'" fragment into the previous bar
                        new_line_bars[-2] += new_line_bars[-1]
                        new_line_bars = new_line_bars[:-1]
                new_bars += new_line_bars
        except Exception:
            # deliberate best-effort: keep whatever bars were built before the failure
            pass

        return new_bars

    def split_patches(self, abc_text, patch_size=PATCH_SIZE, generate_last=False):
        """Chop text into patch_size chunks; EOS-terminate a ragged tail
        unless the last patch is still being generated."""
        if not generate_last and len(abc_text) % patch_size != 0:
            abc_text += chr(self.eos_token_id)
        return [abc_text[i: i + patch_size] for i in range(0, len(abc_text), patch_size)]

    def patch2chars(self, patch):
        """
        Convert a patch (sequence of char ids) back into text, stopping at EOS.
        """
        chars = ''
        for idx in patch:
            if idx == self.eos_token_id:
                break
            chars += chr(idx)
        return chars

    def patchilize_metadata(self, metadata_lines):
        """Patchilize the metadata (header) lines, one line at a time."""
        metadata_patches = []
        for line in metadata_lines:
            metadata_patches += self.split_patches(line)
        return metadata_patches

    def patchilize_tunebody(self, tunebody_lines, encode_mode='train'):
        """Patchilize the tune body bar-by-bar. In 'generate' mode the last
        bar is left open (no EOS termination)."""
        tunebody_patches = []
        bars = self.split_bars(tunebody_lines)
        if encode_mode == 'train':
            for bar in bars:
                tunebody_patches += self.split_patches(bar)
        elif encode_mode == 'generate':
            for bar in bars[:-1]:
                tunebody_patches += self.split_patches(bar)
            tunebody_patches += self.split_patches(bars[-1], generate_last=True)
        return tunebody_patches

    def encode_train(self, abc_text, patch_length=PATCH_LENGTH, patch_size=PATCH_SIZE, add_special_patches=True, cut=True):
        """
        Encode a full ABC piece into id patches for training.

        In stream mode, each tunebody line is prefixed with an
        ``[r:index/remaining]`` marker, and over-long pieces are randomly
        truncated at a line boundary (head / middle / tail) so the model
        sees different windows of long scores.

        :param abc_text: the full ABC text
        :param patch_length: maximum number of patches
        :param patch_size: characters per patch
        :param add_special_patches: prepend a BOS patch and append an EOS patch
        :param cut: hard-truncate the result to patch_length patches
        :return: list of id patches, each right-padded with the special token
        """
        lines = abc_text.split('\n')
        lines = list(filter(None, lines))
        lines = [line + '\n' for line in lines]

        tunebody_index = -1
        for i, line in enumerate(lines):
            if '[V:' in line:
                tunebody_index = i
                break

        metadata_lines = lines[: tunebody_index]
        tunebody_lines = lines[tunebody_index:]

        if self.stream:
            # [r:i/remaining] keeps positional info when the head is cut off
            tunebody_lines = ['[r:' + str(line_index) + '/' + str(len(tunebody_lines) - line_index - 1) + ']' + line
                              for line_index, line in enumerate(tunebody_lines)]

        metadata_patches = self.patchilize_metadata(metadata_lines)
        tunebody_patches = self.patchilize_tunebody(tunebody_lines, encode_mode='train')

        if add_special_patches:
            bos_patch = chr(self.bos_token_id) * (patch_size - 1) + chr(self.eos_token_id)
            eos_patch = chr(self.bos_token_id) + chr(self.eos_token_id) * (patch_size - 1)
            metadata_patches = [bos_patch] + metadata_patches
            tunebody_patches = tunebody_patches + [eos_patch]

        if self.stream:
            if len(metadata_patches) + len(tunebody_patches) > patch_length:
                # candidate cut points: start, plus every patch boundary after a newline
                available_cut_indexes = [0] + [index + 1 for index, patch in enumerate(tunebody_patches) if '\n' in patch]
                line_index_for_cut_index = list(range(len(available_cut_indexes)))
                end_index = len(metadata_patches) + len(tunebody_patches) - patch_length
                biggest_index = bisect.bisect_left(available_cut_indexes, end_index)
                available_cut_indexes = available_cut_indexes[:biggest_index + 1]

                if len(available_cut_indexes) == 1:
                    choices = ['head']
                elif len(available_cut_indexes) == 2:
                    choices = ['head', 'tail']
                else:
                    choices = ['head', 'tail', 'middle']
                choice = random.choice(choices)

                if choice == 'head':
                    patches = metadata_patches + tunebody_patches
                else:
                    if choice == 'tail':
                        cut_index = len(available_cut_indexes) - 1
                    else:
                        cut_index = random.choice(range(1, len(available_cut_indexes) - 1))

                    line_index = line_index_for_cut_index[cut_index]
                    stream_tunebody_lines = tunebody_lines[line_index:]

                    stream_tunebody_patches = self.patchilize_tunebody(stream_tunebody_lines, encode_mode='train')
                    if add_special_patches:
                        stream_tunebody_patches = stream_tunebody_patches + [eos_patch]
                    patches = metadata_patches + stream_tunebody_patches
            else:
                patches = metadata_patches + tunebody_patches
        else:
            patches = metadata_patches + tunebody_patches

        if cut:
            patches = patches[: patch_length]

        # encode to ids, right-padding each patch with the special token
        id_patches = []
        for patch in patches:
            id_patch = [ord(c) for c in patch] + [self.special_token_id] * (patch_size - len(patch))
            id_patches.append(id_patch)

        return id_patches

    def encode_generate(self, abc_code, patch_length=PATCH_LENGTH, patch_size=PATCH_SIZE, add_special_patches=True):
        """
        Encode a (possibly partial) ABC piece into id patches for generation.
        An unfinished final patch is left unpadded so generation can continue it.

        :param abc_code: the ABC text generated so far
        :param patch_length: maximum number of patches
        :param patch_size: characters per patch
        :param add_special_patches: prepend a BOS patch
        :return: list of id patches
        """
        lines = abc_code.split('\n')
        lines = list(filter(None, lines))

        tunebody_index = None
        for i, line in enumerate(lines):
            if line.startswith('[V:') or line.startswith('[r:'):
                tunebody_index = i
                break

        metadata_lines = lines[: tunebody_index]
        tunebody_lines = lines[tunebody_index:]

        metadata_lines = [line + '\n' for line in metadata_lines]
        if self.stream:
            if not abc_code.endswith('\n'):
                # last line is still being generated: keep it open (no trailing newline)
                tunebody_lines = [tunebody_lines[i] + '\n' for i in range(len(tunebody_lines) - 1)] + [tunebody_lines[-1]]
            else:
                tunebody_lines = [tunebody_lines[i] + '\n' for i in range(len(tunebody_lines))]
        else:
            tunebody_lines = [line + '\n' for line in tunebody_lines]

        metadata_patches = self.patchilize_metadata(metadata_lines)
        tunebody_patches = self.patchilize_tunebody(tunebody_lines, encode_mode='generate')

        if add_special_patches:
            bos_patch = chr(self.bos_token_id) * (patch_size - 1) + chr(self.eos_token_id)
            metadata_patches = [bos_patch] + metadata_patches

        patches = metadata_patches + tunebody_patches
        patches = patches[: patch_length]

        # encode to ids; fix: compare against the patch_size parameter, not the global
        id_patches = []
        for patch in patches:
            if len(patch) < patch_size and patch[-1] != chr(self.eos_token_id):
                # unfinished final patch: leave unpadded
                id_patch = [ord(c) for c in patch]
            else:
                id_patch = [ord(c) for c in patch] + [self.special_token_id] * (patch_size - len(patch))
            id_patches.append(id_patch)

        return id_patches

    def decode(self, patches):
        """
        Decode patches into music.
        """
        return ''.join(self.patch2chars(patch) for patch in patches)
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
class PatchLevelDecoder(PreTrainedModel):
    """
    A Patch-level Decoder model for generating patch features in an auto-regressive manner.
    It inherits PreTrainedModel from transformers.
    """

    def __init__(self, config):
        super().__init__(config)
        # project a one-hot encoded patch (PATCH_SIZE chars x 128 ids) to the model width
        self.patch_embedding = torch.nn.Linear(PATCH_SIZE * 128, config.n_embd)
        torch.nn.init.normal_(self.patch_embedding.weight, std=0.02)
        self.base = GPT2Model(config)

    def forward(self,
                patches: torch.Tensor,
                masks=None) -> torch.Tensor:
        """
        The forward pass of the patch-level decoder model.
        :param patches: the patches to be encoded (char ids in [0, 128))
        :param masks: optional attention mask over the patch sequence
        :return: the encoded patches (GPT2Model output)
        """
        patches = torch.nn.functional.one_hot(patches, num_classes=128).to(self.dtype)
        patches = patches.reshape(len(patches), -1, PATCH_SIZE * 128)
        patches = self.patch_embedding(patches.to(self.device))

        # fix: use `is None` — `masks == None` on a tensor triggers an
        # elementwise comparison instead of an identity check
        if masks is None:
            return self.base(inputs_embeds=patches)
        return self.base(inputs_embeds=patches,
                         attention_mask=masks)
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
class CharLevelDecoder(PreTrainedModel):
    """
    A Char-level Decoder model for generating the chars within each patch in an auto-regressive manner
    based on the encoded patch features. It inherits PreTrainedModel from transformers.
    """
    def __init__(self, config):
        super().__init__(config)
        # Token id 0 pads patches; token id 1 marks the beginning of a patch.
        self.special_token_id = 0
        self.bos_token_id = 1

        # Underlying char-level language model.
        self.base = GPT2LMHeadModel(config)

    def forward(self,
                encoded_patches: torch.Tensor,
                target_patches: torch.Tensor):
        """
        The forward pass of the char-level decoder model.
        :param encoded_patches: the encoded patches
        :param target_patches: the target patches
        :return: the output of the model (includes the LM loss, since labels are passed)
        """
        # preparing the labels for model training: prepend a BOS token to every patch
        target_patches = torch.cat((torch.ones_like(target_patches[:,0:1])*self.bos_token_id, target_patches), dim=1)

        # pad positions (special_token_id) are excluded from the loss via label -100
        target_masks = target_patches == self.special_token_id
        labels = target_patches.clone().masked_fill_(target_masks, -100)

        # masking the labels for model training: attention mask is 0 wherever the label is ignored
        target_masks = torch.ones_like(labels)
        target_masks = target_masks.masked_fill_(labels == -100, 0)

        # select patches: subsample at most PATCH_SAMPLING_BATCH_SIZE patches to bound memory
        if PATCH_SAMPLING_BATCH_SIZE!=0 and PATCH_SAMPLING_BATCH_SIZE<target_patches.shape[0]:
            indices = list(range(len(target_patches)))
            random.shuffle(indices)
            selected_indices = sorted(indices[:PATCH_SAMPLING_BATCH_SIZE])

            target_patches = target_patches[selected_indices,:]
            target_masks = target_masks[selected_indices,:]
            encoded_patches = encoded_patches[selected_indices,:]

        # get input embeddings from the LM's token embedding table
        inputs_embeds = torch.nn.functional.embedding(target_patches, self.base.transformer.wte.weight)

        # concatenate the encoded patches with the input embeddings:
        # the patch feature replaces the BOS embedding at position 0
        inputs_embeds = torch.cat((encoded_patches.unsqueeze(1), inputs_embeds[:,1:,:]), dim=1)

        output = self.base(inputs_embeds=inputs_embeds,
                           attention_mask=target_masks,
                           labels=labels)

        return output

    def generate(self,
                 encoded_patch: torch.Tensor,  # [hidden_size]
                 tokens: torch.Tensor):  # [1]
        """
        The generate function for generating a patch based on the encoded patch and already generated tokens.
        :param encoded_patch: the encoded patch
        :param tokens: already generated tokens in the patch
        :return: the probability distribution of next token
        """
        encoded_patch = encoded_patch.reshape(1, 1, -1)  # [1, 1, hidden_size]
        tokens = tokens.reshape(1, -1)

        # Get input embeddings
        tokens = torch.nn.functional.embedding(tokens, self.base.transformer.wte.weight)

        # Concatenate the encoded patch with the input embeddings:
        # the patch feature takes the place of the first token's embedding
        tokens = torch.cat((encoded_patch, tokens[:,1:,:]), dim=1)

        # Get output from model
        outputs = self.base(inputs_embeds=tokens)

        # Get probabilities of next token (softmax over the last position's logits)
        probs = torch.nn.functional.softmax(outputs.logits.squeeze(0)[-1], dim=-1)

        return probs
|
| 333 |
+
|
| 334 |
+
def safe_normalize_probs(probs):
    """
    Sanitize an array-like of sampling probabilities and return a valid,
    normalized float64 distribution.

    NaN, negative, and infinite entries are zeroed (originally +inf slipped
    through the NaN/negative filter and produced a NaN distribution); a tiny
    epsilon keeps all-zero inputs normalizable. If no mass remains, all
    probability is assigned to index 0. An empty input is returned unchanged
    (originally it raised IndexError in the fallback branch).

    :param probs: array-like of candidate probabilities
    :return: np.ndarray of float64 summing to 1 (or empty if input is empty)
    """
    epsilon = 1e-12
    probs = np.array(probs, dtype=np.float64)
    # Zero out anything that cannot be a probability: NaN, negative, +/-inf.
    probs = np.where(~np.isfinite(probs) | (probs < 0), 0, probs)
    if probs.size == 0:
        return probs
    probs = probs + epsilon
    s = probs.sum()
    if s > 0:
        probs = probs / s
    else:
        # Degenerate fallback: deterministic point mass on the first entry.
        probs = np.zeros_like(probs)
        probs[0] = 1.0
    return probs
|
| 346 |
+
|
| 347 |
+
class NotaGenLMHeadModel(PreTrainedModel):
    """
    NotaGen is a language model with a hierarchical structure.
    It includes a patch-level decoder and a char-level decoder.
    The patch-level decoder is used to generate patch features in an auto-regressive manner.
    The char-level decoder is used to generate the chars within each patch in an auto-regressive manner.
    It inherits PreTrainedModel from transformers.
    """

    def __init__(self, encoder_config, decoder_config):
        super().__init__(encoder_config)
        self.special_token_id = 0
        self.bos_token_id = 1
        self.eos_token_id = 2
        self.patch_level_decoder = PatchLevelDecoder(encoder_config)
        self.char_level_decoder = CharLevelDecoder(decoder_config)

    def forward(self,
                patches: torch.Tensor,
                masks: torch.Tensor):
        """
        The forward pass of the NotaGen model.
        :param patches: the patches to be encoded
        :param masks: the attention masks for the patches (NOTE: mutated in place below)
        :return: the char-level decoder output (includes the LM loss)
        """
        patches = patches.reshape(len(patches), -1, PATCH_SIZE)
        encoded_patches = self.patch_level_decoder(patches, masks)["last_hidden_state"]

        # The feature of patch i predicts the chars of patch i+1: drop the last
        # valid position on the encoder side and the first on the target side.
        left_shift_masks = masks * (masks.flip(1).cumsum(1).flip(1) > 1)
        masks[:, 0] = 0  # in-place: callers should not reuse `masks` afterwards

        encoded_patches = encoded_patches[left_shift_masks == 1]
        patches = patches[masks == 1]

        return self.char_level_decoder(encoded_patches, patches)

    def generate(self,
                 patches: torch.Tensor,
                 top_k=0,
                 top_p=1,
                 temperature=1.0):
        """
        Generate one patch conditioned on the given patches.
        :param patches: the prompt patches
        :param top_k: the top k for sampling (0 disables top-k filtering)
        :param top_p: the top p for sampling (1 disables nucleus filtering)
        :param temperature: the temperature for sampling
        :return: the generated patch as a list of char ids
        """
        # If the prompt does not end on a patch boundary, the trailing chars seed
        # the char-level decoder and only the full patches feed the encoder.
        if patches.shape[-1] % PATCH_SIZE != 0:
            tokens = patches[:, :, -(patches.shape[-1] % PATCH_SIZE):].squeeze(0, 1)
            tokens = torch.cat((torch.tensor([self.bos_token_id], device=self.device), tokens), dim=-1)
            patches = patches[:, :, :-(patches.shape[-1] % PATCH_SIZE)]
        else:
            tokens = torch.tensor([self.bos_token_id], device=self.device)

        patches = patches.reshape(len(patches), -1, PATCH_SIZE)  # [bs, seq, patch_size]
        encoded_patches = self.patch_level_decoder(patches)["last_hidden_state"]  # [bs, seq, hidden_size]
        generated_patch = []

        while True:
            prob = self.char_level_decoder.generate(encoded_patches[0][-1], tokens).cpu().detach().numpy()  # [128]
            # Re-normalize after every filtering step: the filters can zero out
            # mass or return unnormalized values, breaking the next sampler.
            prob = safe_normalize_probs(prob)
            prob = top_k_sampling(prob, top_k=top_k, return_probs=True)  # [128]
            prob = safe_normalize_probs(prob)
            prob = top_p_sampling(prob, top_p=top_p, return_probs=True)  # [128]
            prob = safe_normalize_probs(prob)
            token = temperature_sampling(prob, temperature=temperature)  # int
            # (removed unused local `char = chr(token)`)
            generated_patch.append(token)

            if len(tokens) >= PATCH_SIZE:  # or token == self.eos_token_id:
                break
            tokens = torch.cat((tokens, torch.tensor([token], device=self.device)), dim=0)

        return generated_patch
|
utils (5).py
ADDED
|
@@ -0,0 +1,421 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import random
|
| 3 |
+
import bisect
|
| 4 |
+
import json
|
| 5 |
+
import re
|
| 6 |
+
import numpy as np
|
| 7 |
+
from config import *
|
| 8 |
+
from transformers import GPT2Model, GPT2LMHeadModel, LlamaModel, LlamaForCausalLM, PreTrainedModel
|
| 9 |
+
from samplings import top_p_sampling, top_k_sampling, temperature_sampling
|
| 10 |
+
from tokenizers import Tokenizer
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class Patchilizer:
    """
    Converts ABC notation text to fixed-size char-id patches and back.
    Token ids: 0 = pad, 1 = BOS, 2 = EOS; all other ids are raw char codes.
    """

    def __init__(self, stream=PATCH_STREAM):
        self.stream = stream
        self.delimiters = ["|:", "::", ":|", "[|", "||", "|]", "|"]
        self.regexPattern = '(' + '|'.join(map(re.escape, self.delimiters)) + ')'
        self.bos_token_id = 1
        self.eos_token_id = 2
        self.special_token_id = 0

    def split_bars(self, body_lines):
        """
        Split a body of music into individual bars.
        """
        new_bars = []
        try:
            for line in body_lines:
                line_bars = re.split(self.regexPattern, line)
                line_bars = list(filter(None, line_bars))
                new_line_bars = []

                if len(line_bars) == 1:
                    new_line_bars = line_bars
                else:
                    # Re-attach each barline delimiter to the bar content that follows it.
                    if line_bars[0] in self.delimiters:
                        new_line_bars = [line_bars[i] + line_bars[i + 1] for i in range(0, len(line_bars), 2)]
                    else:
                        new_line_bars = [line_bars[0]] + [line_bars[i] + line_bars[i + 1] for i in range(1, len(line_bars), 2)]
                    # A trailing fragment without a voice marker belongs to the previous bar.
                    if 'V' not in new_line_bars[-1]:
                        new_line_bars[-2] += new_line_bars[-1]
                        new_line_bars = new_line_bars[:-1]
                new_bars += new_line_bars
        except Exception:
            # Best-effort: malformed lines yield whatever bars were collected so far.
            # (Narrowed from a bare `except`, which also swallowed KeyboardInterrupt.)
            pass

        return new_bars

    def split_patches(self, abc_text, patch_size=PATCH_SIZE, generate_last=False):
        """
        Cut abc_text into patch_size-char chunks, appending an EOS char to fill
        the last chunk unless preparing a (possibly partial) patch for generation.
        """
        if not generate_last and len(abc_text) % patch_size != 0:
            abc_text += chr(self.eos_token_id)
        return [abc_text[i: i + patch_size] for i in range(0, len(abc_text), patch_size)]

    def patch2chars(self, patch):
        """
        Convert a patch (sequence of char ids) back into text, stopping at EOS.
        """
        # Renamed local from `bytes` (shadowed the builtin) and removed a dead
        # `if idx < self.eos_token_id: pass` branch that had no effect.
        chars = ''
        for idx in patch:
            if idx == self.eos_token_id:
                break
            chars += chr(idx)
        return chars

    def patchilize_metadata(self, metadata_lines):
        """Patchilize each metadata line independently."""
        metadata_patches = []
        for line in metadata_lines:
            metadata_patches += self.split_patches(line)
        return metadata_patches

    def patchilize_tunebody(self, tunebody_lines, encode_mode='train'):
        """
        Patchilize the tunebody bar by bar. In 'generate' mode the final bar is
        left unterminated so the model can continue it.
        """
        tunebody_patches = []
        bars = self.split_bars(tunebody_lines)
        if encode_mode == 'train':
            for bar in bars:
                tunebody_patches += self.split_patches(bar)
        elif encode_mode == 'generate':
            for bar in bars[:-1]:
                tunebody_patches += self.split_patches(bar)
            tunebody_patches += self.split_patches(bars[-1], generate_last=True)
        return tunebody_patches

    def encode_train(self, abc_text, patch_length=PATCH_LENGTH, patch_size=PATCH_SIZE, add_special_patches=True, cut=True):
        """
        Encode an ABC tune into a list of patch id lists for training.
        In stream mode, over-long tunes are cut at a random line boundary
        (head / tail / middle) so training sees different windows.
        :param abc_text: full ABC source of one tune
        :param patch_length: maximum number of patches to keep
        :param patch_size: chars per patch
        :param add_special_patches: wrap the sequence in BOS/EOS patches
        :param cut: truncate to patch_length
        :return: list of patch id lists, each padded to patch_size
        """
        lines = abc_text.split('\n')
        lines = list(filter(None, lines))
        lines = [line + '\n' for line in lines]

        # The tunebody starts at the first voice line; everything before is metadata.
        tunebody_index = -1
        for i, line in enumerate(lines):
            if line.startswith('[V:'):
                tunebody_index = i
                break

        metadata_lines = lines[: tunebody_index]
        tunebody_lines = lines[tunebody_index:]

        if self.stream:
            # Prefix every tunebody line with its position: [r:index/lines_remaining]
            tunebody_lines = ['[r:' + str(line_index) + '/' + str(len(tunebody_lines) - line_index - 1) + ']' + line
                              for line_index, line in enumerate(tunebody_lines)]  # [r:n/n]

        metadata_patches = self.patchilize_metadata(metadata_lines)
        tunebody_patches = self.patchilize_tunebody(tunebody_lines, encode_mode='train')

        if add_special_patches:
            bos_patch = chr(self.bos_token_id) * (patch_size - 1) + chr(self.eos_token_id)
            eos_patch = chr(self.bos_token_id) + chr(self.eos_token_id) * (patch_size - 1)

            metadata_patches = [bos_patch] + metadata_patches
            tunebody_patches = tunebody_patches + [eos_patch]

        if self.stream:
            if len(metadata_patches) + len(tunebody_patches) > patch_length:
                # Candidate cut points are line starts (patches containing a newline
                # end a line, so index + 1 starts the next one).
                available_cut_indexes = [0] + [index + 1 for index, patch in enumerate(tunebody_patches) if '\n' in patch]
                line_index_for_cut_index = list(range(len(available_cut_indexes)))
                end_index = len(metadata_patches) + len(tunebody_patches) - patch_length
                biggest_index = bisect.bisect_left(available_cut_indexes, end_index)
                available_cut_indexes = available_cut_indexes[:biggest_index + 1]

                if len(available_cut_indexes) == 1:
                    choices = ['head']
                elif len(available_cut_indexes) == 2:
                    choices = ['head', 'tail']
                else:
                    choices = ['head', 'tail', 'middle']
                choice = random.choice(choices)

                if choice == 'head':
                    # Fixed: was `tunebody_patches[0:]` — a redundant full copy.
                    patches = metadata_patches + tunebody_patches
                else:
                    if choice == 'tail':
                        cut_index = len(available_cut_indexes) - 1
                    else:
                        cut_index = random.choice(range(1, len(available_cut_indexes) - 1))

                    line_index = line_index_for_cut_index[cut_index]
                    stream_tunebody_lines = tunebody_lines[line_index:]

                    stream_tunebody_patches = self.patchilize_tunebody(stream_tunebody_lines, encode_mode='train')
                    if add_special_patches:
                        stream_tunebody_patches = stream_tunebody_patches + [eos_patch]
                    patches = metadata_patches + stream_tunebody_patches
            else:
                patches = metadata_patches + tunebody_patches
        else:
            patches = metadata_patches + tunebody_patches

        if cut:
            patches = patches[: patch_length]

        # encode chars to ids, right-padding each patch with the pad id
        id_patches = []
        for patch in patches:
            id_patch = [ord(c) for c in patch] + [self.special_token_id] * (patch_size - len(patch))
            id_patches.append(id_patch)

        return id_patches

    def encode_generate(self, abc_code, patch_length=PATCH_LENGTH, patch_size=PATCH_SIZE, add_special_patches=True):
        """
        Encode an ABC prompt into patch id lists for generation.
        Unlike encode_train, a partial trailing patch is left unpadded so the
        model can continue it char by char.
        """
        lines = abc_code.split('\n')
        lines = list(filter(None, lines))

        tunebody_index = None
        for i, line in enumerate(lines):
            if line.startswith('[V:') or line.startswith('[r:'):
                tunebody_index = i
                break

        metadata_lines = lines[: tunebody_index]
        tunebody_lines = lines[tunebody_index:]

        metadata_lines = [line + '\n' for line in metadata_lines]
        if self.stream:
            # Keep the final line open (no trailing newline) unless the prompt closed it.
            if not abc_code.endswith('\n'):
                tunebody_lines = [tunebody_lines[i] + '\n' for i in range(len(tunebody_lines) - 1)] + [tunebody_lines[-1]]
            else:
                tunebody_lines = [tunebody_lines[i] + '\n' for i in range(len(tunebody_lines))]
        else:
            tunebody_lines = [line + '\n' for line in tunebody_lines]

        metadata_patches = self.patchilize_metadata(metadata_lines)
        tunebody_patches = self.patchilize_tunebody(tunebody_lines, encode_mode='generate')

        if add_special_patches:
            bos_patch = chr(self.bos_token_id) * (patch_size - 1) + chr(self.eos_token_id)
            metadata_patches = [bos_patch] + metadata_patches

        patches = metadata_patches + tunebody_patches
        patches = patches[: patch_length]

        # encode chars to ids; an unfinished trailing patch stays unpadded
        id_patches = []
        for patch in patches:
            # Fixed: was the global `PATCH_SIZE` — use the patch_size parameter
            # consistently with the rest of this method (same value by default).
            if len(patch) < patch_size and patch[-1] != chr(self.eos_token_id):
                id_patch = [ord(c) for c in patch]
            else:
                id_patch = [ord(c) for c in patch] + [self.special_token_id] * (patch_size - len(patch))
            id_patches.append(id_patch)

        return id_patches

    def decode(self, patches):
        """
        Decode patches into music.
        """
        return ''.join(self.patch2chars(patch) for patch in patches)
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
class PatchLevelDecoder(PreTrainedModel):
    """
    A Patch-level Decoder model for generating patch features in an auto-regressive manner.
    It inherits PreTrainedModel from transformers.
    """

    def __init__(self, config):
        super().__init__(config)
        # Projects a flattened one-hot patch (PATCH_SIZE chars x 128 ids) to the model width.
        self.patch_embedding = torch.nn.Linear(PATCH_SIZE * 128, config.n_embd)
        torch.nn.init.normal_(self.patch_embedding.weight, std=0.02)
        self.base = GPT2Model(config)

    def forward(self,
                patches: torch.Tensor,
                masks=None) -> torch.Tensor:
        """
        The forward pass of the patch-level decoder model.
        :param patches: the patches to be encoded (char ids in [0, 128))
        :param masks: optional attention masks for the patches
        :return: the encoded patches (GPT2Model output)
        """
        # One-hot encode each char id, then flatten every patch into a single vector.
        patches = torch.nn.functional.one_hot(patches, num_classes=128).to(self.dtype)
        patches = patches.reshape(len(patches), -1, PATCH_SIZE * 128)
        patches = self.patch_embedding(patches.to(self.device))

        # Fixed: was `masks==None`. Identity (`is None`) is the correct presence
        # test; `==` on a tensor operand is elementwise and unreliable here.
        if masks is None:
            return self.base(inputs_embeds=patches)
        return self.base(inputs_embeds=patches,
                         attention_mask=masks)
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
class CharLevelDecoder(PreTrainedModel):
    """
    A Char-level Decoder model for generating the chars within each patch in an auto-regressive manner
    based on the encoded patch features. It inherits PreTrainedModel from transformers.
    """
    def __init__(self, config):
        super().__init__(config)
        # Token id 0 pads patches; token id 1 marks the beginning of a patch.
        self.special_token_id = 0
        self.bos_token_id = 1

        # Underlying char-level language model.
        self.base = GPT2LMHeadModel(config)

    def forward(self,
                encoded_patches: torch.Tensor,
                target_patches: torch.Tensor):
        """
        The forward pass of the char-level decoder model.
        :param encoded_patches: the encoded patches
        :param target_patches: the target patches
        :return: the output of the model (includes the LM loss, since labels are passed)
        """
        # preparing the labels for model training: prepend a BOS token to every patch
        target_patches = torch.cat((torch.ones_like(target_patches[:,0:1])*self.bos_token_id, target_patches), dim=1)

        # pad positions (special_token_id) are excluded from the loss via label -100
        target_masks = target_patches == self.special_token_id
        labels = target_patches.clone().masked_fill_(target_masks, -100)

        # masking the labels for model training: attention mask is 0 wherever the label is ignored
        target_masks = torch.ones_like(labels)
        target_masks = target_masks.masked_fill_(labels == -100, 0)

        # select patches: subsample at most PATCH_SAMPLING_BATCH_SIZE patches to bound memory
        if PATCH_SAMPLING_BATCH_SIZE!=0 and PATCH_SAMPLING_BATCH_SIZE<target_patches.shape[0]:
            indices = list(range(len(target_patches)))
            random.shuffle(indices)
            selected_indices = sorted(indices[:PATCH_SAMPLING_BATCH_SIZE])

            target_patches = target_patches[selected_indices,:]
            target_masks = target_masks[selected_indices,:]
            encoded_patches = encoded_patches[selected_indices,:]

        # get input embeddings from the LM's token embedding table
        inputs_embeds = torch.nn.functional.embedding(target_patches, self.base.transformer.wte.weight)

        # concatenate the encoded patches with the input embeddings:
        # the patch feature replaces the BOS embedding at position 0
        inputs_embeds = torch.cat((encoded_patches.unsqueeze(1), inputs_embeds[:,1:,:]), dim=1)

        output = self.base(inputs_embeds=inputs_embeds,
                           attention_mask=target_masks,
                           labels=labels)

        return output

    def generate(self,
                 encoded_patch: torch.Tensor,
                 tokens: torch.Tensor):
        """
        The generate function for generating a patch based on the encoded patch and already generated tokens.
        :param encoded_patch: the encoded patch, shape [hidden_size]
        :param tokens: already generated tokens in the patch
        :return: the probability distribution of next token
        """
        encoded_patch = encoded_patch.reshape(1, 1, -1)  # [1, 1, hidden_size]
        tokens = tokens.reshape(1, -1)

        # Get input embeddings
        tokens = torch.nn.functional.embedding(tokens, self.base.transformer.wte.weight)

        # Concatenate the encoded patch with the input embeddings:
        # the patch feature takes the place of the first token's embedding
        tokens = torch.cat((encoded_patch, tokens[:,1:,:]), dim=1)

        # Get output from model
        outputs = self.base(inputs_embeds=tokens)

        # Get probabilities of next token (softmax over the last position's logits)
        probs = torch.nn.functional.softmax(outputs.logits.squeeze(0)[-1], dim=-1)

        return probs
|
| 331 |
+
|
| 332 |
+
def safe_normalize_probs(probs):
    """
    Sanitize an array-like of sampling probabilities and return a valid,
    normalized float64 distribution.

    NaN, negative, and infinite entries are zeroed (originally +inf slipped
    through the NaN/negative filter and produced a NaN distribution); a tiny
    epsilon keeps all-zero inputs normalizable. If no mass remains, all
    probability is assigned to index 0. An empty input is returned unchanged
    (originally it raised IndexError in the fallback branch).

    :param probs: array-like of candidate probabilities
    :return: np.ndarray of float64 summing to 1 (or empty if input is empty)
    """
    epsilon = 1e-12
    probs = np.array(probs, dtype=np.float64)
    # Zero out anything that cannot be a probability: NaN, negative, +/-inf.
    probs = np.where(~np.isfinite(probs) | (probs < 0), 0, probs)
    if probs.size == 0:
        return probs
    probs = probs + epsilon
    s = probs.sum()
    if s > 0:
        probs = probs / s
    else:
        # Degenerate fallback: deterministic point mass on the first entry.
        probs = np.zeros_like(probs)
        probs[0] = 1.0
    return probs
|
| 344 |
+
|
| 345 |
+
class NotaGenLMHeadModel(PreTrainedModel):
    """
    NotaGen is a language model with a hierarchical structure.
    It includes a patch-level decoder and a char-level decoder.
    The patch-level decoder is used to generate patch features in an auto-regressive manner.
    The char-level decoder is used to generate the chars within each patch in an auto-regressive manner.
    It inherits PreTrainedModel from transformers.
    """

    def __init__(self, encoder_config, decoder_config):
        super().__init__(encoder_config)
        self.special_token_id = 0
        self.bos_token_id = 1
        self.eos_token_id = 2
        self.patch_level_decoder = PatchLevelDecoder(encoder_config)
        self.char_level_decoder = CharLevelDecoder(decoder_config)

    def forward(self,
                patches: torch.Tensor,
                masks: torch.Tensor):
        """
        The forward pass of the NotaGen model.
        :param patches: the patches to be encoded
        :param masks: the attention masks for the patches (NOTE: mutated in place below)
        :return: the char-level decoder output (includes the LM loss)
        """
        patches = patches.reshape(len(patches), -1, PATCH_SIZE)
        encoded_patches = self.patch_level_decoder(patches, masks)["last_hidden_state"]

        # The feature of patch i predicts the chars of patch i+1: drop the last
        # valid position on the encoder side and the first on the target side.
        left_shift_masks = masks * (masks.flip(1).cumsum(1).flip(1) > 1)
        masks[:, 0] = 0  # in-place: callers should not reuse `masks` afterwards

        encoded_patches = encoded_patches[left_shift_masks == 1]
        patches = patches[masks == 1]

        return self.char_level_decoder(encoded_patches, patches)

    def generate(self,
                 patches: torch.Tensor,
                 top_k=0,
                 top_p=1,
                 temperature=1.0):
        """
        Generate one patch conditioned on the given patches.
        :param patches: the prompt patches
        :param top_k: the top k for sampling (0 disables top-k filtering)
        :param top_p: the top p for sampling (1 disables nucleus filtering)
        :param temperature: the temperature for sampling
        :return: the generated patch as a list of char ids
        """
        # If the prompt does not end on a patch boundary, the trailing chars seed
        # the char-level decoder and only the full patches feed the encoder.
        if patches.shape[-1] % PATCH_SIZE != 0:
            tokens = patches[:, :, -(patches.shape[-1] % PATCH_SIZE):].squeeze(0, 1)
            tokens = torch.cat((torch.tensor([self.bos_token_id], device=self.device), tokens), dim=-1)
            patches = patches[:, :, :-(patches.shape[-1] % PATCH_SIZE)]
        else:
            tokens = torch.tensor([self.bos_token_id], device=self.device)

        patches = patches.reshape(len(patches), -1, PATCH_SIZE)  # [bs, seq, patch_size]
        encoded_patches = self.patch_level_decoder(patches)["last_hidden_state"]  # [bs, seq, hidden_size]
        generated_patch = []

        while True:
            prob = self.char_level_decoder.generate(encoded_patches[0][-1], tokens).cpu().detach().numpy()  # [128]
            # Re-normalize after every filtering step: the filters can zero out
            # mass or return unnormalized values, breaking the next sampler.
            prob = safe_normalize_probs(prob)
            prob = top_k_sampling(prob, top_k=top_k, return_probs=True)
            prob = safe_normalize_probs(prob)
            prob = top_p_sampling(prob, top_p=top_p, return_probs=True)
            prob = safe_normalize_probs(prob)
            token = temperature_sampling(prob, temperature=temperature)  # int
            # (removed unused local `char = chr(token)`)
            generated_patch.append(token)

            if len(tokens) >= PATCH_SIZE:
                break
            tokens = torch.cat((tokens, torch.tensor([token], device=self.device)), dim=0)

        return generated_patch
|
utils.py
ADDED
|
@@ -0,0 +1,406 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import random
|
| 3 |
+
import bisect
|
| 4 |
+
import json
|
| 5 |
+
import re
|
| 6 |
+
import numpy as np
|
| 7 |
+
from config import *
|
| 8 |
+
from transformers import GPT2Model, GPT2LMHeadModel, LlamaModel, LlamaForCausalLM, PreTrainedModel
|
| 9 |
+
from samplings import top_p_sampling, top_k_sampling, temperature_sampling
|
| 10 |
+
from tokenizers import Tokenizer
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class Patchilizer:
    """Converts ABC notation text into fixed-size patches of character ids and back.

    A "patch" is a string of at most PATCH_SIZE characters; ids 0/1/2 are
    reserved for padding / BOS / EOS. Bars are split on ABC barline delimiters
    so each patch stays aligned with musical structure.
    """

    def __init__(self, stream=PATCH_STREAM):
        self.stream = stream  # enables the sliding-window ("stream") training cut in encode()
        self.delimiters = ["|:", "::", ":|", "[|", "||", "|]", "|"]
        self.regexPattern = '(' + '|'.join(map(re.escape, self.delimiters)) + ')'
        self.bos_token_id = 1
        self.eos_token_id = 2
        self.special_token_id = 0  # padding id

    def split_bars(self, body_lines):
        """
        Split a body of music into individual bars.

        Delimiters are kept attached to the bar that follows (or precedes) them;
        a trailing fragment without a voice marker ('V') is merged into the
        previous bar.
        """
        new_bars = []
        try:
            for line in body_lines:
                line_bars = re.split(self.regexPattern, line)
                line_bars = list(filter(None, line_bars))
                new_line_bars = []

                if len(line_bars) == 1:
                    new_line_bars = line_bars
                else:
                    if line_bars[0] in self.delimiters:
                        new_line_bars = [line_bars[i] + line_bars[i + 1] for i in range(0, len(line_bars), 2)]
                    else:
                        new_line_bars = [line_bars[0]] + [line_bars[i] + line_bars[i + 1] for i in range(1, len(line_bars), 2)]
                    if 'V' not in new_line_bars[-1]:
                        new_line_bars[-2] += new_line_bars[-1]
                        new_line_bars = new_line_bars[:-1]
                new_bars += new_line_bars
        except Exception:
            # Best-effort: a malformed line yields whatever bars were collected
            # so far.  (Was a bare `except:`; narrowed so Ctrl-C still works.)
            pass

        return new_bars

    def split_patches(self, abc_text, patch_size=PATCH_SIZE, generate_last=False):
        """Chop abc_text into patch_size chunks; append EOS to an uneven tail
        unless the last patch is still being generated."""
        if not generate_last and len(abc_text) % patch_size != 0:
            abc_text += chr(self.eos_token_id)
        patches = [abc_text[i: i + patch_size] for i in range(0, len(abc_text), patch_size)]
        return patches

    def patch2chars(self, patch):
        """
        Convert a patch (sequence of char ids) back into text, stopping at EOS.
        BOS/padding ids below EOS are kept as-is, matching the original encoding.
        """
        chars = ''
        for idx in patch:
            if idx == self.eos_token_id:
                break
            chars += chr(idx)
        return chars

    def patchilize_metadata(self, metadata_lines):
        """Patchify each metadata line independently."""
        metadata_patches = []
        for line in metadata_lines:
            metadata_patches += self.split_patches(line)
        return metadata_patches

    def patchilize_tunebody(self, tunebody_lines, encode_mode='train'):
        """Patchify the tune body bar by bar; in 'generate' mode the final bar
        is left open (no EOS) so generation can continue it."""
        tunebody_patches = []
        bars = self.split_bars(tunebody_lines)
        if encode_mode == 'train':
            for bar in bars:
                tunebody_patches += self.split_patches(bar)
        elif encode_mode == 'generate':
            for bar in bars[:-1]:
                tunebody_patches += self.split_patches(bar)
            tunebody_patches += self.split_patches(bars[-1], generate_last=True)
        return tunebody_patches

    def encode(self, abc_text, patch_length=PATCH_LENGTH, patch_size=PATCH_SIZE, add_special_patches=True, cut=True):
        """
        Encode a full ABC tune into a list of id patches for training.

        The tune body starts at the first line beginning with '[r:'.  When
        self.stream is set and the tune exceeds patch_length, a random cut
        ('head' / 'tail' / 'middle') keeps a window that fits.
        The `cut` parameter is currently unused but kept for interface
        compatibility.
        """
        lines = abc_text.split('\n')
        lines = list(filter(None, lines))
        lines = [line + '\n' for line in lines]

        tunebody_index = -1
        for i, line in enumerate(lines):
            if line.startswith('[r:'):
                tunebody_index = i
                break

        metadata_lines = lines[: tunebody_index]
        tunebody_lines = lines[tunebody_index:]

        metadata_patches = self.patchilize_metadata(metadata_lines)
        tunebody_patches = self.patchilize_tunebody(tunebody_lines, encode_mode='train')

        if add_special_patches:
            # BOS patch: (patch_size-1) BOS chars + EOS; EOS patch: BOS + (patch_size-1) EOS chars.
            bos_patch = chr(self.bos_token_id) * (patch_size - 1) + chr(self.eos_token_id)
            eos_patch = chr(self.bos_token_id) + chr(self.eos_token_id) * (patch_size - 1)

            metadata_patches = [bos_patch] + metadata_patches
            tunebody_patches = tunebody_patches + [eos_patch]

        if self.stream:
            if len(metadata_patches) + len(tunebody_patches) > patch_length:
                # Cut points are patch indexes just after a newline (bar-line boundaries).
                available_cut_indexes = [0] + [index + 1 for index, patch in enumerate(tunebody_patches) if
                                               '\n' in patch]
                line_index_for_cut_index = list(range(len(available_cut_indexes)))
                end_index = len(metadata_patches) + len(tunebody_patches) - patch_length
                biggest_index = bisect.bisect_left(available_cut_indexes, end_index)
                available_cut_indexes = available_cut_indexes[:biggest_index + 1]

                if len(available_cut_indexes) == 1:
                    choices = ['head']
                elif len(available_cut_indexes) == 2:
                    choices = ['head', 'tail']
                else:
                    choices = ['head', 'tail', 'middle']
                choice = random.choice(choices)
                if choice == 'head':
                    patches = metadata_patches + tunebody_patches[0:]
                else:
                    if choice == 'tail':
                        cut_index = len(available_cut_indexes) - 1
                    else:
                        cut_index = random.choice(range(1, len(available_cut_indexes) - 1))

                    line_index = line_index_for_cut_index[cut_index]
                    stream_tunebody_lines = tunebody_lines[line_index:]

                    stream_tunebody_patches = self.patchilize_tunebody(stream_tunebody_lines, encode_mode='train')
                    if add_special_patches:
                        stream_tunebody_patches = stream_tunebody_patches + [eos_patch]
                    patches = metadata_patches + stream_tunebody_patches
            else:
                patches = metadata_patches + tunebody_patches
        else:
            patches = metadata_patches + tunebody_patches

        patches = patches[: patch_length]

        # Encode chars to ids and right-pad each patch to patch_size.
        id_patches = []
        for patch in patches:
            id_patch = [ord(c) for c in patch] + [self.special_token_id] * (patch_size - len(patch))
            id_patches.append(id_patch)

        return id_patches

    def encode_generate(self, abc_code, patch_length=PATCH_LENGTH, patch_size=PATCH_SIZE, add_special_patches=True):
        """
        Encode a (possibly partial) ABC prompt for generation.

        The last patch is left unpadded when it is still open (shorter than
        patch_size and not EOS-terminated) so the model can continue it.
        """
        lines = abc_code.split('\n')
        lines = list(filter(None, lines))

        tunebody_index = None
        for i, line in enumerate(lines):
            if line.startswith('[V:') or line.startswith('[r:'):
                tunebody_index = i
                break

        metadata_lines = lines[: tunebody_index]
        tunebody_lines = lines[tunebody_index:]

        metadata_lines = [line + '\n' for line in metadata_lines]
        if self.stream:
            # Keep the final line open (no newline) when the prompt ends mid-line.
            if not abc_code.endswith('\n'):
                tunebody_lines = [tunebody_lines[i] + '\n' for i in range(len(tunebody_lines) - 1)] + [tunebody_lines[-1]]
            else:
                tunebody_lines = [tunebody_lines[i] + '\n' for i in range(len(tunebody_lines))]
        else:
            tunebody_lines = [line + '\n' for line in tunebody_lines]

        metadata_patches = self.patchilize_metadata(metadata_lines)
        tunebody_patches = self.patchilize_tunebody(tunebody_lines, encode_mode='generate')

        if add_special_patches:
            bos_patch = chr(self.bos_token_id) * (patch_size - 1) + chr(self.eos_token_id)
            metadata_patches = [bos_patch] + metadata_patches

        patches = metadata_patches + tunebody_patches
        patches = patches[: patch_length]

        # Encode to ids; only closed patches are padded.
        # (Fixed: was comparing against the global PATCH_SIZE instead of the
        # patch_size parameter — identical for the default call.)
        id_patches = []
        for patch in patches:
            if len(patch) < patch_size and patch[-1] != chr(self.eos_token_id):
                id_patch = [ord(c) for c in patch]
            else:
                id_patch = [ord(c) for c in patch] + [self.special_token_id] * (patch_size - len(patch))
            id_patches.append(id_patch)

        return id_patches

    def decode(self, patches):
        """
        Decode patches into music.
        """
        return ''.join(self.patch2chars(patch) for patch in patches)
| 215 |
+
|
| 216 |
+
class PatchLevelDecoder(PreTrainedModel):
    """
    A Patch-level Decoder model for generating patch features in an auto-regressive manner.
    It inherits PreTrainedModel from transformers.
    """

    def __init__(self, config):
        super().__init__(config)
        # Each patch is PATCH_SIZE chars one-hot encoded over 128 ids, then
        # linearly projected to the transformer hidden size.
        self.patch_embedding = torch.nn.Linear(PATCH_SIZE * 128, config.n_embd)
        torch.nn.init.normal_(self.patch_embedding.weight, std=0.02)
        self.base = GPT2Model(config)

    def forward(self,
                patches: torch.Tensor,
                masks=None) -> torch.Tensor:
        """
        The forward pass of the patch-level decoder model.
        :param patches: the patches to be encoded (char ids)
        :param masks: optional attention mask over patches
        :return: the transformer output over the encoded patches
        """
        patches = torch.nn.functional.one_hot(patches, num_classes=128).to(self.dtype)
        patches = patches.reshape(len(patches), -1, PATCH_SIZE * 128)
        patches = self.patch_embedding(patches.to(self.device))

        # Fixed: identity check `is None` instead of `== None` (the latter
        # would attempt an element-wise comparison on a tensor mask).
        if masks is None:
            return self.base(inputs_embeds=patches)
        return self.base(inputs_embeds=patches,
                         attention_mask=masks)
| 246 |
+
|
| 247 |
+
class CharLevelDecoder(PreTrainedModel):
    """
    A Char-level Decoder model for generating the chars within each patch in an auto-regressive manner
    based on the encoded patch features. It inherits PreTrainedModel from transformers.
    """
    def __init__(self, config):
        super().__init__(config)
        self.special_token_id = 0  # padding id inside a patch
        self.bos_token_id = 1      # BOS id prepended to every patch
        self.base = GPT2LMHeadModel(config)

    def forward(self,
                encoded_patches: torch.Tensor,
                target_patches: torch.Tensor):
        """
        The forward pass of the char-level decoder model.
        :param encoded_patches: the encoded patches (one feature vector per patch)
        :param target_patches: the target patches (char ids)
        :return: the summed log-probability of all non-padding target chars
        """
        # Prepend a BOS column so each patch is conditioned on a start token.
        target_patches = torch.cat((torch.ones_like(target_patches[:, 0:1]) * self.bos_token_id,
                                    target_patches), dim=1) # [patch_len, patch_size + 1]

        # Positions holding the padding id are excluded from the loss (-100).
        target_masks = target_patches == self.special_token_id # [patch_len, patch_size + 1]
        # masked_fill_ mutates the clone in place; target_patches itself stays intact.
        labels = target_patches.clone().masked_fill_(target_masks, -100)

        # Rebind target_masks as the attention mask: 1 for real tokens, 0 for padding.
        target_masks = torch.ones_like(labels)
        target_masks = target_masks.masked_fill_(labels == -100, 0)

        # Embed the BOS-prefixed targets, then replace the BOS embedding with
        # the patch feature coming from the patch-level decoder.
        input_embeds = torch.nn.functional.embedding(target_patches, self.base.transformer.wte.weight)
        input_embeds = torch.cat((encoded_patches.unsqueeze(1), input_embeds[:, 1:, :]), dim=1)
        logits = self.base(inputs_embeds=input_embeds,
                           attention_mask=target_masks).logits # [patch_len, patch_size + 1, vocab_size]
        # Drop the last position so logits align with next-token targets.
        logits = logits[:, :-1, :]
        # Gather per-token log-probs of the actual targets, keep only real tokens.
        token_logps = torch.gather(logits.log_softmax(-1), dim=-1, index=target_patches[:, 1:].unsqueeze(-1)).squeeze(-1) # [patch_len, patch_size]
        token_logps = token_logps[target_masks[:, 1:] == 1]
        all_logps = token_logps.sum()

        return all_logps

    def generate(self,
                 encoded_patch: torch.Tensor,  # [hidden_size]
                 tokens: torch.Tensor):        # [1]
        """
        The generate function for generating a patch based on the encoded patch and already generated tokens.
        :param encoded_patch: the encoded patch
        :param tokens: already generated tokens in the patch
        :return: the probability distribution of next token
        """
        encoded_patch = encoded_patch.reshape(1, 1, -1)  # [1, 1, hidden_size]
        tokens = tokens.reshape(1, -1)

        # Get input embeddings
        tokens = torch.nn.functional.embedding(tokens, self.base.transformer.wte.weight)

        # Concatenate the encoded patch with the input embeddings (the patch
        # feature takes the place of the first/BOS token embedding).
        tokens = torch.cat((encoded_patch, tokens[:, 1:, :]), dim=1)

        # Get output from model
        outputs = self.base(inputs_embeds=tokens)

        # Get probabilities of next token (softmax over the last position).
        probs = torch.nn.functional.softmax(outputs.logits.squeeze(0)[-1], dim=-1)

        return probs
+
|
| 314 |
+
def safe_normalize_probs(probs):
    """Return *probs* as a valid float64 probability distribution.

    NaN and negative entries are zeroed, a tiny epsilon is added to every
    entry, and the vector is renormalized to sum to 1.  Should the total
    mass still be non-positive, all mass goes to the first entry.
    """
    eps = 1e-12
    arr = np.asarray(probs, dtype=np.float64).copy()
    bad = np.isnan(arr) | (arr < 0)
    arr[bad] = 0
    arr += eps
    total = arr.sum()
    if total > 0:
        return arr / total
    fallback = np.zeros_like(arr)
    fallback[0] = 1.0
    return fallback
+
|
| 327 |
+
class NotaGenLMHeadModel(PreTrainedModel):
    """
    NotaGen is a language model with a hierarchical structure.
    It includes a patch-level decoder and a char-level decoder.
    The patch-level decoder is used to generate patch features in an auto-regressive manner.
    The char-level decoder is used to generate the chars within each patch in an auto-regressive manner.
    It inherits PreTrainedModel from transformers.
    """
    def __init__(self, encoder_config, decoder_config):
        super().__init__(encoder_config)
        self.special_token_id = 0  # padding id
        self.bos_token_id = 1
        self.eos_token_id = 2
        self.patch_level_decoder = PatchLevelDecoder(encoder_config)
        self.char_level_decoder = CharLevelDecoder(decoder_config)

    def forward(self,
                patches: torch.Tensor,
                masks: torch.Tensor):
        """
        The forward pass of the bGPT model.
        :param patches: the patches to be encoded
        :param masks: the masks for the patches
        :return: the char-level decoder output (summed target log-probs)
        """
        patches = patches.reshape(len(patches), -1, PATCH_SIZE)
        encoded_patches = self.patch_level_decoder(patches, masks)["last_hidden_state"]

        # left_shift_masks keeps every valid patch except the last one of each
        # sequence: each patch feature predicts the *next* patch's chars.
        left_shift_masks = masks * (masks.flip(1).cumsum(1).flip(1) > 1)
        # Drop the first patch from the targets (it has no predecessor).
        # NOTE(review): this mutates the caller's `masks` tensor in place — confirm intended.
        masks[:, 0] = 0

        encoded_patches = encoded_patches[left_shift_masks == 1]
        patches = patches[masks == 1]

        return self.char_level_decoder(encoded_patches, patches)

    def generate(self,
                 patches: torch.Tensor,
                 top_k=0,
                 top_p=1,
                 temperature=1.0):
        """
        The generate function for generating patches based on patches.
        :param patches: the patches to be encoded (flattened char ids)
        :param top_k: the top k for sampling
        :param top_p: the top p for sampling
        :param temperature: the temperature for sampling
        :return: the generated patch (list of char ids)
        """
        # If the id stream does not end on a patch boundary, the remainder is
        # the partially generated patch: use it as the char-level prompt and
        # keep only whole patches for the patch-level decoder.
        if patches.shape[-1] % PATCH_SIZE != 0:
            # NOTE(review): multi-dim squeeze(0, 1) requires torch >= 2.0 — confirm.
            tokens = patches[:,:,-(patches.shape[-1]%PATCH_SIZE):].squeeze(0, 1)
            tokens = torch.cat((torch.tensor([self.bos_token_id], device=self.device), tokens), dim=-1)
            patches = patches[:,:,:-(patches.shape[-1]%PATCH_SIZE)]
        else:
            tokens = torch.tensor([self.bos_token_id], device=self.device)

        patches = patches.reshape(len(patches), -1, PATCH_SIZE)  # [bs, seq, patch_size]
        encoded_patches = self.patch_level_decoder(patches)["last_hidden_state"]  # [bs, seq, hidden_size]
        generated_patch = []

        # Sample one char at a time, conditioned on the last patch feature,
        # until the patch is full.
        while True:
            prob = self.char_level_decoder.generate(encoded_patches[0][-1], tokens).cpu().detach().numpy()  # [128]
            # Renormalize between each filtering step to keep a valid distribution.
            prob = safe_normalize_probs(prob)
            prob = top_k_sampling(prob, top_k=top_k, return_probs=True)  # [128]
            prob = safe_normalize_probs(prob)
            prob = top_p_sampling(prob, top_p=top_p, return_probs=True)  # [128]
            prob = safe_normalize_probs(prob)
            token = temperature_sampling(prob, temperature=temperature)  # int
            char = chr(token)  # (unused; kept for parity with the original)
            generated_patch.append(token)

            if len(tokens) >= PATCH_SIZE:# or token == self.eos_token_id:
                break
            else:
                tokens = torch.cat((tokens, torch.tensor([token], device=self.device)), dim=0)

        return generated_patch
|
| 405 |
+
|
| 406 |
+
|
xml2abc.py
ADDED
|
@@ -0,0 +1,1609 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
# coding=latin-1
|
| 3 |
+
'''
|
| 4 |
+
Copyright (C) 2012-2018: W.G. Vree
|
| 5 |
+
Contributions: M. Tarenskeen, N. Liberg, Paul Villiger, Janus Meuris, Larry Myerscough,
|
| 6 |
+
Dick Jackson, Jan Wybren de Jong, Mark Zealey.
|
| 7 |
+
|
| 8 |
+
This program is free software; you can redistribute it and/or modify it under the terms of the
|
| 9 |
+
Lesser GNU General Public License as published by the Free Software Foundation;
|
| 10 |
+
|
| 11 |
+
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
| 12 |
+
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
| 13 |
+
See the Lesser GNU General Public License for more details. <http://www.gnu.org/licenses/lgpl.html>.
|
| 14 |
+
'''
|
| 15 |
+
|
| 16 |
+
'''Small revisions made for NotaGen to improve the success rate of conversion.'''
|
| 17 |
+
|
| 18 |
+
try: import xml.etree.cElementTree as E
|
| 19 |
+
except: import xml.etree.ElementTree as E
|
| 20 |
+
import os, sys, types, re, math
|
| 21 |
+
|
| 22 |
+
VERSION = 143  # xml2abc release number

# Python 2/3 compatibility shims: alias the concrete sequence types and the
# platform's largest integer so the rest of the module stays version-agnostic.
python3 = sys.version_info.major > 2
if python3:
    tupletype = tuple
    listtype = list
    max_int = sys.maxsize  # sys.maxint was removed in Python 3
else:
    tupletype = types.TupleType
    listtype = types.ListType
    max_int = sys.maxint
|
| 34 |
+
# Maps MusicXML notations/ sub-elements to their ABC decoration equivalents.
note_ornamentation_map = { # for notations/, modified from EasyABC
    'ornaments/trill-mark': 'T',
    'ornaments/mordent': 'M',
    'ornaments/inverted-mordent': 'P',
    'ornaments/turn': '!turn!',
    'ornaments/inverted-turn': '!invertedturn!',
    'technical/up-bow': 'u',
    'technical/down-bow': 'v',
    'technical/harmonic': '!open!',
    'technical/open-string': '!open!',
    'technical/stopped': '!plus!',
    'technical/snap-pizzicato': '!snap!',
    'technical/thumb-position': '!thumb!',
    'articulations/accent': '!>!',
    'articulations/strong-accent': '!^!',
    'articulations/staccato': '.',
    # Fixed: 'articulations/staccatissimo' was listed twice with the same
    # value; the duplicate literal key has been removed.
    'articulations/staccatissimo': '!wedge!', # not sure whether this is the right translation
    'articulations/scoop': '!slide!',
    'fermata': '!fermata!',
    'arpeggiate': '!arpeggio!',
    'articulations/tenuto': '!tenuto!',
    'articulations/spiccato': '!wedge!', # not sure whether this is the right translation
    'articulations/breath-mark': '!breath!', # this may need to be tested to make sure it appears on the right side of the note
    'articulations/detached-legato': '!tenuto!.',
}
|
| 61 |
+
# Maps MusicXML dynamics element names (direction/direction-type/dynamics/)
# to ABC decorations; every mark translates uniformly to '!<name>!'.
dynamics_map = dict(
    (mark, '!%s!' % mark)
    for mark in ('p', 'pp', 'ppp', 'pppp', 'f', 'ff', 'fff', 'ffff', 'mp', 'mf', 'sfz')
)
|
| 75 |
+
# SVG definitions emitted with the ABC output (a '%%beginsvg' block) for
# percussion noteheads (x, circle-x, triangle, square, diamond variants).
# NOTE(review): the empty <text> elements originally held latin-1 notehead
# glyph characters (the file declares coding=latin-1); verify the glyphs
# survived re-encoding of this file.
percSvg = '''%%beginsvg
<defs>
<text id="x" x="-3" y="0"></text>
<text id="x-" x="-3" y="0"></text>
<text id="x+" x="-3" y="0"></text>
<text id="normal" x="-3.7" y="0"></text>
<text id="normal-" x="-3.7" y="0"></text>
<text id="normal+" x="-3.7" y="0"></text>
<g id="circle-x"><text x="-3" y="0"></text><circle r="4" class="stroke"></circle></g>
<g id="circle-x-"><text x="-3" y="0"></text><circle r="4" class="stroke"></circle></g>
<path id="triangle" d="m-4 -3.2l4 6.4 4 -6.4z" class="stroke" style="stroke-width:1.4"></path>
<path id="triangle-" d="m-4 -3.2l4 6.4 4 -6.4z" class="stroke" style="stroke-width:1.4"></path>
<path id="triangle+" d="m-4 -3.2l4 6.4 4 -6.4z" class="stroke" style="fill:#000"></path>
<path id="square" d="m-3.5 3l0 -6.2 7.2 0 0 6.2z" class="stroke" style="stroke-width:1.4"></path>
<path id="square-" d="m-3.5 3l0 -6.2 7.2 0 0 6.2z" class="stroke" style="stroke-width:1.4"></path>
<path id="square+" d="m-3.5 3l0 -6.2 7.2 0 0 6.2z" class="stroke" style="fill:#000"></path>
<path id="diamond" d="m0 -3l4.2 3.2 -4.2 3.2 -4.2 -3.2z" class="stroke" style="stroke-width:1.4"></path>
<path id="diamond-" d="m0 -3l4.2 3.2 -4.2 3.2 -4.2 -3.2z" class="stroke" style="stroke-width:1.4"></path>
<path id="diamond+" d="m0 -3l4.2 3.2 -4.2 3.2 -4.2 -3.2z" class="stroke" style="fill:#000"></path>
</defs>
%%endsvg'''

# CSS and clearing rectangles used when rendering tablature fret numbers.
tabSvg = '''%%beginsvg
<style type="text/css">
.bf {font-family:sans-serif; font-size:7px}
</style>
<defs>
<rect id="clr" x="-3" y="-1" width="6" height="5" fill="white"></rect>
<rect id="clr2" x="-3" y="-1" width="11" height="5" fill="white"></rect>'''

# Templates for tablature note-head glyphs; the two %s slots are the glyph id
# suffix and the fret text (kopSvg2 uses the wider clearing rectangle).
kopSvg = '<g id="kop%s" class="bf"><use xlink:href="#clr"></use><text x="-2" y="3">%s</text></g>\n'
kopSvg2 = '<g id="kop%s" class="bf"><use xlink:href="#clr2"></use><text x="-2" y="3">%s</text></g>\n'
|
| 108 |
+
def info(s, warn=1):
    """Write a diagnostic message to stderr, prefixed '-- ' when warn is truthy."""
    prefix = '-- ' if warn else ''
    sys.stderr.write(prefix + s + '\n')
|
| 110 |
+
#-------------------
|
| 111 |
+
# data abstractions
|
| 112 |
+
#-------------------
|
| 113 |
+
class Measure:
    """Per-measure bookkeeping for one part while translating MusicXML."""

    def __init__(self, p):
        self.reset()
        self.ixp = p       # part number
        self.ixm = 0       # measure number
        self.mdur = 0      # measure duration (nominal metre value in divisions)
        self.divs = 0      # number of divisions per 1/4
        self.mtr = 4, 4    # meter

    def reset(self):
        """Clear the state that is re-read for every measure."""
        self.attr = ''     # measure signatures, tempo
        self.lline = ''    # left barline; only ':' at start of a repeat, otherwise empty
        self.rline = '|'   # right barline
        self.lnum = ''     # (left) volta number
| 128 |
+
class Note:
    """One note (or growing chord) event on the translation time line."""

    def __init__(self, dur=0, n=None):
        self.tijd = 0               # onset time in XML division units
        self.dur = dur              # duration of a note in XML divisions
        self.fact = None            # time modification for tuplet notes (num, div)
        self.tup = ['']             # start(s) and/or stop(s) of tuplet
        self.tupabc = ''            # abc tuplet string to issue before the note
        self.beam = 0               # 1 = beamed
        self.grace = 0              # 1 = grace note
        self.before = []            # abc string(s) that go before the note/chord
        self.after = ''             # the same after the note/chord
        self.ns = [n] if n else []  # notes collected into the chord
        self.lyrs = {}              # {number -> syllable}
        self.tab = None             # (string number, fret number)
        self.ntdec = ''             # !string!, !courtesy!
| 144 |
+
class Elem:
    """Any abc fragment that is not a note, placed on a voice's time line."""

    def __init__ (s, string):
        s.str = string  # the abc text of this element
        s.tijd = 0      # its time position in XML division units
|
| 148 |
+
|
| 149 |
+
class Counter:
    """Per-voice counters for real notes and skipped items within one part."""

    def inc (s, key, voice):
        """Increment counter `key` for `voice` (creating it at 0 if absent)."""
        s.counters [key][voice] = s.counters [key].get (voice, 0) + 1

    def clear (s, vnums):
        """Reset all counters to zero for every voice id in vnums."""
        zeroed = dict.fromkeys (vnums.keys (), 0)
        s.counters = {'note': dict (zeroed), 'nopr': dict (zeroed), 'nopt': dict (zeroed)}

    def getv (s, key, voice):
        """Return the current value of counter `key` for `voice`."""
        return s.counters[key][voice]

    def prcnt (s, ip):
        """Print a summary of all non-zero counters of part ip."""
        for iv in s.counters ['note']:
            if s.getv ('nopr', iv) != 0:
                info ( 'part %d, voice %d has %d skipped non printable notes' % (ip, iv, s.getv ('nopr', iv)))
            if s.getv ('nopt', iv) != 0:
                info ( 'part %d, voice %d has %d notes without pitch' % (ip, iv, s.getv ('nopt', iv)))
            if s.getv ('note', iv) == 0:    # no real notes counted in this voice
                info ( 'part %d, skipped empty voice %d' % (ip, iv))
|
| 163 |
+
|
| 164 |
+
class Music:
    """Accumulates the translated music of one part, voice by voice and
    measure by measure, and flushes it to the global abc output (abcOut).

    Time bookkeeping (s.tijd / s.maxtime / s.vtimes) is in XML division
    units; voice ids (s.vnums) are the XML voice numbers of one part.
    """
    def __init__(s, options):
        s.tijd = 0              # the current time
        s.maxtime = 0           # maximum time in a measure
        s.gMaten = []           # [voices,.. for all measures in a part]
        s.gLyrics = []          # [{num: (abc_lyric_string, melis)},.. for all measures in a part]
        s.vnums = {}            # all used voice id's in a part (xml voice id's == numbers)
        s.cnt = Counter ()      # global counter object
        s.vceCnt = 1            # the global voice count over all parts
        s.lastnote = None       # the last real note record inserted in s.voices
        s.bpl = options.b       # the max number of bars per line when writing abc
        s.cpl = options.n       # the number of chars per line when writing abc
        s.repbra = 0            # true if volta is used somewhere
        s.nvlt = options.v      # no volta on higher voice numbers
        s.jscript = options.j   # compatibility with javascript version

    def initVoices (s, newPart=0):
        """Reset the per-measure voice buffers; clear counters once per part."""
        s.vtimes, s.voices, s.lyrics = {}, {}, {}
        for v in s.vnums:
            s.vtimes [v] = 0    # {voice: the end time of the last item in each voice}
            s.voices [v] = []   # {voice: [Note|Elem, ..]}
            s.lyrics [v] = []   # {voice: [{num: syl}, ..]}
        if newPart: s.cnt.clear (s.vnums)   # clear counters once per part

    def incTime (s, dt):
        """Advance current time by dt divisions, clamping at 0 and tracking the maximum."""
        s.tijd += dt
        if s.tijd < 0: s.tijd = 0           # erroneous <backup> element
        if s.tijd > s.maxtime: s.maxtime = s.tijd

    def appendElemCv (s, voices, elem):
        """Append abc string elem to every voice in voices."""
        for v in voices:
            s.appendElem (v, elem)          # insert element in all voices

    def insertElem (s, v, elem):
        """Insert abc string elem at the START of voice v in the current measure."""
        obj = Elem (elem)
        obj.tijd = 0                        # because voice is sorted later
        s.voices [v].insert (0, obj)

    def appendObj (s, v, obj, dur):
        """Append a Note/Elem at the current time and advance time by dur."""
        obj.tijd = s.tijd
        s.voices [v].append (obj)
        s.incTime (dur)
        if s.tijd > s.vtimes[v]: s.vtimes[v] = s.tijd   # don't update for inserted earlier items

    def appendElem (s, v, elem, tel=0):
        """Append a zero-duration abc element; count it as a note when tel is set."""
        s.appendObj (v, Elem (elem), 0)
        if tel: s.cnt.inc ('note', v)   # count number of certain elements in each voice (in addition to notes)

    def appendElemT (s, v, elem, tijd):
        """Append an abc element at an explicitly specified time (not the current one)."""
        obj = Elem (elem)
        obj.tijd = tijd
        s.voices [v].append (obj)

    def appendNote (s, v, note, noot):
        """Append note (with abc pitch string noot) to voice v and advance time."""
        note.ns.append (note.ntdec + noot)
        s.appendObj (v, note, int (note.dur))
        s.lastnote = note                       # remember last note/rest for later modifications (chord, grace)
        if noot != 'z' and noot != 'x':         # real notes and grace notes
            s.cnt.inc ('note', v)               # count number of real notes in each voice
            if not note.grace:                  # for every real note
                s.lyrics[v].append (note.lyrs)  # even when it has no lyrics

    def getLastRec (s, voice):
        """Return the last record of `voice` in the previous measure, or None."""
        if s.gMaten: return s.gMaten[-1][voice][-1]     # the last record in the last measure
        return None                                     # no previous records in the first measure

    def getLastMelis (s, voice, num):
        """Return the melisma state of lyric line num at the end of the previous measure."""
        if s.gLyrics:
            lyrdict = s.gLyrics[-1][voice]              # the previous lyrics dict in this voice
            if num in lyrdict: return lyrdict[num][1]   # lyrdict = num -> (lyric string, melisma)
        return 0                                        # no previous lyrics in voice or line number

    def addChord (s, note, noot):
        """Merge note into the chord of s.lastnote.

        Careful: assumes chord notes follow immediately after the first one.
        """
        for d in note.before:                   # put all decorations before chord
            if d not in s.lastnote.before:
                s.lastnote.before += [d]
        s.lastnote.ns.append (note.ntdec + noot)

    def addBar (s, lbrk, m):
        """Close the current measure m: fix barlines/voltas, sort each voice on
        time, fold this measure's lyrics, and push everything onto the
        per-part accumulators (gMaten/gLyrics).  lbrk is an optional abc
        linebreak character appended after the previous barline."""
        if m.mdur and s.maxtime > m.mdur: info ('measure %d in part %d longer than metre' % (m.ixm+1, m.ixp+1))
        s.tijd = s.maxtime                      # the time of the bar lines inserted here
        for v in s.vnums:
            if m.lline or m.lnum:               # if left barline or left volta number
                p = s.getLastRec (v)            # get the previous barline record
                if p:                           # in measure 1 no previous measure is available
                    x = p.str                   # p.str is the ABC barline string
                    if m.lline:                 # append begin of repeat, m.lline == ':'
                        x = (x + m.lline).replace (':|:','::').replace ('||','|')
                    if s.nvlt == 3:             # add volta number only to lowest voice in part 0
                        if m.ixp + v == min (s.vnums): x += m.lnum
                    elif m.lnum:                # new behaviour with I:repbra 0
                        x += m.lnum             # add volta number(s) or text to all voices
                        s.repbra = 1            # signal occurrence of a volta
                    p.str = x                   # modify previous right barline
                elif m.lline:                   # begin of new part and left repeat bar is required
                    s.insertElem (v, '|:')
            if lbrk:
                p = s.getLastRec (v)            # get the previous barline record
                if p: p.str += lbrk             # insert linebreak char after the barlines+volta
            if m.attr:                          # insert signatures at front of buffer
                s.insertElem (v, '%s' % m.attr)
            s.appendElem (v, ' %s' % m.rline)   # insert current barline record at time maxtime
            s.voices[v] = sortMeasure (s.voices[v], m)  # make all times consistent
            lyrs = s.lyrics[v]                  # [{number: sylabe}, .. for all notes]
            lyrdict = {}                        # {number: (abc_lyric_string, melis)} for this voice
            nums = [num for d in lyrs for num in d.keys ()] # the lyrics numbers in this measure
            maxNums = max (nums + [0])          # the highest lyrics number in this measure
            for i in range (maxNums, 0, -1):
                xs = [syldict.get (i, '') for syldict in lyrs]  # collect the syllabi with number i
                melis = s.getLastMelis (v, i)   # get melisma from last measure
                lyrdict [i] = abcLyr (xs, melis)
            s.lyrics[v] = lyrdict               # {number: (abc_lyric_string, melis)} for this measure
            mkBroken (s.voices[v])
        s.gMaten.append (s.voices)
        s.gLyrics.append (s.lyrics)
        s.tijd = s.maxtime = 0
        s.initVoices ()

    def outVoices (s, divs, ip, isSib):
        """Emit all non-empty voices of part ip to abcOut, wrapping lines by
        -n (chars) or -b (bars); returns {xml voice number: abc voice number}."""
        vvmap = {}                              # xml voice number -> abc voice number (one part)
        vnum_keys = list (s.vnums.keys ())
        if s.jscript or isSib: vnum_keys.sort ()
        lvc = min (vnum_keys or [1])            # lowest xml voice number of this part
        for iv in vnum_keys:
            if s.cnt.getv ('note', iv) == 0:    # no real notes counted in this voice
                continue                        # skip empty voices
            if abcOut.denL: unitL = abcOut.denL # take the unit length from the -d option
            else: unitL = compUnitLength (iv, s.gMaten, divs)   # compute the best unit length for this voice
            abcOut.cmpL.append (unitL)          # remember for header output
            vn, vl = [], {}                     # for voice iv: collect all notes to vn and all lyric lines to vl
            for im in range (len (s.gMaten)):
                measure = s.gMaten [im][iv]
                vn.append (outVoice (measure, divs [im], im, ip, unitL))
                checkMelismas (s.gLyrics, s.gMaten, im, iv)
                for n, (lyrstr, melis) in s.gLyrics [im][iv].items ():
                    if n in vl:
                        while len (vl[n]) < im: vl[n].append ('')   # fill in skipped measures
                        vl[n].append (lyrstr)
                    else:
                        vl[n] = im * [''] + [lyrstr]    # must skip im measures
            for n, lyrs in vl.items ():         # fill up possibly empty lyric measures at the end
                mis = len (vn) - len (lyrs)
                lyrs += mis * ['']
            abcOut.add ('V:%d' % s.vceCnt)
            if s.repbra:
                if s.nvlt == 1 and s.vceCnt > 1: abcOut.add ('I:repbra 0')  # only volta on first voice
                if s.nvlt == 2 and iv > lvc: abcOut.add ('I:repbra 0')      # only volta on first voice of each part
            if s.cpl > 0: s.bpl = 0             # option -n (max chars per line) overrules -b (max bars per line)
            elif s.bpl == 0: s.cpl = 100        # the default: 100 chars per line
            bn = 0                              # count bars
            while vn:                           # while still measures available
                ib = 1
                chunk = vn [0]
                while ib < len (vn):
                    if s.cpl > 0 and len (chunk) + len (vn [ib]) >= s.cpl: break    # line full (number of chars)
                    if s.bpl > 0 and ib >= s.bpl: break                             # line full (number of bars)
                    chunk += vn [ib]
                    ib += 1
                bn += ib
                abcOut.add (chunk + ' %%%d' % bn)   # line with bar number
                del vn[:ib]                         # chop ib bars
                lyrlines = sorted (vl.items ())     # order the numbered lyric lines for output
                for n, lyrs in lyrlines:
                    abcOut.add ('w: ' + '|'.join (lyrs[:ib]) + '|')
                    del lyrs[:ib]
            vvmap [iv] = s.vceCnt               # xml voice number -> abc voice number
            s.vceCnt += 1                       # count voices over all parts
        s.gMaten = []                           # reset the following instance vars for each part
        s.gLyrics = []
        s.cnt.prcnt (ip+1)                      # print summary of skipped items in this part
        return vvmap
|
| 335 |
+
|
| 336 |
+
class ABCoutput:
    """Collects abc output lines for one tune, builds the tune header and
    writes everything to a file (when an output dir `pad` is given) or stdout.

    Fixes vs. the previous version:
    - mkHeader: `tbs` was a map object; `map_obj + list` raises TypeError on
      Python 3 — now built as a list.
    - __init__: the illegal-float message was never %-formatted (the tuple was
      passed as info's `warn` argument); also narrowed the bare except.
    - writeall: removed the duplicated python3 if/else (both branches were
      identical) and the shadowing of builtin `str`.
    """
    pagekeys = 'scale,pageheight,pagewidth,leftmargin,rightmargin,topmargin,botmargin'.split (',')

    def __init__ (s, fnmext, pad, X, options):
        s.fnmext = fnmext
        s.outlist = []          # list of ABC strings
        s.title = 'T:Title'
        s.key = 'none'
        s.clefs = {}            # clefs for all abc-voices
        s.mtr = 'none'
        s.tempo = 0             # 0 -> no tempo field
        s.tempo_units = (1,4)   # note type of tempo direction
        s.pad = pad             # the output path or none
        s.X = X + 1             # the abc tune number
        s.denL = options.d      # denominator of the unit length (L:) from -d option
        s.volpan = int (options.m)  # 0 -> no %%MIDI, 1 -> only program, 2 -> all %%MIDI
        s.cmpL = []             # computed optimal unit length for all voices
        s.jscript = options.j   # compatibility with javascript version
        s.tstep = options.t     # translate percmap to voicemap
        s.stemless = 0          # use U:s=!stemless!
        s.shiftStem = options.s # shift note heads 3 units left
        if pad:
            _, base_name = os.path.split (fnmext)
            s.outfile = open (os.path.join (pad, base_name), 'w', encoding='utf-8')
        else: s.outfile = sys.stdout
        if s.jscript: s.X = 1   # always X:1 in javascript version
        s.pageFmt = {}
        for k in s.pagekeys: s.pageFmt [k] = None
        if len (options.p) == 7:
            for k, v in zip (s.pagekeys, options.p):
                try: s.pageFmt [k] = float (v)
                except (TypeError, ValueError):     # fixed: actually format the message
                    info ('illegal float %s for %s' % (v, k)); continue

    def add (s, str):
        """Collect one line of ABC output (newline is appended here)."""
        s.outlist.append (str + '\n')   # collect all ABC output

    def mkHeader (s, stfmap, partlist, midimap, vmpdct, koppen):
        """Prepend the complete tune header (X:, %%score, L:, M:, K:, V: and
        %%MIDI lines) to s.outlist.  stfmap = [parts], part = [staves],
        stave = [voices]; koppen triggers the SVG defs needed for tablature."""
        accVce, accStf, staffs = [], [], stfmap[:]  # staffs is consumed
        for x in partlist:          # collect partnames into accVce and staff groups into accStf
            try: prgroupelem (x, ('', ''), '', stfmap, accVce, accStf)
            except Exception: info ('lousy musicxml: error in part-list')  # best effort on broken xml
        staves = ' '.join (accStf)
        clfnms = {}
        for part, (partname, partabbrv) in zip (staffs, accVce):
            if not part: continue   # skip empty part
            firstVoice = part[0][0] # the first voice number in this part
            nm = partname.replace ('\n','\\n').replace ('.:','.').strip (':')
            snm = partabbrv.replace ('\n','\\n').replace ('.:','.').strip (':')
            clfnms [firstVoice] = (nm and 'nm="%s"' % nm or '') + (snm and ' snm="%s"' % snm or '')
        hd = ['X:%d\n%s\n' % (s.X, s.title)]
        for i, k in enumerate (s.pagekeys):
            if s.jscript and k in ['pageheight','topmargin', 'botmargin']: continue
            if s.pageFmt [k] is not None: hd.append ('%%%%%s %.2f%s\n' % (k, s.pageFmt [k], i > 0 and 'cm' or ''))
        if staves and len (accStf) > 1: hd.append ('%%score ' + staves + '\n')
        tempo = s.tempo and 'Q:%d/%d=%s\n' % (s.tempo_units [0], s.tempo_units [1], s.tempo) or ''  # default no tempo field
        d = {}                      # determine the most frequently occurring unit length over all voices
        for x in s.cmpL: d[x] = d.get (x, 0) + 1
        if s.jscript: defLs = sorted (d.items (), key=lambda x: (-x[1], x[0]))  # when tie (1) sort on key (0)
        else: defLs = sorted (d.items (), key=lambda x: -x[1])
        defL = s.denL and s.denL or defLs [0][0]    # override default unit length with -d option
        hd.append ('L:1/%d\n%sM:%s\n' % (defL, tempo, s.mtr))
        hd.append ('K:%s\n' % s.key)
        if s.stemless: hd.append ('U:s=!stemless!\n')
        vxs = sorted (vmpdct.keys ())
        for vx in vxs: hd.extend (vmpdct [vx])
        s.dojef = 0                 # translate percmap to voicemap
        for vnum, clef in s.clefs.items ():
            ch, prg, vol, pan = midimap [vnum-1][:4]
            dmap = midimap [vnum - 1][4:]   # map of abc percussion notes to midi notes
            if dmap and 'perc' not in clef: clef = (clef + ' map=perc').strip ()
            hd.append ('V:%d %s %s\n' % (vnum, clef, clfnms.get (vnum, '')))
            if vnum in vmpdct:
                hd.append ('%%%%voicemap tab%d\n' % vnum)
                hd.append ('K:none\nM:none\n%%clef none\n%%staffscale 1.6\n%%flatbeams true\n%%stemdir down\n')
            if 'perc' in clef: hd.append ('K:none\n')   # no key for a perc voice
            if s.volpan > 1:        # option -m 2 -> output all recognized midi commands when needed and present in xml
                if ch > 0 and ch != vnum: hd.append ('%%%%MIDI channel %d\n' % ch)
                if prg > 0: hd.append ('%%%%MIDI program %d\n' % (prg - 1))
                if vol >= 0: hd.append ('%%%%MIDI control 7 %.0f\n' % vol)  # volume == 0 is possible ...
                if pan >= 0: hd.append ('%%%%MIDI control 10 %.0f\n' % pan)
            elif s.volpan > 0:      # default -> only output midi program command when present in xml
                if dmap and ch > 0: hd.append ('%%%%MIDI channel %d\n' % ch)    # also channel if percussion part
                if prg > 0: hd.append ('%%%%MIDI program %d\n' % (prg - 1))
            for abcNote, step, midiNote, notehead in dmap:
                if not notehead: notehead = 'normal'
                if abcMid (abcNote) != midiNote or abcNote != step:
                    if s.volpan > 0: hd.append ('%%%%MIDI drummap %s %s\n' % (abcNote, midiNote))
                    hd.append ('I:percmap %s %s %s %s\n' % (abcNote, step, midiNote, notehead))
                    s.dojef = s.tstep
            if defL != s.cmpL [vnum-1]: # only if computed unit length different from header
                hd.append ('L:1/%d\n' % s.cmpL [vnum-1])
        s.outlist = hd + s.outlist
        if koppen:                  # output SVG stuff needed for tablature
            k1 = kopSvg.replace ('-2','-5') if s.shiftStem else kopSvg  # shift note heads 3 units left
            k2 = kopSvg2.replace ('-2','-5') if s.shiftStem else kopSvg2
            tb = tabSvg.replace ('-3','-6') if s.shiftStem else tabSvg
            ks = sorted (koppen.keys ())    # javascript compatibility
            ks = [k2 % (k, k) if len (k) == 2 else k1 % (k, k) for k in ks]
            # fixed: must be a list — a py3 map object cannot be concatenated below
            tbs = [x.strip () + '\n' for x in tb.splitlines ()] # javascript compatibility
            s.outlist = tbs + ks + ['</defs>\n%%endsvg\n'] + s.outlist

    def writeall (s):
        """Write the entire collected ABC output and close the file (if any).
        Note: relies on s.dojef, which is set by mkHeader."""
        abctext = ''.join (s.outlist)
        if s.dojef: abctext = perc2map (abctext)
        s.outfile.write (abctext)
        if s.pad: s.outfile.close ()    # close each file with -o option
        else: s.outfile.write ('\n')    # add empty line between tunes on stdout
        info ('%s written with %d voices' % (s.fnmext, len (s.clefs)), warn=0)
|
| 445 |
+
|
| 446 |
+
#----------------
|
| 447 |
+
# functions
|
| 448 |
+
#----------------
|
| 449 |
+
def abcLyr (xs, melis):
    """Convert one measure of syllables (one entry per note) to an abc w: string.

    melis is the melisma state carried over from the previous measure; the
    updated state is returned together with the lyric string.
    """
    if not ''.join (xs): return '', 0               # no lyrics at all in this measure
    out = []
    for syl in xs:
        if syl == '':                               # note without lyrics
            syl = '_' if melis else '*'             # continue melisma or skip note
        elif syl.endswith ('_') and not syl.endswith ('\\_'):   # start of new melisma
            syl = syl.replace ('_', '')             # strip marker, remember state
            melis = 1                               # following empty slots become '_'
        else:
            melis = 0                               # melisma stops on first syllable
        out.append (syl)
    return ' '.join (out), melis
|
| 462 |
+
|
| 463 |
+
def simplify (a, b):
    """Reduce the fraction a/b by the greatest common divisor of a and b."""
    num, den = a, b
    while b:                    # Euclid: a ends up as gcd (num, den)
        a, b = b, a % b
    return num // a, den // a
|
| 467 |
+
|
| 468 |
+
def abcdur (nx, divs, uL):  # convert a musicXML duration d to abc units with L:1/uL
    """Return the abc duration string of note nx, given divs divisions per
    quarter note and unit note length 1/uL (e.g. '3/2', '/', '' for one unit)."""
    if nx.dur == 0: return ''   # when called for elements without duration
    num, den = simplify (uL * nx.dur, divs * 4) # L=1/8 -> uL = 8 units
    if nx.fact:                 # apply tuplet time modification
        numfac, denfac = nx.fact
        num, den = simplify (num * numfac, den * denfac)
    if den > 64:                # limit the denominator to a maximum of 64
        x = float (num) / den; n = math.floor (x);  # when just above an integer n
        if x - n < 0.1 * x: num, den = n, 1;        # round to n
        num64 = 64. * num / den + 1.0e-15   # to get Python2 behaviour of round
        num, den = simplify (int (round (num64)), 64)
    # build the shortest abc notation for num/den
    if num == 1:
        if den == 1: dabc = ''
        elif den == 2: dabc = '/'
        else: dabc = '/%d' % den
    elif den == 1: dabc = '%d' % num
    else: dabc = '%d/%d' % (num, den)
    return dabc
|
| 486 |
+
|
| 487 |
+
def abcMid (note):
    """Translate an abc note string to a midi pitch number (-1 when unparsable)."""
    m = re.search (r"([_^]*)([A-Ga-g])([',]*)", note)
    if not m: return -1
    acc, n, oct = m.groups ()
    base = n.upper ()
    p = 60 + [0,2,4,5,7,9,11]['CDEFGAB'.index (base)]
    if n != base: p += 12                               # lower case -> one octave up
    if acc: p += (1 if acc[0] == '^' else -1) * len (acc)   # sharps/flats
    if oct: p += (12 if oct[0] == "'" else -12) * len (oct) # octave marks
    return p
|
| 496 |
+
|
| 497 |
+
def staffStep (ptc, o, clef, tstep):
    """Map a pitch letter ptc with octave o to an abc note string, applying a
    diatonic shift for one-line staves and (without -t) for bass clefs."""
    shift = 0
    if 'stafflines=1' in clef: shift += 4                   # one line: E (xml) -> B (abc)
    if not tstep and clef.startswith ('bass'): shift += 12  # transpose bass -> treble (C3 -> A4)
    if shift:                                               # diatonic transposition == addition modulo 7
        names = 'C,D,E,F,G,A,B'.split (',')
        ix = names.index (ptc) + shift
        ptc, o = names [ix % 7], o + ix // 7
    # octave marks relative to the abc middle octave (o == 4/5)
    if o > 4: ptc = ptc.lower ()
    if o > 5: ptc = ptc + (o-5) * "'"
    if o < 4: ptc = ptc + (4-o) * ","
    return ptc
|
| 509 |
+
|
| 510 |
+
def setKey (fifths, mode):
    """Translate a key signature (fifths on the circle, mode name) into the
    abc key string and a dict of measure alterations {pitch letter: +1/-1}."""
    sharpness = ['Fb', 'Cb','Gb','Db','Ab','Eb','Bb','F','C','G','D','A', 'E', 'B', 'F#','C#','G#','D#','A#','E#','B#']
    offTab = {'maj':8, 'ion':8, 'm':11, 'min':11, 'aeo':11, 'mix':9, 'dor':10, 'phr':12, 'lyd':7, 'loc':13, 'non':8}
    mode = mode.lower ()[:3]                # only first three chars, no case
    off = offTab [mode]
    key = sharpness [off + fifths] + ('' if off == 8 else mode)   # major modes need no suffix
    accs = ['F','C','G','D','A','E','B']    # order of sharps; flats run backwards
    if fifths >= 0:
        msralts = {acc: 1 for acc in accs[:fifths]}
    else:
        msralts = {acc: -1 for acc in accs[fifths:]}
    return key, msralts
|
| 519 |
+
|
| 520 |
+
def insTup (ix, notes, fact):   # read one nested tuplet
    """Scan one (possibly nested) tuplet starting at notes[ix], count its
    elements, and prepend the abc tuplet prefix '(n' or '(p:q:r' to the first
    tuplet note.  fact is the xml time-modification of the enclosing level.
    Returns (index of last tuplet note, element count)."""
    tupcnt = 0
    nx = notes [ix]
    if 'start' in nx.tup:
        nx.tup.remove ('start')     # do recursive calls when starts remain
    tix = ix                        # index of first tuplet note
    fn, fd = fact                   # xml time-mod of the higher level
    fnum, fden = nx.fact            # xml time-mod of the current level
    tupfact = fnum//fn, fden//fd    # abc time mod of this level
    while ix < len (notes):
        nx = notes [ix]
        if isinstance (nx, Elem) or nx.grace:
            ix += 1                 # skip all non tuplet elements
            continue
        if 'start' in nx.tup:       # more nested tuplets to start
            ix, tupcntR = insTup (ix, notes, tupfact)   # ix is on the stop note!
            tupcnt += tupcntR
        elif nx.fact:
            tupcnt += 1             # count tuplet elements
        if 'stop' in nx.tup:
            nx.tup.remove ('stop')
            break
        if not nx.fact:             # stop on first non tuplet note
            ix = lastix             # back to last tuplet note
            break
        lastix = ix
        ix += 1
    # put abc tuplet notation before the recursive ones
    tup = (tupfact[0], tupfact[1], tupcnt)
    if tup == (3, 2, 3): tupPrefix = '(3'   # common triplet: use short form
    else: tupPrefix = '(%d:%d:%d' % tup
    notes [tix].tupabc = tupPrefix + notes [tix].tupabc
    return ix, tupcnt               # ix is on the last tuplet note
|
| 553 |
+
|
| 554 |
+
def mkBroken (vs):
    """Rewrite adjacent beamed notes with a 1:3 or 3:1 duration ratio as an
    abc broken rhythm ('<' or '>'), mutating the notes in place.
    vs: all objects of one voice in one measure."""
    notes = [x for x in vs if isinstance (x, Note)]
    k = 0
    while k < len (notes) - 1:
        left, right = notes[k], notes[k+1]          # scan all adjacent pairs
        # only outside tuplets, with a real duration, inside a beam
        if not left.fact and not right.fact and left.dur > 0 and right.beam:
            if left.dur * 3 == right.dur:           # short-long pair -> '<'
                right.dur = (2 * right.dur) // 3
                left.dur = left.dur * 2
                left.after = '<' + left.after
                k += 1                              # do not chain broken rhythms
            elif right.dur * 3 == left.dur:         # long-short pair -> '>'
                left.dur = (2 * left.dur) // 3
                right.dur = right.dur * 2
                left.after = '>' + left.after
                k += 1                              # do not chain broken rhythms
        k += 1
|
| 572 |
+
|
| 573 |
+
def outVoice (measure, divs, im, ip, unitL):    # note/elem objects of one measure in one voice
    """Render one measure of one voice to its abc string: resolve tuplets,
    chords, ties and durations, then clean up duplicated pedal/ottava marks."""
    ix = 0
    while ix < len (measure):   # set all (nested) tuplet annotations
        nx = measure [ix]
        if isinstance (nx, Note) and nx.fact and not nx.grace:
            ix, tupcnt = insTup (ix, measure, (1, 1))   # read one tuplet, insert annotation(s)
        ix += 1
    vs = []
    for nx in measure:
        if isinstance (nx, Note):
            durstr = abcdur (nx, divs, unitL)   # xml -> abc duration string
            chord = len (nx.ns) > 1
            cns = [nt[:-1] for nt in nx.ns if nt.endswith ('-')]
            tie = ''
            if chord and len (cns) == len (nx.ns):  # all chord notes tied
                nx.ns = cns     # chord notes without tie
                tie = '-'       # one tie for whole chord
            s = nx.tupabc + ''.join (nx.before)
            if chord: s += '['
            for nt in nx.ns: s += nt
            if chord: s += ']' + tie
            if s.endswith ('-'): s, tie = s[:-1], '-'   # split off tie
            s += durstr + tie   # and put it back again
            s += nx.after
            nospace = nx.beam   # beamed notes are output without separating space
        else:
            if isinstance (nx.str, listtype): nx.str = nx.str [0]
            s = nx.str
            nospace = 1
        if nospace: vs.append (s)
        else: vs.append (' ' + s)
    vs = ''.join (vs)           # ad hoc: remove multiple pedal directions
    while vs.find ('!ped!!ped!') >= 0: vs = vs.replace ('!ped!!ped!','!ped!')
    while vs.find ('!ped-up!!ped-up!') >= 0: vs = vs.replace ('!ped-up!!ped-up!','!ped-up!')
    while vs.find ('!8va(!!8va)!') >= 0: vs = vs.replace ('!8va(!!8va)!','')    # remove empty ottava's
    return vs
|
| 609 |
+
|
| 610 |
+
def sortMeasure (voice, m):
    """Sort the objects of one voice in measure m on time and make them
    strictly sequential: fill gaps with invisible rests, repair overlapping
    notes (shorten rests, merge into chords, or discard), and copy beam
    state into rests that lie between beamed notes.  Returns the new list."""
    voice.sort (key=lambda o: o.tijd)   # sort on time
    time = 0
    v = []
    rs = []                             # holds rests in between notes
    for i, nx in enumerate (voice):     # establish sequentiality
        if nx.tijd > time and chkbug (nx.tijd - time, m):
            v.append (Note (nx.tijd - time, 'x'))   # fill hole with invisble rest
            rs.append (len (v) - 1)
        if isinstance (nx, Elem):
            if nx.tijd < time: nx.tijd = time   # shift elems without duration to where they fit
            v.append (nx)
            time = nx.tijd
            continue
        if nx.tijd < time:              # overlapping element
            if nx.ns[0] == 'z': continue    # discard overlapping rest
            if v[-1].tijd <= nx.tijd:   # we can do something
                if v[-1].ns[0] == 'z':  # shorten rest
                    v[-1].dur = nx.tijd - v[-1].tijd
                    if v[-1].dur == 0: del v[-1]    # nothing left
                    info ('overlap in part %d, measure %d: rest shortened' % (m.ixp+1, m.ixm+1))
                else:                   # make a chord of overlap
                    v[-1].ns += nx.ns
                    info ('overlap in part %d, measure %d: added chord' % (m.ixp+1, m.ixm+1))
                nx.dur = (nx.tijd + nx.dur) - time  # the remains
                if nx.dur <= 0: continue            # nothing left
                nx.tijd = time          # append remains
            else:                       # give up
                info ('overlapping notes in one voice! part %d, measure %d, note %s discarded' % (m.ixp+1, m.ixm+1, isinstance (nx, Note) and nx.ns or nx.str))
                continue
        v.append (nx)
        if isinstance (nx, Note):
            if nx.ns [0] in 'zx':
                rs.append (len (v) - 1) # remember rests between notes
            elif len (rs):
                if nx.beam and not nx.grace:    # copy beam into rests
                    for j in rs: v[j].beam = nx.beam
                rs = []                 # clear rests on each note
        time = nx.tijd + nx.dur
    # when a measure contains no elements and no forwards -> no incTime -> s.maxtime = 0 -> right barline
    # is inserted at time == 0 (in addbar) and is only element in the voice when sortMeasure is called
    if time == 0: info ('empty measure in part %d, measure %d, it should contain at least a rest to advance the time!' % (m.ixp+1, m.ixm+1))
    return v
|
| 653 |
+
|
| 654 |
+
def getPartlist (ps):
    """Sanitize a <part-list> produced by buggy software: insert missing
    part-group stops, drop duplicate stops, and close groups left open."""
    fixed = []          # the corrected part-list
    open_groups = []    # stack of currently open group numbers
    for el in list (ps):
        if el.tag != 'part-group':
            fixed.append (el)
            continue
        num, type = el.get ('number'), el.get ('type')
        if type == 'start':
            if num in open_groups:  # same group started twice: insert the missing stop
                fixed.append (E.Element ('part-group', number = num, type = 'stop'))
            else:                   # normal start: remember it
                open_groups.append (num)
            fixed.append (el)
        elif num in open_groups:    # normal stop
            open_groups.remove (num)
            fixed.append (el)
        # else: double stop -> skip it
    for num in reversed (open_groups):  # fill missing stops at the end
        fixed.append (E.Element ('part-group', number = num, type = 'stop'))
    return fixed
|
| 676 |
+
|
| 677 |
+
def parseParts (xs, d, e):  # -> [elems on current level], rest of xs
    """Recursively turn the (sanitized) flat part-list xs into a nested list:
    part names as (name, abbrev) tuples, nested groups as sublists ending in
    their group data.  d maps group number -> group data; e is the stack of
    open group numbers."""
    if not xs: return [],[]
    x = xs.pop (0)
    if x.tag == 'part-group':
        num, type = x.get ('number'), x.get ('type')
        if type == 'start':     # go one level deeper
            s = [x.findtext (n, '') for n in ['group-symbol','group-barline','group-name','group-abbreviation']]
            d [num] = s         # remember groupdata by group number
            e.append (num)      # make stack of open group numbers
            elemsnext, rest1 = parseParts (xs, d, e)    # parse one level deeper to next stop
            elems, rest2 = parseParts (rest1, d, e)     # parse the rest on this level
            return [elemsnext] + elems, rest2
        else:                   # stop: close level and return group-data
            nums = e.pop ()     # last open group number in stack order
            if xs and xs[0].get ('type') == 'stop':     # two consequetive stops
                if num != nums: # in the wrong order (tempory solution)
                    d[nums], d[num] = d[num], d[nums]   # exchange values (only works for two stops!!!)
            sym = d[num]        # retrieve and return groupdata as last element of the group
            return [sym], xs
    else:
        elems, rest = parseParts (xs, d, e) # parse remaining elements on current level
        name = x.findtext ('part-name',''), x.findtext ('part-abbreviation','')
        return [name] + elems, rest
|
| 700 |
+
|
| 701 |
+
def bracePart (part):
    """Build the %%score tokens for one part: voices on a shared stave are
    parenthesized, staves are separated by '|', and a multi-staff part is
    wrapped in braces.  part = [stave], stave = [voice numbers]."""
    if not part: return []                  # empty part in the score
    score = []
    for stave in part:
        if len (stave) == 1:                # stave with a single voice
            score.append ('%s' % stave[0])
        else:                               # multiple voices on one stave
            score.append ('(')
            score.extend ('%s' % v for v in stave)
            score.append (')')
        score.append ('|')
    del score[-1]                           # no barline after the last stave
    if len (part) > 1:
        score = ['{'] + score + ['}']       # brace a multi-staff part
    return score
|
| 714 |
+
|
| 715 |
+
def prgroupelem (x, gnm, bar, pmap, accVce, accStf): # collect partnames (accVce) and %%score map (accStf)
    """Process one element of the nested part-list: a part (name tuple) is
    emitted directly, a group is delegated to prgrouplist.  gnm is the
    enclosing (group-name, group-abbrev); pmap is consumed one part at a time."""
    if type (x) == tupletype:   # partname-tuple = (part-name, part-abbrev)
        y = pmap.pop (0)
        if gnm[0]: x = [n1 + ':' + n2 for n1, n2 in zip (gnm, x)]   # put group-name before part-name
        accVce.append (x)
        accStf.extend (bracePart (y))
    elif len (x) == 2 and type (x[0]) == tupletype: # misuse of group just to add extra name to stave
        y = pmap.pop (0)
        nms = [n1 + ':' + n2 for n1, n2 in zip (x[0], x[1][2:])]    # x[0] = partname-tuple, x[1][2:] = groupname-tuple
        accVce.append (nms)
        accStf.extend (bracePart (y))
    else:
        prgrouplist (x, bar, pmap, accVce, accStf)
|
| 728 |
+
|
| 729 |
+
def prgrouplist (x, pbar, pmap, accVce, accStf): # collect partnames, scoremap for a part-group
    """Process a part-group list: the last element holds the group data,
    the preceding elements are the group's members (parts or sub-groups)."""
    sym, bar, gnm, gabbr = x[-1] # bracket symbol, continue barline, group-name-tuple
    bar = bar == 'yes' or pbar # pbar -> the parent has bar
    accStf.append (sym == 'brace' and '{' or '[') # open with { or [
    for z in x[:-1]:
        prgroupelem (z, (gnm, gabbr), bar, pmap, accVce, accStf)
        if bar: accStf.append ('|') # barline between group members
    if bar: del accStf [-1] # remove last one before close
    accStf.append (sym == 'brace' and '}' or ']') # close with } or ]
|
| 738 |
+
|
| 739 |
+
def compUnitLength (iv, maten, divs): # compute optimal unit length
    """Choose the ABC unit note length (L:) for voice iv that minimises
    the total length of all duration strings over all measures.

    iv    -- voice number
    maten -- list of measures; each measure maps voice -> list of notes/elements
    divs  -- xml divisions per measure (indexed by measure number)
    Returns 4, 8 or 16 (denominator of the unit length).
    """
    uLmin, minLen = 0, max_int
    for uL in [4,8,16]: # try 1/4, 1/8 and 1/16
        vLen = 0 # total length of abc duration strings in this voice
        for im, m in enumerate (maten): # all measures
            for e in m[iv]: # all notes in voice iv
                if isinstance (e, Elem) or e.dur == 0: continue # no real durations
                vLen += len (abcdur (e, divs [im], uL)) # add len of duration string
        if vLen < minLen: uLmin, minLen = uL, vLen # remember the smallest
    return uLmin
|
| 749 |
+
|
| 750 |
+
def doSyllable (syl):
    """Translate a musicXML <lyric> element into one ABC w: syllable.

    <elision> becomes '~'; <text> content is escaped for ABC lyrics
    (literal '_', '-' and space have special meaning in w: lines);
    a begin/middle <syllabic> appends '-' and an <extend> appends '_'
    (start of a melisma). Returns the syllable string ('' when empty).
    """
    txt = ''
    for e in syl:
        if e.tag == 'elision': txt += '~'
        elif e.tag == 'text': # escape - and space characters
            # fix: use a raw string for the '_' escape too ('\_' was an
            # invalid escape sequence, SyntaxWarning on modern Python)
            txt += (e.text or '').replace ('_', r'\_').replace ('-', r'\-').replace (' ', '~')
    if not txt: return txt
    if syl.findtext ('syllabic') in ['begin', 'middle']: txt += '-'
    if syl.find ('extend') is not None: txt += '_' # melisma continues
    return txt
|
| 760 |
+
|
| 761 |
+
def checkMelismas (lyrics, maten, im, iv):
    """Continue open melismas from measure im-1 into measure im for voice iv.

    When a lyric line of the previous measure ends with an open melisma and
    the current measure has no lyrics for that line number, synthesise a
    line of '_' marks covering the notes of the current measure.
    """
    if im == 0: return                  # no previous measure to continue from
    notes = maten [im][iv]              # notes of the current measure
    cur = lyrics [im][iv]               # lyrics dict of the current measure
    prev = lyrics [im-1][iv]            # lyrics dict of the previous measure
    pending = [n for n, (_, melis) in prev.items () if melis and n not in cur]
    for n in pending:                   # melisma required, but no lyrics present
        ms = getMelisma (notes)         # build '_' marks for the current measure
        if ms:
            cur [n] = (ms, 0)           # install as the n-th lyric line
|
| 770 |
+
|
| 771 |
+
def getMelisma (maat): # get melisma from notes in maat
    """Return a lyric line of '_' marks, one per real note in *maat*,
    stopping at the first rest; non-Note elements and grace notes are skipped."""
    marks = []
    for nt in maat:
        if not isinstance (nt, Note):   # skip Elem's
            continue
        if nt.grace:                    # grace notes carry no syllable
            continue
        if nt.ns [0] in 'zx':           # stop at the first (possibly invisible) rest
            break
        marks.append ('_')
    return ' '.join (marks)
|
| 779 |
+
|
| 780 |
+
def perc2map (abcIn):
    """Post-process the generated ABC for percussion staves.

    Collects all I:percmap lines per voice into %%map definitions, prepends
    the percSvg preamble (module constant defined elsewhere in this file),
    re-emits the tune with %%MIDI lines moved directly after their V: line,
    and inserts %%voicemap switches where a K:/V: line requests map=perc
    or map=off. Returns the rewritten ABC text.
    """
    fillmap = {'diamond':1, 'triangle':1, 'square':1, 'normal':1}
    # fix: wrap in list() -- on Python 3 `map` returns an iterator, which
    # broke the later `abc += ...` / `abc.append (...)` with a TypeError
    abc = list (map (lambda x: x.strip (), percSvg.splitlines ()))
    vid = 'default'                     # renamed from `id` (shadowed the builtin)
    maps = {'default': []}              # voice id -> %%map lines
    dmaps = {'default': []}             # voice id -> %%MIDI lines
    r1 = re.compile (r'V:\s*(\S+)')
    ls = abcIn.splitlines ()
    for x in ls:                        # pass 1: collect maps per voice
        if 'I:percmap' in x:
            noot, step, midi, kop = map (lambda w: w.strip (), x.split ()[1:])
            if kop in fillmap: kop = kop + '+' + ',' + kop
            x = '%%%%map perc%s %s print=%s midi=%s heads=%s' % (vid, noot, step, midi, kop)
            maps [vid].append (x)
        if '%%MIDI' in x: dmaps [vid].append (x)
        if 'V:' in x:
            r = r1.match (x)
            if r:
                vid = r.group (1)
                if vid not in maps: maps [vid] = []; dmaps [vid] = []
    for vid in sorted (maps.keys ()):   # emit all %%map blocks after the svg preamble
        abc += maps [vid]
    vid = 'default'
    for x in ls:                        # pass 2: re-emit the tune itself
        if 'I:percmap' in x: continue   # already translated into %%map
        if '%%MIDI' in x: continue      # re-inserted after the V: line below
        if 'V:' in x or 'K:' in x:
            r = r1.match (x)
            if r: vid = r.group (1)
            abc.append (x)
            if vid in dmaps and len (dmaps [vid]) > 0: abc.extend (dmaps [vid]); del dmaps [vid]
            if 'perc' in x and 'map=' not in x: x += ' map=perc'
            if 'map=perc' in x and len (maps [vid]) > 0: abc.append ('%%voicemap perc' + vid)
            if 'map=off' in x: abc.append ('%%voicemap')
        else:
            abc.append (x)
    return '\n'.join (abc) + '\n'
|
| 817 |
+
|
| 818 |
+
def addoct (ptc, o): # xml staff step, xml octave number
    """Map an xml step letter and octave number to an ABC pitch:
    octave 4 is plain upper case, 5 is lower case, higher octaves add
    apostrophes, lower octaves add commas. No accidental is included."""
    if o > 4:
        p = ptc.lower () + (o - 5) * "'"    # c, c', c'' ...
    elif o < 4:
        p = ptc + (4 - o) * ","             # C, C,, C,,, ...
    else:
        p = ptc                             # middle octave, plain upper case
    return p # abc pitch == abc note without accidental
|
| 824 |
+
|
| 825 |
+
def chkbug (dt, m):
    """Validate duration dt (in xml divisions) against measure m.

    Returns 1 when dt is longer than a 1/64 note; otherwise reports the
    known MuseScore export bug via info() and returns 0.
    """
    if dt > m.divs / 16:    # duration should be > 1/64 note
        return 1
    info ('MuseScore bug: incorrect duration, smaller then 1/64! in measure %d, part %d' % (m.ixm, m.ixp))
    return 0
|
| 829 |
+
|
| 830 |
+
#----------------
|
| 831 |
+
# parser
|
| 832 |
+
#----------------
|
| 833 |
+
#----------------
# parser
#----------------
class Parser:
    # Translates a parsed MusicXML tree into the Music/ABC abstractions.
    # note_alts: 3 alternative notations of the same note for tablature mapping
    # (row 0: naturals/sharps, rows 1-2: enharmonic spellings of the same pitch class)
    note_alts = [
        [x.strip () for x in '=C, ^C, =D, ^D, =E, =F, ^F, =G, ^G, =A, ^A, =B'.split (',')],
        [x.strip () for x in '^B, _D,^^C, _E, _F, ^E, _G,^^F, _A,^^G, _B, _C'.split (',')],
        [x.strip () for x in '__D,^^B,__E,__F,^^D,__G,^^E,__A,_/A,__B,__C,^^A'.split (',')] ]
    # step_map: note letter -> semitone offset within the octave (C = 0)
    step_map = {'C':0,'D':2,'E':4,'F':5,'G':7,'A':9,'B':11}
|
| 839 |
+
def __init__ (s, options):
    """Initialise all parser state from the parsed command line *options*
    (unfold repeats, number of chars per line, credit filter level, volta option, ...)."""
    s.slurBuf = {}              # dict of open slurs keyed by slur number
    s.dirStk = {}               # {direction-type + number -> (type, voice | time)} dict for proper closing
    s.ingrace = 0               # marks a sequence of grace notes
    s.msc = Music (options)     # global music data abstraction
    s.unfold = options.u        # turn unfolding repeats on
    s.ctf = options.c           # credit text filter level
    s.gStfMap = []              # [[abc voice numbers] for all parts]
    s.midiMap = []              # midi-settings for each abc voice, in order
    s.drumInst = {}             # inst_id -> midi pitch for channel 10 notes
    s.drumNotes = {}            # (xml voice, abc note) -> (midi note, note head)
    s.instMid = []              # [{inst id -> midi-settings} for all parts]
    s.midDflt = [-1,-1,-1,-91]  # default midi settings for channel, program, volume, panning
    s.msralts = {}              # xml-notenames (without octave) with accidentals from the key
    s.curalts = {}              # abc-notenames (with voice number) with passing accidentals
    s.stfMap = {}               # xml staff number -> [xml voice number]
    s.vce2stf = {}              # xml voice number -> allocated staff number
    s.clefMap = {}              # xml staff number -> abc clef (for header only)
    s.curClef = {}              # xml staff number -> current abc clef
    s.stemDir = {}              # xml voice number -> current stem direction
    s.clefOct = {}              # xml staff number -> current clef-octave-change
    s.curStf = {}               # xml voice number -> current xml staff number
    s.nolbrk = options.x;       # generate no linebreaks ($)
    s.jscript = options.j       # compatibility with javascript version
    s.ornaments = sorted (note_ornamentation_map.items ())
    s.doPageFmt = len (options.p) == 1  # translate xml page format
    s.tstep = options.t         # clef determines step on staff (percussion)
    s.dirtov1 = options.v1      # all directions to first voice of staff
    s.ped = options.ped         # render pedal directions
    s.wstems = options.stm      # translate stem elements
    s.pedVce = None             # voice for pedal directions
    s.repeat_str = {}           # staff number -> [measure number, repeat-text]
    s.tabVceMap = {}            # abc voice num -> [%%map ...] for tab voices
    s.koppen = {}               # noteheads needed for %%map
+
|
| 875 |
+
def matchSlur (s, type2, n, v2, note2, grace, stopgrace): # match slur number n in voice v2, add abc code to before/after
    """Pair slur start/stop events. When both ends of slur number n are seen
    in the same voice, '(' is prepended to the start note and ')' appended
    to the stop note. Grace-note slur edge cases are suppressed."""
    if type2 not in ['start', 'stop']: return # slur type continue has no abc equivalent
    if n == None: n = '1' # unnumbered slurs all share key '1'
    if n in s.slurBuf:
        type1, v1, note1, grace1 = s.slurBuf [n]
        if type2 != type1: # slur complete, now check the voice
            if v2 == v1: # begins and ends in the same voice: keep it
                if type1 == 'start' and (not grace1 or not stopgrace): # normal slur: start before stop and no grace slur
                    note1.before = ['('] + note1.before # keep left-right order!
                    note2.after += ')'
            # no else: don't bother with reversed stave spanning slurs
            del s.slurBuf [n] # slur finished, remove from stack
        else: # double definition, keep the last
            info ('double slur numbers %s-%s in part %d, measure %d, voice %d note %s, first discarded' % (type2, n, s.msr.ixp+1, s.msr.ixm+1, v2, note2.ns))
            s.slurBuf [n] = (type2, v2, note2, grace)
    else: # unmatched slur, put in dict
        s.slurBuf [n] = (type2, v2, note2, grace)
|
| 892 |
+
|
| 893 |
+
def doNotations (s, note, nttn, isTab):
    """Translate a <notations> element into ABC decorations on *note*.

    Handles ornaments, tremolos, fingering, string/fret annotations
    (tablature), wavy lines (trills) and glissando/slide spanners.
    """
    for key, val in s.ornaments:
        if nttn.find (key) != None: note.before += [val] # just concat all ornaments
    trem = nttn.find ('ornaments/tremolo')
    if trem != None:
        type = trem.get ('type')
        if type == 'single': # single-note tremolo: slashes on the note
            note.before.insert (0, '!%s!' % (int (trem.text) * '/'))
        else: # two-note tremolo
            note.fact = None # no time modification in ABC
            if s.tstep: # abc2svg version
                if type == 'stop': note.before.insert (0, '!trem%s!' % trem.text);
            else: # abc2xml version
                if type == 'start': note.before.insert (0, '!%s-!' % (int (trem.text) * '/'));
    fingering = nttn.findall ('technical/fingering')
    for finger in fingering: # handle multiple finger annotations
        if not isTab: note.before += ['!%s!' % finger.text] # fingering goes before chord (addChord)
    snaar = nttn.find ('technical/string')
    if snaar != None and isTab:
        if s.tstep: # string/fret pair kept for tablature allocation
            fret = nttn.find ('technical/fret')
            if fret != None: note.tab = (snaar.text, fret.text)
        else:
            deco = '!%s!' % snaar.text # no double string decos (bug in musescore)
            if deco not in note.ntdec: note.ntdec += deco
    wvln = nttn.find ('ornaments/wavy-line')
    if wvln != None:
        if wvln.get ('type') == 'start': note.before = ['!trill(!'] + note.before # keep left-right order!
        elif wvln.get ('type') == 'stop': note.before = ['!trill)!'] + note.before
    glis = nttn.find ('glissando')
    if glis == None: glis = nttn.find ('slide') # treat slide as glissando
    if glis != None:
        lt = '~' if glis.get ('line-type') =='wavy' else '-' # wavy or straight line
        if glis.get ('type') == 'start': note.before = ['!%s(!' % lt] + note.before # keep left-right order!
        elif glis.get ('type') == 'stop': note.before = ['!%s)!' % lt] + note.before
|
| 928 |
+
|
| 929 |
+
def tabnote (s, alt, ptc, oct, v, ntrec):
    """Allocate an ABC spelling for a tablature note so that each
    (voice, abc-note) pair maps to a unique (string, fret).

    Tries up to 4 enharmonic spellings (note_alts) of the same pitch;
    reuses an existing allocation when the string matches, otherwise
    takes the first free spelling. Returns the chosen abc note.
    """
    p = s.step_map [ptc] + int (alt or '0') # p in -2 .. 13
    if p > 11: oct += 1 # octave correction
    if p < 0: oct -= 1
    p = p % 12 # remap p into 0..11
    snaar_nw, fret_nw = ntrec.tab # the computed/annotated allocation of nt
    for i in range (4): # support same note on 4 strings
        na = s.note_alts [i % 3] [p] # get alternative representation of same note
        o = oct
        if na in ['^B', '^^B']: o -= 1 # because in adjacent octave
        if na in ['_C', '__C']: o += 1
        if '/' in na or i == 3: o = 9 # emergency notation for 4th string case
        nt = addoct (na, o)
        snaar, fret = s.tabmap.get ((v, nt), ('', '')) # the current allocation of nt
        if not snaar: break # note not yet allocated
        if snaar_nw == snaar: return nt # use present allocation
        if i == 3: # new allocation needed but none is free
            fmt = 'rejected: voice %d note %3s string %s fret %2s remains: string %s fret %s'
            info (fmt % (v, nt, snaar_nw, fret_nw, snaar, fret), 1)
            ntrec.tab = (snaar, fret)
    s.tabmap [v, nt] = ntrec.tab # for tablature map (voice, note) -> (string, fret)
    return nt # ABC code always in key C (with midi pitch alterations)
|
| 951 |
+
|
| 952 |
+
def ntAbc (s, ptc, oct, note, v, ntrec, isTab): # pitch, octave -> abc notation
    """Translate an xml pitch into an ABC note, deciding whether an
    accidental must be written given the key signature (s.msralts) and
    the accidentals already active in this measure/voice (s.curalts)."""
    acc2alt = {
        'double-flat': -2,
        'flat-flat': -2,
        'flat': -1,
        'natural-flat': -1,
        'natural': 0,
        'sharp': 1,
        'natural-sharp': 1,
        'sharp-sharp': 2,
        'double-sharp': 2
    }
    oct += s.clefOct.get (s.curStf [v], 0) # minus clef-octave-change value
    acc = note.findtext ('accidental') # should be the notated accidental
    alt = note.findtext ('pitch/alter') # pitch alteration (midi)
    if ntrec.tab: return s.tabnote (alt, ptc, oct, v, ntrec) # implies s.tstep is true (options.t was given)
    elif isTab and s.tstep: # tab staff but no string annotation available
        nt = ['__','_','','^','^^'][int (alt or '0') + 2] + addoct (ptc, oct)
        info ('no string notation found for note %s in voice %d' % (nt, v), 1)
    p = addoct (ptc, oct)
    if alt == None and s.msralts.get (ptc, 0): alt = 0 # no alt but key implies alt -> natural!!
    if alt == None and (p, v) in s.curalts: alt = 0 # no alt but previous note had one -> natural!!
    if acc == None and alt == None: return p # no acc, no alt
    elif acc != None:
        alt = acc2alt [acc] # acc takes precedence over the pitch here!
    else: # now see if we really must add an accidental
        alt = int (float (alt))
        if (p, v) in s.curalts: # the note in this voice has been altered before
            if alt == s.curalts [(p, v)]: return p # alteration still the same
        elif alt == s.msralts.get (ptc, 0): return p # alteration implied by the key
        tieElms = note.findall ('tie') + note.findall ('notations/tied') # in xml we have separate notated ties and playback ties
        if 'stop' in [e.get ('type') for e in tieElms]: return p # don't alter tied notes
        info ('accidental %d added in part %d, measure %d, voice %d note %s' % (alt, s.msr.ixp+1, s.msr.ixm+1, v+1, p))
    s.curalts [(p, v)] = alt
    p = ['__','_','=','^','^^'][alt+2] + p # and finally ... prepend the accidental
    return p
|
| 988 |
+
|
| 989 |
+
def doNote (s, n): # parse a musicXML note tag
    """Translate one <note> element: pitch/rest, duration, tuplets, grace
    sequences, ornaments, percussion noteheads, ties, lyrics, stems and
    staff changes; append the result to the Music abstraction (s.msc)."""
    note = Note ()
    v = int (n.findtext ('voice', '1'))
    if s.isSib: v += 100 * int (n.findtext ('staff', '1')) # repair bug in Sibelius
    chord = n.find ('chord') != None
    p = n.findtext ('pitch/step') or n.findtext ('unpitched/display-step')
    o = n.findtext ('pitch/octave') or n.findtext ('unpitched/display-octave')
    r = n.find ('rest')
    numer = n.findtext ('time-modification/actual-notes')
    if numer: # tuplet time modification
        denom = n.findtext ('time-modification/normal-notes')
        note.fact = (int (numer), int (denom))
    note.tup = [x.get ('type') for x in n.findall ('notations/tuplet')]
    dur = n.findtext ('duration')
    grc = n.find ('grace')
    note.grace = grc != None
    note.before, note.after = [], '' # strings with ABC stuff that goes before or after a note/chord
    if note.grace and not s.ingrace: # open a grace sequence
        s.ingrace = 1
        note.before = ['{']
        if grc.get ('slash') == 'yes': note.before += ['/'] # acciaccatura
    stopgrace = not note.grace and s.ingrace
    if stopgrace: # close the grace sequence
        s.ingrace = 0
        s.msc.lastnote.after += '}' # close grace on lastnote.after
    if dur == None or note.grace: dur = 0
    if r == None and n.get ('print-object') == 'no':
        if chord: return
        r = 1 # turn invisible notes (that advance the time) into invisible rests
    note.dur = int (dur)
    if r == None and (not p or not o): # not a rest and no pitch
        s.msc.cnt.inc ('nopt', v) # count unpitched notes
        o, p = 5,'E' # make it an E5 ??
    isTab = s.curClef and s.curClef.get (s.curStf [v], '').startswith ('tab')
    nttn = n.find ('notations') # add ornaments
    if nttn != None: s.doNotations (note, nttn, isTab)
    e = n.find ('stem') if r == None else None # no !stemless! before rest
    if e != None and e.text == 'none' and (not isTab or v in s.hasStems or s.tstep):
        # NOTE(review): original comment here was garbled (non-ASCII);
        # this marks the note with the !stemless! decoration
        note.before += ['!stemless!']; abcOut.stemless = 0;
    e = n.find ('accidental')
    if e != None and e.get ('parentheses') == 'yes': note.ntdec += '!courtesy!'
    if r != None: noot = 'x' if n.get ('print-object') == 'no' or isTab else 'z'
    else: noot = s.ntAbc (p, int (o), n, v, note, isTab)
    if n.find ('unpitched') != None: # percussion note: record percmap data
        clef = s.curClef [s.curStf [v]] # the current clef for this voice
        step = staffStep (p, int (o), clef, s.tstep) # (clef independent) step value of note on the staff
        instr = n.find ('instrument')
        instId = instr.get ('id') if instr != None else 'dummyId'
        midi = s.drumInst.get (instId, abcMid (noot))
        nh = n.findtext ('notehead', '').replace (' ','-') # replace spaces in xml notehead names for percmap
        if nh == 'x': noot = '^' + noot.replace ('^','').replace ('_','')
        if nh in ['circle-x','diamond','triangle']: noot = '_' + noot.replace ('^','').replace ('_','')
        if nh and n.find ('notehead').get ('filled','') == 'yes': nh += '+'
        if nh and n.find ('notehead').get ('filled','') == 'no': nh += '-'
        s.drumNotes [(v, noot)] = (step, midi, nh) # keep data for percussion map
    tieElms = n.findall ('tie') + n.findall ('notations/tied') # in xml we have separate notated ties and playback ties
    if 'start' in [e.get ('type') for e in tieElms]: # n can have stop and start tie
        noot = noot + '-'
    note.beam = sum ([1 for b in n.findall('beam') if b.text in ['continue', 'end']]) + int (note.grace)
    lyrlast = 0; rsib = re.compile (r'^.*verse')
    for e in n.findall ('lyric'):
        lyrnum = int (rsib.sub ('', e.get ('number', '1'))) # also do Sibelius numbers
        if lyrnum == 0: lyrnum = lyrlast + 1 # and correct Sibelius bugs
        else: lyrlast = lyrnum
        note.lyrs [lyrnum] = doSyllable (e)
    stemdir = n.findtext ('stem')
    if s.wstems and (stemdir == 'up' or stemdir == 'down'): # --stm option
        if stemdir != s.stemDir.get (v, ''): # only emit on a direction change
            s.stemDir [v] = stemdir
            s.msc.appendElem (v, '[I:stemdir %s]' % stemdir)
    if chord: s.msc.addChord (note, noot)
    else:
        xmlstaff = int (n.findtext ('staff', '1'))
        if s.curStf [v] != xmlstaff: # the note should go to another staff
            dstaff = xmlstaff - s.curStf [v] # relative new staff number
            s.curStf [v] = xmlstaff # remember the new staff for this voice
            s.msc.appendElem (v, '[I:staff %+d]' % dstaff) # insert a move before the note
        s.msc.appendNote (v, note, noot)
    for slur in n.findall ('notations/slur'): # s.msc.lastnote points to the last real note/chord inserted above
        s.matchSlur (slur.get ('type'), slur.get ('number'), v, s.msc.lastnote, note.grace, stopgrace) # match slur definitions
|
| 1070 |
+
|
| 1071 |
+
def doAttr (s, e): # parse a musicXML attribute tag
    """Translate an <attributes> element: divisions, key, meter,
    measure-repeat styles, transposition and clefs. First-measure values
    go to the ABC header (abcOut), later changes become inline [K:]/[M:]."""
    teken = {'C1':'alto1','C2':'alto2','C3':'alto','C4':'tenor','F4':'bass','F3':'bass3','G2':'treble','TAB':'tab','percussion':'perc'}
    dvstxt = e.findtext ('divisions')
    if dvstxt: s.msr.divs = int (dvstxt)
    steps = int (e.findtext ('transpose/chromatic', '0')) # for transposing instrument
    fifths = e.findtext ('key/fifths')
    first = s.msc.tijd == 0 and s.msr.ixm == 0 # first attributes in first measure
    if fifths:
        key, s.msralts = setKey (int (fifths), e.findtext ('key/mode','major'))
        if first and not steps and abcOut.key == 'none':
            abcOut.key = key # first measure -> header, if not transposing instrument or percussion part!
        elif key != abcOut.key or not first:
            s.msr.attr += '[K:%s]' % key # otherwise -> voice
    beats = e.findtext ('time/beats')
    if beats:
        unit = e.findtext ('time/beat-type')
        mtr = beats + '/' + unit
        if first: abcOut.mtr = mtr # first measure -> header
        else: s.msr.attr += '[M:%s]' % mtr # otherwise -> voice
        s.msr.mtr = int (beats), int (unit)
    s.msr.mdur = (s.msr.divs * s.msr.mtr[0] * 4) // s.msr.mtr[1] # duration of measure in xml-divisions
    for ms in e.findall('measure-style'):
        n = int (ms.get ('number', '1')) # staff number
        voices = s.stfMap [n] # all voices of staff n
        for mr in ms.findall('measure-repeat'):
            ty = mr.get('type')
            if ty == 'start': # remember start measure number and text for each staff
                s.repeat_str [n] = [s.msr.ixm, mr.text]
                for v in voices: # insert repeat into all voices, value will be overwritten at stop
                    s.msc.insertElem (v, s.repeat_str [n])
            elif ty == 'stop': # calculate repeat measure count for this staff n
                start_ix, text_ = s.repeat_str [n]
                repeat_count = s.msr.ixm - start_ix
                if text_:
                    mid_str = "%s " % text_
                    repeat_count /= int (text_) # float, but '%d' below truncates it again
                else:
                    mid_str = "" # overwrite repeat with final string
                s.repeat_str [n][0] = '[I:repeat %s%d]' % (mid_str, repeat_count)
                del s.repeat_str [n] # remove closed repeats
    toct = e.findtext ('transpose/octave-change', '')
    if toct: steps += 12 * int (toct) # extra transposition of toct octaves
    for clef in e.findall ('clef'): # a part can have multiple staves
        n = int (clef.get ('number', '1')) # local staff number for this clef
        sgn = clef.findtext ('sign')
        line = clef.findtext ('line', '') if sgn not in ['percussion','TAB'] else ''
        cs = teken.get (sgn + line, '')
        oct = clef.findtext ('clef-octave-change', '') or '0'
        if oct: cs += {-2:'-15', -1:'-8', 1:'+8', 2:'+15'}.get (int (oct), '')
        s.clefOct [n] = -int (oct); # xml playback pitch -> abc notation pitch
        if steps: cs += ' transpose=' + str (steps)
        stfdtl = e.find ('staff-details')
        if stfdtl and int (stfdtl.get ('number', '1')) == n:
            lines = stfdtl.findtext ('staff-lines')
            if lines:
                lns= '|||' if lines == '3' and sgn == 'TAB' else lines
                cs += ' stafflines=%s' % lns
                s.stafflines = int (lines) # remember for tab staves
            strings = stfdtl.findall ('staff-tuning')
            if strings:
                tuning = [st.findtext ('tuning-step') + st.findtext ('tuning-octave') for st in strings]
                cs += ' strings=%s' % ','.join (tuning)
            capo = stfdtl.findtext ('capo')
            if capo: cs += ' capo=%s' % capo
        s.curClef [n] = cs # keep track of current clef (for percmap)
        if first: s.clefMap [n] = cs # clef goes to header (where it is mapped to voices)
        else:
            voices = s.stfMap[n] # clef change to all voices of staff n
            for v in voices:
                if n != s.curStf [v]: # voice is not at its home staff n
                    dstaff = n - s.curStf [v]
                    s.curStf [v] = n # reset current staff at start of measure to home position
                    s.msc.appendElem (v, '[I:staff %+d]' % dstaff)
                s.msc.appendElem (v, '[K:%s]' % cs)
|
| 1145 |
+
|
| 1146 |
+
def findVoice (s, i, es):
    """Decide which staff/voice a direction at index i in element list es
    belongs to. Returns (staff, target voice, first voice of the staff):
    with --v1 always the first voice, otherwise the voice of the next
    <note> up to the next <backup>."""
    stfnum = int (es[i].findtext ('staff',1)) # directions belong to a staff
    vs = s.stfMap [stfnum] # voices in this staff
    v1 = vs [0] if vs else 1 # directions to first voice of staff
    if s.dirtov1: return stfnum, v1, v1 # option --v1
    for e in es [i+1:]: # or to the voice of the next note
        if e.tag == 'note':
            v = int (e.findtext ('voice', '1'))
            if s.isSib: v += 100 * int (e.findtext ('staff', '1')) # repair bug in Sibelius
            stf = s.vce2stf [v] # use our own staff allocation
            return stf, v, v1 # voice of next note, first voice of staff
        if e.tag == 'backup': break
    return stfnum, v1, v1 # no note found, fall back to v1
|
| 1159 |
+
|
| 1160 |
+
def doDirection (s, e, i, es): # parse a musicXML direction tag
    """Translate a <direction> element: tempo, dynamics, wedges, ottavas,
    pedal, coda/segno jumps and text annotations; paired spanners are
    matched through s.dirStk so start/stop land in the same voice."""
    def addDirection (x, vs, tijd, stfnum):
        # Emit decoration x into voice(s); at a recorded time when given.
        if not x: return
        vs = s.stfMap [stfnum] if '!8v' in x else [vs] # ottava's go to all voices of staff
        for v in vs:
            if tijd != None: # insert at time of encounter
                s.msc.appendElemT (v, x.replace ('(',')').replace ('ped','ped-up'), tijd)
            else:
                s.msc.appendElem (v, x)
    def startStop (dtype, vs, stfnum=1):
        # Match an opening/closing spanner of kind dtype (wedge, octave-shift, pedal).
        typmap = {'down':'!8va(!', 'up':'!8vb(!', 'crescendo':'!<(!', 'diminuendo':'!>(!', 'start':'!ped!'}
        type = t.get ('type', '')
        k = dtype + t.get ('number', '1') # key to match the closing direction
        if type in typmap: # opening the direction
            x = typmap [type]
            if k in s.dirStk: # closing direction already encountered
                stype, tijd = s.dirStk [k]; del s.dirStk [k]
                if stype == 'stop':
                    addDirection (x, vs, tijd, stfnum)
                else:
                    info ('%s direction %s has no stop in part %d, measure %d, voice %d' % (dtype, stype, s.msr.ixp+1, s.msr.ixm+1, vs+1))
                    s.dirStk [k] = ((type , vs)) # remember voice and type for closing
            else:
                s.dirStk [k] = ((type , vs)) # remember voice and type for closing
        elif type == 'stop':
            if k in s.dirStk: # matching open direction found
                type, vs = s.dirStk [k]; del s.dirStk [k] # into the same voice
                if type == 'stop':
                    info ('%s direction %s has double stop in part %d, measure %d, voice %d' % (dtype, type, s.msr.ixp+1, s.msr.ixm+1, vs+1))
                    x = ''
                else:
                    x = typmap [type].replace ('(',')').replace ('ped','ped-up')
            else: # closing direction found before opening
                s.dirStk [k] = ('stop', s.msc.tijd)
                x = '' # delay code generation until opening found
        elif type in ['continue', 'resume', 'discontinue', 'change']:
            # NOTE: original comment here was garbled (non-ASCII); these
            # intermediate spanner states have no ABC equivalent -> ignore
            # info('Ignoring unsupported direction type: %s' % type)
            x = ''
        else: raise ValueError ('wrong direction type')
        addDirection (x, vs, None, stfnum)
    tempo, wrdstxt = None, ''
    plcmnt = e.get ('placement')
    stf, vs, v1 = s.findVoice (i, es)
    jmp = '' # for jump sound elements: dacapo, dalsegno and family
    jmps = [('dacapo','D.C.'),('dalsegno','D.S.'),('tocoda','dacoda'),('fine','fine'),('coda','O'),('segno','S')]
    t = e.find ('sound') # there are many possible attributes for sound
    if t != None:
        minst = t.find ('midi-instrument')
        if minst:
            prg = t.findtext ('midi-instrument/midi-program')
            chn = t.findtext ('midi-instrument/midi-channel')
            vids = [v for v, id in s.vceInst.items () if id == minst.get ('id')]
            if vids: vs = vids [0] # direction for the identified voice, not the staff
            parm, inst = ('program', str (int (prg) - 1)) if prg else ('channel', chn)
            if inst and abcOut.volpan > 0: s.msc.appendElem (vs, '[I:MIDI= %s %s]' % (parm, inst))
        tempo = t.get ('tempo') # look for tempo attribute
        if tempo:
            tempo = '%.0f' % float (tempo) # hope it is a number and insert in voice 1
            tempo_units = (1,4) # always 1/4 for sound elements!
        for r, v in jmps:
            if t.get (r, ''): jmp = v; break
    dirtypes = e.findall ('direction-type')
    for dirtyp in dirtypes:
        units = { 'whole': (1,1), 'half': (1,2), 'quarter': (1,4), 'eighth': (1,8) }
        metr = dirtyp.find ('metronome')
        if metr != None:
            t = metr.findtext ('beat-unit', '')
            if t in units: tempo_units = units [t]
            else: tempo_units = units ['quarter']
            if metr.find ('beat-unit-dot') != None: # dotted beat unit -> * 3/2
                tempo_units = simplify (tempo_units [0] * 3, tempo_units [1] * 2)
            debugtext = metr.findtext ('per-minute')
            tmpro = None
            if metr.findtext ('per-minute'):
                tmpro = re.search ('[.\d]+', metr.findtext ('per-minute')) # look for a number
            if tmpro: tempo = tmpro.group () # overwrites the value set by the sound element of this direction
        t = dirtyp.find ('wedge')
        if t != None: startStop ('wedge', vs)
        allwrds = dirtyp.findall ('words') # insert text annotations
        if not allwrds: allwrds = dirtyp.findall ('rehearsal') # treat rehearsal mark as text annotation
        for wrds in allwrds:
            if jmp: # ignore the words when a jump sound element is present in this direction
                s.msc.appendElem (vs, '!%s!' % jmp , 1) # to voice
                break
            plc = plcmnt == 'below' and '_' or '^'
            if float (wrds.get ('default-y', '0')) < 0: plc = '_'
            wrdstxt += (wrds.text or '').replace ('"','\\"').replace ('\n', '\\n')
        wrdstxt = wrdstxt.strip ()
        for key, val in dynamics_map.items ():
            if dirtyp.find ('dynamics/' + key) != None:
                s.msc.appendElem (vs, val, 1) # to voice
        if dirtyp.find ('coda') != None: s.msc.appendElem (vs, 'O', 1)
        if dirtyp.find ('segno') != None: s.msc.appendElem (vs, 'S', 1)
        t = dirtyp.find ('octave-shift')
        if t != None: startStop ('octave-shift', vs, stf) # assume size == 8 for the time being
        t = dirtyp.find ('pedal')
        if t != None and s.ped:
            if not s.pedVce: s.pedVce = vs # all pedal marks go to one voice
            startStop ('pedal', s.pedVce)
        if dirtyp.findtext ('other-direction') == 'diatonic fretting': s.diafret = 1;
    if tempo:
        tempo = '%.0f' % float (tempo) # hope it is a number and insert in voice 1
        if s.msc.tijd == 0 and s.msr.ixm == 0: # first measure -> header
            abcOut.tempo = tempo
            abcOut.tempo_units = tempo_units
        else:
            s.msc.appendElem (v1, '[Q:%d/%d=%s]' % (tempo_units [0], tempo_units [1], tempo)) # otherwise -> 1st voice
    # plc was set inside the words loop whenever wrdstxt became non-empty
    if wrdstxt: s.msc.appendElem (vs, '"%s%s"' % (plc, wrdstxt), 1) # to voice, but after tempo
|
| 1271 |
+
|
| 1272 |
+
def doHarmony (s, e, i, es):    # parse a musicXMl harmony tag
    """Translate a MusicXML <harmony> element into an ABC chord symbol.

    e is the <harmony> element, i its index within es (the measure's child
    elements).  The target voice vt is taken from findVoice; the chord string
    (e.g. "C#m7b5/G") is appended to that voice as a text annotation.
    """
    _, vt, _ = s.findVoice (i, es)      # voice that receives the chord symbol
    # abbreviations for plain triad/seventh chord kinds
    short = {'major':'', 'minor':'m', 'augmented':'+', 'diminished':'dim', 'dominant':'7', 'half-diminished':'m7b5'}
    accmap = {'major':'maj', 'dominant':'', 'minor':'m', 'diminished':'dim', 'augmented':'+', 'suspended':'sus'}
    modmap = {'second':'2', 'fourth':'4', 'seventh':'7', 'sixth':'6', 'ninth':'9', '11th':'11', '13th':'13'}
    altmap = {'1':'#', '0':'', '-1':'b'}    # xml alter value -> accidental text
    root = e.findtext ('root/root-step','')
    alt = altmap.get (e.findtext ('root/root-alter'), '')
    sus = ''
    kind = e.findtext ('kind', '')
    if kind in short: kind = short [kind]
    elif '-' in kind:   # xml chord names: <triad name>-<modification>
        triad, mod = kind.split ('-')
        kind = accmap.get (triad, '') + modmap.get (mod, '')
        if kind.startswith ('sus'): kind, sus = '', kind    # sus-suffix goes to the end
    elif kind == 'none': kind = e.find ('kind').get ('text','')     # fall back to the element's display text
    degrees = e.findall ('degree')
    for d in degrees:   # chord alterations, e.g. b5, #9
        kind += altmap.get (d.findtext ('degree-alter'),'') + d.findtext ('degree-value','')
    kind = kind.replace ('79','9').replace ('713','13').replace ('maj6','6')    # normalize common combinations
    bass = e.findtext ('bass/bass-step','') + altmap.get (e.findtext ('bass/bass-alter'),'')
    s.msc.appendElem (vt, '"%s%s%s%s%s"' % (root, alt, kind, sus, bass and '/' + bass), 1)  # 1 -> to voice
def doBarline (s, e):           # 0 = no repeat, 1 = begin repeat, 2 = end repeat
    """Translate a MusicXML <barline> element.

    Updates the current measure s.msr (left/right barline style, volta
    number) and returns a repeat code: 0 = no repeat, 1 = begin repeat,
    2 = end repeat (only meaningful when s.unfold is set, where the caller
    uses it to unfold the repetition instead of writing repeat barlines).
    """
    rep = e.find ('repeat')
    if rep != None: rep = rep.get ('direction')
    if s.unfold:                # unfold repeat, don't translate barlines
        return rep and (rep == 'forward' and 1 or 2) or 0
    loc = e.get ('location', 'right')   # right is the default
    if loc == 'right':          # only change style for the right side
        style = e.findtext ('bar-style')
        if style == 'light-light': s.msr.rline = '||'
        elif style == 'light-heavy': s.msr.rline = '|]'
    if rep != None:             # repeat found
        if rep == 'forward': s.msr.lline = ':'
        else: s.msr.rline = ':|'        # override barline style
    end = e.find ('ending')
    if end != None:
        if end.get ('type') == 'start':
            n = end.get ('number', '1').replace ('.','').replace (' ','')
            # was a bare except: narrow to ValueError so only a genuinely
            # non-numeric volta number takes the quoted-string fallback
            try: list (map (int, n.split (',')))    # should be a list of integers
            except ValueError: n = '"%s"' % n.strip ()  # illegal musicXML
            s.msr.lnum = n      # assume a start is always at the beginning of a measure
        elif s.msr.rline == '|':        # stop and discontinue the same in ABC ?
            s.msr.rline = '||'          # to stop on a normal barline use || in ABC ?
    return 0
def doPrint (s, e):
    """Handle a <print> element: return '$' (an ABC line break) when the
    element starts a new system or a new page, unless line breaks were
    suppressed with the -x option (s.nolbrk); otherwise return None."""
    starts_new_line = 'yes' in (e.get ('new-system'), e.get ('new-page'))
    if starts_new_line and not s.nolbrk:
        return '$'      # a line break
def doPartList (s, e):  # translate the start/stop-event-based xml-partlist into proper tree
    """Collect per-part midi settings and build the nested part/group tree.

    Appends one {instrument-id -> [channel, program, volume, pan]} dict per
    score-part to s.instMid, records unpitched percussion pitches in
    s.drumInst, then converts the flat <part-list> into a nested partlist
    via the module-level helpers getPartlist/parseParts.
    """
    for sp in e.findall ('part-list/score-part'):
        midi = {}
        for m in sp.findall ('midi-instrument'):
            # channel, program, volume, pan — with defaults from s.midDflt
            x = [m.findtext (p, s.midDflt [i]) for i,p in enumerate (['midi-channel','midi-program','volume','pan'])]
            pan = float (x[3])
            if pan >= -90 and pan <= 90:    # would be better to map behind-pannings
                pan = (float (x[3]) + 90) / 180 * 127   # xml between -90 and +90
            midi [m.get ('id')] = [int (x[0]), int (x[1]), float (x[2]) * 1.27, pan]    # volume 100 -> midi 127
            up = m.findtext ('midi-unpitched')
            if up: s.drumInst [m.get ('id')] = int (up) - 1     # store midi-pitch for channel 10 notes
        s.instMid.append (midi)
    ps = e.find ('part-list')               # partlist = [groupelem]
    xs = getPartlist (ps)                   # groupelem = partname | grouplist
    partlist, _ = parseParts (xs, {}, [])   # grouplist = [groupelem, ..., groupdata]
    return partlist                         # groupdata = [group-symbol, group-barline, group-name, group-abbrev]
def mkTitle (s, e):
    """Build the ABC header title/composer/lyricist fields (T:, C:, Z:)
    from the MusicXML work/movement titles, creators and credit texts,
    and store the result in the global abcOut.title.

    Also sets s.isSib when the file was written by Sibelius (its voice
    numbering needs a workaround elsewhere).
    """
    def filterCredits (y):  # y == filter level, higher filters less
        cs = []
        for x in credits:   # skip redundant credit lines
            if y < 6 and (x in title or x in mvttl): continue   # sure skip
            if y < 5 and (x in composer or x in lyricist): continue     # almost sure skip
            if y < 4 and ((title and title in x) or (mvttl and mvttl in x)): continue   # may skip too much
            if y < 3 and ([1 for c in composer if c in x] or [1 for c in lyricist if c in x]): continue  # skips too much
            if y < 2 and re.match (r'^[\d\W]*$', x): continue   # line only contains numbers and punctuation
            cs.append (x)
        if y == 0 and (title + mvttl): cs = ''  # default: only credit when no title set
        return cs
    title = e.findtext ('work/work-title', '').strip ()
    mvttl = e.findtext ('movement-title', '').strip ()
    composer, lyricist, credits = [], [], []
    for creator in e.findall ('identification/creator'):
        if creator.text:
            if creator.get ('type') == 'composer':
                composer += [line.strip () for line in creator.text.split ('\n')]
            elif creator.get ('type') in ('lyricist', 'transcriber'):
                lyricist += [line.strip () for line in creator.text.split ('\n')]
    for rights in e.findall ('identification/rights'):
        if rights.text:     # rights lines go into the Z: field as well
            lyricist += [line.strip () for line in rights.text.split ('\n')]
    for credit in e.findall('credit'):
        # concatenate all credit-words of one credit, collapse line breaks
        cs = ''.join (e.text or '' for e in credit.findall('credit-words'))
        credits += [re.sub (r'\s*[\r\n]\s*', ' ', cs)]
    credits = filterCredits (s.ctf)     # s.ctf = credit filter level from the -c option
    if title: title = 'T:%s\n' % title.replace ('\n', '\nT:')
    if mvttl: title += 'T:%s\n' % mvttl.replace ('\n', '\nT:')
    if credits: title += '\n'.join (['T:%s' % c for c in credits]) + '\n'
    if composer: title += '\n'.join (['C:%s' % c for c in composer]) + '\n'
    if lyricist: title += '\n'.join (['Z:%s' % c for c in lyricist]) + '\n'
    if title: abcOut.title = title[:-1]     # strip the trailing newline
    s.isSib = 'Sibelius' in (e.findtext ('identification/encoding/software') or '')
    if s.isSib: info ('Sibelius MusicXMl is unreliable')
def doDefaults (s, e):
    """Translate the MusicXML <defaults> element into the abcm2ps page
    format values in the global abcOut.pageFmt (only when the -pf option,
    s.doPageFmt, was given; command-line values take precedence)."""
    if not s.doPageFmt: return  # return if -pf option absent
    d = e.find ('defaults')
    if d == None: return
    mils = d.findtext ('scaling/millimeters')   # mils == staff height (mm)
    tenths = d.findtext ('scaling/tenths')      # staff height in tenths
    if not mils or not tenths: return
    xmlScale = float (mils) / float (tenths) / 10   # tenths -> mm
    space = 10 * xmlScale   # space between staff lines == 10 tenths
    abcScale = space / 0.2117   # 0.2117 cm = 6pt = space between staff lines for scale = 1.0 in abcm2ps
    abcOut.pageFmt ['scale'] = abcScale
    eks = 2 * ['page-layout/'] + 4 * ['page-layout/page-margins/']
    eks = [a+b for a,b in zip (eks, 'page-height,page-width,left-margin,right-margin,top-margin,bottom-margin'.split (','))]
    for i in range (6):
        v = d.findtext (eks [i])
        k = abcOut.pagekeys [i+1]   # pagekeys [0] == scale already done, skip it
        if not abcOut.pageFmt [k] and v:    # command line overrides xml value
            # fixed: the message was passed as info(msg, tuple) which left the
            # %s placeholders unfilled; interpolate the values into the string.
            # Also narrowed the bare except to ValueError (float parse failure).
            try: abcOut.pageFmt [k] = float (v) * xmlScale  # -> cm
            except ValueError: info ('illegal value %s for xml element %s' % (v, eks [i])); continue     # just skip illegal values
def locStaffMap (s, part, maten):   # map voice to staff with majority voting
    """Assign each XML voice of this part to the staff on which most of its
    notes occur (majority voting, ties won by the higher staff number) and
    rebuild the per-part voice bookkeeping:

      s.msc.vnums  - all voice numbers used in this part
      s.vceInst    - voice -> instrument id
      s.hasStems   - voices having at least one stemmed note (for tab keys)
      s.stfMap     - staff -> [voices], s.clefMap reset to {}
      s.vce2stf / s.curStf - voice -> chosen staff
    """
    alloc = {}                      # {voice -> {staff -> occurrence count}}
    s.vceInst = {}                  # {voice -> instrument id} for this part
    s.msc.vnums = {}                # voice id's
    s.hasStems = {}                 # XML voice nums with at least one note with a stem (for tab key)
    s.stfMap, s.clefMap = {}, {}    # staff -> [voices], staff -> clef
    for note in part.findall ('measure/note'):  # count staff allocations for all notes
        v = int (note.findtext ('voice', '1'))
        if s.isSib:                 # repair bug in Sibelius voice numbering
            v += 100 * int (note.findtext ('staff', '1'))
        s.msc.vnums [v] = 1         # collect all used voice id's in this part
        sn = int (note.findtext ('staff', '1'))
        s.stfMap [sn] = []          # voices are filled in after the vote below
        counts = alloc.setdefault (v, {})
        counts [sn] = counts.get (sn, 0) + 1    # one more allocation of v on staff sn
        inst = note.find ('instrument')
        if inst != None: s.vceInst [v] = inst.get ('id')
        stem = note.findtext ('stem')
        if note.find ('rest') == None and stem != 'none':
            s.hasStems [v] = 1      # XML voice v has at least one stem
    voice_ids = list (alloc.keys ())
    if s.jscript or s.isSib: voice_ids.sort ()
    for v in voice_ids:             # choose staff with most allocations for each voice
        _, winner = max ((cnt, sn) for sn, cnt in alloc [v].items ())
        s.stfMap [winner].append (v)
        s.vce2stf [v] = winner      # reverse map
        s.curStf [v] = winner       # current staff of XML voice v
def addStaffMap (s, vvmap):     # vvmap: xml voice number -> global abc voice number
    """Append this part's staff layout to the global staff map s.gStfMap and
    assign a clef string to every ABC voice in abcOut.clefs.

    Each staff contributes one list of ABC voice numbers; tab clefs get
    'nostems' when no note of the voice had a stem, and 'diafret' when
    diatonic fretting was detected for the part.
    """
    part = []   # default: brace on staffs of one part
    for stf, voices in sorted (s.stfMap.items ()):  # s.stfMap has xml staff and voice numbers
        locmap = [vvmap [iv] for iv in voices if iv in vvmap]
        nostem = [(iv not in s.hasStems) for iv in voices if iv in vvmap]   # same order as locmap
        if locmap:  # abc voice number of staff stf
            part.append (locmap)
            clef = s.clefMap.get (stf, 'treble')    # {xml staff number -> clef}
            for i, iv in enumerate (locmap):
                clef_attr = ''
                if clef.startswith ('tab'):
                    if nostem [i] and 'nostems' not in clef: clef_attr = ' nostems'
                    if s.diafret and 'diafret' not in clef: clef_attr += ' diafret'     # for all voices in the part
                abcOut.clefs [iv] = clef + clef_attr    # add nostems when all notes of voice had no stem
    s.gStfMap.append (part)
def addMidiMap (s, ip, vvmap):  # map abc voices to midi settings
    """Extend s.midiMap with the midi settings (plus percussion note maps)
    of every ABC voice of part ip, in ABC voice order, and emit %%map
    definitions for tablature voices into s.tabVceMap.

    vvmap: xml voice number -> global abc voice number.
    """
    instr = s.instMid [ip]      # get the midi settings for this part
    if instr.values (): defInstr = list(instr.values ())[0]     # default settings = first instrument
    else: defInstr = s.midDflt  # no instruments defined
    xs = []
    for v, vabc in vvmap.items ():  # xml voice num, abc voice num
        ks = sorted (s.drumNotes.items ())
        ds = [(nt, step, midi, head) for (vd, nt), (step, midi, head) in ks if v == vd]     # map perc notes
        id = s.vceInst.get (v, '')  # get the instrument-id for part with multiple instruments
        if id in instr:             # id is defined as midi-instrument in part-list
            xs.append ((vabc, instr [id] + ds))     # get midi settings for id
        else: xs.append ((vabc, defInstr + ds))     # only one instrument for this part
    xs.sort ()  # put abc voices in order
    s.midiMap.extend ([midi for v, midi in xs])
    snaarmap = ['E','G','B','d', 'f', 'a', "c'", "e'"]  # string number -> printed string name
    diamap = '0,1-,1,1+,2,3,3,4,4,5,6,6+,7,8-,8,8+,9,10,10,11,11,12,13,13+,14'.split (',')  # chromatic -> diatonic fret
    for k in sorted (s.tabmap.keys ()):     # add %%map's for all tab voices
        v, noot = k;
        snaar, fret = s.tabmap [k];
        if s.diafret: fret = diamap [int (fret)]
        vabc = vvmap [v]
        snaar = s.stafflines - int (snaar)  # count strings from the top line
        xs = s.tabVceMap.get (vabc, [])
        xs.append ('%%%%map tab%d %s print=%s heads=kop%s\n' % (vabc, noot, snaarmap [snaar], fret))
        s.tabVceMap [vabc] = xs
        s.koppen [fret] = 1     # collect noteheads for SVG defs
def parse (s, fobj):
    """Parse one MusicXML file object and write its ABC translation.

    Iterates all <part> elements; for each part walks the measures,
    dispatching each child element to the appropriate do* handler, handles
    simple-repeat unfolding, then merges the per-part voice maps and emits
    the ABC header and body through the global abcOut.
    """
    vvmapAll = {}       # collect xml->abc voice maps (vvmap) of all parts
    e = E.parse (fobj)
    s.mkTitle (e)
    s.doDefaults (e)
    partlist = s.doPartList (e)
    parts = e.findall ('part')
    for ip, p in enumerate (parts):
        maten = p.findall ('measure')
        s.locStaffMap (p, maten)    # {voice -> staff} for this part
        s.drumNotes = {}    # (xml voice, abc note) -> (midi note, note head)
        s.clefOct = {}      # xml staff number -> current clef-octave-change
        s.curClef = {}      # xml staff number -> current abc clef
        s.stemDir = {}      # xml voice number -> current stem direction
        s.tabmap = {}       # (xml voice, abc note) -> (string, fret)
        s.diafret = 0       # use diatonic fretting
        s.stafflines = 5
        s.msc.initVoices (newPart = 1)  # create all voices
        aantalHerhaald = 0  # keep track of number of repetitions
        herhaalMaat = 0     # target measure of the repetition
        divisions = []      # current value of <divisions> for each measure
        s.msr = Measure (ip)    # various measure data
        while s.msr.ixm < len (maten):
            # (removed leftover debug code that printed a blank line
            #  for part 31, measure 405)
            maat = maten [s.msr.ixm]
            herhaal, lbrk = 0, ''
            s.msr.reset ()
            s.curalts = {}  # passing accidentals are reset each measure
            es = list (maat)
            for i, e in enumerate (es):
                if e.tag == 'note': s.doNote (e)
                elif e.tag == 'attributes': s.doAttr (e)
                elif e.tag == 'direction': s.doDirection (e, i, es)
                elif e.tag == 'sound': s.doDirection (maat, i, es)  # sound element directly in measure!
                elif e.tag == 'harmony': s.doHarmony (e, i, es)
                elif e.tag == 'barline': herhaal = s.doBarline (e)
                elif e.tag == 'backup':
                    dt = int (e.findtext ('duration'))
                    if chkbug (dt, s.msr): s.msc.incTime (-dt)
                elif e.tag == 'forward':
                    dt = int (e.findtext ('duration'))
                    if chkbug (dt, s.msr): s.msc.incTime (dt)
                elif e.tag == 'print': lbrk = s.doPrint (e)
            s.msc.addBar (lbrk, s.msr)
            divisions.append (s.msr.divs)
            if herhaal == 1:        # begin repeat: remember the jump target
                herhaalMaat = s.msr.ixm
                s.msr.ixm += 1
            elif herhaal == 2:      # end repeat
                if aantalHerhaald < 1:  # jump back once
                    s.msr.ixm = herhaalMaat
                    aantalHerhaald += 1
                else:
                    aantalHerhaald = 0  # reset
                    s.msr.ixm += 1      # just continue
            else: s.msr.ixm += 1        # on to the next measure
        for rv in s.repeat_str.values ():   # close hanging measure-repeats without stop
            rv [0] = '[I:repeat %s %d]' % (rv [1], 1)
        vvmap = s.msc.outVoices (divisions, ip, s.isSib)
        s.addStaffMap (vvmap)   # update global staff map
        s.addMidiMap (ip, vvmap)
        vvmapAll.update (vvmap)
    if vvmapAll:    # skip output if no part has any notes
        abcOut.mkHeader (s.gStfMap, partlist, s.midiMap, s.tabVceMap, s.koppen)
        abcOut.writeall ()
    else: info ('nothing written, %s has no notes ...' % abcOut.fnmext)
if __name__ == '__main__':
|
| 1546 |
+
from optparse import OptionParser
|
| 1547 |
+
from glob import glob
|
| 1548 |
+
from zipfile import ZipFile
|
| 1549 |
+
ustr = '%prog [-h] [-u] [-m] [-c C] [-d D] [-n CPL] [-b BPL] [-o DIR] [-v V]\n'
|
| 1550 |
+
ustr += '[-x] [-p PFMT] [-t] [-s] [-i] [--v1] [--noped] [--stems] <file1> [<file2> ...]'
|
| 1551 |
+
parser = OptionParser (usage=ustr, version=str(VERSION))
|
| 1552 |
+
parser.add_option ("-u", action="store_true", help="unfold simple repeats")
|
| 1553 |
+
parser.add_option ("-m", action="store", help="0 -> no %%MIDI, 1 -> minimal %%MIDI, 2-> all %%MIDI", default=0)
|
| 1554 |
+
parser.add_option ("-c", action="store", type="int", help="set credit text filter to C", default=0, metavar='C')
|
| 1555 |
+
parser.add_option ("-d", action="store", type="int", help="set L:1/D", default=0, metavar='D') # ??????????????L
|
| 1556 |
+
parser.add_option ("-n", action="store", type="int", help="CPL: max number of characters per line (default 100)", default=0, metavar='CPL')
|
| 1557 |
+
parser.add_option ("-b", action="store", type="int", help="BPL: max number of bars per line", default=0, metavar='BPL')
|
| 1558 |
+
parser.add_option ("-o", action="store", help="store abc files in DIR", default='', metavar='DIR')
|
| 1559 |
+
parser.add_option ("-v", action="store", type="int", help="set volta typesetting behaviour to V", default=0, metavar='V')
|
| 1560 |
+
parser.add_option ("-x", action="store_true", help="output no line breaks")
|
| 1561 |
+
parser.add_option ("-p", action="store", help="pageformat PFMT (cm) = scale, pageheight, pagewidth, leftmargin, rightmargin, topmargin, botmargin", default='', metavar='PFMT')
|
| 1562 |
+
parser.add_option ("-j", action="store_true", help="switch for compatibility with javascript version")
|
| 1563 |
+
parser.add_option ("-t", action="store_true", help="translate perc- and tab-staff to ABC code with %%map, %%voicemap")
|
| 1564 |
+
parser.add_option ("-s", action="store_true", help="shift node heads 3 units left in a tab staff")
|
| 1565 |
+
parser.add_option ("--v1", action="store_true", help="start-stop directions allways to first voice of staff")
|
| 1566 |
+
parser.add_option ("--noped", action="store_false", help="skip all pedal directions", dest='ped', default=True)
|
| 1567 |
+
parser.add_option ("--stems", action="store_true", help="translate stem directions", dest='stm', default=False)
|
| 1568 |
+
parser.add_option ("-i", action="store_true", help="read xml file from standard input")
|
| 1569 |
+
options, args = parser.parse_args ()
|
| 1570 |
+
if options.n < 0: parser.error ('only values >= 0')
|
| 1571 |
+
if options.b < 0: parser.error ('only values >= 0')
|
| 1572 |
+
if options.d and options.d not in [2**n for n in range (10)]:
|
| 1573 |
+
parser.error ('D should be on of %s' % ','.join ([str(2**n) for n in range (10)]))
|
| 1574 |
+
options.p = options.p and options.p.split (',') or [] # ==> [] | [string]
|
| 1575 |
+
if len (args) == 0 and not options.i: parser.error ('no input file given')
|
| 1576 |
+
pad = options.o
|
| 1577 |
+
if pad:
|
| 1578 |
+
if not os.path.exists (pad): os.mkdir (pad)
|
| 1579 |
+
if not os.path.isdir (pad): parser.error ('%s is not a directory' % pad)
|
| 1580 |
+
fnmext_list = []
|
| 1581 |
+
for i in args: fnmext_list += glob (i)
|
| 1582 |
+
if options.i: fnmext_list = ['stdin.xml']
|
| 1583 |
+
if not fnmext_list: parser.error ('none of the input files exist')
|
| 1584 |
+
for X, fnmext in enumerate (fnmext_list):
|
| 1585 |
+
fnm, ext = os.path.splitext (fnmext)
|
| 1586 |
+
if ext.lower () not in ('.xml','.mxl','.musicxml'):
|
| 1587 |
+
info ('skipped input file %s, it should have extension .xml or .mxl' % fnmext)
|
| 1588 |
+
continue
|
| 1589 |
+
if os.path.isdir (fnmext):
|
| 1590 |
+
info ('skipped directory %s. Only files are accepted' % fnmext)
|
| 1591 |
+
continue
|
| 1592 |
+
if fnmext == 'stdin.xml':
|
| 1593 |
+
fobj = sys.stdin
|
| 1594 |
+
elif ext.lower () == '.mxl': # extract .xml file from .mxl file
|
| 1595 |
+
z = ZipFile(fnmext)
|
| 1596 |
+
for n in z.namelist(): # assume there is always an xml file in a mxl archive !!
|
| 1597 |
+
if (n[:4] != 'META') and (n[-4:].lower() == '.xml'):
|
| 1598 |
+
fobj = z.open (n)
|
| 1599 |
+
break # assume only one MusicXML file per archive
|
| 1600 |
+
else:
|
| 1601 |
+
fobj = open (fnmext, 'rb') # open regular xml file
|
| 1602 |
+
|
| 1603 |
+
abcOut = ABCoutput (fnm + '.abc', pad, X, options) # create global ABC output object
|
| 1604 |
+
psr = Parser (options) # xml parser
|
| 1605 |
+
try:
|
| 1606 |
+
psr.parse (fobj) # parse file fobj and write abc to <fnm>.abc
|
| 1607 |
+
except:
|
| 1608 |
+
etype, value, traceback = sys.exc_info () # works in python 2 & 3
|
| 1609 |
+
info ('** %s occurred: %s in %s' % (etype, value, fnmext), 0)
|