matteospanio commited on
Commit
dcfdd2e
·
verified ·
1 Parent(s): 8de8b15

Upload folder using huggingface_hub

Browse files
Files changed (6) hide show
  1. README.md +122 -3
  2. config.json +29 -0
  3. model.safetensors +3 -0
  4. tokenizer.json +0 -0
  5. tokenizer_config.json +16 -0
  6. training_config.json +48 -0
README.md CHANGED
@@ -1,3 +1,122 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ license: apache-2.0
5
+ library_name: transformers
6
+ pipeline_tag: fill-mask
7
+ tags:
8
+ - music
9
+ - lilypond
10
+ - mlm
11
+ - music-information-retrieval
12
+ base_model: microsoft/codebert-base
13
+ datasets:
14
+ - custom
15
+ model-index:
16
+ - name: lilyBERT
17
+ results:
18
+ - task:
19
+ type: text-classification
20
+ name: Composer Classification (Linear Probe)
21
+ dataset:
22
+ type: custom
23
+ name: Mutopia (out-of-domain)
24
+ metrics:
25
+ - type: accuracy
26
+ value: 84.3
27
+ name: Composer Accuracy
28
+ - type: accuracy
29
+ value: 82.9
30
+ name: Style Accuracy
31
+ ---
32
+
33
+ # lilyBERT
34
+
35
+ **lilyBERT** is a masked language model for [LilyPond](https://lilypond.org/) music notation, built by adapting [CodeBERT](https://huggingface.co/microsoft/codebert-base) to the musical domain.
36
+
37
+ LilyPond is a text-based music engraving language with formal grammar, block structure, and backslash commands — making it structurally similar to a programming language. lilyBERT leverages this by extending CodeBERT's vocabulary with 115 domain-specific tokens (e.g. `\trill`, `\fermata`, `\mordent`, `\staccato`) and performing MLM pre-training on curated Baroque music scores.
38
+
39
+ ## Training
40
+
41
+ This checkpoint was trained in two stages:
42
+
43
+ 1. **Stage 1 — PDMX pre-training**: CodeBERT fine-tuned on the [PDMX](https://github.com/fosfrancesco/pdmx) corpus of automatically converted LilyPond files.
44
+ 2. **Stage 2 — BMdataset fine-tuning**: Further fine-tuned on the [BMdataset](https://doi.org/10.5281/zenodo.18723290), a musicologically curated collection of ~470 Baroque scores in LilyPond format (~90M tokens).
45
+
46
+ | Hyperparameter | Value |
47
+ |---|---|
48
+ | Architecture | RobertaForMaskedLM (12 layers, 768 hidden, 12 heads) |
49
+ | Vocab size | 50,380 (50,265 base + 115 music tokens) |
50
+ | Max sequence length | 512 |
51
+ | MLM probability | 0.15 |
52
+ | Batch size | 72 × 2 GPUs × 2 grad. accum. = 288 |
53
+ | Learning rate | 2e-4 (cosine schedule) |
54
+ | Warmup | 10% |
55
+ | Epochs | 10 (early stopping, patience 5) |
56
+ | Precision | bf16 |
57
+ | Optimizer | AdamW (fused) |
58
+
59
+ ## Results
60
+
61
+ Linear probing on the out-of-domain Mutopia corpus (layer 6, 5-fold CV):
62
+
63
+ | Model | Composer Acc. | Style Acc. |
64
+ |-------|:---:|:---:|
65
+ | CB + PDMX_full (15B tokens) | 80.8 | 82.6 |
66
+ | CB + BMdataset (90M tokens) | 82.9 | **83.7** |
67
+ | CB + PDMX_90M (90M tokens) | 81.7 | 82.3 |
68
+ | **CB + PDMX → BM (this model)** | **84.3** | 82.9 |
69
+
70
+ 90M tokens of expertly curated data outperform 15B tokens of automatically converted data. Combining broad pre-training with domain-specific fine-tuning yields the best overall composer accuracy.
71
+
72
+ ## Usage
73
+
74
+ ```python
75
+ from transformers import AutoModelForMaskedLM, AutoTokenizer
76
+
77
+ tokenizer = AutoTokenizer.from_pretrained("csc-unipd/lilybert")
78
+ model = AutoModelForMaskedLM.from_pretrained("csc-unipd/lilybert")
79
+ ```
80
+
81
+ ### Fill-mask example
82
+
83
+ ```python
84
+ from transformers import pipeline
85
+
86
+ filler = pipeline("fill-mask", model="csc-unipd/lilybert")
87
+ filler("\\relative c' { c4 d <mask> f | g2 g }")
88
+ ```
89
+
90
+ ### Feature extraction
91
+
92
+ ```python
93
+ import torch
94
+
95
+ inputs = tokenizer("\\relative c' { c4 d e f | g2 g }", return_tensors="pt")
96
+ with torch.no_grad():
97
+ outputs = model(**inputs, output_hidden_states=True)
98
+
99
+ # Layer 6 embeddings (best for linear probing)
100
+ embeddings = outputs.hidden_states[6]
101
+ ```
102
+
103
+ ## Citation
104
+
105
+ ```bibtex
106
+ @misc{spanio2026bmdataset,
107
+ title = {BMdataset: A Musicologically Curated LilyPond Dataset},
108
+ author = {Spanio, Matteo and Guler, Ilay and Roda, Antonio},
109
+ year = {2026},
110
+ note = {Under review},
111
+ }
112
+ ```
113
+
114
+ ## Links
115
+
116
+ - **Paper**: SMC 2026 (to appear)
117
+ - **Dataset**: [Zenodo (doi:10.5281/zenodo.18723290)](https://doi.org/10.5281/zenodo.18723290)
118
+ - **Code**: [GitHub (CSCPadova/lilybert)](https://github.com/CSCPadova/lilybert)
119
+
120
+ ## License
121
+
122
+ Apache-2.0
config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "RobertaForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "dtype": "float32",
10
+ "eos_token_id": 2,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 768,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 3072,
16
+ "is_decoder": false,
17
+ "layer_norm_eps": 1e-05,
18
+ "max_position_embeddings": 514,
19
+ "model_type": "roberta",
20
+ "num_attention_heads": 12,
21
+ "num_hidden_layers": 12,
22
+ "output_past": true,
23
+ "pad_token_id": 1,
24
+ "tie_word_embeddings": true,
25
+ "transformers_version": "5.2.0",
26
+ "type_vocab_size": 1,
27
+ "use_cache": false,
28
+ "vocab_size": 50380
29
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7740960ce5b8a6feadc9a567464544ab29471b3db4b4055c0849e8e2bac147db
3
+ size 499167664
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<s>",
5
+ "cls_token": "<s>",
6
+ "eos_token": "</s>",
7
+ "errors": "replace",
8
+ "is_local": true,
9
+ "mask_token": "<mask>",
10
+ "model_max_length": 512,
11
+ "pad_token": "<pad>",
12
+ "sep_token": "</s>",
13
+ "tokenizer_class": "TokenizersBackend",
14
+ "trim_offsets": true,
15
+ "unk_token": "<unk>"
16
+ }
training_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "data_dir": "/nfsd/voce/machine_learning/experiments/artifacts/processed",
3
+ "tokenizer_path": "/nfsd/voce/machine_learning/experiments/artifacts/tokenizer",
4
+ "output_dir": "/nfsd/voce/machine_learning/experiments/cb-pdmx-bm",
5
+ "model_architecture": "microsoft/codebert-base",
6
+ "random_init": false,
7
+ "hidden_size": 768,
8
+ "num_hidden_layers": 12,
9
+ "num_attention_heads": 12,
10
+ "intermediate_size": 3072,
11
+ "max_position_embeddings": 514,
12
+ "max_length": 512,
13
+ "mlm_probability": 0.15,
14
+ "per_device_train_batch_size": 72,
15
+ "per_device_eval_batch_size": 72,
16
+ "num_train_epochs": 10,
17
+ "learning_rate": 0.0002,
18
+ "lr_scheduler_type": "cosine",
19
+ "max_grad_norm": 1.0,
20
+ "weight_decay": 0.01,
21
+ "warmup_ratio": 0.1,
22
+ "max_steps": -1,
23
+ "logging_steps": 50,
24
+ "eval_steps": 1000,
25
+ "save_steps": 1000,
26
+ "seed": 42,
27
+ "pretokenized_shards_dir": "/nfsd/voce/machine_learning/experiments/codebert/baroque-music-shards/mlm",
28
+ "resume_from_checkpoint": null,
29
+ "dataloader_num_workers": 8,
30
+ "early_stopping": true,
31
+ "early_stopping_patience": 5,
32
+ "early_stopping_threshold": 0.0,
33
+ "bf16": true,
34
+ "gradient_accumulation_steps": 2,
35
+ "optim": "adamw_torch_fused",
36
+ "dataloader_pin_memory": true,
37
+ "dataloader_prefetch_factor": 4,
38
+ "save_total_limit": 3,
39
+ "torch_compile": false,
40
+ "ddp_find_unused_parameters": false,
41
+ "wandb_enabled": true,
42
+ "wandb_project": "lilybert",
43
+ "wandb_entity": null,
44
+ "wandb_mode": "online",
45
+ "wandb_run_name": "cb-pdmx-baroquemusic",
46
+ "tensorboard_enabled": true,
47
+ "tensorboard_log_dir": "/nfsd/voce/machine_learning/experiments/cb-pdmx-bm/tensorboard"
48
+ }