Spaces:

monetjoe
/

EMelodyGen

Running

App Files Files

admin committed on Feb 23

Commit

93fe177

1 Parent(s): bba92ce

Update Gradio version

Browse files

Files changed (5) hide show

README.md +1 -1
app.py +11 -11
generate.py +1 -1
model.py +14 -27
utils.py +15 -7

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ emoji: 🎶😆😠😟
 colorFrom: indigo
 colorTo: yellow
 sdk: gradio
-sdk_version: 5.32.0
 app_file: app.py
 pinned: false
 license: lgpl-3.0

 colorFrom: indigo
 colorTo: yellow
 sdk: gradio
+sdk_version: 6.6.0
 app_file: app.py
 pinned: false
 license: lgpl-3.0

app.py CHANGED Viewed

@@ -162,8 +162,6 @@ if __name__ == "__main__":
                     gr.Video(
                         "./demo.mp4" if EN_US else "./src/tutorial.mp4",
                         label=_L("Video demo"),
-                        show_download_button=False,
-                        show_share_button=False,
                     )
                     gr.Markdown(
                         f"## {_L('Cite')}"
@@ -215,9 +213,6 @@ if __name__ == "__main__":
                             else "./src/4q.jpg"
                         ),
                         show_label=False,
-                        show_download_button=False,
-                        show_fullscreen_button=False,
-                        show_share_button=False,
                     )
                     v_radio = gr.Radio(
                         [_L("Low"), _L("High")],
@@ -283,7 +278,11 @@ if __name__ == "__main__":
                                 save_file = gr.File(label=_L("Download template"))
             with gr.Column():
-                wav_audio = gr.Audio(label=_L("Audio"), type="filepath")
                 with gr.Accordion(label=_L("Feedback"), open=False):
                     fdb_radio = gr.Radio(
                         ["Q1", "Q2", "Q3", "Q4"],
@@ -293,7 +292,7 @@ if __name__ == "__main__":
                     )
                     fdb_btn = gr.Button(_L("Submit"))
-                status_bar = gr.Textbox(label=_L("Status"), show_copy_button=True)
                 with gr.Row():
                     mid_file = gr.File(label=_L("Download MIDI"), min_width=80)
                     pdf_file = gr.File(label=_L("Download PDF score"), min_width=80)
@@ -301,11 +300,12 @@ if __name__ == "__main__":
                     mxl_file = gr.File(label=_L("Download MXL"), min_width=80)
                 with gr.Row():
-                    abc_txt = gr.TextArea(
-                        label=_L("ABC notation"),
-                        show_copy_button=True,
                     )
-                    staff_img = gr.Image(label=_L("Staff"), type="filepath")
         # actions
         gen1_btn.click(

                     gr.Video(
                         "./demo.mp4" if EN_US else "./src/tutorial.mp4",
                         label=_L("Video demo"),
                     )
                     gr.Markdown(
                         f"## {_L('Cite')}"
                             else "./src/4q.jpg"
                         ),
                         show_label=False,
                     )
                     v_radio = gr.Radio(
                         [_L("Low"), _L("High")],
                                 save_file = gr.File(label=_L("Download template"))
             with gr.Column():
+                wav_audio = gr.Audio(
+                    label=_L("Audio"),
+                    type="filepath",
+                    buttons=["download"],
+                )
                 with gr.Accordion(label=_L("Feedback"), open=False):
                     fdb_radio = gr.Radio(
                         ["Q1", "Q2", "Q3", "Q4"],
                     )
                     fdb_btn = gr.Button(_L("Submit"))
+                status_bar = gr.Textbox(label=_L("Status"), buttons=["copy"])
                 with gr.Row():
                     mid_file = gr.File(label=_L("Download MIDI"), min_width=80)
                     pdf_file = gr.File(label=_L("Download PDF score"), min_width=80)
                     mxl_file = gr.File(label=_L("Download MXL"), min_width=80)
                 with gr.Row():
+                    abc_txt = gr.TextArea(label=_L("ABC notation"), buttons=["copy"])
+                    staff_img = gr.Image(
+                        label=_L("Staff"),
+                        type="filepath",
+                        buttons=["fullscreen", "download"],
                     )
         # actions
         gen1_btn.click(

generate.py CHANGED Viewed

@@ -113,7 +113,7 @@ def generate_music(
     )
     model = TunesFormer(patch_config, char_config, share_weights=SHARE_WEIGHTS)
     checkpoint = torch.load(weights, map_location=DEVICE)
-    model.load_state_dict(checkpoint["model"])
     model = model.to(DEVICE)
     model.eval()
     prompt = f"A:{emo}\n"

     )
     model = TunesFormer(patch_config, char_config, share_weights=SHARE_WEIGHTS)
     checkpoint = torch.load(weights, map_location=DEVICE)
+    model.load_state_dict(checkpoint["model"], strict=False)
     model = model.to(DEVICE)
     model.eval()
     prompt = f"A:{emo}\n"

model.py CHANGED Viewed

@@ -66,10 +66,8 @@ class Patchilizer:
         """
         lines = unidecode(abc_code).split("\n")
         lines = list(filter(None, lines))  # remove empty lines
         body = ""
         patches = []
         for line in lines:
             if len(line) > 1 and (
                 (line[0].isalpha() and line[1] == ":") or line.startswith("%%score")
@@ -129,7 +127,6 @@ class PatchLevelDecoder(PreTrainedModel):
         patches = torch.nn.functional.one_hot(patches, num_classes=128).float()
         patches = patches.reshape(len(patches), -1, PATCH_SIZE * 128)
         patches = self.patch_embedding(patches.to(self.device))
         return self.base(inputs_embeds=patches)
@@ -161,11 +158,9 @@ class CharLevelDecoder(PreTrainedModel):
         # preparing the labels for model training
         target_masks = target_patches == self.pad_token_id
         labels = target_patches.clone().masked_fill_(target_masks, -100)
         # masking the labels for model training
         target_masks = torch.ones_like(labels)
         target_masks = target_masks.masked_fill_(labels == -100, 0)
         # select patches
         if (
             patch_sampling_batch_size != 0
@@ -174,7 +169,6 @@ class CharLevelDecoder(PreTrainedModel):
             indices = list(range(len(target_patches)))
             random.shuffle(indices)
             selected_indices = sorted(indices[:patch_sampling_batch_size])
             target_patches = target_patches[selected_indices, :]
             target_masks = target_masks[selected_indices, :]
             encoded_patches = encoded_patches[selected_indices, :]
@@ -184,12 +178,10 @@ class CharLevelDecoder(PreTrainedModel):
         inputs_embeds = torch.nn.functional.embedding(
             target_patches, self.base.transformer.wte.weight
         )
         # concatenate the encoded patches with the input embeddings
         inputs_embeds = torch.cat(
             (encoded_patches.unsqueeze(1), inputs_embeds[:, 1:, :]), dim=1
         )
         return self.base(
             inputs_embeds=inputs_embeds, attention_mask=target_masks, labels=labels
         )
@@ -203,20 +195,14 @@ class CharLevelDecoder(PreTrainedModel):
         """
         encoded_patch = encoded_patch.reshape(1, 1, -1)
         tokens = tokens.reshape(1, -1)
         # Get input embeddings
         tokens = torch.nn.functional.embedding(tokens, self.base.transformer.wte.weight)
         # Concatenate the encoded patch with the input embeddings
         tokens = torch.cat((encoded_patch, tokens[:, 1:, :]), dim=1)
         # Get output from model
         outputs = self.base(inputs_embeds=tokens)
         # Get probabilities of next token
-        probs = torch.nn.functional.softmax(outputs.logits.squeeze(0)[-1], dim=-1)
-        return probs
 class TunesFormer(PreTrainedModel):
@@ -235,14 +221,11 @@ class TunesFormer(PreTrainedModel):
             max_layers = max(
                 encoder_config.num_hidden_layers, decoder_config.num_hidden_layers
             )
             max_context_size = max(encoder_config.max_length, decoder_config.max_length)
             max_position_embeddings = max(
                 encoder_config.max_position_embeddings,
                 decoder_config.max_position_embeddings,
             )
             encoder_config.num_hidden_layers = max_layers
             encoder_config.max_length = max_context_size
             encoder_config.max_position_embeddings = max_position_embeddings
@@ -252,7 +235,6 @@ class TunesFormer(PreTrainedModel):
         self.patch_level_decoder = PatchLevelDecoder(encoder_config)
         self.char_level_decoder = CharLevelDecoder(decoder_config)
         if share_weights:
             self.patch_level_decoder.base = self.char_level_decoder.base.transformer
@@ -268,13 +250,20 @@ class TunesFormer(PreTrainedModel):
         """
         patches = patches.reshape(len(patches), -1, PATCH_SIZE)
         encoded_patches = self.patch_level_decoder(patches)["last_hidden_state"]
         return self.char_level_decoder(
             encoded_patches.squeeze(0)[:-1, :],
             patches.squeeze(0)[1:, :],
             patch_sampling_batch_size,
         )
     def generate(
         self,
         patches: torch.Tensor,
@@ -291,13 +280,11 @@ class TunesFormer(PreTrainedModel):
         """
         patches = patches.reshape(len(patches), -1, PATCH_SIZE)
         encoded_patches = self.patch_level_decoder(patches)["last_hidden_state"]
         if tokens == None:
             tokens = torch.tensor([self.bos_token_id], device=self.device)
         generated_patch = []
         random.seed(seed)
         while True:
             if seed != None:
                 n_seed = random.randint(0, 1000000)
@@ -312,12 +299,13 @@ class TunesFormer(PreTrainedModel):
                 .detach()
                 .numpy()
             )
             prob = top_p_sampling(prob, top_p=top_p, return_probs=True)
             prob = top_k_sampling(prob, top_k=top_k, return_probs=True)
-            token = temperature_sampling(prob, temperature=temperature, seed=n_seed)
             generated_patch.append(token)
             if token == self.eos_token_id or len(tokens) >= PATCH_SIZE - 1:
                 break
@@ -333,7 +321,6 @@ class TunesFormer(PreTrainedModel):
 class PatchilizedData(Dataset):
     def __init__(self, items, patchilizer):
         self.texts = []
         for item in tqdm(items):
             text = item["control code"] + "\n".join(
                 item["abc notation"].split("\n")[1:]

         """
         lines = unidecode(abc_code).split("\n")
         lines = list(filter(None, lines))  # remove empty lines
         body = ""
         patches = []
         for line in lines:
             if len(line) > 1 and (
                 (line[0].isalpha() and line[1] == ":") or line.startswith("%%score")
         patches = torch.nn.functional.one_hot(patches, num_classes=128).float()
         patches = patches.reshape(len(patches), -1, PATCH_SIZE * 128)
         patches = self.patch_embedding(patches.to(self.device))
         return self.base(inputs_embeds=patches)
         # preparing the labels for model training
         target_masks = target_patches == self.pad_token_id
         labels = target_patches.clone().masked_fill_(target_masks, -100)
         # masking the labels for model training
         target_masks = torch.ones_like(labels)
         target_masks = target_masks.masked_fill_(labels == -100, 0)
         # select patches
         if (
             patch_sampling_batch_size != 0
             indices = list(range(len(target_patches)))
             random.shuffle(indices)
             selected_indices = sorted(indices[:patch_sampling_batch_size])
             target_patches = target_patches[selected_indices, :]
             target_masks = target_masks[selected_indices, :]
             encoded_patches = encoded_patches[selected_indices, :]
         inputs_embeds = torch.nn.functional.embedding(
             target_patches, self.base.transformer.wte.weight
         )
         # concatenate the encoded patches with the input embeddings
         inputs_embeds = torch.cat(
             (encoded_patches.unsqueeze(1), inputs_embeds[:, 1:, :]), dim=1
         )
         return self.base(
             inputs_embeds=inputs_embeds, attention_mask=target_masks, labels=labels
         )
         """
         encoded_patch = encoded_patch.reshape(1, 1, -1)
         tokens = tokens.reshape(1, -1)
         # Get input embeddings
         tokens = torch.nn.functional.embedding(tokens, self.base.transformer.wte.weight)
         # Concatenate the encoded patch with the input embeddings
         tokens = torch.cat((encoded_patch, tokens[:, 1:, :]), dim=1)
         # Get output from model
         outputs = self.base(inputs_embeds=tokens)
         # Get probabilities of next token
+        return torch.nn.functional.softmax(outputs.logits.squeeze(0)[-1], dim=-1)
 class TunesFormer(PreTrainedModel):
             max_layers = max(
                 encoder_config.num_hidden_layers, decoder_config.num_hidden_layers
             )
             max_context_size = max(encoder_config.max_length, decoder_config.max_length)
             max_position_embeddings = max(
                 encoder_config.max_position_embeddings,
                 decoder_config.max_position_embeddings,
             )
             encoder_config.num_hidden_layers = max_layers
             encoder_config.max_length = max_context_size
             encoder_config.max_position_embeddings = max_position_embeddings
         self.patch_level_decoder = PatchLevelDecoder(encoder_config)
         self.char_level_decoder = CharLevelDecoder(decoder_config)
         if share_weights:
             self.patch_level_decoder.base = self.char_level_decoder.base.transformer
         """
         patches = patches.reshape(len(patches), -1, PATCH_SIZE)
         encoded_patches = self.patch_level_decoder(patches)["last_hidden_state"]
         return self.char_level_decoder(
             encoded_patches.squeeze(0)[:-1, :],
             patches.squeeze(0)[1:, :],
             patch_sampling_batch_size,
         )
+    def norm(self, prob):
+        prob = [float(x) for x in prob]
+        s = sum(prob)
+        if s == 0:
+            raise ValueError("全零概率")
+        return [x / s for x in prob]
     def generate(
         self,
         patches: torch.Tensor,
         """
         patches = patches.reshape(len(patches), -1, PATCH_SIZE)
         encoded_patches = self.patch_level_decoder(patches)["last_hidden_state"]
         if tokens == None:
             tokens = torch.tensor([self.bos_token_id], device=self.device)
         generated_patch = []
         random.seed(seed)
         while True:
             if seed != None:
                 n_seed = random.randint(0, 1000000)
                 .detach()
                 .numpy()
             )
             prob = top_p_sampling(prob, top_p=top_p, return_probs=True)
             prob = top_k_sampling(prob, top_k=top_k, return_probs=True)
+            token = temperature_sampling(
+                self.norm(prob),
+                temperature=temperature,
+                seed=n_seed,
+            )
             generated_patch.append(token)
             if token == self.eos_token_id or len(tokens) >= PATCH_SIZE - 1:
                 break
 class PatchilizedData(Dataset):
     def __init__(self, items, patchilizer):
         self.texts = []
         for item in tqdm(items):
             text = item["control code"] + "\n".join(
                 item["abc notation"].split("\n")[1:]

utils.py CHANGED Viewed

@@ -5,19 +5,27 @@ import torch
 import warnings
 import requests
 import subprocess
-import modelscope
-import huggingface_hub
 from tqdm import tqdm
 warnings.filterwarnings("ignore")
 TEMP_DIR = "./__pycache__"
 EN_US = os.getenv("LANG") != "zh_CN.UTF-8"
-WEIGHTS_DIR = (
-    huggingface_hub.snapshot_download("monetjoe/EMelodyGen", cache_dir=TEMP_DIR)
-    if EN_US
-    else modelscope.snapshot_download("monetjoe/EMelodyGen", cache_dir=TEMP_DIR)
-)
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 PATCH_LENGTH = 128  # Patch Length
 PATCH_SIZE = 32  # Patch Size

 import warnings
 import requests
 import subprocess
 from tqdm import tqdm
 warnings.filterwarnings("ignore")
 TEMP_DIR = "./__pycache__"
 EN_US = os.getenv("LANG") != "zh_CN.UTF-8"
+if EN_US:
+    import huggingface_hub
+    WEIGHTS_DIR = huggingface_hub.snapshot_download(
+        "monetjoe/EMelodyGen",
+        cache_dir=TEMP_DIR,
+    )
+else:
+    import modelscope
+    WEIGHTS_DIR = modelscope.snapshot_download(
+        "monetjoe/EMelodyGen",
+        cache_dir=TEMP_DIR,
+    )
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 PATCH_LENGTH = 128  # Patch Length
 PATCH_SIZE = 32  # Patch Size