vampnet-bytecover

Build error

App Files Files Community

nopyharp

by npruyne - opened May 16, 2025

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+90

-46

Files changed (2) hide show

app.py +87 -44
requirements.txt +3 -2

app.py CHANGED Viewed

@@ -15,8 +15,9 @@ import gradio as gr
 import spaces
 from vampnet.interface import Interface, signal_concat
 from vampnet import mask as pmask
-from pyharp import AudioLabel, LabelList
 from bytecover.models.train_module import TrainModule
 from bytecover.utils import initialize_logging, load_config
 import pinecone
@@ -52,6 +53,8 @@ elif bytecover_module.state == "initializing":
 bytecover_model.eval()
 print("Loading CLAP model")
 if torch.cuda.is_available():
@@ -75,13 +78,29 @@ def flatten_vector_embed(vector_embed):
 def format_time(num_seconds):
     return f"{num_seconds // 60}:{num_seconds % 60:02d}"
-def bytecover(sig, chunk_size=3.0, bytecover_match_ct=3, clap_match_ct=3):
     """
     This function defines the audio processing steps
     Args:
         input_audio_path (str): the audio filepath to be processed.
         <YOUR_KWARGS>: additional keyword arguments necessary for processing.
             NOTE: These should correspond to and match order of UI elements defined below.
     Returns:
         output_audio_path (str): the filepath of the processed audio.
         output_labels (LabelList): any labels to display.
@@ -97,20 +116,20 @@ def bytecover(sig, chunk_size=3.0, bytecover_match_ct=3, clap_match_ct=3):
     """
     sig_mono = sig.copy().to_mono().audio_data.squeeze(1)
-    # Chunk audio to desired length
-    chunk_samples = int(chunk_size * sig.sample_rate)
-    print(f"Chunk samples: {chunk_samples}")
-    print(f"Shape of audio: {sig_mono.shape}")
-    chunks = torch.tensor_split(sig_mono, [i for i in range(chunk_samples, sig_mono.shape[1], chunk_samples)], dim=1)
-    if chunks[-1].shape[1] < chunk_samples:
-        print("Cutting last chunk due to length")
-        chunks = tuple(list(chunks)[:-1])
-    print(f"Number of chunks: {len(chunks)}")
     print("Getting Bytecover embeddings")
     bytecover_embeddings = []
-    for chunk in tqdm(chunks):
         result = bytecover_model.forward(chunk.to(bytecover_module.config["device"]))['f_t'].detach()
         bytecover_embeddings.append(result)
@@ -118,21 +137,27 @@ def bytecover(sig, chunk_size=3.0, bytecover_match_ct=3, clap_match_ct=3):
     print("Getting CLAP embeddings")
     clap_embeddings = []
-    for chunk in tqdm(chunks):
-        result = clap_model.get_audio_embedding_from_data(chunk.detach().cpu().numpy())
         clap_embeddings.append(result)
-    clean_clap_embeddings = [convert_to_npfloat64_to_list(convert_to_npfloat64(flatten_vector_embed(embedding))) for embedding in clap_embeddings]
     clap_matches = []
     bytecover_matches = []
     match_metadatas = {}
-    output_labels = LabelList()
     times = {}
-    for clean_embeddings, pinecone_index, match_list, embedding_num, num_matches in zip([clean_bytecover_embeddings, clean_clap_embeddings], [index_bytecover, index_clap], [bytecover_matches, clap_matches], range(2), [bytecover_match_ct, clap_match_ct]):
         for i, embedding in enumerate(clean_embeddings):
@@ -156,48 +181,66 @@ def bytecover(sig, chunk_size=3.0, bytecover_match_ct=3, clap_match_ct=3):
         print("Matches obtained!")
         top_matches = sorted(match_list, key=lambda item: item[0], reverse=True)
-        for i, match in enumerate(top_matches[:int(num_matches)]):
             metadata = match_metadatas[match[2]]
             song_artists = metadata['artists']
             if type(song_artists) is list:
-                artists = ' and '.join(artists)
             song_title = metadata['song']
-            song_link = f"https://open.spotify.com/track/{metadata['spotify_id'].split(':')[2]}"
-            embed_name = ['ByteCover', 'CLAP'][embedding_num]
             match_time = match[1]
             times[match_time] = times.get(match_time, 0) + 1
-            label = AudioLabel(
-                t=match_time,
-                label=f'{song_title}',
-                duration=chunk_size,
-                link=song_link,
-                description=f'Embedding: {embed_name}\n{song_title} by {song_artists}\nClick the tag to view on Spotify!',
-                amplitude=1.0 - 0.5 * (times[match_time] - 1),
-                color=AudioLabel.rgb_color_to_int(200, 170, 3, 10) if embedding_num == 1 else 0
-            )
             # if embedding_num == 1:
-            #     label.rgb_color_to_int(200, 170, 3, 240)
             # else:
-            #     pass
-            #     #label.set_color(204, 52, 235, 240)
-            output_labels.append(label)
-    """
-    <YOUR AUDIO SAVING CODE HERE>
-    # Save processed audio and obtain default path
-    output_audio_path = save_audio(signal, None)
-    """
-    return output_labels
 ### END BYTECOVER
@@ -641,9 +684,9 @@ with gr.Blocks() as demo:
                         gr.Button(f"use as input (feedback)")
                     )
-            thank_you = gr.Markdown("")
-            labels = gr.JSON(label="output labels")
             # download all the outputs
             # download = gr.File(type="filepath", label="download outputs")

 import spaces
 from vampnet.interface import Interface, signal_concat
 from vampnet import mask as pmask
+from ytmusicapi import YTMusic
+# from pyharp import AudioLabel, LabelList
 from bytecover.models.train_module import TrainModule
 from bytecover.utils import initialize_logging, load_config
 import pinecone
 bytecover_model.eval()
+ytm = YTMusic()
 print("Loading CLAP model")
 if torch.cuda.is_available():
 def format_time(num_seconds):
     return f"{num_seconds // 60}:{num_seconds % 60:02d}"
+def chunk_audio(chunk_size, sig, sr):
+    """Split a signal into equal-length chunks along the sample axis (dim 1).
+
+    Args:
+        chunk_size: Chunk duration; multiplied by ``sr`` to get samples per chunk.
+        sig: Tensor to split along ``dim=1``.
+        sr: Sample rate used to convert ``chunk_size`` into a sample count.
+
+    Returns:
+        Tuple of chunks, each exactly ``int(chunk_size * sr)`` samples long on
+        dim 1. A shorter trailing remainder is dropped, so the tuple is empty
+        when the signal is shorter than one chunk.
+
+    Raises:
+        ValueError: If ``chunk_size * sr`` truncates to a non-positive count.
+    """
+    chunk_samples = int(chunk_size * sr)
+    if chunk_samples <= 0:
+        # Without this guard a zero step makes range() fail with an unrelated
+        # message, and a negative step silently yields the whole signal.
+        raise ValueError(
+            f"chunk_size * sr must be at least one sample, got {chunk_samples}"
+        )
+    print(f"Chunk samples: {chunk_samples}")
+    print(f"Shape of audio: {sig.shape}")
+    split_points = list(range(chunk_samples, sig.shape[1], chunk_samples))
+    chunks = tuple(torch.tensor_split(sig, split_points, dim=1))
+    # Only the final piece can be short; drop it so every chunk has equal length.
+    if chunks[-1].shape[1] < chunk_samples:
+        print("Cutting last chunk due to length")
+        chunks = chunks[:-1]
+    print(f"Number of chunks: {len(chunks)}")
+    return chunks
+def bytecover(sig, bytecover_match_ct=3, clap_match_ct=3, chunk_size=None):
     """
     This function defines the audio processing steps
     Args:
         input_audio_path (str): the audio filepath to be processed.
         <YOUR_KWARGS>: additional keyword arguments necessary for processing.
             NOTE: These should correspond to and match order of UI elements defined below.
     Returns:
         output_audio_path (str): the filepath of the processed audio.
         output_labels (LabelList): any labels to display.
     """
     sig_mono = sig.copy().to_mono().audio_data.squeeze(1)
+    if chunk_size is not None:
+        chunks = chunk_audio(chunk_size, sig_mono, sig.sample_rate)
+        bc_chunks = chunks
+        clap_chunks = chunks
+        chunk_sizes = [chunk_size, chunk_size]
+    else:
+        bc_chunks = chunk_audio(10, sig_mono, sig.sample_rate)
+        clap_chunks = chunk_audio(3, sig_mono, sig.sample_rate)
+        chunk_sizes = [10, 3]
     print("Getting Bytecover embeddings")
     bytecover_embeddings = []
+    for chunk in tqdm(bc_chunks):
         result = bytecover_model.forward(chunk.to(bytecover_module.config["device"]))['f_t'].detach()
         bytecover_embeddings.append(result)
     print("Getting CLAP embeddings")
     clap_embeddings = []
+    for chunk in tqdm(clap_chunks):
+        result = clap_model.get_audio_embedding_from_data(chunk, use_tensor=True).detach()
         clap_embeddings.append(result)
+    clean_clap_embeddings = [convert_to_npfloat64_to_list(convert_to_npfloat64(flatten_vector_embed(embedding.cpu()))) for embedding in clap_embeddings]
     clap_matches = []
     bytecover_matches = []
     match_metadatas = {}
+    output_md = ""
     times = {}
+    for clean_embeddings, pinecone_index, match_list, embedding_num, num_matches, chunk_size in zip([clean_bytecover_embeddings, clean_clap_embeddings], [index_bytecover, index_clap], [bytecover_matches, clap_matches], range(2), [bytecover_match_ct, clap_match_ct], chunk_sizes):
+        if embedding_num == 0:
+            continue
+            output_md += "# Melodic Matches\n"
+        else:
+            output_md += "# Timbral Matches\n"
         for i, embedding in enumerate(clean_embeddings):
         print("Matches obtained!")
         top_matches = sorted(match_list, key=lambda item: item[0], reverse=True)
+        found_tracks = []
+        for i, match in enumerate(top_matches):
+            if len(found_tracks) >= num_matches:
+                break
+            #print(match[0])
             metadata = match_metadatas[match[2]]
             song_artists = metadata['artists']
             if type(song_artists) is list:
+                artists = ', '.join(artists)
             song_title = metadata['song']
+            if metadata['spotify_id'] in found_tracks:
+                continue
+            found_tracks.append(metadata['spotify_id'])
+            song_genre = metadata['genre']
+            yt_id = ytm.search(f"{song_title} {song_artists}", filter="songs", limit = 1)[0]['videoId']
+            song_link = f"https://music.youtube.com/watch?v={yt_id}&t={int(metadata['clip_num']) * 10}"
+            #song_link = f"https://open.spotify.com/track/{metadata['spotify_id'].split(':')[2]}"
+            embed_name = ['Melodic', 'Timbral'][embedding_num]
             match_time = match[1]
             times[match_time] = times.get(match_time, 0) + 1
             # if embedding_num == 1:
+            #     color = OutputLabel.rgb_color_to_int(200, 170, 3, 20)
             # else:
+            #     color = OutputLabel.rgb_color_to_int(204, 52, 235, 20)
+            # if match[0] < 0.5:
+            #     color_list = min_sim_color
+            # else:
+            #     color_list = [int(min_color + (match[0] - 0.5) * 2 * (max_color - min_color)) for min_color, max_color in zip(min_sim_color, max_sim_color)]
+            # if match[0] < 0.5:
+            #     color_list = [0, 200, 0, 20]
+            # normalized_similarity = (match[0] - 0.5) * 2
+            # color_list = [int(min(400 * normalized_similarity, 200)), int(min(400 * (1 - normalized_similarity), 200)), 0, 20]
+            output_md += f'{format_time(match_time)}: \n [{song_title} by {song_artists}]({song_link}) \n Genre: {song_genre} \n Similarity: {match[0]}\n\n'
+            # label = AudioLabel(t=match_time,
+            #                    label=f'{song_title}',
+            #                    duration=chunk_size,
+            #                    link=song_link,
+            #                    description=f'Similarity type: {embed_name}, similarity: {match[0]}\n{song_title} by {song_artists}\nGenre: {song_genre}\nClick the tag to view on YouTube Music!',
+            #                    # amplitude=1.0 - 0.5 * (times[match_time] - 1),
+            #                    color=color)
+    return output_md
 ### END BYTECOVER
                         gr.Button(f"use as input (feedback)")
                     )
+            #thank_you = gr.Markdown("")
+            labels = gr.Markdown(label="output labels")
             # download all the outputs
             # download = gr.File(type="filepath", label="download outputs")

requirements.txt CHANGED Viewed

@@ -18,11 +18,12 @@ laion_clap
 nnAudio
 ffmpeg-python
 torchvision
-torch
 jsonlines
 wandb
 tqdm
 # For pinecone_generate only
 google-api-python-client
 protoc-gen-openapiv2-protoc3-19
-transformers==4.30.0

 nnAudio
 ffmpeg-python
 torchvision
 jsonlines
 wandb
 tqdm
+spaces
+ytmusicapi
 # For pinecone_generate only
 google-api-python-client
 protoc-gen-openapiv2-protoc3-19
+transformers==4.30.0