Spaces:
Build error
Build error
nopyharp
#1
by
npruyne - opened
- app.py +87 -44
- requirements.txt +3 -2
app.py
CHANGED
|
@@ -15,8 +15,9 @@ import gradio as gr
|
|
| 15 |
import spaces
|
| 16 |
from vampnet.interface import Interface, signal_concat
|
| 17 |
from vampnet import mask as pmask
|
|
|
|
| 18 |
|
| 19 |
-
from pyharp import AudioLabel, LabelList
|
| 20 |
from bytecover.models.train_module import TrainModule
|
| 21 |
from bytecover.utils import initialize_logging, load_config
|
| 22 |
import pinecone
|
|
@@ -52,6 +53,8 @@ elif bytecover_module.state == "initializing":
|
|
| 52 |
|
| 53 |
bytecover_model.eval()
|
| 54 |
|
|
|
|
|
|
|
| 55 |
print("Loading CLAP model")
|
| 56 |
|
| 57 |
if torch.cuda.is_available():
|
|
@@ -75,13 +78,29 @@ def flatten_vector_embed(vector_embed):
|
|
| 75 |
def format_time(num_seconds):
    """Format a duration given in seconds as ``M:SS``.

    Args:
        num_seconds: Duration in seconds. Integers are formatted as-is;
            floats (e.g. an offset derived from a fractional chunk size)
            are truncated to whole seconds first, because the ``02d``
            format code raises ``ValueError`` when handed a float.

    Returns:
        str: Unpadded minutes, a colon, then zero-padded two-digit seconds
        (for example ``"1:05"``).
    """
    minutes, seconds = divmod(int(num_seconds), 60)
    return f"{minutes}:{seconds:02d}"
|
| 77 |
|
| 78 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
"""
|
| 80 |
This function defines the audio processing steps
|
|
|
|
| 81 |
Args:
|
| 82 |
input_audio_path (str): the audio filepath to be processed.
|
|
|
|
| 83 |
<YOUR_KWARGS>: additional keyword arguments necessary for processing.
|
| 84 |
NOTE: These should correspond to and match order of UI elements defined below.
|
|
|
|
| 85 |
Returns:
|
| 86 |
output_audio_path (str): the filepath of the processed audio.
|
| 87 |
output_labels (LabelList): any labels to display.
|
|
@@ -97,20 +116,20 @@ def bytecover(sig, chunk_size=3.0, bytecover_match_ct=3, clap_match_ct=3):
|
|
| 97 |
"""
|
| 98 |
sig_mono = sig.copy().to_mono().audio_data.squeeze(1)
|
| 99 |
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
-
chunk_samples = int(chunk_size * sig.sample_rate)
|
| 103 |
-
print(f"Chunk samples: {chunk_samples}")
|
| 104 |
-
print(f"Shape of audio: {sig_mono.shape}")
|
| 105 |
-
chunks = torch.tensor_split(sig_mono, [i for i in range(chunk_samples, sig_mono.shape[1], chunk_samples)], dim=1)
|
| 106 |
-
if chunks[-1].shape[1] < chunk_samples:
|
| 107 |
-
print("Cutting last chunk due to length")
|
| 108 |
-
chunks = tuple(list(chunks)[:-1])
|
| 109 |
-
print(f"Number of chunks: {len(chunks)}")
|
| 110 |
|
| 111 |
print("Getting Bytecover embeddings")
|
| 112 |
bytecover_embeddings = []
|
| 113 |
-
for chunk in tqdm(
|
| 114 |
result = bytecover_model.forward(chunk.to(bytecover_module.config["device"]))['f_t'].detach()
|
| 115 |
bytecover_embeddings.append(result)
|
| 116 |
|
|
@@ -118,21 +137,27 @@ def bytecover(sig, chunk_size=3.0, bytecover_match_ct=3, clap_match_ct=3):
|
|
| 118 |
|
| 119 |
print("Getting CLAP embeddings")
|
| 120 |
clap_embeddings = []
|
| 121 |
-
for chunk in tqdm(
|
| 122 |
-
result = clap_model.get_audio_embedding_from_data(chunk.detach()
|
| 123 |
clap_embeddings.append(result)
|
| 124 |
|
| 125 |
-
clean_clap_embeddings = [convert_to_npfloat64_to_list(convert_to_npfloat64(flatten_vector_embed(embedding))) for embedding in clap_embeddings]
|
| 126 |
|
| 127 |
clap_matches = []
|
| 128 |
bytecover_matches = []
|
| 129 |
match_metadatas = {}
|
| 130 |
|
| 131 |
-
|
| 132 |
|
| 133 |
times = {}
|
| 134 |
|
| 135 |
-
for clean_embeddings, pinecone_index, match_list, embedding_num, num_matches in zip([clean_bytecover_embeddings, clean_clap_embeddings], [index_bytecover, index_clap], [bytecover_matches, clap_matches], range(2), [bytecover_match_ct, clap_match_ct]):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
|
| 137 |
for i, embedding in enumerate(clean_embeddings):
|
| 138 |
|
|
@@ -156,48 +181,66 @@ def bytecover(sig, chunk_size=3.0, bytecover_match_ct=3, clap_match_ct=3):
|
|
| 156 |
print("Matches obtained!")
|
| 157 |
|
| 158 |
top_matches = sorted(match_list, key=lambda item: item[0], reverse=True)
|
|
|
|
|
|
|
| 159 |
|
| 160 |
-
for i, match in enumerate(top_matches
|
|
|
|
|
|
|
|
|
|
| 161 |
metadata = match_metadatas[match[2]]
|
| 162 |
song_artists = metadata['artists']
|
| 163 |
if type(song_artists) is list:
|
| 164 |
-
artists = '
|
| 165 |
|
| 166 |
|
| 167 |
song_title = metadata['song']
|
| 168 |
|
| 169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
|
| 173 |
match_time = match[1]
|
| 174 |
times[match_time] = times.get(match_time, 0) + 1
|
| 175 |
|
| 176 |
-
label = AudioLabel(
|
| 177 |
-
t=match_time,
|
| 178 |
-
label=f'{song_title}',
|
| 179 |
-
duration=chunk_size,
|
| 180 |
-
link=song_link,
|
| 181 |
-
description=f'Embedding: {embed_name}\n{song_title} by {song_artists}\nClick the tag to view on Spotify!',
|
| 182 |
-
amplitude=1.0 - 0.5 * (times[match_time] - 1),
|
| 183 |
-
color=AudioLabel.rgb_color_to_int(200, 170, 3, 10) if embedding_num == 1 else 0
|
| 184 |
-
)
|
| 185 |
-
|
| 186 |
# if embedding_num == 1:
|
| 187 |
-
#
|
| 188 |
# else:
|
| 189 |
-
#
|
| 190 |
-
# #label.set_color(204, 52, 235, 240)
|
| 191 |
-
|
| 192 |
-
output_labels.append(label)
|
| 193 |
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
|
| 200 |
-
return
|
| 201 |
|
| 202 |
### END BYTECOVER
|
| 203 |
|
|
@@ -641,9 +684,9 @@ with gr.Blocks() as demo:
|
|
| 641 |
gr.Button(f"use as input (feedback)")
|
| 642 |
)
|
| 643 |
|
| 644 |
-
thank_you = gr.Markdown("")
|
| 645 |
|
| 646 |
-
labels = gr.
|
| 647 |
|
| 648 |
# download all the outputs
|
| 649 |
# download = gr.File(type="filepath", label="download outputs")
|
|
|
|
| 15 |
import spaces
|
| 16 |
from vampnet.interface import Interface, signal_concat
|
| 17 |
from vampnet import mask as pmask
|
| 18 |
+
from ytmusicapi import YTMusic
|
| 19 |
|
| 20 |
+
# from pyharp import AudioLabel, LabelList
|
| 21 |
from bytecover.models.train_module import TrainModule
|
| 22 |
from bytecover.utils import initialize_logging, load_config
|
| 23 |
import pinecone
|
|
|
|
| 53 |
|
| 54 |
bytecover_model.eval()
|
| 55 |
|
| 56 |
+
ytm = YTMusic()
|
| 57 |
+
|
| 58 |
print("Loading CLAP model")
|
| 59 |
|
| 60 |
if torch.cuda.is_available():
|
|
|
|
| 78 |
def format_time(num_seconds):
    """Format a duration given in seconds as ``M:SS``.

    Args:
        num_seconds: Duration in seconds. Integers are formatted as-is;
            floats (e.g. an offset derived from a fractional chunk size)
            are truncated to whole seconds first, because the ``02d``
            format code raises ``ValueError`` when handed a float.

    Returns:
        str: Unpadded minutes, a colon, then zero-padded two-digit seconds
        (for example ``"1:05"``).
    """
    minutes, seconds = divmod(int(num_seconds), 60)
    return f"{minutes}:{seconds:02d}"
|
| 80 |
|
| 81 |
+
def chunk_audio(chunk_size, sig, sr):
    """Split audio into consecutive equal-length chunks along dimension 1.

    Args:
        chunk_size: Chunk length in seconds.
        sig: Tensor that is split along dimension 1.
        sr: Sample rate; ``int(chunk_size * sr)`` is the chunk length in
            samples.

    Returns:
        tuple: The chunks in order. A trailing chunk shorter than the chunk
        length in samples is dropped, so every returned chunk has the same
        size along dimension 1.
    """
    # Chunk audio to desired length
    samples_per_chunk = int(chunk_size * sr)
    print(f"Chunk samples: {samples_per_chunk}")
    print(f"Shape of audio: {sig.shape}")

    total_samples = sig.shape[1]
    split_points = list(range(samples_per_chunk, total_samples, samples_per_chunk))
    pieces = torch.tensor_split(sig, split_points, dim=1)

    # Only the final piece can be shorter than a full chunk; discard it.
    last_piece_length = pieces[-1].shape[1]
    if last_piece_length < samples_per_chunk:
        print("Cutting last chunk due to length")
        pieces = pieces[:-1]

    print(f"Number of chunks: {len(pieces)}")
    return pieces
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def bytecover(sig, bytecover_match_ct=3, clap_match_ct=3, chunk_size=None):
|
| 95 |
"""
|
| 96 |
This function defines the audio processing steps
|
| 97 |
+
|
| 98 |
Args:
|
| 99 |
input_audio_path (str): the audio filepath to be processed.
|
| 100 |
+
|
| 101 |
<YOUR_KWARGS>: additional keyword arguments necessary for processing.
|
| 102 |
NOTE: These should correspond to and match order of UI elements defined below.
|
| 103 |
+
|
| 104 |
Returns:
|
| 105 |
output_audio_path (str): the filepath of the processed audio.
|
| 106 |
output_labels (LabelList): any labels to display.
|
|
|
|
| 116 |
"""
|
| 117 |
sig_mono = sig.copy().to_mono().audio_data.squeeze(1)
|
| 118 |
|
| 119 |
+
if chunk_size is not None:
|
| 120 |
+
chunks = chunk_audio(chunk_size, sig_mono, sig.sample_rate)
|
| 121 |
+
bc_chunks = chunks
|
| 122 |
+
clap_chunks = chunks
|
| 123 |
+
chunk_sizes = [chunk_size, chunk_size]
|
| 124 |
+
else:
|
| 125 |
+
bc_chunks = chunk_audio(10, sig_mono, sig.sample_rate)
|
| 126 |
+
clap_chunks = chunk_audio(3, sig_mono, sig.sample_rate)
|
| 127 |
+
chunk_sizes = [10, 3]
|
| 128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
print("Getting Bytecover embeddings")
|
| 131 |
bytecover_embeddings = []
|
| 132 |
+
for chunk in tqdm(bc_chunks):
|
| 133 |
result = bytecover_model.forward(chunk.to(bytecover_module.config["device"]))['f_t'].detach()
|
| 134 |
bytecover_embeddings.append(result)
|
| 135 |
|
|
|
|
| 137 |
|
| 138 |
print("Getting CLAP embeddings")
|
| 139 |
clap_embeddings = []
|
| 140 |
+
for chunk in tqdm(clap_chunks):
|
| 141 |
+
result = clap_model.get_audio_embedding_from_data(chunk, use_tensor=True).detach()
|
| 142 |
clap_embeddings.append(result)
|
| 143 |
|
| 144 |
+
clean_clap_embeddings = [convert_to_npfloat64_to_list(convert_to_npfloat64(flatten_vector_embed(embedding.cpu()))) for embedding in clap_embeddings]
|
| 145 |
|
| 146 |
clap_matches = []
|
| 147 |
bytecover_matches = []
|
| 148 |
match_metadatas = {}
|
| 149 |
|
| 150 |
+
output_md = ""
|
| 151 |
|
| 152 |
times = {}
|
| 153 |
|
| 154 |
+
for clean_embeddings, pinecone_index, match_list, embedding_num, num_matches, chunk_size in zip([clean_bytecover_embeddings, clean_clap_embeddings], [index_bytecover, index_clap], [bytecover_matches, clap_matches], range(2), [bytecover_match_ct, clap_match_ct], chunk_sizes):
|
| 155 |
+
|
| 156 |
+
if embedding_num == 0:
|
| 157 |
+
continue
|
| 158 |
+
output_md += "# Melodic Matches\n"
|
| 159 |
+
else:
|
| 160 |
+
output_md += "# Timbral Matches\n"
|
| 161 |
|
| 162 |
for i, embedding in enumerate(clean_embeddings):
|
| 163 |
|
|
|
|
| 181 |
print("Matches obtained!")
|
| 182 |
|
| 183 |
top_matches = sorted(match_list, key=lambda item: item[0], reverse=True)
|
| 184 |
+
|
| 185 |
+
found_tracks = []
|
| 186 |
|
| 187 |
+
for i, match in enumerate(top_matches):
|
| 188 |
+
if len(found_tracks) >= num_matches:
|
| 189 |
+
break
|
| 190 |
+
#print(match[0])
|
| 191 |
metadata = match_metadatas[match[2]]
|
| 192 |
song_artists = metadata['artists']
|
| 193 |
if type(song_artists) is list:
|
| 194 |
+
artists = ', '.join(artists)
|
| 195 |
|
| 196 |
|
| 197 |
song_title = metadata['song']
|
| 198 |
|
| 199 |
+
if metadata['spotify_id'] in found_tracks:
|
| 200 |
+
continue
|
| 201 |
+
|
| 202 |
+
found_tracks.append(metadata['spotify_id'])
|
| 203 |
+
|
| 204 |
+
song_genre = metadata['genre']
|
| 205 |
|
| 206 |
+
yt_id = ytm.search(f"{song_title} {song_artists}", filter="songs", limit = 1)[0]['videoId']
|
| 207 |
+
song_link = f"https://music.youtube.com/watch?v={yt_id}&t={int(metadata['clip_num']) * 10}"
|
| 208 |
+
|
| 209 |
+
#song_link = f"https://open.spotify.com/track/{metadata['spotify_id'].split(':')[2]}"
|
| 210 |
+
|
| 211 |
+
embed_name = ['Melodic', 'Timbral'][embedding_num]
|
| 212 |
|
| 213 |
match_time = match[1]
|
| 214 |
times[match_time] = times.get(match_time, 0) + 1
|
| 215 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
# if embedding_num == 1:
|
| 217 |
+
# color = OutputLabel.rgb_color_to_int(200, 170, 3, 20)
|
| 218 |
# else:
|
| 219 |
+
# color = OutputLabel.rgb_color_to_int(204, 52, 235, 20)
|
|
|
|
|
|
|
|
|
|
| 220 |
|
| 221 |
+
# if match[0] < 0.5:
|
| 222 |
+
# color_list = min_sim_color
|
| 223 |
+
# else:
|
| 224 |
+
# color_list = [int(min_color + (match[0] - 0.5) * 2 * (max_color - min_color)) for min_color, max_color in zip(min_sim_color, max_sim_color)]
|
| 225 |
+
|
| 226 |
+
# if match[0] < 0.5:
|
| 227 |
+
# color_list = [0, 200, 0, 20]
|
| 228 |
+
|
| 229 |
+
# normalized_similarity = (match[0] - 0.5) * 2
|
| 230 |
+
# color_list = [int(min(400 * normalized_similarity, 200)), int(min(400 * (1 - normalized_similarity), 200)), 0, 20]
|
| 231 |
+
|
| 232 |
+
output_md += f'{format_time(match_time)}: \n [{song_title} by {song_artists}]({song_link}) \n Genre: {song_genre} \n Similarity: {match[0]}\n\n'
|
| 233 |
+
|
| 234 |
+
# label = AudioLabel(t=match_time,
|
| 235 |
+
# label=f'{song_title}',
|
| 236 |
+
# duration=chunk_size,
|
| 237 |
+
# link=song_link,
|
| 238 |
+
# description=f'Similarity type: {embed_name}, similarity: {match[0]}\n{song_title} by {song_artists}\nGenre: {song_genre}\nClick the tag to view on YouTube Music!',
|
| 239 |
+
# # amplitude=1.0 - 0.5 * (times[match_time] - 1),
|
| 240 |
+
# color=color)
|
| 241 |
+
|
| 242 |
|
| 243 |
+
return output_md
|
| 244 |
|
| 245 |
### END BYTECOVER
|
| 246 |
|
|
|
|
| 684 |
gr.Button(f"use as input (feedback)")
|
| 685 |
)
|
| 686 |
|
| 687 |
+
#thank_you = gr.Markdown("")
|
| 688 |
|
| 689 |
+
labels = gr.Markdown(label="output labels")
|
| 690 |
|
| 691 |
# download all the outputs
|
| 692 |
# download = gr.File(type="filepath", label="download outputs")
|
requirements.txt
CHANGED
|
@@ -18,11 +18,12 @@ laion_clap
|
|
| 18 |
nnAudio
|
| 19 |
ffmpeg-python
|
| 20 |
torchvision
|
| 21 |
-
torch
|
| 22 |
jsonlines
|
| 23 |
wandb
|
| 24 |
tqdm
|
|
|
|
|
|
|
| 25 |
# For pinecone_generate only
|
| 26 |
google-api-python-client
|
| 27 |
protoc-gen-openapiv2-protoc3-19
|
| 28 |
-
transformers==4.30.0
|
|
|
|
| 18 |
nnAudio
|
| 19 |
ffmpeg-python
|
| 20 |
torchvision
|
|
|
|
| 21 |
jsonlines
|
| 22 |
wandb
|
| 23 |
tqdm
|
| 24 |
+
spaces
|
| 25 |
+
ytmusicapi
|
| 26 |
# For pinecone_generate only
|
| 27 |
google-api-python-client
|
| 28 |
protoc-gen-openapiv2-protoc3-19
|
| 29 |
+
transformers==4.30.0
|