Files changed (2) hide show
  1. app.py +87 -44
  2. requirements.txt +3 -2
app.py CHANGED
@@ -15,8 +15,9 @@ import gradio as gr
15
  import spaces
16
  from vampnet.interface import Interface, signal_concat
17
  from vampnet import mask as pmask
 
18
 
19
- from pyharp import AudioLabel, LabelList
20
  from bytecover.models.train_module import TrainModule
21
  from bytecover.utils import initialize_logging, load_config
22
  import pinecone
@@ -52,6 +53,8 @@ elif bytecover_module.state == "initializing":
52
 
53
  bytecover_model.eval()
54
 
 
 
55
  print("Loading CLAP model")
56
 
57
  if torch.cuda.is_available():
@@ -75,13 +78,29 @@ def flatten_vector_embed(vector_embed):
75
  def format_time(num_seconds):
76
  return f"{num_seconds // 60}:{num_seconds % 60:02d}"
77
 
78
- def bytecover(sig, chunk_size=3.0, bytecover_match_ct=3, clap_match_ct=3):
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  """
80
  This function defines the audio processing steps
 
81
  Args:
82
  input_audio_path (str): the audio filepath to be processed.
 
83
  <YOUR_KWARGS>: additional keyword arguments necessary for processing.
84
  NOTE: These should correspond to and match order of UI elements defined below.
 
85
  Returns:
86
  output_audio_path (str): the filepath of the processed audio.
87
  output_labels (LabelList): any labels to display.
@@ -97,20 +116,20 @@ def bytecover(sig, chunk_size=3.0, bytecover_match_ct=3, clap_match_ct=3):
97
  """
98
  sig_mono = sig.copy().to_mono().audio_data.squeeze(1)
99
 
100
- # Chunk audio to desired length
 
 
 
 
 
 
 
 
101
 
102
- chunk_samples = int(chunk_size * sig.sample_rate)
103
- print(f"Chunk samples: {chunk_samples}")
104
- print(f"Shape of audio: {sig_mono.shape}")
105
- chunks = torch.tensor_split(sig_mono, [i for i in range(chunk_samples, sig_mono.shape[1], chunk_samples)], dim=1)
106
- if chunks[-1].shape[1] < chunk_samples:
107
- print("Cutting last chunk due to length")
108
- chunks = tuple(list(chunks)[:-1])
109
- print(f"Number of chunks: {len(chunks)}")
110
 
111
  print("Getting Bytecover embeddings")
112
  bytecover_embeddings = []
113
- for chunk in tqdm(chunks):
114
  result = bytecover_model.forward(chunk.to(bytecover_module.config["device"]))['f_t'].detach()
115
  bytecover_embeddings.append(result)
116
 
@@ -118,21 +137,27 @@ def bytecover(sig, chunk_size=3.0, bytecover_match_ct=3, clap_match_ct=3):
118
 
119
  print("Getting CLAP embeddings")
120
  clap_embeddings = []
121
- for chunk in tqdm(chunks):
122
- result = clap_model.get_audio_embedding_from_data(chunk.detach().cpu().numpy())
123
  clap_embeddings.append(result)
124
 
125
- clean_clap_embeddings = [convert_to_npfloat64_to_list(convert_to_npfloat64(flatten_vector_embed(embedding))) for embedding in clap_embeddings]
126
 
127
  clap_matches = []
128
  bytecover_matches = []
129
  match_metadatas = {}
130
 
131
- output_labels = LabelList()
132
 
133
  times = {}
134
 
135
- for clean_embeddings, pinecone_index, match_list, embedding_num, num_matches in zip([clean_bytecover_embeddings, clean_clap_embeddings], [index_bytecover, index_clap], [bytecover_matches, clap_matches], range(2), [bytecover_match_ct, clap_match_ct]):
 
 
 
 
 
 
136
 
137
  for i, embedding in enumerate(clean_embeddings):
138
 
@@ -156,48 +181,66 @@ def bytecover(sig, chunk_size=3.0, bytecover_match_ct=3, clap_match_ct=3):
156
  print("Matches obtained!")
157
 
158
  top_matches = sorted(match_list, key=lambda item: item[0], reverse=True)
 
 
159
 
160
- for i, match in enumerate(top_matches[:int(num_matches)]):
 
 
 
161
  metadata = match_metadatas[match[2]]
162
  song_artists = metadata['artists']
163
  if type(song_artists) is list:
164
- artists = ' and '.join(artists)
165
 
166
 
167
  song_title = metadata['song']
168
 
169
- song_link = f"https://open.spotify.com/track/{metadata['spotify_id'].split(':')[2]}"
 
 
 
 
 
170
 
171
- embed_name = ['ByteCover', 'CLAP'][embedding_num]
 
 
 
 
 
172
 
173
  match_time = match[1]
174
  times[match_time] = times.get(match_time, 0) + 1
175
 
176
- label = AudioLabel(
177
- t=match_time,
178
- label=f'{song_title}',
179
- duration=chunk_size,
180
- link=song_link,
181
- description=f'Embedding: {embed_name}\n{song_title} by {song_artists}\nClick the tag to view on Spotify!',
182
- amplitude=1.0 - 0.5 * (times[match_time] - 1),
183
- color=AudioLabel.rgb_color_to_int(200, 170, 3, 10) if embedding_num == 1 else 0
184
- )
185
-
186
  # if embedding_num == 1:
187
- # label.rgb_color_to_int(200, 170, 3, 240)
188
  # else:
189
- # pass
190
- # #label.set_color(204, 52, 235, 240)
191
-
192
- output_labels.append(label)
193
 
194
- """
195
- <YOUR AUDIO SAVING CODE HERE>
196
- # Save processed audio and obtain default path
197
- output_audio_path = save_audio(signal, None)
198
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
 
200
- return output_labels
201
 
202
  ### END BYTECOVER
203
 
@@ -641,9 +684,9 @@ with gr.Blocks() as demo:
641
  gr.Button(f"use as input (feedback)")
642
  )
643
 
644
- thank_you = gr.Markdown("")
645
 
646
- labels = gr.JSON(label="output labels")
647
 
648
  # download all the outputs
649
  # download = gr.File(type="filepath", label="download outputs")
 
15
  import spaces
16
  from vampnet.interface import Interface, signal_concat
17
  from vampnet import mask as pmask
18
+ from ytmusicapi import YTMusic
19
 
20
+ # from pyharp import AudioLabel, LabelList
21
  from bytecover.models.train_module import TrainModule
22
  from bytecover.utils import initialize_logging, load_config
23
  import pinecone
 
53
 
54
  bytecover_model.eval()
55
 
56
+ ytm = YTMusic()
57
+
58
  print("Loading CLAP model")
59
 
60
  if torch.cuda.is_available():
 
78
  def format_time(num_seconds):
79
  return f"{num_seconds // 60}:{num_seconds % 60:02d}"
80
 
81
+ def chunk_audio(chunk_size, sig, sr):
82
+ # Chunk audio to desired length
83
+ chunk_samples = int(chunk_size * sr)
84
+ print(f"Chunk samples: {chunk_samples}")
85
+ print(f"Shape of audio: {sig.shape}")
86
+ chunks = torch.tensor_split(sig, [i for i in range(chunk_samples, sig.shape[1], chunk_samples)], dim=1)
87
+ if chunks[-1].shape[1] < chunk_samples:
88
+ print("Cutting last chunk due to length")
89
+ chunks = tuple(list(chunks)[:-1])
90
+ print(f"Number of chunks: {len(chunks)}")
91
+ return chunks
92
+
93
+
94
+ def bytecover(sig, bytecover_match_ct=3, clap_match_ct=3, chunk_size=None):
95
  """
96
  This function defines the audio processing steps
97
+
98
  Args:
99
  input_audio_path (str): the audio filepath to be processed.
100
+
101
  <YOUR_KWARGS>: additional keyword arguments necessary for processing.
102
  NOTE: These should correspond to and match order of UI elements defined below.
103
+
104
  Returns:
105
  output_audio_path (str): the filepath of the processed audio.
106
  output_labels (LabelList): any labels to display.
 
116
  """
117
  sig_mono = sig.copy().to_mono().audio_data.squeeze(1)
118
 
119
+ if chunk_size is not None:
120
+ chunks = chunk_audio(chunk_size, sig_mono, sig.sample_rate)
121
+ bc_chunks = chunks
122
+ clap_chunks = chunks
123
+ chunk_sizes = [chunk_size, chunk_size]
124
+ else:
125
+ bc_chunks = chunk_audio(10, sig_mono, sig.sample_rate)
126
+ clap_chunks = chunk_audio(3, sig_mono, sig.sample_rate)
127
+ chunk_sizes = [10, 3]
128
 
 
 
 
 
 
 
 
 
129
 
130
  print("Getting Bytecover embeddings")
131
  bytecover_embeddings = []
132
+ for chunk in tqdm(bc_chunks):
133
  result = bytecover_model.forward(chunk.to(bytecover_module.config["device"]))['f_t'].detach()
134
  bytecover_embeddings.append(result)
135
 
 
137
 
138
  print("Getting CLAP embeddings")
139
  clap_embeddings = []
140
+ for chunk in tqdm(clap_chunks):
141
+ result = clap_model.get_audio_embedding_from_data(chunk, use_tensor=True).detach()
142
  clap_embeddings.append(result)
143
 
144
+ clean_clap_embeddings = [convert_to_npfloat64_to_list(convert_to_npfloat64(flatten_vector_embed(embedding.cpu()))) for embedding in clap_embeddings]
145
 
146
  clap_matches = []
147
  bytecover_matches = []
148
  match_metadatas = {}
149
 
150
+ output_md = ""
151
 
152
  times = {}
153
 
154
+ for clean_embeddings, pinecone_index, match_list, embedding_num, num_matches, chunk_size in zip([clean_bytecover_embeddings, clean_clap_embeddings], [index_bytecover, index_clap], [bytecover_matches, clap_matches], range(2), [bytecover_match_ct, clap_match_ct], chunk_sizes):
155
+
156
+ if embedding_num == 0:
157
  + # continue  # NOTE(review): stray `continue` made the melodic header and all ByteCover matches unreachable — confirm whether melodic matches are meant to be disabled
157
  + output_md += "# Melodic Matches\n"
159
+ else:
160
+ output_md += "# Timbral Matches\n"
161
 
162
  for i, embedding in enumerate(clean_embeddings):
163
 
 
181
  print("Matches obtained!")
182
 
183
  top_matches = sorted(match_list, key=lambda item: item[0], reverse=True)
184
+
185
+ found_tracks = []
186
 
187
+ for i, match in enumerate(top_matches):
188
+ if len(found_tracks) >= num_matches:
189
+ break
190
+ #print(match[0])
191
  metadata = match_metadatas[match[2]]
192
  song_artists = metadata['artists']
193
  if type(song_artists) is list:
194
+ artists = ', '.join(artists)
195
 
196
 
197
  song_title = metadata['song']
198
 
199
+ if metadata['spotify_id'] in found_tracks:
200
+ continue
201
+
202
+ found_tracks.append(metadata['spotify_id'])
203
+
204
+ song_genre = metadata['genre']
205
 
206
+ yt_id = ytm.search(f"{song_title} {song_artists}", filter="songs", limit = 1)[0]['videoId']
207
+ song_link = f"https://music.youtube.com/watch?v={yt_id}&t={int(metadata['clip_num']) * 10}"
208
+
209
+ #song_link = f"https://open.spotify.com/track/{metadata['spotify_id'].split(':')[2]}"
210
+
211
+ embed_name = ['Melodic', 'Timbral'][embedding_num]
212
 
213
  match_time = match[1]
214
  times[match_time] = times.get(match_time, 0) + 1
215
 
 
 
 
 
 
 
 
 
 
 
216
  # if embedding_num == 1:
217
+ # color = OutputLabel.rgb_color_to_int(200, 170, 3, 20)
218
  # else:
219
+ # color = OutputLabel.rgb_color_to_int(204, 52, 235, 20)
 
 
 
220
 
221
+ # if match[0] < 0.5:
222
+ # color_list = min_sim_color
223
+ # else:
224
+ # color_list = [int(min_color + (match[0] - 0.5) * 2 * (max_color - min_color)) for min_color, max_color in zip(min_sim_color, max_sim_color)]
225
+
226
+ # if match[0] < 0.5:
227
+ # color_list = [0, 200, 0, 20]
228
+
229
+ # normalized_similarity = (match[0] - 0.5) * 2
230
+ # color_list = [int(min(400 * normalized_similarity, 200)), int(min(400 * (1 - normalized_similarity), 200)), 0, 20]
231
+
232
+ output_md += f'{format_time(match_time)}: \n [{song_title} by {song_artists}]({song_link}) \n Genre: {song_genre} \n Similarity: {match[0]}\n\n'
233
+
234
+ # label = AudioLabel(t=match_time,
235
+ # label=f'{song_title}',
236
+ # duration=chunk_size,
237
+ # link=song_link,
238
+ # description=f'Similarity type: {embed_name}, similarity: {match[0]}\n{song_title} by {song_artists}\nGenre: {song_genre}\nClick the tag to view on YouTube Music!',
239
+ # # amplitude=1.0 - 0.5 * (times[match_time] - 1),
240
+ # color=color)
241
+
242
 
243
+ return output_md
244
 
245
  ### END BYTECOVER
246
 
 
684
  gr.Button(f"use as input (feedback)")
685
  )
686
 
687
+ #thank_you = gr.Markdown("")
688
 
689
+ labels = gr.Markdown(label="output labels")
690
 
691
  # download all the outputs
692
  # download = gr.File(type="filepath", label="download outputs")
requirements.txt CHANGED
@@ -18,11 +18,12 @@ laion_clap
18
  nnAudio
19
  ffmpeg-python
20
  torchvision
21
- torch
22
  jsonlines
23
  wandb
24
  tqdm
 
 
25
  # For pinecone_generate only
26
  google-api-python-client
27
  protoc-gen-openapiv2-protoc3-19
28
- transformers==4.30.0
 
18
  nnAudio
19
  ffmpeg-python
20
  torchvision
 
21
  jsonlines
22
  wandb
23
  tqdm
24
+ spaces
25
+ ytmusicapi
26
  # For pinecone_generate only
27
  google-api-python-client
28
  protoc-gen-openapiv2-protoc3-19
29
+ transformers==4.30.0