krislette committed
Commit 7633e2f · 1 parent: 75d43d2

Auto-deploy from GitHub: 7bbe4e79d2cd5e035a2fc8cda464b3cd867300d5
scripts/predict.py CHANGED
@@ -5,6 +5,7 @@ from src.llm2vectrain.llm2vec_trainer import l2vec_single_train, load_pca_model
 from src.models.mlp import build_mlp, load_config
 from src.utils.dataset import instance_scaler
 
+import joblib
 import numpy as np
 import pandas as pd
 
@@ -47,8 +48,9 @@ def predict_pipeline(audio_file, lyrics):
     # 5.) Reduce the lyrics using saved PCA model
     reduced_lyrics = load_pca_model(lyrics_features)
 
-    # Scale the vectors using Z-Score again
-    audio_features, reduced_lyrics = instance_scaler(audio_features, reduced_lyrics)
+    # 6.) Apply PCA scaler to PCA-reduced lyrics
+    pca_scaler = joblib.load("models/fusion/pca_scaler.pkl")
+    reduced_lyrics = pca_scaler.transform(reduced_lyrics)
 
     # 6.) Concatenate the vectors of audio_features + lyrics_features
     results = np.concatenate([audio_features, reduced_lyrics], axis=1)
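For context, a hedged sketch of the training-side counterpart that would produce models/fusion/pca_scaler.pkl; the StandardScaler choice and the placeholder training matrix are assumptions, not code from this repo:

import joblib
import numpy as np
from sklearn.preprocessing import StandardScaler

# Hypothetical stand-in for the PCA-reduced training lyrics features
train_pca_features = np.random.rand(100, 64)

# Fit once on training data, then persist to the path predict.py loads from
pca_scaler = StandardScaler().fit(train_pca_features)
joblib.dump(pca_scaler, "models/fusion/pca_scaler.pkl")

At inference, joblib.load restores the fitted object, so pca_scaler.transform applies the training-time mean and variance instead of re-standardizing each instance against itself, which is what the replaced instance_scaler call appears to have done.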
src/musiclime/explainer.py CHANGED
@@ -15,7 +15,32 @@ from src.musiclime.print_utils import green_bold
 
 
 class MusicLIMEExplainer:
+    """
+    LIME-based explainer for multimodal music classification models.
+
+    Generates local explanations for AI vs Human music classification by
+    perturbing audio (source separation) and lyrics (line removal) components
+    and analyzing their impact on model predictions.
+
+    Attributes
+    ----------
+    random_state : RandomState
+        Random number generator for reproducible perturbations
+    base : LimeBase
+        Core LIME explanation engine with exponential kernel
+    """
+
     def __init__(self, kernel_width=25, random_state=None):
+        """
+        Initialize MusicLIME explainer with kernel parameters.
+
+        Parameters
+        ----------
+        kernel_width : int, default=25
+            Width parameter for the exponential kernel function
+        random_state : int or RandomState, optional
+            Random seed for reproducible perturbations
+        """
         self.random_state = check_random_state(random_state)
 
         def kernel(d, kernel_width):
@@ -33,6 +58,29 @@
         labels=(1,),
         temporal_segments=10,
     ):
+        """
+        Generate LIME explanations for a music instance using audio and lyrics.
+
+        Parameters
+        ----------
+        audio : array-like
+            Raw audio waveform data
+        lyrics : str
+            Song lyrics as text string
+        predict_fn : callable
+            Prediction function that takes (texts, audios) and returns probabilities (wrapper)
+        num_samples : int, default=1000
+            Number of perturbed samples to generate for LIME
+        labels : tuple, default=(1,)
+            Target labels to explain (0=AI-Generated, 1=Human-Composed)
+        temporal_segments : int, default=10
+            Number of temporal segments for audio factorization
+
+        Returns
+        -------
+        MusicLIMEExplanation
+            Explanation object containing feature importance weights
+        """
         # These are for debugging only I have to see THAT progress
         print("[MusicLIME] Starting MusicLIME explanation...")
         print(
@@ -93,6 +141,29 @@
         return explanation
 
     def _generate_neighborhood(self, audio_fact, text_fact, predict_fn, num_samples):
+        """
+        Generate perturbed samples and predictions for LIME explanation.
+
+        Parameters
+        ----------
+        audio_fact : OpenUnmixFactorization
+            Audio factorization object for source separation
+        text_fact : LineIndexedString
+            Text factorization object for line-based perturbations
+        predict_fn : callable
+            Model prediction function
+        num_samples : int
+            Number of perturbations to generate
+
+        Returns
+        -------
+        data : ndarray
+            Binary perturbation masks (num_samples, total_features)
+        predictions : ndarray
+            Model predictions for perturbed instances
+        distances : ndarray
+            Cosine distances from original instance
+        """
         n_audio = audio_fact.get_number_components()
         n_text = text_fact.num_words()
         total_features = n_audio + n_text
@@ -192,7 +263,48 @@
 
 
 class MusicLIMEExplanation:
+    """
+    Container for MusicLIME explanation results and analysis methods.
+
+    Stores factorizations, perturbation data, and LIME-fitted explanations
+    for a single music instance. Provides methods to extract top features
+    and export results to structured formats.
+
+    Attributes
+    ----------
+    audio_factorization : OpenUnmixFactorization
+        Audio source separation components
+    text_factorization : LineIndexedString
+        Lyrics line segmentation components
+    data : ndarray
+        Binary perturbation masks used for explanation
+    predictions : ndarray
+        Model predictions for all perturbations
+    intercept : dict
+        LIME model intercepts by label
+    local_exp : dict
+        Feature importance weights by label
+    score : dict
+        LIME model R² scores by label
+    local_pred : dict
+        Local model predictions by label
+    """
+
     def __init__(self, audio_factorization, text_factorization, data, predictions):
+        """
+        Initialize explanation object with factorizations and prediction data.
+
+        Parameters
+        ----------
+        audio_factorization : OpenUnmixFactorization
+            Audio source separation components
+        text_factorization : LineIndexedString
+            Text line segmentation components
+        data : ndarray
+            Binary perturbation masks used for explanation
+        predictions : ndarray
+            Model predictions for all perturbations
+        """
         self.audio_factorization = audio_factorization
         self.text_factorization = text_factorization
         self.data = data
@@ -203,7 +315,21 @@
         self.local_pred = {}
 
     def get_explanation(self, label, num_features=10):
-        """Get top features for explanation"""
+        """
+        Extract top feature explanations for a specific label.
+
+        Parameters
+        ----------
+        label : int
+            Target label to explain (0=AI-Generated, 1=Human-Composed)
+        num_features : int, default=10
+            Number of top features to return
+
+        Returns
+        -------
+        list of dict
+            Feature explanations with type, feature description, and weight
+        """
         if label not in self.local_exp:
             return []
 
@@ -231,7 +357,23 @@
         return explanations
 
     def save_to_json(self, filepath, song_info=None, num_features=10):
-        """Save explanation results to JSON file"""
+        """
+        Save explanation results to structured JSON file.
+
+        Parameters
+        ----------
+        filepath : str
+            Output filename for JSON results
+        song_info : dict, optional
+            Additional metadata about the song
+        num_features : int, default=10
+            Number of top features to include in output
+
+        Returns
+        -------
+        Path
+            Path to the saved JSON file
+        """
         results_dir = Path("results")
         results_dir.mkdir(exist_ok=True)
 
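A minimal end-to-end sketch of how these classes compose, inferred only from the docstrings in this commit; the explain-method name, file paths, and librosa loading are assumptions, not confirmed project API:

import librosa
from src.musiclime.explainer import MusicLIMEExplainer
from src.musiclime.wrapper import MusicLIMEPredictor

# Assumed inputs: a 44.1 kHz waveform and raw lyrics text
audio, _ = librosa.load("song.wav", sr=44100, mono=True)
with open("song_lyrics.txt") as f:
    lyrics = f.read()

predictor = MusicLIMEPredictor()  # callable: (texts, audios) -> (n, 2) probabilities
explainer = MusicLIMEExplainer(kernel_width=25, random_state=42)

# "explain_instance" is a hypothetical name; the diff shows only the
# parameter list (predict_fn, num_samples, labels, temporal_segments)
explanation = explainer.explain_instance(
    audio, lyrics, predictor, num_samples=1000, labels=(1,), temporal_segments=10
)
for feat in explanation.get_explanation(label=1, num_features=10):
    print(feat)  # dicts with type, feature description, and weight
explanation.save_to_json("example_explanation.json")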
src/musiclime/factorization.py CHANGED
@@ -6,7 +6,42 @@ from src.musiclime.print_utils import green_bold
 
 
 class OpenUnmixFactorization:
+    """
+    Audio factorization using OpenUnmix source separation with temporal segmentation.
+
+    Decomposes audio into interpretable components by separating sources
+    (vocals, bass, drums, other) and segmenting each across time windows.
+    Creates temporal-source combinations for fine-grained audio explanations.
+
+    Attributes
+    ----------
+    audio : ndarray
+        Original audio waveform
+    temporal_segments : list of tuple
+        Time window boundaries for segmentation
+    original_components : list of ndarray
+        Raw separated audio sources
+    component_names : list of str
+        Names of separated sources
+    components : list of ndarray
+        Final temporal-source component combinations
+    final_component_names : list of str
+        Names of temporal-source combinations
+    """
+
     def __init__(self, audio, temporal_segmentation_params=10, composition_fn=None):
+        """
+        Initialize audio factorization using OpenUnmix source separation with temporal segmentation.
+
+        Parameters
+        ----------
+        audio : array-like
+            Raw audio waveform data at 44.1kHz sample rate
+        temporal_segmentation_params : int, default=10
+            Number of temporal segments to divide the audio into
+        composition_fn : callable, optional
+            Custom function for composing separated sources (unused for now)
+        """
         print("[MusicLIME] Initializing OpenUnmix factorization...")
         self.audio = audio
         self.target_sr = 44100
@@ -49,6 +84,21 @@ class OpenUnmixFactorization:
         )
 
     def _compute_segments(self, signal, n_segments):
+        """
+        Divide audio signal into equal temporal segments for factorization.
+
+        Parameters
+        ----------
+        signal : array-like
+            Input audio waveform
+        n_segments : int
+            Number of temporal segments to create
+
+        Returns
+        -------
+        list of tuple
+            List of (start, end) sample indices for each segment
+        """
         audio_length = len(signal)
         samples_per_segment = audio_length // n_segments
 
@@ -60,6 +110,16 @@
         return segments
 
     def _separate_sources(self):
+        """
+        Perform source separation using OpenUnmix to extract instrument components.
+
+        Returns
+        -------
+        components : list of ndarray
+            Separated audio sources (vocals, bass, drums, other)
+        names : list of str
+            Names of the separated source components
+        """
         waveform = np.expand_dims(self.audio, axis=1)
 
         # Load openunmix .pth files from local dir
@@ -81,6 +141,12 @@
         return components, names
 
     def _prepare_temporal_components(self):
+        """
+        Create temporal-source combinations by applying each source to each time segment.
+
+        Creates components like 'vocals_T0', 'drums_T5' representing specific
+        instruments active only in specific temporal windows.
+        """
         # Create temporal-source combinations
         self.components = []
         self.final_component_names = []
@@ -93,12 +159,41 @@
                 self.final_component_names.append(f"{self.component_names[c]}_T{s}")
 
     def get_number_components(self):
+        """
+        Get total number of factorized components (sources x temporal segments).
+
+        Returns
+        -------
+        int
+            Total number of temporal-source component combinations
+        """
        return len(self.components)
 
     def get_ordered_component_names(self):
+        """
+        Get ordered list of component names for explanation display.
+
+        Returns
+        -------
+        list of str
+            Component names in format '{source}_T{segment}' (e.g., 'vocals_T3')
+        """
         return self.final_component_names
 
     def compose_model_input(self, component_indices):
+        """
+        Reconstruct audio by summing selected temporal-source components.
+
+        Parameters
+        ----------
+        component_indices : array-like
+            Indices of components to include in reconstruction
+
+        Returns
+        -------
+        ndarray
+            Reconstructed audio waveform from selected components
+        """
         if len(component_indices) == 0:
             return np.zeros_like(self.audio)
 
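A brief sketch of how one LIME perturbation row maps onto these components; the silent placeholder waveform and the 4-sources-by-10-segments count are illustrative assumptions:

import numpy as np
from src.musiclime.factorization import OpenUnmixFactorization

audio = np.zeros(44100 * 10)  # placeholder: 10 s of silence at 44.1 kHz
fact = OpenUnmixFactorization(audio, temporal_segmentation_params=10)

n = fact.get_number_components()                  # e.g., 4 sources x 10 segments = 40
mask = np.random.RandomState(0).randint(0, 2, n)  # one binary perturbation row
active = np.flatnonzero(mask)                     # indices of components to keep
perturbed = fact.compose_model_input(active)      # summed back into a waveform
print(fact.get_ordered_component_names()[:2])     # e.g., ['vocals_T0', 'vocals_T1']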
src/musiclime/print_utils.py CHANGED
@@ -1,2 +1,15 @@
 def green_bold(text):
+    """
+    Format text with green bold ANSI color codes for terminal output.
+
+    Parameters
+    ----------
+    text : str
+        Text string to format
+
+    Returns
+    -------
+    str
+        Text wrapped with ANSI escape codes for green bold formatting
+    """
     return f"\033[1;32m{text}\033[0m"
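Usage is a one-liner; in an ANSI-capable terminal the tag renders as bold green:

from src.musiclime.print_utils import green_bold

print(green_bold("[MusicLIME] Explanation complete"))
# \033[1;32m enables bold green; the trailing \033[0m resets formatting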
src/musiclime/text_utils.py CHANGED
@@ -4,7 +4,38 @@ from lime.lime_text import IndexedString
 
 
 class LineIndexedString(IndexedString):
+    """
+    Line-based text indexing for lyrics perturbation in MusicLIME.
+
+    Extends LIME's IndexedString to work with lyrics lines instead of words,
+    to enable more meaningful perturbations for song lyrics. Filters out
+    metadata and focuses on actual lyrical content.
+
+    Attributes
+    ----------
+    raw : str
+        Original raw lyrics text
+    as_list : list of str
+        Processed lyrics lines without metadata
+    as_np : ndarray
+        NumPy array of lyrics lines
+    positions : list of int
+        Line position indices for LIME compatibility
+    """
+
     def __init__(self, raw_string, bow=True, mask_string=None):
+        """
+        Initialize line-based text indexing for lyrics perturbation in MusicLIME.
+
+        Parameters
+        ----------
+        raw_string : str
+            Raw lyrics text to be processed
+        bow : bool, default=True
+            Bag-of-words flag (maintained for LIME compatibility)
+        mask_string : str, optional
+            String to use for masking removed lines
+        """
         self.raw = raw_string
         self.mask_string = mask_string
         self.bow = bow
@@ -18,6 +49,19 @@ class LineIndexedString(IndexedString):
         self.string_start = [0] * len(self.as_list)
 
     def _split_by_lines(self, text):
+        """
+        Split lyrics text into meaningful lines, filtering out metadata.
+
+        Parameters
+        ----------
+        text : str
+            Raw lyrics text with potential metadata
+
+        Returns
+        -------
+        list of str
+            Processed lyrics lines with metadata removed
+        """
         lines = text.split("\n")
         processed_lines = []
 
@@ -31,6 +75,19 @@ class LineIndexedString(IndexedString):
         return processed_lines
 
     def inverse_removing(self, words_to_remove):
+        """
+        Reconstruct lyrics text by removing specified line indices.
+
+        Parameters
+        ----------
+        words_to_remove : array-like
+            Indices of lyrics lines to remove from reconstruction
+
+        Returns
+        -------
+        str
+            Reconstructed lyrics text with specified lines removed
+        """
         # Keep lines not in words_to_remove
         kept_lines = [
             self.as_list[i]
@@ -40,7 +97,28 @@ class LineIndexedString(IndexedString):
         return "\n".join(kept_lines)
 
     def num_words(self):
+        """
+        Get total number of lyrics lines (called 'words' for LIME compatibility).
+
+        Returns
+        -------
+        int
+            Number of lyrics lines available for perturbation
+        """
         return len(self.as_list)
 
     def word(self, id_):
+        """
+        Get lyrics line content by index.
+
+        Parameters
+        ----------
+        id_ : int
+            Index of the lyrics line to retrieve
+
+        Returns
+        -------
+        str
+            Content of the specified lyrics line
+        """
         return self.as_list[id_]
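A hedged sketch of line-level perturbation with this class, assuming the metadata filter behaves as documented; the sample lyrics are invented:

from src.musiclime.text_utils import LineIndexedString

lyrics = "[Chorus]\nHello, world!\nSing along"
idx = LineIndexedString(lyrics)
print(idx.num_words())            # 2: the [Chorus] tag line is filtered out
print(idx.word(0))                # "Hello, world!"
print(idx.inverse_removing([1]))  # "Hello, world!" -- line index 1 removed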
src/musiclime/wrapper.py CHANGED
@@ -11,7 +11,31 @@ from src.musiclime.print_utils import green_bold
 
 
 class MusicLIMEPredictor:
+    """
+    Batch prediction wrapper for MusicLIME explanations.
+
+    Integrates the complete Bach or Bot pipeline (SpecTTTra + LLM2Vec + MLP)
+    into a single callable for LIME perturbation processing. Optimized for
+    batch processing of multiple perturbed audio-lyrics pairs with detailed
+    timing analysis.
+
+    Attributes
+    ----------
+    llm2vec_model : LLM2Vec
+        Pre-loaded LLM2Vec model for lyrics feature extraction
+    classifier : MLPClassifier
+        Lazy-loaded MLP classifier for final predictions
+    config : dict
+        Model configuration parameters
+    """
+
     def __init__(self):
+        """
+        Initialize MusicLIME prediction wrapper with pre-trained models.
+
+        Loads LLM2Vec model and MLP configuration for batch processing
+        of perturbed audio-lyrics pairs during LIME explanation.
+        """
         print("[MusicLIME] Loading models for MusicLIME...")
         self.llm2vec_model = load_llm2vec_model()
         config = load_config("config/model_config.yml")
@@ -20,14 +44,24 @@ class MusicLIMEPredictor:
 
     def __call__(self, texts, audios):
         """
-        Predict function for MusicLIME
-
-        Args:
-            texts: List of lyric strings
-            audios: Array of audio waveforms
-
-        Returns:
-            Array of prediction probabilities
+        Batch prediction function for MusicLIME perturbations.
+
+        Processes multiple perturbed audio-lyrics pairs through the complete
+        pipeline: preprocessing -> feature extraction -> scaling -> MLP prediction.
+        Optimized for batch processing of LIME perturbations.
+
+        Parameters
+        ----------
+        texts : list of str
+            List of perturbed lyrics strings from LIME
+        audios : list of array-like
+            List of perturbed audio waveforms from LIME
+
+        Returns
+        -------
+        ndarray
+            Prediction probabilities in format [[P(AI), P(Human)], ...]
+            for each input pair, shape (n_samples, 2)
         """
         print(f"[MusicLIME] Processing {len(texts)} samples with batch functions...")
 
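A minimal sketch of the batch contract this docstring describes, assuming the pre-trained models are available locally; the silent waveforms are placeholder inputs, not a meaningful test:

import numpy as np
from src.musiclime.wrapper import MusicLIMEPredictor

predictor = MusicLIMEPredictor()
texts = ["first perturbed lyric variant", "second variant"]
audios = [np.zeros(44100), np.zeros(44100)]  # two 1-second silent waveforms at 44.1 kHz
probs = predictor(texts, audios)
assert probs.shape == (2, 2)                 # rows are [P(AI), P(Human)]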
src/preprocessing/lyrics_preprocessor.py CHANGED
@@ -1,19 +1,19 @@
-
 import re
 
+
 class LyricsPreprocessor:
     """
-    A preprocessing class for cleaning and preparing song lyrics
+    A preprocessing class for cleaning and preparing song lyrics
     for LLM2Vec.
 
     Parameters
     ----------
     keep_case : bool, optional (default=True)
         If False, converts all lyrics to lowercase.
-
+
     keep_punctuation : bool, optional (default=True)
         If False, removes all punctuation from lyrics.
-
+
     Usage
     -----
     >>> preprocessor = LyricsPreprocessor(keep_case=False, keep_punctuation=False)
@@ -21,9 +21,10 @@ class LyricsPreprocessor:
     >>> print(processed)
     "Hello, world! Sing along"
     """
+
     def __init__(self, keep_case=True, keep_punctuation=True):
         self.keep_case = keep_case
-        self.keep_punctuation= keep_punctuation
+        self.keep_punctuation = keep_punctuation
 
     def __call__(self, lyrics: str):
         """
@@ -42,83 +43,34 @@ class LyricsPreprocessor:
         Returns
         -------
         str
-
+
             a cleaned lyric string
         """
         lyrics_cleaned = ""
 
         # Split lyrics by lines
-        lyric_array = lyrics.split('\n')
+        lyric_array = lyrics.split("\n")
 
         for line in lyric_array:
             line = line.strip()
 
             # Skip unimportant lines like [Chorus] or (Verse)
-            if not line or re.match(r'^\[.*\]$', line) or re.match(r'^\(.*\)$', line):
+            if not line or re.match(r"^\[.*\]$", line) or re.match(r"^\(.*\)$", line):
                 continue
-
+
             # Case handling
             if not self.keep_case:
                 line = line.lower()
 
             # Punctuation handling
             if not self.keep_punctuation:
-                line = re.sub(r'[^\w\s]', '', line)
-
+                line = re.sub(r"[^\w\s]", "", line)
+
             # Normalize to lowercase and split into words
             words = line.split()
-
-            lyrics_cleaned += ' '.join(words) + ' '
+
+            lyrics_cleaned += " ".join(words) + " "
 
         lyrics_cleaned = lyrics_cleaned.strip()
 
         return lyrics_cleaned
-
-
-    def musiclime_lyrics_extractor(self, lyrics: str):
-        """
-        Preprocess the input lyrics text.
-
-        Steps:
-        1. Removes empty lines or lines with metadata (e.g., [Chorus], (Verse)).
-        2. Applies case handling and punctuation removal based on settings.
-        3. Segments the lyrics into multiple lines.
-        3. Builds a list of lines from the lyrics
-
-        Parameters
-        ----------
-        lyrics : str
-            Raw lyrics text.
-
-        Returns
-        -------
-        line_segmented_lyrics : list
-            List of lines from the lyrics, processed using the class.
-        """
-
-        # Instantiate line lyrics list
-        line_segmented_lyrics = []
-
-        # Split lyrics by lines
-        lyric_array = lyrics.split('\n')
-
-        for line in lyric_array:
-            line = line.strip()
-
-            # Skip unimportant lines like [Chorus] or (Verse)
-            if not line or re.match(r'^\[.*\]$', line) or re.match(r'^\(.*\)$', line):
-                continue
-
-            # Case handling
-            if not self.keep_case:
-                line = line.lower()
-
-            # Punctuation handling
-            if not self.keep_punctuation:
-                line = re.sub(r'[^\w\s]', '', line)
-
-            # Append line to line segmented lyrics list
-            line_segmented_lyrics.append(line)
-
-        return line_segmented_lyrics
-
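Finally, an illustrative run of the reformatted preprocessor; note that with keep_case=False and keep_punctuation=False the output is lowercased and punctuation-free, so the sample output in the class docstring matches the default flags rather than the ones its own example constructs:

from src.preprocessing.lyrics_preprocessor import LyricsPreprocessor

pre = LyricsPreprocessor(keep_case=False, keep_punctuation=False)
print(pre("[Chorus]\nHello, world!\n(Verse)\nSing along"))
# -> "hello world sing along": section tags dropped, case and punctuation normalized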