File size: 3,518 Bytes
fc7b4a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124

import re

class LyricsPreprocessor:
    r"""
    A preprocessing class for cleaning and preparing song lyrics
    for LLM2Vec.

    Lines are stripped of surrounding whitespace; blank lines and lines that
    consist solely of a bracketed or parenthesised tag (e.g. ``[Chorus]``,
    ``(Verse)``) are dropped. Optional lowercasing and punctuation removal
    are then applied to the remaining lines.

    Parameters
    ----------
    keep_case : bool, optional (default=True)
        If False, converts all lyrics to lowercase.

    keep_punctuation : bool, optional (default=True)
        If False, removes every character that is neither a word character
        nor whitespace. Underscores count as word characters and are kept.

    Notes
    -----
    The tag test is greedy: any line that starts with ``[`` and ends with
    ``]`` (or starts with ``(`` and ends with ``)``) is dropped as a whole,
    even if it contains other text in between.

    Usage
    -----
    >>> preprocessor = LyricsPreprocessor(keep_case=False, keep_punctuation=False)
    >>> preprocessor("Hello, world!\n[Chorus]\nSing along")
    'hello world sing along'
    >>> preprocessor.musiclime_lyrics_extractor("Hello, world!\n[Chorus]\nSing along")
    ['hello world', 'sing along']
    """

    # A whole line that is just "[...]" or "(...)" -- treated as metadata.
    _SECTION_TAG = re.compile(r'^(?:\[.*\]|\(.*\))$')
    # Anything that is neither a word character nor whitespace.
    _PUNCTUATION = re.compile(r'[^\w\s]')

    def __init__(self, keep_case=True, keep_punctuation=True):
        self.keep_case = keep_case
        self.keep_punctuation = keep_punctuation

    def _clean_lines(self, lyrics):
        """Yield each non-empty, non-metadata lyric line after cleaning."""
        for raw_line in lyrics.split('\n'):
            line = raw_line.strip()

            # Skip blank lines and section tags like [Chorus] or (Verse)
            if not line or self._SECTION_TAG.match(line):
                continue

            if not self.keep_case:
                line = line.lower()

            if not self.keep_punctuation:
                # Removing punctuation can leave dangling whitespace
                # ("Hey !" -> "Hey "), so strip again afterwards.
                line = self._PUNCTUATION.sub('', line).strip()

            # A punctuation-only line (e.g. "...") is empty by now; emitting
            # it would produce a double space / an empty list entry.
            if line:
                yield line

    def __call__(self, lyrics: str) -> str:
        """
        Preprocess the input lyrics text into a single string.

        Steps:
        1. Removes empty lines or lines with metadata (e.g., [Chorus], (Verse)).
        2. Applies case handling and punctuation removal based on settings.
        3. Joins all remaining words with single spaces.

        Parameters
        ----------
        lyrics : str
            Raw lyrics text.

        Returns
        -------
        str
            The cleaned lyrics on one line, with every run of whitespace
            collapsed to a single space. Empty if no lyric lines remain.
        """
        words = []
        for line in self._clean_lines(lyrics):
            # split() without arguments also collapses internal whitespace runs
            words.extend(line.split())
        return ' '.join(words)

    def musiclime_lyrics_extractor(self, lyrics: str) -> list:
        """
        Preprocess the input lyrics text into a list of lines.

        Steps:
        1. Removes empty lines or lines with metadata (e.g., [Chorus], (Verse)).
        2. Applies case handling and punctuation removal based on settings.
        3. Drops lines that became empty after punctuation removal.
        4. Builds a list of the remaining lines, in their original order.

        Parameters
        ----------
        lyrics : str
            Raw lyrics text.

        Returns
        -------
        line_segmented_lyrics : list
            List of cleaned, non-empty lyric lines (str). Whitespace inside
            a line is left as-is; only the ends are stripped.
        """
        return list(self._clean_lines(lyrics))