File size: 2,038 Bytes
fc7b4a9
 
7633e2f
fc7b4a9
 
7633e2f
fc7b4a9
 
 
 
 
 
7633e2f
fc7b4a9
 
7633e2f
fc7b4a9
 
 
 
 
 
 
7633e2f
fc7b4a9
 
7633e2f
fc7b4a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7633e2f
fc7b4a9
 
 
 
 
7633e2f
fc7b4a9
 
 
 
 
7633e2f
fc7b4a9
7633e2f
fc7b4a9
 
 
 
 
 
7633e2f
 
fc7b4a9
 
7633e2f
 
fc7b4a9
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import re


class LyricsPreprocessor:
    r"""
    A preprocessing class for cleaning and preparing song lyrics
    for LLM2Vec.

    The lyrics are processed line by line: blank lines and section-marker
    lines are dropped, optional case folding and punctuation stripping are
    applied, and the remaining words are joined with single spaces.

    Parameters
    ----------
    keep_case : bool, optional (default=True)
        If False, converts all lyrics to lowercase.

    keep_punctuation : bool, optional (default=True)
        If False, removes every character that is neither a word character
        nor whitespace. Underscores count as word characters for the regex
        ``\w`` class and are therefore kept.

    Usage
    -----
    >>> preprocessor = LyricsPreprocessor(keep_case=False, keep_punctuation=False)
    >>> preprocessor("Hello, world!\n[Chorus]\nSing along")
    'hello world sing along'
    """

    # Compiled once at class level so they are not looked up per line.
    # A metadata line is one that, after stripping, is wrapped entirely in
    # square brackets or entirely in parentheses, e.g. "[Chorus]" or "(Verse)".
    _METADATA_LINE = re.compile(r"\[.*\]|\(.*\)")
    _PUNCTUATION = re.compile(r"[^\w\s]")

    def __init__(self, keep_case: bool = True, keep_punctuation: bool = True) -> None:
        self.keep_case = keep_case
        self.keep_punctuation = keep_punctuation

    def __call__(self, lyrics: str) -> str:
        """
        Preprocess the input lyrics text.

        Steps:
        1. Splits the text on newlines and strips each line.
        2. Skips empty lines and metadata lines (e.g., [Chorus], (Verse)).
        3. Applies case handling and punctuation removal based on settings.
        4. Joins all remaining words with single spaces.

        Parameters
        ----------
        lyrics : str
            Raw lyrics text.

        Returns
        -------
        str
            The cleaned lyrics as one line, with no leading, trailing or
            repeated whitespace. Empty if no lyric words remain.
        """
        words = []

        for raw_line in lyrics.split("\n"):
            line = raw_line.strip()

            # Skip blank lines and section markers like [Chorus] or (Verse)
            if not line or self._METADATA_LINE.fullmatch(line):
                continue

            if not self.keep_case:
                line = line.lower()

            if not self.keep_punctuation:
                line = self._PUNCTUATION.sub("", line)

            # Collect words rather than whole lines: a line that became empty
            # after punctuation removal contributes nothing, so the final
            # join never produces doubled spaces.
            words.extend(line.split())

        return " ".join(words)