Spaces:
Running
Running
| import re | |
class LyricsPreprocessor:
    r"""
    A preprocessing class for cleaning and preparing song lyrics
    for LLM2Vec.

    The lyrics are flattened into one line: blank lines and metadata-only
    lines are dropped, and all remaining words are joined with single spaces.

    Parameters
    ----------
    keep_case : bool, optional (default=True)
        If False, converts all lyrics to lowercase.
    keep_punctuation : bool, optional (default=True)
        If False, removes every character that is neither a word character
        (letters, digits, underscore) nor whitespace. Note that this also
        removes apostrophes, so "don't" becomes "dont".

    Usage
    -----
    >>> preprocessor = LyricsPreprocessor(keep_case=False, keep_punctuation=False)
    >>> preprocessor("Hello, world!\n[Chorus]\nSing along")
    'hello world sing along'
    """

    # A line counts as metadata when, after stripping, it starts with "[" and
    # ends with "]", or starts with "(" and ends with ")" -- e.g. "[Chorus]"
    # or "(Verse)". Compiled once so the per-line loop does not repeat the lookup.
    _METADATA_LINE = re.compile(r"^\[.*\]$|^\(.*\)$")
    # Anything that is not a word character or whitespace.
    _PUNCTUATION = re.compile(r"[^\w\s]")

    def __init__(self, keep_case: bool = True, keep_punctuation: bool = True):
        self.keep_case = keep_case
        self.keep_punctuation = keep_punctuation

    def __call__(self, lyrics: str) -> str:
        """
        Preprocess the input lyrics text.

        Steps:
        1. Removes empty lines or lines with metadata (e.g., [Chorus], (Verse)).
        2. Applies case handling and punctuation removal based on settings.
        3. Collapses runs of whitespace and joins all remaining words with
           single spaces.

        Parameters
        ----------
        lyrics : str
            Raw lyrics text, with lines separated by "\\n".

        Returns
        -------
        str
            The cleaned lyrics on a single line, without leading or trailing
            whitespace. Empty if no line survives the filtering.
        """
        cleaned_lines = []
        for line in lyrics.split("\n"):
            line = line.strip()

            # Skip blank lines and metadata-only lines like [Chorus] or (Verse).
            if not line or self._METADATA_LINE.match(line):
                continue

            if not self.keep_case:
                line = line.lower()

            if not self.keep_punctuation:
                line = self._PUNCTUATION.sub("", line)

            # Splitting on whitespace collapses repeated spaces and tabs.
            words = line.split()

            # A line made only of punctuation (e.g. "...") is empty after
            # punctuation removal; skipping it avoids a double space in the
            # joined output.
            if words:
                cleaned_lines.append(" ".join(words))

        return " ".join(cleaned_lines)