import os
from pathlib import Path
from typing import List, Optional
from langchain_core.documents import Document
import chardet
from tqdm import tqdm
class LyricsLoader:
    """Load song lyrics from ``<lyrics_dir>/<artist>/<song>.txt`` files.

    Each accepted file is cleaned with :meth:`clean_lyrics_text` and returned
    as a ``Document`` whose metadata records the artist (the directory name),
    the song title (the file stem), the source path, the encoding used to
    decode the file, and the text sizes before and after cleaning.
    """

    # Encodings tried in order. ``latin-1`` maps every byte to a character
    # and therefore never fails, so it has to be the last resort; listing it
    # before ``cp1252`` would make the latter unreachable.
    _ENCODINGS = ('utf-8', 'cp1252', 'latin-1')

    # Accepted file size range, in bytes.
    _MIN_FILE_SIZE = 100
    _MAX_FILE_SIZE = 1000000

    # Substrings of a file name that mark it as something other than lyrics.
    _INVALID_NAME_PATTERNS = (
        '[artwork]', 'artwork', 'cover', '.jpg', '.png',
        'tracklist', 'credits', 'booklet', 'album art',
    )

    # Boilerplate fragments removed from inside a line (the rest is kept).
    _HEADER_PATTERNS = (
        'contributors',
        'translations',
        'lyrics',
        'tradução',
        'traducción',
        'written by',
        'produced by',
        'you might also like',
        'embed',
    )

    # Whole words that identify a section marker line. "pre-chorus" needs no
    # entry of its own: the hyphen splits it into "pre" and "chorus".
    _SECTION_KEYWORDS = frozenset(
        {'verse', 'chorus', 'bridge', 'hook', 'intro', 'outro'}
    )

    # Heuristics used to re-introduce line breaks into single-line lyrics.
    _SENTENCE_ENDS = ('. ', '? ', '! ')
    _BREAK_WORDS = (' cause ', ' cos ', ' when ', ' and ')
    _UNBROKEN_TEXT_LIMIT = 100   # single-line text longer than this is split
    _LONG_LINE_LIMIT = 200       # lines longer than this are word-wrapped
    _WRAP_WIDTH = 50             # a wrapped line is closed once it exceeds this

    def __init__(self, lyrics_dir: str = "lyrics"):
        """Remember the root directory that holds one sub-directory per artist."""
        self.lyrics_dir = Path(lyrics_dir)

    def detect_file_encoding(self, file_path: Path) -> str:
        """Detect the encoding of a file.

        Args:
            file_path: File whose raw bytes are handed to ``chardet``.

        Returns:
            The encoding name reported by ``chardet``, or ``'utf-8'`` when the
            detector reports no encoding, so the result is always a string.
        """
        with open(file_path, 'rb') as file:
            raw_data = file.read()
        result = chardet.detect(raw_data)
        return result.get('encoding') or 'utf-8'

    def clean_lyrics_text(self, text: str) -> str:
        """Clean up lyrics text and normalize formatting.

        Steps, in order:

        1. Text longer than 100 characters with no line break at all gets
           breaks inserted heuristically (see ``_insert_line_breaks``).
        2. Every line is stripped; at most two consecutive blank lines are
           kept as section breaks.
        3. Header fragments ("Contributors", "Lyrics", "Embed", ...) are cut
           out of each line; a line left empty by this is dropped.
        4. A line containing a section keyword as a whole word is rewritten
           as ``[...]``.
        5. A line identical to the previously emitted non-blank line is
           dropped.

        Args:
            text: Raw lyrics text.

        Returns:
            The cleaned text without leading or trailing whitespace.
        """
        if '\n' not in text and len(text) > self._UNBROKEN_TEXT_LIMIT:
            text = self._insert_line_breaks(text)

        cleaned_lines = []
        prev_line = ""
        consecutive_empty = 0

        for raw_line in text.split('\n'):
            line = raw_line.strip()

            if not line:
                consecutive_empty += 1
                if consecutive_empty <= 2:  # keep up to 2 blank lines
                    cleaned_lines.append("")
                continue
            consecutive_empty = 0

            cleaned_line = self._strip_header_patterns(line)
            if not cleaned_line:
                continue

            # The marker test looks at the line as it was before header
            # removal; the bracketed text is the cleaned line.
            if self._is_section_marker(line):
                cleaned_line = f"[{cleaned_line.strip('[]')}]"

            # prev_line always holds the last non-blank line emitted, markers
            # included, so a lyric repeated right after a marker survives.
            if cleaned_line == prev_line:
                continue

            cleaned_lines.append(cleaned_line)
            prev_line = cleaned_line

        # strip() also discards leading/trailing blank lines.
        return '\n'.join(cleaned_lines).strip()

    def _insert_line_breaks(self, text: str) -> str:
        """Add line breaks to lyrics that arrived as one unbroken line."""
        for punct in self._SENTENCE_ENDS:
            text = text.replace(punct, punct + '\n')

        # lstrip() rather than strip(): the trailing space must survive,
        # otherwise the break word is glued to the word that follows it.
        for word in self._BREAK_WORDS:
            text = text.replace(word, '\n' + word.lstrip())

        # Wrap per line so that the breaks inserted above are preserved.
        wrapped = []
        for segment in text.split('\n'):
            if len(segment) > self._LONG_LINE_LIMIT:
                wrapped.extend(self._wrap_words(segment, self._WRAP_WIDTH))
            else:
                wrapped.append(segment)
        return '\n'.join(wrapped)

    @staticmethod
    def _wrap_words(segment: str, width: int) -> List[str]:
        """Split *segment* into lines, closing each once it exceeds *width*."""
        lines = []
        current = []
        current_len = 0
        for word in segment.split():
            # Length of ' '.join(current) after adding this word.
            current_len += len(word) + (1 if current else 0)
            current.append(word)
            if current_len > width:
                lines.append(' '.join(current))
                current = []
                current_len = 0
        if current:
            lines.append(' '.join(current))
        return lines

    def _strip_header_patterns(self, line: str) -> str:
        """Remove the first occurrence of each header pattern from *line*."""
        cleaned_line = line
        for pattern in self._HEADER_PATTERNS:
            # Search the current state of the line so that every matching
            # pattern is removed, not just the last one that matched.
            start = cleaned_line.lower().find(pattern)
            if start == -1:
                continue
            end = start + len(pattern)
            # Also swallow any colon, dash or space following the pattern.
            while end < len(cleaned_line) and cleaned_line[end] in ':- ':
                end += 1
            head = cleaned_line[:start].strip()
            tail = cleaned_line[end:].strip()
            cleaned_line = f"{head} {tail}".strip()
        return cleaned_line

    def _is_section_marker(self, line: str) -> bool:
        """Return True if *line* contains a section keyword as a whole word."""
        # Non-letters become separators, so "[Verse 1]" and "Pre-Chorus"
        # match while "universe" or "introduce" do not.
        letters_only = ''.join(
            ch if ch.isalpha() else ' ' for ch in line.lower()
        )
        return not self._SECTION_KEYWORDS.isdisjoint(letters_only.split())

    def is_valid_lyric_file(self, file_path: Path) -> bool:
        """Check if file is a valid lyrics file.

        A file is accepted when its name ends in ``.txt``, contains none of
        the artwork/credits markers, and its size is between 100 bytes and
        1 MB inclusive.

        Args:
            file_path: Candidate file.

        Returns:
            ``True`` if the file should be loaded. ``False`` otherwise,
            including when the file cannot be stat'ed.
        """
        lower_name = file_path.name.lower()
        if not lower_name.endswith('.txt'):
            return False
        if any(pattern in lower_name for pattern in self._INVALID_NAME_PATTERNS):
            return False

        try:
            file_size = file_path.stat().st_size
        except OSError:
            # Missing or unreadable file: not loadable, so not valid.
            return False
        return self._MIN_FILE_SIZE <= file_size <= self._MAX_FILE_SIZE

    def read_and_validate_lyrics(
        self,
        file_path: Path,
        artist_name: str
    ) -> Optional["Document"]:
        """Read and validate a lyrics file, trying several encodings.

        Args:
            file_path: Lyrics file to read.
            artist_name: Stored in the document metadata as ``artist``.

        Returns:
            A ``Document`` holding the cleaned lyrics, or ``None`` when the
            file cannot be read or decoded, holds fewer than 10 characters,
            or is empty after cleaning. Failures are reported with ``print``
            and never raised, so one bad file does not abort a bulk load.
        """
        try:
            for encoding in self._ENCODINGS:
                try:
                    with open(file_path, 'r', encoding=encoding) as f:
                        text = f.read().strip()
                except UnicodeDecodeError:
                    continue
                break
            else:
                print(f"Error: Could not decode {file_path.name} with supported encodings")
                return None

            if len(text) < 10:
                print(f"Warning: Invalid or empty lyrics in {file_path.name}")
                return None

            cleaned_text = self.clean_lyrics_text(text)
            if not cleaned_text:
                print(f"Warning: No valid content after cleaning in {file_path.name}")
                return None

            metadata = {
                'artist': artist_name,
                'song_title': file_path.stem,
                'source': str(file_path),
                'encoding': encoding,
                'original_size': len(text),
                'cleaned_size': len(cleaned_text)
            }
            return Document(
                page_content=cleaned_text,
                metadata=metadata
            )
        except Exception as e:
            # Deliberate best-effort boundary: report and skip this file.
            print(f"Error reading {file_path.name}: {str(e)}")
            return None

    def load_lyrics(self) -> List["Document"]:
        """Load and process lyrics from directory structure organized by artist.

        Returns:
            One ``Document`` per lyrics file that was read and cleaned
            successfully, ordered by artist directory and then by file path.

        Raises:
            FileNotFoundError: If ``lyrics_dir`` does not exist.
            ValueError: If no valid lyrics file is found.
        """
        if not self.lyrics_dir.exists():
            raise FileNotFoundError(
                f"Lyrics directory not found: {self.lyrics_dir}"
            )

        # Collect the work list once, so the progress-bar total always
        # matches the files processed and each file is validated only once.
        lyric_files = [
            (artist_dir.name, lyric_file)
            for artist_dir in sorted(self.lyrics_dir.iterdir())
            if artist_dir.is_dir()
            for lyric_file in sorted(artist_dir.glob('*.txt'))
            if self.is_valid_lyric_file(lyric_file)
        ]
        if not lyric_files:
            raise ValueError("No valid lyrics files found")

        documents = []
        with tqdm(total=len(lyric_files), desc="Loading lyrics") as pbar:
            for artist_name, lyric_file in lyric_files:
                doc = self.read_and_validate_lyrics(lyric_file, artist_name)
                if doc is not None:
                    documents.append(doc)
                pbar.update(1)

        print(f"Successfully loaded {len(documents)} valid lyrics files")
        return documents