import os
import re
from pathlib import Path
from typing import List, Optional

import chardet
from langchain_core.documents import Document
from tqdm import tqdm


class LyricsLoader:
    """Load lyric text files from disk as LangChain ``Document`` objects.

    The loader expects ``<lyrics_dir>/<artist>/<song title>.txt``: every
    sub-directory of ``lyrics_dir`` is treated as one artist and every
    accepted ``*.txt`` file inside it as one song.
    """

    # Boilerplate fragments stripped from inside a line (the rest of the line
    # is kept). Matching is case-insensitive and substring-based.
    _HEADER_PATTERNS = (
        'contributors', 'translations', 'lyrics', 'tradução',
        'traducción', 'written by', 'produced by',
        'you might also like', 'embed',
    )
    # One regex per pattern; also swallows any ':', '-' or ' ' that follows.
    _HEADER_REGEXES = tuple(
        re.compile(re.escape(pattern) + r'[:\- ]*', re.IGNORECASE)
        for pattern in _HEADER_PATTERNS
    )

    # Section keywords. The look-arounds require that the keyword is not part
    # of a longer word, so "universe" or "introduce" are not section markers
    # while "Verse 1", "[Chorus]" and "verse2" still are.
    _SECTION_MARKER_RE = re.compile(
        r'(?<![a-z])'
        r'(?:pre-chorus|verse|chorus|bridge|hook|intro|outro)'
        r'(?![a-z])',
        re.IGNORECASE,
    )

    # Heuristics for text that arrived without any line breaks.
    _SENTENCE_BREAKS = ('. ', '? ', '! ')
    _CLAUSE_WORDS = (' cause ', ' cos ', ' when ', ' and ')
    _UNBROKEN_MIN_LENGTH = 100   # only re-break text longer than this
    _WRAP_MIN_LENGTH = 200       # only wrap lines when text is longer than this
    _WRAP_WIDTH = 50             # a line is closed once it exceeds this length

    # Filename fragments that mark a file as something other than lyrics.
    _INVALID_NAME_PATTERNS = (
        '[artwork]', 'artwork', 'cover', '.jpg', '.png',
        'tracklist', 'credits', 'booklet', 'album art',
    )
    _MIN_FILE_SIZE = 100         # bytes
    _MAX_FILE_SIZE = 1000000     # bytes

    # Tried in order. 'latin-1' accepts every byte sequence, so it must come
    # last; anything listed after it could never be reached.
    _CANDIDATE_ENCODINGS = ('utf-8', 'cp1252', 'latin-1')

    def __init__(self, lyrics_dir: str = "lyrics"):
        """Remember the root directory that holds one folder per artist.

        Args:
            lyrics_dir: Path to the root lyrics directory.
        """
        self.lyrics_dir = Path(lyrics_dir)

    def detect_file_encoding(self, file_path: Path) -> str:
        """Guess the text encoding of a file with ``chardet``.

        Args:
            file_path: File to inspect; it is read fully, in binary mode.

        Returns:
            The encoding name reported by ``chardet``, or ``'utf-8'`` when
            ``chardet`` cannot make a guess (it reports ``None`` then).
        """
        with open(file_path, 'rb') as file:
            raw_data = file.read()
        return chardet.detect(raw_data)['encoding'] or 'utf-8'

    def _wrap_line(self, line: str) -> List[str]:
        """Greedily split one line into pieces of roughly ``_WRAP_WIDTH`` chars.

        A piece is closed as soon as it grows past ``_WRAP_WIDTH``, so pieces
        may overshoot by one word. Blank input yields an empty list.
        """
        pieces = []
        current = []
        for word in line.split():
            current.append(word)
            if len(' '.join(current)) > self._WRAP_WIDTH:
                pieces.append(' '.join(current))
                current = []
        if current:
            pieces.append(' '.join(current))
        return pieces

    def _break_unbroken_text(self, text: str) -> str:
        """Insert line breaks into text that has none.

        Breaks are added after sentence punctuation and before a few
        conjunctions; for long text each resulting line is then wrapped.
        """
        for punct in self._SENTENCE_BREAKS:
            text = text.replace(punct, punct + '\n')
        for word in self._CLAUSE_WORDS:
            # lstrip (not strip): the trailing space must survive, otherwise
            # the conjunction fuses with the next word ("and me" -> "andme").
            text = text.replace(word, '\n' + word.lstrip())

        if len(text) <= self._WRAP_MIN_LENGTH:
            return text

        # Wrap line by line so the breaks inserted above are preserved.
        wrapped = []
        for line in text.split('\n'):
            wrapped.extend(self._wrap_line(line))
        return '\n'.join(wrapped)

    def _strip_header_patterns(self, line: str) -> str:
        """Remove the first occurrence of every header pattern from a line.

        Each removal is applied to the result of the previous one, so a line
        containing several patterns loses all of them.
        """
        cleaned = line
        for regex in self._HEADER_REGEXES:
            match = regex.search(cleaned)
            if match is None:
                continue
            before = cleaned[:match.start()].strip()
            after = cleaned[match.end():].strip()
            cleaned = f"{before} {after}".strip()
        return cleaned

    def clean_lyrics_text(self, text: str) -> str:
        """Clean up lyrics text and normalize its formatting.

        Steps, in order:
          1. Text longer than 100 characters without any newline gets
             heuristic line breaks.
          2. Every line is stripped; runs of blank lines are capped at two.
          3. Header/boilerplate fragments are removed from within lines.
          4. Lines left empty, and lines equal to the previously kept
             non-blank line, are dropped.
          5. Lines containing a section keyword (verse, chorus, ...) are
             wrapped in square brackets.

        Note that steps 3 and 5 are heuristics: a genuine lyric line that
        contains e.g. the word "lyrics" or "bridge" is affected as well.

        Args:
            text: Raw lyrics text.

        Returns:
            The cleaned text without leading or trailing whitespace; may be
            an empty string.
        """
        if '\n' not in text and len(text) > self._UNBROKEN_MIN_LENGTH:
            text = self._break_unbroken_text(text)

        cleaned_lines = []
        prev_line = ""
        consecutive_empty = 0

        for raw_line in text.split('\n'):
            line = raw_line.strip()

            if not line:
                consecutive_empty += 1
                # Keep up to 2 empty lines as a section break.
                if consecutive_empty <= 2:
                    cleaned_lines.append("")
                continue
            consecutive_empty = 0

            cleaned_line = self._strip_header_patterns(line)
            if not cleaned_line or cleaned_line == prev_line:
                continue
            # Track section markers too, so a lyric line is only treated as a
            # duplicate of the line that actually precedes it.
            prev_line = cleaned_line

            if self._SECTION_MARKER_RE.search(cleaned_line):
                cleaned_lines.append(f"[{cleaned_line.strip('[]')}]")
            else:
                cleaned_lines.append(cleaned_line)

        # strip() also removes the leading/trailing blank lines kept above.
        return '\n'.join(cleaned_lines).strip()

    def is_valid_lyric_file(self, file_path: Path) -> bool:
        """Check whether a path looks like a lyrics file worth loading.

        A file is accepted when its name contains none of the excluded
        fragments (artwork, cover, credits, ...), ends in ``.txt``, refers to
        a regular file, and is between 100 bytes and 1 MB in size.

        Args:
            file_path: Candidate path.

        Returns:
            ``True`` if the file passes all checks; ``False`` otherwise,
            including when the file cannot be inspected.
        """
        lower_name = file_path.name.lower()
        if any(pattern in lower_name for pattern in self._INVALID_NAME_PATTERNS):
            return False

        if not lower_name.endswith('.txt'):
            return False

        try:
            if not file_path.is_file():
                return False
            file_size = file_path.stat().st_size
        except OSError:
            # Broken symlink, permission problem, file removed meanwhile...
            return False

        return self._MIN_FILE_SIZE <= file_size <= self._MAX_FILE_SIZE

    def read_and_validate_lyrics(
        self,
        file_path: Path,
        artist_name: str
    ) -> Optional[Document]:
        """Read one lyrics file, clean it and wrap it in a ``Document``.

        The file is decoded with the first encoding of ``utf-8``, ``cp1252``,
        ``latin-1`` that succeeds. Problems are reported with ``print`` and
        never raised, so one bad file cannot abort a batch load.

        Args:
            file_path: Lyrics file to read.
            artist_name: Artist stored in the document metadata.

        Returns:
            A ``Document`` whose ``page_content`` is the cleaned lyrics and
            whose metadata holds ``artist``, ``song_title`` (file stem),
            ``source``, ``encoding``, ``original_size`` and ``cleaned_size``;
            ``None`` when the file is unreadable, shorter than 10 characters,
            or empty after cleaning.
        """
        try:
            for encoding in self._CANDIDATE_ENCODINGS:
                try:
                    with open(file_path, 'r', encoding=encoding) as f:
                        text = f.read().strip()
                    break
                except UnicodeDecodeError:
                    continue
            else:
                print(f"Error: Could not decode {file_path.name} with supported encodings")
                return None

            # Basic validation (an empty string is also shorter than 10).
            if len(text) < 10:
                print(f"Warning: Invalid or empty lyrics in {file_path.name}")
                return None

            cleaned_text = self.clean_lyrics_text(text)
            if not cleaned_text:
                print(f"Warning: No valid content after cleaning in {file_path.name}")
                return None

            metadata = {
                'artist': artist_name,
                'song_title': file_path.stem,
                'source': str(file_path),
                'encoding': encoding,
                'original_size': len(text),
                'cleaned_size': len(cleaned_text)
            }
            return Document(page_content=cleaned_text, metadata=metadata)

        except Exception as e:
            # Deliberate per-file boundary: report and move on.
            print(f"Error reading {file_path.name}: {str(e)}")
            return None

    def load_lyrics(self) -> List[Document]:
        """Load and process all lyrics below ``lyrics_dir``.

        Artist directories and the files inside them are visited in sorted
        order, so the returned list is deterministic. A progress bar is shown
        while files are read.

        Returns:
            One ``Document`` per file that could be read and cleaned.

        Raises:
            FileNotFoundError: If ``lyrics_dir`` does not exist.
            ValueError: If no valid lyrics file is found.
        """
        if not self.lyrics_dir.exists():
            raise FileNotFoundError(
                f"Lyrics directory not found: {self.lyrics_dir}"
            )

        # Scan once: this gives both the progress-bar total and the work
        # list, and each file is validated (stat'ed) a single time.
        pending = []
        for artist_dir in sorted(self.lyrics_dir.iterdir()):
            if not artist_dir.is_dir():
                continue
            for lyric_file in sorted(artist_dir.glob('*.txt')):
                if self.is_valid_lyric_file(lyric_file):
                    pending.append((artist_dir.name, lyric_file))

        if not pending:
            raise ValueError("No valid lyrics files found")

        documents = []
        with tqdm(total=len(pending), desc="Loading lyrics") as pbar:
            for artist_name, lyric_file in pending:
                doc = self.read_and_validate_lyrics(lyric_file, artist_name)
                if doc is not None:
                    documents.append(doc)
                pbar.update(1)

        print(f"Successfully loaded {len(documents)} valid lyrics files")
        return documents