Spaces:

SongLift
/

LyrGen2

Sleeping

File size: 8,726 Bytes

import os
import re
from pathlib import Path
from typing import List, Optional

import chardet
from langchain_core.documents import Document
from tqdm import tqdm

class LyricsLoader:
    def __init__(self, lyrics_dir: str = "lyrics"):
        self.lyrics_dir = Path(lyrics_dir)

    def detect_file_encoding(self, file_path: Path) -> str:
        """Detect the encoding of a file with chardet.

        Args:
            file_path: File whose raw bytes are inspected.

        Returns:
            The encoding name reported by ``chardet.detect``. When chardet
            cannot decide (it reports ``None``, e.g. for an empty file),
            ``'utf-8'`` is returned so the declared ``str`` contract holds.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        with open(file_path, 'rb') as file:
            raw_data = file.read()
        # Detection runs after the handle is closed; only the bytes are needed.
        result = chardet.detect(raw_data)
        # chardet reports encoding=None when it has no confident guess.
        return result['encoding'] or 'utf-8'

    def clean_lyrics_text(self, text: str) -> str:
        """Clean up lyrics text and normalize formatting"""
        # First, handle text with no line breaks
        if '\n' not in text and len(text) > 100:
            # Add breaks after common punctuation
            for punct in ['. ', '? ', '! ']:
                text = text.replace(punct, punct + '\n')
            # Add breaks for repeated words that might indicate chorus
            for word in [' cause ', ' cos ', ' when ', ' and ']:
                text = text.replace(word, '\n' + word.strip())
            # Break very long lines
            if len(text) > 200:
                words = text.split()
                lines = []
                current_line = []
                for word in words:
                    current_line.append(word)
                    if len(' '.join(current_line)) > 50:  # reasonable line length
                        lines.append(' '.join(current_line))
                        current_line = []
                if current_line:
                    lines.append(' '.join(current_line))
                text = '\n'.join(lines)
        
        # Split into lines
        lines = text.split('\n')
        cleaned_lines = []
        prev_line = ""
        consecutive_empty = 0
        
        # Header patterns to remove (not whole lines)
        header_patterns = [
            'contributors',
            'translations',
            'lyrics',
            'tradução',
            'traducción',
            'written by',
            'produced by',
            'you might also like',
            'embed'
        ]
        
        for line in lines:
            # Normalize whitespace
            line = line.strip()
            
            # Handle empty lines
            if not line:
                consecutive_empty += 1
                if consecutive_empty <= 2:  # Keep up to 2 empty lines for section breaks
                    cleaned_lines.append("")
                continue
            
            consecutive_empty = 0
            
            # Remove header patterns from line instead of skipping whole line
            lower_line = line.lower()
            cleaned_line = line
            for pattern in header_patterns:
                # Case-insensitive replacement of the pattern
                pattern_start = lower_line.find(pattern)
                if pattern_start != -1:
                    pattern_end = pattern_start + len(pattern)
                    # Remove the pattern and any following colon or dash
                    while pattern_end < len(line) and line[pattern_end] in [':', '-', ' ']:
                        pattern_end += 1
                    cleaned_line = line[:pattern_start].strip() + ' ' + line[pattern_end:].strip()
                    cleaned_line = cleaned_line.strip()
            
            # Skip if line became empty after cleaning
            if not cleaned_line:
                continue
                
            # Skip duplicate lines
            if cleaned_line == prev_line:
                continue
                
            # Handle section markers
            if any(pattern in lower_line for pattern in [
                'verse', 'chorus', 'bridge', 'hook',
                'intro', 'outro', 'pre-chorus'
            ]):
                cleaned_lines.append(f"[{cleaned_line.strip('[]')}]")
                continue
            
            cleaned_lines.append(cleaned_line)
            prev_line = cleaned_line
        
        # Remove trailing empty lines
        while cleaned_lines and not cleaned_lines[-1]:
            cleaned_lines.pop()
            
        # Join lines and ensure text ends with newline
        cleaned_text = '\n'.join(cleaned_lines)
        return cleaned_text.strip()

    def is_valid_lyric_file(self, file_path: Path) -> bool:
        """Check if file is a valid lyrics file"""
        # Skip invalid patterns
        invalid_patterns = [
            '[artwork]', 'artwork', 'cover', '.jpg', '.png',
            'tracklist', 'credits', 'booklet', 'album art'
        ]
        
        # Check filename
        lower_name = file_path.name.lower()
        if any(pattern in lower_name for pattern in invalid_patterns):
            return False
            
        # Check extension
        if not lower_name.endswith('.txt'):
            return False
            
        # Check file size (100B to 1MB)
        file_size = file_path.stat().st_size
        if file_size < 100 or file_size > 1000000:
            return False
            
        return True

    def read_and_validate_lyrics(
        self,
        file_path: Path,
        artist_name: str
    ) -> Optional[Document]:
        """Read and validate a lyrics file with encoding detection"""
        try:
            # Try common encodings
            for encoding in ['utf-8', 'latin-1', 'cp1252']:
                try:
                    with open(file_path, 'r', encoding=encoding) as f:
                        text = f.read().strip()
                        
                        # Basic validation
                        if not text or len(text) < 10:
                            print(f"Warning: Invalid or empty lyrics in {file_path.name}")
                            return None
                            
                        # Clean the text
                        cleaned_text = self.clean_lyrics_text(text)
                        if not cleaned_text:
                            print(f"Warning: No valid content after cleaning in {file_path.name}")
                            return None
                            
                        # Create metadata
                        metadata = {
                            'artist': artist_name,
                            'song_title': file_path.stem,
                            'source': str(file_path),
                            'encoding': encoding,
                            'original_size': len(text),
                            'cleaned_size': len(cleaned_text)
                        }
                        
                        return Document(
                            page_content=cleaned_text,
                            metadata=metadata
                        )
                except UnicodeDecodeError:
                    continue
                    
            print(f"Error: Could not decode {file_path.name} with supported encodings")
            return None
            
        except Exception as e:
            print(f"Error reading {file_path.name}: {str(e)}")
            return None

    def load_lyrics(self) -> List[Document]:
        """Load and process lyrics from directory structure organized by artist"""
        documents = []
        
        if not self.lyrics_dir.exists():
            raise FileNotFoundError(
                f"Lyrics directory not found: {self.lyrics_dir}"
            )
            
        # First, count valid files
        total_files = sum(
            1 for artist_dir in self.lyrics_dir.iterdir()
            if artist_dir.is_dir()
            for f in artist_dir.glob('*.txt')
            if self.is_valid_lyric_file(f)
        )
        
        if total_files == 0:
            raise ValueError("No valid lyrics files found")
            
        # Process files with progress bar
        with tqdm(total=total_files, desc="Loading lyrics") as pbar:
            for artist_dir in self.lyrics_dir.iterdir():
                if artist_dir.is_dir():
                    artist_name = artist_dir.name
                    lyric_files = [
                        f for f in artist_dir.glob('*.txt')
                        if self.is_valid_lyric_file(f)
                    ]
                    
                    for lyric_file in lyric_files:
                        doc = self.read_and_validate_lyrics(
                            lyric_file,
                            artist_name
                        )
                        if doc:
                            documents.append(doc)
                        pbar.update(1)
                        
        print(f"Successfully loaded {len(documents)} valid lyrics files")
        return documents