Spaces:

SongLift
/

LyrGen2

Sleeping

James Edmunds

Final updates before making cleanstart2 the new main

df5d609 over 1 year ago

8.73 kB

	import os
	from pathlib import Path
	from typing import List, Optional
	from langchain_core.documents import Document
	import chardet
	from tqdm import tqdm

	class LyricsLoader:
	    """Load lyrics text files from a per-artist directory tree.

	    ``load_lyrics`` walks ``lyrics_dir``, treats every sub-directory as an
	    artist and every accepted ``*.txt`` file inside it as one song, and
	    returns one ``Document`` per song with cleaned text and metadata.
	    """

	    # Boilerplate fragments removed from inside a line (case-insensitive);
	    # the rest of the line is kept.
	    HEADER_PATTERNS = (
	        'contributors',
	        'translations',
	        'lyrics',
	        'tradução',
	        'traducción',
	        'written by',
	        'produced by',
	        'you might also like',
	        'embed',
	    )

	    # Whole words that turn a line into a "[...]" section marker.
	    # 'pre-chorus' needs no entry: the hyphen splits it and 'chorus' matches.
	    SECTION_KEYWORDS = frozenset(
	        {'verse', 'chorus', 'bridge', 'hook', 'intro', 'outro'}
	    )

	    # Filename fragments that identify non-lyrics files.
	    INVALID_NAME_PATTERNS = (
	        '[artwork]', 'artwork', 'cover', '.jpg', '.png',
	        'tracklist', 'credits', 'booklet', 'album art',
	    )

	    # Tried in order. latin-1 maps every possible byte, so it can never
	    # fail and must come last; placed earlier it would hide cp1252.
	    FALLBACK_ENCODINGS = ('utf-8', 'cp1252', 'latin-1')

	    # Accepted file size range in bytes (100 B to 1 MB).
	    MIN_FILE_SIZE = 100
	    MAX_FILE_SIZE = 1000000

	    # A wrapped line is closed as soon as it grows past this many characters.
	    WRAP_WIDTH = 50

	    def __init__(self, lyrics_dir: str = "lyrics"):
	        """Remember the root directory that holds the artist folders."""
	        self.lyrics_dir = Path(lyrics_dir)

	    def detect_file_encoding(self, file_path: Path) -> str:
	        """Detect the encoding of a file with chardet.

	        Returns:
	            The encoding name reported by chardet, or ``'utf-8'`` when
	            chardet cannot decide and reports ``None`` (e.g. empty file).
	        """
	        with open(file_path, 'rb') as file:
	            raw_data = file.read()
	        result = chardet.detect(raw_data)
	        return result['encoding'] or 'utf-8'

	    def clean_lyrics_text(self, text: str) -> str:
	        """Clean up lyrics text and normalize formatting.

	        Steps, in order:
	          * text longer than 100 characters without any newline gets line
	            breaks inserted (see ``_insert_line_breaks``);
	          * every line is stripped; at most two consecutive blank lines
	            are kept;
	          * header boilerplate is removed from inside each line;
	          * a line equal to the previously kept lyric line is dropped;
	          * a line containing a section keyword as a whole word is
	            emitted as ``[line]``.

	        Args:
	            text: Raw lyrics text.

	        Returns:
	            The cleaned text without leading or trailing whitespace.
	        """
	        if '\n' not in text and len(text) > 100:
	            text = self._insert_line_breaks(text)

	        cleaned_lines = []
	        prev_line = ""
	        consecutive_empty = 0

	        for raw_line in text.split('\n'):
	            line = raw_line.strip()

	            if not line:
	                consecutive_empty += 1
	                # Keep up to 2 empty lines for section breaks
	                if consecutive_empty <= 2:
	                    cleaned_lines.append("")
	                continue
	            consecutive_empty = 0

	            cleaned_line = self._strip_header_patterns(line)

	            # Skip lines that were nothing but boilerplate
	            if not cleaned_line:
	                continue

	            # Skip a repeat of the previously kept lyric line
	            if cleaned_line == prev_line:
	                continue

	            # Markers are detected on the line as read, before header
	            # removal, and do not update prev_line.
	            if self._is_section_marker(line):
	                cleaned_lines.append(f"[{cleaned_line.strip('[]')}]")
	                continue

	            cleaned_lines.append(cleaned_line)
	            prev_line = cleaned_line

	        # strip() also removes the blank lines kept at either end
	        return '\n'.join(cleaned_lines).strip()

	    def _insert_line_breaks(self, text: str) -> str:
	        """Add line breaks to lyrics that arrived as one long line."""
	        # Break after sentence-ending punctuation
	        for punct in ('. ', '? ', '! '):
	            text = text.replace(punct, punct + '\n')

	        # Break before words that commonly start a new lyric line.
	        # lstrip() keeps the trailing space so the word is not glued to
	        # the one that follows it.
	        for word in (' cause ', ' cos ', ' when ', ' and '):
	            text = text.replace(word, '\n' + word.lstrip())

	        # Still long: wrap each line on its own so the breaks inserted
	        # above are preserved.
	        if len(text) > 200:
	            wrapped = []
	            for line in text.split('\n'):
	                wrapped.extend(self._wrap_words(line, self.WRAP_WIDTH))
	            text = '\n'.join(wrapped)

	        return text

	    @staticmethod
	    def _wrap_words(line: str, limit: int) -> list:
	        """Split a line into pieces, closing each once it exceeds limit."""
	        pieces = []
	        current = []
	        width = 0
	        for word in line.split():
	            # Width of ' '.join(current) after adding this word
	            width += len(word) + (1 if current else 0)
	            current.append(word)
	            if width > limit:
	                pieces.append(' '.join(current))
	                current = []
	                width = 0
	        if current:
	            pieces.append(' '.join(current))
	        return pieces

	    def _strip_header_patterns(self, line: str) -> str:
	        """Remove the first occurrence of each header pattern from a line.

	        Each removal is applied to the result of the previous one, so a
	        line containing several patterns loses all of them.
	        """
	        cleaned = line
	        for pattern in self.HEADER_PATTERNS:
	            start = cleaned.lower().find(pattern)
	            if start == -1:
	                continue
	            end = start + len(pattern)
	            # Also drop any colon, dash or space right after the pattern
	            while end < len(cleaned) and cleaned[end] in (':', '-', ' '):
	                end += 1
	            cleaned = (
	                cleaned[:start].strip() + ' ' + cleaned[end:].strip()
	            ).strip()
	        return cleaned

	    def _is_section_marker(self, line: str) -> bool:
	        """Tell whether a line contains a section keyword as a whole word.

	        Whole-word matching keeps lyric lines such as "Across the
	        universe" (which merely contains "verse") from becoming markers.
	        """
	        words = ''.join(
	            ch if ch.isalpha() else ' ' for ch in line.lower()
	        ).split()
	        return not self.SECTION_KEYWORDS.isdisjoint(words)

	    def is_valid_lyric_file(self, file_path: Path) -> bool:
	        """Check if file is a valid lyrics file.

	        A file is accepted when its name contains none of the invalid
	        patterns, ends with ``.txt``, it is a regular file, and its size
	        is between 100 bytes and 1 MB inclusive.

	        Returns:
	            True when all checks pass; False otherwise, including when
	            the path is missing or cannot be inspected.
	        """
	        lower_name = file_path.name.lower()
	        if any(pattern in lower_name for pattern in self.INVALID_NAME_PATTERNS):
	            return False

	        if not lower_name.endswith('.txt'):
	            return False

	        try:
	            # A directory named "*.txt" would otherwise pass the size check
	            if not file_path.is_file():
	                return False
	            file_size = file_path.stat().st_size
	        except OSError:
	            # One unreadable entry must not abort the whole scan
	            return False

	        return self.MIN_FILE_SIZE <= file_size <= self.MAX_FILE_SIZE

	    def _read_text(self, file_path: Path) -> Optional[tuple]:
	        """Read a file with the first fallback encoding that decodes it.

	        Returns:
	            A ``(text, encoding)`` tuple, or None when no encoding in
	            ``FALLBACK_ENCODINGS`` can decode the file.
	        """
	        for encoding in self.FALLBACK_ENCODINGS:
	            try:
	                with open(file_path, 'r', encoding=encoding) as f:
	                    return f.read(), encoding
	            except UnicodeDecodeError:
	                continue
	        return None

	    def read_and_validate_lyrics(
	        self,
	        file_path: Path,
	        artist_name: str
	    ) -> "Optional[Document]":
	        """Read and validate a lyrics file, trying several encodings.

	        Args:
	            file_path: Path of the lyrics file.
	            artist_name: Stored in the document metadata as ``artist``.

	        Returns:
	            A Document whose ``page_content`` is the cleaned lyrics and
	            whose metadata holds artist, song title (file stem), source
	            path, the encoding used and the text sizes before and after
	            cleaning. None when the file cannot be read or decoded, holds
	            fewer than 10 characters, or is empty after cleaning; the
	            reason is printed. This method does not raise.
	        """
	        # Deliberately broad: one bad file is reported and skipped so a
	        # bulk load can continue.
	        try:
	            decoded = self._read_text(file_path)
	            if decoded is None:
	                print(f"Error: Could not decode {file_path.name} with supported encodings")
	                return None

	            raw_text, encoding = decoded
	            text = raw_text.strip()

	            # Basic validation
	            if len(text) < 10:
	                print(f"Warning: Invalid or empty lyrics in {file_path.name}")
	                return None

	            cleaned_text = self.clean_lyrics_text(text)
	            if not cleaned_text:
	                print(f"Warning: No valid content after cleaning in {file_path.name}")
	                return None

	            metadata = {
	                'artist': artist_name,
	                'song_title': file_path.stem,
	                'source': str(file_path),
	                'encoding': encoding,
	                'original_size': len(text),
	                'cleaned_size': len(cleaned_text)
	            }

	            return Document(
	                page_content=cleaned_text,
	                metadata=metadata
	            )

	        except Exception as e:
	            print(f"Error reading {file_path.name}: {str(e)}")
	            return None

	    def load_lyrics(self) -> "List[Document]":
	        """Load and process lyrics from a directory tree organized by artist.

	        Returns:
	            One Document per lyrics file that was read and cleaned
	            successfully, ordered by artist directory and then file path.

	        Raises:
	            FileNotFoundError: ``lyrics_dir`` does not exist.
	            ValueError: No valid lyrics file was found.
	        """
	        if not self.lyrics_dir.exists():
	            raise FileNotFoundError(
	                f"Lyrics directory not found: {self.lyrics_dir}"
	            )

	        # Scan once so the progress total always matches the files that
	        # are processed; sort so the result order is reproducible.
	        lyric_files = []
	        for artist_dir in sorted(self.lyrics_dir.iterdir()):
	            if not artist_dir.is_dir():
	                continue
	            for lyric_file in sorted(artist_dir.glob('*.txt')):
	                if self.is_valid_lyric_file(lyric_file):
	                    lyric_files.append((artist_dir.name, lyric_file))

	        if not lyric_files:
	            raise ValueError("No valid lyrics files found")

	        documents = []
	        with tqdm(total=len(lyric_files), desc="Loading lyrics") as pbar:
	            for artist_name, lyric_file in lyric_files:
	                doc = self.read_and_validate_lyrics(lyric_file, artist_name)
	                if doc is not None:
	                    documents.append(doc)
	                pbar.update(1)

	        print(f"Successfully loaded {len(documents)} valid lyrics files")
	        return documents