Spaces:

Jacong
/

muse

Runtime error

App Files Files Community

muse / eval_pipeline /convert_lyrics.py

Jacong

Upload 96 files

aa9be1e verified 3 months ago

raw

history blame contribute delete

2.96 kB

	#!/usr/bin/env python3
	"""
	Convert existing lyric files to transcription.jsonl format
	Usage: python convert_lyrics.py --input_dir <directory_containing_txt> --output <output_file>

	Input format (xxx.txt):
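	First line: optional language tag (Chinese / English / zh / en, any letter case); skipped when present
	Remaining lines: lyric content (non-empty lines are stripped and joined with single spaces)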

	Output format (transcription.jsonl):
	{"file_path": "...", "file_name": "xxx.mp3", "file_idx": 1, "hyp_text": "lyrics"}
	"""
	import argparse, json, os, re, glob
	from pathlib import Path

	def extract_idx(filename):
	    """Extract the index encoded in a file name.

	    The index is the last run of decimal digits found in the name after
	    directory components and the final extension have been removed.

	    Args:
	        filename: File name or path, as ``str`` or ``os.PathLike``.

	    Returns:
	        The last digit run converted to ``int``, or ``None`` when the
	        stem contains no digits.
	    """
	    # Only the final path component counts: digits that belong to a parent
	    # directory (e.g. "take2/song.txt") must not be taken as the file index.
	    stem = os.path.splitext(os.path.basename(os.fspath(filename)))[0]
	    digit_runs = re.findall(r'\d+', stem)
	    return int(digit_runs[-1]) if digit_runs else None

	def read_lyrics(txt_path):
	    """Read a lyric text file and return its lyrics as a single line.

	    If the first line is a language identifier (``chinese``, ``english``,
	    ``zh`` or ``en``, compared case-insensitively after stripping), it is
	    dropped. Every remaining line is stripped, empty lines are discarded,
	    and the rest are joined with single spaces.

	    Args:
	        txt_path: Path of the UTF-8 encoded text file to read.

	    Returns:
	        The merged lyrics; an empty string when no lyric lines remain.
	    """
	    # 'utf-8-sig' decodes plain UTF-8 and additionally drops a leading BOM.
	    # With plain 'utf-8' the BOM stays attached to the first line, so the
	    # language identifier is not recognised and ends up inside the lyrics.
	    with open(txt_path, 'r', encoding='utf-8-sig') as f:
	        stripped = [line.strip() for line in f]

	    # Skip first line if it's a language identifier
	    if stripped and stripped[0].lower() in ('chinese', 'english', 'zh', 'en'):
	        stripped = stripped[1:]

	    # Merge remaining non-empty lines as lyrics
	    return ' '.join(line for line in stripped if line)

	def main(argv=None):
	    """Convert every ``*.txt`` lyric file in a directory into one JSONL file.

	    For each text file the matching audio file name is inferred (same stem
	    with ``.mp3`` or ``.wav``; ``.mp3`` is assumed when neither exists), the
	    lyrics are read with ``read_lyrics`` and the index is taken from the
	    file name with ``extract_idx``. Records are written ordered by index,
	    one JSON object per line, with the keys ``file_path``, ``file_name``,
	    ``file_idx`` and ``hyp_text``.

	    Args:
	        argv: Optional list of command-line arguments. ``None`` (the
	            default) makes argparse read ``sys.argv[1:]``.

	    Raises:
	        SystemExit: On invalid arguments, or when ``--input_dir`` is not an
	            existing directory.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--input_dir", required=True, help="Directory containing txt lyric files")
	    parser.add_argument("--output", default="", help="Output file (default: input_dir/transcription.jsonl)")
	    args = parser.parse_args(argv)

	    input_dir = Path(args.input_dir)
	    # Fail loudly instead of reporting "0 files" for a mistyped directory.
	    if not input_dir.is_dir():
	        parser.error(f"--input_dir is not a directory: {input_dir}")
	    output_file = args.output if args.output else input_dir / "transcription.jsonl"

	    # Find all txt files. The directory part is escaped so that glob
	    # metacharacters in its name ("[", "]", "*", "?") are matched literally.
	    pattern = os.path.join(glob.escape(str(input_dir)), "*.txt")
	    txt_files = sorted(glob.glob(pattern))
	    print(f"Found {len(txt_files)} txt files in {input_dir}")

	    records = []
	    for txt_path in txt_files:
	        txt_name = os.path.basename(txt_path)
	        base_name = os.path.splitext(txt_name)[0]

	        # Use the first audio file that exists next to the txt file;
	        # fall back to the .mp3 name when none is present.
	        for ext in ('.mp3', '.wav'):
	            if (input_dir / f"{base_name}{ext}").exists():
	                audio_name = f"{base_name}{ext}"
	                break
	        else:
	            audio_name = f"{base_name}.mp3"  # Default

	        records.append({
	            "file_path": str(input_dir / audio_name),
	            "file_name": audio_name,
	            "file_idx": extract_idx(txt_name),
	            "hyp_text": read_lyrics(txt_path),
	        })

	    # Sort by index; files without an index go last. The tuple key avoids a
	    # magic sentinel, so arbitrarily large indices still sort before them.
	    records.sort(key=lambda rec: (rec["file_idx"] is None, rec["file_idx"] or 0))

	    # Write output, one JSON object per line
	    with open(output_file, 'w', encoding='utf-8') as f:
	        f.writelines(json.dumps(rec, ensure_ascii=False) + '\n' for rec in records)

	    print(f"Converted {len(records)} files -> {output_file}")

	# Run the conversion only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()