|
|
|
|
|
|
|
|
|
|
|
""" |
|
|
Process songs.jsonl to generate corresponding lrc files and jsonl files. |
|
|
""" |
|
|
|
|
|
import json |
|
|
import os |
|
|
import re |
|
|
from pathlib import Path |
|
|
from typing import List |
|
|
|
|
|
# Input file: one JSON object per line; the "description" and "lyrics" fields
# of each object are consumed.
# NOTE(review): the "xxx" prefix looks like a placeholder for a local
# checkout location -- confirm before running.
INPUT_JSONL = Path("xxx/diffrhythm2/example/final_zh_test.jsonl")
# Directory that receives one single-entry ``song_<n>.jsonl`` file per record.
OUTPUT_SONG_DIR = Path("xxx/diffrhythm2/example/zh_songs")
# Directory that receives one ``song_<n>.lrc`` lyrics file per record.
OUTPUT_LRC_DIR = Path("xxx/diffrhythm2/example/zh_lrc")
|
|
|
|
|
# LRC time tag, e.g. "[01:23]", "[01:23.45]", "[1:23.45]" or "[01:23:45]".
# Minutes may have 1-3 digits and the optional fraction may be introduced by
# either "." or ":"; both variants occur in LRC files found in the wild.
# Non-numeric tags such as "[Verse 1]" or "[ti:Title]" never match.
TIMESTAMP_PATTERN = re.compile(r"\[\d{1,3}:\d{2}(?:[.:]\d+)?\]")
# A line that consists of exactly one bracketed tag, e.g. "[Verse 1]".
STRUCTURE_PATTERN = re.compile(r"^\[[^\]]+\]$")
|
|
|
|
|
|
|
|
def normalize_structure(tag: str) -> str:
    """Convert a structure tag name to the target bracketed form.

    Args:
        tag: Tag text without the surrounding brackets, e.g. ``"Verse 1"``.

    Returns:
        ``"[verse]"`` when the tag starts with "verse"; ``"[chorus]"`` or
        ``"[bridge]"`` when the tag contains that word (so "Pre-Chorus"
        becomes ``"[chorus]"``); otherwise the lower-cased tag wrapped in
        brackets. Matching is case-insensitive and ignores surrounding
        whitespace.
    """
    # Strip first so that the prefix test below is not defeated by padding.
    tag_lower = tag.strip().lower()
    if tag_lower.startswith("verse"):
        return "[verse]"
    # Substring match (checked in this order) collapses numbered or prefixed
    # variants onto the base section name.
    for section in ("chorus", "bridge"):
        if section in tag_lower:
            return f"[{section}]"
    return f"[{tag_lower}]"
|
|
|
|
|
|
|
|
def transform_lyrics(raw_lyrics: str) -> List[str]:
    """Convert raw lyrics text into the list of lines of the output LRC file.

    The result always begins with ``"[start]"`` and ``"[intro]"`` and ends
    with ``"[end]"``. In between, for every non-blank input line:

    * time tags matched by ``TIMESTAMP_PATTERN`` are removed;
    * if what remains is a single bracketed tag, it is normalized with
      ``normalize_structure`` (this also covers tags that were prefixed by a
      time tag, e.g. ``"[00:10.00][Chorus]"``);
    * otherwise the remaining text is kept as a lyric line;
    * lines that are empty after time-tag removal are dropped.

    Args:
        raw_lyrics: Lyrics text, possibly multi-line. ``None`` or an empty
            string yields only the fixed start/intro/end lines.

    Returns:
        The LRC lines in output order, without trailing newlines.
    """
    lines = ["[start]", "[intro]"]
    for raw_line in (raw_lyrics or "").splitlines():
        # Remove time tags before classifying the line so that a tag preceded
        # by a timestamp is still recognised as a structure tag.
        text = TIMESTAMP_PATTERN.sub("", raw_line).strip()
        if not text:
            continue
        if STRUCTURE_PATTERN.match(text):
            # Drop the surrounding brackets before normalizing.
            lines.append(normalize_structure(text[1:-1].strip()))
        else:
            lines.append(text)
    lines.append("[end]")
    return lines
|
|
|
|
|
|
|
|
def ensure_dirs() -> None:
    """Create the song and LRC output directories, including missing parents.

    Directories that already exist are left untouched.
    """
    for directory in (OUTPUT_SONG_DIR, OUTPUT_LRC_DIR):
        directory.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
|
|
def process_songs() -> None:
    """Generate one LRC file and one single-entry JSONL file per input record.

    Reads ``INPUT_JSONL`` line by line, skipping blank lines. For each record:

    * the "lyrics" field is converted with ``transform_lyrics`` and written
      to ``OUTPUT_LRC_DIR / "song_<n>.lrc"``;
    * a JSON object with the keys ``song_name``, ``style_prompt`` (the
      record's "description") and ``lyrics`` (a path string pointing at the
      LRC file) is written to ``OUTPUT_SONG_DIR / "song_<n>.jsonl"``.

    ``<n>`` is the 1-based line number within the input file, so blank input
    lines leave gaps in the numbering. A progress line is printed per song.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        OSError: If the input cannot be read or an output cannot be written.
    """
    ensure_dirs()
    with INPUT_JSONL.open("r", encoding="utf-8") as infile:
        for idx, line in enumerate(infile, start=1):
            line = line.strip()
            if not line:
                continue
            data = json.loads(line)
            description = data.get("description", "")
            # A record may carry an explicit JSON null for "lyrics"; treat it
            # as empty text instead of failing on None.splitlines().
            lyrics_raw = data.get("lyrics") or ""

            song_base = f"song_{idx}"

            # Lyrics file.
            lrc_filename = f"{song_base}.lrc"
            lrc_lines = transform_lyrics(lyrics_raw)
            lrc_path = OUTPUT_LRC_DIR / lrc_filename
            lrc_path.write_text("\n".join(lrc_lines), encoding="utf-8")

            # Per-song metadata file.
            song_filename = f"{song_base}.jsonl"
            song_json_path = OUTPUT_SONG_DIR / song_filename
            song_entry = {
                "song_name": song_base,
                "style_prompt": description,
                # NOTE(review): this path is hard-coded and does not follow
                # OUTPUT_LRC_DIR; presumably it is relative to the directory
                # the consumer runs from -- confirm before changing either.
                "lyrics": f"example/zh_lrc/{lrc_filename}",
            }
            song_json_path.write_text(
                json.dumps(song_entry, ensure_ascii=False) + "\n",
                encoding="utf-8",
            )
            print(f"Processed song {idx}: {song_filename}")
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
process_songs() |
|
|
|