#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Process a songs JSONL file, generating one .lrc lyrics file and one single-entry .jsonl file per song.
"""

import json
import os
import re
from pathlib import Path
from typing import List

# Input JSONL file; process_songs reads one JSON object per non-blank line.
INPUT_JSONL = Path("xxx/diffrhythm2/example/final_zh_test.jsonl")
# Directory receiving one single-entry song_<n>.jsonl file per processed line.
OUTPUT_SONG_DIR = Path("xxx/diffrhythm2/example/zh_songs")
# Directory receiving one song_<n>.lrc lyrics file per processed line.
OUTPUT_LRC_DIR = Path("xxx/diffrhythm2/example/zh_lrc")

# Matches a bracketed timestamp such as "[01:23]" or "[01:23.45]": exactly two
# digits for each of the two colon-separated fields, plus an optional fraction.
TIMESTAMP_PATTERN = re.compile(r"\[\d{2}:\d{2}(?:\.\d+)?\]")
# Matches a line that consists solely of one bracketed group, e.g. "[Verse 1]".
# A lone timestamp also satisfies this pattern, so callers must rule that out.
STRUCTURE_PATTERN = re.compile(r"^\[[^\]]+\]$")


def normalize_structure(tag: str) -> str:
    """Return the canonical bracketed form of a structure tag.

    Args:
        tag: Tag text without the surrounding brackets, e.g. ``"Verse 1"``.

    Returns:
        ``"[verse]"`` when the lower-cased tag starts with "verse";
        ``"[chorus]"`` or ``"[bridge]"`` when it contains that word anywhere
        (chorus is checked first); otherwise the lower-cased tag in brackets.
    """
    lowered = tag.lower()
    if lowered.startswith("verse"):
        canonical = "verse"
    else:
        # First keyword found as a substring wins; fall back to the tag itself.
        canonical = next(
            (keyword for keyword in ("chorus", "bridge") if keyword in lowered),
            lowered,
        )
    return "[" + canonical + "]"


def transform_lyrics(raw_lyrics: str) -> List[str]:
    """Convert raw lyrics text into the list of lines written to an LRC file.

    The result always begins with ``"[start]"`` and ``"[intro]"`` and ends
    with ``"[end]"``. In between, each non-empty input line contributes one
    entry: timestamps are removed, and a line that is then a single bracketed
    group is passed through ``normalize_structure``; any other remaining text
    is kept as-is. Lines that are blank, or that contain only timestamps, are
    dropped.

    Args:
        raw_lyrics: Lyrics text, possibly containing ``[mm:ss.xx]`` timestamps
            and bracketed structure tags such as ``[Verse 1]``.

    Returns:
        The transformed lines, without trailing newlines.
    """
    lines = ["[start]", "[intro]"]
    for raw_line in raw_lyrics.splitlines():
        # Strip timestamps before classifying the line, so that a tag which
        # follows a timestamp (e.g. "[00:10.00][Verse 1]") is still recognised
        # as a structure tag instead of being emitted as un-normalised text.
        text = TIMESTAMP_PATTERN.sub("", raw_line).strip()
        if not text:
            continue

        if STRUCTURE_PATTERN.match(text):
            # Drop the surrounding brackets before normalising the tag name.
            lines.append(normalize_structure(text[1:-1].strip()))
        else:
            lines.append(text)

    lines.append("[end]")
    return lines


def ensure_dirs() -> None:
    """Create both output directories, including missing parents, if absent."""
    for directory in (OUTPUT_SONG_DIR, OUTPUT_LRC_DIR):
        directory.mkdir(parents=True, exist_ok=True)


def process_songs() -> None:
    """Generate one .lrc file and one single-entry .jsonl file per input song.

    Reads ``INPUT_JSONL`` line by line. For each non-blank line, the JSON
    object's ``"lyrics"`` value is transformed with ``transform_lyrics`` and
    written to ``OUTPUT_LRC_DIR/song_<n>.lrc``; a JSON entry holding the song
    name, the ``"description"`` value (as ``style_prompt``) and the lyrics
    path is written to ``OUTPUT_SONG_DIR/song_<n>.jsonl``.

    ``<n>`` is the 1-based physical line number in the input file, so blank
    lines leave gaps in the numbering.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    ensure_dirs()
    with INPUT_JSONL.open("r", encoding="utf-8") as infile:
        for idx, line in enumerate(infile, start=1):
            line = line.strip()
            if not line:
                continue
            data = json.loads(line)
            description = data.get("description", "")
            # A JSON null for "lyrics" yields None from .get(); treat it like
            # a missing key so transform_lyrics always receives a string.
            lyrics_raw = data.get("lyrics") or ""

            lrc_lines = transform_lyrics(lyrics_raw)
            lrc_filename = f"song_{idx}.lrc"
            lrc_path = OUTPUT_LRC_DIR / lrc_filename
            lrc_path.write_text("\n".join(lrc_lines), encoding="utf-8")

            song_base = f"song_{idx}"
            song_filename = f"{song_base}.jsonl"
            song_json_path = OUTPUT_SONG_DIR / song_filename
            song_entry = {
                "song_name": song_base,
                "style_prompt": description,
                # Hard-coded relative path; it is not derived from OUTPUT_LRC_DIR.
                "lyrics": f"example/zh_lrc/{lrc_filename}",
            }
            song_json_path.write_text(json.dumps(song_entry, ensure_ascii=False) + "\n", encoding="utf-8")
            print(f"Processed song {idx}: {song_filename}")


# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    process_songs()