"""Generate multi-turn dialogue data.

Each turn pairs a piece of lyric text (user message) with the slice of audio
tokens that covers the corresponding time span (assistant message).
"""

import json
import os
import re
from typing import Optional

import torch
from tqdm import tqdm

from my_tool import load_jsonl

TOKEN_PER_SECOND = 25  # Number of tokens per second of audio
NUM_ITEMS = 100000  # Process N items first

# Matches LRC-style tags such as [01:23], [01:23.45] or [01:23:450].
timestamp_pattern = re.compile(r"\[([0-9]{1,2}):([0-9]{1,2})(?:[.:]([0-9]{1,3}))?\]")


def _parse_lyric_with_timestamps(lyric: str) -> list[tuple[float, str]]:
    """Parse timestamped lyrics into ``(start_time_s, text)`` pairs.

    The text of an entry is everything between the end of its timestamp tag
    and the next ``[`` (or the end of the string), stripped of surrounding
    whitespace. Tags followed by no text are skipped.

    Args:
        lyric: Raw lyric string containing ``[mm:ss]`` / ``[mm:ss.xx]`` tags.

    Returns:
        A list of ``(start_time_s, text)`` tuples sorted by start time.
        Entries with equal timestamps keep their original relative order.
    """
    entries = []
    for match in timestamp_pattern.finditer(lyric):
        text_start = match.end()
        next_bracket = lyric.find("[", text_start)
        if next_bracket == -1:
            text = lyric[text_start:].strip()
        else:
            text = lyric[text_start:next_bracket].strip()
        if not text:
            continue

        minutes, seconds, fraction_digits = match.groups()
        # The fractional field is a decimal fraction of a second, so scale it
        # by its digit count: ".45" is 0.45 s and ".450" is 0.450 s. Dividing
        # by a fixed 1000 would turn the common two-digit form into 0.045 s.
        if fraction_digits:
            fraction = int(fraction_digits) / 10 ** len(fraction_digits)
        else:
            fraction = 0.0
        entries.append((int(minutes) * 60 + int(seconds) + fraction, text))

    # Callers slice audio between consecutive start times, so the entries
    # must be in ascending time order. list.sort is stable.
    entries.sort(key=lambda entry: entry[0])
    return entries


def _load_audio_tokens(pt_file):
    """Load the MuCodec encoding of an audio file as a long tensor on CPU.

    Singleton dimensions are squeezed away.
    """
    # NOTE(review): torch.load unpickles the file; only point this at
    # trusted .pt files.
    audio_ids = torch.load(pt_file, map_location="cpu").squeeze().long()
    return audio_ids


def _get_token_slice(audio_tokens, start_s, end_s):
    """Render the tokens between ``start_s`` and ``end_s`` (seconds) as text.

    Times are converted to token indices with ``TOKEN_PER_SECOND`` and
    truncated to integers. The rendered tokens are wrapped in ``[SOA]`` and
    ``[EOA]`` markers.
    """
    first = int(start_s * TOKEN_PER_SECOND)
    last = int(end_s * TOKEN_PER_SECOND)
    sliced = audio_tokens[first:last]
    # NOTE(review): the per-token template below is an empty f-string, so each
    # token renders as "" and the result is always "[SOA][EOA]" whatever the
    # slice contains. The token markup was presumably lost from this file;
    # restore the real template (interpolating ``i``) before relying on it.
    rendered = "".join([f"" for i in sliced])
    return "[SOA]" + rendered + "[EOA]"


def _process_item(item, pt_dir: str) -> Optional[list[dict]]:
    """Build the multi-turn conversation for one song.

    Args:
        item: Song record. Reads ``song``/``name`` (token file stem),
            ``tlyric``/``lyric`` (the longer one is used), and the metadata
            fields interpolated into the opening prompt.
        pt_dir: Directory containing ``<song_name>.pt`` token files.

    Returns:
        A list of ``{"role", "content"}`` messages alternating user and
        assistant, or ``None`` when the item has no usable name, no token
        file, or no parsable timestamped lyrics.
    """
    song_name = item.get("song") or item.get("name")
    if not song_name:
        # Without a name there is no token file to look up; skip the item.
        return None
    song_name = song_name.split('.mp3')[0]  # For mucodec, remove extension
    pt_file = os.path.join(pt_dir, f"{song_name}.pt")
    if not os.path.exists(pt_file):
        return None

    # ``or ""`` also covers keys that are present but hold None.
    tlyric_ = item.get('tlyric') or ""
    lyric_ = item.get('lyric') or ""
    lyric = tlyric_ if len(tlyric_) > len(lyric_) else lyric_
    lyrics_ts = _parse_lyric_with_timestamps(lyric)
    if not lyrics_ts:  # Skip if no lyrics
        return None

    # Load tokens only once the item is known to be usable.
    audio_tokens = _load_audio_tokens(pt_file)
    audio_end_s = len(audio_tokens) / TOKEN_PER_SECOND

    rounds = []

    # Opening user message: song metadata plus the full lyrics, asking for
    # the intro first.
    intro_text = (
        f"请生成一首歌曲,歌名为《{item.get('name', '')}》,风格是{item.get('style','')}"
        f",情绪为{item.get('emotion','')},节奏:{item.get('rhythm','')},"
        f"{item.get('description','')},由{item.get('singer','')}演唱,语言:{item.get('lang','')}。"
        f"歌词如下:" + " ".join([text for _, text in lyrics_ts])
        + "接下来我会逐句告诉你需要生成歌曲片段的歌词,\n请先生成前奏"
    )
    rounds.append({"role": "user", "content": intro_text})
    # Intro tokens: from the start of the audio to the first lyric line.
    rounds.append({"role": "assistant", "content": _get_token_slice(audio_tokens, 0, lyrics_ts[0][0])})

    # One round per lyric line, each spanning up to the next line's start.
    # The last line is handled separately below.
    for (start_s, text), (end_s, _) in zip(lyrics_ts, lyrics_ts[1:]):
        rounds.append({"role": "user", "content": text})
        rounds.append({"role": "assistant", "content": _get_token_slice(audio_tokens, start_s, end_s)})

    # Final round: last lyric line through to the end of the audio.
    last_start_s, last_text = lyrics_ts[-1]
    rounds.append({"role": "user", "content": f"请生成歌词{last_text}以及歌曲结尾"})
    rounds.append({"role": "assistant", "content": _get_token_slice(audio_tokens, last_start_s, audio_end_s)})

    return rounds


# ===== External Interface =====

def get_convert_convs(dataset: list[dict], pt_dir: str, save_path: str) -> None:
    """Convert song records to conversations and write them as JSON Lines.

    Each item that ``_process_item`` can convert is written to ``save_path``
    as one line of the form ``{"messages": [...]}``; other items are skipped.

    Args:
        dataset: Song records to convert.
        pt_dir: Directory containing the per-song ``.pt`` token files.
        save_path: Output file path; overwritten if it exists.
    """
    with open(save_path, "w", encoding="utf-8") as fout:
        for item in tqdm(dataset, desc="Converting convs"):
            rounds = _process_item(item, pt_dir)
            if not rounds:
                continue
            fout.write(json.dumps({"messages": rounds}, ensure_ascii=False) + "\n")