| import os |
| import re |
| import json |
| import copy |
| from tqdm import tqdm |
| from my_tool import dict_sort_print |
| from collections import defaultdict |
| from convert_convs import _parse_lyric_with_timestamps |
|
|
| |
|
|
def _parse_lyrics(text:str) -> dict:
    """Split raw lyric text into metadata, lyric lines and their timestamps.

    Every line of ``text`` is handed to ``_parse_lyric_with_timestamps`` and the
    result is iterated as ``(time, content)`` pairs. Each pair is classified,
    in this order:

    1. Content containing a colon delimiter is stored in ``lyrics_meta`` as
       ``key -> value``, split at the earliest delimiter in the content.
    2. Any other content stamped ``"00:00.00"`` is dropped.
    3. Content containing ``"/"`` seen before any lyric line was kept is dropped.
    4. Empty content becomes a ``"<nop>"`` placeholder, but never as the first
       entry and never twice in a row.
    5. Everything else is appended as a lyric line, unless its timestamp equals
       the most recently stored timestamp.

    Args:
        text: Raw lyric text with one entry per line.

    Returns:
        A dict with the keys ``lyrics_meta`` (dict of str to str), ``lyrics``
        (list of lyric lines / ``"<nop>"`` placeholders) and ``lyrics_time``
        (list of timestamps, index-aligned with ``lyrics``).
    """
    # NOTE(review): the second delimiter looks like a mis-encoded full-width
    # colon (U+FF1A); it is kept verbatim here - confirm the file's encoding.
    delimiters = (":", "οΌ")

    meta = {}
    lyrics = []
    times = []

    for seg in text.split("\n"):
        for stamp, content in _parse_lyric_with_timestamps(seg):
            found = [(content.find(d), d) for d in delimiters]
            found = [(pos, d) for pos, d in found if pos != -1]
            if found:
                # Split at whichever delimiter comes first and skip its whole
                # length, so a multi-character delimiter never leaks into the
                # value and a later ASCII colon cannot swallow the real key.
                pos, delim = min(found, key=lambda hit: hit[0])
                meta[content[:pos].strip()] = content[pos + len(delim):].strip()
                continue

            if stamp == "00:00.00":
                continue

            if not lyrics and "/" in content:
                continue

            if len(content) == 0:
                # Collapse blank lines into a single placeholder between lyrics.
                if lyrics and lyrics[-1] != "<nop>":
                    lyrics.append("<nop>")
                    times.append(stamp)
                continue

            # NOTE(review): `stamp != "<nop>"` is kept from the original; it only
            # matters if the parser can yield "<nop>" as a timestamp - confirm.
            if times and times[-1] == stamp and stamp != "<nop>":
                continue

            lyrics.append(content)
            times.append(stamp)

    return {
        "lyrics_meta": meta,
        "lyrics": lyrics,
        "lyrics_time": times,
    }
|
|
| |
|
|
| def _count_ch_nan(text:str): |
| """Count the number of Chinese and other non-English characters in a string""" |
| ch_num = 0 |
| nan_num = 0 |
| nan = "" |
| for c in text: |
| if '\u4e00' <= c <= '\u9fff': |
| ch_num += 1 |
| elif ('a' <= c <= 'z') or ('A' <= c <= 'Z') or len(c.strip()) == 0: |
| continue |
| else: |
| nan_num += 1 |
| nan += c |
| |
| |
| return ch_num, nan_num |
|
|
def _lang_decide(lyrics:list[str], val_limit:int=5, word_limit=3) -> str:
    """Classify the language of a list of lyric lines.

    ``"<nop>"`` placeholder lines are skipped. For every other line the symbols
    and digits in the character class below are replaced by spaces, then the
    line is tallied:

    - more than ``word_limit`` "other" characters (as counted by
      ``_count_ch_nan``): the line counts as unknown and is not examined further;
    - more than ``word_limit`` Chinese characters: the line counts as Chinese;
    - more than ``word_limit`` whitespace-separated tokens left after removing
      the Chinese characters: the line counts as English.

    A single line may count as both Chinese and English.

    Args:
        lyrics: Lyric lines to classify.
        val_limit: A language is reported only if more than this many lines
            were counted for it.
        word_limit: A line is counted only if it has more than this many
            characters / words of the respective kind.

    Returns:
        ``"nan"`` (unknown script), ``"ez"`` (both Chinese and English),
        ``"zh"``, ``"en"``, or ``"instrument"`` when nothing passes ``val_limit``.
    """
    # NOTE(review): this character class contains what look like mis-encoded
    # full-width punctuation marks; it is reproduced verbatim - confirm against
    # the file's intended encoding.
    symbols = re.compile(r"[''οΏ₯Β·β²Β΄οΌοΌοΌγοΌ""!@#$%^&*()?.'/,=+_ββ οΌβ¦γγ<>0-9ο½β»~;οΌγ»\"γβο½β³γγοΌγγβ{}\[\]-]")
    chinese = re.compile(r'[\u4e00-\u9fff]+')

    ch_lyrics = 0
    en_lyrics = 0
    nan_lyrics = 0
    for lyric in lyrics:
        if lyric.strip() == "<nop>":
            continue
        lyric = symbols.sub(" ", lyric)
        ch_num, nan_num = _count_ch_nan(lyric)

        if nan_num > word_limit:
            nan_lyrics += 1
            continue
        if ch_num > word_limit:
            ch_lyrics += 1

        # Drop the Chinese characters and count what remains as English words.
        # split() without a separator ignores runs of whitespace, so the blanks
        # left behind by stripped symbols are not mistaken for words.
        en_num = len(chinese.sub('', lyric).split())
        if en_num > word_limit:
            en_lyrics += 1

    if nan_lyrics > val_limit:
        return "nan"
    if ch_lyrics > val_limit and en_lyrics > val_limit:
        return "ez"
    if ch_lyrics > val_limit:
        return "zh"
    if en_lyrics > val_limit:
        return "en"
    return "instrument"
|
|
| |
|
|
def get_convert_lyrics(dataset:list[dict], save_path:str, dir:str, src_subfix:str=""):
    """Parse the lyrics of every record, tag its language and dump it as JSON lines.

    Records whose ``has_lyric`` field is falsy are skipped. For the others the
    ``lyric`` text (or ``tlyric`` when ``lyric`` is the empty string) is parsed
    with ``_parse_lyrics`` and classified with ``_lang_decide``. A copy of the
    record then loses ``artists``, ``lyric``, ``tlyric`` and ``has_lyric``,
    gains ``lyric_lang`` and the parsed fields, gets ``src_subfix`` appended to
    ``source``, and is written to ``save_path`` as one JSON object per line.
    The input records themselves are not modified.

    Args:
        dataset: Records to convert.
        save_path: Output file, overwritten, UTF-8 encoded.
        dir: Not used by this function.
        src_subfix: Suffix appended to each record's ``source`` value.

    Returns:
        ``(new_dataset, unmatch)``: the converted records and a list that this
        function never fills, so it is always empty.
    """
    dropped_keys = ('artists', 'lyric', 'tlyric', 'has_lyric')
    new_dataset = []
    unmatch = []
    lang_count = defaultdict(int)

    with open(save_path, 'w', encoding='utf-8') as sink:
        for source_record in tqdm(dataset, desc="Converting Lyrics"):
            if not source_record['has_lyric']:
                continue

            # Work on a private copy so the caller's record stays untouched.
            record = copy.deepcopy(source_record)

            # Prefer the original lyric; use the translated one only when the
            # original is an empty string.
            raw_lyric = record['lyric'] if record['lyric'] != "" else record['tlyric']
            parsed = _parse_lyrics(raw_lyric)

            lang = _lang_decide(parsed['lyrics'])
            lang_count[lang] += 1

            for stale in dropped_keys:
                del record[stale]

            record['lyric_lang'] = lang
            record['source'] += src_subfix
            record.update(parsed)

            new_dataset.append(record)
            json.dump(record, sink, ensure_ascii=False)
            sink.write("\n")

    # Hand the per-language tally to the project's printing helper.
    dict_sort_print(lang_count)
    return new_dataset, unmatch
|
|
def get_match_music(music_data:list[dict], lyric_data:list[dict]):
    """Split music records into those with and without a matching lyric record.

    Each lyric record is indexed under the file name
    ``"<name> - <artist>.mp3"``, where spaces are removed from ``name`` only.
    When several lyric records produce the same file name, the last one wins.
    A music record matches when the basename of its ``path`` equals one of
    those file names.

    Args:
        music_data: Records with a ``path`` key.
        lyric_data: Records with ``name`` and ``artist`` keys.

    Returns:
        ``(matches, unmatches)``. ``matches`` holds one shallow copy of the
        matching lyric record per matched music record, with ``path`` set to
        that music record's path. ``unmatches`` holds the music records that
        found no lyric record. The records in ``lyric_data`` are not modified.
    """
    name_map = {}
    for ele in tqdm(lyric_data, desc="Existing Lyrics"):
        name = ele['name'].replace(" ", "")
        name_map[f"{name} - {ele['artist']}.mp3"] = ele

    matches = []
    unmatches = []
    for ele in tqdm(music_data, desc="Check Matching"):
        path = ele['path']
        meta = name_map.get(os.path.basename(path))
        if meta is None:
            unmatches.append(ele)
        else:
            # Copy before attaching the path: two files with the same basename
            # must not end up sharing (and overwriting) a single dict.
            matches.append({**meta, 'path': path})
    return matches, unmatches
|
|