|
|
import re
|
|
|
|
|
|
def contains_chinese(text):
    """Return True if *text* contains at least one common Chinese character.

    A character counts as Chinese when it lies in the Unicode range
    U+4E00-U+9FFF, which holds the commonly used Han characters.
    """
    # A match object is always truthy, so bool() maps "found" / "not found"
    # onto True / False.
    return bool(re.search(r'[\u4e00-\u9fff]', text))
|
|
|
|
|
|
def process_lyrics(text):
    """Clean a '/'-separated lyrics string and return the kept lines.

    Processing steps:
    1. Split the text on '/'.
    2. Strip surrounding whitespace and drop empty pieces.
    3. Drop pieces that contain no Chinese character (treated as English).
    4. Drop pieces shorter than 3 characters after stripping.
    5. Remove duplicates while keeping first-occurrence order.

    Returns a list of the surviving lyric strings.
    """
    stripped = (piece.strip() for piece in text.split('/'))
    # The checks run in the same order as the steps above; `and`
    # short-circuits, so empty pieces never reach contains_chinese().
    kept = (
        lyric
        for lyric in stripped
        if lyric and contains_chinese(lyric) and len(lyric) >= 3
    )
    # dict keys are unique and remember insertion order, so this removes
    # duplicates while preserving the order of first appearance.
    return list(dict.fromkeys(kept))
|
|
|
|
|
|
def main():
    """Read the raw lyrics file, clean it, and write the result.

    Reads ``data/lyrics.txt`` (UTF-8), passes its content through
    process_lyrics(), joins the kept lines with '/', writes them to
    ``data/processed_data.txt`` (UTF-8), and prints a completion message.

    Raises:
        OSError: if the input file cannot be read or the output file
            cannot be written (e.g. FileNotFoundError).
    """
    # Imported locally so this function is self-contained.
    from pathlib import Path

    # Build paths with pathlib instead of 'data\lyrics.txt': the backslash
    # form relies on invalid escape sequences ('\l', '\p') and only works
    # as a directory separator on Windows.
    data_dir = Path('data')
    input_filename = data_dir / 'lyrics.txt'
    output_filename = data_dir / 'processed_data.txt'

    content = input_filename.read_text(encoding='utf-8')

    processed = process_lyrics(content)

    # Re-join with the same '/' delimiter used by the input format.
    output_content = '/'.join(processed)

    output_filename.write_text(output_content, encoding='utf-8')

    print(f'处理完成,结果保存在 {output_filename}')
|
|
|
|
|
|
|
|
|
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|
|
|
|