Spaces:
Build error
Build error
| import pandas as pd | |
| import re | |
| import syllables | |
| import csv | |
| import json | |
| import random | |
# Output locations for generated JSONL datasets, one train/test pair per
# language.
en_train_output_file = 'en_train.jsonl'
en_test_output_file = 'en_test.jsonl'
# These two used to re-bind the en_* names above, which silently pointed the
# English outputs at the Korean files and left no kr_* names defined.
kr_train_output_file = 'kr_train.jsonl'
kr_test_output_file = 'kr_test.jsonl'

# Input CSV corpora.
en_file_path = 'spotify_millsongdata.csv'
kr_file_path = 'kr_lyrics_data.csv'

# Marker string; not referenced in the visible part of this file.
ending = "Suggestion:"

# Prompt template filled with str.format(); it has exactly three
# placeholders: {title}, {syllables} and {lyric}.
frame_prompt = """Suggest a single line of Korean lyric that matches with given syllables,lyrics, and title.
Ensure to avoid repeating previous lyrics. Focus on creative and original expression.
Match the length of the sentence to the syllables I provide as closely as possible.
For example, if Syllables: 7 given, you should write a 6~8 letter korean sentence.
Your answer should feel like soft, trendy K-pop lyrics without any profanity.
Your answer should be short, and only composed with a single sentence.
Answer with a single line of lyrics you created, and nothing else.
Here,
Title: {title}
Syllables: {syllables}
Previous Lyrics: {lyric}
Your korean lyric that should be added to the previous lyrics:
"""

# Labels returned by identify_language() (which can also return "Unknown").
languages = ["Korean", "English", "Korean with English"]
# Fixed syllable counts for common English contractions, keyed by their
# lower-case spelling with an ASCII apostrophe. count_english_syllable()
# consults this table before falling back to syllables.estimate().
# The three groups keep the original insertion order.
contractions_syllables = {
    **dict.fromkeys(
        (
            "you're", "i'm", "we're", "they're",
            "you've", "i've", "we've", "they've",
            "can't", "won't", "don't",
        ),
        1,
    ),
    **dict.fromkeys(
        (
            "didn't", "isn't", "aren't", "wasn't", "weren't",
            "couldn't", "shouldn't", "wouldn't",
            "hasn't", "haven't", "hadn't",
        ),
        2,
    ),
    **dict.fromkeys(
        ("it's", "that's", "there's", "here's", "what's", "let's"),
        1,
    ),
}
def load_data(csv_file_path):
    """Read a CSV file and return its rows as a list of dicts.

    Each row is keyed by the column names found in the file's header line
    (``csv.DictReader`` semantics).

    Args:
        csv_file_path: Path of the CSV file to read.

    Returns:
        A list with one dict per data row, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline='' is what the csv module requires so that line breaks inside
    # quoted fields (multi-line lyrics) are kept intact. The encoding is
    # explicit so the result does not depend on the platform default;
    # 'utf-8-sig' also drops a leading BOM instead of gluing it to the first
    # column name.
    with open(csv_file_path, 'r', encoding='utf-8-sig', newline='') as csv_file:
        return list(csv.DictReader(csv_file))
def convert_data(datas):
    """Turn song rows into chat-style training examples.

    Args:
        datas: Iterable of mappings that each provide a ``'text'`` entry
            (the lyrics) and a ``'song'`` entry (the title).

    Returns:
        A list of ``{"messages": [...]}`` dicts, one per input row. The
        first message is the user prompt built from the title and the
        value returned by ``count_syllables``; the second carries the
        lyrics.
    """

    def _to_example(row):
        words = row['text']
        name = row['song']
        total = count_syllables(words)
        prompt = f"Given a syllable structure and title of the song, write English lyrics that match it. title: {name}, syllable: {total}"
        # NOTE(review): the answer is emitted with role "system"; chat
        # fine-tuning formats usually expect "assistant" here — confirm.
        return {
            "messages": [
                {"role": "user", "content": prompt},
                {"role": "system", "content": words},
            ]
        }

    return [_to_example(row) for row in datas]
def split_korean_english(word):
    """Split a token into its Hangul part and its ASCII alphanumeric part.

    All runs of precomposed Hangul syllables (U+AC00-U+D7A3) are joined into
    the first result; all runs of ASCII letters/digits are joined into the
    second. An apostrophe is kept only when it sits between two alphanumeric
    characters, so contractions such as "don't" survive intact and can be
    matched against the contractions table, while quoting or trailing
    apostrophes are still dropped. Every other character is discarded.

    Args:
        word: The token to split.

    Returns:
        A ``(korean_part, english_part)`` tuple of strings; either may be
        empty.
    """
    korean_part = ''.join(re.findall(r'[\uAC00-\uD7A3]+', word))
    # The optional group admits word-internal apostrophes only; previously
    # "don't" was reduced to "dont", which no contraction key can match.
    english_part = ''.join(
        re.findall(r"[a-zA-Z0-9]+(?:'[a-zA-Z0-9]+)*", word)
    )
    return korean_part, english_part
def identify_language(word):
    """Classify text by the scripts that ``split_korean_english`` finds in it.

    Returns:
        ``"Korean with English"`` when both parts are non-empty,
        ``"Korean"`` or ``"English"`` when only that part is non-empty, and
        ``"Unknown"`` when neither is.
    """
    hangul, latin = split_korean_english(word)
    if not hangul:
        return "English" if latin else "Unknown"
    return "Korean with English" if latin else "Korean"
def count_syllables(text):
    """Return the total syllable count of ``text``.

    The text is split on whitespace; for every token the Hangul part is
    counted with ``count_korean_syllable`` and the ASCII part with
    ``count_english_syllable``, and all counts are summed.

    Args:
        text: Lyrics, possibly spanning several lines.

    Returns:
        The summed syllable count; 0 when the text contains no countable
        characters.
    """
    total = 0
    # str.split() with no argument already treats newlines as separators and
    # never yields empty tokens, so no per-line pass or empty-word check is
    # needed. The per-line "n-n-n" breakdown string that used to be built
    # here was never returned and has been dropped.
    for word in text.split():
        korean_part, english_part = split_korean_english(word)
        if korean_part:
            total += count_korean_syllable(korean_part)
        if english_part:
            total += count_english_syllable(english_part)
    return total
def is_korean(word):
    """Return True if ``word`` has at least one Hangul syllable (U+AC00-U+D7A3)."""
    hangul_match = re.search(r'[\uAC00-\uD7A3]', word)
    return hangul_match is not None
def count_english_syllable(word):
    """Return the syllable count of a single English word.

    Leading and trailing non-word characters are removed first. If the
    lower-cased result is a key of ``contractions_syllables`` that fixed
    count is returned; otherwise the count comes from
    ``syllables.estimate``.
    """
    # One pass is enough: stripping edge non-word characters is idempotent,
    # and it leaves no edge whitespace for a later strip() to remove.
    trimmed = re.sub(r'^\W+|\W+$', '', word)
    key = trimmed.lower()
    if key in contractions_syllables:
        return contractions_syllables.get(key)
    return syllables.estimate(trimmed)
def count_korean_syllable(word):
    """Return the syllable count of a Hangul string: one per character."""
    character_total = len(word)
    return character_total
def save_data(data, output_file):
    """Write ``data`` to ``output_file`` as JSON Lines, one record per line.

    Args:
        data: Iterable of records. Strings are written unchanged (they are
            taken to be already-serialized lines); every other value is
            serialized with ``json.dumps``.
        output_file: Path of the file to create or overwrite.

    Raises:
        TypeError: If a non-string record is not JSON-serializable.
        OSError: If the file cannot be opened for writing.
    """
    # Explicit UTF-8 keeps non-ASCII lyrics portable across platforms.
    with open(output_file, 'w', encoding='utf-8') as file:
        for line in data:
            # str(dict) yields a Python repr (single quotes), which is not
            # valid JSON; serialize structured records properly instead.
            if isinstance(line, str):
                record = line
            else:
                record = json.dumps(line, ensure_ascii=False)
            file.write(f"{record}\n")
def _write_lyric_examples(songs_df, outfile):
    """Write one chat example per countable lyric line of each song in ``songs_df``."""
    for _, row in songs_df.iterrows():
        title = row['title']
        lyric = row['lyric']
        # Rows with a missing title or lyric cannot form a prompt.
        if pd.isna(title) or pd.isna(lyric):
            continue
        completed_lyric = ""
        for line in lyric.split('\n'):
            syllable = count_syllables(line)
            if not syllable:
                # Lines with no countable syllables produce no example.
                continue
            prompt = frame_prompt.format(title=title, lyric=completed_lyric, syllables=syllable)
            # NOTE(review): the target line is emitted with role "system";
            # chat fine-tuning formats usually expect "assistant" — confirm.
            outfile.write(json.dumps({"messages": [{"role": "user", "content": prompt}, {"role": "system", "content": line}]}) + "\n")
            # NOTE(review): the scraped source lost its indentation; this
            # accumulation is assumed to apply only to lines that produced
            # an example — confirm against the original file.
            completed_lyric += line + '\n'


def generate_kr_lyrics_data(infile, trainfile, testfile, *, train_frac=0.4, test_end_frac=0.5, seed=None):
    """Build train/test JSONL files of next-lyric-line examples from a CSV.

    Reads the ``title``, ``lyric`` and ``year`` columns, keeps songs with
    ``year >= 2010``, shuffles them, and slices the shuffled rows:
    ``[0, train_frac)`` becomes the train set and
    ``[train_frac, test_end_frac)`` the test set. With the defaults that is
    40% and 10% of the rows (an 8:2 train:test ratio); the remaining rows
    are not used. For every song, each lyric line with a non-zero syllable
    count yields one example whose prompt contains the title, the line's
    syllable count and the previously emitted lines.

    Args:
        infile: Path of the input CSV.
        trainfile: Path of the train JSONL file to write.
        testfile: Path of the test JSONL file to write.
        train_frac: Fraction of the shuffled rows used for training.
        test_end_frac: Fraction at which the test slice ends.
        seed: Optional ``random_state`` for the shuffle, for reproducible
            splits. ``None`` keeps the shuffle unseeded.

    Raises:
        ValueError: If the fractions are not ordered within [0, 1].
    """
    if not 0 <= train_frac <= test_end_frac <= 1:
        raise ValueError("expected 0 <= train_frac <= test_end_frac <= 1")

    df = pd.read_csv(infile, usecols=['title', 'lyric', 'year'])
    df = df[df['year'] >= 2010]
    # Shuffle so the split is not biased by the file's row order.
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    # Both values are end indices into the shuffled frame, not sizes.
    train_end = int(len(df) * train_frac)
    test_end = int(len(df) * test_end_frac)
    train_df = df[:train_end]
    test_df = df[train_end:test_end]

    # The per-line language label computed here previously never reached
    # the output (the prompt template has no {language} placeholder), so it
    # is no longer computed.
    with open(trainfile, 'w', encoding='utf-8') as train_outfile, open(testfile, 'w', encoding='utf-8') as test_outfile:
        _write_lyric_examples(train_df, train_outfile)
        _write_lyric_examples(test_df, test_outfile)
# Runs at module load: builds the Korean train/test JSONL files from the
# Korean lyrics CSV.
generate_kr_lyrics_data(
    infile=kr_file_path,
    trainfile='train.jsonl',
    testfile='test.jsonl',
)

# Tiny sample for spot-checking count_syllables by hand.
ex_lyric = """
example
hello
world
"""
# print(count_syllables(ex_lyric))