yuzzznh committed on
Commit
81266fc
·
verified ·
1 Parent(s): 0bcb609

Create make_dataset.py

Browse files
Files changed (1) hide show
  1. make_dataset.py +211 -0
make_dataset.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import re
3
+ import syllables
4
+ import csv
5
+ import json
6
+ import random
7
+
8
# Output locations for the English train/test splits.
en_train_output_file = 'en_train.jsonl'
en_test_output_file = 'en_test.jsonl'
# The Korean outputs get their own names: assigning them to the en_* names
# again would silently replace the English paths defined just above.
kr_train_output_file = 'kr_train.jsonl'
kr_test_output_file = 'kr_test.jsonl'
# Input CSV files holding the English and the Korean lyrics.
en_file_path = 'spotify_millsongdata.csv'
kr_file_path = 'kr_lyrics_data.csv'
14
+
15
# Marker string; nothing in the visible part of this file reads it.
ending = "Suggestion:"

# Prompt template for one training example. It is filled in with
# str.format() and expects the fields {title}, {syllables} and {lyric}.
# NOTE(review): there is no {language} field, so a language label passed
# to format() has no effect on the prompt - confirm that is intended.
frame_prompt = (
    "Suggest a single line of Korean lyric that matches with given syllables,lyrics, and title.\n"
    "Ensure to avoid repeating previous lyrics. Focus on creative and original expression.\n"
    "Match the length of the sentence to the syllables I provide as closely as possible.\n"
    "For example, if Syllables: 7 given, you should write a 6~8 letter korean sentence.\n"
    "Your answer should feel like soft, trendy K-pop lyrics without any profanity.\n"
    "Your answer should be short, and only composed with a single sentence.\n"
    "Answer with a single line of lyrics you created, and nothing else.\n"
    "\n"
    "Here,\n"
    "Title: {title}\n"
    "Syllables: {syllables}\n"
    "Previous Lyrics: {lyric}\n"
    "\n"
    "Your korean lyric that should be added to the previous lyrics:\n"
)

# Language labels; the same strings identify_language() returns for
# recognised input.
languages = [
    "Korean",
    "English",
    "Korean with English",
]
33
+
34
# Fixed syllable counts for common English contractions. Keys are lowercase
# because count_english_syllable() lowercases a word before looking it up
# here. The groups below keep the original key order.
contractions_syllables = {
    contraction: syllable_total
    for syllable_total, group in (
        (1, ("you're", "i'm", "we're", "they're",
             "you've", "i've", "we've", "they've",
             "can't", "won't", "don't")),
        (2, ("didn't", "isn't", "aren't", "wasn't", "weren't",
             "couldn't", "shouldn't", "wouldn't",
             "hasn't", "haven't", "hadn't")),
        (1, ("it's", "that's", "there's", "here's", "what's", "let's")),
    )
    for contraction in group
}
64
+
65
def load_data(csv_file_path):
    """Read a CSV file into a list of row dictionaries.

    Args:
        csv_file_path: Path of the CSV file; its first row is the header.

    Returns:
        A list with one dict per data row, keyed by the header names.
    """
    # newline='' is what the csv module asks for: without it, line breaks
    # inside quoted fields (multi-line lyrics) are rewritten while reading.
    # The explicit encoding keeps non-ASCII lyrics readable on platforms
    # whose default encoding is not UTF-8.
    with open(csv_file_path, 'r', encoding='utf-8', newline='') as csv_file:
        return list(csv.DictReader(csv_file))
72
+
73
+
74
def convert_data(datas):
    """Turn song rows into chat-style training examples.

    Args:
        datas: Iterable of mappings that each provide a 'text' entry (the
            lyrics) and a 'song' entry (the title).

    Returns:
        A list with one {"messages": [...]} dict per row: a user message
        asking for lyrics that fit the title and the total syllable count,
        followed by a message carrying the original lyrics.
    """
    def _to_example(row):
        # Read the row in the same order as before: text, song, then count.
        song_text = row['text']
        song_title = row['song']
        syllable_total = count_syllables(song_text)
        request = f"Given a syllable structure and title of the song, write English lyrics that match it. title: {song_title}, syllable: {syllable_total}"
        # NOTE(review): the answer is stored under the "system" role;
        # confirm this is what the training pipeline expects.
        return {
            "messages": [
                {"role": "user", "content": request},
                {"role": "system", "content": song_text},
            ]
        }

    return [_to_example(row) for row in datas]
90
+
91
def split_korean_english(word):
    """Separate the Hangul and the Latin/digit characters of a token.

    Args:
        word: The token to split.

    Returns:
        A ``(korean_part, english_part)`` tuple of strings. ``korean_part``
        joins every run of precomposed Hangul syllables (U+AC00-U+D7A3);
        ``english_part`` joins every run of ASCII letters and digits. Either
        part is an empty string when the token has no such characters.
    """
    korean_part = ''.join(re.findall(r'[\uAC00-\uD7A3]+', word))
    # An apostrophe between two alphanumeric runs stays part of the word, so
    # "don't" is kept as "don't" rather than collapsed to "dont". Without
    # this the contraction table used when counting English syllables could
    # never match. Leading or trailing apostrophes are still dropped.
    english_part = ''.join(
        re.findall(r"[a-zA-Z0-9]+(?:'[a-zA-Z0-9]+)*", word)
    )
    return korean_part, english_part
95
+
96
def identify_language(word):
    """Label a piece of text by the scripts it contains.

    Args:
        word: The text to classify.

    Returns:
        "Korean with English" when both Hangul and Latin/digit characters
        are present, "Korean" or "English" when only one kind is present,
        and "Unknown" when neither is.
    """
    hangul, latin = split_korean_english(word)

    # Guard clauses, from the least to the most content found.
    if not hangul and not latin:
        return "Unknown"
    if not latin:
        return "Korean"
    if not hangul:
        return "English"
    return "Korean with English"
107
+
108
def count_syllables(text):
    """Return the total number of syllables in ``text``.

    Every whitespace-separated word is split into its Hangul part and its
    Latin/digit part; each non-empty part is counted with the matching
    helper and the counts are summed over all lines.

    Args:
        text: Lyrics, possibly spanning several newline-separated lines.

    Returns:
        The total syllable count as an int; 0 for empty or whitespace-only
        text.
    """
    total = 0
    for line in text.split('\n'):
        # str.split() without arguments never yields empty strings, so no
        # extra emptiness check is needed for the words.
        for word in line.split():
            korean_part, english_part = split_korean_english(word)
            if korean_part:
                total += count_korean_syllable(korean_part)
            if english_part:
                total += count_english_syllable(english_part)
    return total
136
+
137
def is_korean(word):
    """Tell whether ``word`` has at least one precomposed Hangul syllable.

    Args:
        word: The text to inspect.

    Returns:
        True if any character lies in U+AC00-U+D7A3, otherwise False.
    """
    hangul_match = re.search(r'[\uAC00-\uD7A3]', word)
    return hangul_match is not None
139
+
140
def count_english_syllable(word):
    """Count the syllables of an English word.

    Non-word characters at both ends of the word are removed first. A word
    listed in ``contractions_syllables`` (compared in lowercase) uses the
    count stored there; any other word is passed to ``syllables.estimate``.

    Args:
        word: The English token to count.

    Returns:
        The syllable count for the word.
    """
    trimmed = re.sub(r'^\W+|\W+$', '', word)
    lookup_key = trimmed.lower().strip()
    try:
        return contractions_syllables[lookup_key]
    except KeyError:
        return syllables.estimate(trimmed)
148
+
149
def count_korean_syllable(word):
    """Count the syllables of a Hangul string.

    The count is simply the length of ``word``, i.e. one syllable per
    character.

    Args:
        word: The string to measure.

    Returns:
        ``len(word)``.
    """
    syllable_total = len(word)
    return syllable_total
151
+
152
def save_data(data, output_file):
    """Write every item of ``data`` to ``output_file``, one item per line.

    Args:
        data: Iterable of items. Strings are written unchanged; any other
            item is serialised with ``json.dumps``.
        output_file: Path of the file to create or overwrite (UTF-8).

    Raises:
        TypeError: If a non-string item cannot be serialised to JSON.
    """
    with open(output_file, 'w', encoding='utf-8') as file:
        for item in data:
            # str() of a dict gives a Python repr (single quotes), which a
            # JSON Lines reader cannot parse, so structured items are
            # JSON-encoded instead. ensure_ascii=False keeps Korean text
            # readable in the file.
            if isinstance(item, str):
                text = item
            else:
                text = json.dumps(item, ensure_ascii=False)
            file.write(f"{text}\n")
156
+
157
+
158
def _write_line_examples(songs, outfile):
    """Write one JSON line per usable lyric line of every song in ``songs``."""
    for title, lyric in zip(songs['title'], songs['lyric']):
        if pd.isna(title) or pd.isna(lyric):
            continue

        # Lines already emitted for this song; used as the prompt context.
        completed_lyric = ""
        for line in lyric.split('\n'):
            syllable = count_syllables(line)
            if not syllable:
                # Nothing countable on this line (e.g. blank), so skip it.
                continue
            prompt = frame_prompt.format(title=title, lyric=completed_lyric, syllables=syllable)
            # NOTE(review): the target line is stored under the "system"
            # role; confirm this is what the training pipeline expects.
            example = {"messages": [{"role": "user", "content": prompt}, {"role": "system", "content": line}]}
            outfile.write(json.dumps(example) + "\n")
            # NOTE(review): assumes skipped lines are not added to the
            # context either; the original nesting of this statement should
            # be confirmed.
            completed_lyric += line + '\n'


def generate_kr_lyrics_data(infile, trainfile, testfile, random_state=None):
    """Build line-by-line lyric completion examples from a lyrics CSV.

    Songs from 2010 onwards are shuffled. The first 40% of them feed the
    training file and the following 10% feed the test file. For each song,
    every line with a non-zero syllable count becomes one JSON line whose
    prompt contains the title, the syllable count of the line and the lines
    written so far for that song.

    Args:
        infile: CSV path; must provide 'title', 'lyric' and 'year' columns.
        trainfile: Path of the JSON Lines training file to write.
        testfile: Path of the JSON Lines test file to write.
        random_state: Optional seed passed to ``DataFrame.sample`` so the
            shuffle can be reproduced. The default ``None`` keeps the
            shuffle unseeded.
    """
    df = pd.read_csv(infile, usecols=['title', 'lyric', 'year'])
    df = df[df['year'] >= 2010]

    # Shuffle the songs.
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Rows [0, 40%) go to train and rows [40%, 50%) to test; the remaining
    # half is not used.
    # NOTE(review): an earlier comment described this as an 8:2 split, which
    # these bounds do not produce - confirm which one is intended.
    train_end = int(len(df) * 0.4)
    test_end = int(len(df) * 0.5)
    train_df = df[:train_end]
    test_df = df[train_end:test_end]

    with open(trainfile, 'w', encoding='utf-8') as train_outfile, open(testfile, 'w', encoding='utf-8') as test_outfile:
        _write_line_examples(train_df, train_outfile)
        _write_line_examples(test_df, test_outfile)
204
+
205
# Build the Korean train/test files as soon as this file is executed.
generate_kr_lyrics_data(
    infile=kr_file_path,
    trainfile='train.jsonl',
    testfile='test.jsonl',
)

# Small sample text for trying the syllable counter by hand, e.g.
# print(count_syllables(ex_lyric))
ex_lyric = "\nexample\nhello\nworld\n"