Spaces:

2024-FriendliAI-Hackathon-Team5
/

5

Build error

App Files Files Community

yuzzznh committed on May 25, 2024

Commit

81266fc

verified ·

1 Parent(s): 0bcb609

Create make_dataset.py

Browse files

Files changed (1) hide show

make_dataset.py +211 -0

make_dataset.py ADDED Viewed

	@@ -0,0 +1,211 @@

+import pandas as pd
+import re
+import syllables
+import csv
+import json
+import random
# Output paths for the English and Korean chat-format datasets.
en_train_output_file = 'en_train.jsonl'
en_test_output_file = 'en_test.jsonl'
# The Korean paths used to be assigned to the en_* names above, which
# overwrote the English paths and left no Korean names defined at all.
kr_train_output_file = 'kr_train.jsonl'
kr_test_output_file = 'kr_test.jsonl'

# Source CSV files.
en_file_path = 'spotify_millsongdata.csv'
kr_file_path = 'kr_lyrics_data.csv'

# Not referenced by any function visible in this file.
ending = "Suggestion:"

# Prompt template filled per lyric line via str.format with the fields
# ``title``, ``syllables`` and ``lyric`` (the lyric lines written so far).
frame_prompt = """Suggest a single line of Korean lyric that matches with given syllables,lyrics, and title.
Ensure to avoid repeating previous lyrics. Focus on creative and original expression.
Match the length of the sentence to the syllables I provide as closely as possible.
For example, if Syllables: 7 given, you should write a 6~8 letter korean sentence.
Your answer should feel like soft, trendy K-pop lyrics without any profanity.
Your answer should be short, and only composed with a single sentence.
Answer with a single line of lyrics you created, and nothing else.
Here,
Title: {title}
Syllables: {syllables}
Previous Lyrics: {lyric}
Your korean lyric that should be added to the previous lyrics:
"""

# Labels returned by identify_language (which can additionally return "Unknown").
languages = ["Korean", "English", "Korean with English"]

# Fixed syllable counts for common English contractions, consulted by
# count_english_syllable before falling back to the estimator.
contractions_syllables = {
    "you're": 1,
    "i'm": 1,
    "we're": 1,
    "they're": 1,
    "you've": 1,
    "i've": 1,
    "we've": 1,
    "they've": 1,
    "can't": 1,
    "won't": 1,
    "don't": 1,
    "didn't": 2,
    "isn't": 2,
    "aren't": 2,
    "wasn't": 2,
    "weren't": 2,
    "couldn't": 2,
    "shouldn't": 2,
    "wouldn't": 2,
    "hasn't": 2,
    "haven't": 2,
    "hadn't": 2,
    "it's": 1,
    "that's": 1,
    "there's": 1,
    "here's": 1,
    "what's": 1,
    "let's": 1,
}
def load_data(csv_file_path):
    """Read a CSV file and return its rows as a list of dicts.

    Rows are parsed with ``csv.DictReader``, so each dict is keyed by the
    column names taken from the file's header line.

    Args:
        csv_file_path: Path of the CSV file to read.

    Returns:
        A list holding one dict per data row, in file order (empty when the
        file contains nothing beyond the header).
    """
    # Decode as UTF-8 explicitly instead of relying on the platform default,
    # matching the encoding this file uses when it writes its outputs.
    with open(csv_file_path, 'r', encoding='utf-8') as csv_file:
        return list(csv.DictReader(csv_file))
def convert_data(datas):
    """Turn raw song rows into chat-style examples.

    Args:
        datas: Iterable of mappings that provide the full lyrics under
            ``'text'`` and the song title under ``'song'``.

    Returns:
        A list with one ``{"messages": [...]}`` dict per input row (empty for
        empty input). The first message asks for English lyrics matching the
        title and the total syllable count of the lyrics; the second carries
        the original lyrics.
    """
    dataset = []
    for data in datas:
        lyrics = data['text']
        title = data['song']
        syllable = count_syllables(lyrics)
        question = f"Given a syllable structure and title of the song, write English lyrics that match it. title: {title}, syllable: {syllable}"
        # Append inside the loop: previously this ran once after the loop, so
        # only the last row was kept and empty input raised NameError.
        dataset.append({
            "messages": [
                {"role": "user", "content": question},
                # NOTE(review): the reply is tagged "system"; chat datasets
                # usually use "assistant" here — confirm what the trainer expects.
                {"role": "system", "content": lyrics},
            ]
        })
    return dataset
def split_korean_english(word):
    """Separate the Hangul and the ASCII alphanumeric characters of *word*.

    Returns:
        A ``(korean, english)`` tuple of strings. ``korean`` is every run of
        characters in U+AC00..U+D7A3 joined together; ``english`` is every run
        of ``a-z``, ``A-Z`` and ``0-9`` joined together. Both keep the original
        character order; all other characters (punctuation, apostrophes,
        whitespace, ...) are dropped.
    """
    patterns = (r'[\uAC00-\uD7A3]+', r'[a-zA-Z0-9]+')
    korean_part, english_part = (
        ''.join(match.group() for match in re.finditer(pattern, word))
        for pattern in patterns
    )
    return korean_part, english_part
def identify_language(word):
    """Classify *word* by the scripts it contains.

    Returns:
        ``"Korean with English"`` when both Hangul and ASCII alphanumerics are
        present, ``"Korean"`` or ``"English"`` when only one kind is present,
        and ``"Unknown"`` when neither is.
    """
    korean_part, english_part = split_korean_english(word)
    # Keyed by (has Hangul, has ASCII alphanumerics).
    labels = {
        (True, True): "Korean with English",
        (True, False): "Korean",
        (False, True): "English",
        (False, False): "Unknown",
    }
    return labels[bool(korean_part), bool(english_part)]
def count_syllables(text):
    """Return the total syllable count of *text*.

    The text is split on whitespace. For every word, its Hangul characters
    are counted with ``count_korean_syllable`` and its ASCII alphanumeric
    characters with ``count_english_syllable``; the results are summed over
    the whole text.

    Args:
        text: A string, possibly spanning several lines.

    Returns:
        The summed syllable count; ``0`` when the text has no Hangul or ASCII
        alphanumeric characters.
    """
    # The per-line "n-n-n" pattern string the old code assembled was never
    # returned, so only the running total is kept. Splitting on any
    # whitespace visits the same words as splitting by line and then by word.
    total = 0
    for word in text.split():
        korean_part, english_part = split_korean_english(word)
        if korean_part:
            total += count_korean_syllable(korean_part)
        if english_part:
            total += count_english_syllable(english_part)
    return total
def is_korean(word):
    """Tell whether *word* contains at least one character in U+AC00..U+D7A3."""
    hangul_match = re.search(r'[\uAC00-\uD7A3]', word)
    return hangul_match is not None
# Contraction table keyed without apostrophes ("don't" -> "dont").
# count_syllables passes words whose punctuation was already removed by
# split_korean_english, so an apostrophe-sensitive lookup would never match.
_CONTRACTIONS_NO_APOSTROPHE = {
    key.replace("'", ""): value for key, value in contractions_syllables.items()
}


def count_english_syllable(word):
    """Return the syllable count of a single English word.

    Leading and trailing non-word characters are stripped first. Known
    contractions are answered from ``contractions_syllables`` (matched
    case-insensitively, with or without their apostrophe); every other word
    is passed to ``syllables.estimate``.

    Args:
        word: The word to measure.

    Returns:
        The table value for a known contraction, otherwise whatever
        ``syllables.estimate`` returns for the stripped word.
    """
    word = re.sub(r'^\W+|\W+$', '', word)
    # Drop straight and typographic apostrophes so "don't", "don’t" and
    # "dont" all hit the same table entry.
    key = word.lower().replace("'", "").replace("\u2019", "")
    if key in _CONTRACTIONS_NO_APOSTROPHE:
        return _CONTRACTIONS_NO_APOSTROPHE[key]
    return syllables.estimate(word)
def count_korean_syllable(word):
    """Return the syllable count of a Hangul string, one per character.

    The result is ``len(word)``; nothing is filtered here, so the count is
    only meaningful when *word* holds Hangul syllable characters alone, as it
    does for the text count_syllables passes in.
    """
    syllable_total = len(word)
    return syllable_total
def save_data(data, output_file):
    """Write *data* to *output_file*, one record per line (JSON Lines).

    Args:
        data: Iterable of records. Strings are written as they are; any other
            value is serialised with ``json.dumps``.
        output_file: Destination path. An existing file is overwritten.
    """
    with open(output_file, 'w', encoding='utf-8') as file:
        for line in data:
            # str() on a dict yields a Python repr (single quotes), which JSON
            # parsers reject, so non-string records are serialised as JSON.
            # ensure_ascii=False keeps Korean text readable in the UTF-8 file.
            text = line if isinstance(line, str) else json.dumps(line, ensure_ascii=False)
            file.write(f"{text}\n")
def _write_lyric_examples(frame, outfile):
    """Write one chat example per countable lyric line of every song in *frame*."""
    for _, row in frame.iterrows():
        title = row['title']
        lyric = row['lyric']
        if pd.isna(title) or pd.isna(lyric):
            continue
        # Lines already emitted for this song; passed to the prompt as context.
        completed_lyric = ""
        for line in lyric.split('\n'):
            syllable = count_syllables(line)
            if not syllable:
                # Blank lines and lines with nothing countable are skipped and
                # are not added to the running context either.
                continue
            prompt = frame_prompt.format(title=title, lyric=completed_lyric, syllables=syllable)
            record = {
                "messages": [
                    {"role": "user", "content": prompt},
                    # NOTE(review): the target line is tagged "system"; chat
                    # datasets usually use "assistant" — confirm with the trainer.
                    {"role": "system", "content": line},
                ]
            }
            outfile.write(json.dumps(record) + "\n")
            completed_lyric += line + '\n'


def generate_kr_lyrics_data(infile, trainfile, testfile, seed=None):
    """Build line-by-line lyric-completion datasets from a lyrics CSV.

    Reads the ``title``, ``lyric`` and ``year`` columns of *infile*, keeps the
    rows whose year is 2010 or later, shuffles them, and writes JSON Lines
    examples. For each lyric line with a non-zero syllable count, the prompt
    is built from the song title, that count and the lines written so far,
    and the line itself is the expected reply.

    Args:
        infile: Path of the source CSV.
        trainfile: Output path for the training examples (overwritten).
        testfile: Output path for the test examples (overwritten).
        seed: Optional value forwarded to ``DataFrame.sample`` as
            ``random_state`` to make the shuffle reproducible. The default
            ``None`` keeps the shuffle unseeded.
    """
    df = pd.read_csv(infile, usecols=['title', 'lyric', 'year'])
    df = df[df['year'] >= 2010]
    # shuffle data
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    # The first 40% of the shuffled songs go to train and the next 10% to
    # test (an 8:2 ratio); the remaining half of the songs is not used.
    train_end = int(len(df) * 0.4)
    test_end = int(len(df) * 0.5)
    train_df = df[:train_end]
    test_df = df[train_end:test_end]
    with open(trainfile, 'w', encoding='utf-8') as train_outfile, open(testfile, 'w', encoding='utf-8') as test_outfile:
        _write_lyric_examples(train_df, train_outfile)
        _write_lyric_examples(test_df, test_outfile)
# Runs whenever this module is executed or imported: builds the Korean
# train/test files in the current working directory.
generate_kr_lyrics_data(
    infile=kr_file_path,
    trainfile='train.jsonl',
    testfile='test.jsonl',
)

# Tiny sample text for trying count_syllables by hand.
ex_lyric = """
example
hello
world
"""
# To try it: print(count_syllables(ex_lyric))