import json

from datasets import load_dataset, concatenate_datasets

# 1. load the ACE-KiSing segments dataset and merge all splits
ds = load_dataset("espnet/ace-kising-segments", cache_dir="cache")
combined = concatenate_datasets([ds["train"], ds["validation"], ds["test"]])
# 2. filter rows by singer: barber
combined = combined.filter(lambda x: x["singer"] == "barber")
# 3. add derived columns: note_midi_length counts the nonzero entries in note_midi,
#    lyric_word_length counts the actual words (or characters for, e.g., Chinese/Japanese),
#    excluding the <AP>/<SP>/"-" placeholder tokens
combined = combined.map(
    lambda x: {
        "note_midi_length": len([n for n in x["note_midi"] if n != 0]),
        "lyric_word_length": len(
            [word for word in x["note_lyrics"] if word not in ["<AP>", "<SP>", "-"]]
        ),
    }
)
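# Optional sanity check (a sketch; uses only the columns defined above): inspect the
# derived lengths on one segment before going further.
# print(combined[0]["segment_id"], combined[0]["note_midi_length"], combined[0]["lyric_word_length"])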
# 4. sort by segment_id
combined = combined.sort("segment_id")
# 5. iterate over rows and collect per-song lists of segment lengths
prev_songid = None
prev_song_segment_id = None
song2note_lengths = {}
song2word_lengths = {}
for row in combined:
    # segment_id format: kising_barber_{songid}_{song_segment_id}
    _, _, songid, song_segment_id = row["segment_id"].split("_")
    if prev_songid != songid:
        # first segment of a new song: it should be numbered 001
        if prev_songid is not None:
            assert (
                song_segment_id == "001"
            ), f"prev_songid: {prev_songid}, songid: {songid}, song_segment_id: {song_segment_id}"
        song2note_lengths[f"kising_{songid}"] = [row["note_midi_length"]]
        song2word_lengths[f"kising_{songid}"] = [row["lyric_word_length"]]
    else:
        # same song: segment numbers must be strictly increasing
        assert (
            int(song_segment_id) >= int(prev_song_segment_id) + 1
        ), f"prev_song_segment_id: {prev_song_segment_id}, song_segment_id: {song_segment_id}"
        song2note_lengths[f"kising_{songid}"].append(row["note_midi_length"])
        song2word_lengths[f"kising_{songid}"].append(row["lyric_word_length"])
    prev_songid = songid
    prev_song_segment_id = song_segment_id
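# Optional consistency check (a sketch): every song should have the same number of
# note-length and word-length entries.
# assert all(len(song2note_lengths[k]) == len(song2word_lengths[k]) for k in song2note_lengths)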
# 6. write the per-song length lists to json
with open("data/song2note_lengths.json", "w") as f:
    json.dump(song2note_lengths, f, indent=4)
with open("data/song2word_lengths.json", "w") as f:
    json.dump(song2word_lengths, f, indent=4)
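# Each JSON file maps a song id to its list of per-segment lengths, e.g.
# (values below are illustrative placeholders, not actual dataset statistics):
# { "kising_421": [23, 19, 31], "kising_422": [17, 25] }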
# 7. push score segments to hub
# remove audio and singer columns
combined = combined.remove_columns(["audio", "singer"])
# replace the kising_barber_ prefix with kising_ in segment_id
combined = combined.map(
    lambda x: {"segment_id": x["segment_id"].replace("kising_barber_", "kising_")}
)
# upload to hub
combined.push_to_hub("jhansss/kising_score_segments")
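# Usage sketch (assumes the push succeeded; a plain Dataset pushed this way is
# exposed under the default "train" split):
# from datasets import load_dataset
# score_segments = load_dataset("jhansss/kising_score_segments", split="train")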