Spaces:
Sleeping
Sleeping
Upload UltraSinger.py
Browse files- UltraSinger.py +994 -0
UltraSinger.py
ADDED
|
@@ -0,0 +1,994 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""UltraSinger uses AI to automatically create UltraStar song files"""
|
| 2 |
+
|
| 3 |
+
import copy
|
| 4 |
+
import getopt
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
import re
|
| 8 |
+
|
| 9 |
+
import Levenshtein
|
| 10 |
+
import librosa
|
| 11 |
+
|
| 12 |
+
from tqdm import tqdm
|
| 13 |
+
from packaging import version
|
| 14 |
+
|
| 15 |
+
import soundfile as sf
|
| 16 |
+
|
| 17 |
+
from modules import os_helper
|
| 18 |
+
from modules.Audio.denoise import ffmpeg_reduce_noise
|
| 19 |
+
from modules.Audio.separation import separate_audio
|
| 20 |
+
from modules.Audio.vocal_chunks import (
|
| 21 |
+
export_chunks_from_transcribed_data,
|
| 22 |
+
export_chunks_from_ultrastar_data,
|
| 23 |
+
)
|
| 24 |
+
from modules.Audio.silence_processing import remove_silence_from_transcription_data, get_silence_sections
|
| 25 |
+
from modules.csv_handler import export_transcribed_data_to_csv
|
| 26 |
+
from modules.Audio.convert_audio import convert_audio_to_mono_wav, convert_wav_to_mp3
|
| 27 |
+
from modules.Audio.youtube import (
|
| 28 |
+
download_youtube_audio,
|
| 29 |
+
download_youtube_thumbnail,
|
| 30 |
+
download_youtube_video,
|
| 31 |
+
get_youtube_title,
|
| 32 |
+
)
|
| 33 |
+
from modules.DeviceDetection.device_detection import check_gpu_support
|
| 34 |
+
from modules.console_colors import (
|
| 35 |
+
ULTRASINGER_HEAD,
|
| 36 |
+
blue_highlighted,
|
| 37 |
+
gold_highlighted,
|
| 38 |
+
light_blue_highlighted,
|
| 39 |
+
red_highlighted,
|
| 40 |
+
)
|
| 41 |
+
from modules.Midi import midi_creator
|
| 42 |
+
from modules.Midi.midi_creator import (
|
| 43 |
+
convert_frequencies_to_notes,
|
| 44 |
+
create_midi_notes_from_pitched_data,
|
| 45 |
+
most_frequent,
|
| 46 |
+
)
|
| 47 |
+
from modules.Pitcher.pitcher import (
|
| 48 |
+
get_frequencies_with_high_confidence,
|
| 49 |
+
get_pitch_with_crepe_file,
|
| 50 |
+
)
|
| 51 |
+
from modules.Pitcher.pitched_data import PitchedData
|
| 52 |
+
from modules.Speech_Recognition.hyphenation import hyphenation, language_check, create_hyphenator
|
| 53 |
+
from modules.Speech_Recognition.Whisper import transcribe_with_whisper
|
| 54 |
+
from modules.Ultrastar import ultrastar_score_calculator, ultrastar_writer, ultrastar_converter, ultrastar_parser
|
| 55 |
+
from modules.Ultrastar.ultrastar_txt import UltrastarTxtValue
|
| 56 |
+
from Settings import Settings
|
| 57 |
+
from modules.Speech_Recognition.TranscribedData import TranscribedData
|
| 58 |
+
from modules.plot import plot, plot_spectrogram
|
| 59 |
+
from modules.musicbrainz_client import get_music_infos
|
| 60 |
+
|
| 61 |
+
# Module-level Settings instance; the functions in this file read their
# configuration from it and run() also writes processing_audio_path to it.
settings = Settings()
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def convert_midi_notes_to_ultrastar_notes(midi_notes: list[str]) -> list[int]:
    """Translate a list of midi note names into Ultrastar pitch numbers.

    Every entry of ``midi_notes`` is first turned into a MIDI number with
    ``librosa.note_to_midi`` and then mapped with
    ``ultrastar_converter.midi_note_to_ultrastar_note``.

    Returns:
        One Ultrastar pitch per input note, in input order.
    """
    print(f"{ULTRASINGER_HEAD} Creating Ultrastar notes from midi data")

    # todo: Progress?
    return [
        ultrastar_converter.midi_note_to_ultrastar_note(
            librosa.note_to_midi(midi_note)
        )
        for midi_note in midi_notes
    ]
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def pitch_each_chunk_with_crepe(directory: str) -> list[str]:
    """Run crepe on every ``.wav`` chunk in a directory and pick one note each.

    The ``.wav`` files of ``directory`` are processed in ascending order of
    the integer found between the first and second ``_`` of the file name.
    For each file the high-confidence frequencies are converted to notes and
    the first entry reported by ``most_frequent`` is kept.

    Returns:
        One note per chunk file, in processing order.
    """
    print(
        f"{ULTRASINGER_HEAD} Pitching each chunk with {blue_highlighted('crepe')}"
    )

    def chunk_number(file_name):
        # The numeric sort key is the second "_"-separated field of the name.
        return int(file_name.split("_")[1])

    wav_names = [entry for entry in os.listdir(directory) if entry.endswith(".wav")]
    wav_names.sort(key=chunk_number)

    midi_notes = []
    for wav_name in wav_names:
        # todo: stepsize = duration? then when shorter than "it" it should take the duration. Otherwise there a more notes
        pitched_data = get_pitch_with_crepe_file(
            os.path.join(directory, wav_name),
            settings.crepe_model_capacity,
            settings.crepe_step_size,
            settings.tensorflow_device,
        )
        confident_frequencies = get_frequencies_with_high_confidence(
            pitched_data.frequencies, pitched_data.confidence
        )
        chunk_notes = convert_frequencies_to_notes(confident_frequencies)
        midi_notes.append(most_frequent(chunk_notes)[0][0])
        # todo: Progress?

    return midi_notes
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def add_hyphen_to_data(transcribed_data: list[TranscribedData], hyphen_words: list[list[str]]):
    """Split transcribed words into their hyphenated parts.

    ``hyphen_words[i]`` holds the parts for ``transcribed_data[i]``. An entry
    with no parts is passed through unchanged (the same object). Otherwise
    the entry is shallow-copied once per part; the copies divide the time
    span ``start``..``end`` of the entry into equal slices, carry the part
    as ``word``, have ``is_hyphen`` set to True and ``is_word_end`` set to
    True only on the last part.

    Returns:
        A new list with the resulting entries; the input list is not modified.
    """
    new_data = []

    for index, data in enumerate(transcribed_data):
        parts = hyphen_words[index]
        if not parts:
            new_data.append(data)
            continue

        part_count = len(parts)
        part_duration = (data.end - data.start) / part_count

        slice_start = data.start
        for position, part in enumerate(parts):
            # Measure the slice end back from the word end so the last
            # slice finishes exactly on data.end.
            slice_end = data.end - part_duration * (part_count - 1 - position)

            piece = copy.copy(data)
            piece.start = slice_start
            piece.end = slice_end
            piece.word = part
            piece.is_hyphen = True
            piece.is_word_end = position == part_count - 1
            new_data.append(piece)

            slice_start = slice_end

    return new_data
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def get_bpm_from_data(data, sampling_rate):
    """Estimate the bpm of audio samples and print it.

    The onset strength envelope of ``data`` is computed with librosa and
    handed to ``librosa.beat.tempo``. The first value of the returned tempo
    array is printed (rounded to two decimals) and returned.

    Args:
        data: Audio samples, passed to librosa as ``y``.
        sampling_rate: Sampling rate of ``data``, passed to librosa as ``sr``.
    """
    onset_envelope = librosa.onset.onset_strength(y=data, sr=sampling_rate)
    tempo_estimates = librosa.beat.tempo(
        onset_envelope=onset_envelope, sr=sampling_rate
    )
    bpm = tempo_estimates[0]

    rounded_bpm = str(round(bpm, 2))
    print(f"{ULTRASINGER_HEAD} BPM is {blue_highlighted(rounded_bpm)}")
    return bpm
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def get_bpm_from_file(wav_file: str) -> float:
    """Load an audio file and estimate its bpm.

    The file is loaded with ``librosa.load(..., sr=None)`` and the samples
    are forwarded to ``get_bpm_from_data``, whose result is returned.
    """
    samples, sample_rate = librosa.load(wav_file, sr=None)
    bpm = get_bpm_from_data(samples, sample_rate)
    return bpm
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def correct_words(recognized_words, word_list_file):
    """Replace unknown recognized words with the closest word-list entry.

    The word list is read from ``word_list_file`` (UTF-8) and split on
    whitespace. Every element of ``recognized_words`` whose ``word``
    attribute is not in that list gets ``word`` overwritten, in place, with
    the list entry of minimal Levenshtein distance. Each replacement is
    printed as ``"<old> - <new>"``.

    Args:
        recognized_words: Iterable of objects with a writable ``word``
            attribute.
        word_list_file: Path to a whitespace-separated word list.

    Returns:
        The same ``recognized_words`` object that was passed in. When the
        word list is empty nothing is changed.
    """
    with open(word_list_file, "r", encoding="utf-8") as file:
        word_list = file.read().split()

    # Without candidates there is nothing to correct to; min() on an empty
    # list would raise ValueError.
    if not word_list:
        return recognized_words

    # Set for O(1) membership tests instead of scanning the list per word.
    known_words = set(word_list)

    for rec_word in recognized_words:
        original_word = rec_word.word
        if original_word in known_words:
            continue

        # Search the ordered list (not the set) so that ties keep resolving
        # to the first matching entry of the file.
        closest_word = min(
            word_list,
            key=lambda candidate: Levenshtein.distance(original_word, candidate),
        )
        print(original_word + " - " + closest_word)
        rec_word.word = closest_word
    return recognized_words
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
def print_help() -> None:
    """Print the command line help text to stdout.

    The text lists the option groups [opt], [mode], [transcription],
    [pitcher], [extra] and [device] together with the defaults it states
    for them.
    """
    help_string = """
UltraSinger.py [opt] [mode] [transcription] [pitcher] [extra]

[opt]
-h This help text.
-i Ultrastar.txt
audio like .mp3, .wav, youtube link
-o Output folder

[mode]
## INPUT is audio ##
default Creates all

# Single file creation selection is in progress, you currently getting all!
(-u Create ultrastar txt file) # In Progress
(-m Create midi file) # In Progress
(-s Create sheet file) # In Progress

## INPUT is ultrastar.txt ##
default Creates all

# Single selection is in progress, you currently getting all!
(-r repitch Ultrastar.txt (input has to be audio)) # In Progress
(-p Check pitch of Ultrastar.txt input) # In Progress
(-m Create midi file) # In Progress

[transcription]
# Default is whisper
--whisper Multilingual model > tiny|base|small|medium|large-v1|large-v2 >> ((default) is large-v2
English-only model > tiny.en|base.en|small.en|medium.en
--whisper_align_model Use other languages model for Whisper provided from huggingface.co
--language Override the language detected by whisper, does not affect transcription but steps after transcription
--whisper_batch_size Reduce if low on GPU mem >> ((default) is 16)
--whisper_compute_type Change to "int8" if low on GPU mem (may reduce accuracy) >> ((default) is "float16" for cuda devices, "int8" for cpu)

[pitcher]
# Default is crepe
--crepe tiny|full >> ((default) is full)
--crepe_step_size unit is miliseconds >> ((default) is 10)

[extra]
--hyphenation True|False >> ((default) is True)
--disable_separation True|False >> ((default) is False)
--disable_karaoke True|False >> ((default) is False)
--create_audio_chunks True|False >> ((default) is False)
--keep_cache True|False >> ((default) is False)
--plot True|False >> ((default) is False)
--format_version 0.3.0|1.0.0|1.1.0 >> ((default) is 1.0.0)

[device]
--force_cpu True|False >> ((default) is False) All steps will be forced to cpu
--force_whisper_cpu True|False >> ((default) is False) Only whisper will be forced to cpu
--force_crepe_cpu True|False >> ((default) is False) Only crepe will be forced to cpu
"""
    print(help_string)
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
def remove_unecessary_punctuations(transcribed_data: list[TranscribedData]) -> None:
    """Remove unecessary punctuations from transcribed data.

    Deletes every "." and "," from the ``word`` attribute of each entry.
    The entries are modified in place and nothing is returned.
    """
    # Build the deletion table once; it is the same for every entry.
    removal_table = str.maketrans("", "", ".,")
    for data in transcribed_data:
        data.word = data.word.translate(removal_table)
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
def hyphenate_each_word(language: str, transcribed_data: list[TranscribedData]) -> list[list[str]] | None:
    """Hyphenate each word in the transcribed data.

    Args:
        language: Language identifier handed to ``language_check``.
        transcribed_data: Entries whose ``word`` attribute is hyphenated.

    Returns:
        One ``hyphenation(...)`` result per entry, in input order, or None
        when ``language_check`` returns None or hyphenation raises. In both
        failure cases an error line is printed.
    """

    def report_failure() -> None:
        # Same message for an unsupported language and for a runtime failure.
        print(
            f"{ULTRASINGER_HEAD} {red_highlighted('Error in hyphenation for language ')} {blue_highlighted(language)}{red_highlighted(', maybe you want to disable it?')}"
        )

    lang_region = language_check(language)
    if lang_region is None:
        report_failure()
        return None

    try:
        hyphenator = create_hyphenator(lang_region)
        # Wrapping the list itself (not an enumerate object) lets tqdm
        # know the total and show real progress.
        return [
            hyphenation(data.word, hyphenator)
            for data in tqdm(transcribed_data)
        ]
    except Exception:
        # Hyphenation stays best-effort, but KeyboardInterrupt and
        # SystemExit are no longer swallowed as a bare except would.
        report_failure()
        return None
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
def print_support() -> None:
    """Print an empty line followed by the three-line support notice."""
    appeal = 'Do you like UltraSinger? Want it to be even better? Then help with your'
    thanks = 'This will help a lot to keep this project alive and improved.'

    lines = (
        "",
        f"{ULTRASINGER_HEAD} {gold_highlighted(appeal)} {light_blue_highlighted('support')}{gold_highlighted('!')}",
        f"{ULTRASINGER_HEAD} See project page -> https://github.com/rakuri255/UltraSinger",
        f"{ULTRASINGER_HEAD} {gold_highlighted(thanks)}",
    )
    for line in lines:
        print(line)
|
| 286 |
+
|
| 287 |
+
def print_version() -> None:
    """Print an empty line and the version banner.

    The banner is the line with ``settings.APP_VERSION`` framed by a row of
    asterisks above and below.
    """
    stars = '*****************************'

    lines = (
        "",
        f"{ULTRASINGER_HEAD} {gold_highlighted(stars)}",
        f"{ULTRASINGER_HEAD} {gold_highlighted('UltraSinger Version:')} {light_blue_highlighted(settings.APP_VERSION)}",
        f"{ULTRASINGER_HEAD} {gold_highlighted(stars)}",
    )
    for line in lines:
        print(line)
|
| 299 |
+
|
| 300 |
+
def run() -> None:
    """The processing function of this program.

    Dispatches on ``settings.input_file_path``: a path containing ".txt" is
    parsed as an Ultrastar txt (re-pitch mode), a path starting with
    "https:" is fetched via ``download_from_youtube`` and anything else is
    handled as a local audio file. The audio is then separated, denoised,
    converted to mono and muted in its silent sections. Audio input is
    additionally transcribed. Afterwards the audio is pitched, an Ultrastar
    txt is written and scored, and the simple score is added to that file.
    Karaoke/instrumental/vocal mp3 export, audio chunks, plots, the midi
    file and the cache cleanup depend on the matching ``settings`` values.
    """
    # Every input whose path contains ".txt" is treated as an Ultrastar txt.
    is_audio = ".txt" not in settings.input_file_path
    ultrastar_class = None
    real_bpm = None
    (title, artist, year, genre) = (None, None, None, None)

    if not is_audio:  # Parse Ultrastar txt
        print(
            f"{ULTRASINGER_HEAD} {gold_highlighted('re-pitch mode')}"
        )
        (
            basename_without_ext,
            real_bpm,
            song_output,
            ultrastar_audio_input_path,
            ultrastar_class,
        ) = parse_ultrastar_txt()
    elif settings.input_file_path.startswith("https:"):  # Youtube
        print(
            f"{ULTRASINGER_HEAD} {gold_highlighted('full automatic mode')}"
        )
        (
            basename_without_ext,
            song_output,
            ultrastar_audio_input_path,
            (title, artist, year, genre)
        ) = download_from_youtube()
    else:  # Audio File
        print(
            f"{ULTRASINGER_HEAD} {gold_highlighted('full automatic mode')}"
        )
        (
            basename_without_ext,
            song_output,
            ultrastar_audio_input_path,
            (title, artist, year, genre)
        ) = infos_from_audio_input_file()

    # All intermediate files go below "<song_output>/cache".
    cache_path = os.path.join(song_output, "cache")
    # Preliminary value; it is replaced by the muted file further down.
    settings.processing_audio_path = os.path.join(
        cache_path, basename_without_ext + ".wav"
    )
    os_helper.create_folder(cache_path)

    # Separate vocal from audio
    audio_separation_path = separate_vocal_from_audio(
        basename_without_ext, cache_path, ultrastar_audio_input_path
    )
    vocals_path = os.path.join(audio_separation_path, "vocals.wav")
    instrumental_path = os.path.join(audio_separation_path, "no_vocals.wav")

    # Move instrumental and vocals
    # Format versions below 1.1.0 get a single "[Karaoke]" mp3 (if enabled).
    if settings.create_karaoke and version.parse(settings.format_version) < version.parse("1.1.0"):
        karaoke_output_path = os.path.join(song_output, basename_without_ext + " [Karaoke].mp3")
        convert_wav_to_mp3(instrumental_path, karaoke_output_path)

    # Format versions from 1.1.0 on get "[Instrumental]" and "[Vocals]" mp3s.
    if version.parse(settings.format_version) >= version.parse("1.1.0"):
        instrumental_output_path = os.path.join(song_output, basename_without_ext + " [Instrumental].mp3")
        convert_wav_to_mp3(instrumental_path, instrumental_output_path)
        vocals_output_path = os.path.join(song_output, basename_without_ext + " [Vocals].mp3")
        convert_wav_to_mp3(vocals_path, vocals_output_path)

    # Continue with the separated vocals or with the unseparated input audio.
    if settings.use_separated_vocal:
        input_path = vocals_path
    else:
        input_path = ultrastar_audio_input_path

    # Denoise vocal audio
    denoised_output_path = os.path.join(
        cache_path, basename_without_ext + "_denoised.wav"
    )
    denoise_vocal_audio(input_path, denoised_output_path)

    # Convert to mono audio
    mono_output_path = os.path.join(
        cache_path, basename_without_ext + "_mono.wav"
    )
    convert_audio_to_mono_wav(denoised_output_path, mono_output_path)

    # Mute silence sections
    mute_output_path = os.path.join(
        cache_path, basename_without_ext + "_mute.wav"
    )
    mute_no_singing_parts(mono_output_path, mute_output_path)

    # Define the audio file to process
    settings.processing_audio_path = mute_output_path

    # Audio transcription
    transcribed_data = None
    language = settings.language
    if is_audio:
        detected_language, transcribed_data = transcribe_audio()
        # A language set in the settings wins over the detected one.
        if language is None:
            language = detected_language

        remove_unecessary_punctuations(transcribed_data)

        if settings.hyphenation:
            hyphen_words = hyphenate_each_word(language, transcribed_data)
            # None means hyphenation failed; keep the unsplit words then.
            if hyphen_words is not None:
                transcribed_data = add_hyphen_to_data(transcribed_data, hyphen_words)

        transcribed_data = remove_silence_from_transcription_data(
            settings.processing_audio_path, transcribed_data
        )

        # todo: do we need to correct words?
        # lyric = 'input/faber_lyric.txt'
        # --corrected_words = correct_words(vosk_speech, lyric)

    # Create audio chunks
    if settings.create_audio_chunks:
        create_audio_chunks(
            cache_path,
            is_audio,
            transcribed_data,
            ultrastar_audio_input_path,
            ultrastar_class,
        )

    # Pitch the audio
    midi_notes, pitched_data, ultrastar_note_numbers = pitch_audio(
        is_audio, transcribed_data, ultrastar_class
    )

    # Create plot
    if settings.create_plot:
        vocals_path = os.path.join(audio_separation_path, "vocals.wav")
        plot_spectrogram(vocals_path, song_output, "vocals.wav")
        plot_spectrogram(settings.processing_audio_path, song_output, "processing audio")
        plot(pitched_data, song_output, transcribed_data, ultrastar_class, midi_notes)

    # Write Ultrastar txt
    if is_audio:
        real_bpm, ultrastar_file_output = create_ultrastar_txt_from_automation(
            basename_without_ext,
            song_output,
            transcribed_data,
            ultrastar_audio_input_path,
            ultrastar_note_numbers,
            language,
            title,
            artist,
            year,
            genre
        )
    else:
        ultrastar_file_output = create_ultrastar_txt_from_ultrastar_data(
            song_output, ultrastar_class, ultrastar_note_numbers
        )

    # Calc Points
    # ultrastar_class is replaced by the data parsed back from the written txt.
    ultrastar_class, simple_score, accurate_score = calculate_score_points(
        is_audio, pitched_data, ultrastar_class, ultrastar_file_output
    )

    # Add calculated score to Ultrastar txt #Todo: Missing Karaoke
    ultrastar_writer.add_score_to_ultrastar_txt(
        ultrastar_file_output, simple_score
    )

    # Midi
    if settings.create_midi:
        create_midi_file(real_bpm, song_output, ultrastar_class, basename_without_ext)

    # Cleanup
    if not settings.keep_cache:
        remove_cache_folder(cache_path)

    # Print Support
    print_support()
|
| 473 |
+
|
| 474 |
+
|
| 475 |
+
def mute_no_singing_parts(mono_output_path, mute_output_path):
    """Zero the silent sections of an audio file and write the result.

    The sections come from ``get_silence_sections(mono_output_path)``; the
    first two items of each section are read as start and end time in
    seconds. The samples in between are set to 0 and the audio is written to
    ``mute_output_path`` with the sampling rate it was loaded with.
    """
    print(
        f"{ULTRASINGER_HEAD} Mute audio parts with no singing"
    )
    silence_sections = get_silence_sections(mono_output_path)
    samples, sample_rate = librosa.load(mono_output_path, sr=None)

    for section in silence_sections:
        # Seconds -> sample indices of the range to silence.
        first_sample = int(section[0] * sample_rate)
        last_sample = int(section[1] * sample_rate)
        samples[first_sample:last_sample] = 0

    sf.write(mute_output_path, samples, sample_rate)
|
| 494 |
+
|
| 495 |
+
|
| 496 |
+
def get_unused_song_output_dir(path: str) -> str:
    """Get an unused song output dir.

    Returns ``path`` unchanged when no folder exists there. Otherwise the
    candidates ``"<path> (1)"``, ``"<path> (2)"``, ... are checked in order
    and the first one without an existing folder is returned.

    Each candidate is built from the untouched base path. Rewriting the
    previous candidate with ``str.replace`` would also change a "(n)" that
    is part of the base path itself.

    If all candidates up to (999) exist, an error is printed and the
    program exits with status 1.
    """
    if not os_helper.check_if_folder_exists(path):
        return path

    max_tries = 999
    for i in range(1, max_tries + 1):
        candidate = f"{path} ({i})"
        if not os_helper.check_if_folder_exists(candidate):
            return candidate

    print(
        f"{ULTRASINGER_HEAD} {red_highlighted('Error: Could not create output folder! (999) is the maximum number of tries.')}"
    )
    sys.exit(1)
|
| 514 |
+
|
| 515 |
+
|
| 516 |
+
def transcribe_audio() -> tuple[str, list[TranscribedData]]:
    """Transcribe audio with AI.

    Transcribes ``settings.processing_audio_path`` with whisper. The device
    is "cpu" when ``settings.force_whisper_cpu`` is set, otherwise
    ``settings.pytorch_device``.

    Returns:
        ``(detected_language, transcribed_data)`` as reported by
        ``transcribe_with_whisper``.

    Raises:
        NotImplementedError: If ``settings.transcriber`` is not "whisper".
    """
    if settings.transcriber != "whisper":
        raise NotImplementedError(
            f"Unsupported transcriber: {settings.transcriber!r}"
        )

    device = "cpu" if settings.force_whisper_cpu else settings.pytorch_device
    # Note the order: the helper returns the data first, this function
    # returns the language first.
    transcribed_data, detected_language = transcribe_with_whisper(
        settings.processing_audio_path,
        settings.whisper_model,
        device,
        settings.whisper_align_model,
        settings.whisper_batch_size,
        settings.whisper_compute_type,
        settings.language,
    )
    return detected_language, transcribed_data
|
| 532 |
+
|
| 533 |
+
|
| 534 |
+
def separate_vocal_from_audio(
    basename_without_ext: str, cache_path: str, ultrastar_audio_input_path: str
) -> str:
    """Run the audio separation if needed and return its output folder.

    ``separate_audio`` is only called when ``settings.use_separated_vocal``
    or ``settings.create_karaoke`` is truthy. The returned path
    ``<cache_path>/separated/htdemucs/<basename_without_ext>`` is built in
    either case.
    """
    separation_dir = os.path.join(
        cache_path, "separated", "htdemucs", basename_without_ext
    )

    separation_needed = settings.use_separated_vocal or settings.create_karaoke
    if separation_needed:
        separate_audio(ultrastar_audio_input_path, cache_path, settings.pytorch_device)

    return separation_dir
|
| 546 |
+
|
| 547 |
+
def calculate_score_points(
    is_audio: bool, pitched_data: PitchedData, ultrastar_class: UltrastarTxtValue, ultrastar_file_output: str
):
    """Score the written Ultrastar txt against the pitched data.

    When ``is_audio`` is False the passed ``ultrastar_class`` is scored and
    printed first, under the heading for the original txt, followed by the
    heading for the re-pitched txt. In every case ``ultrastar_file_output``
    is then parsed, scored and printed.

    Returns:
        ``(ultrastar_class, simple_score, accurate_score)`` where
        ``ultrastar_class`` is the data parsed from ``ultrastar_file_output``
        and the scores belong to that parsed data.
    """
    if not is_audio:
        # Re-pitch mode: show the score of the input txt for comparison.
        print(
            f"{ULTRASINGER_HEAD} {blue_highlighted('Score of original Ultrastar txt')}"
        )
        original_simple, original_accurate = ultrastar_score_calculator.calculate_score(
            pitched_data, ultrastar_class
        )
        ultrastar_score_calculator.print_score_calculation(
            original_simple, original_accurate
        )
        print(
            f"{ULTRASINGER_HEAD} {blue_highlighted('Score of re-pitched Ultrastar txt')}"
        )

    # Both modes: read the txt back and score what was actually written.
    written_ultrastar = ultrastar_parser.parse_ultrastar_txt(ultrastar_file_output)
    simple_score, accurate_score = ultrastar_score_calculator.calculate_score(
        pitched_data, written_ultrastar
    )
    ultrastar_score_calculator.print_score_calculation(simple_score, accurate_score)

    return written_ultrastar, simple_score, accurate_score
|
| 593 |
+
|
| 594 |
+
|
| 595 |
+
def create_ultrastar_txt_from_ultrastar_data(
    song_output: str, ultrastar_class: UltrastarTxtValue, ultrastar_note_numbers: list[int]
) -> str:
    """Write the re-pitched Ultrastar txt and return its path.

    The target is ``<song_output>/<ultrastar_class.title>.txt``. It is
    produced by ``ultrastar_writer.create_repitched_txt_from_ultrastar_data``
    from ``settings.input_file_path`` and ``ultrastar_note_numbers``.
    """
    txt_file_name = ultrastar_class.title + ".txt"
    repitched_txt_path = os.path.join(song_output, txt_file_name)

    ultrastar_writer.create_repitched_txt_from_ultrastar_data(
        settings.input_file_path,
        ultrastar_note_numbers,
        repitched_txt_path,
    )
    return repitched_txt_path
|
| 608 |
+
|
| 609 |
+
|
| 610 |
+
def create_ultrastar_txt_from_automation(
    basename_without_ext: str,
    song_output: str,
    transcribed_data: list[TranscribedData],
    ultrastar_audio_input_path: str,
    ultrastar_note_numbers: list[int],
    language: str,
    title: str,
    artist: str,
    year: str,
    genre: str
):
    """Build the Ultrastar header and write the generated song txt.

    File references in the header (audio, vocals, instrumental, video, cover)
    are derived from ``basename_without_ext``. ``title``, ``artist``, ``year``
    and ``genre`` are optional metadata: ``None`` values are skipped, and title
    and artist then fall back to ``basename_without_ext``. When karaoke
    creation is enabled and the format version is below 1.1.0, a second
    "[Karaoke]" txt is written next to the main one.

    Returns the tuple ``(real_bpm, ultrastar_file_output)``.
    """
    header = UltrastarTxtValue()
    header.version = settings.format_version
    header.title = basename_without_ext if title is None else title
    header.artist = basename_without_ext if artist is None else artist

    audio_name = basename_without_ext + ".mp3"
    header.mp3 = audio_name
    header.audio = audio_name
    header.vocals = basename_without_ext + " [Vocals].mp3"
    header.instrumental = basename_without_ext + " [Instrumental].mp3"
    header.video = basename_without_ext + ".mp4"
    header.language = language

    # Only reference the cover image when it is present in the output folder.
    cover_name = basename_without_ext + " [CO].jpg"
    cover_found = os_helper.check_file_exists(os.path.join(song_output, cover_name))
    header.cover = cover_name if cover_found else None

    header.creator = f"{header.creator} {Settings.APP_VERSION}"
    header.comment = f"{header.comment} {Settings.APP_VERSION}"

    # Additional data
    if year is not None:
        header.year = extract_year(year)
    if genre is not None:
        header.genre = format_separated_string(genre)

    real_bpm = get_bpm_from_file(ultrastar_audio_input_path)
    ultrastar_file_output = os.path.join(song_output, basename_without_ext + ".txt")
    ultrastar_writer.create_ultrastar_txt_from_automation(
        transcribed_data,
        ultrastar_note_numbers,
        ultrastar_file_output,
        header,
        real_bpm,
    )

    wants_karaoke_txt = (
        settings.create_karaoke
        and version.parse(settings.format_version) < version.parse("1.1.0")
    )
    if not wants_karaoke_txt:
        return real_bpm, ultrastar_file_output

    # Reuse the same header for the karaoke variant; only title and mp3 change.
    karaoke_title = basename_without_ext + " [Karaoke]"
    header.title = karaoke_title
    header.mp3 = karaoke_title + ".mp3"
    karaoke_txt_output_path = os.path.join(song_output, karaoke_title) + ".txt"
    ultrastar_writer.create_ultrastar_txt_from_automation(
        transcribed_data,
        ultrastar_note_numbers,
        karaoke_txt_output_path,
        header,
        real_bpm,
    )
    return real_bpm, ultrastar_file_output
|
| 677 |
+
|
| 678 |
+
def extract_year(date: str) -> str:
    """Return the first standalone four-digit number found in ``date``.

    If no such number exists, ``date`` is returned unchanged.
    """
    found = re.search(r'\b\d{4}\b', date)
    return found.group(0) if found else date
|
| 684 |
+
|
| 685 |
+
def format_separated_string(data: str) -> str:
    """Normalize a ``;``/``/``/``,`` separated list into ``"A, B, C"`` form.

    Blank entries are dropped. Each entry is stripped and capitalized; for
    hyphenated entries every part around a ``-`` is stripped and capitalized
    on its own (``"hip-hop"`` becomes ``"Hip-Hop"``).
    """
    entries = re.sub(r'[;/]', ',', data).split(',')

    def _capitalize_entry(entry: str) -> str:
        # Splitting on '-' also covers entries without a dash (a single part).
        return '-'.join(part.strip().capitalize() for part in entry.split('-'))

    return ', '.join(
        _capitalize_entry(entry) for entry in entries if entry.strip()
    )
|
| 705 |
+
|
| 706 |
+
def infos_from_audio_input_file() -> tuple[str, str, str, tuple[str, str, str, str]]:
    """Prepare the output folder for a local audio input file.

    Artist and title are taken from an ``"<artist> - <title>"`` file name when
    it has that shape, and are replaced by the result of ``get_music_infos``
    when that lookup yields a title. The input file is copied into a fresh
    song output folder and renamed to ``"<artist> - <title><ext>"`` when both
    values are known.

    Returns:
        ``(basename_without_ext, song_output, ultrastar_audio_input_path,
        (title, artist, year_info, genre_info))``
    """
    basename = os.path.basename(settings.input_file_path)
    basename_without_ext, extension = os.path.splitext(basename)

    artist, title = None, None
    if " - " in basename_without_ext:
        artist, title = basename_without_ext.split(" - ", 1)
        search_string = f"{artist} - {title}"
    else:
        search_string = basename_without_ext

    # Get additional data for song
    (title_info, artist_info, year_info, genre_info) = get_music_infos(search_string)

    if title_info is not None:
        title = title_info
        artist = artist_info

    if artist is not None and title is not None:
        # Looked-up metadata may contain characters that are invalid in file
        # names (e.g. "AC/DC"), so sanitize it the same way
        # download_from_youtube() does before using it as a folder/file name.
        basename_without_ext = sanitize_filename(f"{artist} - {title}")
        basename = f"{basename_without_ext}{extension}"

    song_output = os.path.join(settings.output_file_path, basename_without_ext)
    song_output = get_unused_song_output_dir(song_output)
    os_helper.create_folder(song_output)

    # Copy under the original name first, then rename to the normalized name.
    os_helper.copy(settings.input_file_path, song_output)
    os_helper.rename(
        os.path.join(song_output, os.path.basename(settings.input_file_path)),
        os.path.join(song_output, basename),
    )
    ultrastar_audio_input_path = os.path.join(song_output, basename)
    return (
        basename_without_ext,
        song_output,
        ultrastar_audio_input_path,
        (title, artist, year_info, genre_info),
    )
|
| 737 |
+
|
| 738 |
+
|
| 739 |
+
# Pairs of (characters to replace, replacement) applied by sanitize_filename().
FILENAME_REPLACEMENTS = (('?:"', ""), ("<", "("), (">", ")"), ("/\\|*", "-"))


def sanitize_filename(fname: str) -> str:
    """Return ``fname`` with the FILENAME_REPLACEMENTS substitutions applied.

    If the substituted name ends with a period, trailing periods and spaces
    are stripped as well.
    """
    # Expand the (characters, replacement) pairs into a per-character table so
    # the whole name is rewritten in a single pass.
    table = str.maketrans(
        {char: new for old, new in FILENAME_REPLACEMENTS for char in old}
    )
    cleaned = fname.translate(table)
    # Windows does not like trailing periods
    return cleaned.rstrip(" .") if cleaned.endswith(".") else cleaned
|
| 750 |
+
|
| 751 |
+
|
| 752 |
+
def download_from_youtube() -> tuple[str, str, str, tuple[str, str, str, str]]:
    """Download audio, video and thumbnail for the YouTube URL in the settings.

    Artist and title come from ``get_youtube_title`` and are replaced by the
    ``get_music_infos`` result when that lookup yields a title. All downloads
    go into a fresh song output folder named after the sanitized
    ``"<artist> - <title>"`` string.

    Returns:
        ``(basename_without_ext, song_output, ultrastar_audio_input_path,
        (title, artist, year_info, genre_info))``
    """
    artist, title = get_youtube_title(settings.input_file_path)

    # Get additional data for song
    title_info, artist_info, year_info, genre_info = get_music_infos(
        f"{artist} - {title}"
    )
    if title_info is not None:
        title, artist = title_info, artist_info

    basename_without_ext = sanitize_filename(f"{artist} - {title}")
    song_output = get_unused_song_output_dir(
        os.path.join(settings.output_file_path, basename_without_ext)
    )
    os_helper.create_folder(song_output)

    # The three download helpers take the same arguments; run them in order.
    for download in (
        download_youtube_audio,
        download_youtube_video,
        download_youtube_thumbnail,
    ):
        download(settings.input_file_path, basename_without_ext, song_output)

    ultrastar_audio_input_path = os.path.join(
        song_output, basename_without_ext + ".mp3"
    )
    return (
        basename_without_ext,
        song_output,
        ultrastar_audio_input_path,
        (title, artist, year_info, genre_info),
    )
|
| 779 |
+
|
| 780 |
+
|
| 781 |
+
def parse_ultrastar_txt() -> tuple[str, float, str, str, UltrastarTxtValue]:
    """Parse the input Ultrastar txt and prepare its song output folder.

    The audio file named in the txt is expected next to the txt itself. The
    output folder is named ``"<artist> - <title>"`` (both stripped) inside
    ``settings.output_file_path``.

    Returns:
        ``(basename_without_ext, real_bpm, song_output,
        ultrastar_audio_input_path, ultrastar_class)``
    """
    ultrastar_class = ultrastar_parser.parse_ultrastar_txt(settings.input_file_path)

    # The BPM text may use a decimal comma; normalize it before converting.
    bpm_value = float(ultrastar_class.bpm.replace(",", "."))
    real_bpm = ultrastar_converter.ultrastar_bpm_to_real_bpm(bpm_value)

    mp3_name = ultrastar_class.mp3
    basename_without_ext = os.path.splitext(mp3_name)[0]
    ultrastar_audio_input_path = os.path.join(
        os.path.dirname(settings.input_file_path), mp3_name
    )

    folder_name = ultrastar_class.artist.strip() + " - " + ultrastar_class.title.strip()
    song_output = get_unused_song_output_dir(
        str(os.path.join(settings.output_file_path, folder_name))
    )
    os_helper.create_folder(song_output)

    return (
        str(basename_without_ext),
        real_bpm,
        song_output,
        str(ultrastar_audio_input_path),
        ultrastar_class,
    )
|
| 807 |
+
|
| 808 |
+
|
| 809 |
+
def create_midi_file(real_bpm: float,
                     song_output: str,
                     ultrastar_class: UltrastarTxtValue,
                     basename_without_ext: str) -> None:
    """Write ``<basename_without_ext>.mid`` into ``song_output``.

    The Ultrastar data is converted into a single instrument, which is then
    written at ``real_bpm``.
    """
    print(f"{ULTRASINGER_HEAD} Creating Midi with {blue_highlighted('pretty_midi')}")

    instruments = [midi_creator.convert_ultrastar_to_midi_instrument(ultrastar_class)]
    midi_path = os.path.join(song_output, f"{basename_without_ext}.mid")
    midi_creator.instruments_to_midi(instruments, real_bpm, midi_path)
|
| 825 |
+
|
| 826 |
+
|
| 827 |
+
def pitch_audio(is_audio: bool, transcribed_data: list[TranscribedData], ultrastar_class: UltrastarTxtValue) -> tuple[
    list[str], PitchedData, list[int]]:
    """Pitch the processing audio and derive one note per timed segment.

    Segment boundaries come from ``transcribed_data`` when ``is_audio`` is
    true, otherwise from the start/end times stored in ``ultrastar_class``.

    Returns:
        ``(midi_notes, pitched_data, ultrastar_note_numbers)``
    """
    # todo: chunk pitching as option?
    # midi_notes = pitch_each_chunk_with_crepe(chunk_folder_name)
    device = settings.tensorflow_device if not settings.force_crepe_cpu else "cpu"
    pitched_data = get_pitch_with_crepe_file(
        settings.processing_audio_path,
        settings.crepe_model_capacity,
        settings.crepe_step_size,
        device,
    )

    if is_audio:
        segment_starts = [segment.start for segment in transcribed_data]
        segment_ends = [segment.end for segment in transcribed_data]
    else:
        segment_starts = ultrastar_class.startTimes
        segment_ends = ultrastar_class.endTimes

    midi_notes = create_midi_notes_from_pitched_data(
        segment_starts, segment_ends, pitched_data
    )
    ultrastar_note_numbers = convert_midi_notes_to_ultrastar_notes(midi_notes)
    return midi_notes, pitched_data, ultrastar_note_numbers
|
| 855 |
+
|
| 856 |
+
|
| 857 |
+
def create_audio_chunks(
        cache_path: str,
        is_audio: bool,
        transcribed_data: list[TranscribedData],
        ultrastar_audio_input_path: str,
        ultrastar_class: UltrastarTxtValue
) -> None:
    """Export per-segment audio chunks into the cache folder.

    For audio input the chunks are cut from ``settings.processing_audio_path``
    using ``transcribed_data``, and the transcription is also exported to
    ``_chunks.csv``. Otherwise the chunks are cut from
    ``ultrastar_audio_input_path`` using ``ultrastar_class``.
    """
    audio_chunks_path = os.path.join(cache_path, settings.audio_chunk_folder_name)
    os_helper.create_folder(audio_chunks_path)

    if not is_audio:
        export_chunks_from_ultrastar_data(
            ultrastar_audio_input_path, ultrastar_class, audio_chunks_path
        )
        return

    # Audio input: chunks plus a csv listing of the transcription.
    csv_path = os.path.join(audio_chunks_path, "_chunks.csv")
    export_chunks_from_transcribed_data(
        settings.processing_audio_path, transcribed_data, audio_chunks_path
    )
    export_transcribed_data_to_csv(transcribed_data, csv_path)
|
| 879 |
+
|
| 880 |
+
def denoise_vocal_audio(input_path: str, output_path: str) -> None:
    """Run ``ffmpeg_reduce_noise`` on ``input_path``, writing to ``output_path``."""
    ffmpeg_reduce_noise(input_path, output_path)
|
| 883 |
+
|
| 884 |
+
|
| 885 |
+
def main(argv: list[str]) -> None:
    """Program entry point.

    Prints the version banner, loads the settings from ``argv``, runs the
    pipeline and then exits the interpreter via ``sys.exit()``.
    """
    print_version()
    init_settings(argv)
    run()
    sys.exit()
|
| 891 |
+
|
| 892 |
+
def remove_cache_folder(cache_path: str) -> None:
    """Delete the cache folder at ``cache_path`` via ``os_helper.remove_folder``."""
    os_helper.remove_folder(cache_path)
|
| 895 |
+
|
| 896 |
+
def _parse_bool_arg(value: str) -> bool:
    """Interpret a command line value as a boolean.

    ``true``, ``1``, ``yes`` and ``on`` (case-insensitive, surrounding
    whitespace ignored) are truthy; every other value is falsy.
    """
    return value.strip().lower() in ("true", "1", "yes", "on")


def init_settings(argv: list[str]) -> None:
    """Populate the global ``settings`` from the command line arguments.

    Prints the help text and exits when no option is given or ``-h`` is
    passed. Exits with status 1 on an unsupported ``--format_version``.
    When no output path is given it defaults to an ``output`` folder next to
    the input file (or in the current working directory for ``https:``
    inputs). Unless CPU use is forced, the GPU devices are detected last.

    Raises:
        getopt.GetoptError: on unknown options or missing option values.
    """
    long_options, short_options = arg_options()
    opts, _ = getopt.getopt(argv, short_options, long_options)
    if not opts:
        print_help()
        sys.exit()

    # Option names are compared with ``==``: ``opt in ("--name")`` would be a
    # substring test against a plain string, not tuple membership.
    for opt, arg in opts:
        if opt == "-h":
            print_help()
            sys.exit()
        elif opt in ("-i", "--ifile"):
            settings.input_file_path = arg
        elif opt in ("-o", "--ofile"):
            settings.output_file_path = arg
        elif opt == "--whisper":
            settings.transcriber = "whisper"
            settings.whisper_model = arg
        elif opt == "--whisper_align_model":
            settings.whisper_align_model = arg
        elif opt == "--whisper_batch_size":
            settings.whisper_batch_size = int(arg)
        elif opt == "--whisper_compute_type":
            settings.whisper_compute_type = arg
        elif opt == "--language":
            settings.language = arg
        elif opt == "--crepe":
            settings.crepe_model_capacity = arg
        elif opt == "--crepe_step_size":
            settings.crepe_step_size = int(arg)
        elif opt == "--plot":
            settings.create_plot = _parse_bool_arg(arg)
        elif opt == "--midi":
            settings.create_midi = _parse_bool_arg(arg)
        elif opt == "--hyphenation":
            # Parsed explicitly instead of eval()-ing command line text.
            settings.hyphenation = _parse_bool_arg(arg)
        elif opt == "--disable_separation":
            # "--disable_separation False" must keep separation enabled;
            # negating the raw (non-empty) string always disabled it.
            settings.use_separated_vocal = not _parse_bool_arg(arg)
        elif opt == "--disable_karaoke":
            settings.create_karaoke = not _parse_bool_arg(arg)
        elif opt == "--create_audio_chunks":
            settings.create_audio_chunks = _parse_bool_arg(arg)
        elif opt == "--force_cpu":
            settings.force_cpu = _parse_bool_arg(arg)
            if settings.force_cpu:
                # Hide all CUDA devices from the ML frameworks.
                os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
        elif opt == "--force_whisper_cpu":
            settings.force_whisper_cpu = _parse_bool_arg(arg)
        elif opt == "--force_crepe_cpu":
            settings.force_crepe_cpu = _parse_bool_arg(arg)
        elif opt == "--format_version":
            if arg not in ("0.3.0", "1.0.0", "1.1.0"):
                print(
                    f"{ULTRASINGER_HEAD} {red_highlighted('Error: Format version')} {blue_highlighted(arg)} {red_highlighted('is not supported.')}"
                )
                sys.exit(1)
            settings.format_version = arg
        elif opt == "--keep_cache":
            # Declared without "=" in arg_options(), so getopt passes an empty
            # value; the presence of the flag alone enables it.
            settings.keep_cache = _parse_bool_arg(arg) if arg else True

    if settings.output_file_path == "":
        if settings.input_file_path.startswith("https:"):
            dirname = os.getcwd()
        else:
            dirname = os.path.dirname(settings.input_file_path)
        settings.output_file_path = os.path.join(dirname, "output")

    if not settings.force_cpu:
        settings.tensorflow_device, settings.pytorch_device = check_gpu_support()
|
| 964 |
+
|
| 965 |
+
|
| 966 |
+
def arg_options():
    """Return ``(long, short)`` option specifications for ``getopt.getopt``.

    In the long list a trailing ``=`` marks an option that requires a value;
    ``keep_cache`` is the only value-less flag.
    """
    valued_options = (
        "ifile",
        "ofile",
        "crepe",
        "crepe_step_size",
        "whisper",
        "whisper_align_model",
        "whisper_batch_size",
        "whisper_compute_type",
        "language",
        "plot",
        "midi",
        "hyphenation",
        "disable_separation",
        "disable_karaoke",
        "create_audio_chunks",
        "force_cpu",
        "force_whisper_cpu",
        "force_crepe_cpu",
        "format_version",
    )
    long_options = [f"{name}=" for name in valued_options]
    long_options.append("keep_cache")
    short_options = "hi:o:amv:"
    return long_options, short_options
|
| 991 |
+
|
| 992 |
+
|
| 993 |
+
# Script entry point: pass the command line arguments without the program name.
if __name__ == "__main__":
    main(sys.argv[1:])
|