Add experimental vocal separation and selective transcription features
Browse files
- Added **Separate Vocals** option (experimental): Splits input audio into vocals and music stems before processing.
- Added **Transcription Target** setting: Allows choosing which stem (vocals or music) to transcribe to MIDI.
- Added option to **Re-merge Other Part with Rendered Audio**: After rendering, merges the non-transcribed stem (e.g., original vocals) back with the new music.
- app.py +364 -160
- requirements.txt +6 -2
app.py
CHANGED
|
@@ -50,8 +50,13 @@ import soundfile as sf
|
|
| 50 |
import torch
|
| 51 |
import gradio as gr
|
| 52 |
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
|
|
|
| 55 |
from piano_transcription_inference import PianoTranscription, utilities, sample_rate as transcription_sample_rate
|
| 56 |
|
| 57 |
# --- Import core transcription and MIDI processing libraries ---
|
|
@@ -1042,6 +1047,9 @@ def recommend_8bit_params(midi_data, default_preset):
|
|
| 1042 |
def process_and_render_file(input_file,
|
| 1043 |
# --- Pass the preset selector value ---
|
| 1044 |
s8bit_preset_selector,
|
|
|
|
|
|
|
|
|
|
| 1045 |
# --- Transcription params ---
|
| 1046 |
enable_stereo_processing,
|
| 1047 |
transcription_method,
|
|
@@ -1071,14 +1079,9 @@ def process_and_render_file(input_file,
|
|
| 1071 |
filename = os.path.basename(input_file_path)
|
| 1072 |
print(f"Processing new file: {filename}")
|
| 1073 |
|
| 1074 |
-
|
| 1075 |
-
|
| 1076 |
-
|
| 1077 |
-
except Exception as e:
|
| 1078 |
-
# If loading fails, it might be a MIDI file, which librosa cannot handle.
|
| 1079 |
-
# We will proceed, assuming it's a MIDI, and let pretty_midi handle it later.
|
| 1080 |
-
print(f"Could not load as audio: {e}. Assuming it is a MIDI file.")
|
| 1081 |
-
pass
|
| 1082 |
|
| 1083 |
# --- Step 1: Check file type and transcribe if necessary ---
|
| 1084 |
if filename.lower().endswith(('.mid', '.midi', '.kar')):
|
|
@@ -1086,42 +1089,95 @@ def process_and_render_file(input_file,
|
|
| 1086 |
midi_path_for_rendering = input_file_path
|
| 1087 |
else: #if filename.lower().endswith(('.wav', '.mp3'))
|
| 1088 |
print("Audio file detected. Starting transcription...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1089 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1090 |
base_name = os.path.splitext(filename)[0]
|
| 1091 |
-
temp_dir = "output/
|
| 1092 |
os.makedirs(temp_dir, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1093 |
|
| 1094 |
# === STEREO PROCESSING LOGIC ===
|
| 1095 |
if enable_stereo_processing:
|
| 1096 |
-
if
|
| 1097 |
print("Warning: Audio is not stereo or could not be loaded as stereo. Falling back to mono transcription.")
|
| 1098 |
enable_stereo_processing = False # Disable stereo processing if audio is not stereo
|
| 1099 |
|
| 1100 |
if enable_stereo_processing:
|
| 1101 |
-
print("Stereo processing enabled. Splitting channels...")
|
| 1102 |
try:
|
| 1103 |
-
|
| 1104 |
-
|
| 1105 |
|
| 1106 |
-
normalized_left = normalize_loudness(
|
| 1107 |
-
normalized_right = normalize_loudness(
|
| 1108 |
|
| 1109 |
-
|
| 1110 |
-
|
| 1111 |
|
| 1112 |
-
sf.write(
|
| 1113 |
-
sf.write(
|
| 1114 |
|
| 1115 |
-
print(f"Saved left channel to: {
|
| 1116 |
-
print(f"Saved right channel to: {
|
| 1117 |
|
| 1118 |
print("Transcribing left and right channel...")
|
| 1119 |
if transcription_method == "General Purpose":
|
| 1120 |
-
midi_path_left = TranscribeGeneralAudio(
|
| 1121 |
-
midi_path_right = TranscribeGeneralAudio(
|
| 1122 |
-
else:
|
| 1123 |
-
midi_path_left = TranscribePianoAudio(
|
| 1124 |
-
midi_path_right = TranscribePianoAudio(
|
| 1125 |
|
| 1126 |
if midi_path_left and midi_path_right:
|
| 1127 |
merged_midi_path = os.path.join(temp_dir, f"{base_name}_merged.mid")
|
|
@@ -1139,24 +1195,22 @@ def process_and_render_file(input_file,
|
|
| 1139 |
print(f"An error occurred during stereo processing: {e}")
|
| 1140 |
raise gr.Error(f"Stereo Processing Failed: {e}")
|
| 1141 |
else: # Standard mono transcription
|
| 1142 |
-
print("
|
| 1143 |
-
|
| 1144 |
-
|
| 1145 |
-
|
| 1146 |
-
|
| 1147 |
-
|
| 1148 |
-
|
| 1149 |
-
normalized_mono = normalize_loudness(mono_signal, native_sample_rate)
|
| 1150 |
|
| 1151 |
-
|
| 1152 |
-
|
| 1153 |
-
|
| 1154 |
|
| 1155 |
try:
|
| 1156 |
if transcription_method == "General Purpose":
|
| 1157 |
-
midi_path_for_rendering = TranscribeGeneralAudio(
|
| 1158 |
else: # Piano-Specific
|
| 1159 |
-
midi_path_for_rendering = TranscribePianoAudio(
|
| 1160 |
except Exception as e:
|
| 1161 |
print(f"An error occurred during transcription: {e}")
|
| 1162 |
raise gr.Error(f"Transcription Failed: {e}")
|
|
@@ -1216,7 +1270,43 @@ def process_and_render_file(input_file,
|
|
| 1216 |
synth_params['fm_modulation_depth'],
|
| 1217 |
synth_params['fm_modulation_rate']
|
| 1218 |
)
|
| 1219 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1220 |
print(f'Total processing time: {(reqtime.time() - start_time):.2f} sec')
|
| 1221 |
print('*' * 70)
|
| 1222 |
|
|
@@ -1308,7 +1398,18 @@ if __name__ == "__main__":
|
|
| 1308 |
if not soundfonts_dict:
|
| 1309 |
print("\nWARNING: No SoundFonts were found or could be downloaded.")
|
| 1310 |
print("Rendering with SoundFonts will fail. Only the 8-bit synthesizer will be available.")
|
| 1311 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1312 |
# --- Define a constant for the fallback preset name ---
|
| 1313 |
# This prevents errors if the preset name is changed in the dictionary.
|
| 1314 |
FALLBACK_PRESET_NAME = "Generic Chiptune Loop"
|
|
@@ -1318,43 +1419,7 @@ if __name__ == "__main__":
|
|
| 1318 |
# Comprehensive preset dictionary including new JRPG and Handheld classics
|
| 1319 |
# Note: Vibrato depth is mapped to a representative value on the 0-50 Hz slider.
|
| 1320 |
S8BIT_PRESETS = {
|
| 1321 |
-
# ---
|
| 1322 |
-
"Rhythm Pop Lead (Rhythm Tengoku / リズム天国)": {
|
| 1323 |
-
# Description: A clean, round square wave perfect for the snappy, catchy feel of rhythm games.
|
| 1324 |
-
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.18,
|
| 1325 |
-
'vibrato_rate': 4.5, 'vibrato_depth': 4,
|
| 1326 |
-
'smooth_notes_level': 0.9, # Formerly True -> 1.0; slightly reduced for a bit more attack.
|
| 1327 |
-
'continuous_vibrato_level': 0.8, # Formerly True -> 1.0; slightly weakened for more defined note transitions.
|
| 1328 |
-
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
|
| 1329 |
-
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1330 |
-
},
|
| 1331 |
-
"Arcade Brawler Lead (Street Fighter / ストリートファイター)": {
|
| 1332 |
-
# Description: A gritty sawtooth lead with a hard attack, capturing the high-energy feel of classic fighting games.
|
| 1333 |
-
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15,
|
| 1334 |
-
'vibrato_rate': 5.0, 'vibrato_depth': 6,
|
| 1335 |
-
'smooth_notes_level': 0.8,
|
| 1336 |
-
'continuous_vibrato_level': 0.7,
|
| 1337 |
-
'bass_boost_level': 0.4, 'noise_level': 0.05, 'distortion_level': 0.1,
|
| 1338 |
-
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1339 |
-
},
|
| 1340 |
-
"Mega Man (Rockman / ロックマン)": {
|
| 1341 |
-
# Description: A thin, sharp square wave lead with fast vibrato, iconic for its driving, heroic melodies.
|
| 1342 |
-
'waveform_type': 'Square', 'pulse_width': 0.2, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15,
|
| 1343 |
-
'vibrato_rate': 6.0, 'vibrato_depth': 8,
|
| 1344 |
-
'smooth_notes_level': 0.9,
|
| 1345 |
-
'continuous_vibrato_level': 0.85,
|
| 1346 |
-
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.05,
|
| 1347 |
-
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1348 |
-
},
|
| 1349 |
-
"Kirby's Bubbly Melody (Hoshi no Kirby / 星のカービィ)": {
|
| 1350 |
-
# Description: A soft, round square wave with a bouncy vibrato, creating a cheerful and adorable sound.
|
| 1351 |
-
'waveform_type': 'Square', 'pulse_width': 0.4, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2,
|
| 1352 |
-
'vibrato_rate': 6.0, 'vibrato_depth': 4,
|
| 1353 |
-
'smooth_notes_level': 0.85,
|
| 1354 |
-
'continuous_vibrato_level': 0.3, # Formerly False (0.0); adds a hint of continuity for more liveliness.
|
| 1355 |
-
'bass_boost_level': 0.1, 'noise_level': 0.0, 'distortion_level': 0.0,
|
| 1356 |
-
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1357 |
-
},
|
| 1358 |
"Mario (Super Mario Bros / スーパーマリオブラザーズ)": {
|
| 1359 |
# Description: A bright square wave with a per-note vibrato, producing the classic bouncy platformer sound.
|
| 1360 |
'waveform_type': 'Square', 'pulse_width': 0.3, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.25,
|
|
@@ -1364,41 +1429,13 @@ if __name__ == "__main__":
|
|
| 1364 |
'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0,
|
| 1365 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1366 |
},
|
| 1367 |
-
|
| 1368 |
-
|
| 1369 |
-
|
| 1370 |
-
'
|
| 1371 |
-
'vibrato_rate': 3.5, 'vibrato_depth': 5,
|
| 1372 |
-
'smooth_notes_level': 0.95,
|
| 1373 |
-
'continuous_vibrato_level': 0.9,
|
| 1374 |
-
'bass_boost_level': 0.5, 'noise_level': 0.1, 'distortion_level': 0.15,
|
| 1375 |
-
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1376 |
-
},
|
| 1377 |
-
"Mystic Mana Pad (Secret of Mana / 聖剣伝説2)": {
|
| 1378 |
-
# Description: A warm, ethereal square wave pad with slow vibrato, capturing a feeling of fantasy and wonder.
|
| 1379 |
-
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5,
|
| 1380 |
-
'vibrato_rate': 2.5, 'vibrato_depth': 4,
|
| 1381 |
-
'smooth_notes_level': 1.0,
|
| 1382 |
-
'continuous_vibrato_level': 0.95,
|
| 1383 |
-
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
|
| 1384 |
-
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1385 |
-
},
|
| 1386 |
-
"Dragon Quest (ドラゴンクエスト)": {
|
| 1387 |
-
# Description: A pure triangle wave with a long decay, mimicking the grand, orchestral feel of a classical flute or string section.
|
| 1388 |
-
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.6,
|
| 1389 |
-
'vibrato_rate': 3.0, 'vibrato_depth': 4,
|
| 1390 |
-
'smooth_notes_level': 0.9,
|
| 1391 |
-
'continuous_vibrato_level': 0.9,
|
| 1392 |
-
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
|
| 1393 |
-
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1394 |
-
},
|
| 1395 |
-
"ONI V (Wafu Mystic / ONI V 隠忍を継ぐ者)": {
|
| 1396 |
-
# Description: A solemn triangle wave with a slow, expressive vibrato, evoking the mysterious atmosphere of Japanese folklore.
|
| 1397 |
-
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
|
| 1398 |
-
'vibrato_rate': 3.5, 'vibrato_depth': 3,
|
| 1399 |
'smooth_notes_level': 0.9,
|
| 1400 |
'continuous_vibrato_level': 0.85,
|
| 1401 |
-
'bass_boost_level': 0.
|
| 1402 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1403 |
},
|
| 1404 |
"Zelda (The Legend of Zelda / ゼルダの伝説)": {
|
|
@@ -1410,23 +1447,22 @@ if __name__ == "__main__":
|
|
| 1410 |
'bass_boost_level': 0.15, 'noise_level': 0.0, 'distortion_level': 0.0,
|
| 1411 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1412 |
},
|
| 1413 |
-
|
| 1414 |
-
|
| 1415 |
-
|
| 1416 |
-
'
|
| 1417 |
-
'vibrato_rate': 5.5, 'vibrato_depth': 6,
|
| 1418 |
'smooth_notes_level': 0.85,
|
| 1419 |
-
'continuous_vibrato_level': 0.
|
| 1420 |
-
'bass_boost_level': 0.
|
| 1421 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1422 |
},
|
| 1423 |
-
"
|
| 1424 |
-
# Description: A
|
| 1425 |
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.22,
|
| 1426 |
-
'vibrato_rate': 5.0, 'vibrato_depth':
|
| 1427 |
'smooth_notes_level': 0.9,
|
| 1428 |
-
'continuous_vibrato_level': 0.
|
| 1429 |
-
'bass_boost_level': 0.
|
| 1430 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1431 |
},
|
| 1432 |
"Castlevania (Akumajō Dracula / 悪魔城ドラキュラ)": {
|
|
@@ -1438,13 +1474,22 @@ if __name__ == "__main__":
|
|
| 1438 |
'bass_boost_level': 0.35, 'noise_level': 0.0, 'distortion_level': 0.0,
|
| 1439 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1440 |
},
|
| 1441 |
-
"
|
| 1442 |
-
# Description: A
|
| 1443 |
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.22,
|
| 1444 |
-
'vibrato_rate': 5.0, 'vibrato_depth':
|
| 1445 |
'smooth_notes_level': 0.9,
|
| 1446 |
-
'continuous_vibrato_level': 0.
|
| 1447 |
-
'bass_boost_level': 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1448 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1449 |
},
|
| 1450 |
# --- Advanced System Impressions ---
|
|
@@ -1484,7 +1529,155 @@ if __name__ == "__main__":
|
|
| 1484 |
'bass_boost_level': 0.1, 'noise_level': 0.0, 'distortion_level': 0.0,
|
| 1485 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1486 |
},
|
| 1487 |
-
# ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1488 |
"Sci-Fi Energy Field": {
|
| 1489 |
# Description: (SFX) High-speed vibrato and noise create a constant, shimmering hum suitable for energy shields or force fields.
|
| 1490 |
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
|
|
@@ -1530,7 +1723,7 @@ if __name__ == "__main__":
|
|
| 1530 |
'bass_boost_level': 0.8, 'noise_level': 0.2, 'distortion_level': 0.5,
|
| 1531 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1532 |
},
|
| 1533 |
-
# --- Utility ---
|
| 1534 |
"Generic Chiptune Loop": {
|
| 1535 |
# Description: A well-balanced, pleasant square wave lead that serves as a great starting point for custom sounds.
|
| 1536 |
'waveform_type': 'Square', 'pulse_width': 0.25, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2,
|
|
@@ -1540,35 +1733,14 @@ if __name__ == "__main__":
|
|
| 1540 |
'bass_boost_level': 0.25, 'noise_level': 0.0, 'distortion_level': 0.0,
|
| 1541 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1542 |
},
|
| 1543 |
-
"Dark/Boss Atmosphere (Shin Megami Tensei / 真・女神転生)": {
|
| 1544 |
-
# Description: An aggressive sawtooth, inspired by the dark, rock-infused themes of SMT.
|
| 1545 |
-
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.35,
|
| 1546 |
-
'vibrato_rate': 7.0, 'vibrato_depth': 12,
|
| 1547 |
-
'smooth_notes_level': 0.1,
|
| 1548 |
-
'continuous_vibrato_level': 0.0,
|
| 1549 |
-
'bass_boost_level': 0.4, 'noise_level': 0.15, 'distortion_level': 0.25,
|
| 1550 |
-
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1551 |
-
},
|
| 1552 |
-
"Modern JRPG Pad (Persona / ペルソナ)": {
|
| 1553 |
-
# Description: A warm, stylish square wave pad, capturing the modern, pop/jazz-infused feel of the Persona series.
|
| 1554 |
-
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5,
|
| 1555 |
-
'vibrato_rate': 2.5, 'vibrato_depth': 4,
|
| 1556 |
-
'smooth_notes_level': 1.0,
|
| 1557 |
-
'continuous_vibrato_level': 0.95,
|
| 1558 |
-
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
|
| 1559 |
-
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1560 |
-
},
|
| 1561 |
-
"Tactical Brass (Fire Emblem / ファイアーエムブレム)": {
|
| 1562 |
-
# Description: A powerful, sustained sawtooth emulating the bold, heroic synth-brass of Fire Emblem's tactical themes.
|
| 1563 |
-
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
|
| 1564 |
-
'vibrato_rate': 3.5, 'vibrato_depth': 5,
|
| 1565 |
-
'smooth_notes_level': 0.95,
|
| 1566 |
-
'continuous_vibrato_level': 0.9,
|
| 1567 |
-
'bass_boost_level': 0.5, 'noise_level': 0.1, 'distortion_level': 0.15,
|
| 1568 |
-
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1569 |
-
}
|
| 1570 |
}
|
| 1571 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1572 |
app = gr.Blocks(theme=gr.themes.Base())
|
| 1573 |
|
| 1574 |
with app:
|
|
@@ -1611,6 +1783,27 @@ if __name__ == "__main__":
|
|
| 1611 |
info="If checked, left/right audio channels are transcribed separately and merged. Doubles processing time."
|
| 1612 |
)
|
| 1613 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1614 |
with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings:
|
| 1615 |
onset_threshold = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="On-set Threshold", info="Sensitivity for detecting note beginnings. Higher is stricter.")
|
| 1616 |
frame_threshold = gr.Slider(0.0, 1.0, value=0.3, step=0.05, label="Frame Threshold", info="Sensitivity for detecting active notes. Higher is stricter.")
|
|
@@ -1727,7 +1920,7 @@ if __name__ == "__main__":
|
|
| 1727 |
s8bit_waveform_type = gr.Dropdown(['Square', 'Sawtooth', 'Triangle'], value='Square', label="Waveform Type")
|
| 1728 |
s8bit_pulse_width = gr.Slider(0.01, 0.99, value=0.5, step=0.01, label="Pulse Width (Square Wave Only)")
|
| 1729 |
s8bit_envelope_type = gr.Dropdown(['Plucky (AD Envelope)', 'Sustained (Full Decay)'], value='Plucky (AD Envelope)', label="Envelope Type")
|
| 1730 |
-
s8bit_decay_time_s = gr.Slider(0.01, 0
|
| 1731 |
s8bit_vibrato_rate = gr.Slider(0, 20, value=5, label="Vibrato Rate (Hz)")
|
| 1732 |
s8bit_vibrato_depth = gr.Slider(0, 50, value=0, label="Vibrato Depth (Hz)")
|
| 1733 |
s8bit_bass_boost_level = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Bass Boost Level", info="Adjusts the volume of the sub-octave. 0 is off.")
|
|
@@ -1774,7 +1967,11 @@ if __name__ == "__main__":
|
|
| 1774 |
# all_inputs now includes the preset selector itself
|
| 1775 |
# Inputs for the main processing function
|
| 1776 |
all_inputs = [
|
| 1777 |
-
input_file, s8bit_preset_selector,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1778 |
transcription_method, onset_threshold, frame_threshold, minimum_note_length,
|
| 1779 |
minimum_frequency, maximum_frequency, infer_onsets, melodia_trick, multiple_pitch_bends,
|
| 1780 |
render_type, soundfont_bank, render_sample_rate, render_with_sustains,
|
|
@@ -1810,6 +2007,13 @@ if __name__ == "__main__":
|
|
| 1810 |
inputs=all_inputs,
|
| 1811 |
outputs=all_outputs # Pass the combined list
|
| 1812 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1813 |
|
| 1814 |
# --- Listeners for dynamic UI updates ---
|
| 1815 |
transcription_method.change(
|
|
|
|
| 50 |
import torch
|
| 51 |
import gradio as gr
|
| 52 |
|
| 53 |
+
# --- Imports for Vocal Separation ---
|
| 54 |
+
import torchaudio
|
| 55 |
+
from demucs.apply import apply_model
|
| 56 |
+
from demucs.pretrained import get_model
|
| 57 |
+
from demucs.audio import convert_audio
|
| 58 |
|
| 59 |
+
from src.piano_transcription.utils import initialize_app
|
| 60 |
from piano_transcription_inference import PianoTranscription, utilities, sample_rate as transcription_sample_rate
|
| 61 |
|
| 62 |
# --- Import core transcription and MIDI processing libraries ---
|
|
|
|
| 1047 |
def process_and_render_file(input_file,
|
| 1048 |
# --- Pass the preset selector value ---
|
| 1049 |
s8bit_preset_selector,
|
| 1050 |
+
separate_vocals,
|
| 1051 |
+
remerge_vocals,
|
| 1052 |
+
transcription_target,
|
| 1053 |
# --- Transcription params ---
|
| 1054 |
enable_stereo_processing,
|
| 1055 |
transcription_method,
|
|
|
|
| 1079 |
filename = os.path.basename(input_file_path)
|
| 1080 |
print(f"Processing new file: {filename}")
|
| 1081 |
|
| 1082 |
+
# This will store the other part if separation is performed
|
| 1083 |
+
other_part_tensor = None
|
| 1084 |
+
other_part_sr = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1085 |
|
| 1086 |
# --- Step 1: Check file type and transcribe if necessary ---
|
| 1087 |
if filename.lower().endswith(('.mid', '.midi', '.kar')):
|
|
|
|
| 1089 |
midi_path_for_rendering = input_file_path
|
| 1090 |
else: #if filename.lower().endswith(('.wav', '.mp3'))
|
| 1091 |
print("Audio file detected. Starting transcription...")
|
| 1092 |
+
|
| 1093 |
+
try:
|
| 1094 |
+
# Use torchaudio to load directly into a tensor, as demucs needs it.
|
| 1095 |
+
# This is more efficient than loading with librosa then converting.
|
| 1096 |
+
audio_tensor, native_sample_rate = torchaudio.load(input_file_path)
|
| 1097 |
+
except Exception as e:
|
| 1098 |
+
raise gr.Error(f"Failed to load audio file: {e}")
|
| 1099 |
|
| 1100 |
+
# --- Demucs Vocal Separation Logic, now decides which stem to process ---
|
| 1101 |
+
if separate_vocals:
|
| 1102 |
+
if demucs_model is None:
|
| 1103 |
+
raise gr.Error("Demucs model is not loaded. Cannot separate vocals.")
|
| 1104 |
+
|
| 1105 |
+
# Convert to a common format (stereo, float32) that demucs expects
|
| 1106 |
+
audio_tensor = convert_audio(audio_tensor, native_sample_rate, demucs_model.samplerate, demucs_model.audio_channels)
|
| 1107 |
+
|
| 1108 |
+
if torch.cuda.is_available():
|
| 1109 |
+
audio_tensor = audio_tensor.cuda()
|
| 1110 |
+
|
| 1111 |
+
print("Separating audio with Demucs... This may take some time.")
|
| 1112 |
+
all_stems = apply_model(demucs_model, audio_tensor[None], device='cuda' if torch.cuda.is_available() else 'cpu', progress=True)[0]
|
| 1113 |
+
|
| 1114 |
+
vocals_idx = demucs_model.sources.index('vocals')
|
| 1115 |
+
# Sum all stems that are NOT vocals to get the accompaniment
|
| 1116 |
+
accompaniment_indices = [i for i, source in enumerate(demucs_model.sources) if source != 'vocals']
|
| 1117 |
+
|
| 1118 |
+
vocals_tensor = all_stems[vocals_idx]
|
| 1119 |
+
accompaniment_tensor = all_stems[accompaniment_indices].sum(0)
|
| 1120 |
+
|
| 1121 |
+
# --- The new core branching logic ---
|
| 1122 |
+
if transcription_target == "Transcribe Vocals":
|
| 1123 |
+
print("Target: Transcribing VOCALS.")
|
| 1124 |
+
tensor_to_process = vocals_tensor
|
| 1125 |
+
other_part_tensor = accompaniment_tensor # Save accompaniment for re-merging
|
| 1126 |
+
else: # Default to "Transcribe Music (Accompaniment)"
|
| 1127 |
+
print("Target: Transcribing MUSIC (ACCOMPANIMENT).")
|
| 1128 |
+
tensor_to_process = accompaniment_tensor
|
| 1129 |
+
other_part_tensor = vocals_tensor # Save vocals for re-merging
|
| 1130 |
+
|
| 1131 |
+
other_part_sr = demucs_model.samplerate
|
| 1132 |
+
audio_tensor = tensor_to_process # The audio to be processed is now the chosen stem
|
| 1133 |
+
native_sample_rate = demucs_model.samplerate # Update sample rate to match demucs output
|
| 1134 |
+
print("Separation complete.")
|
| 1135 |
+
|
| 1136 |
+
# --- Prepare audio for transcription (saving to a temp file) ---
|
| 1137 |
+
# This part of the logic now works on whichever stem was selected above
|
| 1138 |
base_name = os.path.splitext(filename)[0]
|
| 1139 |
+
temp_dir = "output/temp_transcribe"
|
| 1140 |
os.makedirs(temp_dir, exist_ok=True)
|
| 1141 |
+
suffix = f"_{transcription_target.split(' ')[1].lower()}" if separate_vocals else "_original"
|
| 1142 |
+
audio_to_transcribe_path = os.path.join(temp_dir, f"{base_name}{suffix}.wav")
|
| 1143 |
+
|
| 1144 |
+
torchaudio.save(audio_to_transcribe_path, audio_tensor.cpu(), native_sample_rate)
|
| 1145 |
+
|
| 1146 |
+
# Convert tensor to numpy array (channels, samples) for librosa/pyloudnorm compatibility
|
| 1147 |
+
# We work with a CPU copy of the tensor.
|
| 1148 |
+
audio_data_np = audio_tensor.cpu().numpy()
|
| 1149 |
|
| 1150 |
# === STEREO PROCESSING LOGIC ===
|
| 1151 |
if enable_stereo_processing:
|
| 1152 |
+
if audio_data_np.ndim != 2 or audio_data_np.shape[0] != 2:
|
| 1153 |
print("Warning: Audio is not stereo or could not be loaded as stereo. Falling back to mono transcription.")
|
| 1154 |
enable_stereo_processing = False # Disable stereo processing if audio is not stereo
|
| 1155 |
|
| 1156 |
if enable_stereo_processing:
|
| 1157 |
+
print("Stereo processing enabled. Splitting, normalizing, and transcribing channels...")
|
| 1158 |
try:
|
| 1159 |
+
left_channel_np = audio_data_np[0]
|
| 1160 |
+
right_channel_np = audio_data_np[1]
|
| 1161 |
|
| 1162 |
+
normalized_left = normalize_loudness(left_channel_np, native_sample_rate)
|
| 1163 |
+
normalized_right = normalize_loudness(right_channel_np, native_sample_rate)
|
| 1164 |
|
| 1165 |
+
temp_left_path = os.path.join(temp_dir, f"{base_name}_left.wav")
|
| 1166 |
+
temp_right_path = os.path.join(temp_dir, f"{base_name}_right.wav")
|
| 1167 |
|
| 1168 |
+
sf.write(temp_left_path, normalized_left, native_sample_rate)
|
| 1169 |
+
sf.write(temp_right_path, normalized_right, native_sample_rate)
|
| 1170 |
|
| 1171 |
+
print(f"Saved left channel to: {temp_left_path}")
|
| 1172 |
+
print(f"Saved right channel to: {temp_right_path}")
|
| 1173 |
|
| 1174 |
print("Transcribing left and right channel...")
|
| 1175 |
if transcription_method == "General Purpose":
|
| 1176 |
+
midi_path_left = TranscribeGeneralAudio(temp_left_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
|
| 1177 |
+
midi_path_right = TranscribeGeneralAudio(temp_right_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
|
| 1178 |
+
else: # Piano-Specific
|
| 1179 |
+
midi_path_left = TranscribePianoAudio(temp_left_path)
|
| 1180 |
+
midi_path_right = TranscribePianoAudio(temp_right_path)
|
| 1181 |
|
| 1182 |
if midi_path_left and midi_path_right:
|
| 1183 |
merged_midi_path = os.path.join(temp_dir, f"{base_name}_merged.mid")
|
|
|
|
| 1195 |
print(f"An error occurred during stereo processing: {e}")
|
| 1196 |
raise gr.Error(f"Stereo Processing Failed: {e}")
|
| 1197 |
else: # Standard mono transcription
|
| 1198 |
+
print("Mono processing. Normalizing and transcribing audio...")
|
| 1199 |
+
# If the audio is stereo but stereo processing is disabled, convert to mono.
|
| 1200 |
+
if audio_data_np.shape[0] == 2:
|
| 1201 |
+
mono_signal_np = np.mean(audio_data_np, axis=0)
|
| 1202 |
+
else:
|
| 1203 |
+
mono_signal_np = audio_data_np[0]
|
|
|
|
|
|
|
| 1204 |
|
| 1205 |
+
normalized_mono = normalize_loudness(mono_signal_np, native_sample_rate)
|
| 1206 |
+
temp_mono_path = os.path.join(temp_dir, f"{base_name}_mono.wav")
|
| 1207 |
+
sf.write(temp_mono_path, normalized_mono, native_sample_rate)
|
| 1208 |
|
| 1209 |
try:
|
| 1210 |
if transcription_method == "General Purpose":
|
| 1211 |
+
midi_path_for_rendering = TranscribeGeneralAudio(temp_mono_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool)
|
| 1212 |
else: # Piano-Specific
|
| 1213 |
+
midi_path_for_rendering = TranscribePianoAudio(temp_mono_path)
|
| 1214 |
except Exception as e:
|
| 1215 |
print(f"An error occurred during transcription: {e}")
|
| 1216 |
raise gr.Error(f"Transcription Failed: {e}")
|
|
|
|
| 1270 |
synth_params['fm_modulation_depth'],
|
| 1271 |
synth_params['fm_modulation_rate']
|
| 1272 |
)
|
| 1273 |
+
|
| 1274 |
+
# --- Vocal Re-merging Logic now uses the generic "other_part" ---
|
| 1275 |
+
if separate_vocals and remerge_vocals and other_part_tensor is not None:
|
| 1276 |
+
print(f"Re-merging the non-transcribed part with newly rendered music...")
|
| 1277 |
+
|
| 1278 |
+
rendered_srate, rendered_music_int16 = results[4]
|
| 1279 |
+
|
| 1280 |
+
rendered_music_float = rendered_music_int16.astype(np.float32) / 32767.0
|
| 1281 |
+
rendered_music_tensor = torch.from_numpy(rendered_music_float).T
|
| 1282 |
+
|
| 1283 |
+
if rendered_srate != other_part_sr:
|
| 1284 |
+
resampler = torchaudio.transforms.Resample(rendered_srate, other_part_sr)
|
| 1285 |
+
rendered_music_tensor = resampler(rendered_music_tensor)
|
| 1286 |
+
|
| 1287 |
+
len_music = rendered_music_tensor.shape[1]
|
| 1288 |
+
len_other = other_part_tensor.shape[1]
|
| 1289 |
+
|
| 1290 |
+
if len_music > len_other:
|
| 1291 |
+
padding = len_music - len_other
|
| 1292 |
+
other_part_tensor = torch.nn.functional.pad(other_part_tensor, (0, padding))
|
| 1293 |
+
elif len_other > len_music:
|
| 1294 |
+
padding = len_other - len_music
|
| 1295 |
+
rendered_music_tensor = torch.nn.functional.pad(rendered_music_tensor, (0, padding))
|
| 1296 |
+
|
| 1297 |
+
merged_audio_tensor = rendered_music_tensor + other_part_tensor.cpu()
|
| 1298 |
+
|
| 1299 |
+
max_abs = torch.max(torch.abs(merged_audio_tensor))
|
| 1300 |
+
if max_abs > 1.0:
|
| 1301 |
+
merged_audio_tensor /= max_abs
|
| 1302 |
+
|
| 1303 |
+
merged_audio_int16 = (merged_audio_tensor.T.numpy() * 32767).astype(np.int16)
|
| 1304 |
+
|
| 1305 |
+
new_results = list(results)
|
| 1306 |
+
new_results[4] = (other_part_sr, merged_audio_int16)
|
| 1307 |
+
results = tuple(new_results)
|
| 1308 |
+
print("Re-merging complete.")
|
| 1309 |
+
|
| 1310 |
print(f'Total processing time: {(reqtime.time() - start_time):.2f} sec')
|
| 1311 |
print('*' * 70)
|
| 1312 |
|
|
|
|
| 1398 |
if not soundfonts_dict:
|
| 1399 |
print("\nWARNING: No SoundFonts were found or could be downloaded.")
|
| 1400 |
print("Rendering with SoundFonts will fail. Only the 8-bit synthesizer will be available.")
|
| 1401 |
+
|
| 1402 |
+
# --- Pre-load the Demucs model on startup for efficiency ---
|
| 1403 |
+
print("Loading Demucs model (htdemucs_ft), this may take a moment on first run...")
|
| 1404 |
+
try:
|
| 1405 |
+
demucs_model = get_model(name='htdemucs_ft')
|
| 1406 |
+
if torch.cuda.is_available():
|
| 1407 |
+
demucs_model = demucs_model.cuda()
|
| 1408 |
+
print("Demucs model loaded successfully.")
|
| 1409 |
+
except Exception as e:
|
| 1410 |
+
print(f"Warning: Could not load Demucs model. Vocal separation will not be available. Error: {e}")
|
| 1411 |
+
demucs_model = None
|
| 1412 |
+
|
| 1413 |
# --- Define a constant for the fallback preset name ---
|
| 1414 |
# This prevents errors if the preset name is changed in the dictionary.
|
| 1415 |
FALLBACK_PRESET_NAME = "Generic Chiptune Loop"
|
|
|
|
| 1419 |
# Comprehensive preset dictionary including new JRPG and Handheld classics
|
| 1420 |
# Note: Vibrato depth is mapped to a representative value on the 0-50 Hz slider.
|
| 1421 |
S8BIT_PRESETS = {
|
| 1422 |
+
# --- Classic Chiptune ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1423 |
"Mario (Super Mario Bros / スーパーマリオブラザーズ)": {
|
| 1424 |
# Description: A bright square wave with a per-note vibrato, producing the classic bouncy platformer sound.
|
| 1425 |
'waveform_type': 'Square', 'pulse_width': 0.3, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.25,
|
|
|
|
| 1429 |
'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0,
|
| 1430 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1431 |
},
|
| 1432 |
+
"Mega Man (Rockman / ロックマン)": {
|
| 1433 |
+
# Description: A thin, sharp square wave lead with fast vibrato, iconic for its driving, heroic melodies.
|
| 1434 |
+
'waveform_type': 'Square', 'pulse_width': 0.2, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15,
|
| 1435 |
+
'vibrato_rate': 6.0, 'vibrato_depth': 8,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1436 |
'smooth_notes_level': 0.9,
|
| 1437 |
'continuous_vibrato_level': 0.85,
|
| 1438 |
+
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.05,
|
| 1439 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1440 |
},
|
| 1441 |
"Zelda (The Legend of Zelda / ゼルダの伝説)": {
|
|
|
|
| 1447 |
'bass_boost_level': 0.15, 'noise_level': 0.0, 'distortion_level': 0.0,
|
| 1448 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1449 |
},
|
| 1450 |
+
"Kirby's Bubbly Melody (Hoshi no Kirby / 星のカービィ)": {
|
| 1451 |
+
# Description: A soft, round square wave with a bouncy vibrato, creating a cheerful and adorable sound.
|
| 1452 |
+
'waveform_type': 'Square', 'pulse_width': 0.4, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2,
|
| 1453 |
+
'vibrato_rate': 6.0, 'vibrato_depth': 4,
|
|
|
|
| 1454 |
'smooth_notes_level': 0.85,
|
| 1455 |
+
'continuous_vibrato_level': 0.3, # Formerly False (0.0); adds a hint of continuity for more liveliness.
|
| 1456 |
+
'bass_boost_level': 0.1, 'noise_level': 0.0, 'distortion_level': 0.0,
|
| 1457 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1458 |
},
|
| 1459 |
+
"Pokémon (Game Boy Classics / ポケットモンスター)": {
|
| 1460 |
+
# Description: A full, friendly square wave sound, capturing the cheerful and adventurous spirit of early handheld RPGs.
|
| 1461 |
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.22,
|
| 1462 |
+
'vibrato_rate': 5.0, 'vibrato_depth': 5,
|
| 1463 |
'smooth_notes_level': 0.9,
|
| 1464 |
+
'continuous_vibrato_level': 0.9,
|
| 1465 |
+
'bass_boost_level': 0.25, 'noise_level': 0.0, 'distortion_level': 0.0,
|
| 1466 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1467 |
},
|
| 1468 |
"Castlevania (Akumajō Dracula / 悪魔城ドラキュラ)": {
|
|
|
|
| 1474 |
'bass_boost_level': 0.35, 'noise_level': 0.0, 'distortion_level': 0.0,
|
| 1475 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1476 |
},
|
| 1477 |
+
"Final Fantasy (Arpeggio / ファイナルファンタジー)": {
|
| 1478 |
+
# Description: A perfect, clean square wave with zero vibrato, creating the iconic, crystal-clear arpeggio sound.
|
| 1479 |
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.22,
|
| 1480 |
+
'vibrato_rate': 5.0, 'vibrato_depth': 0,
|
| 1481 |
'smooth_notes_level': 0.9,
|
| 1482 |
+
'continuous_vibrato_level': 0.2,
|
| 1483 |
+
'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0,
|
| 1484 |
+
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1485 |
+
},
|
| 1486 |
+
"ONI V (Wafu Mystic / ONI V 隠忍を継ぐ者)": {
|
| 1487 |
+
# Description: A solemn triangle wave with a slow, expressive vibrato, evoking the mysterious atmosphere of Japanese folklore.
|
| 1488 |
+
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
|
| 1489 |
+
'vibrato_rate': 3.5, 'vibrato_depth': 3,
|
| 1490 |
+
'smooth_notes_level': 0.9,
|
| 1491 |
+
'continuous_vibrato_level': 0.85,
|
| 1492 |
+
'bass_boost_level': 0.4, 'noise_level': 0.0, 'distortion_level': 0.0,
|
| 1493 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1494 |
},
|
| 1495 |
# --- Advanced System Impressions ---
|
|
|
|
| 1529 |
'bass_boost_level': 0.1, 'noise_level': 0.0, 'distortion_level': 0.0,
|
| 1530 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1531 |
},
|
| 1532 |
+
# --- Action & Rock Leads ---
|
| 1533 |
+
"Falcom Ys (Rock Lead / イース)": {
|
| 1534 |
+
# Description: A powerful sawtooth with slight distortion, emulating the driving rock organ and guitar leads of action JRPGs.
|
| 1535 |
+
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15,
|
| 1536 |
+
'vibrato_rate': 5.5, 'vibrato_depth': 6,
|
| 1537 |
+
'smooth_notes_level': 0.85,
|
| 1538 |
+
'continuous_vibrato_level': 0.8,
|
| 1539 |
+
'bass_boost_level': 0.4, 'noise_level': 0.05, 'distortion_level': 0.15,
|
| 1540 |
+
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1541 |
+
},
|
| 1542 |
+
"Arcade Brawler Lead (Street Fighter / ストリートファイター)": {
|
| 1543 |
+
# Description: A gritty sawtooth lead with a hard attack, capturing the high-energy feel of classic fighting games.
|
| 1544 |
+
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15,
|
| 1545 |
+
'vibrato_rate': 5.0, 'vibrato_depth': 6,
|
| 1546 |
+
'smooth_notes_level': 0.8,
|
| 1547 |
+
'continuous_vibrato_level': 0.7,
|
| 1548 |
+
'bass_boost_level': 0.4, 'noise_level': 0.05, 'distortion_level': 0.1,
|
| 1549 |
+
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1550 |
+
},
|
| 1551 |
+
"Rhythm Pop Lead (Rhythm Tengoku / リズム天国)": {
|
| 1552 |
+
# Description: A clean, round square wave perfect for the snappy, catchy feel of rhythm games.
|
| 1553 |
+
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.18,
|
| 1554 |
+
'vibrato_rate': 4.5, 'vibrato_depth': 4,
|
| 1555 |
+
'smooth_notes_level': 0.9, # Formerly True -> 1.0; slightly reduced for a bit more attack.
|
| 1556 |
+
'continuous_vibrato_level': 0.8, # Formerly True -> 1.0; slightly weakened for more defined note transitions.
|
| 1557 |
+
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
|
| 1558 |
+
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1559 |
+
},
|
| 1560 |
+
# --- Epic & Orchestral Pads ---
|
| 1561 |
+
"Dragon Quest (Orchestral Feel / ドラゴンクエスト)": {
|
| 1562 |
+
# Description: A pure triangle wave with a long decay, mimicking the grand, orchestral feel of a classical flute or string section.
|
| 1563 |
+
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.6,
|
| 1564 |
+
'vibrato_rate': 3.0, 'vibrato_depth': 4,
|
| 1565 |
+
'smooth_notes_level': 0.9,
|
| 1566 |
+
'continuous_vibrato_level': 0.9,
|
| 1567 |
+
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
|
| 1568 |
+
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1569 |
+
},
|
| 1570 |
+
"Mystic Mana Pad (Secret of Mana / 聖剣伝説2)": {
|
| 1571 |
+
# Description: A warm, ethereal square wave pad with slow vibrato, capturing a feeling of fantasy and wonder.
|
| 1572 |
+
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5,
|
| 1573 |
+
'vibrato_rate': 2.5, 'vibrato_depth': 4,
|
| 1574 |
+
'smooth_notes_level': 1.0,
|
| 1575 |
+
'continuous_vibrato_level': 0.95,
|
| 1576 |
+
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
|
| 1577 |
+
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1578 |
+
},
|
| 1579 |
+
"Modern JRPG Pad (Persona / ペルソナ)": {
|
| 1580 |
+
# Description: A warm, stylish square wave pad, capturing the modern, pop/jazz-infused feel of the Persona series.
|
| 1581 |
+
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5,
|
| 1582 |
+
'vibrato_rate': 2.5, 'vibrato_depth': 4,
|
| 1583 |
+
'smooth_notes_level': 1.0,
|
| 1584 |
+
'continuous_vibrato_level': 0.95,
|
| 1585 |
+
'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0,
|
| 1586 |
+
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1587 |
+
},
|
| 1588 |
+
"Tactical Brass (Fire Emblem / ファイアーエムブレム)": {
|
| 1589 |
+
# Description: A powerful, sustained sawtooth emulating the bold, heroic synth-brass of Fire Emblem's tactical themes.
|
| 1590 |
+
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
|
| 1591 |
+
'vibrato_rate': 3.5, 'vibrato_depth': 5,
|
| 1592 |
+
'smooth_notes_level': 0.95,
|
| 1593 |
+
'continuous_vibrato_level': 0.9,
|
| 1594 |
+
'bass_boost_level': 0.5, 'noise_level': 0.1, 'distortion_level': 0.15,
|
| 1595 |
+
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1596 |
+
},
|
| 1597 |
+
"Mecha & Tactics Brass (Super Robot Wars / スーパーロボット大戦)": {
|
| 1598 |
+
# Description: A powerful, sustained sawtooth emulating the bold, heroic synth-brass of strategy and mecha anime themes.
|
| 1599 |
+
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
|
| 1600 |
+
'vibrato_rate': 3.5, 'vibrato_depth': 5,
|
| 1601 |
+
'smooth_notes_level': 0.95,
|
| 1602 |
+
'continuous_vibrato_level': 0.9,
|
| 1603 |
+
'bass_boost_level': 0.5, 'noise_level': 0.1, 'distortion_level': 0.15,
|
| 1604 |
+
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1605 |
+
},
|
| 1606 |
+
"Dark/Boss Atmosphere (Shin Megami Tensei / 真・女神転生)": {
|
| 1607 |
+
# Description: An aggressive sawtooth, inspired by the dark, rock-infused themes of SMT.
|
| 1608 |
+
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.35,
|
| 1609 |
+
'vibrato_rate': 7.0, 'vibrato_depth': 12,
|
| 1610 |
+
'smooth_notes_level': 0.1,
|
| 1611 |
+
'continuous_vibrato_level': 0.0,
|
| 1612 |
+
'bass_boost_level': 0.4, 'noise_level': 0.15, 'distortion_level': 0.25,
|
| 1613 |
+
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1614 |
+
},
|
| 1615 |
+
# --- Vocal Synthesis ---
|
| 1616 |
+
"8-Bit Vocal Lead": {
|
| 1617 |
+
# Description: A soft, sustained triangle wave with gentle vibrato to mimic a singing voice.
|
| 1618 |
+
'waveform_type': 'Triangle',
|
| 1619 |
+
'pulse_width': 0.5,
|
| 1620 |
+
'envelope_type': 'Sustained (Full Decay)',
|
| 1621 |
+
'decay_time_s': 0.8,
|
| 1622 |
+
'vibrato_rate': 5.5,
|
| 1623 |
+
'vibrato_depth': 4, # Mapped from the suggested 0.15 range
|
| 1624 |
+
'bass_boost_level': 0.1,
|
| 1625 |
+
'smooth_notes_level': 0.85,
|
| 1626 |
+
'continuous_vibrato_level': 0.9,
|
| 1627 |
+
'noise_level': 0.02,
|
| 1628 |
+
'distortion_level': 0.0,
|
| 1629 |
+
'fm_modulation_depth': 0.05,
|
| 1630 |
+
'fm_modulation_rate': 20
|
| 1631 |
+
},
|
| 1632 |
+
"8-Bit Male Vocal": {
|
| 1633 |
+
# Description: A deeper, fuller triangle wave with more bass and slower vibrato for a masculine feel.
|
| 1634 |
+
'waveform_type': 'Triangle',
|
| 1635 |
+
'pulse_width': 0.5,
|
| 1636 |
+
'envelope_type': 'Sustained (Full Decay)',
|
| 1637 |
+
'decay_time_s': 1.0,
|
| 1638 |
+
'vibrato_rate': 5.0,
|
| 1639 |
+
'vibrato_depth': 3, # Mapped from the suggested 0.12 range
|
| 1640 |
+
'bass_boost_level': 0.3,
|
| 1641 |
+
'smooth_notes_level': 0.9,
|
| 1642 |
+
'continuous_vibrato_level': 0.85,
|
| 1643 |
+
'noise_level': 0.015,
|
| 1644 |
+
'distortion_level': 0.0,
|
| 1645 |
+
'fm_modulation_depth': 0.08,
|
| 1646 |
+
'fm_modulation_rate': 25
|
| 1647 |
+
},
|
| 1648 |
+
"8-Bit Female Vocal": {
|
| 1649 |
+
# Description: A brighter, lighter triangle wave with faster vibrato and less bass for a feminine feel.
|
| 1650 |
+
'waveform_type': 'Triangle',
|
| 1651 |
+
'pulse_width': 0.5,
|
| 1652 |
+
'envelope_type': 'Sustained (Full Decay)',
|
| 1653 |
+
'decay_time_s': 0.7,
|
| 1654 |
+
'vibrato_rate': 6.0,
|
| 1655 |
+
'vibrato_depth': 5, # Mapped from the suggested 0.18 range
|
| 1656 |
+
'bass_boost_level': 0.05,
|
| 1657 |
+
'smooth_notes_level': 0.85,
|
| 1658 |
+
'continuous_vibrato_level': 0.92,
|
| 1659 |
+
'noise_level': 0.025,
|
| 1660 |
+
'distortion_level': 0.0,
|
| 1661 |
+
'fm_modulation_depth': 0.04,
|
| 1662 |
+
'fm_modulation_rate': 30
|
| 1663 |
+
},
|
| 1664 |
+
"Lo-Fi Vocal": {
|
| 1665 |
+
# Description: A gritty, noisy square wave with a short decay to simulate a low-resolution vocal sample.
|
| 1666 |
+
'waveform_type': 'Square',
|
| 1667 |
+
'pulse_width': 0.48,
|
| 1668 |
+
'envelope_type': 'Plucky (AD Envelope)', # "Short" implies a plucky, not sustained, envelope
|
| 1669 |
+
'decay_time_s': 0.4,
|
| 1670 |
+
'vibrato_rate': 4.8,
|
| 1671 |
+
'vibrato_depth': 2, # Mapped from the suggested 0.10 range
|
| 1672 |
+
'bass_boost_level': 0.1,
|
| 1673 |
+
'smooth_notes_level': 0.65,
|
| 1674 |
+
'continuous_vibrato_level': 0.6,
|
| 1675 |
+
'noise_level': 0.05,
|
| 1676 |
+
'distortion_level': 0.05,
|
| 1677 |
+
'fm_modulation_depth': 0.02,
|
| 1678 |
+
'fm_modulation_rate': 20
|
| 1679 |
+
},
|
| 1680 |
+
# --- Sound FX & Experimental ---
|
| 1681 |
"Sci-Fi Energy Field": {
|
| 1682 |
# Description: (SFX) High-speed vibrato and noise create a constant, shimmering hum suitable for energy shields or force fields.
|
| 1683 |
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4,
|
|
|
|
| 1723 |
'bass_boost_level': 0.8, 'noise_level': 0.2, 'distortion_level': 0.5,
|
| 1724 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1725 |
},
|
| 1726 |
+
# --- Utility & Starting Points ---
|
| 1727 |
"Generic Chiptune Loop": {
|
| 1728 |
# Description: A well-balanced, pleasant square wave lead that serves as a great starting point for custom sounds.
|
| 1729 |
'waveform_type': 'Square', 'pulse_width': 0.25, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2,
|
|
|
|
| 1733 |
'bass_boost_level': 0.25, 'noise_level': 0.0, 'distortion_level': 0.0,
|
| 1734 |
'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0
|
| 1735 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1736 |
}
|
| 1737 |
|
| 1738 |
+
# --- Function to control visibility of BOTH new UI elements ---
|
| 1739 |
+
def update_vocal_ui_visibility(separate_vocals):
    """Toggle visibility of the controls tied to the "Separate Vocals" checkbox.

    Builds a single ``gr.update(visible=...)`` from the checkbox value and
    returns it twice, so the same visibility change is applied to both
    components listed as outputs of the change event.
    """
    visibility_update = gr.update(visible=separate_vocals)
    return (visibility_update,) * 2
|
| 1743 |
+
|
| 1744 |
app = gr.Blocks(theme=gr.themes.Base())
|
| 1745 |
|
| 1746 |
with app:
|
|
|
|
| 1783 |
info="If checked, left/right audio channels are transcribed separately and merged. Doubles processing time."
|
| 1784 |
)
|
| 1785 |
|
| 1786 |
+
# --- Vocal Separation Checkboxes ---
|
| 1787 |
+
with gr.Group():
|
| 1788 |
+
separate_vocals = gr.Checkbox(
|
| 1789 |
+
label="Separate Vocals",
|
| 1790 |
+
value=False,
|
| 1791 |
+
info="If checked, separates the audio into vocals and music stems before processing."
|
| 1792 |
+
)
|
| 1793 |
+
transcription_target = gr.Radio(
|
| 1794 |
+
["Transcribe Music (Accompaniment)", "Transcribe Vocals"],
|
| 1795 |
+
label="Transcription Target",
|
| 1796 |
+
value="Transcribe Music (Accompaniment)",
|
| 1797 |
+
info="Choose which part of the separated audio to transcribe to MIDI.",
|
| 1798 |
+
visible=False # Initially hidden
|
| 1799 |
+
)
|
| 1800 |
+
remerge_vocals = gr.Checkbox(
|
| 1801 |
+
label="Re-merge Other Part with Rendered Audio",
|
| 1802 |
+
value=False,
|
| 1803 |
+
info="After rendering, merges the non-transcribed part (e.g., original vocals) back with the new music.",
|
| 1804 |
+
visible=False # Initially hidden
|
| 1805 |
+
)
|
| 1806 |
+
|
| 1807 |
with gr.Accordion("General Purpose Transcription Settings", open=True) as general_transcription_settings:
|
| 1808 |
onset_threshold = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="On-set Threshold", info="Sensitivity for detecting note beginnings. Higher is stricter.")
|
| 1809 |
frame_threshold = gr.Slider(0.0, 1.0, value=0.3, step=0.05, label="Frame Threshold", info="Sensitivity for detecting active notes. Higher is stricter.")
|
|
|
|
| 1920 |
s8bit_waveform_type = gr.Dropdown(['Square', 'Sawtooth', 'Triangle'], value='Square', label="Waveform Type")
|
| 1921 |
s8bit_pulse_width = gr.Slider(0.01, 0.99, value=0.5, step=0.01, label="Pulse Width (Square Wave Only)")
|
| 1922 |
s8bit_envelope_type = gr.Dropdown(['Plucky (AD Envelope)', 'Sustained (Full Decay)'], value='Plucky (AD Envelope)', label="Envelope Type")
|
| 1923 |
+
s8bit_decay_time_s = gr.Slider(0.01, 1.0, value=0.1, step=0.01, label="Decay Time (s)")  # Max is 1.0 s so long-decay presets (decay_time_s up to 1.0) fit on the slider
|
| 1924 |
s8bit_vibrato_rate = gr.Slider(0, 20, value=5, label="Vibrato Rate (Hz)")
|
| 1925 |
s8bit_vibrato_depth = gr.Slider(0, 50, value=0, label="Vibrato Depth (Hz)")
|
| 1926 |
s8bit_bass_boost_level = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Bass Boost Level", info="Adjusts the volume of the sub-octave. 0 is off.")
|
|
|
|
| 1967 |
# all_inputs now includes the preset selector itself
|
| 1968 |
# Inputs for the main processing function
|
| 1969 |
all_inputs = [
|
| 1970 |
+
input_file, s8bit_preset_selector,
|
| 1971 |
+
separate_vocals,
|
| 1972 |
+
remerge_vocals,
|
| 1973 |
+
transcription_target,
|
| 1974 |
+
enable_stereo_processing,
|
| 1975 |
transcription_method, onset_threshold, frame_threshold, minimum_note_length,
|
| 1976 |
minimum_frequency, maximum_frequency, infer_onsets, melodia_trick, multiple_pitch_bends,
|
| 1977 |
render_type, soundfont_bank, render_sample_rate, render_with_sustains,
|
|
|
|
| 2007 |
inputs=all_inputs,
|
| 2008 |
outputs=all_outputs # Pass the combined list
|
| 2009 |
)
|
| 2010 |
+
|
| 2011 |
+
# --- The change event now controls TWO components ---
|
| 2012 |
+
separate_vocals.change(
|
| 2013 |
+
fn=update_vocal_ui_visibility,
|
| 2014 |
+
inputs=separate_vocals,
|
| 2015 |
+
outputs=[transcription_target, remerge_vocals] # Update both components
|
| 2016 |
+
)
|
| 2017 |
|
| 2018 |
# --- Listeners for dynamic UI updates ---
|
| 2019 |
transcription_method.change(
|
requirements.txt
CHANGED
|
@@ -1,8 +1,9 @@
|
|
| 1 |
--extra-index-url https://download.pytorch.org/whl/cu128
|
| 2 |
|
| 3 |
torch
|
|
|
|
| 4 |
numpy
|
| 5 |
-
gradio
|
| 6 |
mido
|
| 7 |
librosa
|
| 8 |
torchlibrosa
|
|
@@ -18,9 +19,12 @@ psutil
|
|
| 18 |
pretty_midi
|
| 19 |
soundfile
|
| 20 |
pyloudnorm
|
|
|
|
| 21 |
piano_transcription_inference
|
| 22 |
|
| 23 |
basic-pitch @ git+https://github.com/avan06/basic-pitch; sys_platform != 'linux'
|
| 24 |
basic-pitch[tf] @ git+https://github.com/avan06/basic-pitch; sys_platform == 'linux'
|
| 25 |
|
| 26 |
-
git+https://github.com/avan06/pyfluidsynth
|
|
|
|
|
|
|
|
|
| 1 |
--extra-index-url https://download.pytorch.org/whl/cu128
|
| 2 |
|
| 3 |
torch
|
| 4 |
+
torchaudio
|
| 5 |
numpy
|
| 6 |
+
gradio >= 5.42.0
|
| 7 |
mido
|
| 8 |
librosa
|
| 9 |
torchlibrosa
|
|
|
|
| 19 |
pretty_midi
|
| 20 |
soundfile
|
| 21 |
pyloudnorm
|
| 22 |
+
|
| 23 |
piano_transcription_inference
|
| 24 |
|
| 25 |
basic-pitch @ git+https://github.com/avan06/basic-pitch; sys_platform != 'linux'
|
| 26 |
basic-pitch[tf] @ git+https://github.com/avan06/basic-pitch; sys_platform == 'linux'
|
| 27 |
|
| 28 |
+
git+https://github.com/avan06/pyfluidsynth
|
| 29 |
+
|
| 30 |
+
demucs
|