File size: 5,782 Bytes
89f10eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import gradio as gr
import librosa
import soundfile as sf
import os
import tempfile
import shutil
import torch

from demucs.pretrained import get_model as get_demucs_model
from demucs.apply import apply_model
from spleeter.separator import Separator
from matchering import match
from so_vits_svc_fork.inference.core import Svc
import whisper
import madmom

# --- 1. Audio Separation (Demucs/Spleeter) ---
def separate_audio(audio):
    """Split a song into stems (vocals, drums, bass, other) with Demucs.

    Parameters
    ----------
    audio : file-like object
        Uploaded audio; must expose ``.read()`` returning raw bytes.

    Returns
    -------
    dict[str, str]
        Mapping of stem name to the path of the rendered WAV file.
    """
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp.write(audio.read())
        tmp_path = tmp.name
    try:
        model = get_demucs_model('htdemucs')
        # Demucs operates at 44.1 kHz on (batch, channels, samples) tensors.
        wav, sr = librosa.load(tmp_path, sr=44100, mono=False)
        mix = torch.from_numpy(wav)
        if mix.dim() == 1:
            # Mono files come back 1-D from librosa; duplicate the channel
            # so the model sees a stereo-shaped input instead of crashing.
            mix = mix.unsqueeze(0).repeat(2, 1)
        sources = apply_model(model, mix.unsqueeze(0), device='cpu', split=True)
        out_dir = tempfile.mkdtemp()
        stems = {}
        for i, name in enumerate(model.sources):
            out_path = os.path.join(out_dir, f"{name}.wav")
            # soundfile expects (samples, channels), hence the transpose.
            sf.write(out_path, sources[0, i].cpu().numpy().T, sr)
            stems[name] = out_path
        return stems
    finally:
        # Remove the uploaded copy so repeated calls don't fill the temp dir.
        os.unlink(tmp_path)

# --- 2. Pattern Extraction & Genre Detection ---
def extract_pattern(audio):
    """Analyze tempo, beats and onsets of an uploaded track.

    Parameters
    ----------
    audio : file-like object
        Uploaded audio; must expose ``.read()`` returning raw bytes.

    Returns
    -------
    dict
        ``tempo`` (float BPM), ``beats`` and ``onsets`` (frame index lists),
        and a heuristic ``genre`` label.
    """
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp.write(audio.read())
        tmp_path = tmp.name
    try:
        y, sr = librosa.load(tmp_path, sr=None)
        tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
        # librosa >= 0.10 may return tempo as a 1-element ndarray; coerce to
        # a plain float so the comparison and JSON serialization are safe.
        try:
            tempo = float(tempo)
        except TypeError:
            tempo = float(tempo[0])
        onset_env = librosa.onset.onset_strength(y=y, sr=sr)
        onsets = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr)
        # Genre detection (replace with ML model if needed)
        genre = "dj bantengan" if tempo > 120 else "pop"
        return {
            "tempo": tempo,
            "beats": beats.tolist(),
            "onsets": onsets.tolist(),
            "genre": genre
        }
    finally:
        # Don't leak the uploaded copy.
        os.unlink(tmp_path)

# --- 3. Genre-Aware Pattern Generator (Magenta/MusicGen style transfer) ---
def generate_pattern(reference_audio, creativity=0.2):
    """Produce a rhythmic pattern inspired by *reference_audio*.

    ``creativity`` is accepted for forward compatibility but is currently
    unused: real generation (MusicGen/Magenta style transfer) is still a
    TODO, so this simply returns the pattern analysis of the reference.
    """
    # TODO: Integrate with MusicGen/Magenta for real pattern generation
    pattern = extract_pattern(reference_audio)
    return pattern

# --- 4. Mixing/Mastering (Matchering) ---
def mix_and_master(input_audio, reference_audio):
    """Master *input_audio* so its tonal balance matches *reference_audio*.

    Parameters
    ----------
    input_audio, reference_audio : file-like objects
        Uploaded audio; each must expose ``.read()`` returning raw bytes.

    Returns
    -------
    str
        Path of the mastered WAV file produced by matchering.
    """
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_in, \
         tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_ref:
        tmp_in.write(input_audio.read())
        tmp_ref.write(reference_audio.read())
        in_path = tmp_in.name
        ref_path = tmp_ref.name
    # splitext only touches the final extension; the previous
    # str.replace(".wav", ...) would have rewritten every ".wav"
    # occurrence anywhere in the path string.
    root, _ = os.path.splitext(in_path)
    out_path = f"{root}_mastered.wav"
    # NOTE(review): matchering 2.x documents `matchering.process(target=...,
    # reference=..., results=[...])` — confirm `match(...)` exists in the
    # pinned matchering version.
    match(in_path, ref_path, out_path)
    return out_path

# --- 5. Vocal Processing (so-vits-svc, Spleeter) ---
def change_vocal(audio, model_path):
    """Convert the voice in *audio* using the so-vits-svc model at
    *model_path* and return the path of the converted audio file."""
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp.write(audio.read())
        wav_path = tmp.name
    # NOTE(review): assumes Svc(model_path).infer(path) returns an output
    # file path — verify against the so-vits-svc-fork inference API.
    converter = Svc(model_path)
    return converter.infer(wav_path)

# --- 6. Denoising (RNNoise, Demucs) ---
def denoise_audio(audio):
    """Placeholder denoiser: copy the upload to a temp WAV and return its path.

    Real denoising (RNNoise / Demucs) is still a TODO, so the audio is
    passed through unchanged.
    """
    # TODO: Integrate with RNNoise or Demucs for real denoising
    payload = audio.read()
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp.write(payload)
        out_path = tmp.name
    return out_path

# --- 7. Multi-vocal Lyric Detection (Whisper) ---
def detect_lyrics(audio):
    """Transcribe the lyrics in *audio* with OpenAI Whisper (base model).

    For multi-vocal tracks, split vocals first (Spleeter/Demucs) and then
    transcribe each stem separately.
    """
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp.write(audio.read())
        wav_path = tmp.name
    asr = whisper.load_model("base")
    transcription = asr.transcribe(wav_path)
    return {"lyrics": transcription["text"]}

# --- Gradio UI ---
# One tab per pipeline stage; each tab wires a single button to one of the
# backend functions defined above.
# NOTE(review): gr.Audio(type="file") is Gradio 3.x API (removed in 4.x,
# which accepts "filepath"/"numpy") — confirm the pinned gradio version.
with gr.Blocks() as demo:
    gr.Markdown("# DAW AI Ultra-Premium Pipeline (All-in-One, Real Pipeline)")

    with gr.Tab("Separate Audio"):
        sep_input = gr.Audio(type="file", label="Input Audio")
        sep_output = gr.JSON(label="Separated Stems (vocals, drums, bass, other)")
        sep_btn = gr.Button("Separate")
        sep_btn.click(separate_audio, inputs=sep_input, outputs=sep_output)

    with gr.Tab("Extract Pattern"):
        pat_input = gr.Audio(type="file", label="Input Audio")
        pat_output = gr.JSON(label="Pattern Info")
        pat_btn = gr.Button("Extract")
        pat_btn.click(extract_pattern, inputs=pat_input, outputs=pat_output)

    with gr.Tab("Generate Pattern"):
        gen_ref = gr.Audio(type="file", label="Reference Audio")
        gen_creativity = gr.Slider(0, 1, value=0.2, label="Creativity")
        gen_output = gr.JSON(label="Generated Pattern")
        gen_btn = gr.Button("Generate")
        gen_btn.click(generate_pattern, inputs=[gen_ref, gen_creativity], outputs=gen_output)

    with gr.Tab("Mix/Master"):
        master_input = gr.Audio(type="file", label="Input Audio")
        master_ref = gr.Audio(type="file", label="Reference Audio")
        master_output = gr.Audio(label="Mastered Output")
        master_btn = gr.Button("Master")
        master_btn.click(mix_and_master, inputs=[master_input, master_ref], outputs=master_output)

    with gr.Tab("Vocal Change"):
        vocal_input = gr.Audio(type="file", label="Input Vocal Audio")
        vocal_model = gr.Textbox(label="Voice Model Path")
        vocal_output = gr.Audio(label="Changed Vocal Output")
        vocal_btn = gr.Button("Change Vocal")
        vocal_btn.click(change_vocal, inputs=[vocal_input, vocal_model], outputs=vocal_output)

    with gr.Tab("Denoise"):
        denoise_input = gr.Audio(type="file", label="Input Audio")
        denoise_output = gr.Audio(label="Denoised Output")
        denoise_btn = gr.Button("Denoise")
        denoise_btn.click(denoise_audio, inputs=denoise_input, outputs=denoise_output)

    with gr.Tab("Detect Lyrics (Multi-Vocal)"):
        lyrics_input = gr.Audio(type="file", label="Input Audio")
        lyrics_output = gr.JSON(label="Detected Lyrics per Vocal")
        lyrics_btn = gr.Button("Detect Lyrics")
        lyrics_btn.click(detect_lyrics, inputs=lyrics_input, outputs=lyrics_output)

demo.launch()