Spaces:

Tachyeon
/

Swara-Split

Sleeping

App Files Files Community

Tachyeon committed on Jan 4

Commit

8a7120d

verified ·

1 Parent(s): 1259067

Update app.py

Browse files

Files changed (1) hide show

app.py +195 -82

app.py CHANGED Viewed

@@ -6,159 +6,272 @@ import soundfile as sf
 import numpy as np
 import os
 import sys
-# New import to download your model
 from huggingface_hub import hf_hub_download
-# 1. IMPORT YOUR LOCAL MODULES
-from bs_roformer import BSRoformer
-from attend import Attend
-# 2. SETUP DEVICE & PATCHES
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 def safe_attend_forward(self, q, k, v, mask=None):
     return F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0., is_causal=False)
-Attend.forward = safe_attend_forward
-print(f">>> 🎵 INITIALIZING RAW V11 ENGINE on {DEVICE}...")
-# 3. LOAD MODEL FROM YOUR REPO
 def load_model():
-    print(">>> 📡 Downloading/Loading weights from Tachyeon/IAM-RoFormer-Model-Weights...")
-    # This automatically downloads the file if it's not cached
     try:
         checkpoint_path = hf_hub_download(
             repo_id="Tachyeon/IAM-RoFormer-Model-Weights",
             filename="v11_consensus_epoch_30.pt"
         )
-        print(f">>> ✅ Weights found at: {checkpoint_path}")
     except Exception as e:
-        print(f"❌ Error downloading model: {e}")
         return None
-    # Initialize Architecture
-    try:
-        model = BSRoformer(
-            dim=512, depth=12, stereo=True, num_stems=4,
-            time_transformer_depth=1, freq_transformer_depth=1,
-            flash_attn=True
-        ).to(DEVICE)
-    except:
-        model = BSRoformer(
-            dim=512, depth=12, stereo=True, num_stems=4,
-            time_transformer_depth=1, freq_transformer_depth=1
-        ).to(DEVICE)
     # Load Weights
     ck = torch.load(checkpoint_path, map_location=DEVICE)
-    if 'model' in ck:
-        model.load_state_dict(ck['model'])
-    else:
-        model.load_state_dict(ck)
     model.eval()
     return model
 model = load_model()
-# 4. INFERENCE LOGIC
 def separate_audio(audio_path):
-    if model is None:
-        raise ValueError("Model failed to load. Check logs.")
-    if not audio_path:
-        return None, None, None, None
-    print(f"\n>>> 🪄 Separating '{os.path.basename(audio_path)}'...")
     mix, sr = librosa.load(audio_path, sr=44100, mono=False)
-    if len(mix.shape) == 1:
-        mix = np.stack([mix, mix], axis=0)
     chunk_size = 44100 * 10
     overlap = 44100 * 1
     mix_tensor = torch.tensor(mix, dtype=torch.float32).to(DEVICE)
-    if mix_tensor.dim() == 2:
-        mix_tensor = mix_tensor.unsqueeze(0)
     length = mix_tensor.shape[-1]
     final_output = torch.zeros(1, 4, 2, length).to(DEVICE)
     counts = torch.zeros(1, 4, 2, length).to(DEVICE)
-    print("    Processing chunks...")
     with torch.no_grad():
         context = torch.amp.autocast('cuda') if torch.cuda.is_available() else torch.no_grad()
         with context:
             for start in range(0, length, chunk_size - overlap):
                 end = min(start + chunk_size, length)
                 chunk = mix_tensor[:, :, start:end]
                 if chunk.shape[-1] < chunk_size:
                     chunk = F.pad(chunk, (0, chunk_size - chunk.shape[-1]))
                 pred = model(chunk)
                 valid = end - start
                 final_output[:, :, :, start:end] += pred[:, :, :, :valid]
                 counts[:, :, :, start:end] += 1.0
     stems = (final_output / torch.clamp(counts, min=1.0)).cpu().numpy()[0]
     outputs = []
-    stem_names = ["Stem_1", "Stem_2", "Stem_3", "Stem_4"]
-    for i, name in enumerate(stem_names):
-        outfile = f"output_{i}_{name}.wav"
         sf.write(outfile, stems[i].T, sr)
         outputs.append(outfile)
     return outputs[0], outputs[1], outputs[2], outputs[3]
 # ==========================================
-# 6. PROFESSIONAL UI (SWARA STUDIO)
 # ==========================================
-custom_css = """
-@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600&display=swap');
-body, .gradio-container { background-color: #0b0f19 !important; color: #e2e8f0 !important; font-family: 'Inter', sans-serif !important; }
-.main-header { text-align: center; margin-bottom: 2rem; padding: 2rem 0; border-bottom: 1px solid #1e293b; }
-.title-text { font-size: 3.5rem; font-weight: 300; letter-spacing: 4px; color: #f8fafc; margin: 0; text-transform: uppercase; }
-.studio-panel { background: #111827; border: 1px solid #1f2937; border-radius: 16px; padding: 24px; }
-#process-btn { background: linear-gradient(135deg, #3b82f6 0%, #2563eb 100%); border: none; color: white; font-weight: 600; padding: 12px; border-radius: 8px; margin-top: 20px; }
-#process-btn:hover { transform: translateY(-1px); box-shadow: 0 10px 15px -3px rgba(37, 99, 235, 0.3); }
-audio { width: 100%; filter: invert(0.9) hue-rotate(180deg); opacity: 0.8; }
-.section-label { font-size: 0.85rem; font-weight: 600; color: #64748b; text-transform: uppercase; margin-bottom: 12px; display: block; }
 """
-with gr.Blocks() as demo:
-    with gr.Row():
-        gr.HTML("""
-            <div class="main-header">
-                <h1 class="title-text">SWARA STUDIO</h1>
             </div>
-        """)
-    with gr.Row():
-        with gr.Column(scale=1):
-            with gr.Group(elem_classes="studio-panel"):
-                gr.HTML("<span class='section-label'>// Source Material</span>")
-                input_audio = gr.Audio(label="", type="filepath", interactive=True)
-                process_btn = gr.Button("INITIALIZE SEPARATION", elem_id="process-btn", size="lg")
-        with gr.Column(scale=1):
-            with gr.Group(elem_classes="studio-panel"):
-                gr.HTML("<span class='section-label'>// Isolated Stems</span>")
-                out1 = gr.Audio(label="Vocals", interactive=False, show_label=True)
-                out2 = gr.Audio(label="Mridangam", interactive=False, show_label=True)
-                out3 = gr.Audio(label="Tanpura", interactive=False, show_label=True)
-                out4 = gr.Audio(label="Other", interactive=False, show_label=True)
     process_btn.click(
-        fn=separate_audio,
-        inputs=[input_audio],
-        outputs=[out1, out2, out3, out4]
     )
 if __name__ == "__main__":
-    demo.launch(theme=gr.themes.Base(primary_hue="slate"), css=custom_css)

 import numpy as np
 import os
 import sys
 from huggingface_hub import hf_hub_download
+# ==========================================
+# 1. SETUP & MODEL LOADING (Backend)
+# ==========================================
+# The separation logic is unchanged; only the imports are made more robust.
+try:
+    from bs_roformer import BSRoformer
+    from attend import Attend
+except ImportError as import_err:
+    # Best-effort import: the app is still allowed to start without the local
+    # bs_roformer / attend modules (they are expected in the HF Space root),
+    # but the failure is reported instead of being silently swallowed so the
+    # cause is visible in the logs.
+    print(f"Warning: could not import local model modules: {import_err}")
+# Run on the GPU when torch reports one, otherwise fall back to the CPU.
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 def safe_attend_forward(self, q, k, v, mask=None):
+    """Attention forward pass built on ``F.scaled_dot_product_attention``.
+
+    ``self`` is accepted only so the function can be bound as a method; it is
+    not used. ``mask`` is forwarded as ``attn_mask``; dropout is disabled and
+    the attention is non-causal.
+    """
+    attended = F.scaled_dot_product_attention(
+        q, k, v, attn_mask=mask, dropout_p=0., is_causal=False
+    )
+    return attended
+# Monkey Patch
+# Attend only exists when the guarded import succeeded; when it is missing
+# there is nothing to patch, so the assignment is skipped.
+if "Attend" in globals():
+    Attend.forward = safe_attend_forward
+# Load Model with Caching
 def load_model():
+    """Download the checkpoint and build the separation model.
+
+    Returns:
+        The BSRoformer model moved to ``DEVICE`` and switched to eval mode, or
+        ``None`` when the model class is unavailable or the checkpoint
+        download fails.
+    """
+    print(">>> 📡 Loading Model Weights...")
+    # The BSRoformer import is allowed to fail, so check for the class here
+    # instead of crashing the whole app with a NameError at start-up.
+    try:
+        architecture = BSRoformer
+    except NameError:
+        print("Error: BSRoformer is not available (bs_roformer failed to import)")
+        return None
     try:
         checkpoint_path = hf_hub_download(
             repo_id="Tachyeon/IAM-RoFormer-Model-Weights",
             filename="v11_consensus_epoch_30.pt"
         )
     except Exception as e:
+        print(f"Error: {e}")
         return None
+    # Initialize Architecture (Standard BSRoformer Config)
+    model = architecture(
+        dim=512, depth=12, stereo=True, num_stems=4,
+        time_transformer_depth=1, freq_transformer_depth=1,
+        flash_attn=True
+    ).to(DEVICE)
     # Load Weights
+    # NOTE(review): torch.load unpickles the file, so only load checkpoints
+    # from a trusted repo; consider weights_only=True if the checkpoint holds
+    # plain tensors — confirm before changing.
     ck = torch.load(checkpoint_path, map_location=DEVICE)
+    # The state dict is either the checkpoint itself or stored under 'model'.
+    state_dict = ck['model'] if 'model' in ck else ck
+    model.load_state_dict(state_dict)
     model.eval()
     return model
+# Initialize Global Model
+# Loaded once at import time; separate_audio() reads this global and checks
+# for None, which load_model() returns when loading fails.
 model = load_model()
+# ==========================================
+# 2. INFERENCE LOGIC (Chunking)
+# ==========================================
 def separate_audio(audio_path):
+    """Split a mix into four stems using chunked overlap-add inference.
+
+    Args:
+        audio_path: Path of the audio file to separate (may be empty/None
+            when nothing was uploaded).
+
+    Returns:
+        A 4-tuple of WAV file paths, one per stem in the order given by
+        ``stem_names``, or four ``None`` values when no file was provided.
+
+    Raises:
+        gr.Error: if the global model failed to load.
+    """
+    # Function-scope imports keep this block self-contained.
+    import tempfile
+    from contextlib import nullcontext
+
+    if model is None:
+        # Surface the failure in the UI instead of silently returning nothing.
+        raise gr.Error("Model failed to load. Check logs.")
+    if not audio_path:
+        return None, None, None, None
+    # Load & Normalize: resample to 44.1 kHz and keep the channels separate.
     mix, sr = librosa.load(audio_path, sr=44100, mono=False)
+    if mix.ndim == 1:
+        # A 1-D result has no channel axis; duplicate it into two channels.
+        mix = np.stack([mix, mix], axis=0)
+    # Chunking Params: 10 s windows that advance by 9 s (1 s overlap).
     chunk_size = 44100 * 10
     overlap = 44100 * 1
+    hop = chunk_size - overlap
     mix_tensor = torch.tensor(mix, dtype=torch.float32).to(DEVICE)
+    if mix_tensor.dim() == 2:
+        # Add a leading batch dimension.
+        mix_tensor = mix_tensor.unsqueeze(0)
     length = mix_tensor.shape[-1]
+    # Accumulators for the summed predictions and for how many chunks
+    # contributed to each sample; allocated directly on the target device.
+    final_output = torch.zeros(1, 4, 2, length, device=DEVICE)
+    counts = torch.zeros(1, 4, 2, length, device=DEVICE)
+    # Inference Loop
     with torch.no_grad():
+        # Autocast only when CUDA is available; otherwise a no-op context is
+        # enough because gradients are already disabled by no_grad() above.
+        context = torch.amp.autocast('cuda') if torch.cuda.is_available() else nullcontext()
         with context:
+            for start in range(0, length, hop):
                 end = min(start + chunk_size, length)
                 chunk = mix_tensor[:, :, start:end]
+                # Pad the last (short) chunk up to the fixed chunk size.
                 if chunk.shape[-1] < chunk_size:
                     chunk = F.pad(chunk, (0, chunk_size - chunk.shape[-1]))
                 pred = model(chunk)
+                # Overlap Add: keep only the un-padded part of the prediction.
                 valid = end - start
                 final_output[:, :, :, start:end] += pred[:, :, :, :valid]
                 counts[:, :, :, start:end] += 1.0
+    # Normalize by counts so overlapping regions are averaged.
     stems = (final_output / torch.clamp(counts, min=1.0)).cpu().numpy()[0]
+    # Save Outputs into a per-request temp directory so that concurrent or
+    # successive requests do not overwrite each other's files.
+    out_dir = tempfile.mkdtemp(prefix="swara_split_")
     outputs = []
+    stem_names = ["Vocals", "Drums", "Bass", "Other"]
+    for i, name in enumerate(stem_names):
+        outfile = os.path.join(out_dir, f"stem_{i}_{name}.wav")
         sf.write(outfile, stems[i].T, sr)
         outputs.append(outfile)
     return outputs[0], outputs[1], outputs[2], outputs[3]
 # ==========================================
+# 3. UI DESIGN (ELEGANT DARK MODE)
 # ==========================================
+# CSS: High-End VST Plugin Look
+# Passed to gr.Blocks(css=css). The class names defined here (contain,
+# header-bar, brand, tagline, panel, panel-header, primary-btn,
+# audio-container) are the ones attached in the layout via elem_classes or
+# inline HTML.
+# NOTE(review): body/.gradio-container is fixed at 100vh with overflow hidden,
+# so content taller than the viewport cannot be scrolled to — confirm the
+# layout on small screens.
+css = """
+@import url('https://fonts.googleapis.com/css2?family=Manrope:wght@300;400;600;800&display=swap');
+:root {
+    --bg-dark: #0F1116;
+    --panel-bg: #161922;
+    --accent: #6C5CE7; /* Elegant Violet */
+    --accent-glow: rgba(108, 92, 231, 0.3);
+    --text-main: #E0E0E0;
+    --text-muted: #888899;
+    --border: #2A2D3A;
+}
+body, .gradio-container {
+    background-color: var(--bg-dark) !important;
+    font-family: 'Manrope', sans-serif !important;
+    color: var(--text-main) !important;
+    margin: 0;
+    padding: 0;
+    height: 100vh; /* Force full screen */
+    overflow: hidden; /* No scroll */
+}
+/* Remove Gradio Bloat */
+footer { display: none !important; }
+.contain { display: flex; flex-direction: column; height: 100%; padding: 20px !important; }
+/* HEADER */
+.header-bar {
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+    padding-bottom: 20px;
+    border-bottom: 1px solid var(--border);
+    margin-bottom: 20px;
+}
+.brand {
+    font-size: 1.5rem;
+    font-weight: 800;
+    letter-spacing: 1px;
+    background: linear-gradient(90deg, #fff, #a5b4fc);
+    -webkit-background-clip: text;
+    -webkit-text-fill-color: transparent;
+}
+.tagline {
+    font-size: 0.85rem;
+    color: var(--text-muted);
+    font-weight: 400;
+}
+/* PANELS */
+.panel {
+    background: var(--panel-bg);
+    border: 1px solid var(--border);
+    border-radius: 16px;
+    padding: 24px;
+    height: 100%;
+    display: flex;
+    flex-direction: column;
+    box-shadow: 0 10px 30px rgba(0,0,0,0.2);
+}
+.panel-header {
+    font-size: 0.9rem;
+    color: var(--accent);
+    text-transform: uppercase;
+    letter-spacing: 2px;
+    font-weight: 600;
+    margin-bottom: 15px;
+    display: flex;
+    align-items: center;
+    gap: 8px;
+}
+/* BUTTONS */
+button.primary-btn {
+    background: linear-gradient(135deg, var(--accent) 0%, #4834d4 100%) !important;
+    border: none !important;
+    color: white !important;
+    font-weight: 700 !important;
+    padding: 15px !important;
+    border-radius: 12px !important;
+    font-size: 1rem !important;
+    margin-top: auto !important; /* Push to bottom */
+    transition: all 0.3s ease !important;
+    box-shadow: 0 4px 15px var(--accent-glow) !important;
+}
+button.primary-btn:hover {
+    transform: translateY(-2px);
+    box-shadow: 0 8px 25px var(--accent-glow) !important;
+}
+/* AUDIO PLAYERS - Minimalist */
+.audio-container {
+    background: transparent !important;
+    border: none !important;
+}
 """
+# Page layout: a header bar on top, then one row with two columns — the
+# input deck (scale 1) and the stem rack (scale 2) — and finally the click
+# handler that connects the button to separate_audio().
+with gr.Blocks(css=css, theme=gr.themes.Base()) as demo:
+    with gr.Column(elem_classes="contain"):
+        # 1. TOP BAR
+        with gr.Row(elem_classes="header-bar"):
+            gr.HTML("""
+            <div>
+                <div class="brand">SWARA STUDIO <span style="font-weight:300; opacity:0.5;">| PRO</span></div>
+                <div class="tagline">Indian Art Music Source Separation Engine</div>
             </div>
+            """)
+        # 2. MAIN WORKSPACE (Grid)
+        with gr.Row(equal_height=True):
+            # LEFT: INPUT DECK
+            with gr.Column(scale=1):
+                with gr.Group(elem_classes="panel"):
+                    gr.HTML('<div class="panel-header">💿 Source Deck</div>')
+                    # File Input
+                    # type="filepath" hands separate_audio() a path on disk.
+                    input_audio = gr.Audio(
+                        label="Drop Mix Here",
+                        type="filepath",
+                        interactive=True,
+                        elem_classes="audio-container"
+                    )
+                    gr.Markdown("Supports WAV, MP3, FLAC (44.1kHz)", elem_classes="tagline")
+                    # Separation Button (Pushed to bottom via CSS)
+                    process_btn = gr.Button("⚡ SEPARATE TRACKS", elem_classes="primary-btn")
+            # RIGHT: OUTPUT RACK
+            with gr.Column(scale=2):
+                with gr.Group(elem_classes="panel"):
+                    gr.HTML('<div class="panel-header">🎚️ Stem Rack</div>')
+                    # Four read-only players arranged as two columns of two.
+                    with gr.Row():
+                        with gr.Column():
+                            out_vocals = gr.Audio(label="🎤 Vocals", interactive=False, type="filepath")
+                            out_drums = gr.Audio(label="🥁 Mridangam / Drums", interactive=False, type="filepath")
+                        with gr.Column():
+                            out_bass = gr.Audio(label="🎸 Tanpura / Bass", interactive=False, type="filepath")
+                            out_other = gr.Audio(label="🎻 Violin / Other", interactive=False, type="filepath")
+    # 3. WIRING
+    # The four outputs receive the four values returned by separate_audio(),
+    # in the order listed here.
     process_btn.click(
+        fn=separate_audio,
+        inputs=[input_audio],
+        outputs=[out_vocals, out_drums, out_bass, out_other]
     )
 if __name__ == "__main__":
+    # Launch the Gradio app only when this file is executed as a script.
+    demo.launch()