ChatGPT committed on
Commit
eb1a122
·
1 Parent(s): 0821fc5

feat: replace gradio with custom extraction web app

Browse files
.gitignore ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .runs/
2
+ .command-logs/
3
+ __pycache__/
4
+ *.pyc
5
+ *.pyo
6
+ *.pyd
7
+ *.egg-info/
8
+ .venv/
9
+ venv/
10
+ dist/
11
+ build/
12
+ .DS_Store
13
+ .env
14
+ *.wav
15
+ *.mp3
16
+ *.flac
17
+ *.aiff
18
+ *.ogg
19
+ *.m4a
20
+ *.mid
21
+ *.zip
22
+ !drum-sample-extractor-updated.zip
Dockerfile ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ ENV PYTHONDONTWRITEBYTECODE=1 \
4
+ PYTHONUNBUFFERED=1 \
5
+ PIP_NO_CACHE_DIR=1
6
+
7
+ WORKDIR /app
8
+
9
+ RUN apt-get update \
10
+ && apt-get install -y --no-install-recommends ffmpeg libsndfile1 git \
11
+ && rm -rf /var/lib/apt/lists/*
12
+
13
+ COPY requirements.txt ./
14
+ RUN pip install -r requirements.txt
15
+
16
+ COPY . ./
17
+ EXPOSE 7860
18
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,12 +1,85 @@
1
  ---
2
  title: Drum Sample Extractor
3
- emoji: 📊
4
  colorFrom: gray
5
  colorTo: pink
6
- sdk: gradio
7
- sdk_version: 6.13.0
8
- app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: Drum Sample Extractor
3
+ emoji: 🥁
4
  colorFrom: gray
5
  colorTo: pink
6
+ sdk: docker
7
+ app_port: 7860
 
8
  pinned: false
9
  ---
10
 
11
+ # Drum Sample Extractor
12
+
13
+ A custom FastAPI + browser UI for extracting reusable drum samples from an audio file.
14
+
15
+ The pipeline can isolate a stem with Demucs, detect onsets, classify hits, cluster similar transients, choose representative samples, optionally synthesize alternate samples, and export WAVs, MIDI, reconstruction audio, and a complete ZIP sample pack.
16
+
17
+ ## Current status
18
+
19
+ - Gradio has been replaced by a custom web frontend in `web/` served by `app.py`.
20
+ - The extraction pipeline is exposed through a JSON/multipart API and factored into `pipeline_runner.py`.
21
+ - Per-stage timing is captured for every extraction run and written into `manifest.json`.
22
+ - Benchmarking support is available in `scripts/benchmark_subprocesses.py`.
23
+ - Legacy Gradio apps are preserved in `legacy/` for reference only.
24
+
25
+ ## Run locally
26
+
27
+ ```bash
28
+ python3 -m venv .venv
29
+ source .venv/bin/activate
30
+ pip install -r requirements.txt
31
+ uvicorn app:app --host 0.0.0.0 --port 7860
32
+ ```
33
+
34
+ Open `http://127.0.0.1:7860`.
35
+
36
+ For fast iteration, set `Stem` to `all`. That bypasses Demucs and runs onset detection, classification, clustering, representative selection, synthesis, MIDI rendering, and packaging directly on the uploaded audio.
37
+
38
+ ## Run benchmarks
39
+
40
+ ```bash
41
+ python3 scripts/benchmark_subprocesses.py --runs 2 --bars 4 --output docs/benchmark-subprocesses.json
42
+ ```
43
+
44
+ The benchmark uses synthetic drum fixtures and `stem=all` so the DSP stages are measured without Demucs model download/runtime noise.
45
+
46
+ ## API
47
+
48
+ ```bash
49
+ curl http://127.0.0.1:7860/api/config
50
+
51
+ curl -F 'file=@song.wav' \
52
+ -F 'params={"stem":"all","target_min":4,"target_max":12}' \
53
+ http://127.0.0.1:7860/api/jobs
54
+ ```
55
+
56
+ Then poll the returned job id:
57
+
58
+ ```bash
59
+ curl http://127.0.0.1:7860/api/jobs/<job-id>
60
+ ```
61
+
62
+ ## Important files
63
+
64
+ | Path | Purpose |
65
+ |---|---|
66
+ | `app.py` | FastAPI app, static UI serving, job API, artifact downloads |
67
+ | `pipeline_runner.py` | Timed extraction pipeline used by API and benchmarks |
68
+ | `sample_extractor.py` | Core DSP/sample extraction implementation |
69
+ | `web/` | Custom no-build browser frontend |
70
+ | `scripts/benchmark_subprocesses.py` | Synthetic benchmark runner for stage timings |
71
+ | `docs/` | Review, timing, API, and UI documentation |
72
+ | `legacy/` | Previous Gradio apps retained for reference |
73
+
74
+ ## Output per run
75
+
76
+ Each run is stored under `.runs/<job-id>/output/`:
77
+
78
+ - `stem.wav`
79
+ - `reconstruction.wav`
80
+ - `reconstruction.mid`
81
+ - `sample-pack.zip`
82
+ - `samples/*.wav`
83
+ - `manifest.json`
84
+
85
+ `.runs/` is ignored by git.
app.py CHANGED
@@ -1,259 +1,204 @@
 
 
 
 
 
1
  """
2
- Gradio UI — Sample Extractor v9.
3
- SuperFlux onsets, transient NCC, mel pre-filter, MIDI quantization, param locking.
4
- """
5
 
6
- import os, sys
7
- sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
8
-
9
- # ─── HOTFIX: patch _sf() keyword argument bug ────────────────────────────────
10
- _src = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'sample_extractor.py')
11
- with open(_src, 'r') as _f: _content = _f.read()
12
- if '_sf(yh,lag=2,ms=5)' in _content:
13
- _content = _content.replace('_sf(yh,lag=2,ms=5)', '_sf(yh,l=2,ms=5)')
14
- with open(_src, 'w') as _f: _f.write(_content)
15
- print("[HOTFIX] Fixed _sf() kwarg: lag=2 → l=2")
16
- del _src, _content
17
- # ──────────────────────────────────────────────────────────────────────────────
18
-
19
- import gradio as gr
20
- import numpy as np, pandas as pd, json, tempfile
21
- import soundfile as sf, librosa
22
- import matplotlib; matplotlib.use('Agg')
23
- import matplotlib.pyplot as plt
24
-
25
- from sample_extractor import (
26
- extract_stem, detect_onsets, classify_hits,
27
- cluster_hits, select_best, synthesize_from_cluster,
28
- sample_quality_score, export_midi, detect_bpm,
29
- render_midi_with_samples, build_archive, cache_clear, auto_tune,
30
- DEMUCS_MODELS, DEMUCS_STEMS,
 
 
 
 
 
 
 
31
  )
32
- from synth_generator import generate_test_song
33
- from evaluation import evaluate_extraction
34
- from config_store import PipelineConfig, get_leaderboard
35
- from optimizer_v2 import run_optimization
36
-
37
- def audio_tuple(a, sr):
38
- a = a.astype(np.float32); pk = np.abs(a).max()
39
- if pk > 0: a = a / pk * 0.95
40
- return (sr, a)
41
-
42
- def run_auto_tune(audio_in, stem_choice, demucs_model, demucs_shifts, demucs_overlap,
43
- onset_mode, cur_delta, cur_energy, cur_gap, cur_tmin, cur_tmax,
44
- lock_delta, lock_energy, lock_gap, lock_targets, progress=gr.Progress()):
45
- if audio_in is None: return [gr.update()]*5 + ["Upload audio first", ""]
46
- locks = {}
47
- if lock_delta: locks['onset_delta'] = float(cur_delta)
48
- if lock_energy: locks['energy_threshold_db'] = float(cur_energy)
49
- if lock_gap: locks['min_gap'] = float(cur_gap)
50
- if lock_targets: locks['target_min']=int(cur_tmin); locks['target_max']=int(cur_tmax)
51
- progress(0.0); sr_in,data=audio_in; data=data.astype(np.float32)
52
- if data.ndim>1: data=data.mean(axis=1)
53
- pk=np.abs(data).max()
54
- if pk>0: data/=pk
55
- with tempfile.NamedTemporaryFile(suffix='.wav',delete=False) as f:
56
- sf.write(f.name,data,sr_in); tmp=f.name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  try:
58
- progress(0.05,desc=f"Stem..."); sa,ssr=extract_stem(tmp,stem=stem_choice,device="cpu",
59
- model_name=demucs_model,shifts=int(demucs_shifts),overlap=float(demucs_overlap))
60
- ld=', '.join(f'{k}={v}' for k,v in locks.items()) if locks else 'none'
61
- progress(0.15,desc=f"Tuning (🔒 {ld})...")
62
- bp,bs,log=auto_tune(sa,ssr,mode=onset_mode,locks=locks)
63
- progress(1.0)
64
- lt='\n'.join(log[-30:])
65
- li=f"🔒 Locked: {ld}" if locks else "All params free"
66
- sm=f"**Score: {bs:.1f}/100** · {li}\n\nClick **Extract** to use these settings."
67
- return [
68
- gr.update(value=bp['onset_delta']) if not lock_delta else gr.update(),
69
- gr.update(value=bp['energy_threshold_db']) if not lock_energy else gr.update(),
70
- gr.update(value=bp['min_gap']) if not lock_gap else gr.update(),
71
- gr.update(value=bp.get('target_min',5)) if not lock_targets else gr.update(),
72
- gr.update(value=bp.get('target_max',20)) if not lock_targets else gr.update(),
73
- sm, lt]
74
- finally: os.unlink(tmp)
75
-
76
- def run_extraction(audio_in, stem_choice, demucs_model, demucs_shifts, demucs_overlap,
77
- onset_mode, onset_delta, energy_db, pre_pad, min_dur, max_dur, min_gap,
78
- ncc_threshold, attack_ms, linkage, target_min, target_max,
79
- do_synthesize, quantize_midi, subdivision, progress=gr.Progress()):
80
- if audio_in is None: return [None]*8
81
- progress(0.0); sr_in,data=audio_in; data=data.astype(np.float32)
82
- if data.ndim>1: data=data.mean(axis=1)
83
- pk=np.abs(data).max()
84
- if pk>0: data/=pk
85
- with tempfile.NamedTemporaryFile(suffix='.wav',delete=False) as f:
86
- sf.write(f.name,data,sr_in); tmp=f.name
 
87
  try:
88
- progress(0.05,desc=f"Stem ({demucs_model})...")
89
- sa,ssr=extract_stem(tmp,stem=stem_choice,device="cpu",
90
- model_name=demucs_model,shifts=int(demucs_shifts),overlap=float(demucs_overlap))
91
- progress(0.15,desc="BPM..."); bpm=detect_bpm(sa,ssr)
92
- progress(0.25,desc="Onsets...")
93
- hits=detect_onsets(sa,ssr,mode=onset_mode,onset_delta=float(onset_delta),
94
- energy_threshold_db=float(energy_db),pre_pad=float(pre_pad),
95
- min_dur=float(min_dur),max_dur=float(max_dur),min_gap=float(min_gap))
96
- if not hits:
97
- return (audio_tuple(sa,ssr),f"**BPM: {bpm}** — No hits.",None,None,None,None,"",pd.DataFrame())
98
- progress(0.35,desc="Classify..."); hits=classify_hits(hits)
99
- progress(0.45,desc="Cluster...")
100
- cl=cluster_hits(hits,audio=sa,sr=ssr,ncc_threshold=float(ncc_threshold),
101
- attack_ms=float(attack_ms),target_min=int(target_min),target_max=int(target_max),linkage=str(linkage))
102
- progress(0.65,desc="Select..."); select_best(cl)
103
- if do_synthesize:
104
- progress(0.7,desc="Synth...")
105
- for c in cl:
106
- if c.count>=2: c.synthesized=synthesize_from_cluster(c)
107
- progress(0.75,desc="MIDI..."); mp=tempfile.mktemp(suffix='.mid')
108
- export_midi(cl,mp,bpm=bpm,quantize=bool(quantize_midi),subdivision=int(subdivision))
109
- progress(0.8,desc="Render..."); rend=render_midi_with_samples(cl,sr=ssr)
110
- progress(0.85,desc="Package...")
111
- sd=tempfile.mkdtemp(); sp=[]
112
- for c in sorted(cl,key=lambda x:x.count,reverse=True):
113
- p=os.path.join(sd,f"{c.label}.wav"); c.best_hit.save(p); sp.append(p)
114
- zp=build_archive(cl,bpm,ssr,midi_path=mp,rendered_audio=rend)
115
- rows=[]
116
- for c in sorted(cl,key=lambda x:x.count,reverse=True):
117
- b=c.best_hit; sc=sample_quality_score(b.audio,b.sr,c.label.rsplit('_',1)[0])
118
- rows.append({'Sample':c.label,'Hits':c.count,'MIDI':c.midi_note,
119
- 'Score':f"{sc['total']:.1f}",'Clean':f"{sc['cleanness']:.2f}",
120
- 'Complete':f"{sc['completeness']:.2f}",
121
- 'Dur':f"{b.duration*1000:.0f}ms",
122
- 'First':f"{sorted(h.onset_time for h in c.hits)[0]:.2f}s"})
123
- sm=f"**BPM: {bpm}** · **{len(cl)} samples** from {len(hits)} hits\n\n"
124
- sm+=f"`{demucs_model}` · δ=`{onset_delta}` · E=`{energy_db}dB` · attack=`{attack_ms}ms`"
125
- if int(target_min)>0 and int(target_max)>0: sm+=f" · clusters `{int(target_min)}–{int(target_max)}`"
126
- if quantize_midi: sm+=f" · MIDI 1/{int(subdivision)}"
127
- sm+="\n\n| Sample | Hits | MIDI |\n|---|---|---|\n"
128
- for c in sorted(cl,key=lambda x:x.count,reverse=True): sm+=f"| {c.label} | {c.count} | {c.midi_note} |\n"
129
- progress(1.0)
130
- return (audio_tuple(sa,ssr),sm,audio_tuple(rend,ssr),sp,mp,zp,"",pd.DataFrame(rows))
131
- finally: os.unlink(tmp)
132
-
133
- def run_eval(pattern,bpm,bars,ncc_threshold,target_min,target_max,progress=gr.Progress()):
134
- progress(0.0); song=generate_test_song(pattern_name=pattern,bars=int(bars),bpm=float(bpm),variation='medium',seed=42)
135
- dbpm=detect_bpm(song.drums_only,song.sr); progress(0.2)
136
- hits=detect_onsets(song.drums_only,song.sr)
137
- if not hits: return None,None,None,None,"",""
138
- hits=classify_hits(hits)
139
- cl=cluster_hits(hits,audio=song.drums_only,sr=song.sr,ncc_threshold=float(ncc_threshold),
140
- target_min=int(target_min),target_max=int(target_max))
141
- select_best(cl)
142
- for c in cl:
143
- if c.count>=2: c.synthesized=synthesize_from_cluster(c)
144
- progress(0.5); rend=render_midi_with_samples(cl,sr=song.sr); progress(0.6)
145
- gt={n:s.audio for n,s in song.samples.items()}
146
- gh=[{'sample':h.sample_name,'onset':h.onset_time,'velocity':h.velocity} for h in song.hits]
147
- r=evaluate_extraction(cl,gt,gh,song.sr,hits)
148
- s=[{'Metric':'BPM','Value':f"{dbpm}",'Target':f"{song.bpm}"},
149
- {'Metric':'Clusters','Value':str(len(cl)),'Target':str(len(gt))},
150
- {'Metric':'Score','Value':f"{r.overall_score:.1f}/100",'Target':'> 70'}]
151
- if r.unmatched_gt: s.append({'Metric':'⚠','Value':', '.join(r.unmatched_gt),'Target':'None'})
152
- m=[{'Cluster':x.cluster_label,'GT':x.gt_name,'Score':f"{x.sample_score:.1f}"} for x in r.matches]
153
- progress(1.0)
154
- return (audio_tuple(song.mix,song.sr),audio_tuple(rend,song.sr),pd.DataFrame(s),pd.DataFrame(m) if m else None,"","")
155
-
156
- def run_optimize(n,name,author,save,progress=gr.Progress()):
157
- logs=[]; progress(0.0)
158
- state=run_optimization(n_iterations=int(n),config_name=name or "opt",
159
- author=author or "anon",save_to_hub=bool(save),log_fn=lambda m:logs.append(m))
160
- progress(1.0)
161
- h=[{'Iter':r.iteration,'Score':f"{r.avg_score:.1f}"} for r in state.history]
162
- if state.history:
163
- fig,ax=plt.subplots(figsize=(10,4)); ax.plot([r.iteration for r in state.history],[r.avg_score for r in state.history],'b-o')
164
- ax.grid(True,alpha=0.3); plt.tight_layout()
165
- else: fig,ax=plt.subplots(); ax.text(0.5,0.5,"No data")
166
- return '\n'.join(logs),pd.DataFrame(h),fig,json.dumps(state.best_config,indent=2)
167
-
168
- def refresh_lb():
169
- try: lb=get_leaderboard(); return pd.DataFrame(lb) if lb else pd.DataFrame(),""
170
- except Exception as e: return pd.DataFrame(),str(e)
171
-
172
- def build_app():
173
- with gr.Blocks(title="🎵 Sample Extractor",theme=gr.themes.Soft(),
174
- css=".gradio-container{max-width:1300px!important}") as app:
175
- gr.Markdown("# 🎵 Sample Extractor v9\n"
176
- "**SuperFlux** onsets · **Transient NCC** (25ms attack) · "
177
- "**Mel pre-filter** · **MIDI quantization** · **Auto-Tune** with 🔒 locks")
178
- with gr.Tabs():
179
- with gr.Tab("🎵 Extract"):
180
- audio_in=gr.Audio(sources=['upload'],type='numpy',label='Upload Audio')
181
- with gr.Accordion("🔧 Stem Separation",open=False):
182
- with gr.Row():
183
- dm=gr.Dropdown(DEMUCS_MODELS,value="htdemucs_ft",label="Model")
184
- st=gr.Dropdown(['drums','bass','other','vocals','all'],value='drums',label='Stem')
185
- dsh=gr.Slider(0,5,value=1,step=1,label='Shifts')
186
- dov=gr.Slider(0.0,0.5,value=0.25,step=0.05,label='Overlap')
187
- with gr.Accordion("🎯 Onset Detection",open=False):
188
- with gr.Row(): om=gr.Dropdown(['auto','percussive','harmonic','broadband'],value='auto',label='Mode')
189
- with gr.Row():
190
- od=gr.Slider(0.01,0.5,value=0.12,step=0.01,label='Delta'); lock_od=gr.Checkbox(value=False,label='🔒',scale=0)
191
- with gr.Row():
192
- ed=gr.Slider(-70,-10,value=-35,step=1,label='Energy (dB)'); lock_ed=gr.Checkbox(value=False,label='🔒',scale=0)
193
- with gr.Row():
194
- mg=gr.Slider(0.005,0.2,value=0.03,step=0.005,label='Min gap'); lock_mg=gr.Checkbox(value=False,label='🔒',scale=0)
195
- with gr.Row():
196
- pp=gr.Slider(0.0,0.05,value=0.003,step=0.001,label='Pre-pad')
197
- mnd=gr.Slider(0.005,0.2,value=0.02,step=0.005,label='Min dur')
198
- mxd=gr.Slider(0.1,5.0,value=1.5,step=0.1,label='Max dur')
199
- with gr.Accordion("🔗 Clustering",open=True):
200
- with gr.Row():
201
- tmin=gr.Number(value=5,label='Target min',precision=0)
202
- tmax=gr.Number(value=20,label='Target max',precision=0)
203
- lock_tgt=gr.Checkbox(value=True,label='🔒 Lock range',scale=0)
204
- gr.Markdown("*🔒 = auto-tune keeps this value fixed*")
205
- with gr.Row():
206
- nt=gr.Slider(0.3,0.99,value=0.80,step=0.01,label='NCC threshold')
207
- atk=gr.Slider(10,100,value=25,step=5,label='Attack (ms)')
208
- lnk=gr.Dropdown(['average','complete','single'],value='average',label='Linkage')
209
- with gr.Accordion("🎹 MIDI & Post",open=False):
210
- with gr.Row():
211
- syn=gr.Checkbox(value=True,label='Synthesize')
212
- qmidi=gr.Checkbox(value=True,label='Quantize MIDI')
213
- subdiv=gr.Dropdown([('8th',8),('16th',16),('32nd',32)],value=16,label='Grid')
214
- with gr.Row():
215
- tune_btn=gr.Button("🎛️ Auto-Tune",variant="secondary",size="lg")
216
- extract_btn=gr.Button("🔬 Extract",variant="primary",size="lg")
217
- tune_summary=gr.Markdown(""); tune_log=gr.Textbox(label="Log",lines=8,max_lines=15,visible=False)
218
- summary_md=gr.Markdown("*Upload → Auto-Tune or Extract*")
219
- with gr.Row():
220
- stem_out=gr.Audio(type='numpy',label='Stem',interactive=False)
221
- rend_out=gr.Audio(type='numpy',label='🔊 Reconstruction',interactive=False)
222
- gr.Markdown("### Downloads")
223
- with gr.Row():
224
- arc=gr.File(label="📦 ZIP",interactive=False); mid=gr.File(label="🎹 MIDI",interactive=False)
225
- smp=gr.File(label="WAVs",file_count="multiple",interactive=False)
226
- met=gr.Dataframe(label="Samples"); stx=gr.Textbox(visible=False)
227
- dm.change(fn=lambda m:gr.update(choices=DEMUCS_STEMS.get(m,["drums","bass","other","vocals"])+["all"]),inputs=[dm],outputs=[st])
228
- tune_btn.click(run_auto_tune,[audio_in,st,dm,dsh,dov,om,od,ed,mg,tmin,tmax,lock_od,lock_ed,lock_mg,lock_tgt],
229
- [od,ed,mg,tmin,tmax,tune_summary,tune_log])
230
- extract_btn.click(run_extraction,[audio_in,st,dm,dsh,dov,om,od,ed,pp,mnd,mxd,mg,nt,atk,lnk,tmin,tmax,syn,qmidi,subdiv],
231
- [stem_out,summary_md,rend_out,smp,mid,arc,stx,met])
232
- with gr.Tab("📊 Evaluate"):
233
- with gr.Row():
234
- ep=gr.Dropdown(['rock','funk','halftime'],value='rock',label='Pattern')
235
- eb=gr.Slider(80,200,value=120,step=2,label='BPM'); ebs=gr.Slider(2,8,value=4,step=1,label='Bars')
236
- with gr.Row():
237
- en=gr.Slider(0.3,0.99,value=0.80,step=0.01,label='NCC')
238
- etm=gr.Number(value=0,label='Min',precision=0); etx=gr.Number(value=0,label='Max',precision=0)
239
- evb=gr.Button("🧪 Evaluate",variant="primary",size="lg")
240
- with gr.Row():
241
- evm=gr.Audio(type='numpy',label='Original',interactive=False)
242
- evr=gr.Audio(type='numpy',label='Reconstruction',interactive=False)
243
- evs=gr.Dataframe(); evm2=gr.Dataframe()
244
- es1=gr.Textbox(visible=False); es2=gr.Textbox(visible=False)
245
- evb.click(run_eval,[ep,eb,ebs,en,etm,etx],[evm,evr,evs,evm2,es1,es2])
246
- with gr.Tab("🔄 Optimize"):
247
- with gr.Row():
248
- on=gr.Slider(2,30,value=5,step=1,label='Iters'); ocn=gr.Textbox(value="opt",label='Name')
249
- oa=gr.Textbox(value="",label='Author'); osv=gr.Checkbox(value=True,label='Save')
250
- ob=gr.Button("🚀 Run",variant="primary",size="lg")
251
- ol=gr.Textbox(label="Log",lines=20,max_lines=40); oh=gr.Dataframe(); op=gr.Plot()
252
- oc=gr.Code(label="Config",language="json")
253
- ob.click(run_optimize,[on,ocn,oa,osv],[ol,oh,op,oc])
254
- with gr.Tab("🏆 Leaderboard"):
255
- lbb=gr.Button("🔄 Refresh"); lt=gr.Dataframe(); ls=gr.Textbox(visible=False)
256
- lbb.click(refresh_lb,[],[lt,ls])
257
- return app
258
-
259
- if __name__=="__main__": build_app().launch(server_name="0.0.0.0",server_port=7860)
 
1
+ #!/usr/bin/env python3
2
+ """Custom web application for the drum sample extractor.
3
+
4
+ Run with:
5
+ uvicorn app:app --host 0.0.0.0 --port 7860
6
  """
 
 
 
7
 
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import shutil
12
+ import traceback
13
+ import uuid
14
+ from concurrent.futures import ThreadPoolExecutor
15
+ from dataclasses import asdict
16
+ from pathlib import Path
17
+ from threading import Lock
18
+ from typing import Any
19
+
20
+ from fastapi import FastAPI, File, Form, HTTPException, UploadFile
21
+ from fastapi.middleware.cors import CORSMiddleware
22
+ from fastapi.responses import FileResponse, JSONResponse
23
+ from fastapi.staticfiles import StaticFiles
24
+
25
+ from pipeline_runner import PipelineParams, initial_stages, run_extraction_pipeline
26
+ from sample_extractor import DEMUCS_MODELS, DEMUCS_STEMS, cache_clear
27
+
28
+ ROOT = Path(__file__).resolve().parent
29
+ WEB_DIR = ROOT / "web"
30
+ RUNS_DIR = ROOT / ".runs"
31
+ RUNS_DIR.mkdir(exist_ok=True)
32
+
33
+ app = FastAPI(title="Drum Sample Extractor", version="10.0.0")
34
+ app.add_middleware(
35
+ CORSMiddleware,
36
+ allow_origins=["*"],
37
+ allow_credentials=False,
38
+ allow_methods=["*"],
39
+ allow_headers=["*"],
40
  )
41
+
42
+ executor = ThreadPoolExecutor(max_workers=1)
43
+ jobs_lock = Lock()
44
+ jobs: dict[str, dict[str, Any]] = {}
45
+
46
+
47
+ def _job_url(job_id: str, relative_path: str) -> str:
48
+ return f"/api/jobs/{job_id}/files/{relative_path}"
49
+
50
+
51
+ def _serialise_job(job: dict[str, Any]) -> dict[str, Any]:
52
+ payload = {key: value for key, value in job.items() if key not in {"input_path", "output_dir"}}
53
+ if payload.get("result"):
54
+ result = dict(payload["result"])
55
+ result["file_urls"] = {key: _job_url(job["id"], path) for key, path in result.get("files", {}).items()}
56
+ result["samples"] = [
57
+ {**sample, "url": _job_url(job["id"], sample["file"])}
58
+ for sample in result.get("samples", [])
59
+ ]
60
+ payload["result"] = result
61
+ return payload
62
+
63
+
64
def _update_job(job_id: str, **patch: Any) -> None:
    """Merge ``patch`` into the stored record of ``job_id`` under ``jobs_lock``.

    Raises:
        KeyError: If no job with ``job_id`` is registered.
    """
    with jobs_lock:
        record = jobs[job_id]
        record.update(patch)
67
+
68
+
69
def _append_log(job_id: str, message: str) -> None:
    """Append ``message`` to the log list of ``job_id`` under ``jobs_lock``.

    The ``logs`` list is created on the record if it does not exist yet.

    Raises:
        KeyError: If no job with ``job_id`` is registered.
    """
    with jobs_lock:
        record = jobs[job_id]
        log_lines = record.setdefault("logs", [])
        log_lines.append(message)
72
+
73
+
74
def _run_job(job_id: str) -> None:
    """Execute the extraction pipeline for a registered job.

    Marks the job as ``running``, runs the pipeline while forwarding stage
    updates into the job record and its log, and finally stores either the
    result (status ``complete``) or the error message and traceback
    (status ``error``). Exceptions are recorded, not re-raised.

    Args:
        job_id: Identifier of a job already present in ``jobs``.
    """
    with jobs_lock:
        record = jobs[job_id]
        input_path = Path(record["input_path"])
        output_dir = Path(record["output_dir"])
        params = record["params"]
        record["status"] = "running"

    def progress(event: dict[str, Any]) -> None:
        """Mirror a pipeline progress event into the job record and log."""
        if "stages" in event:
            _update_job(job_id, stages=event["stages"])

        if not event.get("stage"):
            return

        stage = event["stage"]
        stage_status = stage.get("status")
        if stage_status == "running":
            _append_log(job_id, f"Started: {stage['label']}")
        elif stage_status == "done":
            _append_log(job_id, f"Finished: {stage['label']} in {stage['duration_sec']:.3f}s")

    try:
        pipeline_params = PipelineParams.from_mapping(params)
        result = run_extraction_pipeline(input_path, output_dir, pipeline_params, progress_cb=progress)
        _update_job(job_id, status="complete", result=asdict(result), error=None)
    except Exception as exc:  # deliberately broad: the failure is surfaced to the UI
        _update_job(job_id, status="error", error=str(exc), traceback=traceback.format_exc())
        _append_log(job_id, f"Error: {exc}")
98
+
99
+
100
@app.get("/api/health")
def health() -> dict[str, str]:
    """Return a constant payload signalling that the backend is responding."""
    status_report = {"status": "ok"}
    return status_report
103
+
104
+
105
@app.get("/api/config")
def config() -> dict[str, Any]:
    """Return the supported models, stems, default params, and stage list.

    Every model's stem list is extended with ``"all"`` in a new list, so the
    imported ``DEMUCS_STEMS`` mapping is not modified.
    """
    stems_by_model = {}
    for model_name, stem_names in DEMUCS_STEMS.items():
        stems_by_model[model_name] = stem_names + ["all"]

    default_params = asdict(PipelineParams())
    return {
        "demucs_models": DEMUCS_MODELS,
        "demucs_stems": stems_by_model,
        "defaults": default_params,
        "stages": initial_stages(),
    }
113
+
114
+
115
@app.post("/api/cache/clear")
def clear_cache() -> dict[str, str]:
    """Invoke ``cache_clear`` from ``sample_extractor`` and confirm it ran."""
    cache_clear()
    confirmation = {"status": "cleared"}
    return confirmation
119
+
120
+
121
@app.post("/api/jobs")
async def create_job(file: UploadFile = File(...), params: str = Form("{}")) -> JSONResponse:
    """Accept an upload, register an extraction job, and queue it.

    Args:
        file: Uploaded audio source (multipart field ``file``).
        params: JSON string with partial or full pipeline parameters.

    Returns:
        A ``202 Accepted`` response carrying the serialised pending job.

    Raises:
        HTTPException: 400 when ``params`` is not valid JSON or is rejected
            by ``PipelineParams.from_mapping``.
    """
    # Function-scope import keeps this block self-contained.
    import asyncio

    try:
        parsed_params = json.loads(params)
        validated = PipelineParams.from_mapping(parsed_params)
    except Exception as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    job_id = uuid.uuid4().hex[:12]
    job_dir = RUNS_DIR / job_id
    input_dir = job_dir / "input"
    output_dir = job_dir / "output"
    input_dir.mkdir(parents=True, exist_ok=True)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Only the extension of the client-supplied name is reused; the stored
    # file is always called ``source<suffix>``.
    suffix = Path(file.filename or "input.wav").suffix or ".wav"
    input_path = input_dir / f"source{suffix}"

    def _store_upload() -> None:
        """Copy the uploaded stream to ``input_path`` (blocking)."""
        with input_path.open("wb") as handle:
            shutil.copyfileobj(file.file, handle)

    try:
        # The copy is blocking disk I/O; running it on the event loop would
        # stall every other request (including status polling) for the
        # duration of the upload. The loop's default executor is used so the
        # single-worker job ``executor`` is not occupied by uploads.
        await asyncio.get_running_loop().run_in_executor(None, _store_upload)
    except Exception:
        # Do not leave a half-written run directory behind for a job that
        # was never registered.
        shutil.rmtree(job_dir, ignore_errors=True)
        raise

    job = {
        "id": job_id,
        "status": "pending",
        "filename": file.filename,
        "params": asdict(validated),
        "stages": initial_stages(),
        "logs": [],
        "result": None,
        "error": None,
        "traceback": None,
        "input_path": str(input_path),
        "output_dir": str(output_dir),
    }
    with jobs_lock:
        jobs[job_id] = job
    executor.submit(_run_job, job_id)
    return JSONResponse(_serialise_job(job), status_code=202)
158
+
159
+
160
@app.get("/api/jobs/{job_id}")
def get_job(job_id: str) -> dict[str, Any]:
    """Return the current state of a job.

    Jobs known to this process are served from memory. Otherwise the job is
    reconstructed as ``complete`` from ``.runs/<job_id>/output/manifest.json``
    when that file exists.

    Args:
        job_id: Identifier returned by ``POST /api/jobs``.

    Returns:
        The serialised job.

    Raises:
        HTTPException: 404 when the job is neither in memory nor on disk.
    """
    with jobs_lock:
        job = jobs.get(job_id)
        snapshot = None
        if job:
            # Copy while holding the lock so the response cannot observe a
            # record that the worker thread is updating; the log list is
            # copied too because the worker appends to it in place.
            snapshot = dict(job)
            if "logs" in snapshot:
                snapshot["logs"] = list(snapshot["logs"])
    if snapshot is not None:
        return _serialise_job(snapshot)

    manifest = RUNS_DIR / job_id / "output" / "manifest.json"
    # ``job_id`` comes straight from the URL; a value such as ".." must not
    # let the lookup leave the runs directory.
    inside_runs = RUNS_DIR.resolve() in manifest.resolve().parents
    if inside_runs and manifest.exists():
        result = json.loads(manifest.read_text(encoding="utf-8"))
        return _serialise_job(
            {
                "id": job_id,
                "status": "complete",
                "filename": None,
                "params": result.get("params", {}),
                "stages": result.get("stages", []),
                "logs": [],
                "result": result,
                "error": None,
                "traceback": None,
                "output_dir": str(manifest.parent),
            }
        )
    raise HTTPException(status_code=404, detail="Job not found")
184
+
185
+
186
@app.get("/api/jobs/{job_id}/files/{relative_path:path}")
def get_job_file(job_id: str, relative_path: str) -> FileResponse:
    """Serve an artifact from a job's output directory.

    Args:
        job_id: Identifier of the job that owns the artifact.
        relative_path: Artifact path relative to ``.runs/<job_id>/output/``.

    Returns:
        A file response for the requested artifact.

    Raises:
        HTTPException: 404 when the path escapes the job's output directory,
            does not exist, or is not a regular file.
    """
    runs_root = RUNS_DIR.resolve()
    root = (RUNS_DIR / job_id / "output").resolve()
    path = (root / relative_path).resolve()

    # Compare path components, not string prefixes: a prefix test would
    # accept a sibling such as ".../output-private/x" because its string form
    # starts with ".../output". The first check rejects a ``job_id`` like
    # ".." that would move the root outside the runs directory.
    root_is_inside_runs = runs_root in root.parents
    path_is_inside_root = root in path.parents
    if not (root_is_inside_runs and path_is_inside_root and path.is_file()):
        raise HTTPException(status_code=404, detail="File not found")
    return FileResponse(path)
193
+
194
+
195
+ if WEB_DIR.exists():
196
+ app.mount("/web", StaticFiles(directory=WEB_DIR), name="web")
197
+
198
+
199
@app.get("/")
def index() -> FileResponse:
    """Serve the frontend entry page ``web/index.html``.

    Raises:
        HTTPException: 500 when ``web/index.html`` does not exist.
    """
    index_path = WEB_DIR / "index.html"
    if index_path.exists():
        return FileResponse(index_path)
    raise HTTPException(status_code=500, detail="web/index.html is missing")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
docs/API.md ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # API documentation
2
+
3
+ The active app is `app.py`, a FastAPI application.
4
+
5
+ ## Start server
6
+
7
+ ```bash
8
+ uvicorn app:app --host 0.0.0.0 --port 7860
9
+ ```
10
+
11
+ ## `GET /api/health`
12
+
13
+ Returns backend health.
14
+
15
+ ```json
16
+ {"status":"ok"}
17
+ ```
18
+
19
+ ## `GET /api/config`
20
+
21
+ Returns supported models, stems, default pipeline params, and stage definitions.
22
+
23
+ ```bash
24
+ curl http://127.0.0.1:7860/api/config
25
+ ```
26
+
27
+ ## `POST /api/jobs`
28
+
29
+ Creates an extraction job.
30
+
31
+ Content type: `multipart/form-data`
32
+
33
+ Fields:
34
+
35
+ | Field | Type | Required | Description |
36
+ |---|---|---:|---|
37
+ | `file` | file | yes | Audio source |
38
+ | `params` | JSON string | no | Partial or full pipeline params |
39
+
40
+ Example:
41
+
42
+ ```bash
43
+ curl -F 'file=@song.wav' \
44
+ -F 'params={"stem":"all","target_min":4,"target_max":12,"synthesize":true}' \
45
+ http://127.0.0.1:7860/api/jobs
46
+ ```
47
+
48
+ Response status: `202 Accepted`
49
+
50
+ ```json
51
+ {
52
+ "id": "58ca0db4ac74",
53
+ "status": "pending",
54
+ "filename": "song.wav",
55
+ "params": {"stem": "all"},
56
+ "stages": [],
57
+ "logs": [],
58
+ "result": null,
59
+ "error": null
60
+ }
61
+ ```
62
+
63
+ ## `GET /api/jobs/{job_id}`
64
+
65
+ Poll job status and retrieve results.
66
+
67
+ Statuses:
68
+
69
+ | Status | Meaning |
70
+ |---|---|
71
+ | `pending` | Job is queued |
72
+ | `running` | Job is executing |
73
+ | `complete` | Result and artifacts are ready |
74
+ | `error` | Pipeline failed; `error` and `traceback` are populated |
75
+
76
+ Completed jobs contain:
77
+
78
+ | Key | Meaning |
79
+ |---|---|
80
+ | `duration_sec` | Total wall time |
81
+ | `audio_duration_sec` | Duration of processed stem/source |
82
+ | `realtime_factor` | `duration_sec / audio_duration_sec` |
83
+ | `bpm` | Detected tempo |
84
+ | `hit_count` | Number of accepted onsets/hits |
85
+ | `cluster_count` | Number of sample clusters |
86
+ | `stages` | Per-stage timing/status/detail list |
87
+ | `samples` | Sample rows with score, duration, first onset, and download URL |
88
+ | `overview` | Decimated envelope and onset markers for waveform display |
89
+ | `files` | Relative artifact paths |
90
+ | `file_urls` | Direct API URLs for artifacts |
91
+
92
+ ## `GET /api/jobs/{job_id}/files/{relative_path}`
93
+
94
+ Downloads an artifact from a completed job.
95
+
96
+ Examples:
97
+
98
+ ```bash
99
+ curl -O http://127.0.0.1:7860/api/jobs/58ca0db4ac74/files/sample-pack.zip
100
+ curl -O http://127.0.0.1:7860/api/jobs/58ca0db4ac74/files/reconstruction.mid
101
+ curl -O http://127.0.0.1:7860/api/jobs/58ca0db4ac74/files/samples/hihat_open_0.wav
102
+ ```
103
+
104
+ The endpoint prevents path traversal by resolving downloads under `.runs/<job-id>/output/`.
105
+
106
+ ## `POST /api/cache/clear`
107
+
108
+ Clears the in-memory extraction cache.
109
+
110
+ ```bash
111
+ curl -X POST http://127.0.0.1:7860/api/cache/clear
112
+ ```
113
+
114
+ ## Pipeline parameters
115
+
116
+ Defined in `pipeline_runner.PipelineParams`.
117
+
118
+ | Parameter | Default | Meaning |
119
+ |---|---:|---|
120
+ | `stem` | `drums` | Demucs source to extract, or `all` to bypass Demucs |
121
+ | `demucs_model` | `htdemucs_ft` | Demucs model |
122
+ | `demucs_shifts` | `1` | Test-time shifts for Demucs quality/speed tradeoff |
123
+ | `demucs_overlap` | `0.25` | Demucs chunk overlap |
124
+ | `onset_mode` | `auto` | `auto`, `percussive`, `harmonic`, or `broadband` |
125
+ | `onset_delta` | `0.12` | Peak-pick threshold |
126
+ | `energy_threshold_db` | `-35` | RMS gate for accepting hits |
127
+ | `pre_pad` | `0.003` | Seconds of audio before onset |
128
+ | `min_dur` | `0.02` | Minimum hit duration |
129
+ | `max_dur` | `1.5` | Maximum hit duration |
130
+ | `min_gap` | `0.03` | Minimum time between onsets |
131
+ | `ncc_threshold` | `0.80` | Similarity threshold when not targeting cluster count |
132
+ | `attack_ms` | `25` | Transient window used for NCC |
133
+ | `mel_threshold` | `0.75` | Candidate prefilter threshold |
134
+ | `linkage` | `average` | Agglomerative linkage |
135
+ | `target_min` | `5` | Lower cluster target; `0` disables target mode |
136
+ | `target_max` | `20` | Upper cluster target; `0` disables target mode |
137
+ | `synthesize` | `true` | Write synthesized alternates for clusters with multiple hits |
138
+ | `quantize_midi` | `true` | Snap MIDI notes to grid |
139
+ | `subdivision` | `16` | MIDI grid subdivision |
140
+ | `device` | `cpu` | Torch device for Demucs |
docs/PIPELINE_TIMING_AND_REALTIME.md ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Pipeline timing and near-real-time analysis
2
+
3
+ ## Measurement setup
4
+
5
+ Benchmarks were run with `scripts/benchmark_subprocesses.py` using synthetic drum fixtures from `synth_generator.py`.
6
+
7
+ Important constraints:
8
+
9
+ - `stem=all` was used to bypass Demucs and measure the DSP/sample-extraction subprocesses directly.
10
+ - The script performs one warm-up run first, so import/JIT overhead is not included in the summary.
11
+ - Runs used 4 bars at 120 BPM across `rock`, `funk`, and `halftime` synthetic patterns, with two measured runs per pattern (six runs in total).
12
+ - The benchmark output is stored in `docs/benchmark-subprocesses.json`.
13
+
14
+ ## Measured subprocess lengths
15
+
16
+ | Stage | Mean seconds | Median seconds | Min seconds | Max seconds |
17
+ |---|---:|---:|---:|---:|
18
+ | `stem` | 0.017 | 0.013 | 0.009 | 0.039 |
19
+ | `bpm` | 0.224 | 0.223 | 0.206 | 0.241 |
20
+ | `onsets` | 2.140 | 2.034 | 1.762 | 2.871 |
21
+ | `classification` | 0.034 | 0.035 | 0.024 | 0.045 |
22
+ | `clustering` | 0.496 | 0.597 | 0.059 | 0.913 |
23
+ | `selection` | 0.499 | 0.551 | 0.311 | 0.651 |
24
+ | `synthesis` | 0.002 | 0.002 | 0.002 | 0.003 |
25
+ | `export` | 0.105 | 0.103 | 0.046 | 0.178 |
26
+
27
+ Observed total runtime for warm synthetic 4-bar fixtures was roughly `0.30×–0.54×` realtime when Demucs was bypassed (see the per-run `realtime_factor` values in `docs/benchmark-subprocesses.json`). In plain terms: the pure extraction stages ran faster than the audio duration on these fixtures. The first cold run can be much slower because librosa/scipy/numba-style initialization costs are paid up front.
28
+
29
+ ## Significant subprocesses
30
+
31
+ ### 1. Stem extraction / source load
32
+
33
+ Current implementation:
34
+
35
+ - `stem=all`: load and normalize the source audio with librosa.
36
+ - any other stem: run Demucs via `demucs.pretrained.get_model` and `demucs.apply.apply_model`.
37
+
38
+ Timing profile:
39
+
40
+ - `stem=all` is near-instant after warm-up on short fixtures.
41
+ - Demucs is the offline bottleneck and should be treated as non-realtime in this project.
42
+
43
+ Real-time suitability: **No for Demucs, yes for direct source load.**
44
+
45
+ Recommended strategy:
46
+
47
+ - Keep Demucs as an explicit offline preprocessing stage.
48
+ - Cache stem output by content hash and model parameters.
49
+ - Let users bypass Demucs for drum loops, already-separated stems, and iterative parameter tuning.
50
+
51
+ ### 2. BPM / tempo detection
52
+
53
+ Current implementation:
54
+
55
+ - `librosa.onset.onset_strength`
56
+ - `librosa.feature.tempo`
57
+ - beat-track sanity adjustment
58
+
59
+ Timing profile:
60
+
61
+ - Measured around 0.22 s for ~9 s synthetic clips after warm-up.
62
+
63
+ Real-time suitability: **Near-realtime with buffering.**
64
+
65
+ A live version should estimate tempo over rolling windows and refine continuously. It does not need the entire file, but short windows can be unstable.
66
+
67
+ ### 3. Onset detection + slicing
68
+
69
+ Current implementation:
70
+
71
+ - Multiband SuperFlux-style onset envelope in `auto` mode.
72
+ - Optional percussive/harmonic/broadband modes.
73
+ - Peak picking and hit slicing by onset-to-next-onset boundaries.
74
+ - Energy threshold and duration filtering.
75
+
76
+ Timing profile:
77
+
78
+ - This is the largest non-Demucs DSP stage in the measured benchmark: about 2.14 s mean for ~9 s fixtures.
79
+ - It is still faster than realtime in warm synthetic tests.
80
+
81
+ Real-time suitability: **Yes, with a rolling window and bounded lookahead.**
82
+
83
+ Why:
84
+
85
+ - Onset strength and peak picking are local-window operations.
86
+ - Backtracking and next-onset slicing require a small amount of future context.
87
+ - A live system can emit provisional hits and finalize durations once the next onset or max-duration cutoff arrives.
88
+
89
+ ### 4. Spectral rule classification
90
+
91
+ Current implementation:
92
+
93
+ - STFT per hit.
94
+ - Low/mid/high energy ratios.
95
+ - Spectral centroid, zero-crossing rate, duration rules.
96
+
97
+ Timing profile:
98
+
99
+ - Measured around 34 ms mean for the benchmark fixtures.
100
+
101
+ Real-time suitability: **Yes.**
102
+
103
+ This is cheap per hit and can run immediately after a hit segment is finalized.
104
+
105
+ ### 5. Mel fingerprinting + transient NCC clustering
106
+
107
+ Current implementation:
108
+
109
+ - Build mel fingerprints for hits.
110
+ - Use cosine similarity as a prefilter.
111
+ - Compute transient normalized cross-correlation only for candidate pairs.
112
+ - Run agglomerative clustering on the resulting precomputed distance matrix.
113
+ - Optionally merge singleton clusters into nearby multi-hit clusters.
114
+
115
+ Timing profile:
116
+
117
+ - Measured around 0.50 s mean, but depends strongly on number of hits and pair count.
118
+ - Complexity is roughly quadratic in hit count for pairwise similarity, with mel prefiltering reducing NCC work.
119
+
120
+ Real-time suitability: **Partially.**
121
+
122
+ What can be realtime:
123
+
124
+ - Mel fingerprint extraction per hit.
125
+ - Transient NCC against a bounded set of existing cluster representatives.
126
+ - Online assignment to existing clusters.
127
+
128
+ What is not truly realtime in the current implementation:
129
+
130
+ - Full agglomerative clustering over the complete distance matrix.
131
+ - Target cluster count search through repeated clustering.
132
+
133
+ Recommended live design:
134
+
135
+ 1. Maintain cluster prototypes: representative transient, mel centroid, count, label histogram.
136
+ 2. For each finalized hit, compute fingerprint and compare to prototypes first.
137
+ 3. Only run transient NCC against likely candidates.
138
+ 4. Assign immediately when above threshold; create a new cluster otherwise.
139
+ 5. Periodically run batch reclustering in the background to clean up early mistakes.
140
+
141
+ ### 6. Best representative selection
142
+
143
+ Current implementation:
144
+
145
+ - Compute sample quality score per candidate hit.
146
+ - Choose highest-scoring hit per cluster.
147
+
148
+ Timing profile:
149
+
150
+ - Measured around 0.50 s mean in the benchmark.
151
+ - Cost scales with number of hits and quality scoring work.
152
+
153
+ Real-time suitability: **Yes as an incremental update.**
154
+
155
+ A live version can maintain the current best hit per cluster and only rescore new arrivals or candidates whose cluster changed.
156
+
157
+ ### 7. Optional synthesis
158
+
159
+ Current implementation:
160
+
161
+ - Align cluster members by peak position.
162
+ - Normalize and weighted-average hits to create an alternate synthesized sample.
163
+
164
+ Timing profile:
165
+
166
+ - Measured around 2 ms mean on benchmark fixtures.
167
+
168
+ Real-time suitability: **Yes for small clusters, but better as deferred polish.**
169
+
170
+ It is fast, but users usually do not need synthesized alternates before cluster membership stabilizes.
171
+
172
+ ### 8. Export: MIDI, reconstruction, WAVs, ZIP
173
+
174
+ Current implementation:
175
+
176
+ - Build MIDI notes from hits and cluster sample notes.
177
+ - Render reconstruction with representative samples.
178
+ - Write samples, reconstruction audio, MIDI, archive, and manifest.
179
+
180
+ Timing profile:
181
+
182
+ - Measured around 0.10 s mean on benchmark fixtures.
183
+
184
+ Real-time suitability: **No for ZIP packaging; yes for preview rendering chunks.**
185
+
186
+ The final ZIP is a completion artifact. Reconstruction can be rendered progressively for UI preview.
187
+
188
+ ## Real-time feasibility summary
189
+
190
+ | Subprocess | Current batch status | Near-real-time feasibility | Notes |
191
+ |---|---|---|---|
192
+ | Source load | Fast | Yes | Direct file/stream decode is not the bottleneck |
193
+ | Demucs stem separation | Slow/offline | No | Keep offline and cached |
194
+ | BPM detection | Buffered batch | Partial | Rolling estimate works, exact tempo should refine over time |
195
+ | Onset detection | Batch but local-window | Yes | Needs bounded lookahead/backtracking |
196
+ | Hit slicing | Depends on next onset | Yes | Emit provisional segment, finalize on next onset/max duration |
197
+ | Rule classification | Per-hit | Yes | Cheap and stateless |
198
+ | Mel fingerprinting | Per-hit | Yes | Compute once per finalized hit |
199
+ | Transient NCC | Pairwise batch | Partial | Realtime against prototypes; batch all-pairs is not realtime |
200
+ | Agglomerative clustering | Batch | No | Replace or complement with online prototype assignment |
201
+ | Representative selection | Batch per cluster | Yes | Keep best-so-far per cluster |
202
+ | Synthesis | Batch per cluster | Partial | Can update lazily after cluster changes |
203
+ | MIDI/reconstruction preview | Batch export | Partial | Preview can stream; final MIDI is a completion artifact |
204
+ | ZIP packaging | Final artifact | No | Keep as final step |
205
+
206
+ ## Recommended next technical move
207
+
208
+ Implement a second clustering mode named `online`:
209
+
210
+ ```text
211
+ onset event → segment finalized → classify → mel fingerprint → candidate prototypes → transient NCC → assign/create cluster → update best representative → UI update
212
+ ```
213
+
214
+ Keep the existing agglomerative mode as `batch-quality`. Use online mode for immediate feedback and batch mode for final high-quality export.
docs/PROJECT_REVIEW.md ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Project review
2
+
3
+ ## Goal
4
+
5
+ Review the uploaded drum sample extractor, identify architectural and UX gaps, replace the Gradio UI with a custom frontend, and document the extraction pipeline with timing and real-time feasibility notes.
6
+
7
+ ## Success checklist
8
+
9
+ - The active app is no longer Gradio-based.
10
+ - The core extraction process is callable independently of the UI.
11
+ - Every significant extraction subprocess is timed.
12
+ - Runtime artifacts are stable and downloadable.
13
+ - Documentation explains current behavior, tradeoffs, and remaining work.
14
+ - Legacy files are preserved but not part of the active path.
15
+
16
+ ## Existing project structure before changes
17
+
18
+ The archive contained a compact Python project:
19
+
20
+ | File | Role |
21
+ |---|---|
22
+ | `app.py` | Active Gradio UI, parameter controls, extraction, eval, optimization tabs |
23
+ | `app_v2.py` | Older Gradio UI variant |
24
+ | `sample_extractor.py` | Current extraction pipeline: Demucs/load, SuperFlux onsets, rule labels, mel+NCC clustering, MIDI/export |
25
+ | `drum_extractor.py` | Older CLI-oriented pipeline with CLAP-era comments and broader experimental code |
26
+ | `synth_generator.py` | Synthetic drum fixture generator |
27
+ | `evaluation.py` | Ground-truth matching and scoring |
28
+ | `optimizer.py`, `optimizer_v2.py` | Parameter search experiments |
29
+ | `quality_metrics.py` | Completeness, cleanness, onset, reference metrics |
30
+ | `config_store.py` | Config persistence and leaderboard helpers |
31
+
32
+ ## Key findings
33
+
34
+ 1. `sample_extractor.py` is the right core to keep. It is compact, stage-oriented, and already exposes most of the operations needed by a proper app/API.
35
+ 2. `app.py` mixed UI code, runtime hotfixing, file conversion, extraction orchestration, and artifact packaging. That made it hard to test or replace the UI.
36
+ 3. The previous Gradio UI was fast to build but not ideal for this use-case: extraction is a staged process with logs, timing, waveform review, downloadable artifacts, and a dense parameter surface that benefits from a purpose-built layout.
37
+ 4. The previous `app.py` rewrote `sample_extractor.py` on disk at import time, replacing the call `_sf(yh,lag=2,ms=5)` with `_sf(yh,l=2,ms=5)` to work around a keyword-argument mismatch. The underlying bug is now fixed directly in `sample_extractor.py`.
38
+ 5. There was no meaningful project documentation, no API documentation, and no benchmark/timing documentation.
39
+ 6. `requirements.txt` still treated Gradio as first-class. The active app now uses FastAPI; Gradio dependencies have been moved to `requirements-legacy-gradio.txt`.
40
+ 7. `.runs/`, generated audio, MIDI, ZIP files, and local caches needed explicit ignore rules.
41
+
42
+ ## Changes made
43
+
44
+ | Area | Change |
45
+ |---|---|
46
+ | Active UI | Replaced Gradio with `app.py` FastAPI + custom static frontend in `web/` |
47
+ | Pipeline | Added `pipeline_runner.py` with validated params, stage timing, progress callbacks, manifests, and artifact writing |
48
+ | Legacy | Moved old Gradio apps into `legacy/` |
49
+ | Bugfix | Fixed the `_sf(yh, lag=2, ms=5)` keyword mismatch in `sample_extractor.py` |
50
+ | API | Added job creation, polling, config, health, cache clear, and safe artifact download endpoints |
51
+ | UX | Added drag/drop upload, dense controls, stage timeline, logs, waveform/onset overview, audio previews, sample table, downloads |
52
+ | Benchmarking | Added `scripts/benchmark_subprocesses.py` and committed benchmark output JSON |
53
+ | Packaging | Added Dockerfile, updated requirements, added `.gitignore` |
54
+ | Docs | Added project review, timing/real-time analysis, API docs, UI notes, and remaining work |
55
+
56
+ ## Current architecture
57
+
58
+ ```text
59
+ browser UI in web/
60
+
61
+
62
+ FastAPI app.py
63
+
64
+
65
+ pipeline_runner.py
66
+
67
+
68
+ sample_extractor.py + quality_metrics.py
69
+
70
+
71
+ .runs/<job-id>/output/{samples, MIDI, WAV, ZIP, manifest.json}
72
+ ```
73
+
74
+ The UI only talks to the API. The API only calls the timed runner. The runner is now independently testable and usable from scripts.
75
+
76
+ ## Risks and limitations
77
+
78
+ - Demucs can dominate runtime and may require a model download on first use.
79
+ - The current job store is in-memory. Completed jobs can be reloaded from `manifest.json`, but queued/running job state is lost on process restart.
80
+ - The clustering implementation is still batch-oriented. It can be optimized or adapted incrementally, but current agglomerative clustering is not a streaming algorithm.
81
+ - There is no authentication or quota control; this is intended as a local or Hugging Face Space–style app, not a public multi-tenant service.
82
+ - The browser UI is currently no-build static JavaScript/CSS. That is intentional for deployability, but a larger UI should eventually move to TypeScript with a real component/test setup.
83
+
84
+ ## Verification performed
85
+
86
+ - Python syntax compilation for `app.py`, `pipeline_runner.py`, `sample_extractor.py`, and benchmark scripts.
87
+ - FastAPI `TestClient` checks for `/`, `/api/health`, and `/api/config`.
88
+ - End-to-end API job test using a synthetic drum fixture with `stem=all`.
89
+ - Synthetic subprocess benchmark across rock, funk, and halftime patterns.
docs/REMAINING_WORK.md ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Remaining work
2
+
3
+ ## Highest value next steps
4
+
5
+ 1. **Online clustering mode**: add prototype-based incremental clustering for immediate feedback, while keeping agglomerative clustering as the final-quality batch mode.
6
+ 2. **Run history**: index `.runs/*/output/manifest.json` so prior runs are browsable and comparable in the UI.
7
+ 3. **Waveform editing**: add hit audition, onset adjustment, cluster merge/split, and label reassignment.
8
+ 4. **Demucs caching**: persist stem cache on disk by input digest + model + stem + shifts + overlap.
9
+ 5. **True progress reporting**: expose lower-level progress inside Demucs and pairwise clustering, not only stage transitions.
10
+ 6. **Benchmark panel**: add an in-app benchmark view that can run synthetic fixtures and compare parameter profiles.
11
+ 7. **Frontend test harness**: move the no-build UI to TypeScript with a real component/test setup once the interaction model stabilizes.
12
+
13
+ ## Known constraints
14
+
15
+ - Demucs is not a realtime stage and should stay explicitly offline.
16
+ - Agglomerative clustering is a batch algorithm; it should not be sold as realtime.
17
+ - First run on a fresh environment can be slower due to imports, model download, and library initialization.
18
+ - The current job queue is process-local and single-worker. That is fine for local use, but not enough for a shared public deployment.
19
+
20
+ ## Suggested implementation order
21
+
22
+ 1. Add disk cache for source decode/stem separation.
23
+ 2. Add run history index and UI browser.
24
+ 3. Add hit audition from `overview.onsets` and sample rows.
25
+ 4. Implement online prototype clustering.
26
+ 5. Add comparison mode between two job manifests.
27
+ 6. Add SSE log/progress streaming.
docs/UI_REPLACEMENT.md ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Custom UI replacement
2
+
3
+ ## What changed
4
+
5
+ The active interface is now a custom browser UI served from `web/` by the FastAPI app in `app.py`. The old Gradio files were moved to `legacy/`.
6
+
7
+ ## UX goals
8
+
9
+ 1. Make the process feel like a sample-extraction workstation, not a generic notebook form.
10
+ 2. Keep upload, controls, pipeline status, logs, waveform review, audio previews, downloads, and sample rows visible without tab hunting.
11
+ 3. Show stage timing as a first-class result, because extraction quality and speed tradeoffs matter.
12
+ 4. Make `stem=all` obvious for fast iteration when Demucs is unnecessary.
13
+ 5. Keep the frontend deployable without a JavaScript build step.
14
+
15
+ ## UI structure
16
+
17
+ | Area | Purpose |
18
+ |---|---|
19
+ | Hero/status | Backend readiness and product framing |
20
+ | Source panel | Drag/drop upload and source audio preview |
21
+ | Controls panel | Stem, onset, clustering, MIDI, and synthesis parameters |
22
+ | Pipeline panel | Stage statuses, durations, and live logs |
23
+ | Result panel | Summary, waveform/onsets, downloads, stem/reconstruction audio, sample table |
24
+
25
+ ## Frontend implementation
26
+
27
+ Files:
28
+
29
+ - `web/index.html`
30
+ - `web/styles.css`
31
+ - `web/app.js`
32
+
33
+ The frontend uses modern browser APIs directly:
34
+
35
+ - `fetch` for API calls
36
+ - `FormData` for upload
37
+ - `<audio>` for previews
38
+ - `<canvas>` for waveform/onset visualization
39
+ - CSS grid, responsive layout, custom properties, and backdrop filters for layout/polish
40
+
41
+ No Gradio runtime, iframe, or generated UI framework is involved.
42
+
43
+ ## Backend integration
44
+
45
+ The frontend creates a job with `POST /api/jobs`, then polls `GET /api/jobs/{id}` until completion. Completed jobs expose direct download URLs for:
46
+
47
+ - sample pack ZIP
48
+ - MIDI reconstruction
49
+ - stem WAV
50
+ - reconstruction WAV
51
+ - individual sample WAVs
52
+
53
+ ## Why polling instead of websockets/SSE
54
+
55
+ Polling is the simplest robust option here because the current pipeline is CPU-heavy and mostly stage-based. The UI polls every 800 ms, which is enough to show stage transitions and logs without introducing websocket lifecycle complexity.
56
+
57
+ Future improvement: use Server-Sent Events for lower-latency log streaming once the backend has a persistent job store.
58
+
59
+ ## Remaining UI improvements
60
+
61
+ - Add waveform zoom and click-to-audition individual detected hits.
62
+ - Add inline controls for reassigning sample labels and merging/splitting clusters.
63
+ - Add A/B comparison between parameter runs.
64
+ - Add downloadable timing report per job.
65
+ - Add persistent run history browser for `.runs/`.
66
+ - Add online clustering mode for near-realtime progressive preview.
docs/benchmark-subprocesses.json ADDED
@@ -0,0 +1,476 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "runs": [
3
+ {
4
+ "pattern": "rock",
5
+ "bars": 4,
6
+ "bpm": 120.0,
7
+ "run_index": 0,
8
+ "audio_duration_sec": 8.75,
9
+ "total_duration_sec": 2.594698,
10
+ "realtime_factor": 0.296537,
11
+ "hit_count": 28,
12
+ "cluster_count": 1,
13
+ "stages": [
14
+ {
15
+ "key": "stem",
16
+ "label": "Stem extraction / source load",
17
+ "duration_sec": 0.014633260999971753,
18
+ "status": "done",
19
+ "detail": "loaded full mix"
20
+ },
21
+ {
22
+ "key": "bpm",
23
+ "label": "Tempo detection",
24
+ "duration_sec": 0.23692302500001006,
25
+ "status": "done",
26
+ "detail": "120.2 BPM"
27
+ },
28
+ {
29
+ "key": "onsets",
30
+ "label": "Onset detection + slicing",
31
+ "duration_sec": 1.762329765000004,
32
+ "status": "done",
33
+ "detail": "28 hits"
34
+ },
35
+ {
36
+ "key": "classification",
37
+ "label": "Spectral rule classification",
38
+ "duration_sec": 0.02908633100003044,
39
+ "status": "done",
40
+ "detail": "bright:9, cymbal:1, hihat_closed:1, hihat_open:15, mid:2"
41
+ },
42
+ {
43
+ "key": "clustering",
44
+ "label": "Mel fingerprint + transient NCC clustering",
45
+ "duration_sec": 0.05944011799999771,
46
+ "status": "done",
47
+ "detail": "1 clusters"
48
+ },
49
+ {
50
+ "key": "selection",
51
+ "label": "Best representative scoring",
52
+ "duration_sec": 0.31093429700001707,
53
+ "status": "done",
54
+ "detail": "quality-scored representatives"
55
+ },
56
+ {
57
+ "key": "synthesis",
58
+ "label": "Optional sample synthesis",
59
+ "duration_sec": 0.0028187070000171843,
60
+ "status": "done",
61
+ "detail": "1 synthesized alternates"
62
+ },
63
+ {
64
+ "key": "export",
65
+ "label": "MIDI, reconstruction, WAV, ZIP export",
66
+ "duration_sec": 0.1779485609999938,
67
+ "status": "done",
68
+ "detail": "1 WAVs + MIDI + ZIP"
69
+ }
70
+ ]
71
+ },
72
+ {
73
+ "pattern": "funk",
74
+ "bars": 4,
75
+ "bpm": 120.0,
76
+ "run_index": 0,
77
+ "audio_duration_sec": 8.874989,
78
+ "total_duration_sec": 3.790648,
79
+ "realtime_factor": 0.427116,
80
+ "hit_count": 53,
81
+ "cluster_count": 2,
82
+ "stages": [
83
+ {
84
+ "key": "stem",
85
+ "label": "Stem extraction / source load",
86
+ "duration_sec": 0.009321340000042255,
87
+ "status": "done",
88
+ "detail": "loaded full mix"
89
+ },
90
+ {
91
+ "key": "bpm",
92
+ "label": "Tempo detection",
93
+ "duration_sec": 0.23110938799999303,
94
+ "status": "done",
95
+ "detail": "161.5 BPM"
96
+ },
97
+ {
98
+ "key": "onsets",
99
+ "label": "Onset detection + slicing",
100
+ "duration_sec": 2.1605432889999747,
101
+ "status": "done",
102
+ "detail": "53 hits"
103
+ },
104
+ {
105
+ "key": "classification",
106
+ "label": "Spectral rule classification",
107
+ "duration_sec": 0.04475730899997643,
108
+ "status": "done",
109
+ "detail": "bright:25, hihat_closed:18, hihat_open:7, mid:3"
110
+ },
111
+ {
112
+ "key": "clustering",
113
+ "label": "Mel fingerprint + transient NCC clustering",
114
+ "duration_sec": 0.6768225310000275,
115
+ "status": "done",
116
+ "detail": "2 clusters"
117
+ },
118
+ {
119
+ "key": "selection",
120
+ "label": "Best representative scoring",
121
+ "duration_sec": 0.559724416999984,
122
+ "status": "done",
123
+ "detail": "quality-scored representatives"
124
+ },
125
+ {
126
+ "key": "synthesis",
127
+ "label": "Optional sample synthesis",
128
+ "duration_sec": 0.0024601989999837315,
129
+ "status": "done",
130
+ "detail": "2 synthesized alternates"
131
+ },
132
+ {
133
+ "key": "export",
134
+ "label": "MIDI, reconstruction, WAV, ZIP export",
135
+ "duration_sec": 0.10532420399999864,
136
+ "status": "done",
137
+ "detail": "2 WAVs + MIDI + ZIP"
138
+ }
139
+ ]
140
+ },
141
+ {
142
+ "pattern": "halftime",
143
+ "bars": 4,
144
+ "bpm": 120.0,
145
+ "run_index": 0,
146
+ "audio_duration_sec": 8.874989,
147
+ "total_duration_sec": 3.701891,
148
+ "realtime_factor": 0.417115,
149
+ "hit_count": 66,
150
+ "cluster_count": 2,
151
+ "stages": [
152
+ {
153
+ "key": "stem",
154
+ "label": "Stem extraction / source load",
155
+ "duration_sec": 0.009298575000002529,
156
+ "status": "done",
157
+ "detail": "loaded full mix"
158
+ },
159
+ {
160
+ "key": "bpm",
161
+ "label": "Tempo detection",
162
+ "duration_sec": 0.21581650399997443,
163
+ "status": "done",
164
+ "detail": "120.2 BPM"
165
+ },
166
+ {
167
+ "key": "onsets",
168
+ "label": "Onset detection + slicing",
169
+ "duration_sec": 1.9768937550000487,
170
+ "status": "done",
171
+ "detail": "66 hits"
172
+ },
173
+ {
174
+ "key": "classification",
175
+ "label": "Spectral rule classification",
176
+ "duration_sec": 0.03783250899999757,
177
+ "status": "done",
178
+ "detail": "bright:11, cymbal:2, hihat_closed:48, hihat_open:5"
179
+ },
180
+ {
181
+ "key": "clustering",
182
+ "label": "Mel fingerprint + transient NCC clustering",
183
+ "duration_sec": 0.7498706449999872,
184
+ "status": "done",
185
+ "detail": "2 clusters"
186
+ },
187
+ {
188
+ "key": "selection",
189
+ "label": "Best representative scoring",
190
+ "duration_sec": 0.6169061510000233,
191
+ "status": "done",
192
+ "detail": "quality-scored representatives"
193
+ },
194
+ {
195
+ "key": "synthesis",
196
+ "label": "Optional sample synthesis",
197
+ "duration_sec": 0.0028750459999855593,
198
+ "status": "done",
199
+ "detail": "2 synthesized alternates"
200
+ },
201
+ {
202
+ "key": "export",
203
+ "label": "MIDI, reconstruction, WAV, ZIP export",
204
+ "duration_sec": 0.09185817900004167,
205
+ "status": "done",
206
+ "detail": "2 WAVs + MIDI + ZIP"
207
+ }
208
+ ]
209
+ },
210
+ {
211
+ "pattern": "rock",
212
+ "bars": 4,
213
+ "bpm": 120.0,
214
+ "run_index": 1,
215
+ "audio_duration_sec": 8.75,
216
+ "total_duration_sec": 2.848686,
217
+ "realtime_factor": 0.325564,
218
+ "hit_count": 24,
219
+ "cluster_count": 1,
220
+ "stages": [
221
+ {
222
+ "key": "stem",
223
+ "label": "Stem extraction / source load",
224
+ "duration_sec": 0.03869248300003392,
225
+ "status": "done",
226
+ "detail": "loaded full mix"
227
+ },
228
+ {
229
+ "key": "bpm",
230
+ "label": "Tempo detection",
231
+ "duration_sec": 0.24107510999999704,
232
+ "status": "done",
233
+ "detail": "120.2 BPM"
234
+ },
235
+ {
236
+ "key": "onsets",
237
+ "label": "Onset detection + slicing",
238
+ "duration_sec": 2.0721967459999746,
239
+ "status": "done",
240
+ "detail": "24 hits"
241
+ },
242
+ {
243
+ "key": "classification",
244
+ "label": "Spectral rule classification",
245
+ "duration_sec": 0.024016725000024053,
246
+ "status": "done",
247
+ "detail": "bright:7, hihat_closed:2, hihat_open:15"
248
+ },
249
+ {
250
+ "key": "clustering",
251
+ "label": "Mel fingerprint + transient NCC clustering",
252
+ "duration_sec": 0.05910233800000242,
253
+ "status": "done",
254
+ "detail": "1 clusters"
255
+ },
256
+ {
257
+ "key": "selection",
258
+ "label": "Best representative scoring",
259
+ "duration_sec": 0.3106304350000073,
260
+ "status": "done",
261
+ "detail": "quality-scored representatives"
262
+ },
263
+ {
264
+ "key": "synthesis",
265
+ "label": "Optional sample synthesis",
266
+ "duration_sec": 0.0015013799999792354,
267
+ "status": "done",
268
+ "detail": "1 synthesized alternates"
269
+ },
270
+ {
271
+ "key": "export",
272
+ "label": "MIDI, reconstruction, WAV, ZIP export",
273
+ "duration_sec": 0.10095534999999245,
274
+ "status": "done",
275
+ "detail": "1 WAVs + MIDI + ZIP"
276
+ }
277
+ ]
278
+ },
279
+ {
280
+ "pattern": "funk",
281
+ "bars": 4,
282
+ "bpm": 120.0,
283
+ "run_index": 1,
284
+ "audio_duration_sec": 8.874989,
285
+ "total_duration_sec": 3.416797,
286
+ "realtime_factor": 0.384992,
287
+ "hit_count": 52,
288
+ "cluster_count": 3,
289
+ "stages": [
290
+ {
291
+ "key": "stem",
292
+ "label": "Stem extraction / source load",
293
+ "duration_sec": 0.011181277999980921,
294
+ "status": "done",
295
+ "detail": "loaded full mix"
296
+ },
297
+ {
298
+ "key": "bpm",
299
+ "label": "Tempo detection",
300
+ "duration_sec": 0.20633040499996014,
301
+ "status": "done",
302
+ "detail": "120.2 BPM"
303
+ },
304
+ {
305
+ "key": "onsets",
306
+ "label": "Onset detection + slicing",
307
+ "duration_sec": 1.9962494719999881,
308
+ "status": "done",
309
+ "detail": "52 hits"
310
+ },
311
+ {
312
+ "key": "classification",
313
+ "label": "Spectral rule classification",
314
+ "duration_sec": 0.03461634600000707,
315
+ "status": "done",
316
+ "detail": "bright:23, cymbal:3, hihat_closed:15, hihat_open:8, mid:3"
317
+ },
318
+ {
319
+ "key": "clustering",
320
+ "label": "Mel fingerprint + transient NCC clustering",
321
+ "duration_sec": 0.51767344000001,
322
+ "status": "done",
323
+ "detail": "3 clusters"
324
+ },
325
+ {
326
+ "key": "selection",
327
+ "label": "Best representative scoring",
328
+ "duration_sec": 0.5431782379999959,
329
+ "status": "done",
330
+ "detail": "quality-scored representatives"
331
+ },
332
+ {
333
+ "key": "synthesis",
334
+ "label": "Optional sample synthesis",
335
+ "duration_sec": 0.001988787999948727,
336
+ "status": "done",
337
+ "detail": "3 synthesized alternates"
338
+ },
339
+ {
340
+ "key": "export",
341
+ "label": "MIDI, reconstruction, WAV, ZIP export",
342
+ "duration_sec": 0.10504587100001572,
343
+ "status": "done",
344
+ "detail": "3 WAVs + MIDI + ZIP"
345
+ }
346
+ ]
347
+ },
348
+ {
349
+ "pattern": "halftime",
350
+ "bars": 4,
351
+ "bpm": 120.0,
352
+ "run_index": 1,
353
+ "audio_duration_sec": 8.874989,
354
+ "total_duration_sec": 4.750472,
355
+ "realtime_factor": 0.535265,
356
+ "hit_count": 64,
357
+ "cluster_count": 1,
358
+ "stages": [
359
+ {
360
+ "key": "stem",
361
+ "label": "Stem extraction / source load",
362
+ "duration_sec": 0.016472632999978032,
363
+ "status": "done",
364
+ "detail": "loaded full mix"
365
+ },
366
+ {
367
+ "key": "bpm",
368
+ "label": "Tempo detection",
369
+ "duration_sec": 0.2141354419999857,
370
+ "status": "done",
371
+ "detail": "120.2 BPM"
372
+ },
373
+ {
374
+ "key": "onsets",
375
+ "label": "Onset detection + slicing",
376
+ "duration_sec": 2.8706004370000073,
377
+ "status": "done",
378
+ "detail": "64 hits"
379
+ },
380
+ {
381
+ "key": "classification",
382
+ "label": "Spectral rule classification",
383
+ "duration_sec": 0.036172296999950504,
384
+ "status": "done",
385
+ "detail": "bright:11, cymbal:2, hihat_closed:45, hihat_open:4, mid:2"
386
+ },
387
+ {
388
+ "key": "clustering",
389
+ "label": "Mel fingerprint + transient NCC clustering",
390
+ "duration_sec": 0.9130003360000387,
391
+ "status": "done",
392
+ "detail": "1 clusters"
393
+ },
394
+ {
395
+ "key": "selection",
396
+ "label": "Best representative scoring",
397
+ "duration_sec": 0.6508792970000172,
398
+ "status": "done",
399
+ "detail": "quality-scored representatives"
400
+ },
401
+ {
402
+ "key": "synthesis",
403
+ "label": "Optional sample synthesis",
404
+ "duration_sec": 0.0025003810000043813,
405
+ "status": "done",
406
+ "detail": "1 synthesized alternates"
407
+ },
408
+ {
409
+ "key": "export",
410
+ "label": "MIDI, reconstruction, WAV, ZIP export",
411
+ "duration_sec": 0.04621197200003735,
412
+ "status": "done",
413
+ "detail": "1 WAVs + MIDI + ZIP"
414
+ }
415
+ ]
416
+ }
417
+ ],
418
+ "summary": [
419
+ {
420
+ "stage": "stem",
421
+ "mean_sec": 0.0166,
422
+ "median_sec": 0.012907,
423
+ "min_sec": 0.009299,
424
+ "max_sec": 0.038692
425
+ },
426
+ {
427
+ "stage": "bpm",
428
+ "mean_sec": 0.224232,
429
+ "median_sec": 0.223463,
430
+ "min_sec": 0.20633,
431
+ "max_sec": 0.241075
432
+ },
433
+ {
434
+ "stage": "onsets",
435
+ "mean_sec": 2.139802,
436
+ "median_sec": 2.034223,
437
+ "min_sec": 1.76233,
438
+ "max_sec": 2.8706
439
+ },
440
+ {
441
+ "stage": "classification",
442
+ "mean_sec": 0.034414,
443
+ "median_sec": 0.035394,
444
+ "min_sec": 0.024017,
445
+ "max_sec": 0.044757
446
+ },
447
+ {
448
+ "stage": "clustering",
449
+ "mean_sec": 0.495985,
450
+ "median_sec": 0.597248,
451
+ "min_sec": 0.059102,
452
+ "max_sec": 0.913
453
+ },
454
+ {
455
+ "stage": "selection",
456
+ "mean_sec": 0.498709,
457
+ "median_sec": 0.551451,
458
+ "min_sec": 0.31063,
459
+ "max_sec": 0.650879
460
+ },
461
+ {
462
+ "stage": "synthesis",
463
+ "mean_sec": 0.002357,
464
+ "median_sec": 0.00248,
465
+ "min_sec": 0.001501,
466
+ "max_sec": 0.002875
467
+ },
468
+ {
469
+ "stage": "export",
470
+ "mean_sec": 0.104557,
471
+ "median_sec": 0.103001,
472
+ "min_sec": 0.046212,
473
+ "max_sec": 0.177949
474
+ }
475
+ ]
476
+ }
legacy/gradio_app.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio UI — Sample Extractor v9.
3
+ SuperFlux onsets, transient NCC, mel pre-filter, MIDI quantization, param locking.
4
+ """
5
+
6
import os, sys

# Make the modules imported below resolvable. This file lives in legacy/,
# while the shared pipeline modules (sample_extractor, synth_generator, ...)
# are expected one directory up, so both locations go on sys.path. The file's
# own directory keeps priority, exactly as before.
_here = os.path.dirname(os.path.abspath(__file__))
_root = os.path.dirname(_here)
sys.path.insert(0, _here)
if _root not in sys.path:
    sys.path.insert(1, _root)

# ─── HOTFIX: patch _sf() keyword argument bug ────────────────────────────────
# Rewrites sample_extractor.py on disk when it still contains the call with
# the wrong keyword (lag= instead of l=). The file is looked up next to this
# module first and then in the parent directory; when it is found in neither
# place the hotfix is skipped instead of crashing the import with
# FileNotFoundError.
_src = None
for _dir in (_here, _root):
    _candidate = os.path.join(_dir, 'sample_extractor.py')
    if os.path.isfile(_candidate):
        _src = _candidate
        break
if _src is not None:
    # Python sources are UTF-8 by default; be explicit so the rewrite does not
    # depend on the platform's locale encoding.
    with open(_src, 'r', encoding='utf-8') as _f:
        _content = _f.read()
    if '_sf(yh,lag=2,ms=5)' in _content:
        _content = _content.replace('_sf(yh,lag=2,ms=5)', '_sf(yh,l=2,ms=5)')
        with open(_src, 'w', encoding='utf-8') as _f:
            _f.write(_content)
        print("[HOTFIX] Fixed _sf() kwarg: lag=2 → l=2")
    del _content
del _src, _dir, _candidate, _here, _root
# ──────────────────────────────────────────────────────────────────────────────
18
+
19
+ import gradio as gr
20
+ import numpy as np, pandas as pd, json, tempfile
21
+ import soundfile as sf, librosa
22
+ import matplotlib; matplotlib.use('Agg')
23
+ import matplotlib.pyplot as plt
24
+
25
+ from sample_extractor import (
26
+ extract_stem, detect_onsets, classify_hits,
27
+ cluster_hits, select_best, synthesize_from_cluster,
28
+ sample_quality_score, export_midi, detect_bpm,
29
+ render_midi_with_samples, build_archive, cache_clear, auto_tune,
30
+ DEMUCS_MODELS, DEMUCS_STEMS,
31
+ )
32
+ from synth_generator import generate_test_song
33
+ from evaluation import evaluate_extraction
34
+ from config_store import PipelineConfig, get_leaderboard
35
+ from optimizer_v2 import run_optimization
36
+
37
def audio_tuple(a, sr):
    """Return ``(sr, audio)`` with the audio cast to float32 and peak-scaled to 0.95.

    Silent input (peak of 0) and empty input are returned unscaled, so the
    function never divides by zero and never fails on a zero-length array.
    """
    a = a.astype(np.float32)
    # ndarray.max() raises ValueError on a zero-size array; treat empty audio
    # as having no peak instead.
    pk = np.abs(a).max() if a.size else 0.0
    if pk > 0:
        a = a / pk * 0.95
    return (sr, a)
41
+
42
def run_auto_tune(audio_in, stem_choice, demucs_model, demucs_shifts, demucs_overlap,
                  onset_mode, cur_delta, cur_energy, cur_gap, cur_tmin, cur_tmax,
                  lock_delta, lock_energy, lock_gap, lock_targets, progress=gr.Progress()):
    """Auto-tune the onset/cluster parameters for the uploaded audio.

    Returns a 7-item list: updates for the delta, energy, min-gap, target-min
    and target-max controls (an empty update for every locked control),
    followed by a Markdown summary and the last 30 lines of the tuning log.
    """
    if audio_in is None:
        return [gr.update()] * 5 + ["Upload audio first", ""]

    # Locked parameters are handed to auto_tune with their current values.
    locks = {}
    if lock_delta:
        locks['onset_delta'] = float(cur_delta)
    if lock_energy:
        locks['energy_threshold_db'] = float(cur_energy)
    if lock_gap:
        locks['min_gap'] = float(cur_gap)
    if lock_targets:
        locks['target_min'] = int(cur_tmin)
        locks['target_max'] = int(cur_tmax)

    progress(0.0)
    sr_in, samples = audio_in
    samples = samples.astype(np.float32)
    if samples.ndim > 1:
        # Collapse multi-channel input to mono.
        samples = samples.mean(axis=1)
    peak = np.abs(samples).max()
    if peak > 0:
        samples /= peak

    # extract_stem takes a file path, so the audio goes through a temp WAV.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as handle:
        sf.write(handle.name, samples, sr_in)
        tmp_path = handle.name

    try:
        progress(0.05, desc="Stem...")
        stem_audio, stem_sr = extract_stem(
            tmp_path,
            stem=stem_choice,
            device="cpu",
            model_name=demucs_model,
            shifts=int(demucs_shifts),
            overlap=float(demucs_overlap),
        )
        if locks:
            lock_desc = ', '.join(f'{name}={value}' for name, value in locks.items())
        else:
            lock_desc = 'none'
        progress(0.15, desc=f"Tuning (🔒 {lock_desc})...")
        best_params, best_score, log = auto_tune(stem_audio, stem_sr, mode=onset_mode, locks=locks)
        progress(1.0)

        log_text = '\n'.join(log[-30:])
        lock_info = f"🔒 Locked: {lock_desc}" if locks else "All params free"
        summary = f"**Score: {best_score:.1f}/100** · {lock_info}\n\nClick **Extract** to use these settings."

        # Locked controls get an empty update so their value stays untouched.
        return [
            gr.update() if lock_delta else gr.update(value=best_params['onset_delta']),
            gr.update() if lock_energy else gr.update(value=best_params['energy_threshold_db']),
            gr.update() if lock_gap else gr.update(value=best_params['min_gap']),
            gr.update() if lock_targets else gr.update(value=best_params.get('target_min', 5)),
            gr.update() if lock_targets else gr.update(value=best_params.get('target_max', 20)),
            summary,
            log_text,
        ]
    finally:
        os.unlink(tmp_path)
75
+
76
def run_extraction(audio_in, stem_choice, demucs_model, demucs_shifts, demucs_overlap,
                   onset_mode, onset_delta, energy_db, pre_pad, min_dur, max_dur, min_gap,
                   ncc_threshold, attack_ms, linkage, target_min, target_max,
                   do_synthesize, quantize_midi, subdivision, progress=gr.Progress()):
    """Run the full extraction for the uploaded audio and return the 8 UI outputs.

    Outputs, in order: stem audio, Markdown summary, reconstruction audio,
    list of sample WAV paths, MIDI path, ZIP path, an empty status string and
    the per-sample metrics DataFrame. Returns ``[None] * 8`` when nothing was
    uploaded, and a reduced tuple (stem + "No hits" message) when onset
    detection finds nothing.
    """
    if audio_in is None:
        return [None] * 8

    progress(0.0)
    sr_in, data = audio_in
    data = data.astype(np.float32)
    if data.ndim > 1:
        data = data.mean(axis=1)
    pk = np.abs(data).max()
    if pk > 0:
        data /= pk

    # extract_stem takes a file path, so the audio goes through a temp WAV.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
        sf.write(f.name, data, sr_in)
        tmp = f.name

    try:
        progress(0.05, desc=f"Stem ({demucs_model})...")
        sa, ssr = extract_stem(tmp, stem=stem_choice, device="cpu",
                               model_name=demucs_model, shifts=int(demucs_shifts),
                               overlap=float(demucs_overlap))
        progress(0.15, desc="BPM...")
        bpm = detect_bpm(sa, ssr)
        progress(0.25, desc="Onsets...")
        hits = detect_onsets(sa, ssr, mode=onset_mode, onset_delta=float(onset_delta),
                             energy_threshold_db=float(energy_db), pre_pad=float(pre_pad),
                             min_dur=float(min_dur), max_dur=float(max_dur), min_gap=float(min_gap))
        if not hits:
            return (audio_tuple(sa, ssr), f"**BPM: {bpm}** — No hits.", None, None, None, None, "", pd.DataFrame())

        progress(0.35, desc="Classify...")
        hits = classify_hits(hits)
        progress(0.45, desc="Cluster...")
        cl = cluster_hits(hits, audio=sa, sr=ssr, ncc_threshold=float(ncc_threshold),
                          attack_ms=float(attack_ms), target_min=int(target_min),
                          target_max=int(target_max), linkage=str(linkage))
        progress(0.65, desc="Select...")
        select_best(cl)
        if do_synthesize:
            progress(0.7, desc="Synth...")
            for c in cl:
                if c.count >= 2:
                    c.synthesized = synthesize_from_cluster(c)

        progress(0.75, desc="MIDI...")
        # mkstemp creates the file atomically; the deprecated mktemp only
        # returns a name that another process could claim before it is used.
        fd, mp = tempfile.mkstemp(suffix='.mid')
        os.close(fd)
        export_midi(cl, mp, bpm=bpm, quantize=bool(quantize_midi), subdivision=int(subdivision))
        progress(0.8, desc="Render...")
        rend = render_midi_with_samples(cl, sr=ssr)
        progress(0.85, desc="Package...")

        # Largest clusters first; the same order drives the files, the table
        # and the summary, so sort once.
        ranked = sorted(cl, key=lambda x: x.count, reverse=True)

        sd = tempfile.mkdtemp()
        sp = []
        for c in ranked:
            p = os.path.join(sd, f"{c.label}.wav")
            c.best_hit.save(p)
            sp.append(p)
        zp = build_archive(cl, bpm, ssr, midi_path=mp, rendered_audio=rend)

        rows = []
        for c in ranked:
            b = c.best_hit
            # The label without its trailing "_<suffix>" is passed as the class name.
            sc = sample_quality_score(b.audio, b.sr, c.label.rsplit('_', 1)[0])
            rows.append({'Sample': c.label, 'Hits': c.count, 'MIDI': c.midi_note,
                         'Score': f"{sc['total']:.1f}", 'Clean': f"{sc['cleanness']:.2f}",
                         'Complete': f"{sc['completeness']:.2f}",
                         'Dur': f"{b.duration*1000:.0f}ms",
                         'First': f"{min(h.onset_time for h in c.hits):.2f}s"})

        sm = f"**BPM: {bpm}** · **{len(cl)} samples** from {len(hits)} hits\n\n"
        sm += f"`{demucs_model}` · δ=`{onset_delta}` · E=`{energy_db}dB` · attack=`{attack_ms}ms`"
        if int(target_min) > 0 and int(target_max) > 0:
            sm += f" · clusters `{int(target_min)}–{int(target_max)}`"
        if quantize_midi:
            sm += f" · MIDI 1/{int(subdivision)}"
        sm += "\n\n| Sample | Hits | MIDI |\n|---|---|---|\n"
        for c in ranked:
            sm += f"| {c.label} | {c.count} | {c.midi_note} |\n"

        progress(1.0)
        return (audio_tuple(sa, ssr), sm, audio_tuple(rend, ssr), sp, mp, zp, "", pd.DataFrame(rows))
    finally:
        # The temp input WAV is only needed while extract_stem runs.
        os.unlink(tmp)
132
+
133
def run_eval(pattern, bpm, bars, ncc_threshold, target_min, target_max, progress=gr.Progress()):
    """Score the extraction pipeline against a generated test song.

    Returns six values: the original mix, the reconstruction, a summary
    DataFrame, a per-match DataFrame (``None`` when there are no matches) and
    two empty strings. When no onsets are detected, the first four values are
    ``None``.
    """
    progress(0.0)
    song = generate_test_song(
        pattern_name=pattern,
        bars=int(bars),
        bpm=float(bpm),
        variation='medium',
        seed=42,
    )
    detected_bpm = detect_bpm(song.drums_only, song.sr)
    progress(0.2)

    hits = detect_onsets(song.drums_only, song.sr)
    if not hits:
        return None, None, None, None, "", ""

    hits = classify_hits(hits)
    clusters = cluster_hits(
        hits,
        audio=song.drums_only,
        sr=song.sr,
        ncc_threshold=float(ncc_threshold),
        target_min=int(target_min),
        target_max=int(target_max),
    )
    select_best(clusters)
    for cluster in clusters:
        if cluster.count >= 2:
            cluster.synthesized = synthesize_from_cluster(cluster)

    progress(0.5)
    rendered = render_midi_with_samples(clusters, sr=song.sr)
    progress(0.6)

    # Ground truth taken from the generated song: sample audio and hit list.
    gt_samples = {name: sample.audio for name, sample in song.samples.items()}
    gt_hits = [
        {'sample': hit.sample_name, 'onset': hit.onset_time, 'velocity': hit.velocity}
        for hit in song.hits
    ]
    report = evaluate_extraction(clusters, gt_samples, gt_hits, song.sr, hits)

    summary_rows = [
        {'Metric': 'BPM', 'Value': f"{detected_bpm}", 'Target': f"{song.bpm}"},
        {'Metric': 'Clusters', 'Value': str(len(clusters)), 'Target': str(len(gt_samples))},
        {'Metric': 'Score', 'Value': f"{report.overall_score:.1f}/100", 'Target': '> 70'},
    ]
    if report.unmatched_gt:
        summary_rows.append({'Metric': '⚠', 'Value': ', '.join(report.unmatched_gt), 'Target': 'None'})

    match_rows = [
        {'Cluster': match.cluster_label, 'GT': match.gt_name, 'Score': f"{match.sample_score:.1f}"}
        for match in report.matches
    ]
    match_table = pd.DataFrame(match_rows) if match_rows else None

    progress(1.0)
    return (
        audio_tuple(song.mix, song.sr),
        audio_tuple(rendered, song.sr),
        pd.DataFrame(summary_rows),
        match_table,
        "",
        "",
    )
155
+
156
def run_optimize(n, name, author, save, progress=gr.Progress()):
    """Run the optimizer and return (log text, history table, score plot, best config JSON).

    Empty ``name`` / ``author`` fall back to "opt" / "anon". The plot shows
    the average score per iteration, or a "No data" placeholder when the
    optimizer produced no history.
    """
    logs = []

    def record(m):
        logs.append(m)

    progress(0.0)
    state = run_optimization(
        n_iterations=int(n),
        config_name=name or "opt",
        author=author or "anon",
        save_to_hub=bool(save),
        log_fn=record,
    )
    progress(1.0)

    history_rows = [
        {'Iter': entry.iteration, 'Score': f"{entry.avg_score:.1f}"}
        for entry in state.history
    ]

    if state.history:
        fig, ax = plt.subplots(figsize=(10, 4))
        iterations = [entry.iteration for entry in state.history]
        scores = [entry.avg_score for entry in state.history]
        ax.plot(iterations, scores, 'b-o')
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
    else:
        fig, ax = plt.subplots()
        ax.text(0.5, 0.5, "No data")

    log_text = '\n'.join(logs)
    config_json = json.dumps(state.best_config, indent=2)
    return log_text, pd.DataFrame(history_rows), fig, config_json
167
+
168
def refresh_lb():
    """Return ``(leaderboard DataFrame, error text)``.

    Any exception while fetching or building the table is reported as text
    next to an empty DataFrame instead of being raised.
    """
    try:
        entries = get_leaderboard()
        table = pd.DataFrame(entries) if entries else pd.DataFrame()
    except Exception as err:
        return pd.DataFrame(), str(err)
    return table, ""
171
+
172
def build_app():
    """Build and return the Gradio Blocks UI.

    Four tabs are created (Extract, Evaluate, Optimize, Leaderboard) and their
    buttons are wired to run_auto_tune, run_extraction, run_eval, run_optimize
    and refresh_lb. The app is returned without being launched.
    """
    with gr.Blocks(title="🎵 Sample Extractor",theme=gr.themes.Soft(),
    css=".gradio-container{max-width:1300px!important}") as app:
        gr.Markdown("# 🎵 Sample Extractor v9\n"
        "**SuperFlux** onsets · **Transient NCC** (25ms attack) · "
        "**Mel pre-filter** · **MIDI quantization** · **Auto-Tune** with 🔒 locks")
        with gr.Tabs():
            with gr.Tab("🎵 Extract"):
                audio_in=gr.Audio(sources=['upload'],type='numpy',label='Upload Audio')
                with gr.Accordion("🔧 Stem Separation",open=False):
                    with gr.Row():
                        dm=gr.Dropdown(DEMUCS_MODELS,value="htdemucs_ft",label="Model")
                        st=gr.Dropdown(['drums','bass','other','vocals','all'],value='drums',label='Stem')
                        dsh=gr.Slider(0,5,value=1,step=1,label='Shifts')
                        dov=gr.Slider(0.0,0.5,value=0.25,step=0.05,label='Overlap')
                with gr.Accordion("🎯 Onset Detection",open=False):
                    with gr.Row(): om=gr.Dropdown(['auto','percussive','harmonic','broadband'],value='auto',label='Mode')
                    # Each tunable slider is paired with a lock checkbox that is
                    # passed to run_auto_tune as the matching lock_* argument.
                    with gr.Row():
                        od=gr.Slider(0.01,0.5,value=0.12,step=0.01,label='Delta'); lock_od=gr.Checkbox(value=False,label='🔒',scale=0)
                    with gr.Row():
                        ed=gr.Slider(-70,-10,value=-35,step=1,label='Energy (dB)'); lock_ed=gr.Checkbox(value=False,label='🔒',scale=0)
                    with gr.Row():
                        mg=gr.Slider(0.005,0.2,value=0.03,step=0.005,label='Min gap'); lock_mg=gr.Checkbox(value=False,label='🔒',scale=0)
                    with gr.Row():
                        pp=gr.Slider(0.0,0.05,value=0.003,step=0.001,label='Pre-pad')
                        mnd=gr.Slider(0.005,0.2,value=0.02,step=0.005,label='Min dur')
                        mxd=gr.Slider(0.1,5.0,value=1.5,step=0.1,label='Max dur')
                with gr.Accordion("🔗 Clustering",open=True):
                    with gr.Row():
                        tmin=gr.Number(value=5,label='Target min',precision=0)
                        tmax=gr.Number(value=20,label='Target max',precision=0)
                        lock_tgt=gr.Checkbox(value=True,label='🔒 Lock range',scale=0)
                    gr.Markdown("*🔒 = auto-tune keeps this value fixed*")
                    with gr.Row():
                        nt=gr.Slider(0.3,0.99,value=0.80,step=0.01,label='NCC threshold')
                        atk=gr.Slider(10,100,value=25,step=5,label='Attack (ms)')
                        lnk=gr.Dropdown(['average','complete','single'],value='average',label='Linkage')
                with gr.Accordion("🎹 MIDI & Post",open=False):
                    with gr.Row():
                        syn=gr.Checkbox(value=True,label='Synthesize')
                        qmidi=gr.Checkbox(value=True,label='Quantize MIDI')
                        subdiv=gr.Dropdown([('8th',8),('16th',16),('32nd',32)],value=16,label='Grid')
                with gr.Row():
                    tune_btn=gr.Button("🎛️ Auto-Tune",variant="secondary",size="lg")
                    extract_btn=gr.Button("🔬 Extract",variant="primary",size="lg")
                tune_summary=gr.Markdown(""); tune_log=gr.Textbox(label="Log",lines=8,max_lines=15,visible=False)
                summary_md=gr.Markdown("*Upload → Auto-Tune or Extract*")
                with gr.Row():
                    stem_out=gr.Audio(type='numpy',label='Stem',interactive=False)
                    rend_out=gr.Audio(type='numpy',label='🔊 Reconstruction',interactive=False)
                gr.Markdown("### Downloads")
                with gr.Row():
                    arc=gr.File(label="📦 ZIP",interactive=False); mid=gr.File(label="🎹 MIDI",interactive=False)
                smp=gr.File(label="WAVs",file_count="multiple",interactive=False)
                # stx is a hidden textbox that receives the extra string output of run_extraction.
                met=gr.Dataframe(label="Samples"); stx=gr.Textbox(visible=False)
                # Changing the model refreshes the stem choices from DEMUCS_STEMS (plus "all").
                dm.change(fn=lambda m:gr.update(choices=DEMUCS_STEMS.get(m,["drums","bass","other","vocals"])+["all"]),inputs=[dm],outputs=[st])
                # Input order must match the positional parameters of run_auto_tune / run_extraction.
                tune_btn.click(run_auto_tune,[audio_in,st,dm,dsh,dov,om,od,ed,mg,tmin,tmax,lock_od,lock_ed,lock_mg,lock_tgt],
                [od,ed,mg,tmin,tmax,tune_summary,tune_log])
                extract_btn.click(run_extraction,[audio_in,st,dm,dsh,dov,om,od,ed,pp,mnd,mxd,mg,nt,atk,lnk,tmin,tmax,syn,qmidi,subdiv],
                [stem_out,summary_md,rend_out,smp,mid,arc,stx,met])
            with gr.Tab("📊 Evaluate"):
                with gr.Row():
                    ep=gr.Dropdown(['rock','funk','halftime'],value='rock',label='Pattern')
                    eb=gr.Slider(80,200,value=120,step=2,label='BPM'); ebs=gr.Slider(2,8,value=4,step=1,label='Bars')
                with gr.Row():
                    en=gr.Slider(0.3,0.99,value=0.80,step=0.01,label='NCC')
                    etm=gr.Number(value=0,label='Min',precision=0); etx=gr.Number(value=0,label='Max',precision=0)
                evb=gr.Button("🧪 Evaluate",variant="primary",size="lg")
                with gr.Row():
                    evm=gr.Audio(type='numpy',label='Original',interactive=False)
                    evr=gr.Audio(type='numpy',label='Reconstruction',interactive=False)
                evs=gr.Dataframe(); evm2=gr.Dataframe()
                # Hidden textboxes that absorb the two trailing string outputs of run_eval.
                es1=gr.Textbox(visible=False); es2=gr.Textbox(visible=False)
                evb.click(run_eval,[ep,eb,ebs,en,etm,etx],[evm,evr,evs,evm2,es1,es2])
            with gr.Tab("🔄 Optimize"):
                with gr.Row():
                    on=gr.Slider(2,30,value=5,step=1,label='Iters'); ocn=gr.Textbox(value="opt",label='Name')
                    oa=gr.Textbox(value="",label='Author'); osv=gr.Checkbox(value=True,label='Save')
                ob=gr.Button("🚀 Run",variant="primary",size="lg")
                ol=gr.Textbox(label="Log",lines=20,max_lines=40); oh=gr.Dataframe(); op=gr.Plot()
                oc=gr.Code(label="Config",language="json")
                ob.click(run_optimize,[on,ocn,oa,osv],[ol,oh,op,oc])
            with gr.Tab("🏆 Leaderboard"):
                # ls is a hidden textbox that receives the error string returned by refresh_lb.
                lbb=gr.Button("🔄 Refresh"); lt=gr.Dataframe(); ls=gr.Textbox(visible=False)
                lbb.click(refresh_lb,[],[lt,ls])
    return app
258
+
259
+ if __name__=="__main__": build_app().launch(server_name="0.0.0.0",server_port=7860)
app_v2.py → legacy/gradio_app_v2.py RENAMED
File without changes
pipeline_runner.py ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Timed extraction pipeline used by the FastAPI app and benchmarks."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import json
7
+ import os
8
+ import shutil
9
+ import tempfile
10
+ import time
11
+ from contextlib import contextmanager
12
+ from dataclasses import asdict, dataclass, field
13
+ from pathlib import Path
14
+ from typing import Any, Callable
15
+
16
+ import librosa
17
+ import numpy as np
18
+ import soundfile as sf
19
+
20
+ from sample_extractor import (
21
+ DEMUCS_MODELS,
22
+ DEMUCS_STEMS,
23
+ build_archive,
24
+ classify_hits,
25
+ cluster_hits,
26
+ detect_bpm,
27
+ detect_onsets,
28
+ export_midi,
29
+ extract_stem,
30
+ render_midi_with_samples,
31
+ sample_quality_score,
32
+ select_best,
33
+ synthesize_from_cluster,
34
+ )
35
+
36
+ ProgressCallback = Callable[[dict[str, Any]], None]
37
+
38
+
39
@dataclass
class PipelineParams:
    """User-tunable settings for one extraction run.

    Instances built through :meth:`from_mapping` are validated; constructing
    the dataclass directly performs no checks until :meth:`validate` is called.
    """

    stem: str = "drums"
    demucs_model: str = "htdemucs_ft"
    demucs_shifts: int = 1
    demucs_overlap: float = 0.25
    onset_mode: str = "auto"
    onset_delta: float = 0.12
    energy_threshold_db: float = -35.0
    pre_pad: float = 0.003
    min_dur: float = 0.02
    max_dur: float = 1.5
    min_gap: float = 0.03
    ncc_threshold: float = 0.80
    attack_ms: float = 25.0
    mel_threshold: float = 0.75
    linkage: str = "average"
    target_min: int = 5
    target_max: int = 20
    synthesize: bool = True
    quantize_midi: bool = True
    subdivision: int = 16
    device: str = "cpu"

    @classmethod
    def from_mapping(cls, data: dict[str, Any] | None) -> "PipelineParams":
        """Build validated params from a mapping; ``None`` or empty means all defaults.

        Raises:
            ValueError: if the mapping contains a key that is not a field, or
                if any value fails :meth:`validate`.
        """
        values = dict(data or {})
        known = set(cls.__dataclass_fields__)
        extras = sorted(set(values) - known)
        if extras:
            raise ValueError(f"Unknown pipeline parameter(s): {', '.join(extras)}")
        instance = cls(**values)
        instance.validate()
        return instance

    def validate(self) -> None:
        """Check every field against its allowed set or range.

        Checks run in a fixed order and the first failure raises ``ValueError``.
        """

        def require(ok: bool, message: str) -> None:
            if not ok:
                raise ValueError(message)

        # Categorical options.
        require(self.demucs_model in DEMUCS_MODELS, f"Unsupported Demucs model: {self.demucs_model}")
        # "all" is always accepted on top of the stems the chosen model offers.
        allowed_stems = set(DEMUCS_STEMS.get(self.demucs_model, [])) | {"all"}
        require(self.stem in allowed_stems, f"Stem '{self.stem}' is not available for {self.demucs_model}")
        require(
            self.onset_mode in {"auto", "percussive", "harmonic", "broadband"},
            f"Unsupported onset mode: {self.onset_mode}",
        )
        require(
            self.linkage in {"average", "complete", "single"},
            f"Unsupported clustering linkage: {self.linkage}",
        )

        # Numeric ranges.
        require(0 <= self.demucs_shifts <= 8, "demucs_shifts must be between 0 and 8")
        require(0.0 <= self.demucs_overlap <= 0.9, "demucs_overlap must be between 0.0 and 0.9")
        require(0.001 <= self.onset_delta <= 1.0, "onset_delta must be between 0.001 and 1.0")
        require(
            -100.0 <= self.energy_threshold_db <= 0.0,
            "energy_threshold_db must be between -100 and 0 dB",
        )
        require(0.0 <= self.pre_pad <= 0.25, "pre_pad must be between 0 and 0.25 seconds")
        require(
            0.001 <= self.min_dur <= self.max_dur <= 10.0,
            "duration bounds must satisfy 0.001 <= min_dur <= max_dur <= 10",
        )
        require(0.001 <= self.min_gap <= 1.0, "min_gap must be between 0.001 and 1.0 seconds")
        require(0.0 <= self.ncc_threshold <= 1.0, "ncc_threshold must be between 0 and 1")
        require(1.0 <= self.attack_ms <= 250.0, "attack_ms must be between 1 and 250 ms")
        require(0.0 <= self.mel_threshold <= 1.0, "mel_threshold must be between 0 and 1")

        # Cluster-count targets: the ordering is only checked when both are non-zero.
        require(
            not (self.target_min < 0 or self.target_max < 0),
            "target_min and target_max must be non-negative",
        )
        require(
            not (self.target_max and self.target_min and self.target_min > self.target_max),
            "target_min cannot be greater than target_max",
        )
        require(self.subdivision in {4, 8, 16, 32, 64}, "subdivision must be one of 4, 8, 16, 32, 64")
110
+
111
+
112
@dataclass
class StageTiming:
    """Progress record for one pipeline stage, updated in place while it runs."""

    # Stage identifier; matches the first element of a STAGE_DEFS entry.
    key: str
    # Human-readable stage name shown to the user.
    label: str
    # Elapsed time measured with time.perf_counter(); stays 0.0 for stages that never run.
    duration_sec: float = 0.0
    # "pending" initially; _timed_stage sets "running", then "done" or "error".
    status: str = "pending"
    # Free-text result summary, or the exception text when the stage failed.
    detail: str = ""
119
+
120
+
121
@dataclass
class PipelineResult:
    """Summary of one extraction run; also written to manifest.json as a dict."""

    # The PipelineParams used for the run, as a plain dict.
    params: dict[str, Any]
    # Wall-clock time of the whole run, in seconds.
    duration_sec: float
    # Length of the extracted stem, in seconds.
    audio_duration_sec: float
    # duration_sec divided by audio_duration_sec.
    realtime_factor: float
    # Tempo reported by detect_bpm.
    bpm: float | None
    # Sample rate of the stem audio.
    sample_rate: int
    # Number of detected hits and of resulting clusters.
    hit_count: int
    cluster_count: int
    # One StageTiming dict per pipeline stage.
    stages: list[dict[str, Any]]
    # One row per exported sample (label, score, file, ...).
    samples: list[dict[str, Any]]
    # Waveform envelope and onset markers produced by _make_overview.
    overview: dict[str, Any]
    # Artifact names relative to the output directory (stem, reconstruction, midi, archive).
    files: dict[str, str]
135
+
136
+
137
# Ordered (key, label) pairs for every pipeline stage. The keys are the ones
# passed to _timed_stage; the list order is the order stages are reported in.
STAGE_DEFS = [
    ("stem", "Stem extraction / source load"),
    ("bpm", "Tempo detection"),
    ("onsets", "Onset detection + slicing"),
    ("classification", "Spectral rule classification"),
    ("clustering", "Mel fingerprint + transient NCC clustering"),
    ("selection", "Best representative scoring"),
    ("synthesis", "Optional sample synthesis"),
    ("export", "MIDI, reconstruction, WAV, ZIP export"),
]
147
+
148
+
149
def initial_stages() -> list[dict[str, Any]]:
    """Return a fresh list of stage dicts, one per STAGE_DEFS entry, all still pending."""
    pending: list[dict[str, Any]] = []
    for stage_key, stage_label in STAGE_DEFS:
        pending.append(asdict(StageTiming(key=stage_key, label=stage_label)))
    return pending
151
+
152
+
153
+ def _notify(cb: ProgressCallback | None, payload: dict[str, Any]) -> None:
154
+ if cb:
155
+ cb(payload)
156
+
157
+
158
@contextmanager
def _timed_stage(stages: list[StageTiming], key: str, cb: ProgressCallback | None = None):
    """Time one pipeline stage and report its status changes.

    The stage with the given ``key`` is marked "running" and yielded. On exit
    its duration is recorded and its status becomes "done", or "error" (with
    the exception text as detail) before the exception is re-raised. The
    callback is notified after every status change with the stage and the
    full stage list.
    """
    current = next(item for item in stages if item.key == key)

    def publish() -> None:
        _notify(
            cb,
            {"type": "stage", "stage": asdict(current), "stages": [asdict(item) for item in stages]},
        )

    current.status = "running"
    publish()
    t0 = time.perf_counter()
    try:
        yield current
    except Exception as exc:
        current.duration_sec = time.perf_counter() - t0
        current.status = "error"
        current.detail = str(exc)
        publish()
        raise
    # Only reached when the body finished without raising.
    current.duration_sec = time.perf_counter() - t0
    current.status = "done"
    publish()
176
+
177
+
178
+ def _normalise_audio(audio: np.ndarray) -> np.ndarray:
179
+ audio = audio.astype(np.float32)
180
+ if audio.ndim > 1:
181
+ audio = audio.mean(axis=1)
182
+ peak = float(np.max(np.abs(audio))) if audio.size else 0.0
183
+ if peak > 0:
184
+ audio = audio / peak
185
+ return audio.astype(np.float32)
186
+
187
+
188
def _write_audio(path: Path, audio: np.ndarray, sr: int, subtype: str = "PCM_24") -> None:
    """Write ``audio`` to ``path`` via soundfile, creating parent directories first."""
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    samples = audio.astype(np.float32)
    sf.write(path, samples, sr, subtype=subtype)
191
+
192
+
193
+ def _make_overview(audio: np.ndarray, sr: int, hits: list[Any], max_points: int = 1600) -> dict[str, Any]:
194
+ if len(audio) == 0:
195
+ return {"sample_rate": sr, "duration_sec": 0, "envelope": [], "onsets": []}
196
+ frame = max(1, int(np.ceil(len(audio) / max_points)))
197
+ usable = (len(audio) // frame) * frame
198
+ if usable == 0:
199
+ envelope = [float(np.max(np.abs(audio)))]
200
+ else:
201
+ envelope = np.max(np.abs(audio[:usable].reshape(-1, frame)), axis=1).astype(float).tolist()
202
+ return {
203
+ "sample_rate": sr,
204
+ "duration_sec": round(len(audio) / sr, 6),
205
+ "frame_duration_sec": round(frame / sr, 6),
206
+ "envelope": [round(float(x), 6) for x in envelope],
207
+ "onsets": [
208
+ {
209
+ "time_sec": round(float(h.onset_time), 6),
210
+ "label": h.label,
211
+ "energy": round(float(h.rms_energy), 6),
212
+ "cluster_id": int(getattr(h, "cluster_id", -1)),
213
+ }
214
+ for h in hits
215
+ ],
216
+ }
217
+
218
+
219
+ def _copy_temp_file(src: str | os.PathLike[str], dst: Path) -> str:
220
+ dst.parent.mkdir(parents=True, exist_ok=True)
221
+ shutil.copyfile(src, dst)
222
+ return str(dst)
223
+
224
+
225
def run_extraction_pipeline(
    audio_path: str | os.PathLike[str],
    output_dir: str | os.PathLike[str],
    params: PipelineParams | dict[str, Any] | None = None,
    progress_cb: ProgressCallback | None = None,
) -> PipelineResult:
    """Run extraction and write all runtime artifacts into output_dir.

    Stages run in STAGE_DEFS order, each timed and reported via
    ``progress_cb``. When no hits are detected, the classification,
    clustering, selection and synthesis stages are marked done with a
    "skipped" detail, and the export stage still writes a silent
    reconstruction and an empty MIDI file.

    Files written under ``output_dir``: ``stem.wav``, ``reconstruction.wav``,
    ``reconstruction.mid``, ``samples/<label>.wav`` (plus
    ``samples/<label>__synth.wav`` for synthesized alternates),
    ``sample-pack.zip`` and ``manifest.json``.

    Args:
        audio_path: Input audio file handed to ``extract_stem``.
        output_dir: Directory for all artifacts; created if missing.
        params: A ``PipelineParams`` instance (used as given, not
            re-validated) or a mapping / ``None`` that is converted with
            ``PipelineParams.from_mapping`` and therefore validated.
        progress_cb: Optional callable receiving "start", "stage" and
            "complete" payload dicts.

    Returns:
        The ``PipelineResult`` that is also serialised to ``manifest.json``.

    Raises:
        ValueError: if ``params`` is a mapping that fails validation.
        Exception: whatever a stage raises, after that stage has been
            reported with status "error".
    """
    if not isinstance(params, PipelineParams):
        params = PipelineParams.from_mapping(params)

    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    samples_dir = out / "samples"
    samples_dir.mkdir(parents=True, exist_ok=True)

    stages = [StageTiming(key=key, label=label) for key, label in STAGE_DEFS]
    started_total = time.perf_counter()

    bpm: float | None = None
    stem_audio: np.ndarray
    stem_sr: int
    hits: list[Any] = []
    clusters: list[Any] = []
    rendered: np.ndarray | None = None

    _notify(progress_cb, {"type": "start", "stages": [asdict(s) for s in stages]})

    with _timed_stage(stages, "stem", progress_cb) as stage:
        stem_audio, stem_sr = extract_stem(
            str(audio_path),
            stem=params.stem,
            device=params.device,
            model_name=params.demucs_model,
            shifts=int(params.demucs_shifts),
            overlap=float(params.demucs_overlap),
        )
        stem_audio = _normalise_audio(stem_audio)
        stage.detail = f"{params.stem} via {params.demucs_model}" if params.stem != "all" else "loaded full mix"
        _write_audio(out / "stem.wav", stem_audio, stem_sr, subtype="PCM_16")

    audio_duration_sec = len(stem_audio) / stem_sr if stem_sr else 0.0

    with _timed_stage(stages, "bpm", progress_cb) as stage:
        bpm = detect_bpm(stem_audio, stem_sr)
        stage.detail = f"{bpm} BPM"

    with _timed_stage(stages, "onsets", progress_cb) as stage:
        hits = detect_onsets(
            stem_audio,
            stem_sr,
            mode=params.onset_mode,
            onset_delta=float(params.onset_delta),
            energy_threshold_db=float(params.energy_threshold_db),
            pre_pad=float(params.pre_pad),
            min_dur=float(params.min_dur),
            max_dur=float(params.max_dur),
            min_gap=float(params.min_gap),
        )
        stage.detail = f"{len(hits)} hits"

    if hits:
        with _timed_stage(stages, "classification", progress_cb) as stage:
            hits = classify_hits(hits)
            counts: dict[str, int] = {}
            for hit in hits:
                counts[hit.label] = counts.get(hit.label, 0) + 1
            stage.detail = ", ".join(f"{key}:{value}" for key, value in sorted(counts.items()))

        with _timed_stage(stages, "clustering", progress_cb) as stage:
            clusters = cluster_hits(
                hits,
                audio=stem_audio,
                sr=stem_sr,
                ncc_threshold=float(params.ncc_threshold),
                attack_ms=float(params.attack_ms),
                mel_threshold=float(params.mel_threshold),
                target_min=int(params.target_min),
                target_max=int(params.target_max),
                linkage=params.linkage,
            )
            # Tag every hit with its cluster so the overview can report it.
            for cluster in clusters:
                for hit in cluster.hits:
                    hit.cluster_id = cluster.cluster_id
            stage.detail = f"{len(clusters)} clusters"

        with _timed_stage(stages, "selection", progress_cb) as stage:
            select_best(clusters)
            stage.detail = "quality-scored representatives"

        with _timed_stage(stages, "synthesis", progress_cb) as stage:
            if params.synthesize:
                synth_count = 0
                for cluster in clusters:
                    if cluster.count >= 2:
                        cluster.synthesized = synthesize_from_cluster(cluster)
                        synth_count += int(cluster.synthesized is not None)
                stage.detail = f"{synth_count} synthesized alternates"
            else:
                stage.detail = "disabled"
    else:
        # Nothing to process: mark the hit-dependent stages as finished so the
        # stage list never shows them stuck in "pending".
        for key, detail in [
            ("classification", "skipped: no hits"),
            ("clustering", "skipped: no hits"),
            ("selection", "skipped: no hits"),
            ("synthesis", "skipped: no hits"),
        ]:
            stage = next(s for s in stages if s.key == key)
            stage.status = "done"
            stage.detail = detail

    sample_rows: list[dict[str, Any]] = []
    files: dict[str, str] = {"stem": "stem.wav"}

    with _timed_stage(stages, "export", progress_cb) as stage:
        midi_path = out / "reconstruction.mid"
        # Tempo used for MIDI and the archive when detection returned nothing usable.
        export_bpm = bpm or 120.0
        if clusters:
            export_midi(
                clusters,
                str(midi_path),
                bpm=export_bpm,
                quantize=bool(params.quantize_midi),
                subdivision=int(params.subdivision),
            )
            rendered = render_midi_with_samples(clusters, sr=stem_sr)
        else:
            # Keep the set of output files stable even without clusters.
            rendered = np.zeros_like(stem_audio)
            midi_path.write_bytes(b"")

        _write_audio(out / "reconstruction.wav", rendered, stem_sr, subtype="PCM_16")
        files["reconstruction"] = "reconstruction.wav"
        files["midi"] = "reconstruction.mid"

        # Largest clusters first.
        for cluster in sorted(clusters, key=lambda item: item.count, reverse=True):
            best = cluster.best_hit
            sample_path = samples_dir / f"{cluster.label}.wav"
            best.save(str(sample_path))
            # The label without its trailing "_<suffix>" is used as the classification.
            classification = cluster.label.rsplit("_", 1)[0]
            quality = sample_quality_score(best.audio, best.sr, classification)
            sample_rows.append(
                {
                    "label": cluster.label,
                    "classification": classification,
                    "hits": int(cluster.count),
                    "midi_note": int(cluster.midi_note),
                    "score": round(float(quality["total"]), 2),
                    "cleanness": round(float(quality["cleanness"]), 4),
                    "completeness": round(float(quality["completeness"]), 4),
                    "duration_ms": round(float(best.duration * 1000), 1),
                    "first_onset_sec": round(float(min(hit.onset_time for hit in cluster.hits)), 4),
                    "file": f"samples/{cluster.label}.wav",
                }
            )
            if cluster.synthesized is not None:
                synth_path = samples_dir / f"{cluster.label}__synth.wav"
                _write_audio(synth_path, cluster.synthesized, stem_sr)

        archive_tmp = build_archive(clusters, export_bpm, stem_sr, midi_path=str(midi_path), rendered_audio=rendered)
        _copy_temp_file(archive_tmp, out / "sample-pack.zip")
        # The manifest stores names relative to output_dir, like the other entries.
        files["archive"] = "sample-pack.zip"
        try:
            os.unlink(archive_tmp)
        except OSError:
            # Best effort: a leftover temp archive must not fail the run.
            pass
        stage.detail = f"{len(sample_rows)} WAVs + MIDI + ZIP"

    duration_sec = time.perf_counter() - started_total
    result = PipelineResult(
        params=asdict(params),
        duration_sec=round(duration_sec, 6),
        audio_duration_sec=round(audio_duration_sec, 6),
        # The tiny floor avoids dividing by zero for empty audio.
        realtime_factor=round(duration_sec / max(audio_duration_sec, 1e-9), 6),
        bpm=bpm,
        sample_rate=stem_sr,
        hit_count=len(hits),
        cluster_count=len(clusters),
        stages=[asdict(stage) for stage in stages],
        samples=sample_rows,
        overview=_make_overview(stem_audio, stem_sr, hits),
        files=files,
    )

    (out / "manifest.json").write_text(json.dumps(asdict(result), indent=2), encoding="utf-8")
    _notify(progress_cb, {"type": "complete", "result": asdict(result), "stages": result.stages})
    return result
requirements-legacy-gradio.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ -r requirements.txt
2
+ gradio
requirements.txt CHANGED
@@ -1,12 +1,14 @@
1
- demucs==4.0.1
 
 
 
 
2
  librosa>=0.10.0
3
- soundfile
4
- scikit-learn
5
- numpy
6
  torch
7
  torchaudio
8
- scipy
9
- gradio
10
  matplotlib
11
  pandas
12
  pretty_midi
 
1
+ fastapi>=0.110
2
+ uvicorn[standard]>=0.27
3
+ python-multipart>=0.0.9
4
+ numpy>=1.26
5
+ scipy>=1.11
6
  librosa>=0.10.0
7
+ soundfile>=0.12
8
+ scikit-learn>=1.4
 
9
  torch
10
  torchaudio
11
+ demucs==4.0.1
 
12
  matplotlib
13
  pandas
14
  pretty_midi
sample_extractor.py CHANGED
@@ -104,7 +104,7 @@ def detect_onsets(y,sr,pre_pad=0.003,min_dur=0.02,max_dur=1.5,min_gap=0.03,
104
  sr=sr,hop_length=hop_length,lag=l,max_size=ms)
105
  def _n(x): m=x.max(); return x/m if m>0 else x
106
  oe = np.maximum.reduce([_n(_sf(y,20,300)), _n(_sf(y,300,4000)),
107
- _n(_sf(y,4000,16000)), _n(_sf(yh,lag=2,ms=5))])
108
 
109
  wait = max(1, int(min_gap * sr / hop_length))
110
  fr = librosa.onset.onset_detect(onset_envelope=oe,sr=sr,hop_length=hop_length,
 
104
  sr=sr,hop_length=hop_length,lag=l,max_size=ms)
105
  def _n(x): m=x.max(); return x/m if m>0 else x
106
  oe = np.maximum.reduce([_n(_sf(y,20,300)), _n(_sf(y,300,4000)),
107
+ _n(_sf(y,4000,16000)), _n(_sf(yh,l=2,ms=5))])
108
 
109
  wait = max(1, int(min_gap * sr / hop_length))
110
  fr = librosa.onset.onset_detect(onset_envelope=oe,sr=sr,hop_length=hop_length,
scripts/benchmark_subprocesses.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Benchmark significant sample-extraction subprocesses using synthetic fixtures.
3
+
4
+ This intentionally defaults to `stem=all` so the DSP stages can be measured without
5
+ Demucs download/runtime noise. Stem separation is not benchmarked by this script (it
6
+ has no `--include-demucs` option); time Demucs separately with a real input file.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import json
13
+ import statistics
14
+ import sys
15
+ import tempfile
16
+ from pathlib import Path
17
+
18
+ sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
19
+
20
+ import soundfile as sf
21
+
22
+ from pipeline_runner import PipelineParams, run_extraction_pipeline
23
+ from sample_extractor import cache_clear
24
+ from synth_generator import generate_test_song
25
+
26
+
27
def run_case(pattern: str, bars: int, bpm: float, run_index: int) -> dict:
    """Run the extraction pipeline once on a synthetic fixture and report its timings.

    Args:
        pattern: Pattern name forwarded to ``generate_test_song``.
        bars: Number of bars to synthesize.
        bpm: Tempo of the synthesized fixture.
        run_index: Repetition index; also offsets the generator seed (``42 + run_index``).

    Returns:
        A JSON-serializable dict with the case parameters plus the durations,
        hit/cluster counts and per-stage records taken from the pipeline result.
    """
    # The fixture WAV and the pipeline outputs are only needed while the run is in
    # flight; a managed temporary directory removes them afterwards instead of
    # leaving one "dse-bench-*" directory behind per run.
    with tempfile.TemporaryDirectory(prefix="dse-bench-") as tmp_name:
        tmp = Path(tmp_name)
        song = generate_test_song(
            pattern_name=pattern,
            bars=bars,
            bpm=bpm,
            add_bass=False,
            seed=42 + run_index,
        )
        src = tmp / f"{pattern}-{bars}bars.wav"
        sf.write(src, song.drums_only, song.sr)
        # Reset sample_extractor's cache before timing; presumably this keeps an
        # earlier run from making this one look faster (verify against cache_clear).
        cache_clear()
        params = PipelineParams(stem="all", target_min=4, target_max=12, synthesize=True)
        result = run_extraction_pipeline(src, tmp / "out", params)
        # Only plain values are copied out, so nothing returned depends on files
        # inside the temporary directory.
        return {
            "pattern": pattern,
            "bars": bars,
            "bpm": bpm,
            "run_index": run_index,
            "audio_duration_sec": result.audio_duration_sec,
            "total_duration_sec": result.duration_sec,
            "realtime_factor": result.realtime_factor,
            "hit_count": result.hit_count,
            "cluster_count": result.cluster_count,
            "stages": result.stages,
        }
47
+
48
+
49
def main() -> int:
    """Benchmark the pipeline on synthetic patterns and write a per-stage summary.

    Parses ``--runs``, ``--bars``, ``--bpm`` and ``--output``, performs one
    discarded warm-up run, then runs the "rock", "funk" and "halftime" patterns
    ``--runs`` times each. The raw rows and a mean/median/min/max summary per
    stage are written as JSON to ``--output`` and echoed to stdout.

    Returns:
        0 on success. Invalid ``--runs`` values exit through ``parser.error``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--runs", type=int, default=2)
    parser.add_argument("--bars", type=int, default=4)
    parser.add_argument("--bpm", type=float, default=120.0)
    parser.add_argument("--output", default="docs/benchmark-subprocesses.json")
    args = parser.parse_args()

    # With zero (or negative) runs there are no rows, and the summary below would
    # crash on rows[0] only after the warm-up had already been paid for.
    if args.runs < 1:
        parser.error("--runs must be at least 1")

    # Warm imports/JIT and discard the result.
    run_case("rock", 1, args.bpm, -1)

    patterns = ("rock", "funk", "halftime")
    rows = [
        run_case(pattern, args.bars, args.bpm, run_index)
        for run_index in range(args.runs)
        for pattern in patterns
    ]

    # The first row defines which stages are summarized and in what order.
    stage_keys = [stage["key"] for stage in rows[0]["stages"]]
    summary = []
    for key in stage_keys:
        values = [
            next(stage for stage in row["stages"] if stage["key"] == key)["duration_sec"]
            for row in rows
        ]
        summary.append({
            "stage": key,
            "mean_sec": round(statistics.mean(values), 6),
            "median_sec": round(statistics.median(values), 6),
            "min_sec": round(min(values), 6),
            "max_sec": round(max(values), 6),
        })

    payload = {"runs": rows, "summary": summary}
    rendered = json.dumps(payload, indent=2)
    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(rendered, encoding="utf-8")
    print(rendered)
    return 0
83
+
84
+
85
+ if __name__ == "__main__":
86
+ raise SystemExit(main())
scripts/smoke_benchmark.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from pathlib import Path
3
+ sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
4
+ import json
5
+ import tempfile
6
+ import soundfile as sf
7
+ from synth_generator import generate_test_song
8
+ from pipeline_runner import PipelineParams, run_extraction_pipeline
9
+
10
# Smoke run: synthesize a two-bar rock pattern, push it through the extraction
# pipeline with Demucs bypassed (stem='all'), and print the headline metrics.
fixture = generate_test_song(pattern_name='rock', bars=2, bpm=120, add_bass=False)
work_dir = Path(tempfile.mkdtemp(prefix='dse-test-'))
source_wav = work_dir / 'input.wav'
sf.write(source_wav, fixture.drums_only, fixture.sr)

pipeline_params = PipelineParams(stem='all', target_min=4, target_max=8, synthesize=True)
outcome = run_extraction_pipeline(source_wav, work_dir / 'out', pipeline_params)

# Result attributes echoed in the report, in output order.
report_fields = (
    'duration_sec',
    'audio_duration_sec',
    'realtime_factor',
    'hit_count',
    'cluster_count',
    'stages',
    'files',
)
report = {name: getattr(outcome, name) for name in report_fields}
print(json.dumps(report, indent=2))
scripts/test_api_job.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io, json, sys, time
2
+ from pathlib import Path
3
+ sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
4
+ import soundfile as sf
5
+ from fastapi.testclient import TestClient
6
+ from app import app
7
+ from synth_generator import generate_test_song
8
+
9
# End-to-end API check: upload a one-bar synthetic drum loop, poll the job until
# it reaches a terminal state, then assert that hits were extracted.
song = generate_test_song(pattern_name='rock', bars=1, bpm=120, add_bass=False)
buf = io.BytesIO()
sf.write(buf, song.drums_only, song.sr, format='WAV')
buf.seek(0)

client = TestClient(app)
params = {'stem': 'all', 'target_min': 2, 'target_max': 6, 'synthesize': True}
r = client.post(
    '/api/jobs',
    files={'file': ('test.wav', buf, 'audio/wav')},
    data={'params': json.dumps(params)},
)
r.raise_for_status()
job = r.json()

# Poll for up to 60 * 0.25 s = 15 s.
for _ in range(60):
    job = client.get(f"/api/jobs/{job['id']}").json()
    if job['status'] in {'complete', 'error'}:
        break
    time.sleep(0.25)

# `dict.get('result', {})` only covers a missing key. If the API reports
# `"result": null` for failed or unfinished jobs (schema not visible here), the
# chained `.get` would raise AttributeError and hide the real failure.
result = job.get('result') or {}
print(json.dumps({
    'status': job['status'],
    'error': job.get('error'),
    'hit_count': result.get('hit_count'),
    'files': result.get('file_urls'),
}, indent=2))

# Give the assertion a useful message even when polling simply timed out.
failure = job.get('error') or f"job ended polling with status {job['status']!r}"
assert job['status'] == 'complete', failure
assert job['result']['hit_count'] > 0
web/app.js ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Shorthand for document.getElementById; returns null when no element has that id.
const $ = (elementId) => document.getElementById(elementId);
2
+
3
+ const fields = [
4
+ "stem", "demucs_model", "demucs_shifts", "demucs_overlap", "onset_mode", "onset_delta",
5
+ "energy_threshold_db", "pre_pad", "min_dur", "max_dur", "min_gap", "ncc_threshold",
6
+ "attack_ms", "mel_threshold", "linkage", "target_min", "target_max", "subdivision",
7
+ "synthesize", "quantize_midi"
8
+ ];
9
+
10
+ let config = null;
11
+ let selectedFile = null;
12
+ let activePoll = null;
13
+
14
/**
 * Format a duration given in seconds for display.
 *
 * Missing or non-numeric input renders as an em dash. Values under one second
 * are shown in milliseconds (two decimals below 1 ms, one decimal otherwise);
 * everything else is shown in seconds with two decimals.
 */
function fmtSec(value) {
  const missing = value === null || value === undefined;
  if (missing || Number.isNaN(Number(value))) return "—";

  const seconds = Number(value);
  if (seconds >= 1) return `${seconds.toFixed(2)} s`;

  const decimals = seconds < 0.001 ? 2 : 1;
  return `${(seconds * 1000).toFixed(decimals)} ms`;
}
21
+
22
/**
 * Update the backend status indicator in the page header.
 *
 * @param {boolean} ok - Selects the "ok" or "bad" dot styling.
 * @param {string} text - Headline shown next to the dot.
 * @param {string} subtext - Secondary line under the headline.
 */
function setHealth(ok, text, subtext) {
  const stateClass = ok ? "ok" : "bad";
  $("healthDot").className = `status-dot ${stateClass}`;
  $("healthText").textContent = text;
  $("healthSubtext").textContent = subtext;
}
27
+
28
/**
 * Fetch a JSON endpoint and return the parsed body.
 *
 * On a non-2xx response an Error is thrown whose message is the `detail` field
 * of the JSON error body when present, otherwise the HTTP status text.
 */
async function api(path, options = {}) {
  const response = await fetch(path, options);
  if (response.ok) return response.json();

  let detail = response.statusText;
  try {
    const errorBody = await response.json();
    detail = errorBody.detail ?? detail;
  } catch {
    // Error body was not usable JSON; fall back to the status text.
  }
  throw new Error(detail);
}
37
+
38
/**
 * Replace the options of a <select> element.
 *
 * @param {HTMLSelectElement} select - Element whose options are rebuilt.
 * @param {Iterable} values - Option values, stringified for the `value` attribute.
 * @param {Object|null} labels - Optional value-to-label map; the stringified
 *   value is used as the label when no entry exists.
 */
function setSelectOptions(select, values, labels = null) {
  select.innerHTML = "";
  Array.from(values).forEach((value) => {
    const text = String(value);
    const option = document.createElement("option");
    option.value = text;
    option.textContent = labels?.[value] ?? text;
    select.appendChild(option);
  });
}
47
+
48
/**
 * Apply the loaded backend config to the form: fill the Demucs model list,
 * copy default values into every known control, then refresh the stem list
 * and the stage overview.
 */
function populateConfig() {
  setSelectOptions($("demucs_model"), config.demucs_models);

  const defaults = config.defaults;
  fields.forEach((field) => {
    const control = $(field);
    // Skip controls missing from the page and fields without a default.
    if (!control || defaults[field] === undefined) return;
    if (control.type === "checkbox") {
      control.checked = Boolean(defaults[field]);
    } else {
      control.value = defaults[field];
    }
  });

  updateStemOptions();
  renderStages(config.stages);
}
60
+
61
/**
 * Rebuild the stem dropdown for the currently selected Demucs model, keeping
 * the previous selection when the new model still offers it.
 */
function updateStemOptions() {
  const stemSelect = $("stem");
  const model = $("demucs_model").value || config.defaults.demucs_model;
  // Used when the config has no stem list for this model.
  const fallbackStems = ["drums", "bass", "other", "vocals", "all"];
  const stems = config.demucs_stems[model] ?? fallbackStems;
  const previous = stemSelect.value || config.defaults.stem;

  setSelectOptions(stemSelect, stems);
  stemSelect.value = stems.includes(previous) ? previous : stems[0];
}
68
+
69
/**
 * Read every known control into a plain params object.
 *
 * Checkboxes contribute booleans, number inputs contribute numbers, and all
 * other controls contribute their raw string value. Fields with no matching
 * element are omitted.
 */
function collectParams() {
  const readControl = (control) => {
    switch (control.type) {
      case "checkbox":
        return control.checked;
      case "number":
        return Number(control.value);
      default:
        return control.value;
    }
  };

  const entries = fields
    .map((field) => [field, $(field)])
    .filter(([, control]) => control)
    .map(([field, control]) => [field, readControl(control)]);
  return Object.fromEntries(entries);
}
80
+
81
/**
 * Render the pipeline stage list.
 *
 * Stage fields come from the backend and may contain arbitrary text, so every
 * interpolated value is HTML-escaped. Otherwise a quote would break the
 * `title` attribute and markup in the text would be interpreted by the page.
 *
 * @param {Array<Object>} stages - Stage records with `status`, `label`,
 *   `detail` and `duration_sec` fields.
 */
function renderStages(stages = []) {
  const entities = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
  const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => entities[ch]);

  $("stageList").innerHTML = stages.map((stage) => `
    <div class="stage ${escapeHtml(stage.status)}" title="${escapeHtml(stage.detail || "")}">
      <span class="badge"></span>
      <div><strong>${escapeHtml(stage.label)}</strong><small>${escapeHtml(stage.detail || stage.status)}</small></div>
      <time>${escapeHtml(fmtSec(stage.duration_sec))}</time>
    </div>
  `).join("");
}
90
+
91
/**
 * Draw the amplitude envelope and onset markers on the waveform canvas.
 *
 * The overview is remembered on `window.__lastOverview` so the resize handler
 * can redraw it. The canvas backing store is resized for the device pixel
 * ratio, and drawing then happens in CSS pixels.
 */
function drawWaveform(overview) {
  window.__lastOverview = overview;

  const canvas = $("waveform");
  const ctx = canvas.getContext("2d");
  const ratio = window.devicePixelRatio || 1;
  const rect = canvas.getBoundingClientRect();
  canvas.width = Math.max(1, Math.floor(rect.width * ratio));
  canvas.height = Math.max(160, Math.floor(160 * ratio));
  ctx.scale(ratio, ratio);

  const w = rect.width;
  const h = 160;

  // Background wash.
  ctx.clearRect(0, 0, w, h);
  ctx.fillStyle = "rgba(139,211,255,.045)";
  ctx.fillRect(0, 0, w, h);

  const env = overview?.envelope ?? [];
  if (!env.length) return;

  const mid = h / 2;
  const lastIndex = Math.max(1, env.length - 1);
  const xAt = (index) => (index / lastIndex) * w;
  const reach = (level) => Math.min(1, level) * (h * 0.42);

  // Mirrored envelope: upper edge left-to-right, lower edge right-to-left.
  ctx.strokeStyle = "rgba(139,211,255,.92)";
  ctx.lineWidth = 1.4;
  ctx.beginPath();
  ctx.moveTo(xAt(0), mid - reach(env[0]));
  for (let i = 1; i < env.length; i += 1) {
    ctx.lineTo(xAt(i), mid - reach(env[i]));
  }
  for (let i = env.length - 1; i >= 0; i -= 1) {
    ctx.lineTo(xAt(i), mid + reach(env[i]));
  }
  ctx.closePath();
  ctx.fillStyle = "rgba(139,211,255,.28)";
  ctx.fill();
  ctx.stroke();

  // One vertical marker per detected onset.
  ctx.strokeStyle = "rgba(200,165,255,.55)";
  ctx.lineWidth = 1;
  const totalSec = Math.max(overview.duration_sec, 0.001);
  (overview.onsets ?? []).forEach((onset) => {
    const x = (onset.time_sec / totalSec) * w;
    ctx.beginPath();
    ctx.moveTo(x, 10);
    ctx.lineTo(x, h - 10);
    ctx.stroke();
  });
}
137
+
138
/**
 * Render a finished job: summary line, waveform, download links, audio
 * players and the per-sample table. Does nothing when the job has no result.
 *
 * Values interpolated into markup (labels, classifications, URLs) are
 * HTML-escaped because they originate from the backend and may contain
 * characters that would otherwise break attributes or inject markup.
 */
function renderResult(job) {
  const result = job.result;
  if (!result) return;

  const entities = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
  const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => entities[ch]);

  // A missing realtime factor should not abort rendering of the rest.
  const factor = Number(result.realtime_factor);
  const rtf = Number.isFinite(factor) ? factor.toFixed(2) : "—";
  $("resultSummary").textContent = `${result.hit_count} hits → ${result.cluster_count} samples · BPM ${result.bpm ?? "—"} · ${fmtSec(result.duration_sec)} total · ${rtf}× realtime`;
  drawWaveform(result.overview);

  const fileUrls = result.file_urls ?? {};
  const labels = { archive: "Sample pack ZIP", midi: "MIDI", stem: "Stem WAV", reconstruction: "Reconstruction WAV" };
  $("downloads").innerHTML = Object.entries(fileUrls)
    .map(([key, url]) => `<a href="${escapeHtml(url)}" download>${escapeHtml(labels[key] ?? key)}</a>`)
    .join("");
  $("stemAudio").src = fileUrls.stem ?? "";
  $("reconAudio").src = fileUrls.reconstruction ?? "";

  const tbody = $("samplesTable").querySelector("tbody");
  tbody.innerHTML = (result.samples ?? []).map((sample) => `
    <tr>
      <td>${escapeHtml(sample.label)}</td>
      <td>${escapeHtml(sample.classification)}</td>
      <td>${escapeHtml(sample.hits)}</td>
      <td>${escapeHtml(sample.score)}</td>
      <td>${escapeHtml(sample.duration_ms)} ms</td>
      <td>${escapeHtml(sample.first_onset_sec)} s</td>
      <td><a href="${escapeHtml(sample.url)}" download>WAV</a></td>
    </tr>
  `).join("");
}
164
+
165
/**
 * Reflect a job snapshot in the UI: status pill, stage list and logs, plus
 * either the results (on "complete") or the error message and traceback
 * (on "error").
 */
function renderJob(job) {
  const logText = (job.logs ?? []).join("\n");
  const idSuffix = job.id ? ` · ${job.id}` : "";

  $("jobPill").textContent = `${job.status}${idSuffix}`;
  renderStages(job.stages ?? []);
  $("logs").textContent = logText;

  switch (job.status) {
    case "complete":
      renderResult(job);
      break;
    case "error":
      $("resultSummary").textContent = `Extraction failed: ${job.error}`;
      $("logs").textContent = `${logText}\n\n${job.traceback ?? ""}`;
      break;
    default:
      break;
  }
}
175
+
176
/**
 * Poll a job every 800 ms until it completes or fails, rendering each snapshot.
 *
 * The first poll runs immediately. The interval is only started if that poll
 * did not already reach a terminal state, and a tick is skipped while the
 * previous request is still in flight, so slow responses cannot overlap or
 * render out of order.
 *
 * @param {string} id - Job identifier returned by POST /api/jobs.
 */
async function pollJob(id) {
  if (activePoll) clearInterval(activePoll);
  activePoll = null;

  let timer = null;
  let finished = false;
  let inFlight = false;

  // Stop polling and hand control back to the user.
  const finish = () => {
    finished = true;
    if (timer !== null) {
      clearInterval(timer);
      // Do not clobber a newer poll that replaced this one.
      if (activePoll === timer) activePoll = null;
    }
    $("runButton").disabled = !selectedFile;
  };

  const tick = async () => {
    if (finished || inFlight) return;
    inFlight = true;
    try {
      const job = await api(`/api/jobs/${id}`);
      renderJob(job);
      if (["complete", "error"].includes(job.status)) finish();
    } catch (error) {
      finish();
      $("resultSummary").textContent = error.message;
    } finally {
      inFlight = false;
    }
  };

  await tick();
  // Previously the interval was started unconditionally, causing one extra
  // request and re-render after the job had already finished.
  if (finished) return;
  timer = setInterval(tick, 800);
  activePoll = timer;
}
197
+
198
/**
 * Upload the selected file with the current parameters, then follow the
 * created job until it finishes. The run button stays disabled while the
 * upload is in progress and is re-enabled if starting the job fails.
 */
async function runExtraction() {
  if (!selectedFile) return;

  const runButton = $("runButton");
  runButton.disabled = true;
  $("jobPill").textContent = "uploading";
  $("logs").textContent = "Uploading source and starting extraction…";

  const payload = new FormData();
  payload.append("file", selectedFile, selectedFile.name);
  payload.append("params", JSON.stringify(collectParams()));

  try {
    const created = await api("/api/jobs", { method: "POST", body: payload });
    renderJob(created);
    await pollJob(created.id);
  } catch (error) {
    runButton.disabled = false;
    $("resultSummary").textContent = error.message;
  }
}
215
+
216
/**
 * Record the chosen source file and update the dropzone text, the run button
 * and the audio preview.
 *
 * The preview plays the file through a blob URL. The previous blob URL is
 * revoked before a new one is created so repeated selections do not keep
 * every earlier file alive, and the preview is hidden when the selection is
 * cleared instead of continuing to show the old file.
 *
 * @param {File|null} file - Newly selected file, or null to clear.
 */
function setFile(file) {
  selectedFile = file;
  $("dropTitle").textContent = file ? file.name : "Drop audio here or click to browse";
  $("dropMeta").textContent = file ? `${(file.size / 1024 / 1024).toFixed(2)} MB · ${file.type || "audio"}` : "No file selected";
  $("runButton").disabled = !file;

  const preview = $("sourcePreview");
  const previousUrl = preview.getAttribute("src") ?? "";
  if (previousUrl.startsWith("blob:")) URL.revokeObjectURL(previousUrl);

  if (file) {
    preview.hidden = false;
    preview.src = URL.createObjectURL(file);
  } else {
    preview.hidden = true;
    preview.removeAttribute("src");
  }
}
226
+
227
/**
 * Start the page: probe the backend, load its config into the form, and show
 * the resulting health state. Any failure is reported as "Offline" with the
 * error message.
 */
async function boot() {
  try {
    // The health probe goes first so an unreachable backend fails fast.
    await api("/api/health");
    const loadedConfig = await api("/api/config");
    config = loadedConfig;
    populateConfig();
    setHealth(true, "Ready", "Backend online");
  } catch (error) {
    setHealth(false, "Offline", error.message);
  }
}
237
+
238
// --- Event wiring -----------------------------------------------------------

$("demucs_model").addEventListener("change", updateStemOptions);
$("fileInput").addEventListener("change", (event) => setFile(event.target.files?.[0] ?? null));
$("runButton").addEventListener("click", runExtraction);

// "Fast full-mix" preset: bypass Demucs and use a small cluster range.
$("useFastButton").addEventListener("click", () => {
  const fastPreset = { stem: "all", demucs_shifts: 0, target_min: 4, target_max: 16 };
  for (const [controlId, value] of Object.entries(fastPreset)) {
    $(controlId).value = value;
  }
});

$("clearCacheButton").addEventListener("click", async () => {
  const logs = $("logs");
  try {
    await api("/api/cache/clear", { method: "POST" });
    logs.textContent = "Pipeline cache cleared.";
  } catch (error) {
    logs.textContent = error.message;
  }
});

// Drag-and-drop: highlight while a drag hovers, clear on leave/drop, and
// suppress the browser default for all four events.
const dropzone = $("dropzone");
const dragHighlight = { dragenter: true, dragover: true, dragleave: false, drop: false };
for (const [eventName, highlighted] of Object.entries(dragHighlight)) {
  dropzone.addEventListener(eventName, (event) => {
    event.preventDefault();
    dropzone.classList.toggle("dragging", highlighted);
  });
}
dropzone.addEventListener("drop", (event) => setFile(event.dataTransfer.files?.[0] ?? null));

// Redraw the last waveform at the new canvas size.
window.addEventListener("resize", () => {
  const lastOverview = window.__lastOverview;
  if (lastOverview) drawWaveform(lastOverview);
});

boot();
web/index.html ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
6
+ <title>Drum Sample Extractor</title>
7
+ <link rel="stylesheet" href="/web/styles.css" />
8
+ </head>
9
+ <body>
10
+ <div class="shell">
11
+ <header class="hero">
12
+ <div>
13
+ <p class="eyebrow">Sample extraction workstation</p>
14
+ <h1>Extract reusable drum samples from one audio file.</h1>
15
+ <p class="lede">Upload a track, isolate or bypass the stem, detect hits, cluster similar transients, export WAVs, MIDI, reconstruction audio, and a complete sample pack.</p>
16
+ </div>
17
+ <div class="hero-card" aria-live="polite">
18
+ <span class="status-dot" id="healthDot"></span>
19
+ <div>
20
+ <strong id="healthText">Connecting</strong>
21
+ <span id="healthSubtext">FastAPI backend</span>
22
+ </div>
23
+ </div>
24
+ </header>
25
+
26
+ <main class="workspace">
27
+ <section class="panel ingest-panel">
28
+ <div class="panel-heading">
29
+ <div>
30
+ <h2>1. Source</h2>
31
+ <p>Drop a WAV, MP3, FLAC, AIFF, or OGG file. Use <code>all</code> stem for fast iteration without Demucs.</p>
32
+ </div>
33
+ </div>
34
+ <label class="dropzone" id="dropzone">
35
+ <input id="fileInput" type="file" accept="audio/*,.wav,.mp3,.flac,.aiff,.ogg,.m4a" />
36
+ <span class="drop-icon">↥</span>
37
+ <strong id="dropTitle">Drop audio here or click to browse</strong>
38
+ <small id="dropMeta">No file selected</small>
39
+ </label>
40
+ <audio id="sourcePreview" controls hidden></audio>
41
+ </section>
42
+
43
+ <section class="panel controls-panel">
44
+ <div class="panel-heading">
45
+ <div>
46
+ <h2>2. Extraction controls</h2>
47
+ <p>Defaults favor quick full-song extraction. Tighten thresholds after reviewing the timeline.</p>
48
+ </div>
49
+ <button id="clearCacheButton" class="ghost-button" type="button">Clear cache</button>
50
+ </div>
51
+
52
+ <div class="control-grid">
53
+ <label>Stem
54
+ <select id="stem"></select>
55
+ </label>
56
+ <label>Demucs model
57
+ <select id="demucs_model"></select>
58
+ </label>
59
+ <label>Shifts
60
+ <input id="demucs_shifts" type="number" min="0" max="8" step="1" />
61
+ </label>
62
+ <label>Overlap
63
+ <input id="demucs_overlap" type="number" min="0" max="0.9" step="0.05" />
64
+ </label>
65
+ <label>Onset mode
66
+ <select id="onset_mode">
67
+ <option value="auto">auto / multiband</option>
68
+ <option value="percussive">percussive</option>
69
+ <option value="harmonic">harmonic</option>
70
+ <option value="broadband">broadband</option>
71
+ </select>
72
+ </label>
73
+ <label>Onset delta
74
+ <input id="onset_delta" type="number" min="0.001" max="1" step="0.01" />
75
+ </label>
76
+ <label>Energy threshold dB
77
+ <input id="energy_threshold_db" type="number" min="-100" max="0" step="1" />
78
+ </label>
79
+ <label>Minimum gap seconds
80
+ <input id="min_gap" type="number" min="0.001" max="1" step="0.005" />
81
+ </label>
82
+ <label>Pre-pad seconds
83
+ <input id="pre_pad" type="number" min="0" max="0.25" step="0.001" />
84
+ </label>
85
+ <label>Min duration seconds
86
+ <input id="min_dur" type="number" min="0.001" max="10" step="0.005" />
87
+ </label>
88
+ <label>Max duration seconds
89
+ <input id="max_dur" type="number" min="0.01" max="10" step="0.1" />
90
+ </label>
91
+ <label>NCC threshold
92
+ <input id="ncc_threshold" type="number" min="0" max="1" step="0.01" />
93
+ </label>
94
+ <label>Attack window ms
95
+ <input id="attack_ms" type="number" min="1" max="250" step="1" />
96
+ </label>
97
+ <label>Mel prefilter
98
+ <input id="mel_threshold" type="number" min="0" max="1" step="0.01" />
99
+ </label>
100
+ <label>Linkage
101
+ <select id="linkage">
102
+ <option value="average">average</option>
103
+ <option value="complete">complete</option>
104
+ <option value="single">single</option>
105
+ </select>
106
+ </label>
107
+ <label>Target min clusters
108
+ <input id="target_min" type="number" min="0" max="256" step="1" />
109
+ </label>
110
+ <label>Target max clusters
111
+ <input id="target_max" type="number" min="0" max="256" step="1" />
112
+ </label>
113
+ <label>MIDI grid
114
+ <select id="subdivision">
115
+ <option value="8">8th</option>
116
+ <option value="16">16th</option>
117
+ <option value="32">32nd</option>
118
+ <option value="64">64th</option>
119
+ </select>
120
+ </label>
121
+ </div>
122
+
123
+ <div class="toggles">
124
+ <label><input id="synthesize" type="checkbox" /> synthesize alternates</label>
125
+ <label><input id="quantize_midi" type="checkbox" /> quantize MIDI</label>
126
+ </div>
127
+
128
+ <div class="actions">
129
+ <button id="runButton" class="primary-button" type="button" disabled>Extract samples</button>
130
+ <button id="useFastButton" class="secondary-button" type="button">Use fast full-mix mode</button>
131
+ </div>
132
+ </section>
133
+
134
+ <section class="panel progress-panel">
135
+ <div class="panel-heading">
136
+ <div>
137
+ <h2>3. Pipeline</h2>
138
+ <p>Stage timings are captured per run. Stem separation is deliberately isolated because it dominates offline extraction.</p>
139
+ </div>
140
+ <span class="job-pill" id="jobPill">idle</span>
141
+ </div>
142
+ <div id="stageList" class="stage-list"></div>
143
+ <pre id="logs" class="logs" aria-live="polite"></pre>
144
+ </section>
145
+
146
+ <section class="panel result-panel">
147
+ <div class="panel-heading">
148
+ <div>
149
+ <h2>4. Results</h2>
150
+ <p id="resultSummary">Run extraction to populate samples, timing, MIDI, reconstruction, and downloads.</p>
151
+ </div>
152
+ </div>
153
+ <canvas id="waveform" class="waveform" height="160"></canvas>
154
+ <div class="downloads" id="downloads"></div>
155
+ <div class="audio-grid">
156
+ <label>Stem audio<audio id="stemAudio" controls></audio></label>
157
+ <label>Reconstruction<audio id="reconAudio" controls></audio></label>
158
+ </div>
159
+ <div class="table-wrap">
160
+ <table id="samplesTable">
161
+ <thead>
162
+ <tr>
163
+ <th>Sample</th><th>Class</th><th>Hits</th><th>Score</th><th>Duration</th><th>First hit</th><th>File</th>
164
+ </tr>
165
+ </thead>
166
+ <tbody></tbody>
167
+ </table>
168
+ </div>
169
+ </section>
170
+ </main>
171
+ </div>
172
+ <script type="module" src="/web/app.js"></script>
173
+ </body>
174
+ </html>
web/styles.css ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ :root {
2
+ color-scheme: dark;
3
+ --bg: #08090d;
4
+ --panel: rgba(18, 22, 32, 0.84);
5
+ --panel-strong: rgba(28, 34, 48, 0.92);
6
+ --line: rgba(255, 255, 255, 0.1);
7
+ --muted: #8b93a7;
8
+ --text: #eef2ff;
9
+ --accent: #8bd3ff;
10
+ --accent-2: #c8a5ff;
11
+ --good: #55e6a5;
12
+ --bad: #ff6d7a;
13
+ --warn: #ffca6b;
14
+ --shadow: 0 24px 90px rgba(0,0,0,.38);
15
+ }
16
+ * { box-sizing: border-box; }
17
+ html, body { margin: 0; min-height: 100%; background: radial-gradient(circle at 20% 0%, rgba(139,211,255,.20), transparent 30rem), radial-gradient(circle at 88% 8%, rgba(200,165,255,.18), transparent 28rem), var(--bg); color: var(--text); font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "SF Pro Display", "Segoe UI", sans-serif; }
18
+ button, input, select { font: inherit; }
19
+ code { color: var(--accent); }
20
+ .shell { width: min(1520px, calc(100% - 32px)); margin: 0 auto; padding: 32px 0 56px; }
21
+ .hero { display: grid; grid-template-columns: 1fr auto; gap: 24px; align-items: end; margin-bottom: 24px; }
22
+ .eyebrow { margin: 0 0 10px; text-transform: uppercase; letter-spacing: .16em; color: var(--accent); font-size: 12px; font-weight: 800; }
23
+ h1 { margin: 0; font-size: clamp(36px, 6vw, 76px); line-height: .92; letter-spacing: -.07em; max-width: 980px; }
24
+ .lede { margin: 18px 0 0; color: #cbd3e5; font-size: 17px; max-width: 860px; line-height: 1.55; }
25
+ .hero-card { min-width: 250px; display: flex; align-items: center; gap: 14px; padding: 18px; border: 1px solid var(--line); background: rgba(255,255,255,.06); border-radius: 24px; box-shadow: var(--shadow); backdrop-filter: blur(18px); }
26
+ .hero-card strong, .hero-card span { display: block; }
27
+ .hero-card span:last-child { color: var(--muted); font-size: 13px; margin-top: 3px; }
28
+ .status-dot { width: 12px; height: 12px; border-radius: 999px; background: var(--warn); box-shadow: 0 0 26px currentColor; }
29
+ .status-dot.ok { background: var(--good); }
30
+ .status-dot.bad { background: var(--bad); }
31
+ .workspace { display: grid; grid-template-columns: minmax(320px, .9fr) minmax(520px, 1.35fr); gap: 18px; align-items: start; }
32
+ .panel { border: 1px solid var(--line); border-radius: 28px; background: linear-gradient(180deg, var(--panel-strong), var(--panel)); box-shadow: var(--shadow); backdrop-filter: blur(22px); padding: 22px; }
33
+ .result-panel { grid-column: 1 / -1; }
34
+ .panel-heading { display: flex; align-items: flex-start; justify-content: space-between; gap: 18px; margin-bottom: 18px; }
35
+ h2 { margin: 0; font-size: 20px; letter-spacing: -.025em; }
36
+ .panel p { margin: 7px 0 0; color: var(--muted); line-height: 1.45; }
37
+ .dropzone { position: relative; display: grid; place-items: center; gap: 8px; min-height: 260px; padding: 22px; border: 1.5px dashed rgba(139,211,255,.42); border-radius: 24px; background: linear-gradient(145deg, rgba(139,211,255,.08), rgba(200,165,255,.05)); text-align: center; cursor: pointer; transition: transform .2s ease, border-color .2s ease, background .2s ease; }
38
+ .dropzone:hover, .dropzone.dragging { transform: translateY(-1px); border-color: var(--accent); background: rgba(139,211,255,.12); }
39
+ .dropzone input { position: absolute; inset: 0; opacity: 0; cursor: pointer; }
40
+ .drop-icon { width: 74px; height: 74px; display: grid; place-items: center; border-radius: 22px; background: rgba(255,255,255,.08); color: var(--accent); font-size: 42px; line-height: 1; }
41
+ .dropzone strong { font-size: 18px; }
42
+ .dropzone small { color: var(--muted); }
43
+ audio { width: 100%; margin-top: 12px; }
44
+ .control-grid { display: grid; grid-template-columns: repeat(4, minmax(0, 1fr)); gap: 12px; }
45
+ label { display: block; color: #c7d0e4; font-size: 12px; font-weight: 750; letter-spacing: .02em; }
46
+ input, select { width: 100%; margin-top: 7px; border: 1px solid var(--line); border-radius: 14px; padding: 11px 12px; color: var(--text); background: rgba(5, 7, 12, .62); outline: none; }
47
+ input:focus, select:focus { border-color: rgba(139,211,255,.8); box-shadow: 0 0 0 4px rgba(139,211,255,.12); }
48
+ .toggles { display: flex; flex-wrap: wrap; gap: 14px; margin: 16px 0 0; }
49
+ .toggles label { display: flex; align-items: center; gap: 8px; font-size: 13px; font-weight: 700; }
50
+ .toggles input { width: auto; margin: 0; }
51
+ .actions { display: flex; flex-wrap: wrap; gap: 12px; margin-top: 18px; }
52
+ button { border: 0; border-radius: 16px; padding: 12px 16px; color: var(--text); cursor: pointer; transition: transform .16s ease, opacity .16s ease, border-color .16s ease; }
53
+ button:hover:not(:disabled) { transform: translateY(-1px); }
54
+ button:disabled { opacity: .45; cursor: not-allowed; }
55
+ .primary-button { background: linear-gradient(135deg, var(--accent), var(--accent-2)); color: #07101d; font-weight: 900; }
56
+ .secondary-button, .ghost-button { border: 1px solid var(--line); background: rgba(255,255,255,.07); }
57
+ .ghost-button { padding: 9px 12px; color: #cbd3e5; }
58
+ .job-pill { display: inline-flex; align-items: center; border: 1px solid var(--line); border-radius: 999px; padding: 7px 10px; color: var(--muted); background: rgba(255,255,255,.06); font-size: 12px; }
59
+ .stage-list { display: grid; gap: 9px; }
60
+ .stage { display: grid; grid-template-columns: 24px 1fr auto; gap: 10px; align-items: center; padding: 12px; border: 1px solid var(--line); border-radius: 18px; background: rgba(0,0,0,.16); }
61
+ .stage .badge { width: 18px; height: 18px; border-radius: 999px; background: rgba(255,255,255,.16); }
62
+ .stage.running .badge { background: var(--accent); box-shadow: 0 0 22px rgba(139,211,255,.8); }
63
+ .stage.done .badge { background: var(--good); }
64
+ .stage.error .badge { background: var(--bad); }
65
+ .stage strong { display: block; font-size: 14px; }
66
+ .stage small { display: block; color: var(--muted); margin-top: 2px; }
67
+ .stage time { color: #d7def0; font-variant-numeric: tabular-nums; }
68
+ .logs { min-height: 140px; max-height: 240px; overflow: auto; border: 1px solid var(--line); border-radius: 18px; padding: 14px; margin: 14px 0 0; background: #05070b; color: #9db8c8; font-size: 12px; line-height: 1.45; white-space: pre-wrap; }
69
+ .waveform { width: 100%; min-height: 160px; border: 1px solid var(--line); border-radius: 20px; background: rgba(0,0,0,.18); margin: 4px 0 16px; }
70
+ .downloads { display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 16px; }
71
+ .downloads a, .table-wrap a { color: #07101d; text-decoration: none; font-weight: 850; background: var(--accent); border-radius: 999px; padding: 8px 11px; }
72
+ .audio-grid { display: grid; grid-template-columns: repeat(2, minmax(0, 1fr)); gap: 16px; margin-bottom: 16px; }
73
+ .table-wrap { overflow: auto; border: 1px solid var(--line); border-radius: 20px; }
74
+ table { width: 100%; border-collapse: collapse; min-width: 860px; }
75
+ th, td { text-align: left; padding: 12px 14px; border-bottom: 1px solid var(--line); font-size: 13px; }
76
+ th { position: sticky; top: 0; background: #101521; color: #aeb9ce; z-index: 1; }
77
+ td { color: #e5eaf7; }
78
+ tr:last-child td { border-bottom: 0; }
79
+ @media (max-width: 1100px) { .workspace, .hero { grid-template-columns: 1fr; } .control-grid { grid-template-columns: repeat(2, minmax(0, 1fr)); } }
80
+ @media (max-width: 680px) { .shell { width: min(100% - 20px, 1520px); padding-top: 16px; } .panel { padding: 16px; border-radius: 22px; } .control-grid, .audio-grid { grid-template-columns: 1fr; } h1 { letter-spacing: -.045em; } }