Buckets:
hf-doc-build/doc/diffusers/v0.18.2/en/_app/pages/api/pipelines/spectrogram_diffusion.mdx-hf-doc-builder.js
// Machine-generated, minified page module for the "Spectrogram Diffusion" docs page.
// All short identifiers are generated; the helpers imported from the vendor chunk are
// opaque from this file, so the comments below describe only how they are used here.
// NOTE(review): the c/l/h/m/p/i/o/d method set looks like a compiled Svelte fragment
// (create / claim / hydrate / mount / update / intro / outro / destroy) — confirm
// against the vendor chunk before relying on those names.
import{S as ua,i as ma,s as va,e as i,k as l,w as $,t as p,M as ga,c as r,d as t,m as h,a as s,x as M,h as f,b as o,N as wa,G as a,g as c,y as S,L as ba,q as D,o as k,B as P,v as ya}from"../../../chunks/vendor-hf-doc-builder.js";
import{D as da}from"../../../chunks/Docstring-hf-doc-builder.js";
import{C as _a}from"../../../chunks/CodeBlock-hf-doc-builder.js";
import{I as se}from"../../../chunks/IconCopyLink-hf-doc-builder.js";

/**
 * Builds the fragment object for the page: child components are instantiated up
 * front, and the returned object exposes the lifecycle methods c, l, h, m, p, i, o, d.
 *
 * @param {*} At - never read inside this function.
 * @returns {object} fragment with methods c, l, h, m, p, i, o and d.
 */
function Ea(At){
  let d,De,u,I,oe,C,Ke,ne,Oe,ke,m,U,le,R,Ye,he,et,Pe,J,z,tt,at,Ie,K,it,Ue,A,rt,F,st,ot,Ae,
    v,x,ce,H,nt,pe,lt,xe,O,Y,xt,Te,ee,ht,Ne,g,T,fe,q,ct,de,pt,Ze,N,ue,w,me,ft,dt,ve,ut,mt,
    te,vt,gt,ge,b,we,V,wt,bt,be,ye,yt,_t,ae,Et,Ge,y,Z,_e,B,$t,Ee,Mt,We,j,Ce,_,G,$e,L,St,Me,
    Dt,Re,E,X,kt,W,Q,Pt,Se,It,Je;

  // Child components: five copy-link icons (C, R, H, q, B, L), one code block (j)
  // and two docstring widgets (X, Q). `return` must stay on the same line as the
  // start of this comma expression.
  return C=new se({}),
    R=new se({}),
    H=new se({}),
    q=new se({}),
    B=new se({}),
    // `code` is the base64 of the URL-encoded snippet; `highlighted` is the same
    // snippet as pre-highlighted HTML, with the snippet's blank lines preserved.
    j=new _a({props:{code:"ZnJvbSUyMGRpZmZ1c2VycyUyMGltcG9ydCUyMFNwZWN0cm9ncmFtRGlmZnVzaW9uUGlwZWxpbmUlMkMlMjBNaWRpUHJvY2Vzc29yJTBBJTBBcGlwZSUyMCUzRCUyMFNwZWN0cm9ncmFtRGlmZnVzaW9uUGlwZWxpbmUuZnJvbV9wcmV0cmFpbmVkKCUyMmdvb2dsZSUyRm11c2ljLXNwZWN0cm9ncmFtLWRpZmZ1c2lvbiUyMiklMEFwaXBlJTIwJTNEJTIwcGlwZS50byglMjJjdWRhJTIyKSUwQXByb2Nlc3NvciUyMCUzRCUyME1pZGlQcm9jZXNzb3IoKSUwQSUwQSUyMyUyMERvd25sb2FkJTIwTUlESSUyMGZyb20lM0ElMjB3Z2V0JTIwaHR0cCUzQSUyRiUyRnd3dy5waWFuby1taWRpLmRlJTJGbWlkaXMlMkZiZWV0aG92ZW4lMkZiZWV0aG92ZW5faGFtbWVya2xhdmllcl8yLm1pZCUwQW91dHB1dCUyMCUzRCUyMHBpcGUocHJvY2Vzc29yKCUyMmJlZXRob3Zlbl9oYW1tZXJrbGF2aWVyXzIubWlkJTIyKSklMEElMEFhdWRpbyUyMCUzRCUyMG91dHB1dC5hdWRpb3MlNUIwJTVE",highlighted:`<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> SpectrogramDiffusionPipeline, MidiProcessor

pipe = SpectrogramDiffusionPipeline.from_pretrained(<span class="hljs-string">"google/music-spectrogram-diffusion"</span>)
pipe = pipe.to(<span class="hljs-string">"cuda"</span>)
processor = MidiProcessor()

<span class="hljs-comment"># Download MIDI from: wget http://www.piano-midi.de/midis/beethoven/beethoven_hammerklavier_2.mid</span>
output = pipe(processor(<span class="hljs-string">"beethoven_hammerklavier_2.mid"</span>))

audio = output.audios[<span class="hljs-number">0</span>]`}}),
    L=new se({}),
    X=new da({props:{name:"class diffusers.SpectrogramDiffusionPipeline",anchor:"diffusers.SpectrogramDiffusionPipeline",parameters:[{name:"*args",val:""},{name:"**kwargs",val:""}],source:"https://github.com/huggingface/diffusers/blob/v0.18.2/src/diffusers/utils/dummy_transformers_and_torch_and_note_seq_objects.py#L5"}}),
    Q=new da({props:{name:"__call__",anchor:"diffusers.SpectrogramDiffusionPipeline.__call__",parameters:[{name:"*args",val:""},{name:"**kwargs",val:""}]}}),
    {
      // c: builds every node from scratch with the vendor helpers (`i` gets a tag
      // name, `p` gets text, `l` takes no arguments), calls `$` on each child
      // fragment, then applies attributes through this.h().
      c(){
        d=i("meta"),De=l(),
        u=i("h1"),I=i("a"),oe=i("span"),$(C.$$.fragment),Ke=l(),ne=i("span"),Oe=p("Multi-instrument Music Synthesis with Spectrogram Diffusion"),ke=l(),
        m=i("h2"),U=i("a"),le=i("span"),$(R.$$.fragment),Ye=l(),he=i("span"),et=p("Overview"),Pe=l(),
        J=i("p"),z=i("a"),tt=p("Spectrogram Diffusion"),at=p(" by Curtis Hawthorne, Ian Simon, Adam Roberts, Neil Zeghidour, Josh Gardner, Ethan Manilow, and Jesse Engel."),Ie=l(),
        K=i("p"),it=p("An ideal music synthesizer should be both interactive and expressive, generating high-fidelity audio in realtime for arbitrary combinations of instruments and notes. Recent neural synthesizers have exhibited a tradeoff between domain-specific models that offer detailed control of only specific instruments, or raw waveform models that can train on any music but with minimal control and slow generation. In this work, we focus on a middle ground of neural synthesizers that can generate audio from MIDI sequences with arbitrary combinations of instruments in realtime. This enables training on a wide range of transcription datasets with a single model, which in turn offers note-level control of composition and instrumentation across a wide range of instruments. We use a simple two-stage process: MIDI to spectrograms with an encoder-decoder Transformer, then spectrograms to audio with a generative adversarial network (GAN) spectrogram inverter. We compare training the decoder as an autoregressive model and as a Denoising Diffusion Probabilistic Model (DDPM) and find that the DDPM approach is superior both qualitatively and as measured by audio reconstruction and Fr\xE9chet distance metrics. Given the interactivity and generality of this approach, we find this to be a promising first step towards interactive and expressive neural synthesis for arbitrary combinations of instruments and notes."),Ue=l(),
        A=i("p"),rt=p("The original codebase of this implementation can be found at "),F=i("a"),st=p("magenta/music-spectrogram-diffusion"),ot=p("."),Ae=l(),
        v=i("h2"),x=i("a"),ce=i("span"),$(H.$$.fragment),nt=l(),pe=i("span"),lt=p("Model"),xe=l(),
        O=i("p"),Y=i("img"),Te=l(),
        ee=i("p"),ht=p("As depicted above the model takes as input a MIDI file and tokenizes it into a sequence of 5 second intervals. Each tokenized interval then together with positional encodings is passed through the Note Encoder and its representation is concatenated with the previous window\u2019s generated spectrogram representation obtained via the Context Encoder. For the initial 5 second window this is set to zero. The resulting context is then used as conditioning to sample the denoised Spectrogram from the MIDI window and we concatenate this spectrogram to the final output as well as use it for the context of the next MIDI window. The process repeats till we have gone over all the MIDI inputs. Finally a MelGAN decoder converts the potentially long spectrogram to audio which is the final result of this pipeline."),Ne=l(),
        g=i("h2"),T=i("a"),fe=i("span"),$(q.$$.fragment),ct=l(),de=i("span"),pt=p("Available Pipelines:"),Ze=l(),
        N=i("table"),ue=i("thead"),w=i("tr"),me=i("th"),ft=p("Pipeline"),dt=l(),ve=i("th"),ut=p("Tasks"),mt=l(),te=i("th"),vt=p("Colab"),gt=l(),
        ge=i("tbody"),b=i("tr"),we=i("td"),V=i("a"),wt=p("pipeline_spectrogram_diffusion.py"),bt=l(),be=i("td"),ye=i("em"),yt=p("Unconditional Audio Generation"),_t=l(),ae=i("td"),Et=p("-"),Ge=l(),
        y=i("h2"),Z=i("a"),_e=i("span"),$(B.$$.fragment),$t=l(),Ee=i("span"),Mt=p("Example usage"),We=l(),
        $(j.$$.fragment),Ce=l(),
        _=i("h2"),G=i("a"),$e=i("span"),$(L.$$.fragment),St=l(),Me=i("span"),Dt=p("SpectrogramDiffusionPipeline"),Re=l(),
        E=i("div"),$(X.$$.fragment),kt=l(),W=i("div"),$(Q.$$.fragment),Pt=l(),Se=i("p"),It=p("Call self as a function."),
        this.h()
      },
      // l: same tree as c(), but every node is obtained from the container `e`
      // (or from the child list returned by `s(...)`) through `r`/`f`/`h`, and each
      // leftover child list is passed to `forEach(t)`. Ends by calling this.h().
      // NOTE(review): presumably the hydration path for server-rendered markup — confirm.
      l(e){
        const n=ga('[data-svelte="svelte-1phssyn"]',document.head);
        d=r(n,"META",{name:!0,content:!0}),n.forEach(t),De=h(e),u=r(e,"H1",{class:!0});
        var ze=s(u);
        I=r(ze,"A",{id:!0,class:!0,href:!0});
        var Tt=s(I);
        oe=r(Tt,"SPAN",{});
        var Nt=s(oe);
        M(C.$$.fragment,Nt),Nt.forEach(t),Tt.forEach(t),Ke=h(ze),ne=r(ze,"SPAN",{});
        var Zt=s(ne);
        Oe=f(Zt,"Multi-instrument Music Synthesis with Spectrogram Diffusion"),Zt.forEach(t),ze.forEach(t),ke=h(e),m=r(e,"H2",{class:!0});
        var Fe=s(m);
        U=r(Fe,"A",{id:!0,class:!0,href:!0});
        var Gt=s(U);
        le=r(Gt,"SPAN",{});
        var Wt=s(le);
        M(R.$$.fragment,Wt),Wt.forEach(t),Gt.forEach(t),Ye=h(Fe),he=r(Fe,"SPAN",{});
        var Ct=s(he);
        et=f(Ct,"Overview"),Ct.forEach(t),Fe.forEach(t),Pe=h(e),J=r(e,"P",{});
        var Ut=s(J);
        z=r(Ut,"A",{href:!0,rel:!0});
        var Rt=s(z);
        tt=f(Rt,"Spectrogram Diffusion"),Rt.forEach(t),at=f(Ut," by Curtis Hawthorne, Ian Simon, Adam Roberts, Neil Zeghidour, Josh Gardner, Ethan Manilow, and Jesse Engel."),Ut.forEach(t),Ie=h(e),K=r(e,"P",{});
        var Jt=s(K);
        it=f(Jt,"An ideal music synthesizer should be both interactive and expressive, generating high-fidelity audio in realtime for arbitrary combinations of instruments and notes. Recent neural synthesizers have exhibited a tradeoff between domain-specific models that offer detailed control of only specific instruments, or raw waveform models that can train on any music but with minimal control and slow generation. In this work, we focus on a middle ground of neural synthesizers that can generate audio from MIDI sequences with arbitrary combinations of instruments in realtime. This enables training on a wide range of transcription datasets with a single model, which in turn offers note-level control of composition and instrumentation across a wide range of instruments. We use a simple two-stage process: MIDI to spectrograms with an encoder-decoder Transformer, then spectrograms to audio with a generative adversarial network (GAN) spectrogram inverter. We compare training the decoder as an autoregressive model and as a Denoising Diffusion Probabilistic Model (DDPM) and find that the DDPM approach is superior both qualitatively and as measured by audio reconstruction and Fr\xE9chet distance metrics. Given the interactivity and generality of this approach, we find this to be a promising first step towards interactive and expressive neural synthesis for arbitrary combinations of instruments and notes."),Jt.forEach(t),Ue=h(e),A=r(e,"P",{});
        var He=s(A);
        rt=f(He,"The original codebase of this implementation can be found at "),F=r(He,"A",{href:!0,rel:!0});
        var zt=s(F);
        st=f(zt,"magenta/music-spectrogram-diffusion"),zt.forEach(t),ot=f(He,"."),He.forEach(t),Ae=h(e),v=r(e,"H2",{class:!0});
        var qe=s(v);
        x=r(qe,"A",{id:!0,class:!0,href:!0});
        var Ft=s(x);
        ce=r(Ft,"SPAN",{});
        var Ht=s(ce);
        M(H.$$.fragment,Ht),Ht.forEach(t),Ft.forEach(t),nt=h(qe),pe=r(qe,"SPAN",{});
        var qt=s(pe);
        lt=f(qt,"Model"),qt.forEach(t),qe.forEach(t),xe=h(e),O=r(e,"P",{});
        var Vt=s(O);
        Y=r(Vt,"IMG",{src:!0,alt:!0}),Vt.forEach(t),Te=h(e),ee=r(e,"P",{});
        var Bt=s(ee);
        ht=f(Bt,"As depicted above the model takes as input a MIDI file and tokenizes it into a sequence of 5 second intervals. Each tokenized interval then together with positional encodings is passed through the Note Encoder and its representation is concatenated with the previous window\u2019s generated spectrogram representation obtained via the Context Encoder. For the initial 5 second window this is set to zero. The resulting context is then used as conditioning to sample the denoised Spectrogram from the MIDI window and we concatenate this spectrogram to the final output as well as use it for the context of the next MIDI window. The process repeats till we have gone over all the MIDI inputs. Finally a MelGAN decoder converts the potentially long spectrogram to audio which is the final result of this pipeline."),Bt.forEach(t),Ne=h(e),g=r(e,"H2",{class:!0});
        var Ve=s(g);
        T=r(Ve,"A",{id:!0,class:!0,href:!0});
        var jt=s(T);
        fe=r(jt,"SPAN",{});
        var Lt=s(fe);
        M(q.$$.fragment,Lt),Lt.forEach(t),jt.forEach(t),ct=h(Ve),de=r(Ve,"SPAN",{});
        var Xt=s(de);
        pt=f(Xt,"Available Pipelines:"),Xt.forEach(t),Ve.forEach(t),Ze=h(e),N=r(e,"TABLE",{});
        var Be=s(N);
        ue=r(Be,"THEAD",{});
        var Qt=s(ue);
        w=r(Qt,"TR",{});
        var ie=s(w);
        me=r(ie,"TH",{});
        var Kt=s(me);
        ft=f(Kt,"Pipeline"),Kt.forEach(t),dt=h(ie),ve=r(ie,"TH",{});
        var Ot=s(ve);
        ut=f(Ot,"Tasks"),Ot.forEach(t),mt=h(ie),te=r(ie,"TH",{align:!0});
        var Yt=s(te);
        vt=f(Yt,"Colab"),Yt.forEach(t),ie.forEach(t),Qt.forEach(t),gt=h(Be),ge=r(Be,"TBODY",{});
        var ea=s(ge);
        b=r(ea,"TR",{});
        var re=s(b);
        we=r(re,"TD",{});
        var ta=s(we);
        V=r(ta,"A",{href:!0,rel:!0});
        var aa=s(V);
        wt=f(aa,"pipeline_spectrogram_diffusion.py"),aa.forEach(t),ta.forEach(t),bt=h(re),be=r(re,"TD",{});
        var ia=s(be);
        ye=r(ia,"EM",{});
        var ra=s(ye);
        yt=f(ra,"Unconditional Audio Generation"),ra.forEach(t),ia.forEach(t),_t=h(re),ae=r(re,"TD",{align:!0});
        var sa=s(ae);
        Et=f(sa,"-"),sa.forEach(t),re.forEach(t),ea.forEach(t),Be.forEach(t),Ge=h(e),y=r(e,"H2",{class:!0});
        var je=s(y);
        Z=r(je,"A",{id:!0,class:!0,href:!0});
        var oa=s(Z);
        _e=r(oa,"SPAN",{});
        var na=s(_e);
        M(B.$$.fragment,na),na.forEach(t),oa.forEach(t),$t=h(je),Ee=r(je,"SPAN",{});
        var la=s(Ee);
        Mt=f(la,"Example usage"),la.forEach(t),je.forEach(t),We=h(e),M(j.$$.fragment,e),Ce=h(e),_=r(e,"H2",{class:!0});
        var Le=s(_);
        G=r(Le,"A",{id:!0,class:!0,href:!0});
        var ha=s(G);
        $e=r(ha,"SPAN",{});
        var ca=s($e);
        M(L.$$.fragment,ca),ca.forEach(t),ha.forEach(t),St=h(Le),Me=r(Le,"SPAN",{});
        var pa=s(Me);
        Dt=f(pa,"SpectrogramDiffusionPipeline"),pa.forEach(t),Le.forEach(t),Re=h(e),E=r(e,"DIV",{class:!0});
        var Xe=s(E);
        M(X.$$.fragment,Xe),kt=h(Xe),W=r(Xe,"DIV",{class:!0});
        var Qe=s(W);
        M(Q.$$.fragment,Qe),Pt=h(Qe),Se=r(Qe,"P",{});
        var fa=s(Se);
        It=f(fa,"Call self as a function."),fa.forEach(t),Qe.forEach(t),Xe.forEach(t),this.h()
      },
      // h: applies attributes (ids, classes, hrefs, alt/align) with `o`; called at
      // the end of both c() and l().
      h(){
        o(d,"name","hf:doc:metadata"),
        o(d,"content",JSON.stringify($a)),
        o(I,"id","multiinstrument-music-synthesis-with-spectrogram-diffusion"),
        o(I,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),
        o(I,"href","#multiinstrument-music-synthesis-with-spectrogram-diffusion"),
        o(u,"class","relative group"),
        o(U,"id","overview"),
        o(U,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),
        o(U,"href","#overview"),
        o(m,"class","relative group"),
        o(z,"href","https://arxiv.org/abs/2206.05408"),
        o(z,"rel","nofollow"),
        o(F,"href","https://github.com/magenta/music-spectrogram-diffusion"),
        o(F,"rel","nofollow"),
        o(x,"id","model"),
        o(x,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),
        o(x,"href","#model"),
        o(v,"class","relative group"),
        // The `src` attribute is only written when `wa` returns a falsy value for
        // the current Y.src and the target URL.
        wa(Y.src,xt="https://storage.googleapis.com/music-synthesis-with-spectrogram-diffusion/architecture.png")||o(Y,"src",xt),
        o(Y,"alt","img"),
        o(T,"id","available-pipelines"),
        o(T,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),
        o(T,"href","#available-pipelines"),
        o(g,"class","relative group"),
        o(te,"align","center"),
        o(V,"href","https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py"),
        o(V,"rel","nofollow"),
        o(ae,"align","center"),
        o(Z,"id","example-usage"),
        o(Z,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),
        o(Z,"href","#example-usage"),
        o(y,"class","relative group"),
        o(G,"id","diffusers.SpectrogramDiffusionPipeline"),
        o(G,"class","header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full"),
        o(G,"href","#diffusers.SpectrogramDiffusionPipeline"),
        o(_,"class","relative group"),
        o(W,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),
        o(E,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8")
      },
      // m: attaches the <meta> node to document.head, inserts each top-level node
      // into target `e` relative to `n` via `c(e, node, n)`, nests children with
      // `a(parent, child)`, and passes child components to `S`. Sets Je to true.
      m(e,n){
        a(document.head,d),
        c(e,De,n),
        c(e,u,n),a(u,I),a(I,oe),S(C,oe,null),a(u,Ke),a(u,ne),a(ne,Oe),
        c(e,ke,n),
        c(e,m,n),a(m,U),a(U,le),S(R,le,null),a(m,Ye),a(m,he),a(he,et),
        c(e,Pe,n),
        c(e,J,n),a(J,z),a(z,tt),a(J,at),
        c(e,Ie,n),
        c(e,K,n),a(K,it),
        c(e,Ue,n),
        c(e,A,n),a(A,rt),a(A,F),a(F,st),a(A,ot),
        c(e,Ae,n),
        c(e,v,n),a(v,x),a(x,ce),S(H,ce,null),a(v,nt),a(v,pe),a(pe,lt),
        c(e,xe,n),
        c(e,O,n),a(O,Y),
        c(e,Te,n),
        c(e,ee,n),a(ee,ht),
        c(e,Ne,n),
        c(e,g,n),a(g,T),a(T,fe),S(q,fe,null),a(g,ct),a(g,de),a(de,pt),
        c(e,Ze,n),
        c(e,N,n),a(N,ue),a(ue,w),a(w,me),a(me,ft),a(w,dt),a(w,ve),a(ve,ut),a(w,mt),a(w,te),a(te,vt),a(N,gt),
        a(N,ge),a(ge,b),a(b,we),a(we,V),a(V,wt),a(b,bt),a(b,be),a(be,ye),a(ye,yt),a(b,_t),a(b,ae),a(ae,Et),
        c(e,Ge,n),
        c(e,y,n),a(y,Z),a(Z,_e),S(B,_e,null),a(y,$t),a(y,Ee),a(Ee,Mt),
        c(e,We,n),
        S(j,e,n),
        c(e,Ce,n),
        c(e,_,n),a(_,G),a(G,$e),S(L,$e,null),a(_,St),a(_,Me),a(Me,Dt),
        c(e,Re,n),
        c(e,E,n),S(X,E,null),a(E,kt),a(E,W),S(Q,W,null),a(W,Pt),a(W,Se),a(Se,It),
        Je=!0
      },
      // p: the update hook is the imported `ba` itself; this fragment defines no
      // update logic of its own.
      p:ba,
      // i: calls `D` on every child fragment once, guarded by the Je flag.
      i(e){
        Je||(
          D(C.$$.fragment,e),D(R.$$.fragment,e),D(H.$$.fragment,e),D(q.$$.fragment,e),D(B.$$.fragment,e),
          D(j.$$.fragment,e),D(L.$$.fragment,e),D(X.$$.fragment,e),D(Q.$$.fragment,e),
          Je=!0
        )
      },
      // o: calls `k` on every child fragment and clears the Je flag.
      o(e){
        k(C.$$.fragment,e),k(R.$$.fragment,e),k(H.$$.fragment,e),k(q.$$.fragment,e),k(B.$$.fragment,e),
        k(j.$$.fragment,e),k(L.$$.fragment,e),k(X.$$.fragment,e),k(Q.$$.fragment,e),
        Je=!1
      },
      // d: always passes the <meta> node to `t`; the top-level nodes are passed to
      // `t` only when `e` is truthy; every child component is passed to `P`.
      d(e){
        t(d),
        e&&t(De),e&&t(u),P(C),
        e&&t(ke),e&&t(m),P(R),
        e&&t(Pe),e&&t(J),
        e&&t(Ie),e&&t(K),
        e&&t(Ue),e&&t(A),
        e&&t(Ae),e&&t(v),P(H),
        e&&t(xe),e&&t(O),
        e&&t(Te),e&&t(ee),
        e&&t(Ne),e&&t(g),P(q),
        e&&t(Ze),e&&t(N),
        e&&t(Ge),e&&t(y),P(B),
        e&&t(We),P(j,e),
        e&&t(Ce),e&&t(_),P(L),
        e&&t(Re),e&&t(E),P(X),P(Q)
      }
    }
}

// Page metadata: serialized into the hf:doc:metadata <meta> tag by h() and also
// exported as `metadata`. `local` values match the heading ids set in h().
const $a={local:"multiinstrument-music-synthesis-with-spectrogram-diffusion",sections:[{local:"overview",title:"Overview"},{local:"model",title:"Model"},{local:"available-pipelines",title:"Available Pipelines:"},{local:"example-usage",title:"Example usage"},{local:"diffusers.SpectrogramDiffusionPipeline",title:"SpectrogramDiffusionPipeline"}],title:"Multi-instrument Music Synthesis with Spectrogram Diffusion"};

/**
 * Instance function passed to `ma` by the component constructor.
 * Registers a callback with `ya` that reads the `fw` query parameter from the
 * current URL and discards the result, then returns an empty array.
 *
 * @param {*} At - never read inside this function.
 * @returns {Array} always an empty array.
 */
function Ma(At){
  return ya(()=>{new URLSearchParams(window.location.search).get("fw")}),[]
}

// Page component: wires Ma (instance) and Ea (fragment) together through `ma`.
class Ia extends ua{
  constructor(d){
    super();
    ma(this,d,Ma,Ea,va,{})
  }
}

export{Ia as default,$a as metadata};
Xet Storage Details
- Size: 16.9 kB
- Xet hash: e6b1bbf379e71846e72e32735a1f14284f8ce12369a2689a7563c0f8d71fa731
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.