Buckets:
| import{s as Se,o as Xe,n as ge}from"../chunks/scheduler.94020406.js";import{S as ve,i as ke,g as p,s as n,r as J,m as El,E as Ye,h as i,f as e,c as a,j as Lt,u as o,x as M,n as $l,k as Sl,y as xe,a as s,v as y,d as c,t as w,w as r}from"../chunks/index.a08c8d92.js";import{T as qt}from"../chunks/Tip.3b0aeee8.js";import{C as d}from"../chunks/CodeBlock.f1fae7de.js";import{H as h,E as Ae}from"../chunks/getInferenceSnippets.3bf24426.js";function He(f){let m;return{c(){m=El("어떤 파이프라인에서도 [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) 를 사용하는 것은 검은색 이미지를 생성할 수 있고, 순수한 float16 정밀도를 사용하는 것보다 항상 느리기 때문에 사용하지 않는 것이 좋습니다.")},l(U){m=$l(U,"어떤 파이프라인에서도 [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) 를 사용하는 것은 검은색 이미지를 생성할 수 있고, 순수한 float16 정밀도를 사용하는 것보다 항상 느리기 때문에 사용하지 않는 것이 좋습니다.")},m(U,u){s(U,m,u)},d(U){U&&e(m)}}}function Fe(f){let m;return{c(){m=El(`Attention slicing은 모델이 하나 이상의 어텐션 헤드를 사용하는 한, 배치 크기가 1인 경우에도 유용합니다. | |
| 하나 이상의 어텐션 헤드가 있는 경우 *QK^T* 어텐션 매트릭스는 상당한 양의 메모리를 절약할 수 있는 각 헤드에 대해 순차적으로 계산될 수 있습니다.`)},l(U){m=$l(U,`Attention slicing은 모델이 하나 이상의 어텐션 헤드를 사용하는 한, 배치 크기가 1인 경우에도 유용합니다. | |
| 하나 이상의 어텐션 헤드가 있는 경우 *QK^T* 어텐션 매트릭스는 상당한 양의 메모리를 절약할 수 있는 각 헤드에 대해 순차적으로 계산될 수 있습니다.`)},m(U,u){s(U,m,u)},d(U){U&&e(m)}}}function ze(f){let m,U,u="모델 오프로딩",j;return{c(){m=El("또 다른 최적화 방법인 "),U=p("a"),U.textContent=u,j=El("을 사용하는 것을 고려하십시오. 이는 훨씬 빠르지만 메모리 절약이 크지는 않습니다."),this.h()},l(T){m=$l(T,"또 다른 최적화 방법인 "),U=i(T,"A",{href:!0,"data-svelte-h":!0}),M(U)!=="svelte-zbpoyt"&&(U.textContent=u),j=$l(T,"을 사용하는 것을 고려하십시오. 이는 훨씬 빠르지만 메모리 절약이 크지는 않습니다."),this.h()},h(){Sl(U,"href","#model_offloading")},m(T,b){s(T,m,b),s(T,U,b),s(T,j,b)},p:ge,d(T){T&&(e(m),e(U),e(j))}}}function De(f){let m;return{c(){m=El("이 기능을 사용하려면 'accelerate' 버전 0.17.0 이상이 필요합니다.")},l(U){m=$l(U,"이 기능을 사용하려면 'accelerate' 버전 0.17.0 이상이 필요합니다.")},m(U,u){s(U,m,u)},d(U){U&&e(m)}}}function Le(f){let m,U,u,j,T,b,V,Ot=`메모리 또는 속도에 대해 🤗 Diffusers <em>추론</em>을 최적화하기 위한 몇 가지 기술과 아이디어를 제시합니다. | |
| 일반적으로, memory-efficient attention을 위해 <a href="https://github.com/facebookresearch/xformers" rel="nofollow">xFormers</a> 사용을 추천하기 때문에, 추천하는 <a href="xformers">설치 방법</a>을 보고 설치해 보세요.`,Xl,B,Pt="다음 설정이 성능과 메모리에 미치는 영향에 대해 설명합니다.",gl,G,Kt="<thead><tr><th></th> <th>지연시간</th> <th>속도 향상</th></tr></thead> <tbody><tr><td>별도 설정 없음</td> <td>9.50s</td> <td>x1</td></tr> <tr><td>cuDNN auto-tuner</td> <td>9.37s</td> <td>x1.01</td></tr> <tr><td>fp16</td> <td>3.61s</td> <td>x2.63</td></tr> <tr><td>Channels Last 메모리 형식</td> <td>3.30s</td> <td>x2.88</td></tr> <tr><td>traced UNet</td> <td>3.21s</td> <td>x2.96</td></tr> <tr><td>memory-efficient attention</td> <td>2.63s</td> <td>x3.61</td></tr></tbody>",vl,R,le='NVIDIA TITAN RTX에서 50 DDIM 스텝의 "a photo of an astronaut riding a horse on mars" 프롬프트로 512x512 크기의 단일 이미지를 생성하였습니다.',kl,E,Yl,$,te='<a href="https://developer.nvidia.com/cudnn" rel="nofollow">NVIDIA cuDNN</a>은 컨볼루션을 계산하는 많은 알고리즘을 지원합니다. Autotuner는 짧은 벤치마크를 실행하고 주어진 입력 크기에 대해 주어진 하드웨어에서 최고의 성능을 가진 커널을 선택합니다.',xl,_,ee="<strong>컨볼루션 네트워크</strong>를 활용하고 있기 때문에 (다른 유형들은 현재 지원되지 않음), 다음 설정을 통해 추론 전에 cuDNN autotuner를 활성화할 수 있습니다:",Al,W,Hl,N,Fl,S,se=`Ampere 및 이후 CUDA 장치에서 행렬곱 및 컨볼루션은 TensorFloat32(TF32) 모드를 사용하여 더 빠르지만 약간 덜 정확할 수 있습니다. | |
| 기본적으로 PyTorch는 컨볼루션에 대해 TF32 모드를 활성화하지만 행렬 곱셈은 활성화하지 않습니다. | |
| 네트워크에 완전한 float32 정밀도가 필요한 경우가 아니면 행렬 곱셈에 대해서도 이 설정을 활성화하는 것이 좋습니다. | |
| 이는 일반적으로 무시할 수 있는 수치의 정확도 손실이 있지만, 계산 속도를 크게 높일 수 있습니다. | |
| 그것에 대해 <a href="https://huggingface.co/docs/transformers/v4.18.0/en/performance#tf32" rel="nofollow">여기</a>서 더 읽을 수 있습니다. | |
| 추론하기 전에 다음을 추가하기만 하면 됩니다:`,zl,X,Dl,g,Ll,v,ne=`더 많은 GPU 메모리를 절약하고 더 빠른 속도를 얻기 위해 모델 가중치를 반정밀도(half precision)로 직접 불러오고 실행할 수 있습니다. | |
| 여기에는 <code>fp16</code>이라는 브랜치에 저장된 float16 버전의 가중치를 불러오고, 그 때 <code>float16</code> 유형을 사용하도록 PyTorch에 지시하는 작업이 포함됩니다.`,ql,k,Ol,C,Pl,Y,Kl,x,ae="추가 메모리 절약을 위해, 한 번에 모두 계산하는 대신 단계적으로 계산을 수행하는 슬라이스 버전의 어텐션(attention)을 사용할 수 있습니다.",lt,I,tt,A,pe="각 헤드에 대해 순차적으로 어텐션 계산을 수행하려면, 다음과 같이 추론 전에 파이프라인에서 <code>enable_attention_slicing()</code>를 호출하면 됩니다:",et,H,st,F,ie="추론 시간이 약 10% 느려지는 약간의 성능 저하가 있지만 이 방법을 사용하면 3.2GB 정도의 작은 VRAM으로도 Stable Diffusion을 사용할 수 있습니다!",nt,z,at,D,Me="제한된 VRAM에서 대규모 이미지 배치를 디코딩하거나 32개 이상의 이미지가 포함된 배치를 활성화하기 위해, 배치의 latent 이미지를 한 번에 하나씩 디코딩하는 슬라이스 VAE 디코드를 사용할 수 있습니다.",pt,L,Ue="이를 <code>enable_attention_slicing()</code> 또는 <code>enable_xformers_memory_efficient_attention()</code>과 결합하여 메모리 사용을 추가로 최소화할 수 있습니다.",it,q,me="VAE 디코드를 한 번에 하나씩 수행하려면 추론 전에 파이프라인에서 <code>enable_vae_slicing()</code>을 호출합니다. 예를 들어:",Mt,O,Ut,P,Je="다중 이미지 배치에서 VAE 디코드가 약간의 성능 향상이 이루어집니다. 단일 이미지 배치에서는 성능 영향은 없습니다.",mt,_l,Jt,K,oe="추가 메모리 절약을 위해 가중치를 CPU로 오프로드하고 순방향 전달을 수행할 때만 GPU로 로드할 수 있습니다.",ot,ll,ye="CPU 오프로딩을 수행하려면 <code>enable_sequential_cpu_offload()</code>를 호출하기만 하면 됩니다:",yt,tl,ct,el,ce="그러면 메모리 소비를 3GB 미만으로 줄일 수 있습니다.",wt,sl,we="참고로 이 방법은 전체 모델이 아닌 서브모듈 수준에서 작동합니다. 이는 메모리 소비를 최소화하는 가장 좋은 방법이지만 프로세스의 반복적 특성으로 인해 추론 속도가 훨씬 느립니다. 파이프라인의 UNet 구성 요소는 여러 번 실행됩니다(‘num_inference_steps’ 만큼). 매번 UNet의 서로 다른 서브모듈이 순차적으로 온로드된 다음 필요에 따라 오프로드되므로 메모리 이동 횟수가 많습니다.",rt,Z,Tt,nl,re="또한 ttention slicing과 연결해서 최소 메모리(< 2GB)로도 동작할 수 있습니다.",ut,al,dt,pl,Te='<strong>참고</strong>: ‘enable_sequential_cpu_offload()‘를 사용할 때, 미리 파이프라인을 CUDA로 이동하지 <strong>않는</strong> 것이 중요합니다.그렇지 않으면 메모리 소비의 이득이 최소화됩니다. 더 많은 정보를 위해 <a href="https://github.com/huggingface/diffusers/issues/1934" rel="nofollow">이 이슈</a>를 보세요.',ft,Wl,jt,il,ue='<a href="#sequential_offloading">순차적 CPU 오프로딩</a>은 이전 섹션에서 설명한 것처럼 많은 메모리를 보존하지만 필요에 따라 서브모듈을 GPU로 이동하고 새 모듈이 실행될 때 즉시 CPU로 반환되기 때문에 추론 속도가 느려집니다.',ht,Ml,de="전체 모델 오프로딩은 각 모델의 구성 요소인 <em>modules</em>을 처리하는 대신, 전체 모델을 GPU로 이동하는 대안입니다. 이로 인해 추론 시간에 미치는 영향은 미미하지만(파이프라인을 ‘cuda’로 이동하는 것과 비교하여) 여전히 약간의 메모리를 절약할 수 있습니다.",bt,Ul,fe=`이 시나리오에서는 파이프라인의 주요 구성 요소 중 하나만(일반적으로 텍스트 인코더, unet 및 vae) GPU에 있고, 나머지는 CPU에서 대기할 것입니다. | |
| 여러 반복을 위해 실행되는 UNet과 같은 구성 요소는 더 이상 필요하지 않을 때까지 GPU에 남아 있습니다.`,Ct,ml,je="이 기능은 아래와 같이 파이프라인에서 <code>enable_model_cpu_offload()</code>를 호출하여 활성화할 수 있습니다.",It,Jl,Zt,ol,he="이는 추가적인 메모리 절약을 위한 attention slicing과도 호환됩니다.",Qt,yl,Vt,Q,Bt,cl,Gt,wl,be=`Channels Last 메모리 형식은 차원 순서를 보존하는 메모리에서 NCHW 텐서 배열을 대체하는 방법입니다. | |
| Channels Last 텐서는 채널이 가장 조밀한 차원이 되는 방식으로 정렬됩니다(일명 픽셀당 이미지를 저장). | |
| 현재 모든 연산자 Channels Last 형식을 지원하는 것은 아니라 성능이 저하될 수 있으므로, 사용해보고 모델에 잘 작동하는지 확인하는 것이 좋습니다.`,Rt,rl,Ce="예를 들어 파이프라인의 UNet 모델이 channels Last 형식을 사용하도록 설정하려면 다음을 사용할 수 있습니다:",Et,Tl,$t,ul,_t,dl,Ie="추적은 모델을 통해 예제 입력 텐서를 통해 실행되는데, 해당 입력이 모델의 레이어를 통과할 때 호출되는 작업을 캡처하여 실행 파일 또는 ‘ScriptFunction’이 반환되도록 하고, 이는 just-in-time 컴파일로 최적화됩니다.",Wt,fl,Ze="UNet 모델을 추적하기 위해 다음을 사용할 수 있습니다:",Nt,jl,St,hl,Qe="그 다음, 파이프라인의 <code>unet</code> 특성을 다음과 같이 추적된 모델로 바꿀 수 있습니다.",Xt,bl,gt,Cl,vt,Il,Ve=`어텐션 블록의 대역폭을 최적화하는 최근 작업으로 GPU 메모리 사용량이 크게 향상되고 향상되었습니다. | |
| @tridao의 가장 최근의 플래시 어텐션: <a href="https://github.com/HazyResearch/flash-attention" rel="nofollow">code</a>, <a href="https://huggingface.co/papers/2205.14135" rel="nofollow">paper</a>.`,kt,Zl,Be="배치 크기 1(프롬프트 1개)의 512x512 크기로 추론을 실행할 때 몇 가지 Nvidia GPU에서 얻은 속도 향상은 다음과 같습니다:",Yt,Ql,Ge="<thead><tr><th>GPU</th> <th>기준 어텐션 FP16</th> <th>메모리 효율적인 어텐션 FP16</th></tr></thead> <tbody><tr><td>NVIDIA Tesla T4</td> <td>3.5it/s</td> <td>5.5it/s</td></tr> <tr><td>NVIDIA 3060 RTX</td> <td>4.6it/s</td> <td>7.8it/s</td></tr> <tr><td>NVIDIA A10G</td> <td>8.88it/s</td> <td>15.6it/s</td></tr> <tr><td>NVIDIA RTX A6000</td> <td>11.7it/s</td> <td>21.09it/s</td></tr> <tr><td>NVIDIA TITAN RTX</td> <td>12.51it/s</td> <td>18.22it/s</td></tr> <tr><td>A100-SXM4-40GB</td> <td>18.6it/s</td> <td>29.it/s</td></tr> <tr><td>A100-SXM-80GB</td> <td>18.7it/s</td> <td>29.5it/s</td></tr></tbody>",xt,Vl,Re="이를 활용하려면 다음을 만족해야 합니다:",At,Bl,Ee='<li>PyTorch > 1.12</li> <li>Cuda 사용 가능</li> <li><a href="xformers">xformers 라이브러리를 설치함</a></li>',Ht,Gl,Ft,Rl,zt,Nl,Dt;return T=new h({props:{title:"메모리와 속도",local:"메모리와-속도",headingTag:"h1"}}),E=new h({props:{title:"cuDNN auto-tuner 활성화하기",local:"cudnn-auto-tuner-활성화하기",headingTag:"h2"}}),W=new d({props:{code:"aW1wb3J0JTIwdG9yY2glMEElMEF0b3JjaC5iYWNrZW5kcy5jdWRubi5iZW5jaG1hcmslMjAlM0QlMjBUcnVl",highlighted:`<span class="hljs-keyword">import</span> torch | |
| torch.backends.cudnn.benchmark = <span class="hljs-literal">True</span>`,wrap:!1}}),N=new h({props:{title:"fp32 대신 tf32 사용하기 (Ampere 및 이후 CUDA 장치들에서)",local:"fp32-대신-tf32-사용하기-ampere-및-이후-cuda-장치들에서",headingTag:"h3"}}),X=new d({props:{code:"aW1wb3J0JTIwdG9yY2glMEElMEF0b3JjaC5iYWNrZW5kcy5jdWRhLm1hdG11bC5hbGxvd190ZjMyJTIwJTNEJTIwVHJ1ZQ==",highlighted:`<span class="hljs-keyword">import</span> torch | |
| torch.backends.cuda.matmul.allow_tf32 = <span class="hljs-literal">True</span>`,wrap:!1}}),g=new h({props:{title:"반정밀도 가중치",local:"반정밀도-가중치",headingTag:"h2"}}),k=new d({props:{code:"cGlwZSUyMCUzRCUyMFN0YWJsZURpZmZ1c2lvblBpcGVsaW5lLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjAlMjJzdGFibGUtZGlmZnVzaW9uLXYxLTUlMkZzdGFibGUtZGlmZnVzaW9uLXYxLTUlMjIlMkMlMEElMEElMjAlMjAlMjAlMjB0b3JjaF9kdHlwZSUzRHRvcmNoLmZsb2F0MTYlMkMlMEEpJTBBcGlwZSUyMCUzRCUyMHBpcGUudG8oJTIyY3VkYSUyMiklMEElMEFwcm9tcHQlMjAlM0QlMjAlMjJhJTIwcGhvdG8lMjBvZiUyMGFuJTIwYXN0cm9uYXV0JTIwcmlkaW5nJTIwYSUyMGhvcnNlJTIwb24lMjBtYXJzJTIyJTBBaW1hZ2UlMjAlM0QlMjBwaXBlKHByb21wdCkuaW1hZ2VzJTVCMCU1RA==",highlighted:`pipe = StableDiffusionPipeline.from_pretrained( | |
| <span class="hljs-string">"stable-diffusion-v1-5/stable-diffusion-v1-5"</span>, | |
| torch_dtype=torch.float16, | |
| ) | |
| pipe = pipe.to(<span class="hljs-string">"cuda"</span>) | |
| prompt = <span class="hljs-string">"a photo of an astronaut riding a horse on mars"</span> | |
| image = pipe(prompt).images[<span class="hljs-number">0</span>]`,wrap:!1}}),C=new qt({props:{warning:!0,$$slots:{default:[He]},$$scope:{ctx:f}}}),Y=new h({props:{title:"추가 메모리 절약을 위한 슬라이스 어텐션",local:"추가-메모리-절약을-위한-슬라이스-어텐션",headingTag:"h2"}}),I=new qt({props:{$$slots:{default:[Fe]},$$scope:{ctx:f}}}),H=new d({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwU3RhYmxlRGlmZnVzaW9uUGlwZWxpbmUlMEElMEFwaXBlJTIwJTNEJTIwU3RhYmxlRGlmZnVzaW9uUGlwZWxpbmUuZnJvbV9wcmV0cmFpbmVkKCUwQSUyMCUyMCUyMCUyMCUyMnN0YWJsZS1kaWZmdXNpb24tdjEtNSUyRnN0YWJsZS1kaWZmdXNpb24tdjEtNSUyMiUyQyUwQSUwQSUyMCUyMCUyMCUyMHRvcmNoX2R0eXBlJTNEdG9yY2guZmxvYXQxNiUyQyUwQSklMEFwaXBlJTIwJTNEJTIwcGlwZS50byglMjJjdWRhJTIyKSUwQSUwQXByb21wdCUyMCUzRCUyMCUyMmElMjBwaG90byUyMG9mJTIwYW4lMjBhc3Ryb25hdXQlMjByaWRpbmclMjBhJTIwaG9yc2UlMjBvbiUyMG1hcnMlMjIlMEFwaXBlLmVuYWJsZV9hdHRlbnRpb25fc2xpY2luZygpJTBBaW1hZ2UlMjAlM0QlMjBwaXBlKHByb21wdCkuaW1hZ2VzJTVCMCU1RA==",highlighted:`<span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline | |
| pipe = StableDiffusionPipeline.from_pretrained( | |
| <span class="hljs-string">"stable-diffusion-v1-5/stable-diffusion-v1-5"</span>, | |
| torch_dtype=torch.float16, | |
| ) | |
| pipe = pipe.to(<span class="hljs-string">"cuda"</span>) | |
| prompt = <span class="hljs-string">"a photo of an astronaut riding a horse on mars"</span> | |
| pipe.enable_attention_slicing() | |
| image = pipe(prompt).images[<span class="hljs-number">0</span>]`,wrap:!1}}),z=new h({props:{title:"더 큰 배치를 위한 sliced VAE 디코드",local:"더-큰-배치를-위한-sliced-vae-디코드",headingTag:"h2"}}),O=new d({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwU3RhYmxlRGlmZnVzaW9uUGlwZWxpbmUlMEElMEFwaXBlJTIwJTNEJTIwU3RhYmxlRGlmZnVzaW9uUGlwZWxpbmUuZnJvbV9wcmV0cmFpbmVkKCUwQSUyMCUyMCUyMCUyMCUyMnN0YWJsZS1kaWZmdXNpb24tdjEtNSUyRnN0YWJsZS1kaWZmdXNpb24tdjEtNSUyMiUyQyUwQSUwQSUyMCUyMCUyMCUyMHRvcmNoX2R0eXBlJTNEdG9yY2guZmxvYXQxNiUyQyUwQSklMEFwaXBlJTIwJTNEJTIwcGlwZS50byglMjJjdWRhJTIyKSUwQSUwQXByb21wdCUyMCUzRCUyMCUyMmElMjBwaG90byUyMG9mJTIwYW4lMjBhc3Ryb25hdXQlMjByaWRpbmclMjBhJTIwaG9yc2UlMjBvbiUyMG1hcnMlMjIlMEFwaXBlLmVuYWJsZV92YWVfc2xpY2luZygpJTBBaW1hZ2VzJTIwJTNEJTIwcGlwZSglNUJwcm9tcHQlNUQlMjAqJTIwMzIpLmltYWdlcw==",highlighted:`<span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline | |
| pipe = StableDiffusionPipeline.from_pretrained( | |
| <span class="hljs-string">"stable-diffusion-v1-5/stable-diffusion-v1-5"</span>, | |
| torch_dtype=torch.float16, | |
| ) | |
| pipe = pipe.to(<span class="hljs-string">"cuda"</span>) | |
| prompt = <span class="hljs-string">"a photo of an astronaut riding a horse on mars"</span> | |
| pipe.enable_vae_slicing() | |
| images = pipe([prompt] * <span class="hljs-number">32</span>).images`,wrap:!1}}),tl=new d({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwU3RhYmxlRGlmZnVzaW9uUGlwZWxpbmUlMEElMEFwaXBlJTIwJTNEJTIwU3RhYmxlRGlmZnVzaW9uUGlwZWxpbmUuZnJvbV9wcmV0cmFpbmVkKCUwQSUyMCUyMCUyMCUyMCUyMnN0YWJsZS1kaWZmdXNpb24tdjEtNSUyRnN0YWJsZS1kaWZmdXNpb24tdjEtNSUyMiUyQyUwQSUwQSUyMCUyMCUyMCUyMHRvcmNoX2R0eXBlJTNEdG9yY2guZmxvYXQxNiUyQyUwQSklMEElMEFwcm9tcHQlMjAlM0QlMjAlMjJhJTIwcGhvdG8lMjBvZiUyMGFuJTIwYXN0cm9uYXV0JTIwcmlkaW5nJTIwYSUyMGhvcnNlJTIwb24lMjBtYXJzJTIyJTBBcGlwZS5lbmFibGVfc2VxdWVudGlhbF9jcHVfb2ZmbG9hZCgpJTBBaW1hZ2UlMjAlM0QlMjBwaXBlKHByb21wdCkuaW1hZ2VzJTVCMCU1RA==",highlighted:`<span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline | |
| pipe = StableDiffusionPipeline.from_pretrained( | |
| <span class="hljs-string">"stable-diffusion-v1-5/stable-diffusion-v1-5"</span>, | |
| torch_dtype=torch.float16, | |
| ) | |
| prompt = <span class="hljs-string">"a photo of an astronaut riding a horse on mars"</span> | |
| pipe.enable_sequential_cpu_offload() | |
| image = pipe(prompt).images[<span class="hljs-number">0</span>]`,wrap:!1}}),Z=new qt({props:{$$slots:{default:[ze]},$$scope:{ctx:f}}}),al=new d({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwU3RhYmxlRGlmZnVzaW9uUGlwZWxpbmUlMEElMEFwaXBlJTIwJTNEJTIwU3RhYmxlRGlmZnVzaW9uUGlwZWxpbmUuZnJvbV9wcmV0cmFpbmVkKCUwQSUyMCUyMCUyMCUyMCUyMnN0YWJsZS1kaWZmdXNpb24tdjEtNSUyRnN0YWJsZS1kaWZmdXNpb24tdjEtNSUyMiUyQyUwQSUwQSUyMCUyMCUyMCUyMHRvcmNoX2R0eXBlJTNEdG9yY2guZmxvYXQxNiUyQyUwQSklMEElMEFwcm9tcHQlMjAlM0QlMjAlMjJhJTIwcGhvdG8lMjBvZiUyMGFuJTIwYXN0cm9uYXV0JTIwcmlkaW5nJTIwYSUyMGhvcnNlJTIwb24lMjBtYXJzJTIyJTBBcGlwZS5lbmFibGVfc2VxdWVudGlhbF9jcHVfb2ZmbG9hZCgpJTBBcGlwZS5lbmFibGVfYXR0ZW50aW9uX3NsaWNpbmcoMSklMEElMEFpbWFnZSUyMCUzRCUyMHBpcGUocHJvbXB0KS5pbWFnZXMlNUIwJTVE",highlighted:`<span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline | |
| pipe = StableDiffusionPipeline.from_pretrained( | |
| <span class="hljs-string">"stable-diffusion-v1-5/stable-diffusion-v1-5"</span>, | |
| torch_dtype=torch.float16, | |
| ) | |
| prompt = <span class="hljs-string">"a photo of an astronaut riding a horse on mars"</span> | |
| pipe.enable_sequential_cpu_offload() | |
| pipe.enable_attention_slicing(<span class="hljs-number">1</span>) | |
| image = pipe(prompt).images[<span class="hljs-number">0</span>]`,wrap:!1}}),Jl=new d({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwU3RhYmxlRGlmZnVzaW9uUGlwZWxpbmUlMEElMEFwaXBlJTIwJTNEJTIwU3RhYmxlRGlmZnVzaW9uUGlwZWxpbmUuZnJvbV9wcmV0cmFpbmVkKCUwQSUyMCUyMCUyMCUyMCUyMnN0YWJsZS1kaWZmdXNpb24tdjEtNSUyRnN0YWJsZS1kaWZmdXNpb24tdjEtNSUyMiUyQyUwQSUyMCUyMCUyMCUyMHRvcmNoX2R0eXBlJTNEdG9yY2guZmxvYXQxNiUyQyUwQSklMEElMEFwcm9tcHQlMjAlM0QlMjAlMjJhJTIwcGhvdG8lMjBvZiUyMGFuJTIwYXN0cm9uYXV0JTIwcmlkaW5nJTIwYSUyMGhvcnNlJTIwb24lMjBtYXJzJTIyJTBBcGlwZS5lbmFibGVfbW9kZWxfY3B1X29mZmxvYWQoKSUwQWltYWdlJTIwJTNEJTIwcGlwZShwcm9tcHQpLmltYWdlcyU1QjAlNUQ=",highlighted:`<span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline | |
| pipe = StableDiffusionPipeline.from_pretrained( | |
| <span class="hljs-string">"stable-diffusion-v1-5/stable-diffusion-v1-5"</span>, | |
| torch_dtype=torch.float16, | |
| ) | |
| prompt = <span class="hljs-string">"a photo of an astronaut riding a horse on mars"</span> | |
| pipe.enable_model_cpu_offload() | |
| image = pipe(prompt).images[<span class="hljs-number">0</span>]`,wrap:!1}}),yl=new d({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwU3RhYmxlRGlmZnVzaW9uUGlwZWxpbmUlMEElMEFwaXBlJTIwJTNEJTIwU3RhYmxlRGlmZnVzaW9uUGlwZWxpbmUuZnJvbV9wcmV0cmFpbmVkKCUwQSUyMCUyMCUyMCUyMCUyMnN0YWJsZS1kaWZmdXNpb24tdjEtNSUyRnN0YWJsZS1kaWZmdXNpb24tdjEtNSUyMiUyQyUwQSUyMCUyMCUyMCUyMHRvcmNoX2R0eXBlJTNEdG9yY2guZmxvYXQxNiUyQyUwQSklMEElMEFwcm9tcHQlMjAlM0QlMjAlMjJhJTIwcGhvdG8lMjBvZiUyMGFuJTIwYXN0cm9uYXV0JTIwcmlkaW5nJTIwYSUyMGhvcnNlJTIwb24lMjBtYXJzJTIyJTBBcGlwZS5lbmFibGVfbW9kZWxfY3B1X29mZmxvYWQoKSUwQXBpcGUuZW5hYmxlX2F0dGVudGlvbl9zbGljaW5nKDEpJTBBJTBBaW1hZ2UlMjAlM0QlMjBwaXBlKHByb21wdCkuaW1hZ2VzJTVCMCU1RA==",highlighted:`<span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline | |
| pipe = StableDiffusionPipeline.from_pretrained( | |
| <span class="hljs-string">"stable-diffusion-v1-5/stable-diffusion-v1-5"</span>, | |
| torch_dtype=torch.float16, | |
| ) | |
| prompt = <span class="hljs-string">"a photo of an astronaut riding a horse on mars"</span> | |
| pipe.enable_model_cpu_offload() | |
| pipe.enable_attention_slicing(<span class="hljs-number">1</span>) | |
| image = pipe(prompt).images[<span class="hljs-number">0</span>]`,wrap:!1}}),Q=new qt({props:{$$slots:{default:[De]},$$scope:{ctx:f}}}),cl=new h({props:{title:"Channels Last 메모리 형식 사용하기",local:"channels-last-메모리-형식-사용하기",headingTag:"h2"}}),Tl=new d({props:{code:"cHJpbnQocGlwZS51bmV0LmNvbnZfb3V0LnN0YXRlX2RpY3QoKSU1QiUyMndlaWdodCUyMiU1RC5zdHJpZGUoKSklMjAlMjAlMjMlMjAoMjg4MCUyQyUyMDklMkMlMjAzJTJDJTIwMSklMEFwaXBlLnVuZXQudG8obWVtb3J5X2Zvcm1hdCUzRHRvcmNoLmNoYW5uZWxzX2xhc3QpJTIwJTIwJTIzJTIwaW4tcGxhY2UlMjAlRUMlOTclQjAlRUMlODIlQjAlMEElMjMlMjAyJUVCJUIyJTg4JUVDJUE3JUI4JTIwJUVDJUIwJUE4JUVDJTlCJTkwJUVDJTk3JTkwJUVDJTg0JTlDJTIwJUVDJThBJUE0JUVEJThBJUI4JUVCJTlEJUJDJUVDJTlEJUI0JUVCJTkzJTlDJTIwMSVFQyU5RCU4NCUyMCVFQSVCMCU4MCVFQyVBNyU4MCVFQiU4QSU5NCUyMCgyODgwJTJDJTIwMSUyQyUyMDk2MCUyQyUyMDMyMCklRUIlQTElOUMlMkMlMjAlRUMlOTclQjAlRUMlODIlQjAlRUMlOUQlQjQlMjAlRUMlOUUlOTElRUIlOEYlOTklRUQlOTUlQTglRUMlOUQlODQlMjAlRUMlQTYlOUQlRUIlQUElODUlRUQlOTUlQTklRUIlOEIlODglRUIlOEIlQTQuJTBBcHJpbnQocGlwZS51bmV0LmNvbnZfb3V0LnN0YXRlX2RpY3QoKSU1QiUyMndlaWdodCUyMiU1RC5zdHJpZGUoKSk=",highlighted:`<span class="hljs-built_in">print</span>(pipe.unet.conv_out.state_dict()[<span class="hljs-string">"weight"</span>].stride()) <span class="hljs-comment"># (2880, 9, 3, 1)</span> | |
| pipe.unet.to(memory_format=torch.channels_last) <span class="hljs-comment"># in-place 연산</span> | |
| <span class="hljs-comment"># 2번째 차원에서 스트라이드 1을 가지는 (2880, 1, 960, 320)로, 연산이 작동함을 증명합니다.</span> | |
| <span class="hljs-built_in">print</span>(pipe.unet.conv_out.state_dict()[<span class="hljs-string">"weight"</span>].stride())`,wrap:!1}}),ul=new h({props:{title:"추적(tracing)",local:"추적tracing",headingTag:"h2"}}),jl=new d({props:{code:"aW1wb3J0JTIwdGltZSUwQWltcG9ydCUyMHRvcmNoJTBBZnJvbSUyMGRpZmZ1c2VycyUyMGltcG9ydCUyMFN0YWJsZURpZmZ1c2lvblBpcGVsaW5lJTBBaW1wb3J0JTIwZnVuY3Rvb2xzJTBBJTBBJTIzJTIwdG9yY2glMjAlRUElQjglQjAlRUMlOUElQjglRUElQjglQjAlMjAlRUIlQjklODQlRUQlOTklOUMlRUMlODQlQjElRUQlOTklOTQlMEF0b3JjaC5zZXRfZ3JhZF9lbmFibGVkKEZhbHNlKSUwQSUwQSUyMyUyMCVFQiVCMyU4MCVFQyU4OCU5OCUyMCVFQyU4NCVBNCVFQyVBMCU5NSUwQW5fZXhwZXJpbWVudHMlMjAlM0QlMjAyJTBBdW5ldF9ydW5zX3Blcl9leHBlcmltZW50JTIwJTNEJTIwNTAlMEElMEElMEElMjMlMjAlRUMlOUUlODUlRUIlQTAlQTUlMjAlRUIlQjYlODglRUIlOUYlQUMlRUMlOTglQTQlRUElQjglQjAlMEFkZWYlMjBnZW5lcmF0ZV9pbnB1dHMoKSUzQSUwQSUyMCUyMCUyMCUyMHNhbXBsZSUyMCUzRCUyMHRvcmNoLnJhbmRuKCgyJTJDJTIwNCUyQyUyMDY0JTJDJTIwNjQpJTJDJTIwZGV2aWNlJTNEJTIyY3VkYSUyMiUyQyUyMGR0eXBlJTNEdG9yY2guZmxvYXQxNiklMEElMjAlMjAlMjAlMjB0aW1lc3RlcCUyMCUzRCUyMHRvcmNoLnJhbmQoMSUyQyUyMGRldmljZSUzRCUyMmN1ZGElMjIlMkMlMjBkdHlwZSUzRHRvcmNoLmZsb2F0MTYpJTIwKiUyMDk5OSUwQSUyMCUyMCUyMCUyMGVuY29kZXJfaGlkZGVuX3N0YXRlcyUyMCUzRCUyMHRvcmNoLnJhbmRuKCgyJTJDJTIwNzclMkMlMjA3NjgpJTJDJTIwZGV2aWNlJTNEJTIyY3VkYSUyMiUyQyUyMGR0eXBlJTNEdG9yY2guZmxvYXQxNiklMEElMjAlMjAlMjAlMjByZXR1cm4lMjBzYW1wbGUlMkMlMjB0aW1lc3RlcCUyQyUyMGVuY29kZXJfaGlkZGVuX3N0YXRlcyUwQSUwQSUwQXBpcGUlMjAlM0QlMjBTdGFibGVEaWZmdXNpb25QaXBlbGluZS5mcm9tX3ByZXRyYWluZWQoJTBBJTIwJTIwJTIwJTIwJTIyc3RhYmxlLWRpZmZ1c2lvbi12MS01JTJGc3RhYmxlLWRpZmZ1c2lvbi12MS01JTIyJTJDJTBBJTIwJTIwJTIwJTIwdG9yY2hfZHR5cGUlM0R0b3JjaC5mbG9hdDE2JTJDJTBBKS50byglMjJjdWRhJTIyKSUwQXVuZXQlMjAlM0QlMjBwaXBlLnVuZXQlMEF1bmV0LmV2YWwoKSUwQXVuZXQudG8obWVtb3J5X2Zvcm1hdCUzRHRvcmNoLmNoYW5uZWxzX2xhc3QpJTIwJTIwJTIzJTIwQ2hhbm5lbHMlMjBMYXN0JTIwJUVCJUE5JTk0JUVCJUFBJUE4JUVCJUE2JUFDJTIwJUVEJTk4JTk1JUVDJThCJTlEJTIwJUVDJTgyJUFDJUVDJTlBJUE5JTBBdW5ldC5mb3J3YXJkJTIwJTNEJTIwZnVuY3Rvb2xzLnBhcnRpYWwodW5ldC5mb3J3YXJkJTJDJTIwcmV0dXJuX2RpY3QlM0RGYWxzZSklMjAlMjAlMjMlMjByZXR1cm5fZGljdCUzREZhbHNlJUVDJTlEJTg0JTIwJUVBJUI4JUIwJUVCJUIzJUI4JUVBJUIwJTkyJUVDJTlDJUJDJUVCJUExJTlDJTIwJUVDJTg0JUE0JUVDJUEwJTk1JTBBJTBBJTIzJTIwJUVDJTlCJThDJUVCJUIwJThEJUVDJTk3JTg1JTBBZm9yJTIwXyUyMGluJTIwcmFuZ2UoMyklM0ElMEElMjAlMjAlMjAlMjB3aXRoJTIwdG9yY2guaW5mZXJlbmNlX21vZGUoKSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGlucHV0cyUyMCUzRCUyMGdlbmVyYXRlX2lucHV0cygpJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwb3JpZ19vdXRwdXQlMjAlM0QlMjB1bmV0KCppbnB1dHMpJTBBJTBBJTIzJTIwJUVDJUI2JTk0JUVDJUEwJTgxJTBBcHJpbnQoJTIydHJhY2luZy4uJTIyKSUwQXVuZXRfdHJhY2VkJTIwJTNEJTIwdG9yY2guaml0LnRyYWNlKHVuZXQlMkMlMjBpbnB1dHMpJTBBdW5ldF90cmFjZWQuZXZhbCgpJTBBcHJpbnQoJTIyZG9uZSUyMHRyYWNpbmclMjIpJTBBJTBBJTBBJTIzJTIwJUVDJTlCJThDJUVCJUIwJThEJUVDJTk3JTg1JTIwJUVCJUIwJThGJTIwJUVBJUI3JUI4JUVCJTlFJTk4JUVEJTk0JTg0JTIwJUVDJUI1JTlDJUVDJUEwJTgxJUVEJTk5JTk0JTBBZm9yJTIwXyUyMGluJTIwcmFuZ2UoNSklM0ElMEElMjAlMjAlMjAlMjB3aXRoJTIwdG9yY2guaW5mZXJlbmNlX21vZGUoKSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGlucHV0cyUyMCUzRCUyMGdlbmVyYXRlX2lucHV0cygpJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwb3JpZ19vdXRwdXQlMjAlM0QlMjB1bmV0X3RyYWNlZCgqaW5wdXRzKSUwQSUwQSUwQSUyMyUyMCVFQiVCMiVBNCVFQyVCOSU5OCVFQiVBNyU4OCVFRCU4MiVCOSUwQXdpdGglMjB0b3JjaC5pbmZlcmVuY2VfbW9kZSgpJTNBJTBBJTIwJTIwJTIwJTIwZm9yJTIwXyUyMGluJTIwcmFuZ2Uobl9leHBlcmltZW50cyklM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjB0b3JjaC5jdWRhLnN5bmNocm9uaXplKCklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzdGFydF90aW1lJTIwJTNEJTIwdGltZS50aW1lKCklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBmb3IlMjBfJTIwaW4lMjByYW5nZSh1bmV0X3J1bnNfcGVyX2V4cGVyaW1lbnQpJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwb3JpZ19vdXRwdXQlMjAlM0QlMjB1bmV0X3RyYWNlZCgqaW5wdXRzKSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHRvcmNoLmN1ZGEuc3luY2hyb25pemUoKSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHByaW50KGYlMjJ1bmV0JTIwdHJhY2VkJTIwaW5mZXJlbmNlJTIwdG9vayUyMCU3QnRpbWUudGltZSgpJTIwLSUyMHN0YXJ0X3RpbWUlM0EuMmYlN0QlMjBzZWNvbmRzJTIyKSUwQSUyMCUyMCUyMCUyMGZvciUyMF8lMjBpbiUyMHJhbmdlKG5fZXhwZXJpbWVudHMpJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwdG9yY2guY3VkYS5zeW5jaHJvbml6ZSgpJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwc3RhcnRfdGltZSUyMCUzRCUyMHRpbWUudGltZSgpJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwZm9yJTIwXyUyMGluJTIwcmFuZ2UodW5ldF9ydW5zX3Blcl9leHBlcmltZW50KSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMG9yaWdfb3V0cHV0JTIwJTNEJTIwdW5ldCgqaW5wdXRzKSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHRvcmNoLmN1ZGEuc3luY2hyb25pemUoKSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHByaW50KGYlMjJ1bmV0JTIwaW5mZXJlbmNlJTIwdG9vayUyMCU3QnRpbWUudGltZSgpJTIwLSUyMHN0YXJ0X3RpbWUlM0EuMmYlN0QlMjBzZWNvbmRzJTIyKSUwQSUwQSUyMyUyMCVFQiVBQSVBOCVFQiU4RCVCOCUyMCVFQyVBMCU4MCVFQyU5RSVBNSUwQXVuZXRfdHJhY2VkLnNhdmUoJTIydW5ldF90cmFjZWQucHQlMjIp",highlighted:`<span class="hljs-keyword">import</span> time | |
| <span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline | |
| <span class="hljs-keyword">import</span> functools | |
| <span class="hljs-comment"># torch 기울기 비활성화</span> | |
| torch.set_grad_enabled(<span class="hljs-literal">False</span>) | |
| <span class="hljs-comment"># 변수 설정</span> | |
| n_experiments = <span class="hljs-number">2</span> | |
| unet_runs_per_experiment = <span class="hljs-number">50</span> | |
| <span class="hljs-comment"># 입력 불러오기</span> | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">generate_inputs</span>(): | |
| sample = torch.randn((<span class="hljs-number">2</span>, <span class="hljs-number">4</span>, <span class="hljs-number">64</span>, <span class="hljs-number">64</span>), device=<span class="hljs-string">"cuda"</span>, dtype=torch.float16) | |
| timestep = torch.rand(<span class="hljs-number">1</span>, device=<span class="hljs-string">"cuda"</span>, dtype=torch.float16) * <span class="hljs-number">999</span> | |
| encoder_hidden_states = torch.randn((<span class="hljs-number">2</span>, <span class="hljs-number">77</span>, <span class="hljs-number">768</span>), device=<span class="hljs-string">"cuda"</span>, dtype=torch.float16) | |
| <span class="hljs-keyword">return</span> sample, timestep, encoder_hidden_states | |
| pipe = StableDiffusionPipeline.from_pretrained( | |
| <span class="hljs-string">"stable-diffusion-v1-5/stable-diffusion-v1-5"</span>, | |
| torch_dtype=torch.float16, | |
| ).to(<span class="hljs-string">"cuda"</span>) | |
| unet = pipe.unet | |
| unet.<span class="hljs-built_in">eval</span>() | |
| unet.to(memory_format=torch.channels_last) <span class="hljs-comment"># Channels Last 메모리 형식 사용</span> | |
| unet.forward = functools.partial(unet.forward, return_dict=<span class="hljs-literal">False</span>) <span class="hljs-comment"># return_dict=False을 기본값으로 설정</span> | |
| <span class="hljs-comment"># 워밍업</span> | |
| <span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">3</span>): | |
| <span class="hljs-keyword">with</span> torch.inference_mode(): | |
| inputs = generate_inputs() | |
| orig_output = unet(*inputs) | |
| <span class="hljs-comment"># 추적</span> | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">"tracing.."</span>) | |
| unet_traced = torch.jit.trace(unet, inputs) | |
| unet_traced.<span class="hljs-built_in">eval</span>() | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">"done tracing"</span>) | |
| <span class="hljs-comment"># 워밍업 및 그래프 최적화</span> | |
| <span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">5</span>): | |
| <span class="hljs-keyword">with</span> torch.inference_mode(): | |
| inputs = generate_inputs() | |
| orig_output = unet_traced(*inputs) | |
| <span class="hljs-comment"># 벤치마킹</span> | |
| <span class="hljs-keyword">with</span> torch.inference_mode(): | |
| <span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(n_experiments): | |
| torch.cuda.synchronize() | |
| start_time = time.time() | |
| <span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(unet_runs_per_experiment): | |
| orig_output = unet_traced(*inputs) | |
| torch.cuda.synchronize() | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"unet traced inference took <span class="hljs-subst">{time.time() - start_time:<span class="hljs-number">.2</span>f}</span> seconds"</span>) | |
| <span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(n_experiments): | |
| torch.cuda.synchronize() | |
| start_time = time.time() | |
| <span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(unet_runs_per_experiment): | |
| orig_output = unet(*inputs) | |
| torch.cuda.synchronize() | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"unet inference took <span class="hljs-subst">{time.time() - start_time:<span class="hljs-number">.2</span>f}</span> seconds"</span>) | |
| <span class="hljs-comment"># 모델 저장</span> | |
| unet_traced.save(<span class="hljs-string">"unet_traced.pt"</span>)`,wrap:!1}}),bl=new d({props:{code:"ZnJvbSUyMGRpZmZ1c2VycyUyMGltcG9ydCUyMFN0YWJsZURpZmZ1c2lvblBpcGVsaW5lJTBBaW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGF0YWNsYXNzZXMlMjBpbXBvcnQlMjBkYXRhY2xhc3MlMEElMEElMEElNDBkYXRhY2xhc3MlMEFjbGFzcyUyMFVOZXQyRENvbmRpdGlvbk91dHB1dCUzQSUwQSUyMCUyMCUyMCUyMHNhbXBsZSUzQSUyMHRvcmNoLlRlbnNvciUwQSUwQSUwQXBpcGUlMjAlM0QlMjBTdGFibGVEaWZmdXNpb25QaXBlbGluZS5mcm9tX3ByZXRyYWluZWQoJTBBJTIwJTIwJTIwJTIwJTIyc3RhYmxlLWRpZmZ1c2lvbi12MS01JTJGc3RhYmxlLWRpZmZ1c2lvbi12MS01JTIyJTJDJTBBJTIwJTIwJTIwJTIwdG9yY2hfZHR5cGUlM0R0b3JjaC5mbG9hdDE2JTJDJTBBKS50byglMjJjdWRhJTIyKSUwQSUwQSUyMyUyMGppdHRlZCUyMHVuZXQlMjAlRUMlODIlQUMlRUMlOUElQTklMEF1bmV0X3RyYWNlZCUyMCUzRCUyMHRvcmNoLmppdC5sb2FkKCUyMnVuZXRfdHJhY2VkLnB0JTIyKSUwQSUwQSUwQSUyMyUyMHBpcGUudW5ldCUyMCVFQyU4MiVBRCVFQyVBMCU5QyUwQWNsYXNzJTIwVHJhY2VkVU5ldCh0b3JjaC5ubi5Nb2R1bGUpJTNBJTBBJTIwJTIwJTIwJTIwZGVmJTIwX19pbml0X18oc2VsZiklM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzdXBlcigpLl9faW5pdF9fKCklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzZWxmLmluX2NoYW5uZWxzJTIwJTNEJTIwcGlwZS51bmV0LmNvbmZpZy5pbl9jaGFubmVscyUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHNlbGYuZGV2aWNlJTIwJTNEJTIwcGlwZS51bmV0LmRldmljZSUwQSUwQSUyMCUyMCUyMCUyMGRlZiUyMGZvcndhcmQoc2VsZiUyQyUyMGxhdGVudF9tb2RlbF9pbnB1dCUyQyUyMHQlMkMlMjBlbmNvZGVyX2hpZGRlbl9zdGF0ZXMpJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwc2FtcGxlJTIwJTNEJTIwdW5ldF90cmFjZWQobGF0ZW50X21vZGVsX2lucHV0JTJDJTIwdCUyQyUyMGVuY29kZXJfaGlkZGVuX3N0YXRlcyklNUIwJTVEJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwcmV0dXJuJTIwVU5ldDJEQ29uZGl0aW9uT3V0cHV0KHNhbXBsZSUzRHNhbXBsZSklMEElMEElMEFwaXBlLnVuZXQlMjAlM0QlMjBUcmFjZWRVTmV0KCklMEElMEF3aXRoJTIwdG9yY2guaW5mZXJlbmNlX21vZGUoKSUzQSUwQSUyMCUyMCUyMCUyMGltYWdlJTIwJTNEJTIwcGlwZSglNUJwcm9tcHQlNUQlMjAqJTIwMSUyQyUyMG51bV9pbmZlcmVuY2Vfc3RlcHMlM0Q1MCkuaW1hZ2VzJTVCMCU1RA==",highlighted:`<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline | |
| <span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> dataclasses <span class="hljs-keyword">import</span> dataclass | |
| <span class="hljs-meta">@dataclass</span> | |
| <span class="hljs-keyword">class</span> <span class="hljs-title class_">UNet2DConditionOutput</span>: | |
| sample: torch.Tensor | |
| pipe = StableDiffusionPipeline.from_pretrained( | |
| <span class="hljs-string">"stable-diffusion-v1-5/stable-diffusion-v1-5"</span>, | |
| torch_dtype=torch.float16, | |
| ).to(<span class="hljs-string">"cuda"</span>) | |
| <span class="hljs-comment"># jitted unet 사용</span> | |
| unet_traced = torch.jit.load(<span class="hljs-string">"unet_traced.pt"</span>) | |
| <span class="hljs-comment"># pipe.unet 삭제</span> | |
| <span class="hljs-keyword">class</span> <span class="hljs-title class_">TracedUNet</span>(torch.nn.Module): | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self</span>): | |
| <span class="hljs-built_in">super</span>().__init__() | |
| self.in_channels = pipe.unet.config.in_channels | |
| self.device = pipe.unet.device | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">forward</span>(<span class="hljs-params">self, latent_model_input, t, encoder_hidden_states</span>): | |
| sample = unet_traced(latent_model_input, t, encoder_hidden_states)[<span class="hljs-number">0</span>] | |
| <span class="hljs-keyword">return</span> UNet2DConditionOutput(sample=sample) | |
| pipe.unet = TracedUNet() | |
| <span class="hljs-keyword">with</span> torch.inference_mode(): | |
| image = pipe([prompt] * <span class="hljs-number">1</span>, num_inference_steps=<span class="hljs-number">50</span>).images[<span class="hljs-number">0</span>]`,wrap:!1}}),Cl=new h({props:{title:"Memory-efficient attention",local:"memory-efficient-attention",headingTag:"h2"}}),Gl=new d({props:{code:"ZnJvbSUyMGRpZmZ1c2VycyUyMGltcG9ydCUyMFN0YWJsZURpZmZ1c2lvblBpcGVsaW5lJTBBaW1wb3J0JTIwdG9yY2glMEElMEFwaXBlJTIwJTNEJTIwU3RhYmxlRGlmZnVzaW9uUGlwZWxpbmUuZnJvbV9wcmV0cmFpbmVkKCUwQSUyMCUyMCUyMCUyMCUyMnN0YWJsZS1kaWZmdXNpb24tdjEtNSUyRnN0YWJsZS1kaWZmdXNpb24tdjEtNSUyMiUyQyUwQSUyMCUyMCUyMCUyMHRvcmNoX2R0eXBlJTNEdG9yY2guZmxvYXQxNiUyQyUwQSkudG8oJTIyY3VkYSUyMiklMEElMEFwaXBlLmVuYWJsZV94Zm9ybWVyc19tZW1vcnlfZWZmaWNpZW50X2F0dGVudGlvbigpJTBBJTBBd2l0aCUyMHRvcmNoLmluZmVyZW5jZV9tb2RlKCklM0ElMEElMjAlMjAlMjAlMjBzYW1wbGUlMjAlM0QlMjBwaXBlKCUyMmElMjBzbWFsbCUyMGNhdCUyMiklMEElMEElMjMlMjAlRUMlODQlQTAlRUQlODMlOUQlM0ElMjAlRUMlOUQlQjQlRUIlQTUlQkMlMjAlRUIlQjklODQlRUQlOTklOUMlRUMlODQlQjElRUQlOTklOTQlMjAlRUQlOTUlOTglRUElQjglQjAlMjAlRUMlOUMlODQlRUQlOTUlQjQlMjAlRUIlOEIlQTQlRUMlOUQlOEMlRUMlOUQlODQlMjAlRUMlODIlQUMlRUMlOUElQTklRUQlOTUlQTAlMjAlRUMlODglOTglMjAlRUMlOUUlODglRUMlOEElQjUlRUIlOEIlODglRUIlOEIlQTQuJTBBJTIzJTIwcGlwZS5kaXNhYmxlX3hmb3JtZXJzX21lbW9yeV9lZmZpY2llbnRfYXR0ZW50aW9uKCk=",highlighted:`<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline | |
| <span class="hljs-keyword">import</span> torch | |
| pipe = StableDiffusionPipeline.from_pretrained( | |
| <span class="hljs-string">"stable-diffusion-v1-5/stable-diffusion-v1-5"</span>, | |
| torch_dtype=torch.float16, | |
| ).to(<span class="hljs-string">"cuda"</span>) | |
| pipe.enable_xformers_memory_efficient_attention() | |
| <span class="hljs-keyword">with</span> torch.inference_mode(): | |
| sample = pipe(<span class="hljs-string">"a small cat"</span>) | |
| <span class="hljs-comment"># 선택: 이를 비활성화 하기 위해 다음을 사용할 수 있습니다.</span> | |
| <span class="hljs-comment"># pipe.disable_xformers_memory_efficient_attention()</span>`,wrap:!1}}),Rl=new Ae({props:{source:"https://github.com/huggingface/diffusers/blob/main/docs/source/ko/optimization/fp16.md"}}),{c(){m=p("meta"),U=n(),u=p("p"),j=n(),J(T.$$.fragment),b=n(),V=p("p"),V.innerHTML=Ot,Xl=n(),B=p("p"),B.textContent=Pt,gl=n(),G=p("table"),G.innerHTML=Kt,vl=n(),R=p("em"),R.textContent=le,kl=n(),J(E.$$.fragment),Yl=n(),$=p("p"),$.innerHTML=te,xl=n(),_=p("p"),_.innerHTML=ee,Al=n(),J(W.$$.fragment),Hl=n(),J(N.$$.fragment),Fl=n(),S=p("p"),S.innerHTML=se,zl=n(),J(X.$$.fragment),Dl=n(),J(g.$$.fragment),Ll=n(),v=p("p"),v.innerHTML=ne,ql=n(),J(k.$$.fragment),Ol=n(),J(C.$$.fragment),Pl=n(),J(Y.$$.fragment),Kl=n(),x=p("p"),x.textContent=ae,lt=n(),J(I.$$.fragment),tt=n(),A=p("p"),A.innerHTML=pe,et=n(),J(H.$$.fragment),st=n(),F=p("p"),F.textContent=ie,nt=n(),J(z.$$.fragment),at=n(),D=p("p"),D.textContent=Me,pt=n(),L=p("p"),L.innerHTML=Ue,it=n(),q=p("p"),q.innerHTML=me,Mt=n(),J(O.$$.fragment),Ut=n(),P=p("p"),P.textContent=Je,mt=n(),_l=p("a"),Jt=El(` | |
| ## 메모리 절약을 위해 가속 기능을 사용하여 CPU로 오프로딩 | |
| `),K=p("p"),K.textContent=oe,ot=n(),ll=p("p"),ll.innerHTML=ye,yt=n(),J(tl.$$.fragment),ct=n(),el=p("p"),el.textContent=ce,wt=n(),sl=p("p"),sl.textContent=we,rt=n(),J(Z.$$.fragment),Tt=n(),nl=p("p"),nl.textContent=re,ut=n(),J(al.$$.fragment),dt=n(),pl=p("p"),pl.innerHTML=Te,ft=n(),Wl=p("a"),jt=El(` | |
| ## 빠른 추론과 메모리 메모리 절약을 위한 모델 오프로딩 | |
| `),il=p("p"),il.innerHTML=ue,ht=n(),Ml=p("p"),Ml.innerHTML=de,bt=n(),Ul=p("p"),Ul.textContent=fe,Ct=n(),ml=p("p"),ml.innerHTML=je,It=n(),J(Jl.$$.fragment),Zt=n(),ol=p("p"),ol.textContent=he,Qt=n(),J(yl.$$.fragment),Vt=n(),J(Q.$$.fragment),Bt=n(),J(cl.$$.fragment),Gt=n(),wl=p("p"),wl.textContent=be,Rt=n(),rl=p("p"),rl.textContent=Ce,Et=n(),J(Tl.$$.fragment),$t=n(),J(ul.$$.fragment),_t=n(),dl=p("p"),dl.textContent=Ie,Wt=n(),fl=p("p"),fl.textContent=Ze,Nt=n(),J(jl.$$.fragment),St=n(),hl=p("p"),hl.innerHTML=Qe,Xt=n(),J(bl.$$.fragment),gt=n(),J(Cl.$$.fragment),vt=n(),Il=p("p"),Il.innerHTML=Ve,kt=n(),Zl=p("p"),Zl.textContent=Be,Yt=n(),Ql=p("table"),Ql.innerHTML=Ge,xt=n(),Vl=p("p"),Vl.textContent=Re,At=n(),Bl=p("ul"),Bl.innerHTML=Ee,Ht=n(),J(Gl.$$.fragment),Ft=n(),J(Rl.$$.fragment),zt=n(),Nl=p("p"),this.h()},l(l){const t=Ye("svelte-u9bgzb",document.head);m=i(t,"META",{name:!0,content:!0}),t.forEach(e),U=a(l),u=i(l,"P",{}),Lt(u).forEach(e),j=a(l),o(T.$$.fragment,l),b=a(l),V=i(l,"P",{"data-svelte-h":!0}),M(V)!=="svelte-m9figs"&&(V.innerHTML=Ot),Xl=a(l),B=i(l,"P",{"data-svelte-h":!0}),M(B)!=="svelte-12e7d13"&&(B.textContent=Pt),gl=a(l),G=i(l,"TABLE",{"data-svelte-h":!0}),M(G)!=="svelte-1sy2nlq"&&(G.innerHTML=Kt),vl=a(l),R=i(l,"EM",{"data-svelte-h":!0}),M(R)!=="svelte-1iy2bqt"&&(R.textContent=le),kl=a(l),o(E.$$.fragment,l),Yl=a(l),$=i(l,"P",{"data-svelte-h":!0}),M($)!=="svelte-1q6vy6i"&&($.innerHTML=te),xl=a(l),_=i(l,"P",{"data-svelte-h":!0}),M(_)!=="svelte-1xwtu1p"&&(_.innerHTML=ee),Al=a(l),o(W.$$.fragment,l),Hl=a(l),o(N.$$.fragment,l),Fl=a(l),S=i(l,"P",{"data-svelte-h":!0}),M(S)!=="svelte-1gm9l4i"&&(S.innerHTML=se),zl=a(l),o(X.$$.fragment,l),Dl=a(l),o(g.$$.fragment,l),Ll=a(l),v=i(l,"P",{"data-svelte-h":!0}),M(v)!=="svelte-1y9jw5r"&&(v.innerHTML=ne),ql=a(l),o(k.$$.fragment,l),Ol=a(l),o(C.$$.fragment,l),Pl=a(l),o(Y.$$.fragment,l),Kl=a(l),x=i(l,"P",{"data-svelte-h":!0}),M(x)!=="svelte-97drxa"&&(x.textContent=ae),lt=a(l),o(I.$$.fragment,l),tt=a(l),A=i(l,"P",{"data-svelte-h":!0}),M(A)!=="svelte-19j5lzh"&&(A.innerHTML=pe),et=a(l),o(H.$$.fragment,l),st=a(l),F=i(l,"P",{"data-svelte-h":!0}),M(F)!=="svelte-1809mre"&&(F.textContent=ie),nt=a(l),o(z.$$.fragment,l),at=a(l),D=i(l,"P",{"data-svelte-h":!0}),M(D)!=="svelte-1klv9ve"&&(D.textContent=Me),pt=a(l),L=i(l,"P",{"data-svelte-h":!0}),M(L)!=="svelte-1bo4p0c"&&(L.innerHTML=Ue),it=a(l),q=i(l,"P",{"data-svelte-h":!0}),M(q)!=="svelte-j8mqed"&&(q.innerHTML=me),Mt=a(l),o(O.$$.fragment,l),Ut=a(l),P=i(l,"P",{"data-svelte-h":!0}),M(P)!=="svelte-1l99s96"&&(P.textContent=Je),mt=a(l),_l=i(l,"A",{name:!0}),Lt(_l).forEach(e),Jt=$l(l,` | |
| ## 메모리 절약을 위해 가속 기능을 사용하여 CPU로 오프로딩 | |
| `),K=i(l,"P",{"data-svelte-h":!0}),M(K)!=="svelte-131mxrq"&&(K.textContent=oe),ot=a(l),ll=i(l,"P",{"data-svelte-h":!0}),M(ll)!=="svelte-f0b7n3"&&(ll.innerHTML=ye),yt=a(l),o(tl.$$.fragment,l),ct=a(l),el=i(l,"P",{"data-svelte-h":!0}),M(el)!=="svelte-tablwz"&&(el.textContent=ce),wt=a(l),sl=i(l,"P",{"data-svelte-h":!0}),M(sl)!=="svelte-1tbver6"&&(sl.textContent=we),rt=a(l),o(Z.$$.fragment,l),Tt=a(l),nl=i(l,"P",{"data-svelte-h":!0}),M(nl)!=="svelte-1hfwhk5"&&(nl.textContent=re),ut=a(l),o(al.$$.fragment,l),dt=a(l),pl=i(l,"P",{"data-svelte-h":!0}),M(pl)!=="svelte-ax3hx7"&&(pl.innerHTML=Te),ft=a(l),Wl=i(l,"A",{name:!0}),Lt(Wl).forEach(e),jt=$l(l,` | |
| ## 빠른 추론과 메모리 메모리 절약을 위한 모델 오프로딩 | |
| `),il=i(l,"P",{"data-svelte-h":!0}),M(il)!=="svelte-7dwxx7"&&(il.innerHTML=ue),ht=a(l),Ml=i(l,"P",{"data-svelte-h":!0}),M(Ml)!=="svelte-1llz1y7"&&(Ml.innerHTML=de),bt=a(l),Ul=i(l,"P",{"data-svelte-h":!0}),M(Ul)!=="svelte-1hmauk"&&(Ul.textContent=fe),Ct=a(l),ml=i(l,"P",{"data-svelte-h":!0}),M(ml)!=="svelte-1gllwmo"&&(ml.innerHTML=je),It=a(l),o(Jl.$$.fragment,l),Zt=a(l),ol=i(l,"P",{"data-svelte-h":!0}),M(ol)!=="svelte-5q5hse"&&(ol.textContent=he),Qt=a(l),o(yl.$$.fragment,l),Vt=a(l),o(Q.$$.fragment,l),Bt=a(l),o(cl.$$.fragment,l),Gt=a(l),wl=i(l,"P",{"data-svelte-h":!0}),M(wl)!=="svelte-1rrl8zz"&&(wl.textContent=be),Rt=a(l),rl=i(l,"P",{"data-svelte-h":!0}),M(rl)!=="svelte-1c0oa55"&&(rl.textContent=Ce),Et=a(l),o(Tl.$$.fragment,l),$t=a(l),o(ul.$$.fragment,l),_t=a(l),dl=i(l,"P",{"data-svelte-h":!0}),M(dl)!=="svelte-1jkjfr1"&&(dl.textContent=Ie),Wt=a(l),fl=i(l,"P",{"data-svelte-h":!0}),M(fl)!=="svelte-mqbplb"&&(fl.textContent=Ze),Nt=a(l),o(jl.$$.fragment,l),St=a(l),hl=i(l,"P",{"data-svelte-h":!0}),M(hl)!=="svelte-1ewkmr2"&&(hl.innerHTML=Qe),Xt=a(l),o(bl.$$.fragment,l),gt=a(l),o(Cl.$$.fragment,l),vt=a(l),Il=i(l,"P",{"data-svelte-h":!0}),M(Il)!=="svelte-1ele8ui"&&(Il.innerHTML=Ve),kt=a(l),Zl=i(l,"P",{"data-svelte-h":!0}),M(Zl)!=="svelte-1aa24j0"&&(Zl.textContent=Be),Yt=a(l),Ql=i(l,"TABLE",{"data-svelte-h":!0}),M(Ql)!=="svelte-13acbqe"&&(Ql.innerHTML=Ge),xt=a(l),Vl=i(l,"P",{"data-svelte-h":!0}),M(Vl)!=="svelte-j9132p"&&(Vl.textContent=Re),At=a(l),Bl=i(l,"UL",{"data-svelte-h":!0}),M(Bl)!=="svelte-gqxwyg"&&(Bl.innerHTML=Ee),Ht=a(l),o(Gl.$$.fragment,l),Ft=a(l),o(Rl.$$.fragment,l),zt=a(l),Nl=i(l,"P",{}),Lt(Nl).forEach(e),this.h()},h(){Sl(m,"name","hf:doc:metadata"),Sl(m,"content",qe),Sl(_l,"name","sequential_offloading"),Sl(Wl,"name","model_offloading")},m(l,t){xe(document.head,m),s(l,U,t),s(l,u,t),s(l,j,t),y(T,l,t),s(l,b,t),s(l,V,t),s(l,Xl,t),s(l,B,t),s(l,gl,t),s(l,G,t),s(l,vl,t),s(l,R,t),s(l,kl,t),y(E,l,t),s(l,Yl,t),s(l,$,t),s(l,xl,t),s(l,_,t),s(l,Al,t),y(W,l,t),s(l,Hl,t),y(N,l,t),s(l,Fl,t),s(l,S,t),s(l,zl,t),y(X,l,t),s(l,Dl,t),y(g,l,t),s(l,Ll,t),s(l,v,t),s(l,ql,t),y(k,l,t),s(l,Ol,t),y(C,l,t),s(l,Pl,t),y(Y,l,t),s(l,Kl,t),s(l,x,t),s(l,lt,t),y(I,l,t),s(l,tt,t),s(l,A,t),s(l,et,t),y(H,l,t),s(l,st,t),s(l,F,t),s(l,nt,t),y(z,l,t),s(l,at,t),s(l,D,t),s(l,pt,t),s(l,L,t),s(l,it,t),s(l,q,t),s(l,Mt,t),y(O,l,t),s(l,Ut,t),s(l,P,t),s(l,mt,t),s(l,_l,t),s(l,Jt,t),s(l,K,t),s(l,ot,t),s(l,ll,t),s(l,yt,t),y(tl,l,t),s(l,ct,t),s(l,el,t),s(l,wt,t),s(l,sl,t),s(l,rt,t),y(Z,l,t),s(l,Tt,t),s(l,nl,t),s(l,ut,t),y(al,l,t),s(l,dt,t),s(l,pl,t),s(l,ft,t),s(l,Wl,t),s(l,jt,t),s(l,il,t),s(l,ht,t),s(l,Ml,t),s(l,bt,t),s(l,Ul,t),s(l,Ct,t),s(l,ml,t),s(l,It,t),y(Jl,l,t),s(l,Zt,t),s(l,ol,t),s(l,Qt,t),y(yl,l,t),s(l,Vt,t),y(Q,l,t),s(l,Bt,t),y(cl,l,t),s(l,Gt,t),s(l,wl,t),s(l,Rt,t),s(l,rl,t),s(l,Et,t),y(Tl,l,t),s(l,$t,t),y(ul,l,t),s(l,_t,t),s(l,dl,t),s(l,Wt,t),s(l,fl,t),s(l,Nt,t),y(jl,l,t),s(l,St,t),s(l,hl,t),s(l,Xt,t),y(bl,l,t),s(l,gt,t),y(Cl,l,t),s(l,vt,t),s(l,Il,t),s(l,kt,t),s(l,Zl,t),s(l,Yt,t),s(l,Ql,t),s(l,xt,t),s(l,Vl,t),s(l,At,t),s(l,Bl,t),s(l,Ht,t),y(Gl,l,t),s(l,Ft,t),y(Rl,l,t),s(l,zt,t),s(l,Nl,t),Dt=!0},p(l,[t]){const $e={};t&2&&($e.$$scope={dirty:t,ctx:l}),C.$set($e);const _e={};t&2&&(_e.$$scope={dirty:t,ctx:l}),I.$set(_e);const We={};t&2&&(We.$$scope={dirty:t,ctx:l}),Z.$set(We);const Ne={};t&2&&(Ne.$$scope={dirty:t,ctx:l}),Q.$set(Ne)},i(l){Dt||(c(T.$$.fragment,l),c(E.$$.fragment,l),c(W.$$.fragment,l),c(N.$$.fragment,l),c(X.$$.fragment,l),c(g.$$.fragment,l),c(k.$$.fragment,l),c(C.$$.fragment,l),c(Y.$$.fragment,l),c(I.$$.fragment,l),c(H.$$.fragment,l),c(z.$$.fragment,l),c(O.$$.fragment,l),c(tl.$$.fragment,l),c(Z.$$.fragment,l),c(al.$$.fragment,l),c(Jl.$$.fragment,l),c(yl.$$.fragment,l),c(Q.$$.fragment,l),c(cl.$$.fragment,l),c(Tl.$$.fragment,l),c(ul.$$.fragment,l),c(jl.$$.fragment,l),c(bl.$$.fragment,l),c(Cl.$$.fragment,l),c(Gl.$$.fragment,l),c(Rl.$$.fragment,l),Dt=!0)},o(l){w(T.$$.fragment,l),w(E.$$.fragment,l),w(W.$$.fragment,l),w(N.$$.fragment,l),w(X.$$.fragment,l),w(g.$$.fragment,l),w(k.$$.fragment,l),w(C.$$.fragment,l),w(Y.$$.fragment,l),w(I.$$.fragment,l),w(H.$$.fragment,l),w(z.$$.fragment,l),w(O.$$.fragment,l),w(tl.$$.fragment,l),w(Z.$$.fragment,l),w(al.$$.fragment,l),w(Jl.$$.fragment,l),w(yl.$$.fragment,l),w(Q.$$.fragment,l),w(cl.$$.fragment,l),w(Tl.$$.fragment,l),w(ul.$$.fragment,l),w(jl.$$.fragment,l),w(bl.$$.fragment,l),w(Cl.$$.fragment,l),w(Gl.$$.fragment,l),w(Rl.$$.fragment,l),Dt=!1},d(l){l&&(e(U),e(u),e(j),e(b),e(V),e(Xl),e(B),e(gl),e(G),e(vl),e(R),e(kl),e(Yl),e($),e(xl),e(_),e(Al),e(Hl),e(Fl),e(S),e(zl),e(Dl),e(Ll),e(v),e(ql),e(Ol),e(Pl),e(Kl),e(x),e(lt),e(tt),e(A),e(et),e(st),e(F),e(nt),e(at),e(D),e(pt),e(L),e(it),e(q),e(Mt),e(Ut),e(P),e(mt),e(_l),e(Jt),e(K),e(ot),e(ll),e(yt),e(ct),e(el),e(wt),e(sl),e(rt),e(Tt),e(nl),e(ut),e(dt),e(pl),e(ft),e(Wl),e(jt),e(il),e(ht),e(Ml),e(bt),e(Ul),e(Ct),e(ml),e(It),e(Zt),e(ol),e(Qt),e(Vt),e(Bt),e(Gt),e(wl),e(Rt),e(rl),e(Et),e($t),e(_t),e(dl),e(Wt),e(fl),e(Nt),e(St),e(hl),e(Xt),e(gt),e(vt),e(Il),e(kt),e(Zl),e(Yt),e(Ql),e(xt),e(Vl),e(At),e(Bl),e(Ht),e(Ft),e(zt),e(Nl)),e(m),r(T,l),r(E,l),r(W,l),r(N,l),r(X,l),r(g,l),r(k,l),r(C,l),r(Y,l),r(I,l),r(H,l),r(z,l),r(O,l),r(tl,l),r(Z,l),r(al,l),r(Jl,l),r(yl,l),r(Q,l),r(cl,l),r(Tl,l),r(ul,l),r(jl,l),r(bl,l),r(Cl,l),r(Gl,l),r(Rl,l)}}}const qe='{"title":"메모리와 속도","local":"메모리와-속도","sections":[{"title":"cuDNN auto-tuner 활성화하기","local":"cudnn-auto-tuner-활성화하기","sections":[{"title":"fp32 대신 tf32 사용하기 (Ampere 및 이후 CUDA 장치들에서)","local":"fp32-대신-tf32-사용하기-ampere-및-이후-cuda-장치들에서","sections":[],"depth":3}],"depth":2},{"title":"반정밀도 가중치","local":"반정밀도-가중치","sections":[],"depth":2},{"title":"추가 메모리 절약을 위한 슬라이스 어텐션","local":"추가-메모리-절약을-위한-슬라이스-어텐션","sections":[],"depth":2},{"title":"더 큰 배치를 위한 sliced VAE 디코드","local":"더-큰-배치를-위한-sliced-vae-디코드","sections":[],"depth":2},{"title":"Channels Last 메모리 형식 사용하기","local":"channels-last-메모리-형식-사용하기","sections":[],"depth":2},{"title":"추적(tracing)","local":"추적tracing","sections":[],"depth":2},{"title":"Memory-efficient attention","local":"memory-efficient-attention","sections":[],"depth":2}],"depth":1}';function Oe(f){return Xe(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class ss extends ve{constructor(m){super(),ke(this,m,Oe,Le,Se,{})}}export{ss as component}; | |
Xet Storage Details
- Size:
- 50 kB
- Xet hash:
- ca5783c33c4cab4ecd00f54862837b9993326eff01682271814e003ae5937d9f
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.