Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / transformers /main /ko /conversations.html

rtrm

26 days ago

download

raw

50.2 kB

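	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Transformers로 채팅하기&quot;,&quot;local&quot;:&quot;chatting-with-transformers&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;빠른 시작&quot;,&quot;local&quot;:&quot;quickstart&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;채팅 모델 고르기&quot;,&quot;local&quot;:&quot;choosing-a-chat-model&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;모델의 명칭과 크기&quot;,&quot;local&quot;:&quot;size-and-model-naming&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;그렇다면 어떤 채팅 모델이 가장 좋을까요?&quot;,&quot;local&quot;:&quot;but-which-chat-model-is-best&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;전문 분야&quot;,&quot;local&quot;:&quot;specialist-domains&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;파이프라인 내부는 어떻게 되어있는가?&quot;,&quot;local&quot;:&quot;what-happens-inside-the-pipeline&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;성능, 메모리와 하드웨어&quot;,&quot;local&quot;:&quot;performance-memory-and-hardware&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;메모리 고려사항&quot;,&quot;local&quot;:&quot;memory-considerations&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;성능 고려사항&quot;,&quot;local&quot;:&quot;performance-considerations&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}">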
	<link href="/docs/transformers/main/ko/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/entry/start.9aa88961.js">
	<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/scheduler.9bc65507.js">
	<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/singletons.9eec45c3.js">
	<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/index.3b203c72.js">
	<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/paths.566078f7.js">
	<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/entry/app.84fb67c3.js">
	<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/index.707bf1b6.js">
	<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/nodes/0.1c99376b.js">
	<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/nodes/12.872934fa.js">
	<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/Tip.c2ecdbf4.js">
	<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/CodeBlock.54a9f38d.js">
	<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/EditOnGithub.922df6ba.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Transformers로 채팅하기","local":"chatting-with-transformers","sections":[{"title":"빠른 시작","local":"quickstart","sections":[],"depth":2},{"title":"채팅 모델 고르기","local":"choosing-a-chat-model","sections":[{"title":"모델의 명칭과 크기","local":"size-and-model-naming","sections":[],"depth":3},{"title":"그렇다면 어떤 채팅 모델이 가장 좋을까요?","local":"but-which-chat-model-is-best","sections":[],"depth":3},{"title":"전문 분야","local":"specialist-domains","sections":[],"depth":3}],"depth":2},{"title":"파이프라인 내부는 어떻게 되어있는가?","local":"what-happens-inside-the-pipeline","sections":[],"depth":2},{"title":"성능, 메모리와 하드웨어","local":"performance-memory-and-hardware","sections":[{"title":"메모리 고려사항","local":"memory-considerations","sections":[],"depth":3},{"title":"성능 고려사항","local":"performance-considerations","sections":[],"depth":3}],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="chatting-with-transformers" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#chatting-with-transformers"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 
11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Transformers로 채팅하기</span></h1> <p data-svelte-h="svelte-1lyn2wy">이 글을 보고 있다면 <strong>채팅 모델</strong>에 대해 어느 정도 알고 계실 것입니다.
	채팅 모델이란 메세지를 주고받을 수 있는 대화형 인공지능입니다.
	대표적으로 ChatGPT가 있고, 이와 비슷하거나 더 뛰어난 오픈소스 채팅 모델이 많이 존재합니다.<br>
	이러한 모델들은 무료로 다운로드할 수 있으며, 로컬에서 실행할 수 있습니다.
	크고 무거운 모델은 고성능 하드웨어와 메모리가 필요하지만,
	저사양 GPU 혹은 일반 데스크탑이나 노트북 CPU에서도 잘 작동하는 소형 모델들도 있습니다.</p> <p data-svelte-h="svelte-5r6lin">이 가이드는 채팅 모델을 처음 사용하는 분들에게 유용할 것입니다.
	우리는 간편한 고수준(High-Level) “pipeline”을 통해 빠른 시작 가이드를 진행할 것입니다.
	가이드에는 채팅 모델을 바로 시작할 때 필요한 모든 정보가 담겨 있습니다.
	빠른 시작 가이드 이후에는 채팅 모델이 정확히 무엇인지, 적절한 모델을 선택하는 방법과,
	채팅 모델을 사용하는 각 단계의 저수준(Low-Level) 분석 등 더 자세한 정보를 다룰 것입니다.
	또한 채팅 모델의 성능과 메모리 사용을 최적화하는 방법에 대한 팁도 제공할 것입니다.</p> <h2 class="relative group"><a id="quickstart" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#quickstart"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>빠른 시작</span></h2> <p data-svelte-h="svelte-ol90wc">자세히 볼 여유가 없는 분들을 위해 간단히 요약해 보겠습니다:
	채팅 모델은 대화 메세지를 계속해서 생성해 나갑니다.
	즉, 짤막한 채팅 메세지를 모델에게 전달하면, 모델은 이를 바탕으로 응답을 추가하며 대화를 이어 나갑니다.
	이제 실제로 어떻게 작동하는지 살펴보겠습니다.
	먼저, 채팅을 만들어 보겠습니다:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->chat = [
	{<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."</span>},
	{<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Hey, can you tell me any fun things to do in New York?"</span>}
	]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1m5sj9">주목하세요, 대화를 처음 시작할 때 유저 메세지 이외에도, 별도의 <strong>시스템</strong> 메세지가 필요할 수 있습니다.
	모든 채팅 모델이 시스템 메세지를 지원하는 것은 아니지만,
	지원하는 경우에는 시스템 메세지는 대화에서 모델이 어떻게 행동해야 하는지를 지시할 수 있습니다.
	예를 들어, 유쾌하거나 진지하고자 할 때, 짧은 답변이나 긴 답변을 원할 때 등을 설정할 수 있습니다.
	시스템 메세지를 생략하거나
	“You are a helpful and intelligent AI assistant who responds to user queries.”
	와 같은 간단한 프롬프트를 사용하는 것도 가능합니다.</p> <p data-svelte-h="svelte-c0klzq">채팅을 시작했다면 대화를 이어 나가는 가장 빠른 방법은 <code>TextGenerationPipeline</code>를 사용하는 것입니다.
	한번 <code>LLaMA-3</code>를 사용하여 이를 시연해 보겠습니다.
	우선 <code>LLaMA-3</code>를 사용하기 위해서는 승인이 필요합니다. <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct" rel="nofollow">권한 신청</a>을 하고 Hugging Face 계정으로 로그인한 후에 사용할 수 있습니다.
	또한 우리는 <code>device_map="auto"</code>를 사용합니다. GPU 메모리가 충분하다면 모델이 GPU에 로드될 것입니다.
	그리고 메모리 절약을 위해 dtype을 <code>torch.bfloat16</code>으로 설정할 것입니다.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline

	pipe = pipeline(<span class="hljs-string">"text-generation"</span>, <span class="hljs-string">"meta-llama/Meta-Llama-3-8B-Instruct"</span>, torch_dtype=torch.bfloat16, device_map=<span class="hljs-string">"auto"</span>)
	response = pipe(chat, max_new_tokens=<span class="hljs-number">512</span>)
	<span class="hljs-built_in">print</span>(response[<span class="hljs-number">0</span>][<span class="hljs-string">'generated_text'</span>][-<span class="hljs-number">1</span>][<span class="hljs-string">'content'</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-xf3a8s">이후 실행을 하면 아래와 같이 출력됩니다:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->(sigh) Oh boy, you're asking me for advice? You're gonna need a map, pal! Alright,
	alright, I'll give you the lowdown. But don't say I didn't warn you, I'm a robot, not a tour guide!

	So, you wanna know what's fun to do in the Big Apple? Well, let me tell you, there's a million
	things to do, but I'll give you the highlights. First off, you gotta see the sights: the Statue of
	Liberty, Central Park, Times Square... you know, the usual tourist traps. But if you're lookin' for
	something a little more... unusual, I'd recommend checkin' out the Museum of Modern Art. It's got
	some wild stuff, like that Warhol guy's soup cans and all that jazz.

	And if you're feelin' adventurous, take a walk across the Brooklyn Bridge. Just watch out for
	those pesky pigeons, they're like little feathered thieves! (laughs) Get it? Thieves? Ah, never mind.

	Now, if you're lookin' for some serious fun, hit up the comedy clubs in Greenwich Village. You might
	even catch a glimpse of some up-and-coming comedians... or a bunch of wannabes tryin' to make it big. (winks)

	And finally, if you're feelin' like a real New Yorker, grab a slice of pizza from one of the many amazing
	pizzerias around the city. Just don't try to order a "robot-sized" slice, trust me, it won't end well. (laughs)

	So, there you have it, pal! That's my expert advice on what to do in New York. Now, if you'll
	excuse me, I've got some oil changes to attend to. (winks)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-aju71m">채팅을 계속하려면, 자신의 답장을 추가하면 됩니다.
	파이프라인에서 반환된 <code>response</code> 객체는 현재까지의 모든 채팅을 포함하고 있으므로
	메세지를 추가하고 다시 전달하기만 하면 됩니다.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->chat = response[<span class="hljs-number">0</span>][<span class="hljs-string">'generated_text'</span>]
	chat.append(
	{<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Wait, what's so wild about soup cans?"</span>}
	)
	response = pipe(chat, max_new_tokens=<span class="hljs-number">512</span>)
	<span class="hljs-built_in">print</span>(response[<span class="hljs-number">0</span>][<span class="hljs-string">'generated_text'</span>][-<span class="hljs-number">1</span>][<span class="hljs-string">'content'</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-xf3a8s">이후 실행을 하면 아래와 같이 출력됩니다:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->(laughs) Oh, you're killin' me, pal! You don't get it, do you? Warhol's soup cans are like, art, man!
	It's like, he took something totally mundane, like a can of soup, and turned it into a masterpiece. It's
	like, "Hey, look at me, I'm a can of soup, but I'm also a work of art!"
	(sarcastically) Oh, yeah, real original, Andy.

	But, you know, back in the '60s, it was like, a big deal. People were all about challenging the
	status quo, and Warhol was like, the king of that. He took the ordinary and made it extraordinary.
	And, let me tell you, it was like, a real game-changer. I mean, who would've thought that a can of soup could be art? (laughs)

	But, hey, you're not alone, pal. I mean, I'm a robot, and even I don't get it. (winks)
	But, hey, that's what makes art, art, right? (laughs)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-gjjdfd">이 튜토리얼의 후반부에서는 성능과 메모리 관리,
	그리고 사용자의 필요에 맞는 채팅 모델 선택과 같은 구체적인 주제들을 다룰 것입니다.</p> <h2 class="relative group"><a id="choosing-a-chat-model" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#choosing-a-chat-model"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>채팅 모델 고르기</span></h2> <p data-svelte-h="svelte-1ckbdi7"><a href="https://huggingface.co/models?pipeline_tag=text-generation&sort=trending" rel="nofollow">Hugging Face Hub</a>는 채팅 모델을 다양하게 제공하고 있습니다.
	처음 사용하는 사람에게는 모델을 선택하기가 어려울지 모릅니다.
	하지만 걱정하지 마세요! 두 가지만 명심하면 됩니다:</p> <ul data-svelte-h="svelte-aux23i"><li>모델의 크기는 실행 속도와 메모리에 올라올 수 있는지 여부를 결정.</li> <li>모델이 생성한 출력의 품질.</li></ul> <p data-svelte-h="svelte-uvpj0b">일반적으로 이러한 요소들은 상관관계가 있습니다. 더 큰 모델일수록 더 뛰어난 성능을 보이는 경향이 있지만, 동일한 크기의 모델이라도 유의미한 차이가 날 수 있습니다!</p> <h3 class="relative group"><a id="size-and-model-naming" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#size-and-model-naming"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>모델의 명칭과 크기</span></h3> <p data-svelte-h="svelte-1y00u2q">모델의 크기는 모델 이름에 있는 숫자로 쉽게 알 수 있습니다.
	예를 들어, “8B” 또는 “70B”와 같은 숫자는 모델의 <strong>파라미터</strong> 수를 나타냅니다.
	양자화된 경우가 아니라면, 파라미터 하나당 약 2바이트의 메모리가 필요하다고 예상 가능합니다.
	따라서 80억 개의 파라미터를 가진 “8B” 모델은 16GB의 메모리를 차지하며, 추가적인 오버헤드를 위한 약간의 여유가 필요합니다.
	이는 3090이나 4090과 같은 24GB의 메모리를 갖춘 하이엔드 GPU에 적합합니다.</p> <p data-svelte-h="svelte-a32lal">일부 채팅 모델은 “Mixture of Experts” 모델입니다.
	이러한 모델은 크기를 “8x7B” 또는 “141B-A35B”와 같이 다르게 표시하곤 합니다.
	숫자가 다소 모호하다 느껴질 수 있지만, 첫 번째 경우에는 약 560억(8x7B) 개의 파라미터가 있고,
	두 번째 경우에는 약 1,410억 개의 파라미터가 있다고 해석할 수 있습니다.</p> <p data-svelte-h="svelte-aki18t">양자화는 파라미터당 메모리 사용량을 8비트, 4비트, 또는 그 이하로 줄이는 데 사용됩니다.
	이 주제에 대해서는 아래의 <a href="#memory-considerations">메모리 고려사항</a> 챕터에서 더 자세히 다룰 예정입니다.</p> <h3 class="relative group"><a id="but-which-chat-model-is-best" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#but-which-chat-model-is-best"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>그렇다면 어떤 채팅 모델이 가장 좋을까요?</span></h3> <p data-svelte-h="svelte-fksxdl">모델의 크기 외에도 고려할 점이 많습니다.
	이를 한눈에 살펴보려면 <strong>리더보드</strong>를 참고하는 것이 좋습니다.
	가장 인기 있는 리더보드 두 가지는 <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" rel="nofollow">OpenLLM Leaderboard</a>와 <a href="https://chat.lmsys.org/?leaderboard" rel="nofollow">LMSys Chatbot Arena Leaderboard</a>입니다.
	LMSys 리더보드에는 독점 모델도 포함되어 있으니,
	<code>license</code> 열에서 접근 가능한 모델을 선택한 후
	<a href="https://huggingface.co/models?pipeline_tag=text-generation&sort=trending" rel="nofollow">Hugging Face Hub</a>에서 검색해 보세요.</p> <h3 class="relative group"><a id="specialist-domains" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#specialist-domains"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>전문 분야</span></h3> <p data-svelte-h="svelte-13gpfbb">일부 모델은 의료 또는 법률 텍스트와 같은 특정 도메인이나 비영어권 언어에 특화되어 있기도 합니다.
	이러한 도메인에서 작업할 경우 특화된 모델이 좋은 성능을 보일 수 있습니다.
	하지만 항상 그럴 것이라 단정하기는 힘듭니다.
	특히 모델의 크기가 작거나 오래된 모델인 경우,
	최신 범용 모델이 더 뛰어날 수 있습니다.
	다행히도 <a href="https://huggingface.co/blog/leaderboard-medicalllm" rel="nofollow">domain-specific leaderboards</a>가 점차 등장하고 있어, 특정 도메인에 최고의 모델을 쉽게 찾을 수 있을 것입니다.</p> <h2 class="relative group"><a id="what-happens-inside-the-pipeline" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#what-happens-inside-the-pipeline"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>파이프라인 내부는 어떻게 되어있는가?</span></h2> <p data-svelte-h="svelte-1vt75iw">위의 빠른 시작에서는 고수준(High-Level) 파이프라인을 사용하였습니다.
	이는 간편한 방법이지만, 유연성은 떨어집니다.
	이제 더 저수준(Low-Level) 접근 방식을 통해 대화에 포함된 각 단계를 살펴보겠습니다.
	코드 샘플로 시작한 후 이를 분석해 보겠습니다:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM, AutoTokenizer
	<span class="hljs-keyword">import</span> torch

	<span class="hljs-comment"># 입력값을 사전에 준비해 놓습니다</span>
	chat = [
	{<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."</span>},
	{<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Hey, can you tell me any fun things to do in New York?"</span>}
	]

	<span class="hljs-comment"># 1: 모델과 토크나이저를 불러옵니다</span>
	model = AutoModelForCausalLM.from_pretrained(<span class="hljs-string">"meta-llama/Meta-Llama-3-8B-Instruct"</span>, device_map=<span class="hljs-string">"auto"</span>, torch_dtype=torch.bfloat16)
	tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">"meta-llama/Meta-Llama-3-8B-Instruct"</span>)

	<span class="hljs-comment"># 2: 채팅 템플릿을 적용합니다</span>
	formatted_chat = tokenizer.apply_chat_template(chat, tokenize=<span class="hljs-literal">False</span>, add_generation_prompt=<span class="hljs-literal">True</span>)
	<span class="hljs-built_in">print</span>(<span class="hljs-string">"Formatted chat:\n"</span>, formatted_chat)

	<span class="hljs-comment"># 3: 채팅을 토큰화합니다 (바로 이전 과정에서 tokenize=True로 설정하면 한꺼번에 처리할 수 있습니다)</span>
	inputs = tokenizer(formatted_chat, return_tensors=<span class="hljs-string">"pt"</span>, add_special_tokens=<span class="hljs-literal">False</span>)
	<span class="hljs-comment"># 토큰화된 입력값을 모델이 올라와 있는 기기(CPU/GPU)로 옮깁니다.</span>
	inputs = {key: tensor.to(model.device) <span class="hljs-keyword">for</span> key, tensor <span class="hljs-keyword">in</span> inputs.items()}
	<span class="hljs-built_in">print</span>(<span class="hljs-string">"Tokenized inputs:\n"</span>, inputs)

	<span class="hljs-comment"># 4: 모델로부터 응답을 생성합니다</span>
	outputs = model.generate(**inputs, max_new_tokens=<span class="hljs-number">512</span>, temperature=<span class="hljs-number">0.1</span>)
	<span class="hljs-built_in">print</span>(<span class="hljs-string">"Generated tokens:\n"</span>, outputs)

	<span class="hljs-comment"># 5: 모델이 출력한 토큰을 다시 문자열로 디코딩합니다</span>
	decoded_output = tokenizer.decode(outputs[<span class="hljs-number">0</span>][inputs[<span class="hljs-string">'input_ids'</span>].size(<span class="hljs-number">1</span>):], skip_special_tokens=<span class="hljs-literal">True</span>)
	<span class="hljs-built_in">print</span>(<span class="hljs-string">"Decoded output:\n"</span>, decoded_output)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-p3608d">여기에는 각 부분이 자체 문서가 될 수 있을 만큼 많은 내용이 담겨 있습니다!
	너무 자세히 설명하기보다는 넓은 개념을 다루고, 세부 사항은 링크된 문서에서 다루겠습니다.
	주요 단계는 다음과 같습니다:</p> <ol data-svelte-h="svelte-p7ntaj"><li><a href="https://huggingface.co/learn/nlp-course/en/chapter2/3" rel="nofollow">모델</a>과 <a href="https://huggingface.co/learn/nlp-course/en/chapter2/4?fw=pt" rel="nofollow">토크나이저</a>를 Hugging Face Hub에서 로드합니다.</li> <li>대화는 토크나이저의 <a href="https://huggingface.co/docs/transformers/main/en/chat_templating" rel="nofollow">채팅 템플릿</a>을 사용하여 양식을 구성합니다.</li> <li>구성된 채팅은 토크나이저를 사용하여 <a href="https://huggingface.co/learn/nlp-course/en/chapter2/4" rel="nofollow">토큰화</a>됩니다.</li> <li>모델에서 응답을 <a href="https://huggingface.co/docs/transformers/en/llm_tutorial" rel="nofollow">생성</a>합니다.</li> <li>모델이 출력한 토큰을 다시 문자열로 디코딩합니다.</li></ol> <h2 class="relative group"><a id="performance-memory-and-hardware" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#performance-memory-and-hardware"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>성능, 메모리와 하드웨어</span></h2> <p data-svelte-h="svelte-1mafhws">이제 대부분의 머신 러닝 작업이 GPU에서 실행된다는 것을 아실 겁니다.
	다소 느리기는 해도 CPU에서 채팅 모델이나 언어 모델로부터 텍스트를 생성하는 것도 가능합니다.
	하지만 모델을 GPU 메모리에 올려놓을 수만 있다면, GPU를 사용하는 것이 일반적으로 더 선호되는 방식입니다.</p> <h3 class="relative group"><a id="memory-considerations" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#memory-considerations"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>메모리 고려사항</span></h3> <p data-svelte-h="svelte-j9g4ik">기본적으로, <code>TextGenerationPipeline</code>이나 <code>AutoModelForCausalLM</code>과 같은
	Hugging Face 클래스는 모델을 <code>float32</code> 정밀도(Precision)로 로드합니다.
	이는 파라미터당 4바이트(32비트)를 필요로 하므로,
	80억 개의 파라미터를 가진 “8B” 모델은 약 32GB의 메모리를 필요로 한다는 것을 의미합니다.
	하지만 이는 낭비일 수 있습니다!
	대부분의 최신 언어 모델은 파라미터당 2바이트를 사용하는 “bfloat16” 정밀도(Precision)로 학습됩니다.
	하드웨어가 이를 지원하는 경우(Nvidia 30xx/Axxx 이상),
	<code>torch_dtype</code> 파라미터로 위와 같이 <code>bfloat16</code> 정밀도(Precision)로 모델을 로드할 수 있습니다.</p> <p data-svelte-h="svelte-1az7lwm">또한, 16비트보다 더 낮은 정밀도(Precision)로 모델을 압축하는
	“양자화(quantization)” 방법을 사용할 수도 있습니다.
	이 방법은 모델의 가중치를 손실 압축하여 각 파라미터를 8비트,
	4비트 또는 그 이하로 줄일 수 있습니다.
	특히 4비트에서 모델의 출력이 부정적인 영향을 받을 수 있지만,
	더 크고 강력한 채팅 모델을 메모리에 올리기 위해 이 같은 트레이드오프를 감수할 가치가 있는 경우가 많습니다.
	이제 <code>bitsandbytes</code>를 사용하여 이를 실제로 확인해 보겠습니다:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM, BitsAndBytesConfig

	quantization_config = BitsAndBytesConfig(load_in_8bit=<span class="hljs-literal">True</span>) <span class="hljs-comment"># You can also try load_in_4bit</span>
	model = AutoModelForCausalLM.from_pretrained(<span class="hljs-string">"meta-llama/Meta-Llama-3-8B-Instruct"</span>, device_map=<span class="hljs-string">"auto"</span>, quantization_config=quantization_config)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-15qgxk1">위의 작업은 <code>pipeline</code> API에도 적용 가능합니다:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline, BitsAndBytesConfig

	quantization_config = BitsAndBytesConfig(load_in_8bit=<span class="hljs-literal">True</span>) <span class="hljs-comment"># You can also try load_in_4bit</span>
	pipe = pipeline(<span class="hljs-string">"text-generation"</span>, <span class="hljs-string">"meta-llama/Meta-Llama-3-8B-Instruct"</span>, device_map=<span class="hljs-string">"auto"</span>, model_kwargs={<span class="hljs-string">"quantization_config"</span>: quantization_config})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-wvz1q9"><code>bitsandbytes</code> 외에도 모델을 양자화하는 다양한 방법이 있습니다.
	자세한 내용은 <a href="./quantization">Quantization guide</a>를 참조해 주세요.</p> <h3 class="relative group"><a id="performance-considerations" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#performance-considerations"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>성능 고려사항</span></h3> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-a1u20q">언어 모델 성능과 최적화에 대한 보다 자세한 가이드는 <a href="./llm_optims">LLM Inference Optimization</a>을 참고하세요.</p></div> <p data-svelte-h="svelte-101gqbq">일반적으로 더 큰 채팅 모델은 메모리를 더 많이 요구하고,
	속도도 느려지는 경향이 있습니다. 구체적으로 말하자면,
	채팅 모델에서 텍스트를 생성할 때는 컴퓨팅 파워보다 <strong>메모리 대역폭</strong>이 병목 현상을 일으키는 경우가 많습니다.
	이는 모델이 토큰을 하나씩 생성할 때마다 파라미터를 메모리에서 읽어야 하기 때문입니다.
	따라서 채팅 모델에서 초당 생성할 수 있는 토큰 수는 모델이 위치한 메모리의 대역폭을 모델의 크기로 나눈 값에 비례합니다.</p> <p data-svelte-h="svelte-32i07p">위의 예제에서는 모델이 bfloat16 정밀도(Precision)로 로드될 때 용량이 약 16GB였습니다.
	이 경우, 모델이 생성하는 각 토큰마다 16GB를 메모리에서 읽어야 한다는 의미입니다.
	총 메모리 대역폭은 소비자용 CPU에서는 20-100GB/sec,
	소비자용 GPU나 Intel Xeon, AMD Threadripper/Epyc,
	애플 실리콘과 같은 특수 CPU에서는 200-900GB/sec,
	데이터 센터 GPU인 Nvidia A100이나 H100에서는 최대 2-3TB/sec에 이를 수 있습니다.
	이러한 정보는 각자 하드웨어에서 생성 속도를 예상하는 데 도움이 될 것입니다.</p> <p data-svelte-h="svelte-1oj9hxd">따라서 텍스트 생성 속도를 개선하는 가장 간단한 방법은 모델의 크기를 줄이거나(주로 양자화를 사용),
	메모리 대역폭이 더 높은 하드웨어를 사용하는 것입니다.
	이 대역폭 병목 현상을 피할 수 있는 고급 기술도 여러 가지 있습니다.
	가장 일반적인 방법은 <a href="https://huggingface.co/blog/assisted-generation" rel="nofollow">보조 생성</a>, 즉 “추측 샘플링”이라고도 불리는 기술입니다.
	이 기술은 종종 더 작은 “초안 모델”을 사용하여 여러 개의 미래 토큰을 한 번에 추측한 후,
	채팅 모델로 생성 결과를 확인합니다.
	만약 채팅 모델이 추측을 확인하면, 한 번의 순전파에서 여러 개의 토큰을 생성할 수 있어
	병목 현상이 크게 줄어들고 생성 속도가 빨라집니다.</p> <p data-svelte-h="svelte-1hhan9g">마지막으로, “Mixture of Experts” (MoE) 모델에 대해서도 짚고 넘어가겠습니다.
	Mixtral, Qwen-MoE, DBRX와 같은 인기 있는 채팅 모델이 바로 MoE 모델입니다.
	이 모델들은 토큰을 생성할 때 모든 파라미터를 사용하지는 않습니다.
	이로 인해 MoE 모델은 전체 크기가 상당히 클 수 있지만,
	차지하는 메모리 대역폭은 낮은 편입니다.
	따라서 동일한 크기의 일반 “조밀한(Dense)” 모델보다 몇 배 빠를 수 있습니다.
	하지만 보조 생성과 같은 기술은 MoE 모델에서 비효율적일 수 있습니다.
	새로운 추측된 토큰이 추가되면서 더 많은 파라미터가 활성화되기 때문에,
	MoE 아키텍처가 제공하는 속도 이점이 상쇄될 수 있습니다.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/ko/conversations.md" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>

	<script>
	// Client-side bootstrap for this page: publishes the path configuration,
	// then loads the two entry modules and hands control to `kit.start`.
	// The bare block keeps the `const` bindings below out of the global scope.
	{
	// Assigned without a declaration, so in this classic (non-module) script it
	// becomes a global. NOTE(review): presumably read by the imported bundles
	// under this exact hashed name — confirm before renaming.
	__sveltekit_1hrx8 = {
	assets: "/docs/transformers/main/ko",
	base: "/docs/transformers/main/ko",
	env: {}
	};

	// Must be read synchronously: `document.currentScript` is only set while
	// this <script> element is executing, not inside the later `.then` callback.
	const element = document.currentScript.parentElement;

	// Two null placeholders, the same count as `node_ids` below.
	// NOTE(review): presumably one slot per node — confirm.
	const data = [null,null];

	// Fetch both entry modules in parallel; the callback runs once both resolve.
	Promise.all([
	import("/docs/transformers/main/ko/_app/immutable/entry/start.9aa88961.js"),
	import("/docs/transformers/main/ko/_app/immutable/entry/app.84fb67c3.js")
	]).then(([kit, app]) => {
	// Start the app against the parent element of this script tag.
	kit.start(app, element, {
	node_ids: [0, 12],
	data,
	form: null,
	error: null
	});
	});
	}
	</script>

Xet Storage Details

Size:: 50.2 kB
Xet hash:: f1df60d4ce7b1f29ff355df5ffd58635f1381711543d9de57526e9d52ab59c51

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.