| <!--Copyright 2025 The HuggingFace Team. All rights reserved. | |
| Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with | |
| the License. You may obtain a copy of the License at | |
| http://www.apache.org/licenses/LICENSE-2.0 | |
| Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on | |
| an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the | |
| specific language governing permissions and limitations under the License. | |
| ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be | |
| rendered properly in your Markdown viewer. | |
| --> | |
| <div style="float: right;"> | |
| <div class="flex flex-wrap space-x-1"> | |
| <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white"> | |
| <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white"> | |
| </div> | |
| </div> | |
| # Gemma 3 [[gemma3]] | |
| [Gemma 3](https://goo.gle/Gemma3Report)는 사전 훈련된 버전과 지시문 조정 버전을 갖춘 멀티모달 모델로, 1B, 4B, 12B, 27B 매개변수로 제공됩니다. 아키텍처는 이전 Gemma 버전과 대부분 동일합니다. 주요 차이점은 모든 글로벌 셀프 어텐션 레이어마다 5개의 로컬 슬라이딩 윈도우 셀프 어텐션 레이어를 번갈아 사용하는 점, 128K 토큰의 더 긴 컨텍스트 길이를 지원하는 점, 그리고 고해상도 이미지나 정사각형이 아닌 종횡비의 이미지에서 정보가 사라지는 것을 방지하기 위해 고해상도 이미지를 "팬 및 스캔"할 수 있는 [SigLip](./siglip) 인코더를 사용한다는 점입니다. | |
| ์ง์๋ฌธ ์กฐ์ ๋ฒ์ ์ ์ง์ ์ฆ๋ฅ ๋ฐ ๊ฐํ ํ์ต์ผ๋ก ํ์ ํ์ต๋์์ต๋๋ค. | |
| Gemma 3์ ๋ชจ๋ ์๋ณธ ์ฒดํฌํฌ์ธํธ๋ [Gemma 3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) ๋ฆด๋ฆฌ์ค์์ ํ์ธํ ์ ์์ต๋๋ค. | |
| > [!TIP] | |
| > Gemma๋ฅผ ๋ค์ํ ๋น์ ๋ฐ ์ธ์ด ์์ ์ ์ ์ฉํ๋ ์ถ๊ฐ ์์๋ฅผ ๋ณด๋ ค๋ฉด ์ค๋ฅธ์ชฝ ์ฌ์ด๋๋ฐ์ Gemma 3 ๋ชจ๋ธ์ ํด๋ฆญํ์ธ์. | |
| ์๋ ์์๋ [`Pipeline`] ๋๋ [`AutoModel`] ํด๋์ค๋ฅผ ์ฌ์ฉํ์ฌ ์ด๋ฏธ์ง๋ฅผ ๊ธฐ๋ฐ์ผ๋ก ํ ์คํธ๋ฅผ ์์ฑํ๋ ๋ฐฉ๋ฒ์ ๋ณด์ฌ์ค๋๋ค. | |
| <hfoptions id="usage"> | |
| <hfoption id="Pipeline"> | |
| ```py | |
| import torch | |
| from transformers import pipeline | |
| pipeline = pipeline( | |
| task="image-text-to-text", | |
| model="google/gemma-3-4b-pt", | |
| device=0, | |
| dtype=torch.bfloat16 | |
| ) | |
| pipeline( | |
| "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg", | |
| text="<start_of_image> What is shown in this image?" | |
| ) | |
| ``` | |
| </hfoption> | |
| <hfoption id="AutoModel"> | |
| ```py | |
| import torch | |
| from transformers import AutoProcessor, Gemma3ForConditionalGeneration | |
| model = Gemma3ForConditionalGeneration.from_pretrained( | |
| "google/gemma-3-4b-it", | |
| dtype=torch.bfloat16, | |
| device_map="auto", | |
| attn_implementation="sdpa" | |
| ) | |
| processor = AutoProcessor.from_pretrained( | |
| "google/gemma-3-4b-it", | |
| padding_side="left" | |
| ) | |
| messages = [ | |
| { | |
| "role": "system", | |
| "content": [ | |
| {"type": "text", "text": "You are a helpful assistant."} | |
| ] | |
| }, | |
| { | |
| "role": "user", "content": [ | |
| {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"}, | |
| {"type": "text", "text": "What is shown in this image?"}, | |
| ] | |
| }, | |
| ] | |
| inputs = processor.apply_chat_template( | |
| messages, | |
| tokenize=True, | |
| return_dict=True, | |
| return_tensors="pt", | |
| add_generation_prompt=True, | |
| ).to(model.device) | |
| output = model.generate(**inputs, max_new_tokens=50, cache_implementation="static") | |
| print(processor.decode(output[0], skip_special_tokens=True)) | |
| ``` | |
| </hfoption> | |
| <hfoption id="transformers CLI"> | |
| ```bash | |
| echo -e "Plants create energy through a process known as" | transformers run --task text-generation --model google/gemma-3-1b-pt --device 0 | |
| ``` | |
| </hfoption> | |
| </hfoptions> | |
| ์์ํ๋ ๊ฐ์ค์น๋ฅผ ๋ ๋ฎ์ ์ ๋ฐ๋๋ก ํํํ์ฌ, ํฐ ๋ชจ๋ธ์ ๋ฉ๋ชจ๋ฆฌ ๋ถ๋ด์ ์ค์ฌ์ค๋๋ค. ์ฌ์ฉ ๊ฐ๋ฅํ ์์ํ ๋ฐฑ์๋์ ๋ํ ๋ ์์ธํ ๋ด์ฉ์ [์์ํ](../quantization/overview) ๊ฐ์๋ฅผ ์ฐธ๊ณ ํ์ธ์. | |
| ์๋ ์์ ์์๋ [torchao](../quantization/torchao)๋ฅผ ์ฌ์ฉํ์ฌ ๊ฐ์ค์น๋ฅผ int4๋ก๋ง ์์ํํฉ๋๋ค. | |
| ```py | |
| # pip install torchao | |
| import torch | |
| from transformers import TorchAoConfig, Gemma3ForConditionalGeneration, AutoProcessor | |
| quantization_config = TorchAoConfig("int4_weight_only", group_size=128) | |
| model = Gemma3ForConditionalGeneration.from_pretrained( | |
| "google/gemma-3-27b-it", | |
| dtype=torch.bfloat16, | |
| device_map="auto", | |
| quantization_config=quantization_config | |
| ) | |
| processor = AutoProcessor.from_pretrained( | |
| "google/gemma-3-27b-it", | |
| padding_side="left" | |
| ) | |
| messages = [ | |
| { | |
| "role": "system", | |
| "content": [ | |
| {"type": "text", "text": "You are a helpful assistant."} | |
| ] | |
| }, | |
| { | |
| "role": "user", "content": [ | |
| {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"}, | |
| {"type": "text", "text": "What is shown in this image?"}, | |
| ] | |
| }, | |
| ] | |
| inputs = processor.apply_chat_template( | |
| messages, | |
| tokenize=True, | |
| return_dict=True, | |
| return_tensors="pt", | |
| add_generation_prompt=True, | |
| ).to(model.device) | |
| output = model.generate(**inputs, max_new_tokens=50, cache_implementation="static") | |
| print(processor.decode(output[0], skip_special_tokens=True)) | |
| ``` | |
| [AttentionMaskVisualizer](https://github.com/huggingface/transformers/blob/beb9b5b02246b9b7ee81ddf938f93f44cfeaad19/src/transformers/utils/attention_visualizer.py#L139)๋ฅผ ์ฌ์ฉํ์ฌ ๋ชจ๋ธ์ด ์ฃผ๋ชฉํ ์ ์๋ ํ ํฐ๊ณผ ์ฃผ๋ชฉํ ์ ์๋ ํ ํฐ์ ๋ ์ ์ดํดํ ์ ์์ต๋๋ค. | |
| ```py | |
| from transformers.utils.attention_visualizer import AttentionMaskVisualizer | |
| visualizer = AttentionMaskVisualizer("google/gemma-3-4b-it") | |
| visualizer("<img>What is shown in this image?") | |
| ``` | |
| <div class="flex justify-center"> | |
| <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/gemma-3-attn-mask.png"/> | |
| </div> | |
| ## ๋ ธํธ [[notes]] | |
| - ์ด๋ฏธ์ง-ํ ์คํธ ๋ฐ ์ด๋ฏธ์ง ์ ์ฉ ์ ๋ ฅ์๋ [`Gemma3ForConditionalGeneration`]์ ์ฌ์ฉํ์ธ์. | |
| - Gemma 3๋ ๋ค์ค ์ ๋ ฅ ์ด๋ฏธ์ง๋ฅผ ์ง์ํ์ง๋ง, ํ๋ก์ธ์์ ์ ๋ฌํ๊ธฐ ์ ์ ์ด๋ฏธ์ง๊ฐ ์ฌ๋ฐ๋ฅด๊ฒ ๋ฐฐ์น๋์๋์ง ํ์ธํ์ธ์. ๊ฐ ๋ฐฐ์น๋ ํ๋ ์ด์์ ์ด๋ฏธ์ง๋ฅผ ํฌํจํ ๋ฆฌ์คํธ์ฌ์ผ ํฉ๋๋ค. | |
| ```py | |
| url_cow = "https://media.istockphoto.com/id/1192867753/photo/cow-in-berchida-beach-siniscola.jpg?s=612x612&w=0&k=20&c=v0hjjniwsMNfJSuKWZuIn8pssmD5h5bSN1peBd1CmH4=" | |
| url_cat = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" | |
| messages =[ | |
| { | |
| "role": "system", | |
| "content": [ | |
| {"type": "text", "text": "You are a helpful assistant."} | |
| ] | |
| }, | |
| { | |
| "role": "user", | |
| "content": [ | |
| {"type": "image", "url": url_cow}, | |
| {"type": "image", "url": url_cat}, | |
| {"type": "text", "text": "Which image is cuter?"}, | |
| ] | |
| }, | |
| ] | |
| ``` | |
| - ํ๋ก์ธ์์ ์ ๋ฌ๋๋ ํ ์คํธ์๋ ์ด๋ฏธ์ง๊ฐ ์ฝ์ ๋์ด์ผ ํ๋ ์์น๋ง๋ค `<start_of_image>` ํ ํฐ์ด ์์ด์ผ ํฉ๋๋ค. | |
| - ํ๋ก์ธ์์๋ ์ฑํ ๋ฉ์์ง๋ฅผ ๋ชจ๋ธ ์ ๋ ฅ์ผ๋ก ๋ณํํ๋ ์์ฒด [`~ProcessorMixin.apply_chat_template`] ๋ฉ์๋๊ฐ ์์ต๋๋ค. | |
| - ๊ธฐ๋ณธ์ ์ผ๋ก ์ด๋ฏธ์ง๋ ์๋ฆฌ์ง ์์ผ๋ฉฐ ๊ธฐ๋ณธ ์ด๋ฏธ์ง๋ง ๋ชจ๋ธ๋ก ์ ๋ฌ๋ฉ๋๋ค. ๊ณ ํด์๋ ์ด๋ฏธ์ง๋ ์ ์ฌ๊ฐํ์ด ์๋ ์ข ํก๋น์ ์ด๋ฏธ์ง์์๋ ๋น์ ์ธ์ฝ๋๊ฐ 896x896์ ๊ณ ์ ํด์๋๋ฅผ ์ฌ์ฉํ๊ธฐ ๋๋ฌธ์ ์ํฐํฉํธ๊ฐ ๋ฐ์ํ ์ ์์ต๋๋ค. ์ด๋ฌํ ์ํฐํฉํธ๋ฅผ ๋ฐฉ์งํ๊ณ ์ถ๋ก ์ค ์ฑ๋ฅ์ ํฅ์์ํค๋ ค๋ฉด, `do_pan_and_scan=True`๋ฅผ ์ค์ ํ์ฌ ์ด๋ฏธ์ง๋ฅผ ์ฌ๋ฌ ๊ฐ์ ์์ ํจ์น๋ก ์๋ฅด๊ณ ๊ธฐ๋ณธ ์ด๋ฏธ์ง ์๋ฒ ๋ฉ๊ณผ ์ด์ด ๋ถ์ ๋๋ค. ๋ ๋น ๋ฅธ ์ถ๋ก ์ ์ํด ํฌ๊ณผ ์ค์บ์ ๋นํ์ฑํํ ์ ์์ต๋๋ค. | |
| ```diff | |
| inputs = processor.apply_chat_template( | |
| messages, | |
| tokenize=True, | |
| return_dict=True, | |
| return_tensors="pt", | |
| add_generation_prompt=True, | |
| + do_pan_and_scan=True, | |
| ).to(model.device) | |
| ``` | |
| - ํ ์คํธ ์ ์ฉ ๋ชจ๋๋ก ํ๋ จ๋ Gemma-3 1B ์ฒดํฌํฌ์ธํธ์ ๊ฒฝ์ฐ, [`AutoModelForCausalLM`]์ ๋์ ์ฌ์ฉํ์ธ์. | |
| ```py | |
| import torch | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| tokenizer = AutoTokenizer.from_pretrained( | |
| "google/gemma-3-1b-pt", | |
| ) | |
| model = AutoModelForCausalLM.from_pretrained( | |
| "google/gemma-3-1b-pt", | |
| dtype=torch.bfloat16, | |
| device_map="auto", | |
| attn_implementation="sdpa" | |
| ) | |
| input_ids = tokenizer("Plants create energy through a process known as", return_tensors="pt").to(model.device) | |
| output = model.generate(**input_ids, cache_implementation="static") | |
| print(tokenizer.decode(output[0], skip_special_tokens=True)) | |
| ``` | |
| ## Gemma3ImageProcessor | |
| [[autodoc]] Gemma3ImageProcessor | |
| ## Gemma3ImageProcessorFast | |
| [[autodoc]] Gemma3ImageProcessorFast | |
| ## Gemma3Processor | |
| [[autodoc]] Gemma3Processor | |
| ## Gemma3TextConfig | |
| [[autodoc]] Gemma3TextConfig | |
| ## Gemma3Config | |
| [[autodoc]] Gemma3Config | |
| ## Gemma3TextModel | |
| [[autodoc]] Gemma3TextModel | |
| - forward | |
| ## Gemma3Model | |
| [[autodoc]] Gemma3Model | |
| ## Gemma3ForCausalLM | |
| [[autodoc]] Gemma3ForCausalLM | |
| - forward | |
| ## Gemma3ForConditionalGeneration | |
| [[autodoc]] Gemma3ForConditionalGeneration | |
| - forward | |
| ## Gemma3ForSequenceClassification | |
| [[autodoc]] Gemma3ForSequenceClassification | |
| - forward | |