| <!--Copyright 2025 The HuggingFace Team. All rights reserved. | |
| Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with | |
| the License. You may obtain a copy of the License at | |
| http://www.apache.org/licenses/LICENSE-2.0 | |
| Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on | |
| an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the | |
| specific language governing permissions and limitations under the License. | |
| β οΈ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be | |
| rendered properly in your Markdown viewer. | |
| --> | |
| *μ΄ λͺ¨λΈμ 2025λ 5μ 20μΌμ μΆμλμμΌλ©°, 2025λ 6μ 26μΌμ Hugging Face Transformersμ μΆκ°λμμ΅λλ€.* | |
| <div style="float: right;"> | |
| <div class="flex flex-wrap space-x-1"> | |
| <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white"> | |
| <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white"> | |
| </div> | |
| </div> | |
| # Gemma3n[[gemma3n]] | |
| ## κ°μ[[overview]] | |
| [Gemma3n](https://developers.googleblog.com/en/introducing-gemma-3n/)μ μ¬μ νλ ¨λ λ²μ κ³Ό λͺ λ Ήμ΄ κΈ°λ° λ―ΈμΈμ‘°μ λ²μ μ΄ μ 곡λλ λ©ν°λͺ¨λ¬ λͺ¨λΈμ΄λ©°, λͺ¨λΈ ν¬κΈ°λ E4Bμ E2B λ κ°μ§λ‘ μΆμλμμ΅λλ€. μΈμ΄ λͺ¨λΈ μν€ν μ²λ μ΄μ Gemma λ²μ κ³Ό λ§μ λΆλΆμ 곡μ νμ§λ§ μ΄λ² λ²μ μλ μ¬λ¬ κ°μ§ μλ‘μ΄ κΈ°λ²μ΄ μΆκ°λμμ΅λλ€. λνμ μΌλ‘ [κ΅μ°¨ μ λ°μ΄νΈ(AltUp)](https://proceedings.neurips.cc/paper_files/paper/2023/hash/f2059277ac6ce66e7e5543001afa8bb5-Abstract-Conference.html), [νμ΅λ μ¦κ° μμ¬ λ μ΄μ΄(LAuReL)](https://huggingface.co/papers/2411.07501), [MatFormer](https://huggingface.co/papers/2310.07707), λ μ΄μ΄λ³ μλ² λ©, [ν΅κ³μ Top-kλ₯Ό μ΄μ©ν νμ±ν ν¬μμ±(SPARk-Transformer)](https://huggingface.co/papers/2506.06644), KV μΊμ 곡μ λ±μ΄ μμ΅λλ€. Gemma 3nμ [Gemma 3](./gemma3)μ μ μ¬ν μ΄ν μ ν¨ν΄μ μ¬μ©ν©λλ€. κΈλ‘λ² μ ν μ΄ν μ λ μ΄μ΄ 1κ°λ§λ€ λ‘컬 μ¬λΌμ΄λ© μλμ° μ ν μ΄ν μ λ μ΄μ΄ 4κ°λ₯Ό κ΅μ°¨λ‘ λ°°μΉνλ©°, μ΅λ 컨ν μ€νΈ κΈΈμ΄λ 32k ν ν°κΉμ§ μ§μν©λλ€. λΉμ λͺ¨λ¬λ¦¬ν°μμλ MobileNet v5λ₯Ό λΉμ μΈμ½λλ‘ λμ νμ¬ κΈ°λ³Έ ν΄μλλ₯Ό 768x768 ν½μ λ‘ μ²λ¦¬ν©λλ€. λν μ€λμ€ λͺ¨λ¬λ¦¬ν°μμλ [Universal Speech Model(USM)](https://huggingface.co/papers/2303.01037) μν€ν μ²λ₯Ό κΈ°λ°μΌλ‘ μλ‘κ² νμ΅λ μ€λμ€ μΈμ½λκ° μΆκ°λμμ΅λλ€. | |
| λͺ λ Ήμ΄ κΈ°λ° λ―ΈμΈμ‘°μ λ²μ μ μ§μ μ¦λ₯μ κ°ν νμ΅μ ν΅ν΄ νμ²λ¦¬ νμ΅ λμμ΅λλ€. | |
| Gemma 3nμ μλ³Έ 체ν¬ν¬μΈνΈλ [Gemma 3n](https://huggingface.co/collections/google/gemma-3n) μΆμ νμ΄μ§μμ νμΈν μ μμ΅λλ€. | |
| > [!TIP] | |
| > μ€λ₯Έμͺ½ μ¬μ΄λλ°μ μλ Gemma 3n λͺ¨λΈμ ν΄λ¦νλ©΄, Gemmaλ₯Ό λ€μν λΉμ , μ€λμ€, | |
| > μΈμ΄ μμ μ μ μ©νλ λ λ§μ μμλ₯Ό νμΈν μ μμ΅λλ€. | |
| μλ μμλ [`Pipeline`] λλ [`AutoModel`] ν΄λμ€λ₯Ό μ¬μ©νμ¬ μ΄λ―Έμ§λ₯Ό μ λ ₯μΌλ‘ λ°μ ν μ€νΈλ₯Ό μμ±νλ λ°©λ²μ 보μ¬μ€λλ€. | |
| <hfoptions id="usage"> | |
| <hfoption id="Pipeline"> | |
| ```py | |
| import torch | |
| from transformers import pipeline | |
| pipeline = pipeline( | |
| task="image-text-to-text", | |
| model="google/gemma-3n-e4b", | |
| device=0, | |
| dtype=torch.bfloat16 | |
| ) | |
| pipeline( | |
| "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg", | |
| text="μ΄ μ΄λ―Έμ§μ 무μμ΄ λ³΄μ΄λμ?" | |
| ) | |
| ``` | |
| </hfoption> | |
| <hfoption id="AutoModel"> | |
| ```py | |
| import torch | |
| from transformers import AutoProcessor, Gemma3nForConditionalGeneration | |
| model = Gemma3nForConditionalGeneration.from_pretrained( | |
| "google/gemma-3n-e4b-it", | |
| dtype=torch.bfloat16, | |
| device_map="auto", | |
| attn_implementation="sdpa" | |
| ) | |
| processor = AutoProcessor.from_pretrained( | |
| "google/gemma-3n-e4b-it", | |
| padding_side="left" | |
| ) | |
| messages = [ | |
| { | |
| "role": "system", | |
| "content": [ | |
| {"type": "text", "text": "λΉμ μ λμμ΄ λλ μ΄μμ€ν΄νΈμ λλ€."} | |
| ] | |
| }, | |
| { | |
| "role": "user", "content": [ | |
| {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"}, | |
| {"type": "text", "text": "μ΄ μ΄λ―Έμ§μ 무μμ΄ λ³΄μ΄λμ?"}, | |
| ] | |
| }, | |
| ] | |
| inputs = processor.apply_chat_template( | |
| messages, | |
| tokenize=True, | |
| return_dict=True, | |
| return_tensors="pt", | |
| add_generation_prompt=True, | |
| ).to(model.device) | |
| output = model.generate(**inputs, max_new_tokens=50, cache_implementation="static") | |
| print(processor.decode(output[0], skip_special_tokens=True)) | |
| ``` | |
| </hfoption> | |
| <hfoption id="transformers CLI"> | |
| ```bash | |
| echo -e "μλ¬Όμ νΉμ κ³Όμ μ ν΅ν΄ μλμ§λ₯Ό μμ±ν©λλ€." | transformers run --task text-generation --model google/gemma-3n-e2b --device 0 | |
| ``` | |
| </hfoption> | |
| </hfoptions> | |
| ## μ°Έκ³ μ¬ν[[notes]] | |
| - [`Gemma3nForConditionalGeneration`] ν΄λμ€λ₯Ό μ¬μ©νλ©΄ μ΄λ―Έμ§-μ€λμ€-ν μ€νΈ, μ΄λ―Έμ§-ν μ€νΈ, μ΄λ―Έμ§-μ€λμ€, μ€λμ€-ν μ€νΈ, μ΄λ―Έμ§ λ¨λ , μ€λμ€ λ¨λ μ λ ₯μ λͺ¨λ μ²λ¦¬ν μ μμ΅λλ€. | |
| - Gemma 3nμ ν λ²μ μ λ ₯μ μ¬λ¬ μ΄λ―Έμ§λ₯Ό μ§μν©λλ€. λ€λ§ νλ‘μΈμμ μ λ¬νκΈ° μ μ μ΄λ―Έμ§λ€μ΄ λ°°μΉ λ¨μλ‘ μ¬λ°λ₯΄κ² λ¬Άμ¬μλμ§ νμΈν΄μΌ ν©λλ€. κ° λ°°μΉλ νλ μ΄μμ μ΄λ―Έμ§λ₯Ό λ΄μ 리μ€νΈ νμμ λλ€. | |
| ```py | |
| url_cow = "https://media.istockphoto.com/id/1192867753/photo/cow-in-berchida-beach-siniscola.jpg?s=612x612&w=0&k=20&c=v0hjjniwsMNfJSuKWZuIn8pssmD5h5bSN1peBd1CmH4=" | |
| url_cat = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" | |
| messages = [ | |
| { | |
| "role": "system", | |
| "content": [ | |
| {"type": "text", "text": "λΉμ μ λμμ΄ λλ μ΄μμ€ν΄νΈμ λλ€."} | |
| ] | |
| }, | |
| { | |
| "role": "user", | |
| "content": [ | |
| {"type": "image", "url": url_cow}, | |
| {"type": "image", "url": url_cat}, | |
| {"type": "text", "text": "μ΄λ€ μ΄λ―Έμ§κ° λ κ·μ½μ΅λκΉ?"}, | |
| ] | |
| }, | |
| ] | |
| ``` | |
| - νλ‘μΈμμ μ λ¬λλ ν μ€νΈμλ μ΄λ―Έμ§λ₯Ό μ½μ ν΄μΌ νλ μμΉμ `<image_soft_token>` ν ν°μ ν¬ν¨ν΄μΌ ν©λλ€. | |
| - Gemma 3nμ μ λ ₯λΉ μ΅λ νλμ νκΉ μ€λμ€ ν΄λ¦½λ§ νμ©ν©λλ€. λ€λ§ ν¨μ· ν둬ννΈμμλ μ¬λ¬ κ°μ μ€λμ€ ν΄λ¦½μ ν¨κ» μ 곡ν μ μμ΅λλ€. | |
| - νλ‘μΈμμ μ λ¬λλ ν μ€νΈμλ μ€λμ€ ν΄λ¦½μ μ½μ ν΄μΌ νλ μμΉμ `<audio_soft_token>` ν ν°μ ν¬ν¨ν΄μΌ ν©λλ€. | |
| - νλ‘μΈμμλ μ±ν λ©μμ§λ₯Ό λͺ¨λΈ μ λ ₯ νμμΌλ‘ λ³ννκΈ° μν μ체 λ©μλμΈ [`~ProcessorMixin.apply_chat_template`]κ° ν¬ν¨λμ΄ μμ΅λλ€. | |
| ## Gemma3nAudioFeatureExtractor[[transformers.Gemma3nAudioFeatureExtractor]] | |
| [[autodoc]] Gemma3nAudioFeatureExtractor | |
| ## Gemma3nProcessor[[transformers.Gemma3nProcessor]] | |
| [[autodoc]] Gemma3nProcessor | |
| ## Gemma3nTextConfig[[transformers.Gemma3nTextConfig]] | |
| [[autodoc]] Gemma3nTextConfig | |
| ## Gemma3nVisionConfig[[transformers.Gemma3nVisionConfig]] | |
| [[autodoc]] Gemma3nVisionConfig | |
| ## Gemma3nAudioConfig[[transformers.Gemma3nAudioConfig]] | |
| [[autodoc]] Gemma3nAudioConfig | |
| ## Gemma3nConfig[[transformers.Gemma3nConfig]] | |
| [[autodoc]] Gemma3nConfig | |
| ## Gemma3nTextModel[[transformers.Gemma3nTextModel]] | |
| [[autodoc]] Gemma3nTextModel | |
| - forward | |
| ## Gemma3nModel[[transformers.Gemma3nModel]] | |
| [[autodoc]] Gemma3nModel | |
| - forward | |
| ## Gemma3nForCausalLM[[transformers.Gemma3nForCausalLM]] | |
| [[autodoc]] Gemma3nForCausalLM | |
| - forward | |
| ## Gemma3nForConditionalGeneration[[transformers.Gemma3nForConditionalGeneration]] | |
| [[autodoc]] Gemma3nForConditionalGeneration | |
| - forward | |