camenduru committed on
Commit 7f64a5a · verified · 1 parent: 2757c1c

thanks to facebook ❤

.gitattributes CHANGED
@@ -23,7 +23,6 @@
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +32,7 @@
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+base/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+instruct/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+base_int4_cpu/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+base_int4_accelerator/tokenizer.json filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,124 @@
FAIR Noncommercial Research License
v1 Last Updated: September 23, 2025

“Acceptable Use Policy” means the FAIR Acceptable Use Policy, applicable to Research Materials, that is incorporated into this Agreement.

“Agreement” means the terms and conditions for use, reproduction, distribution and modification of the Research Materials set forth herein.

“Documentation” means the specifications, manuals and documentation accompanying Research Materials distributed by Meta.

“Licensee” or “you” means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity’s behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf.

“Meta” or “we” means Meta Platforms Ireland Limited (if you are located in or, if you are an entity, your principal place of business is in the EEA or Switzerland) and Meta Platforms, Inc. (if you are located outside of the EEA or Switzerland).

“Noncommercial Research Uses” means noncommercial research use cases related to research, development, education, processing, or analysis and in each case, is not primarily intended for commercial advantage or monetary compensation to you or others.

“Research Materials” means, collectively, Documentation and the models, software and algorithms, including machine-learning model code, trained model weights, inference-enabling code, training-enabling code, fine-tuning enabling code, demonstration materials and other elements of the foregoing distributed by Meta and made available under this Agreement.

By clicking “I Accept” below or by using or distributing any portion or element of the Research Materials, you agree to be bound by this Agreement.

1. License Rights and Redistribution.

a. Grant of Rights. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Meta’s intellectual property or other rights owned by Meta embodied in the Research Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Research Materials.

b. Redistribution and Use.

i. You will not use the Research Materials or any outputs or results of the Research Materials in connection with any commercial uses or for any uses other than Noncommercial Research Uses;

ii. Distribution of Research Materials, and any derivative works thereof, are subject to the terms of this Agreement. If you distribute or make the Research Materials, or any derivative works thereof, available to a third party, you may only do so under the terms of this Agreement. You shall also provide a copy of this Agreement to such third party.

iii. If you submit for publication the results of research you perform on, using, or otherwise in connection with Research Materials, you must acknowledge the use of Research Materials in your publication.

iv. Your use of the Research Materials must comply with applicable laws and regulations (including Trade Control Laws) and adhere to the FAIR Acceptable Use Policy, which is hereby incorporated by reference into this Agreement.

2. User Support. Your Noncommercial Research Use of the Research Materials is done at your own discretion; Meta does not process any information nor provide any service in relation to such use. Meta is under no obligation to provide any support services for the Research Materials. Any support provided is “as is”, “with all faults”, and without warranty of any kind.

3. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE RESEARCH MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, AND META DISCLAIMS ALL WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE RESEARCH MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE RESEARCH MATERIALS AND ANY OUTPUT AND RESULTS.

4. Limitation of Liability. IN NO EVENT WILL META OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY DIRECT OR INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.

5. Intellectual Property.

a. Subject to Meta’s ownership of Research Materials and derivatives made by or for Meta, with respect to any derivative works and modifications of the Research Materials that are made by you, as between you and Meta, you are and will be the owner of such derivative works and modifications.

b. If you institute litigation or other proceedings against Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Research Materials, outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Meta from and against any claim by any third party arising out of or related to your use or distribution of the Research Materials.

6. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Research Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Meta may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the Research Materials. Sections 3, 4 and 7 shall survive the termination of this Agreement.

7. Governing Law and Jurisdiction. This Agreement will be governed and construed under the laws of the State of California without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The courts of California shall have exclusive jurisdiction of any dispute arising out of this Agreement.

8. Modifications and Amendments. Meta may modify this Agreement from time to time; provided that the modifications are similar in spirit to the current version of the Agreement, but may differ in detail to address new problems or concerns. All such changes will be effective immediately. Your continued use of the Research Materials after any modification to this Agreement constitutes your agreement to such modification. Except as provided in this Agreement, no modification or addition to any provision of this Agreement will be binding unless it is in writing and signed by an authorized representative of both you and Meta.

FAIR Acceptable Use Policy

The Fundamental AI Research (FAIR) team at Meta seeks to further understanding of new and existing research domains with the mission of advancing the state-of-the-art in artificial intelligence through open research for the benefit of all.

As part of this mission, Meta makes certain research materials available for noncommercial research use. Meta is committed to promoting the safe and responsible use of such research materials.

Prohibited Uses

You agree you will not use, or allow others to use, Research Materials to:

1. Violate the law or others’ rights, including to:

Engage in, promote, generate, contribute to, encourage, plan, incite, or further illegal or unlawful activity or content, such as:
Violence or terrorism
Exploitation or harm to children, including the solicitation, creation, acquisition, or dissemination of child exploitative content or failure to report Child Sexual Abuse Material
Human trafficking, exploitation, and sexual violence
The illegal distribution of information or materials to minors, including obscene materials, or failure to employ legally required age-gating in connection with such information or materials
Sexual solicitation
Any other criminal activity

Engage in, promote, incite, or facilitate the harassment, abuse, threatening, or bullying of individuals or groups of individuals

Engage in, promote, incite, or facilitate discrimination or other unlawful or harmful conduct in the provision of employment, employment benefits, credit, housing, other economic benefits, or other essential goods and services

Engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or related professional practices

Collect, process, disclose, generate, or infer health, demographic, or other sensitive personal or private information about individuals without rights and consents required by applicable laws

Engage in or facilitate any action or generate any content that infringes, misappropriates, or otherwise violates any third-party rights, including the outputs or results of any technology using FAIR research materials

Create, generate, or facilitate the creation of malicious code, malware, computer viruses or do anything else that could disable, overburden, interfere with or impair the proper working, integrity, operation or appearance of a website or computer system

2. Engage in, promote, incite, facilitate, or assist in the planning or development of activities that present a risk of death or bodily harm to individuals, including use of research artifacts related to the following:

Military, warfare, nuclear industries or applications, espionage, use for materials or activities that are subject to the International Traffic in Arms Regulations (ITAR) maintained by the United States Department of State

Guns and illegal weapons (including weapon development)

Illegal drugs and regulated/controlled substances

Operation of critical infrastructure, transportation technologies, or heavy machinery

Self-harm or harm to others, including suicide, cutting, and eating disorders

Any content intended to incite or promote violence, abuse, or any infliction of bodily harm to an individual

3. Intentionally deceive or mislead others, including use of FAIR Research Materials related to the following:

Generating, promoting, or furthering fraud or the creation or promotion of disinformation

Generating, promoting, or furthering defamatory content, including the creation of defamatory statements, images, or other content

Generating, promoting, or further distributing spam

Impersonating another individual without consent, authorization, or legal right

Representing that outputs of FAIR research materials or outputs from technology using FAIR research materials are human-generated

Generating or facilitating false online engagement, including fake reviews and other means of fake online engagement

4. Fail to appropriately disclose to end users any known dangers of your Research Materials.

Please report any violation of this Policy or other problems that could lead to a violation of this Policy by submitting a report here [https://docs.google.com/forms/d/e/1FAIpQLSeb11cryAopJ7LNrC4nxEUXrHY26hfkXQMf_uH-oFgA3WlYZQ/viewform].
README.md ADDED
@@ -0,0 +1,390 @@
---
license: fair-noncommercial-research-license
extra_gated_fields:
  First Name: text
  Last Name: text
  Date of birth: date_picker
  Country: country
  Affiliation: text
  Job title:
    type: select
    options:
      - Student
      - Research Graduate
      - AI researcher
      - AI developer/engineer
      - Reporter
      - Other
  geo: ip_location
  By clicking Submit below I accept the terms of the license and acknowledge that the information I provide will be collected stored processed and shared in accordance with the Meta Privacy Policy: checkbox
extra_gated_description: >-
  The information you provide will be collected, stored, processed and shared in
  accordance with the [Meta Privacy
  Policy](https://www.facebook.com/privacy/policy/).
extra_gated_button_content: Submit
language:
  - en
library_name: transformers
tags:
  - facebook
  - meta
  - pytorch
---

# MobileLLM-Pro Model Card
We are introducing MobileLLM-Pro (P1), a 1B-parameter foundational language model in the MobileLLM series, designed to deliver high-quality, efficient on-device inference across a wide range of general language-modeling tasks. <br>
We open-source two variants of the model: a **pre-trained base model** along with **quantized checkpoints** for CPU and accelerator inference, and an **instruction-tuned version** that shows competitive performance against models in this size range on tasks like tool calling, question answering, rewriting and summarization.

<p align="center">🤗 &nbsp;&nbsp;<a href="https://huggingface.co/spaces/akhaliq/MobileLLM-Pro">Chat with MobileLLM-Pro</a></p>

## Key Features
- **Strong Pre-training Performance:** MobileLLM-Pro base achieves impressive pre-training results, outperforming Gemma 3 1B and Llama 3.2 1B by 5.7% and 7.9% on average, respectively, on reasoning, knowledge, and long-context retrieval benchmarks. This performance is achieved by pre-training on fewer than 2T fully open-source tokens.
- **128k Context Window:** The model supports up to 128k tokens, enabling long-context understanding for applications such as document summarization and information retrieval, implicitly learned from a large teacher model.
- **Efficient Long-Context Inference:** Interleaving local and global attention layers at a 3:1 ratio with a 512-token local attention window, MobileLLM-Pro reduces prefill latency by 1.8x* and lowers KV cache size from 117MB to 40MB* compared to fully global attention, enabling faster and more memory-efficient inference. (*Assuming 8k context length)
- **Near-Lossless int4 Quantization:** We provide int4 quantization-ready checkpoints for our pre-trained model with less than 1.3% quality degradation compared to floating-point baselines:
  - CPU: int4 weights (group size 32), int8 dynamic activations, int8 KV cache, with only 0.4% regression.
  - Accelerators: int4 per-channel weights, with only 1.3% quality regression.
- **Instruction Fine-Tuned Model:** We provide a competitive instruction fine-tuned (IFT) model specializing in use cases such as tool calling, question answering, rewriting and summarization.

MobileLLM-Pro sets a new standard for efficient, high-quality on-device language modeling. We invite the community to explore, evaluate, and build upon this model.
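The long-context memory figures above can be sanity-checked with back-of-envelope arithmetic, using the architecture numbers from the Model Information section (30 layers, 4 KV heads, head dimension 1280/20 = 64). The int8 KV cache and the exact split of 8 global and 22 local layers under the 3:1 interleaving are assumptions for illustration, not figures from the model card:

```python
# Back-of-envelope int8 KV-cache sizing at 8k context.
N_LAYERS = 30
KV_HEADS = 4
HEAD_DIM = 1280 // 20   # model dim / attention heads = 64
SEQ_LEN = 8 * 1024
LOCAL_WINDOW = 512      # local-attention window from the feature list
GLOBAL_LAYERS = 8       # assumed number of global layers (3:1 interleaving)
BYTES_PER_VALUE = 1     # int8 KV cache (assumption)

def kv_cache_bytes(cached_tokens_over_layers: int) -> int:
    # Factor of 2 accounts for both keys and values.
    return 2 * KV_HEADS * HEAD_DIM * BYTES_PER_VALUE * cached_tokens_over_layers

# Fully global attention caches SEQ_LEN tokens in every layer.
full_global = kv_cache_bytes(N_LAYERS * SEQ_LEN)
# Interleaved LGA caches SEQ_LEN tokens only in global layers,
# and at most LOCAL_WINDOW tokens in each local layer.
interleaved = kv_cache_bytes(
    GLOBAL_LAYERS * SEQ_LEN + (N_LAYERS - GLOBAL_LAYERS) * LOCAL_WINDOW
)
print(f"fully global: {full_global / 2**20:.1f} MiB")  # 120.0 MiB
print(f"interleaved:  {interleaved / 2**20:.1f} MiB")  # 37.5 MiB
```

Under these assumptions the estimate comes out at roughly 120 MiB versus 38 MiB, in the same ballpark as the 117MB and 40MB quoted above.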

## Model Information
**Layers:** 30<br>
**Attention Heads:** 20<br>
**KV Heads:** 4<br>
**Dimension:** 1280<br>
**Hidden Dimension:** 6144<br>
**Vocabulary Size:** 202,048<br>
**Total Parameters:** 1,084M (1.08B)

**Input Modality:** Text<br>
**Output Modality:** Text<br>
**Languages:** English

**Training Method:** Knowledge Distillation<br>
**Context Length:** 128k tokens<br>
**Teacher Model:** [Llama 4-Scout](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E)<br>
**Loss Function:** KL Divergence<br>
**Quantization:** 16-bit, 4-bit<br>
**Other Features:** Shared Embeddings, Local-Global Attention

**Model Developer:** Meta Reality Labs<br>
**Model Release Date:** October 2025<br>
**License:** MobileLLM-Pro is FAIR NC licensed
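The 1,084M total can be reconstructed from the architecture numbers above. A minimal sketch, assuming a SwiGLU-style FFN (gate/up/down projections), no biases, a single shared input/output embedding matrix, and ignoring normalization parameters as negligible (none of these details are stated in the card):

```python
# Sanity-check of the total parameter count from the figures above.
DIM, HIDDEN, N_LAYERS = 1280, 6144, 30
HEADS, KV_HEADS = 20, 4
VOCAB = 202_048
HEAD_DIM = DIM // HEADS  # 64

attn = DIM * HEADS * HEAD_DIM          # query projection
attn += 2 * DIM * KV_HEADS * HEAD_DIM  # key and value projections
attn += HEADS * HEAD_DIM * DIM         # output projection
ffn = 3 * DIM * HIDDEN                 # gate, up and down projections (assumed SwiGLU)
embedding = VOCAB * DIM                # counted once due to shared embeddings

total = N_LAYERS * (attn + ffn) + embedding
print(f"{total / 1e6:.0f}M parameters")  # 1084M
```

Under these assumptions the count lands on 1,084M exactly, matching the card.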

## Results
### Base Pretrained Model
| Benchmark | **P1 (FP)** | **P1 (Q-CPU)** | **P1 (Q-Acc)** | **Gemma 3 1B** | **Llama 3.2 1B** |
|-----------------|---------------|---------------------|----------------|----------------|------------------|
| HellaSwag | **67.11%** | 64.89% | 65.10% | 62.30% | 65.69% |
| BoolQ | **76.24%** | **77.49%** | **76.36%** | 63.20% | 62.51% |
| PIQA | **76.55%** | **76.66%** | **75.52%** | 73.80% | 75.14% |
| SocialIQA | **50.87%** | **51.18%** | **50.05%** | 48.90% | 45.60% |
| TriviaQA | **39.85%** | 37.26% | 36.42% | 39.80% | 23.81% |
| NatQ | **15.76%** | **15.43%** | **13.19%** | 9.48% | 5.48% |
| ARC-c | **52.62%** | **52.45%** | **51.24%** | 38.40% | 38.28% |
| ARC-e | **76.28%** | **76.58%** | **75.73%** | 73.00% | 63.47% |
| WinoGrande | **62.83%** | **62.43%** | **61.96%** | 58.20% | 61.09% |
| OBQA | **43.60%** | **44.20%** | **40.40%** | | 37.20% |
| NIH | **100.00%** | 96.44% | **98.67%** | | |

FP = Full precision, bf16<br>
Q-CPU = int4, group-wise quantized (for CPU)<br>
Q-Acc = int4, channel-wise quantized (for accelerators (ANE & HTP))

### Instruction Tuned Model
| Benchmark | **P1 (IFT)** | **Gemma 3 1B (IFT)** | **Llama 3.2 1B (IFT)** |
|---------------|--------------|----------------------|------------------------|
| MMLU | 44.8% | 29.9% | **49.3%** |
| IFEval | 62.0% | **80.2%** | 59.5% |
| MBPP | **46.8%** | 35.2% | 39.6% |
| HumanEval | **59.8%** | 41.5% | 37.8% |
| ARC-C | **62.7%** | | 59.4% |
| HellaSwag | **58.4%** | | 41.2% |
| BFCL v2 | **29.4%** | | 25.7% |
| Open Rewrite | **51.0%** | | 41.6% |
| TLDR9+ | **16.8%** | | **16.8%** |

## Training Data

We constructed our data mix by selecting publicly available datasets that cover a range of domains. Using data-specific simulation runs, each dataset's contribution to the training process was carefully balanced by assigning it a specific sampling weight. These weights remained consistent throughout the base model pretraining and were informed by the extended work of [Automixer](https://scholar.google.com/citations?view_op=view_citation&hl=en&user=FbR5cAMAAAAJ&sortby=pubdate&citation_for_view=FbR5cAMAAAAJ:cFHS6HbyZ2cC) and additional ablation studies. <br>
The pre-training data mix primarily consists of a large educational web dataset, which makes up the vast majority of the training data. Smaller but significant portions come from coding data, mathematics, Wikipedia, scientific papers, Q&A forums, and algebraic content. In total, the data mix includes approximately 1.5 billion rows and 1,640 billion tokens. <br>
For our instruction fine-tuning data mix, we focus on data diversity from existing open-source fine-tuning corpora. Specifically, we combine datasets for general instruction tuning with chat, science, safety, coding and math domains. For our final DPO phase, we rely on completely synthetic datasets.

## Training Process
### Pretraining
Our general pre-training process contains three distinct phases, using logit-based knowledge distillation from the [Llama 4-Scout](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E) model and a novel model-merging paradigm:

**Phase 1 (KD)**: Language learning – learn general language skills from high-quality, well-balanced pre-training data <br>
**Phase 2 (KD)**: Long-context awareness – extend the model context length to 128k tokens using implicit positional distillation from the teacher model <br>
**Phase 3 (KD)**: Domain abilities – acquire domain understanding by annealing multiple models in parallel and merging the specialist models, resulting in improvements across a diverse range of domains

![image](https://cdn-uploads.huggingface.co/production/uploads/68c1aa07c02e455d06f93a42/DpI3Yk1fxWA789N76fvjr.png)

On top of the three pre-training phases, we add a fourth phase of Quantization-Aware Training (QAT) for our 4-bit quantized model checkpoint.

### Instruction Fine-Tuning
We split the instruction fine-tuning stage into three distinct phases combining SFT and DPO methods:

**Phase 1 (SFT)**: Learn general instruction-following with a focus on data diversity <br>
**Phase 2 (SFT)**: Domain-weight the Phase 1 data given its shortcomings (e.g. upsample code data to improve logical reasoning) <br>
**Phase 3 (SFT + DPO)**: Train and align the model for safety and self-identification

![image](https://cdn-uploads.huggingface.co/production/uploads/68c1aa07c02e455d06f93a42/wBAO_0Bu3dnCn8R2K9HXD.png)

## Quantization

![image/png](https://cdn-uploads.huggingface.co/production/uploads/68c1aa07c02e455d06f93a42/NJ_d8jyeVwkLIp9kwZRtR.png)

We apply Quantization-Aware Training (QAT) to our baseline and instruction fine-tuned models, yielding quantization-ready checkpoints that can either be directly converted to integer datatypes (with minimal quality loss) or used for QAT on additional data. We release two quantization-ready checkpoints:

- **4-bit groupwise weight quantization** with block size 32, 8-bit dynamic activations, and 8-bit KV-cache quantization — optimized for CPU/GPU backends ([XNNPACK](https://docs.pytorch.org/executorch/0.5/native-delegates-executorch-xnnpack-delegate.html)).
- **4-bit channelwise quantization** without activation quantization and with 8-bit KV-cache quantization — designed for edge hardware accelerators such as the Apple Neural Engine ([ANE](https://apple.github.io/coremltools/docs-guides/source/opt-quantization-overview.html)) and Qualcomm’s Hexagon Tensor Processor ([HTP](https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/htp_guidelines_int4_weights.html)).

Our QAT approach incorporates long-context awareness (up to 128k tokens) and self-knowledge distillation using the full-precision teacher model. We compared the QAT-trained model to a standard round-to-nearest Post-Training Quantization (PTQ) baseline. In the groupwise pre-training setting, we observe a 34% (absolute) regression in average benchmark score when using PTQ and only a 1.5% (absolute) regression for QAT. For instruction fine-tuning, we observe less than 1% average regression using QAT.

## How to use
### Full Precision

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login

login(token="<HF_TOKEN>")
MODEL_ID = "facebook/MobileLLM-Pro"

def generate(user_input: str, model, tokenizer, chat: bool) -> str:
    if chat:
        user_input = [{"role": "user", "content": user_input}]
        inputs = tokenizer.apply_chat_template(
            user_input, return_tensors="pt", add_generation_prompt=True
        ).to(model.device)
    else:
        inputs = tokenizer(user_input, return_tensors="pt")["input_ids"].to(model.device)
    outputs = model.generate(inputs, max_new_tokens=128)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def main():
    version = "instruct"  # "base" | "instruct"
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID, trust_remote_code=True, subfolder=version
    )
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, trust_remote_code=True, subfolder=version
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    prompt = "Why are open-source on-device language models great?"
    result = generate(prompt, model, tokenizer, chat=(version == "instruct"))
    print(result)

if __name__ == "__main__":
    main()
```

### Quantized Checkpoints

#### 4-bit Groupwise Quantization

```python
import torch
from transformers import AutoModelForCausalLM
from torchao.quantization import quantize_
from torchao.quantization.qat import (
    QATConfig,
    IntxFakeQuantizeConfig,
)

model_id = "facebook/MobileLLM-Pro"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True
)

# Prepare for QAT.
# 8-bit dynamic per-token quantization for activations
activation_config = IntxFakeQuantizeConfig(
    torch.int8, "per_token", is_symmetric=False,
)
# 4-bit symmetric weight quantization with group size 32
weight_config = IntxFakeQuantizeConfig(
    torch.int4,
    group_size=32,
    is_symmetric=True,
    is_dynamic=True,
)
qat_config = QATConfig(
    activation_config=activation_config,
    weight_config=weight_config,
    step="prepare",
)
quantize_(model, qat_config)

embedding_filter_fn = lambda m, fqn: isinstance(m, torch.nn.Embedding)
embedding_qat_config = IntxFakeQuantizeConfig(
    torch.int4,
    group_size=32,
    is_symmetric=True,
    is_dynamic=True,
)
quantize_(
    model,
    QATConfig(
        weight_config=embedding_qat_config,
        step="prepare"
    ),
    embedding_filter_fn
)

# The model is now ready for Quantization-Aware Training (QAT)
# trainer.train()
model.save_pretrained(
    save_directory="<QAT_save_directory>",
    safe_serialization=False
)

# Convert model after training
from torchao.quantization import (
    IntxWeightOnlyConfig,
    Int8DynamicActivationIntxWeightConfig
)
from torchao.quantization.granularity import PerGroup

qat_convert_config = QATConfig(
    Int8DynamicActivationIntxWeightConfig(
        weight_dtype=torch.int4,
        weight_granularity=PerGroup(32),
    ),
    step="convert",
)
quantize_(model, qat_convert_config)
embedding_convert_config = IntxWeightOnlyConfig(
    weight_dtype=torch.int4,
    granularity=PerGroup(32)
)
quantize_(
    model,
    QATConfig(
        embedding_convert_config,
        step="convert"
    ),
    embedding_filter_fn
)

# Save model after convert
model.save_pretrained(
    save_directory="<quantized_model_directory>",
    safe_serialization=False
)
```

#### 4-bit Channelwise Quantization

```python
import torch
from transformers import AutoModelForCausalLM
from torchao.quantization import quantize_
from torchao.quantization.granularity import PerAxis
from torchao.quantization.qat import (
    initialize_fake_quantizers,
    IntxFakeQuantizeConfig,
    QATConfig
)

model_id = "facebook/MobileLLM-Pro"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True
)

# 4-bit per-channel with range_learning=True for weights
weight_config = IntxFakeQuantizeConfig(
    torch.int4,
    granularity=PerAxis(0),
    is_symmetric=True,
    is_dynamic=False,
    range_learning=True,
)
qat_config = QATConfig(
    weight_config=weight_config,
    step="prepare",
)
quantize_(model, qat_config)

embedding_filter_fn = lambda m, fqn: isinstance(m, torch.nn.Embedding)
quantize_(model, qat_config, embedding_filter_fn)

# Initialize the fake quantizers for range learning
example_inputs = (torch.tensor([[1]], dtype=torch.long),)
initialize_fake_quantizers(model, example_inputs)

# The model is now ready for Quantization-Aware Training (QAT)
# trainer.train()
model.save_pretrained(
    save_directory="<QAT_save_directory>",
    safe_serialization=False
)

# Convert model after training
from torchao.quantization import IntxWeightOnlyConfig

wt_convert_config = IntxWeightOnlyConfig(
    weight_dtype=torch.int4,
    granularity=PerAxis(0)
)
qat_convert_config = QATConfig(
    wt_convert_config,
    step="convert",
)
quantize_(model, qat_convert_config)
quantize_(model, qat_convert_config, embedding_filter_fn)

# Save model after convert
model.save_pretrained(
    save_directory="<quantized_model_directory>",
    safe_serialization=False
)
```

## Latency benchmarking

Latency benchmarking was done on a Samsung Galaxy S25 CPU and a Samsung Galaxy S24 Hexagon Tensor Processor (HTP). Models were exported to ExecuTorch with the XNNPACK backend (for CPU) and the HTP backend (for the accelerator). The size of the CPU model with 4-bit groupwise quantization is 590MB. CPU and HTP prefill latency for input prompt lengths of 2k, 4k and 8k, along with decode speed when generating 1k tokens, is shown in the following table.
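The 590MB figure is roughly what 4-bit groupwise packing predicts. A back-of-envelope estimate, assuming 4-bit weights, one 16-bit scale per group of 32 weights, and ignoring zero-points and non-weight tensors (none of which the card specifies):

```python
# Rough estimate of the 4-bit groupwise model size.
PARAMS = 1_084_000_000  # total parameters from the Model Information section
GROUP_SIZE = 32

weights_mb = PARAMS * 0.5 / 1e6            # 4 bits = 0.5 bytes per weight
scales_mb = PARAMS / GROUP_SIZE * 2 / 1e6  # assumed 16-bit scale per group
print(f"~{weights_mb + scales_mb:.0f} MB")  # ~610 MB
```

The estimate lands close to the reported 590MB; the remaining gap plausibly comes from some tensors being packed differently in the actual export, so treat this purely as a sanity check.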

| Model / Prompt length | 2k | 4k | 8k |
|---------------------------|--------|--------|-------|
| CPU Prefill Latency (s) | 8.9 | 24.8 | 63.5 |
| CPU Decode Speed (tok/s) | 33.6 | 24.8 | 19.7 |
| HTP Prefill Latency (s) | 1.96 | 3.38 | 9.82 |
| HTP Decode Speed (tok/s) | 31.60 | 28.95 | 22.77 |
| KV Cache Size (MB) | 14 | 23 | 40 |

To validate the benefit of interleaved local-global attention (LGA), we benchmark models across different prompt lengths and measure the speed-up in prefill and decode relative to using global attention at every layer:

![image](https://cdn-uploads.huggingface.co/production/uploads/68c1aa07c02e455d06f93a42/_p8JT_Wtljwyp23TmKsTc.png)

## Citation

```bibtex
@misc{mobilellm_pro,
  title={MobileLLM-Pro Model Card},
  author={Patrick Huber* and Ernie Chang* and Wei Wen* and Igor Fedorov* and Tarek Elgamal and Hanxian Huang and Naveen Suda and Chinnadhurai Sankar and Vish Vogeti and Yanghan Wang and Alex Gladkov and Kai Sheng Tai and Abdelrahman Elogeel and Tarek Hefny and Vikas Chandra and Ahmed Aly and Anuj Kumar and Raghuraman Krishnamoorthi** and Adithya Sagar**},
  year={2025},
  month={October},
  url={https://huggingface.co/facebook/MobileLLM-Pro}
}
```

## Contact

Patrick Huber, Meta Inc, Reality Labs ([patrickhuber@meta.com](mailto:patrickhuber@meta.com))<br>
Ernie Chang, Meta Inc, Reality Labs ([erniecyc@meta.com](mailto:erniecyc@meta.com))<br>
Wei Wen, Meta Inc, Reality Labs ([wewen@meta.com](mailto:wewen@meta.com))<br>
Igor Fedorov, Meta Inc, Reality Labs ([ifedorov@meta.com](mailto:ifedorov@meta.com))<br>
Raghuraman Krishnamoorthi, Meta Inc, Reality Labs ([raghuraman@meta.com](mailto:raghuraman@meta.com))<br>
Adithya Sagar, Meta Inc, Reality Labs ([adithyasagar@meta.com](mailto:adithyasagar@meta.com))

## Acknowledgements

We want to thank the team involved in this project, especially: Kimish Patel, Andrew Or, Min Guo, Shen Xu, Brian Moran, Maho Takahashi, Claire Lesage, Rylan Conway, Karan Chadha, Matthew Grange, Tomasz Wołcyrz, Shiv Desai, Amarlin Anand, Joele Sires, Robert Carrillo, Francisc Bungiu, Jayden Yu, AJ Brush, Yang Li, Samuel Selvan, Anand Sharma, Peng Shan, Anand Dass, Abhishek Sharma.

## License

MobileLLM-Pro is distributed under the [FAIR NC license](https://huggingface.co/facebook/MobileLLM-Pro/blob/main/LICENSE).
base/config.json ADDED
@@ -0,0 +1,118 @@
1
+ {
2
+ "architectures": [
3
+ "MobileLLMP1ForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_mobilellm_p1.MobileLLMP1TextConfig",
7
+ "AutoModelForCausalLM": "modeling_mobilellm_p1.MobileLLMP1ForCausalLM"
8
+ },
9
+ "attention_bias": false,
10
+ "attention_chunk_size": 8192,
11
+ "attention_dropout": 0.0,
12
+ "attn_scale": 0.1,
13
+ "bos_token_id": 200000,
14
+ "eos_token_id": [
15
+ 200001,
16
+ 200007,
17
+ 200008
18
+ ],
19
+ "pad_token_id": 200018,
20
+ "for_llm_compressor": false,
21
+ "head_dim": 64,
22
+ "hidden_act": "silu",
23
+ "hidden_size": 1280,
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 6144,
26
+ "intermediate_size_mlp": 6144,
27
+ "layer_types": [
28
+ "full_attention",
29
+ "sliding_attention",
30
+ "sliding_attention",
31
+ "sliding_attention",
32
+ "full_attention",
33
+ "sliding_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "full_attention",
37
+ "sliding_attention",
38
+ "sliding_attention",
39
+ "sliding_attention",
40
+ "full_attention",
41
+ "sliding_attention",
42
+ "sliding_attention",
43
+ "sliding_attention",
44
+ "full_attention",
45
+ "sliding_attention",
46
+ "sliding_attention",
47
+ "sliding_attention",
48
+ "full_attention",
49
+ "sliding_attention",
50
+ "sliding_attention",
51
+ "sliding_attention",
52
+ "full_attention",
53
+ "sliding_attention",
54
+ "sliding_attention",
55
+ "sliding_attention",
56
+ "full_attention",
57
+ "full_attention"
58
+ ],
59
+ "sliding_window": 512,
60
+ "max_position_embeddings": 131072,
61
+ "model_type": "llama4_text",
62
+ "moe_layers": [],
63
+ "no_rope_layers": [
64
+ 1,
65
+ 1,
66
+ 1,
67
+ 1,
68
+ 1,
69
+ 1,
70
+ 1,
71
+ 1,
72
+ 1,
73
+ 1,
74
+ 1,
75
+ 1,
76
+ 1,
77
+ 1,
78
+ 1,
79
+ 1,
80
+ 1,
81
+ 1,
82
+ 1,
83
+ 1,
84
+ 1,
85
+ 1,
86
+ 1,
87
+ 1,
88
+ 1,
89
+ 1,
90
+ 1,
91
+ 1,
92
+ 1,
93
+ 1
94
+ ],
95
+ "num_attention_heads": 20,
96
+ "num_experts_per_tok": 0,
97
+ "num_hidden_layers": 30,
98
+ "num_key_value_heads": 4,
99
+ "num_local_experts": 0,
100
+ "output_router_logits": false,
101
+ "rms_norm_eps": 1e-05,
102
+ "rope_scaling": {
103
+ "factor": 16.0,
104
+ "high_freq_factor": 1.0,
105
+ "low_freq_factor": 1.0,
106
+ "original_max_position_embeddings": 8192,
107
+ "rope_type": "llama3"
108
+ },
109
+ "rope_theta": 500000.0,
110
+ "router_aux_loss_coef": 0.001,
111
+ "router_jitter_noise": 0.0,
112
+ "tie_word_embeddings": true,
113
+ "torch_dtype": "bfloat16",
114
+ "transformers_version": "4.53.0.dev0",
115
+ "use_cache": true,
116
+ "use_qk_norm": false,
117
+ "vocab_size": 202048
118
+ }
base/generation_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+     "_from_model_config": true,
+     "bos_token_id": 200000,
+     "eos_token_id": [
+         200001,
+         200007,
+         200008
+     ],
+     "pad_token_id": 200018,
+     "temperature": 0.6,
+     "top_p": 0.9,
+     "do_sample": true,
+     "transformers_version": "4.55.0"
+ }
base/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0fbc0be0c62f63ef9770116b4e7081db712ed147c157be324eb5990daa67293b
+ size 2168938424
base/special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "bos_token": "<|begin_of_text|>",
+     "eos_token": "<|eot|>",
+     "pad_token": "<|finetune_right_pad|>"
+ }
base/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:172c9eb4beafc72601690da3ccfcede5c2e6806a8d5ec1fca33e22acea8023a4
+ size 27948578
base/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d0bdbaf59b0762c8c807617e2d8ea51420eb1b1de266df2495be755c8e0ed6ed
+ size 3622230
base/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
base_int4_accelerator/config.json ADDED
@@ -0,0 +1,118 @@
1
+ {
2
+ "architectures": [
3
+ "MobileLLMP1ForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_mobilellm_p1.MobileLLMP1TextConfig",
7
+ "AutoModelForCausalLM": "modeling_mobilellm_p1.MobileLLMP1ForCausalLM"
8
+ },
9
+ "attention_bias": false,
10
+ "attention_chunk_size": 8192,
11
+ "attention_dropout": 0.0,
12
+ "attn_scale": 0.1,
13
+ "bos_token_id": 200000,
14
+ "eos_token_id": [
15
+ 200001,
16
+ 200007,
17
+ 200008
18
+ ],
19
+ "pad_token_id": 200018,
20
+ "for_llm_compressor": false,
21
+ "head_dim": 64,
22
+ "hidden_act": "silu",
23
+ "hidden_size": 1280,
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 6144,
26
+ "intermediate_size_mlp": 6144,
27
+ "layer_types": [
28
+ "full_attention",
29
+ "sliding_attention",
30
+ "sliding_attention",
31
+ "sliding_attention",
32
+ "full_attention",
33
+ "sliding_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "full_attention",
37
+ "sliding_attention",
38
+ "sliding_attention",
39
+ "sliding_attention",
40
+ "full_attention",
41
+ "sliding_attention",
42
+ "sliding_attention",
43
+ "sliding_attention",
44
+ "full_attention",
45
+ "sliding_attention",
46
+ "sliding_attention",
47
+ "sliding_attention",
48
+ "full_attention",
49
+ "sliding_attention",
50
+ "sliding_attention",
51
+ "sliding_attention",
52
+ "full_attention",
53
+ "sliding_attention",
54
+ "sliding_attention",
55
+ "sliding_attention",
56
+ "full_attention",
57
+ "full_attention"
58
+ ],
59
+ "sliding_window": 512,
60
+ "max_position_embeddings": 131072,
61
+ "model_type": "llama4_text",
62
+ "moe_layers": [],
63
+ "no_rope_layers": [
64
+ 1,
65
+ 1,
66
+ 1,
67
+ 1,
68
+ 1,
69
+ 1,
70
+ 1,
71
+ 1,
72
+ 1,
73
+ 1,
74
+ 1,
75
+ 1,
76
+ 1,
77
+ 1,
78
+ 1,
79
+ 1,
80
+ 1,
81
+ 1,
82
+ 1,
83
+ 1,
84
+ 1,
85
+ 1,
86
+ 1,
87
+ 1,
88
+ 1,
89
+ 1,
90
+ 1,
91
+ 1,
92
+ 1,
93
+ 1
94
+ ],
95
+ "num_attention_heads": 20,
96
+ "num_experts_per_tok": 0,
97
+ "num_hidden_layers": 30,
98
+ "num_key_value_heads": 4,
99
+ "num_local_experts": 0,
100
+ "output_router_logits": false,
101
+ "rms_norm_eps": 1e-05,
102
+ "rope_scaling": {
103
+ "factor": 16.0,
104
+ "high_freq_factor": 1.0,
105
+ "low_freq_factor": 1.0,
106
+ "original_max_position_embeddings": 8192,
107
+ "rope_type": "llama3"
108
+ },
109
+ "rope_theta": 500000.0,
110
+ "router_aux_loss_coef": 0.001,
111
+ "router_jitter_noise": 0.0,
112
+ "tie_word_embeddings": true,
113
+ "torch_dtype": "bfloat16",
114
+ "transformers_version": "4.53.0.dev0",
115
+ "use_cache": true,
116
+ "use_qk_norm": false,
117
+ "vocab_size": 202048
118
+ }
base_int4_accelerator/generation_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+     "_from_model_config": true,
+     "bos_token_id": 200000,
+     "eos_token_id": [
+         200001,
+         200007,
+         200008
+     ],
+     "pad_token_id": 200018,
+     "temperature": 0.6,
+     "top_p": 0.9,
+     "do_sample": true,
+     "transformers_version": "4.55.0"
+ }
base_int4_accelerator/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:12fef2380a85b302729d08d73a34b399e32afa04c1d503a85252915588afae1d
+ size 2170471151
base_int4_accelerator/special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "bos_token": "<|begin_of_text|>",
+     "eos_token": "<|eot|>",
+     "pad_token": "<|finetune_right_pad|>"
+ }
base_int4_accelerator/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:172c9eb4beafc72601690da3ccfcede5c2e6806a8d5ec1fca33e22acea8023a4
+ size 27948578
base_int4_accelerator/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d0bdbaf59b0762c8c807617e2d8ea51420eb1b1de266df2495be755c8e0ed6ed
+ size 3622230
base_int4_accelerator/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
base_int4_cpu/config.json ADDED
@@ -0,0 +1,118 @@
1
+ {
2
+ "architectures": [
3
+ "MobileLLMP1ForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_mobilellm_p1.MobileLLMP1TextConfig",
7
+ "AutoModelForCausalLM": "modeling_mobilellm_p1.MobileLLMP1ForCausalLM"
8
+ },
9
+ "attention_bias": false,
10
+ "attention_chunk_size": 8192,
11
+ "attention_dropout": 0.0,
12
+ "attn_scale": 0.1,
13
+ "bos_token_id": 200000,
14
+ "eos_token_id": [
15
+ 200001,
16
+ 200007,
17
+ 200008
18
+ ],
19
+ "pad_token_id": 200018,
20
+ "for_llm_compressor": false,
21
+ "head_dim": 64,
22
+ "hidden_act": "silu",
23
+ "hidden_size": 1280,
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 6144,
26
+ "intermediate_size_mlp": 6144,
27
+ "layer_types": [
28
+ "full_attention",
29
+ "sliding_attention",
30
+ "sliding_attention",
31
+ "sliding_attention",
32
+ "full_attention",
33
+ "sliding_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "full_attention",
37
+ "sliding_attention",
38
+ "sliding_attention",
39
+ "sliding_attention",
40
+ "full_attention",
41
+ "sliding_attention",
42
+ "sliding_attention",
43
+ "sliding_attention",
44
+ "full_attention",
45
+ "sliding_attention",
46
+ "sliding_attention",
47
+ "sliding_attention",
48
+ "full_attention",
49
+ "sliding_attention",
50
+ "sliding_attention",
51
+ "sliding_attention",
52
+ "full_attention",
53
+ "sliding_attention",
54
+ "sliding_attention",
55
+ "sliding_attention",
56
+ "full_attention",
57
+ "full_attention"
58
+ ],
59
+ "sliding_window": 512,
60
+ "max_position_embeddings": 131072,
61
+ "model_type": "llama4_text",
62
+ "moe_layers": [],
63
+ "no_rope_layers": [
64
+ 1,
65
+ 1,
66
+ 1,
67
+ 1,
68
+ 1,
69
+ 1,
70
+ 1,
71
+ 1,
72
+ 1,
73
+ 1,
74
+ 1,
75
+ 1,
76
+ 1,
77
+ 1,
78
+ 1,
79
+ 1,
80
+ 1,
81
+ 1,
82
+ 1,
83
+ 1,
84
+ 1,
85
+ 1,
86
+ 1,
87
+ 1,
88
+ 1,
89
+ 1,
90
+ 1,
91
+ 1,
92
+ 1,
93
+ 1
94
+ ],
95
+ "num_attention_heads": 20,
96
+ "num_experts_per_tok": 0,
97
+ "num_hidden_layers": 30,
98
+ "num_key_value_heads": 4,
99
+ "num_local_experts": 0,
100
+ "output_router_logits": false,
101
+ "rms_norm_eps": 1e-05,
102
+ "rope_scaling": {
103
+ "factor": 16.0,
104
+ "high_freq_factor": 1.0,
105
+ "low_freq_factor": 1.0,
106
+ "original_max_position_embeddings": 8192,
107
+ "rope_type": "llama3"
108
+ },
109
+ "rope_theta": 500000.0,
110
+ "router_aux_loss_coef": 0.001,
111
+ "router_jitter_noise": 0.0,
112
+ "tie_word_embeddings": true,
113
+ "torch_dtype": "bfloat16",
114
+ "transformers_version": "4.53.0.dev0",
115
+ "use_cache": true,
116
+ "use_qk_norm": false,
117
+ "vocab_size": 202048
118
+ }
base_int4_cpu/generation_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+     "_from_model_config": true,
+     "bos_token_id": 200000,
+     "eos_token_id": [
+         200001,
+         200007,
+         200008
+     ],
+     "pad_token_id": 200018,
+     "temperature": 0.6,
+     "top_p": 0.9,
+     "do_sample": true,
+     "transformers_version": "4.55.0"
+ }
base_int4_cpu/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5be59329a78ba1bc9a1be7a439355e259b635834187136ecb0218fa1ee9d3afb
+ size 2168995683
base_int4_cpu/special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "bos_token": "<|begin_of_text|>",
+     "eos_token": "<|eot|>",
+     "pad_token": "<|finetune_right_pad|>"
+ }
base_int4_cpu/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:172c9eb4beafc72601690da3ccfcede5c2e6806a8d5ec1fca33e22acea8023a4
+ size 27948578
base_int4_cpu/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d0bdbaf59b0762c8c807617e2d8ea51420eb1b1de266df2495be755c8e0ed6ed
+ size 3622230
base_int4_cpu/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,118 @@
1
+ {
2
+ "architectures": [
3
+ "MobileLLMP1ForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_mobilellm_p1.MobileLLMP1TextConfig",
7
+ "AutoModelForCausalLM": "modeling_mobilellm_p1.MobileLLMP1ForCausalLM"
8
+ },
9
+ "attention_bias": false,
10
+ "attention_chunk_size": 8192,
11
+ "attention_dropout": 0.0,
12
+ "attn_scale": 0.1,
13
+ "bos_token_id": 200000,
14
+ "eos_token_id": [
15
+ 200001,
16
+ 200007,
17
+ 200008
18
+ ],
19
+ "pad_token_id": 200018,
20
+ "for_llm_compressor": false,
21
+ "head_dim": 64,
22
+ "hidden_act": "silu",
23
+ "hidden_size": 1280,
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 6144,
26
+ "intermediate_size_mlp": 6144,
27
+ "layer_types": [
28
+ "full_attention",
29
+ "sliding_attention",
30
+ "sliding_attention",
31
+ "sliding_attention",
32
+ "full_attention",
33
+ "sliding_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "full_attention",
37
+ "sliding_attention",
38
+ "sliding_attention",
39
+ "sliding_attention",
40
+ "full_attention",
41
+ "sliding_attention",
42
+ "sliding_attention",
43
+ "sliding_attention",
44
+ "full_attention",
45
+ "sliding_attention",
46
+ "sliding_attention",
47
+ "sliding_attention",
48
+ "full_attention",
49
+ "sliding_attention",
50
+ "sliding_attention",
51
+ "sliding_attention",
52
+ "full_attention",
53
+ "sliding_attention",
54
+ "sliding_attention",
55
+ "sliding_attention",
56
+ "full_attention",
57
+ "full_attention"
58
+ ],
59
+ "sliding_window": 512,
60
+ "max_position_embeddings": 131072,
61
+ "model_type": "llama4_text",
62
+ "moe_layers": [],
63
+ "no_rope_layers": [
64
+ 1,
65
+ 1,
66
+ 1,
67
+ 1,
68
+ 1,
69
+ 1,
70
+ 1,
71
+ 1,
72
+ 1,
73
+ 1,
74
+ 1,
75
+ 1,
76
+ 1,
77
+ 1,
78
+ 1,
79
+ 1,
80
+ 1,
81
+ 1,
82
+ 1,
83
+ 1,
84
+ 1,
85
+ 1,
86
+ 1,
87
+ 1,
88
+ 1,
89
+ 1,
90
+ 1,
91
+ 1,
92
+ 1,
93
+ 1
94
+ ],
95
+ "num_attention_heads": 20,
96
+ "num_experts_per_tok": 0,
97
+ "num_hidden_layers": 30,
98
+ "num_key_value_heads": 4,
99
+ "num_local_experts": 0,
100
+ "output_router_logits": false,
101
+ "rms_norm_eps": 1e-05,
102
+ "rope_scaling": {
103
+ "factor": 16.0,
104
+ "high_freq_factor": 1.0,
105
+ "low_freq_factor": 1.0,
106
+ "original_max_position_embeddings": 8192,
107
+ "rope_type": "llama3"
108
+ },
109
+ "rope_theta": 500000.0,
110
+ "router_aux_loss_coef": 0.001,
111
+ "router_jitter_noise": 0.0,
112
+ "tie_word_embeddings": true,
113
+ "torch_dtype": "bfloat16",
114
+ "transformers_version": "4.53.0.dev0",
115
+ "use_cache": true,
116
+ "use_qk_norm": false,
117
+ "vocab_size": 202048
118
+ }
configuration_mobilellm_p1.py ADDED
@@ -0,0 +1,270 @@
+ from transformers.configuration_utils import PretrainedConfig
+
+
+ class MobileLLMP1TextConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`MobileLLMP1TextModel`]. It is used to instantiate a
+     MobileLLMP1 text model according to the specified arguments, defining the model architecture. Instantiating a
+     configuration with the defaults will yield a similar configuration to that of the MobileLLMP1 1B model.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 202048):
+             Vocabulary size of the MobileLLMP1 text model. Defines the maximum number of different tokens that can be
+             represented by the `inputs_ids` passed when calling [`MobileLLMP1TextModel`].
+         hidden_size (`int`, *optional*, defaults to 1280):
+             Dimensionality of the embeddings and hidden states.
+         intermediate_size (`int`, *optional*, defaults to 6144):
+             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+         intermediate_size_mlp (`int`, *optional*, defaults to 6144): TODO
+         num_hidden_layers (`int`, *optional*, defaults to 30):
+             Number of hidden layers in the Transformer encoder.
+         num_attention_heads (`int`, *optional*, defaults to 20):
+             Number of attention heads for each attention layer in the Transformer encoder.
+         num_key_value_heads (`int`, *optional*, defaults to 4):
+             This is the number of key_value heads that should be used to implement Grouped Query Attention. If not
+             specified, will default to `num_attention_heads`.
+         head_dim (`int`, *optional*, defaults to 64): TODO
+         hidden_act (`str` or `Callable`, *optional*, defaults to `"silu"`):
+             The non-linear activation function (function or string) in the encoder and pooler.
+         max_position_embeddings (`int`, *optional*, defaults to 131072):
+             The maximum sequence length that this model might ever be used with.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         sliding_window (`int`, *optional*, defaults to 512):
+             In MobileLLMP1, every 3 out of 4 layers use sliding window attention. This is the size of the sliding window.
+         rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+             The epsilon used by the rms normalization layers.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions.
+         pad_token_id (`int`, *optional*):
+             The id of the padding token.
+         bos_token_id (`int`, *optional*, defaults to 1):
+             The id of the beginning of sentence token.
+         eos_token_id (`int`, *optional*, defaults to 2):
+             The id of the end of sentence token.
+         tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+             Whether to tie weight embeddings.
+         rope_theta (`float`, *optional*, defaults to 500000.0):
+             The base period of the RoPE embeddings.
+         attention_dropout (`float`, *optional*, defaults to 0.0): TODO
+         num_experts_per_tok (`int`, *optional*, defaults to 1): TODO
+         num_local_experts (`int`, *optional*, defaults to 16): TODO
+         moe_layers (`list[int]`, *optional*): TODO
+         interleave_moe_layer_step (`int`, *optional*, defaults to 1): TODO
+         use_qk_norm (`bool`, *optional*, defaults to `False`): TODO
+         output_router_logits (`bool`, *optional*, defaults to `False`): TODO
+         router_aux_loss_coef (`float`, *optional*, defaults to 0.001): TODO
+         router_jitter_noise (`float`, *optional*, defaults to 0.0): TODO
+         rope_scaling (`Dict`, *optional*):
+             Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope type
+             and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+             accordingly.
+             Expected contents:
+                 `rope_type` (`str`):
+                     The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                     'llama3'], with 'default' being the original RoPE implementation.
+                 `factor` (`float`, *optional*):
+                     Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                     most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                     original maximum pre-trained length.
+                 `original_max_position_embeddings` (`int`, *optional*):
+                     Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                     pretraining.
+                 `attention_factor` (`float`, *optional*):
+                     Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                     computation. If unspecified, it defaults to the value recommended by the implementation, using the
+                     `factor` field to infer the suggested value.
+                 `beta_fast` (`float`, *optional*):
+                     Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                     ramp function. If unspecified, it defaults to 32.
+                 `beta_slow` (`float`, *optional*):
+                     Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                     ramp function. If unspecified, it defaults to 1.
+                 `short_factor` (`list[float]`, *optional*):
+                     Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                     `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                     size divided by the number of attention heads divided by 2.
+                 `long_factor` (`list[float]`, *optional*):
+                     Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                     `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                     size divided by the number of attention heads divided by 2.
+                 `low_freq_factor` (`float`, *optional*):
+                     Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE.
+                 `high_freq_factor` (`float`, *optional*):
+                     Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE.
+         <TODO>
+         <TODO>
+         no_rope_layers (`list[int]`, *optional*):
+             List with at least the same length as the number of layers in the model.
+             A `1` at an index position indicates that the corresponding layer will use RoPE,
+             while a `0` indicates that it's a NoPE layer.
+         no_rope_layer_interval (`int`, *optional*, defaults to 4):
+             If `no_rope_layers` is `None`, it will be created using a NoPE layer every
+             `no_rope_layer_interval` layers.
+         attention_chunk_size (`int`, *optional*, defaults to 8192):
+             <TODO>
+         layer_types (`list`, *optional*):
+             Attention pattern for each layer.
+         attn_temperature_tuning (`bool`, *optional*, defaults to `True`):
+             Whether to dynamically scale the attention temperature for each query token based on sequence length.
+             Recommended for long sequences (e.g., >32k tokens) to maintain stable output results.
+         floor_scale (`int`, *optional*, defaults to 8192): TODO
+         attn_scale (`float`, *optional*, defaults to 0.1): TODO
+
+     Example:
+     """
+
+     model_type = "llama4_text"
+     keys_to_ignore_at_inference = ["past_key_values"]
+     base_model_tp_plan = {
+         "layers.*.self_attn.q_proj": "colwise",
+         "layers.*.self_attn.k_proj": "colwise",
+         "layers.*.self_attn.v_proj": "colwise",
+         "layers.*.self_attn.o_proj": "rowwise",
+         "layers.*.feed_forward.shared_expert.gate_proj": "local_colwise",
+         "layers.*.feed_forward.shared_expert.up_proj": "local_colwise",
+         "layers.*.feed_forward.shared_expert.down_proj": "local_rowwise",
+         "layers.*.feed_forward.experts.gate_up_proj": "local_packed_rowwise",  # row because not linear
+         "layers.*.feed_forward.experts.down_proj": "local_colwise",  # col because not linear
+         "layers.*.feed_forward.experts": "local",
+         "layers.*.feed_forward.gate_proj": "local_colwise",
+         "layers.*.feed_forward.up_proj": "local_colwise",
+         "layers.*.feed_forward.down_proj": "local_rowwise",
+         "layers.*.feed_forward": "gather",
+     }
+     base_model_ep_plan = {
+         "layers.*.self_attn.q_proj": "colwise",
+         "layers.*.self_attn.k_proj": "colwise",
+         "layers.*.self_attn.v_proj": "colwise",
+         "layers.*.self_attn.o_proj": "rowwise",
+         "layers.*.feed_forward.experts.gate_up_proj": "grouped_gemm",  # row because not linear
+         "layers.*.feed_forward.experts.down_proj": "grouped_gemm",  # col because not linear
+         "layers.*.feed_forward.experts": "gather",  # all reduce
+         "layers.*.feed_forward.gate_proj": "local_colwise",
+         "layers.*.feed_forward.up_proj": "local_colwise",
+         "layers.*.feed_forward.down_proj": "local_rowwise",
+         "layers.*.feed_forward.router": "ep_router",
+     }
+
+     def __init__(
+         self,
+         vocab_size=202048,
+         hidden_size=1280,
+         intermediate_size=6144,
+         intermediate_size_mlp=6144,
+         num_hidden_layers=30,
+         num_attention_heads=20,
+         num_key_value_heads=4,
+         head_dim=64,
+         hidden_act="silu",
+         max_position_embeddings=131072,
+         initializer_range=0.02,
+         rms_norm_eps=1e-5,
+         use_cache=True,
+         pad_token_id=None,
+         sliding_window=512,
+         bos_token_id=1,
+         eos_token_id=2,
+         tie_word_embeddings=True,
+         rope_theta=500000,
+         attention_dropout=0.0,
+         num_experts_per_tok=1,
+         num_local_experts=16,
+         moe_layers=None,
+         interleave_moe_layer_step=1,
+         use_qk_norm=False,
+         output_router_logits=False,
+         router_aux_loss_coef=0.001,
+         router_jitter_noise=0.0,
+         rope_scaling=None,
+         no_rope_layers=None,
+         no_rope_layer_interval=4,
+         attention_chunk_size=8192,
+         layer_types=None,
+         attn_temperature_tuning=True,
+         floor_scale=8192,
+         attn_scale=0.1,
+         **kwargs,
+     ):
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
+         self.attn_temperature_tuning = attn_temperature_tuning
+         self.attn_scale = attn_scale
+         self.floor_scale = floor_scale
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.intermediate_size_mlp = intermediate_size_mlp
+         self.num_hidden_layers = num_hidden_layers
+         self.sliding_window = sliding_window
+         self.num_attention_heads = num_attention_heads
+         self.rope_scaling = rope_scaling
+         self.attention_bias = False
+         # for backward compatibility
+         if num_key_value_heads is None:
+             num_key_value_heads = num_attention_heads
+
+         self.num_key_value_heads = num_key_value_heads
+         self.hidden_act = hidden_act
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.attention_dropout = attention_dropout
+         self.head_dim = (
+             head_dim
+             if head_dim is not None
+             else self.hidden_size // self.num_attention_heads
+         )
+         self.use_qk_norm = use_qk_norm
+
+         self.num_experts_per_tok = num_experts_per_tok
+         self.num_local_experts = num_local_experts
+
+         self.output_router_logits = output_router_logits
+         self.router_aux_loss_coef = router_aux_loss_coef
+         self.router_jitter_noise = router_jitter_noise
+         self.layer_types = layer_types
+
+         # Backwards compatibility
+         if no_rope_layers == []:
+             no_rope_layers = None
+
+         default_no_rope_layers = [
+             int((layer_idx + 1) % no_rope_layer_interval != 0)
+             for layer_idx in range(self.num_hidden_layers)
+         ]
+
+         self.no_rope_layers = (
+             no_rope_layers if no_rope_layers else default_no_rope_layers
+         )
+
+         # If no pattern is set, use our default pattern: full attention every
+         # 4th layer, and the last layer is always full attention.
+         if self.layer_types is None:
+             self.layer_types = [
+                 "sliding_attention" if bool(i % 4) else "full_attention"
+                 for i in range(self.num_hidden_layers - 1)
+             ] + [
+                 "full_attention"
+             ]
+
+         self.interleave_moe_layer_step = interleave_moe_layer_step
+         self.moe_layers = (
+             moe_layers
+             if moe_layers is not None
+             else list(
+                 range(
+                     interleave_moe_layer_step - 1,
+                     num_hidden_layers,
+                     interleave_moe_layer_step,
+                 )
+             )
+         )
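Since the shipped `config.json` files enumerate `layer_types` explicitly, the interleaved local-global layout can be reproduced standalone. A small sketch (a hypothetical helper, written to match the 30-entry pattern in `config.json` rather than being the class's own code): every layer whose index is a multiple of 4 is global, the rest are sliding-window, and the final layer is forced to full attention.

```python
def default_layer_types(num_hidden_layers: int) -> list:
    # Full (global) attention every 4th layer, sliding-window (local)
    # attention elsewhere; the final layer is always full attention,
    # matching the explicit layer_types list shipped in config.json.
    types = [
        "full_attention" if i % 4 == 0 else "sliding_attention"
        for i in range(num_hidden_layers)
    ]
    types[-1] = "full_attention"
    return types

pattern = default_layer_types(30)
```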
instruct/chat_template.jinja ADDED
@@ -0,0 +1,18 @@
+ {{- bos_token }}
+ {%- set intro = "You are MobileLLM Pro, a helpful assistant created by Meta Reality Labs." %}
+ {%- if messages[0]['role'] == 'system' %}
+ {%- set user_system_message = messages[0]['content']|trim %}
+ {%- set system_message = intro + "\n" + user_system_message %}
+ {%- set messages = messages[1:] %}
+ {%- else %}
+ {%- set system_message = intro %}
+ {%- endif %}
+ {{- "<|header_start|>system<|header_end|>\n\n" }}
+ {{- system_message }}
+ {{- "<|eot|>" }}
+ {%- for message in messages %}
+ {{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' + message['content'] | trim + '<|eot|>' }}
+ {%- endfor %}
+ {%- if add_generation_prompt %}
+ {{- '<|header_start|>assistant<|header_end|>\n\n' }}
+ {%- endif %}
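For readers without a Jinja renderer handy, the template above can be mirrored in plain Python. This is a sketch of the template's logic, not the canonical `tokenizer.apply_chat_template` path: fold an optional user system message into the built-in intro, then emit header/eot-delimited turns.

```python
INTRO = "You are MobileLLM Pro, a helpful assistant created by Meta Reality Labs."

def render_chat(messages, bos_token="<|begin_of_text|>", add_generation_prompt=True):
    # Mirror of instruct/chat_template.jinja.
    if messages and messages[0]["role"] == "system":
        system_message = INTRO + "\n" + messages[0]["content"].strip()
        messages = messages[1:]
    else:
        system_message = INTRO
    out = bos_token + "<|header_start|>system<|header_end|>\n\n" + system_message + "<|eot|>"
    for m in messages:
        out += "<|header_start|>" + m["role"] + "<|header_end|>\n\n" + m["content"].strip() + "<|eot|>"
    if add_generation_prompt:
        out += "<|header_start|>assistant<|header_end|>\n\n"
    return out

prompt = render_chat([{"role": "user", "content": "Hello!"}])
```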
instruct/config.json ADDED
@@ -0,0 +1,118 @@
1
+ {
2
+ "architectures": [
3
+ "MobileLLMP1ForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_mobilellm_p1.MobileLLMP1TextConfig",
7
+ "AutoModelForCausalLM": "modeling_mobilellm_p1.MobileLLMP1ForCausalLM"
8
+ },
9
+ "attention_bias": false,
10
+ "attention_chunk_size": 8192,
11
+ "attention_dropout": 0.0,
12
+ "attn_scale": 0.1,
13
+ "bos_token_id": 200000,
14
+ "eos_token_id": [
15
+ 200001,
16
+ 200007,
17
+ 200008
18
+ ],
19
+ "pad_token_id": 200018,
20
+ "for_llm_compressor": false,
21
+ "head_dim": 64,
22
+ "hidden_act": "silu",
23
+ "hidden_size": 1280,
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 6144,
26
+ "intermediate_size_mlp": 6144,
27
+ "layer_types": [
28
+ "full_attention",
29
+ "sliding_attention",
30
+ "sliding_attention",
31
+ "sliding_attention",
32
+ "full_attention",
33
+ "sliding_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "full_attention",
37
+ "sliding_attention",
38
+ "sliding_attention",
39
+ "sliding_attention",
40
+ "full_attention",
41
+ "sliding_attention",
42
+ "sliding_attention",
43
+ "sliding_attention",
44
+ "full_attention",
45
+ "sliding_attention",
46
+ "sliding_attention",
47
+ "sliding_attention",
48
+ "full_attention",
49
+ "sliding_attention",
50
+ "sliding_attention",
51
+ "sliding_attention",
52
+ "full_attention",
53
+ "sliding_attention",
54
+ "sliding_attention",
55
+ "sliding_attention",
56
+ "full_attention",
57
+ "full_attention"
58
+ ],
59
+ "sliding_window": 512,
60
+ "max_position_embeddings": 131072,
61
+ "model_type": "llama4_text",
62
+ "moe_layers": [],
63
+ "no_rope_layers": [
64
+ 1,
65
+ 1,
66
+ 1,
67
+ 1,
68
+ 1,
69
+ 1,
70
+ 1,
71
+ 1,
72
+ 1,
73
+ 1,
74
+ 1,
75
+ 1,
76
+ 1,
77
+ 1,
78
+ 1,
79
+ 1,
80
+ 1,
81
+ 1,
82
+ 1,
83
+ 1,
84
+ 1,
85
+ 1,
86
+ 1,
87
+ 1,
88
+ 1,
89
+ 1,
90
+ 1,
91
+ 1,
92
+ 1,
93
+ 1
94
+ ],
95
+ "num_attention_heads": 20,
96
+ "num_experts_per_tok": 0,
97
+ "num_hidden_layers": 30,
98
+ "num_key_value_heads": 4,
99
+ "num_local_experts": 0,
100
+ "output_router_logits": false,
101
+ "rms_norm_eps": 1e-05,
102
+ "rope_scaling": {
103
+ "factor": 16.0,
104
+ "high_freq_factor": 1.0,
105
+ "low_freq_factor": 1.0,
106
+ "original_max_position_embeddings": 8192,
107
+ "rope_type": "llama3"
108
+ },
109
+ "rope_theta": 500000.0,
110
+ "router_aux_loss_coef": 0.001,
111
+ "router_jitter_noise": 0.0,
112
+ "tie_word_embeddings": true,
113
+ "torch_dtype": "bfloat16",
114
+ "transformers_version": "4.53.0.dev0",
115
+ "use_cache": true,
116
+ "use_qk_norm": false,
117
+ "vocab_size": 202048
118
+ }
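The 30-entry `layer_types` list above follows a fixed pattern. A standalone check (not library code): every 4th layer uses full attention, the rest use the 512-token sliding window, and the final layer is always full attention:

```python
num_hidden_layers = 30

# Pattern: index divisible by 4 -> full attention; the trailing entry keeps
# the last layer on full attention regardless of where the cycle lands.
layer_types = [
    "sliding_attention" if i % 4 else "full_attention"
    for i in range(num_hidden_layers - 1)
] + ["full_attention"]

assert len(layer_types) == num_hidden_layers
assert layer_types[-2:] == ["full_attention", "full_attention"]
assert layer_types.count("full_attention") == 9  # layers 0, 4, ..., 28 plus the final one
```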
instruct/generation_config.json ADDED
@@ -0,0 +1,14 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 200000,
+  "eos_token_id": [
+    200001,
+    200007,
+    200008
+  ],
+  "pad_token_id": 200018,
+  "temperature": 0.6,
+  "top_p": 0.9,
+  "do_sample": true,
+  "transformers_version": "4.55.0"
+}
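The decoding defaults above (`temperature=0.6`, `top_p=0.9`, `do_sample=true`) mean temperature-scaled nucleus sampling. A plain-Python sketch of that procedure, for intuition only — the real sampling happens inside `transformers`' `generate()`:

```python
import math
import random

def sample_top_p(logits, temperature=0.6, top_p=0.9, rng=None):
    """Temperature + nucleus (top-p) sampling over a list of raw logits."""
    rng = rng or random.Random(0)  # fixed seed so the sketch is deterministic
    # 1. Temperature: divide logits before the softmax.
    scaled = [l / temperature for l in logits]
    m = max(scaled)
    exps = [math.exp(s - m) for s in scaled]  # stable softmax
    total = sum(exps)
    probs = sorted(
        ((i, e / total) for i, e in enumerate(exps)), key=lambda x: -x[1]
    )
    # 2. Nucleus: keep the smallest prefix whose cumulative mass reaches top_p.
    kept, cum = [], 0.0
    for i, p in probs:
        kept.append((i, p))
        cum += p
        if cum >= top_p:
            break
    # 3. Renormalize over the kept tokens and draw one.
    z = sum(p for _, p in kept)
    r = rng.random() * z
    for i, p in kept:
        r -= p
        if r <= 0:
            return i
    return kept[-1][0]
```

With a strongly peaked distribution such as `[10.0, 1.0, 0.0, -5.0]`, the nucleus collapses to the single dominant token, so `sample_top_p` returns index 0 regardless of the random draw.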
instruct/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4ad9e2b937e2daf12859a6367d081c57e5be7dc4cc4b76a3dc44fa4a5ad21a4
+size 2168938424
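What lands in git here is only a Git LFS pointer file; the ~2.17 GB of bf16 weights live in LFS storage. A small sketch parsing that pointer format (each line is `key value`):

```python
def parse_lfs_pointer(text):
    """Parse a Git LFS pointer file into a dict of its key/value lines."""
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

ptr = parse_lfs_pointer(
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:b4ad9e2b937e2daf12859a6367d081c57e5be7dc4cc4b76a3dc44fa4a5ad21a4\n"
    "size 2168938424\n"
)
assert ptr["oid"].startswith("sha256:")
assert int(ptr["size"]) == 2168938424  # bytes, ~2.17 GB
```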
instruct/special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
+{
+  "bos_token": "<|begin_of_text|>",
+  "eos_token": "<|eot|>",
+  "pad_token": "<|finetune_right_pad|>"
+}
instruct/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:172c9eb4beafc72601690da3ccfcede5c2e6806a8d5ec1fca33e22acea8023a4
+size 27948578
instruct/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d0bdbaf59b0762c8c807617e2d8ea51420eb1b1de266df2495be755c8e0ed6ed
+size 3622230
instruct/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_mobilellm_p1.py ADDED
@@ -0,0 +1,447 @@
+import math
+from dataclasses import dataclass
+from typing import Callable, Optional, Union
+import copy
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.generation import GenerationMixin
+from transformers.integrations import use_kernel_forward_from_hub
+from transformers.masking_utils import (
+    create_causal_mask,
+    create_sliding_window_causal_mask,
+)
+from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
+from transformers.modeling_layers import GradientCheckpointingLayer
+from transformers.modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+    ModelOutput,
+)
+from transformers.modeling_rope_utils import dynamic_rope_update, ROPE_INIT_FUNCTIONS
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+
+from transformers.models.llama4.configuration_llama4 import (
+    Llama4Config,
+    Llama4TextConfig,
+    Llama4VisionConfig,
+)
+from transformers.models.llama4.modeling_llama4 import (
+    apply_rotary_emb,
+    eager_attention_forward,
+    Llama4PreTrainedModel,
+    Llama4TextDecoderLayer,
+    Llama4TextL2Norm,
+    Llama4TextMLP,
+    Llama4TextMoe,
+    Llama4TextRMSNorm,
+    Llama4TextRotaryEmbedding,
+    Llama4TextAttention,
+    Llama4ForCausalLM,
+)
+from transformers.processing_utils import Unpack
+from transformers.utils import (
+    auto_docstring,
+    can_return_tuple,
+    logging,
+    TransformersKwargs,
+)
+from transformers.utils.deprecation import deprecate_kwarg
+from transformers.utils.generic import check_model_inputs
+
+from .configuration_mobilellm_p1 import MobileLLMP1TextConfig
+
+logger = logging.get_logger(__name__)
+
+class MobileLLMP1TextAttention(nn.Module):
+    """Multi-headed attention from the 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: MobileLLMP1TextConfig, layer_idx):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.is_sliding = config.layer_types[layer_idx] == "sliding_attention"
+        self.head_dim = getattr(
+            config, "head_dim", config.hidden_size // config.num_attention_heads
+        )
+        self.num_attention_heads = config.num_attention_heads
+        self.num_key_value_groups = (
+            config.num_attention_heads // config.num_key_value_heads
+        )
+        self.num_key_value_heads = config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attn_scale = config.attn_scale
+        self.floor_scale = config.floor_scale
+        self.attn_temperature_tuning = config.attn_temperature_tuning
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+        self.use_rope = config.no_rope_layers[layer_idx]
+        self.sliding_window = config.sliding_window if self.is_sliding else None
+        self.q_proj = nn.Linear(
+            config.hidden_size,
+            config.num_attention_heads * self.head_dim,
+            bias=config.attention_bias,
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size,
+            config.num_key_value_heads * self.head_dim,
+            bias=config.attention_bias,
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size,
+            config.num_key_value_heads * self.head_dim,
+            bias=config.attention_bias,
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim,
+            config.hidden_size,
+            bias=config.attention_bias,
+        )
+        if self.config.use_qk_norm and self.use_rope:
+            self.qk_norm = Llama4TextL2Norm(config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape)
+        key_states = self.k_proj(hidden_states).view(*input_shape, -1, self.head_dim)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        if self.use_rope:  # the 16E model skips RoPE for long context on certain layers
+            query_states, key_states = apply_rotary_emb(
+                query_states, key_states, position_embeddings.to(query_states.device)
+            )
+
+        if hasattr(self, "qk_norm"):  # the 128E model does not use qk_norm
+            query_states = self.qk_norm(query_states)
+            key_states = self.qk_norm(key_states)
+
+        # Apply temperature tuning (https://huggingface.co/papers/2501.19399) to NoPE layers
+        if self.attn_temperature_tuning and not self.use_rope:
+            attn_scales = (
+                torch.log(
+                    torch.floor((cache_position.float() + 1.0) / self.floor_scale) + 1.0
+                )
+                * self.attn_scale
+                + 1.0
+            )
+            attn_scales = attn_scales.view((1, input_shape[-1], 1, 1)).expand(
+                (*input_shape, 1, 1)
+            )  # batch size > 1
+            query_states = (query_states * attn_scales).to(query_states.dtype)
+
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"cache_position": cache_position}
+            key_states, value_states = past_key_values.update(
+                key_states, value_states, self.layer_idx, cache_kwargs
+            )
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[
+                self.config._attn_implementation
+            ]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            sliding_window=self.sliding_window,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class MobileLLMP1TextDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config, layer_idx):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.layer_idx = layer_idx
+        self.attention_type = config.layer_types[layer_idx]
+        self.self_attn = MobileLLMP1TextAttention(config, layer_idx)
+        self.is_moe_layer = layer_idx in config.moe_layers
+        if self.is_moe_layer:  # the 128E model interleaves dense / sparse
+            self.feed_forward = Llama4TextMoe(config)
+        else:
+            self.feed_forward = Llama4TextMLP(
+                config, intermediate_size=config.intermediate_size_mlp
+            )
+
+        self.input_layernorm = Llama4TextRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_attention_layernorm = Llama4TextRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[
+        torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]
+    ]:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        attention_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            position_embeddings=position_embeddings,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = residual + attention_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.feed_forward(hidden_states)
+        if self.is_moe_layer:
+            hidden_states, _ = hidden_states
+        hidden_states = residual + hidden_states.view(residual.shape)
+        return hidden_states
+
+
+class MobileLLMP1TextModel(Llama4PreTrainedModel):
+    _no_split_modules = ["MobileLLMP1TextDecoderLayer"]
+    base_model_prefix = "model"
+    config: MobileLLMP1TextConfig
+    _can_record_outputs = {
+        "attentions": MobileLLMP1TextAttention,
+        "hidden_states": MobileLLMP1TextDecoderLayer,
+        "router_logits": Llama4TextMoe,
+    }
+
+    def __init__(self, config: MobileLLMP1TextConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(
+            config.vocab_size, config.hidden_size, self.padding_idx
+        )
+        self.layers = nn.ModuleList(
+            [
+                MobileLLMP1TextDecoderLayer(config, layer_idx)
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = Llama4TextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = Llama4TextRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, BaseModelOutputWithPast]:
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You must specify exactly one of input_ids or inputs_embeds"
+            )
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(
+                input_ids.to(self.embed_tokens.weight.device)
+            )
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:
+            past_seen_tokens = (
+                past_key_values.get_seq_length() if past_key_values is not None else 0
+            )
+            cache_position = torch.arange(
+                past_seen_tokens,
+                past_seen_tokens + inputs_embeds.shape[1],
+                device=inputs_embeds.device,
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        # It may already have been prepared by e.g. `generate`
+        if not isinstance(causal_mask_mapping := attention_mask, dict):
+            # Prepare mask arguments
+            mask_kwargs = {
+                "config": self.config,
+                "input_embeds": inputs_embeds,
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+                "past_key_values": past_key_values,
+                "position_ids": position_ids,
+            }
+            sliding_mask_kwargs = mask_kwargs.copy()
+            del sliding_mask_kwargs["position_ids"]
+
+            # Create the masks
+            causal_mask_mapping = {
+                "full_attention": create_causal_mask(**mask_kwargs),
+                "sliding_attention": create_sliding_window_causal_mask(
+                    **sliding_mask_kwargs
+                ),
+            }
+
+        hidden_states = inputs_embeds
+
+        # Create position embeddings to be shared across the decoder layers
+        freq_cis = self.rotary_emb(hidden_states, position_ids)
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=freq_cis,
+                **kwargs,
+            )
+        hidden_states = self.norm(hidden_states)
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,
+        )
+
+
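Each decoder layer receives one of the two masks built above, selected by its `attention_type`. A plain-Python sketch of the visibility rule those masks encode (not `transformers`' `masking_utils`, which also handles padding and caches): a query at position `q` may attend to key position `k` if `k` is causal and, on `sliding_attention` layers, within the last `window` positions (`sliding_window=512` in the config):

```python
def allowed(q, k, window=None):
    """True if key position k is visible to query position q."""
    if k > q:
        return False  # causal: no attending to future positions
    if window is not None and q - k >= window:
        return False  # sliding: only the most recent `window` keys
    return True

# full_attention layer: everything causal is visible
assert allowed(700, 0) and not allowed(0, 1)
# sliding_attention layer with window 512: key 0 drops out of range at q=512
assert allowed(511, 0, window=512)
assert not allowed(512, 0, window=512)
```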
+class MobileLLMP1ForCausalLM(Llama4PreTrainedModel, GenerationMixin):
+    _no_split_modules = ["MobileLLMP1TextDecoderLayer"]
+    base_model_prefix = "language_model"
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    config: MobileLLMP1TextConfig
+
+    def __init__(self, config: MobileLLMP1TextConfig):
+        super().__init__(config)
+        self.model = MobileLLMP1TextModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, CausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, Llama4ForCausalLM
+
+        >>> model = Llama4ForCausalLM.from_pretrained("meta-llama4/Llama4-2-7b-hf")
+        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama4/Llama4-2-7b-hf")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = (
+            slice(-logits_to_keep, None)
+            if isinstance(logits_to_keep, int)
+            else logits_to_keep
+        )
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(
+                logits=logits,
+                labels=labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = [
+    "MobileLLMP1ForCausalLM",
+    "MobileLLMP1TextModel",
+    "MobileLLMP1TextDecoderLayer",
+    "MobileLLMP1TextAttention",
+]
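The `logits_to_keep` slicing in `forward` has one subtlety worth noting. A standalone sketch of just that expression: an int `n` becomes `slice(-n, None)`, i.e. "last `n` positions only", and the edge case is `n == 0`, since `slice(-0, None) == slice(0, None)` keeps every position:

```python
def logit_slice(logits_to_keep):
    """Mirror of the slice_indices expression in MobileLLMP1ForCausalLM.forward."""
    return (
        slice(-logits_to_keep, None)
        if isinstance(logits_to_keep, int)
        else logits_to_keep
    )

hidden = ["h0", "h1", "h2", "h3"]          # stand-in for per-position hidden states
assert hidden[logit_slice(2)] == ["h2", "h3"]  # decoding: keep only the last steps
assert hidden[logit_slice(0)] == hidden        # 0 (the default) keeps everything
```

Keeping only the final positions avoids projecting every hidden state through the 202k-row `lm_head` during generation, where only the last token's logits are needed.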