diff --git "a/data/prs.json" "b/data/prs.json" --- "a/data/prs.json" +++ "b/data/prs.json" @@ -1,27450 +1,26330 @@ [ { - "additions": 1, - "author": "ydshieh", - "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? Minor update", - "changed_files": 1, - "cluster_id": null, - "cluster_ids": [], - "cluster_role": null, - "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/45484", - "created_at": "2026-04-16T20:10:45Z", - "deletions": 1, + "additions": 195, + "author": "sirzechs66", + "author_association": "FIRST_TIME_CONTRIBUTOR", + "body_excerpt": "# What does this PR do? This PR adds full GGUF loading support for GPT\u2011OSS models (20B/120B). It allows Transformers (and consequently vLLM) to directly load GPT\u2011OSS GGUF files without falling back to a wrong architecture. The changes incl\u2026", + "changed_files": 4, + "cluster_id": "cluster-43366-4", + "cluster_ids": [ + "cluster-43366-4" + ], + "cluster_role": "member", + "comments_count": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/45506", + "created_at": "2026-04-18T08:43:19Z", + "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45484/files", - "html_url": "https://github.com/huggingface/transformers/pull/45484", + "files_url": "https://github.com/huggingface/transformers/pull/45506/files", + "html_url": "https://github.com/huggingface/transformers/pull/45506", "labels": [], - "merged": true, - "number": 45484, + "merged": false, + "number": 45506, "review_comments_count": 0, - "state": "closed", - "title": "Minor update", - "updated_at": "2026-04-16T20:27:48Z" + "state": "open", + "title": "Add full GGUF loading support for GPT\u2011OSS (fixes #43366, supersedes #43757) latest", + "updated_at": "2026-04-18T09:01:44Z" }, { - "additions": 36, - "author": "vasqu", - "author_association": "MEMBER", - "body_excerpt": "Fixup some conversion mappings --> everything should live directly under `mapping` except we add onto it - Solar change is the same as qwen2 moe - Cohere moved - Ernie moe similar to minimax with one additional rename", - "changed_files": 1, + "additions": 1245, + "author": "kchpp940", + "author_association": "NONE", + "body_excerpt": "# What does this PR do? everything should live directly under `mapping` except we add onto it - Solar change is the same as qwen2 moe - Cohere moved - Ernie moe similar to minimax with one additional rename", "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 0, - "conversation_url": "https://github.com/huggingface/transformers/pull/45460", - "created_at": "2026-04-15T13:18:36Z", - "deletions": 5, + "comments_count": 4, + "conversation_url": "https://github.com/huggingface/transformers/pull/45483", + "created_at": "2026-04-16T15:59:26Z", + "deletions": 64, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45460/files", - "html_url": "https://github.com/huggingface/transformers/pull/45460", - "labels": [ - "Code agent slop" - ], + "files_url": "https://github.com/huggingface/transformers/pull/45483/files", + "html_url": "https://github.com/huggingface/transformers/pull/45483", + "labels": [], "merged": false, - "number": 45460, - "review_comments_count": 0, - "state": "closed", - "title": "fix(tokenization): re-raise ImportError to allow RuntimeError/OSError fallback (#45459)", - "updated_at": "2026-04-16T10:48:07Z" + "number": 45483, + "review_comments_count": 3, + "state": "open", + "title": "[`Conversion Mapping`] Small fixups", + "updated_at": "2026-04-17T12:09:04Z" }, { - "additions": 2, - "author": "tomaarsen", + "additions": 184, + "author": "zucchini-nlp", "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? Currently, for qwen2_5_omni and qwen3_omni_moe, you can only load the 'Talker' variant, i.e. with the audio output. This is a bit like only being able to load a checkpoint with `AutoModelForCausalLM` while `AutoMode\u2026", - "changed_files": 2, + "body_excerpt": ".", + "changed_files": 3, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 8, - "conversation_url": "https://github.com/huggingface/transformers/pull/45457", - "created_at": "2026-04-15T12:29:47Z", - "deletions": 2, + "comments_count": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/45481", + "created_at": "2026-04-16T15:51:21Z", + "deletions": 185, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45457/files", - "html_url": "https://github.com/huggingface/transformers/pull/45457", + "files_url": "https://github.com/huggingface/transformers/pull/45481/files", + "html_url": "https://github.com/huggingface/transformers/pull/45481", "labels": [], "merged": true, - "number": 45457, - "review_comments_count": 4, + "number": 45481, + "review_comments_count": 1, "state": "closed", - "title": "Allow loading Qwen Thinker 'base' models without generative head", - "updated_at": "2026-04-16T12:24:48Z" + "title": "Add check-auto in repo-consistency and fix sorting", + "updated_at": "2026-04-17T11:18:52Z" }, { - "additions": 2, - "author": "tarekziade", + "additions": 27, + "author": "SunMarc", "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? we're extending `ty` to more modules and we need stubs from more libs like openai.", + "body_excerpt": "# What does this PR do? This PR fixes quantization tests. A few things were deprecated when compressed-tensors had their latest release, so i'm updating the tests. For fouroversix, it's just that the model was a bit too big for the CI", "changed_files": 2, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/45456", - "created_at": "2026-04-15T12:10:18Z", - "deletions": 2, + "comments_count": 5, + "conversation_url": "https://github.com/huggingface/transformers/pull/45480", + "created_at": "2026-04-16T15:23:30Z", + "deletions": 86, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45456/files", - "html_url": "https://github.com/huggingface/transformers/pull/45456", + "files_url": "https://github.com/huggingface/transformers/pull/45480/files", + "html_url": "https://github.com/huggingface/transformers/pull/45480", "labels": [], - "merged": true, - "number": 45456, + "merged": false, + "number": 45480, "review_comments_count": 0, - "state": "closed", - "title": "refactor(qa): extend extras so ty can run on server modules", - "updated_at": "2026-04-15T16:08:23Z" + "state": "open", + "title": "Update quants tests ", + "updated_at": "2026-04-17T13:46:50Z" }, { - "additions": 3, - "author": "tomaarsen", + "additions": 330, + "author": "zucchini-nlp", "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? There's 2 changes, one is a definite fix and one is a preference. Some background: there are a lot of models that have finetuned `qwen2_5_omni`, e.g. https://huggingface.co/LCO-Embedding/LCO-Embedding-Omni-3B, and i\u2026", - "changed_files": 2, - "cluster_id": null, - "cluster_ids": [], - "cluster_role": null, - "comments_count": 3, - "conversation_url": "https://github.com/huggingface/transformers/pull/45455", - "created_at": "2026-04-15T11:38:05Z", - "deletions": 25, - "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45455/files", - "html_url": "https://github.com/huggingface/transformers/pull/45455", - "labels": [], - "merged": true, - "number": 45455, - "review_comments_count": 5, - "state": "closed", - "title": "[`fix`] Make Qwen2_5OmniProcessor warning a lot less noisy via warning_once", - "updated_at": "2026-04-16T12:10:43Z" - }, - { - "additions": 81, - "author": "zucchini-nlp", - "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? Fixes https://github.com/huggingface/transformers/issues/45200 As per title, this error was actually needed only in PG. Other models don't have such prefix/suffix separation when training", - "changed_files": 7, + "body_excerpt": "# What does this PR do? As per title, I think this pattern is used quite often and deserves to be a public mask-fn. Used currently in gemma/paligemma family, GIT, PI0 and will be used in two upcoming models (deepseekOcr and Molmo2) This PR\u2026", + "changed_files": 9, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 13, - "conversation_url": "https://github.com/huggingface/transformers/pull/45454", - "created_at": "2026-04-15T11:11:34Z", - "deletions": 104, + "comments_count": 6, + "conversation_url": "https://github.com/huggingface/transformers/pull/45477", + "created_at": "2026-04-16T14:12:03Z", + "deletions": 595, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45454/files", - "html_url": "https://github.com/huggingface/transformers/pull/45454", + "files_url": "https://github.com/huggingface/transformers/pull/45477/files", + "html_url": "https://github.com/huggingface/transformers/pull/45477", "labels": [], "merged": false, - "number": 45454, + "number": 45477, "review_comments_count": 0, "state": "open", - "title": "Gemma4 training with text-only samples", - "updated_at": "2026-04-16T04:05:06Z" + "title": "Blockwise mask fn as opt arg in all masking functions", + "updated_at": "2026-04-17T14:33:10Z" }, { - "additions": 917, - "author": "ArthurZucker", + "additions": 14, + "author": "ydshieh", "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? Ai init", - "changed_files": 5, + "body_excerpt": "# What does this PR do? Call CI workflow", + "changed_files": 4, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/45453", - "created_at": "2026-04-15T10:22:17Z", - "deletions": 67, - "draft": true, - "files_url": "https://github.com/huggingface/transformers/pull/45453/files", - "html_url": "https://github.com/huggingface/transformers/pull/45453", + "comments_count": 3, + "conversation_url": "https://github.com/huggingface/transformers/pull/45476", + "created_at": "2026-04-16T13:49:50Z", + "deletions": 0, + "draft": false, + "files_url": "https://github.com/huggingface/transformers/pull/45476/files", + "html_url": "https://github.com/huggingface/transformers/pull/45476", "labels": [], "merged": false, - "number": 45453, + "number": 45476, "review_comments_count": 0, "state": "open", - "title": "Draft commit", - "updated_at": "2026-04-16T20:19:24Z" + "title": "[Don't merge] Call CI workflow", + "updated_at": "2026-04-16T21:04:16Z" }, { - "additions": 3058, - "author": "DavidSolanas", - "author_association": "FIRST_TIME_CONTRIBUTOR", - "body_excerpt": "Fixes #45306 ## What and why All `models/X/__init__.py` files used `from .module import *` inside the `TYPE_CHECKING` block. This makes it impossible for static analysis tools (pyright, mypy, IDEs) to know which symbols are actually export\u2026", - "changed_files": 446, + "additions": 54, + "author": "tarekziade", + "author_association": "MEMBER", + "body_excerpt": "# What does this PR do? Split out the `mlinter` tool see https://github.com/huggingface/transformers-mlinter We want to be able to: - use it from other CI projects - remove the ability to alter the linter from Transformers PRs This change\u2026", + "changed_files": 29, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/45452", - "created_at": "2026-04-15T08:44:00Z", - "deletions": 1305, + "conversation_url": "https://github.com/huggingface/transformers/pull/45475", + "created_at": "2026-04-16T12:01:49Z", + "deletions": 3405, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45452/files", - "html_url": "https://github.com/huggingface/transformers/pull/45452", + "files_url": "https://github.com/huggingface/transformers/pull/45475/files", + "html_url": "https://github.com/huggingface/transformers/pull/45475", "labels": [], "merged": false, - "number": 45452, + "number": 45475, "review_comments_count": 0, "state": "open", - "title": "refactor: replace wildcard imports with explicit imports in model __init__.py files", - "updated_at": "2026-04-15T12:15:24Z" + "title": "chore(qa): split out mlinter", + "updated_at": "2026-04-17T14:52:39Z" }, { - "additions": 20, - "author": "MukundaKatta", - "author_association": "NONE", - "body_excerpt": "Docstring/comment-only typo fixes across Qwen3-VL, Qwen3.5, GLM4V, GLM4.6V, GLM-OCR and their MoE variants. `seperate` -> `separate`. No behavior changes. I deliberately left `image_seperate.weight` in `convert_mm_grounding_dino_to_hf.py`\u2026", - "changed_files": 10, + "additions": 2, + "author": "rtrompier", + "author_association": "MEMBER", + "body_excerpt": "Bump the pinned doc-builder SHA so that main documentation builds also sync to the HF bucket (dual-write).", + "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/45451", - "created_at": "2026-04-15T08:33:29Z", - "deletions": 20, - "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45451/files", - "html_url": "https://github.com/huggingface/transformers/pull/45451", + "conversation_url": "https://github.com/huggingface/transformers/pull/45474", + "created_at": "2026-04-16T11:59:17Z", + "deletions": 2, + "draft": true, + "files_url": "https://github.com/huggingface/transformers/pull/45474/files", + "html_url": "https://github.com/huggingface/transformers/pull/45474", "labels": [], "merged": false, - "number": 45451, + "number": 45474, "review_comments_count": 0, "state": "closed", - "title": "Fix 'seperate' typo in qwen3/glm video-model docstrings", - "updated_at": "2026-04-15T10:57:26Z" + "title": "chore: bump doc-builder SHA for main doc build workflow", + "updated_at": "2026-04-17T09:17:57Z" }, { - "additions": 1, - "author": "rtrompier", + "additions": 201, + "author": "AmineDiro", "author_association": "MEMBER", - "body_excerpt": "Switch the PR doc upload flow from the legacy dataset push to the new HF bucket.", - "changed_files": 1, + "body_excerpt": "While benchmarking Qwen3-30B-A3B SFT training with Expert Parallelism (EP) using TRL, I found three bugs that combine to produce silently wrong results or NaN loss. Every existing test uses `tp_plan=\"auto\"` which bypasses `RouterParallel`\u2026", + "changed_files": 4, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/45450", - "created_at": "2026-04-15T08:26:45Z", - "deletions": 1, + "conversation_url": "https://github.com/huggingface/transformers/pull/45473", + "created_at": "2026-04-16T10:59:14Z", + "deletions": 41, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45450/files", - "html_url": "https://github.com/huggingface/transformers/pull/45450", + "files_url": "https://github.com/huggingface/transformers/pull/45473/files", + "html_url": "https://github.com/huggingface/transformers/pull/45473", "labels": [], "merged": false, - "number": 45450, + "number": 45473, "review_comments_count": 0, "state": "open", - "title": "chore: bump doc-builder SHA for PR upload workflow", - "updated_at": "2026-04-15T10:13:31Z" + "title": "Fix EP: RouterParallel shape, tp_plan property, grouped_mm sentinels", + "updated_at": "2026-04-16T13:56:10Z" }, { - "additions": 1, - "author": "hmellor", - "author_association": "MEMBER", - "body_excerpt": "This model also has the wrong tokenizer class in its config", + "additions": 2, + "author": "kevinmalana", + "author_association": "NONE", + "body_excerpt": "## What does this PR do? Fixes a crash in `get_device_properties()` in `testing_utils.py` when CUDA is installed on the system but no GPU device is present (e.g., a CPU-only cloud studio with CUDA libraries installed). The function called\u2026", "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/45449", - "created_at": "2026-04-15T08:26:05Z", + "comments_count": 1, + "conversation_url": "https://github.com/huggingface/transformers/pull/45472", + "created_at": "2026-04-16T10:03:07Z", "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45449/files", - "html_url": "https://github.com/huggingface/transformers/pull/45449", - "labels": [], - "merged": true, - "number": 45449, + "files_url": "https://github.com/huggingface/transformers/pull/45472/files", + "html_url": "https://github.com/huggingface/transformers/pull/45472", + "labels": [ + "Code agent slop" + ], + "merged": false, + "number": 45472, "review_comments_count": 0, "state": "closed", - "title": "Add `step3_vl` to `MODELS_WITH_INCORRECT_HUB_TOKENIZER_CLASS`", - "updated_at": "2026-04-15T09:28:46Z" + "title": "fix(testing_utils): guard get_device_capability with torch.cuda.is_available()", + "updated_at": "2026-04-16T10:57:34Z" }, { - "additions": 438, - "author": "Cyrilvallez", - "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? As per the title. ## The issue The problem is that transforms that want to remove a full part of a model name (such as a prefix, e.g. the `model.` start) are non bijective in general, i.e. we completely lose the inf\u2026", - "changed_files": 4, + "additions": 3413, + "author": "nuxlear", + "author_association": "CONTRIBUTOR", + "body_excerpt": "# What does this PR do? Add EXAONE 4.5 architecture for the [EXAONE 4.5 model](https://huggingface.co/collections/LGAI-EXAONE/exaone-45) released by LG AI Research. This PR adds the modeling code for EXAONE 4.5, which uses the same LLM arc\u2026", + "changed_files": 16, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/45448", - "created_at": "2026-04-15T08:06:53Z", - "deletions": 69, + "conversation_url": "https://github.com/huggingface/transformers/pull/45471", + "created_at": "2026-04-16T08:52:35Z", + "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45448/files", - "html_url": "https://github.com/huggingface/transformers/pull/45448", + "files_url": "https://github.com/huggingface/transformers/pull/45471/files", + "html_url": "https://github.com/huggingface/transformers/pull/45471", "labels": [], "merged": false, - "number": 45448, - "review_comments_count": 24, + "number": 45471, + "review_comments_count": 23, "state": "open", - "title": "[loading] Clean way to add/remove full parts in checkpoint names", - "updated_at": "2026-04-16T19:57:25Z" + "title": "Add EXAONE 4.5 implementations", + "updated_at": "2026-04-17T15:23:49Z" }, { - "additions": 1, - "author": "ZSLsherly", - "author_association": "FIRST_TIMER", - "body_excerpt": "This commit corrects the PyTorch version check for importing `AuxRequest` from `torch.nn.attention.flex_attention`(line51). The `AuxRequest` class was actually introduced in PyTorch 2.9.1, not 2.9.0. The current code attempts to import it\u2026", + "additions": 7, + "author": "kaixuanliu", + "author_association": "CONTRIBUTOR", + "body_excerpt": "@ydshieh pls help review, thx!", "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 8, - "conversation_url": "https://github.com/huggingface/transformers/pull/45445", - "created_at": "2026-04-15T03:09:38Z", - "deletions": 1, + "comments_count": 1, + "conversation_url": "https://github.com/huggingface/transformers/pull/45470", + "created_at": "2026-04-16T07:44:20Z", + "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45445/files", - "html_url": "https://github.com/huggingface/transformers/pull/45445", + "files_url": "https://github.com/huggingface/transformers/pull/45470/files", + "html_url": "https://github.com/huggingface/transformers/pull/45470", "labels": [], "merged": false, - "number": 45445, - "review_comments_count": 1, - "state": "closed", - "title": "Update Torch version check for flex attention", - "updated_at": "2026-04-15T11:40:34Z" + "number": 45470, + "review_comments_count": 0, + "state": "open", + "title": "skip test_flash_attn_2_can_dispatch_composite_models tests for", + "updated_at": "2026-04-16T07:45:33Z" }, { - "additions": 50, - "author": "tomaarsen", - "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? Resolves https://github.com/huggingface/sentence-transformers/issues/3724 ## Code Agent Policy - [x] I confirm that this is not a pure code agent PR. ## Before submitting - [ ] This PR fixes a typo or improves the d\u2026", + "additions": 12, + "author": "Spectual", + "author_association": "FIRST_TIME_CONTRIBUTOR", + "body_excerpt": "# What does this PR do? `PixioPatchEmbeddings.forward` already accepted an `interpolate_pos_encoding` flag (inherited from `ViTPatchEmbeddings`) to skip image-size validation and allow variable-resolution inputs. However, neither `PixioEmb\u2026", "changed_files": 2, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 6, - "conversation_url": "https://github.com/huggingface/transformers/pull/45444", - "created_at": "2026-04-14T19:28:34Z", - "deletions": 20, + "comments_count": 1, + "conversation_url": "https://github.com/huggingface/transformers/pull/45469", + "created_at": "2026-04-16T06:54:56Z", + "deletions": 10, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45444/files", - "html_url": "https://github.com/huggingface/transformers/pull/45444", + "files_url": "https://github.com/huggingface/transformers/pull/45469/files", + "html_url": "https://github.com/huggingface/transformers/pull/45469", "labels": [], - "merged": true, - "number": 45444, - "review_comments_count": 4, - "state": "closed", - "title": "[`fix`] Always early return for non-Mistral models in _patch_mistral_regex", - "updated_at": "2026-04-16T12:19:46Z" + "merged": false, + "number": 45469, + "review_comments_count": 0, + "state": "open", + "title": "Fix: propagate interpolate_pos_encoding through Pixio model hierarchy", + "updated_at": "2026-04-16T06:56:04Z" }, { - "additions": 38, - "author": "qgallouedec", - "author_association": "MEMBER", - "body_excerpt": "When `transformers serve` is launched with a positional model argument, the server silently overwrites the `\"model\"` field in every incoming request with the pinned model id. This is surprising: a client that asks for model B receives a re\u2026", - "changed_files": 2, + "additions": 17, + "author": "Jah-yee", + "author_association": "NONE", + "body_excerpt": "Good day, ## Problem On Apple Silicon (MPS backend), `torch.nn.functional.scaled_dot_product_attention` produces incorrect output when the value tensor's head dimension differs from the query tensor's head dimension. This affects DeepSeek\u2026", + "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/45443", - "created_at": "2026-04-14T19:14:10Z", - "deletions": 0, + "conversation_url": "https://github.com/huggingface/transformers/pull/45467", + "created_at": "2026-04-16T06:44:51Z", + "deletions": 1, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45443/files", - "html_url": "https://github.com/huggingface/transformers/pull/45443", - "labels": [], + "files_url": "https://github.com/huggingface/transformers/pull/45467/files", + "html_url": "https://github.com/huggingface/transformers/pull/45467", + "labels": [ + "Code agent slop" + ], "merged": false, - "number": 45443, + "number": 45467, "review_comments_count": 0, - "state": "open", - "title": "Raise 400 on model mismatch when `transformers serve` is pinned", - "updated_at": "2026-04-15T11:42:44Z" + "state": "closed", + "title": "Fix MPS SDPA output shape when value head dim differs from query head dim", + "updated_at": "2026-04-16T10:53:42Z" }, { - "additions": 4, - "author": "paulinebm", - "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? Fixes # (issue) ## Code Agent Policy The Transformers repo is currently being overwhelmed by a large number of PRs and issue comments written by c\u2026", - "changed_files": 8, + "additions": 625, + "author": "mohamad-tohidi", + "author_association": "NONE", + "body_excerpt": "# What does this PR do? i added an example for hierarchical text classification ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [x] Did you read the [contributor\u2026", + "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 3, - "conversation_url": "https://github.com/huggingface/transformers/pull/45350", - "created_at": "2026-04-09T17:46:37Z", + "comments_count": 1, + "conversation_url": "https://github.com/huggingface/transformers/pull/45374", + "created_at": "2026-04-11T10:00:07Z", "deletions": 0, - "draft": true, - "files_url": "https://github.com/huggingface/transformers/pull/45350/files", - "html_url": "https://github.com/huggingface/transformers/pull/45350", + "draft": false, + "files_url": "https://github.com/huggingface/transformers/pull/45374/files", + "html_url": "https://github.com/huggingface/transformers/pull/45374", "labels": [], "merged": false, - "number": 45350, + "number": 45374, "review_comments_count": 0, - "state": "open", - "title": "WIP: Add support for Granite4VisionForConditionalGeneration", - "updated_at": "2026-04-10T12:34:50Z" + "state": "closed", + "title": "Adding hierarchical classification example", + "updated_at": "2026-04-13T13:05:08Z" }, { - "additions": 90, - "author": "florian6973", - "author_association": "CONTRIBUTOR", - "body_excerpt": "# What does this PR do? Fixes #45305 Add a regression test in `TrainerGradientAccumulationTest` to avoid passing the GAS value to Accelerate by mistake Description: I force the value of the `num_steps` parameter to be 1, and the regression\u2026", - "changed_files": 2, + "additions": 3, + "author": "HelloAnner", + "author_association": "NONE", + "body_excerpt": "Fixes #45341 When CUDA is installed but no GPU is available, `get_device_properties()` calls `torch.cuda.get_device_capability()` which fails because there is no CUDA device. The fix moves `import torch` to the top of the function and adds\u2026", + "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/45349", - "created_at": "2026-04-09T17:24:39Z", - "deletions": 2, + "comments_count": 1, + "conversation_url": "https://github.com/huggingface/transformers/pull/45371", + "created_at": "2026-04-11T06:22:52Z", + "deletions": 3, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45349/files", - "html_url": "https://github.com/huggingface/transformers/pull/45349", - "labels": [ - "for patch" - ], - "merged": true, - "number": 45349, - "review_comments_count": 6, + "files_url": "https://github.com/huggingface/transformers/pull/45371/files", + "html_url": "https://github.com/huggingface/transformers/pull/45371", + "labels": [], + "merged": false, + "number": 45371, + "review_comments_count": 0, "state": "closed", - "title": "Fix #45305 + add regression test GAS", - "updated_at": "2026-04-13T14:41:43Z" + "title": "fix: check CUDA availability before calling get_device_capability", + "updated_at": "2026-04-13T06:40:16Z" }, { - "additions": 50, - "author": "qgallouedec", - "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? Fixes #45290 ## Code Agent Policy The Transformers repo is currently being overwhelmed by a large number of PRs and issue comments written by code agents. We are currently bottlenecked by our ability to review and r\u2026", - "changed_files": 5, + "additions": 12, + "author": "RudrenduPaul", + "author_association": "CONTRIBUTOR", + "body_excerpt": "# What does this PR do? Fixes five documentation errors in the `Gemma3nTextConfig` docstring in `modular_gemma3n.py` (and the generated `configuration_gemma3n.py`): 1. **Typo**: `\"emebeddings\"` \u2192 `\"embeddings\"` in `hidden_size_per_layer_in\u2026", + "changed_files": 2, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/45348", - "created_at": "2026-04-09T15:59:07Z", - "deletions": 19, + "conversation_url": "https://github.com/huggingface/transformers/pull/45370", + "created_at": "2026-04-11T06:15:05Z", + "deletions": 12, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45348/files", - "html_url": "https://github.com/huggingface/transformers/pull/45348", + "files_url": "https://github.com/huggingface/transformers/pull/45370/files", + "html_url": "https://github.com/huggingface/transformers/pull/45370", "labels": [], "merged": true, - "number": 45348, - "review_comments_count": 7, + "number": 45370, + "review_comments_count": 0, "state": "closed", - "title": "Fix `apply_chat_template` crash on `tool_call` messages without content", - "updated_at": "2026-04-13T19:44:38Z" + "title": "docs: fix 5 docstring errors in Gemma3nTextConfig (typos, grammar, formatting)", + "updated_at": "2026-04-13T13:14:39Z" }, { - "additions": 35, - "author": "Cyrilvallez", - "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? As per the title. `accelerate` destroys the dict otherwise, if it's not BOTH passed as kwarg AND part of `_skip_keys_device_placement`.......... `per_layer_input` needs to stay as a positional arg, for gradient chec\u2026", - "changed_files": 3, + "additions": 10, + "author": "sharziki", + "author_association": "CONTRIBUTOR", + "body_excerpt": "## Summary Fixes #45245 \u2014 `torch.multinomial` crashes with `RuntimeError: number of categories cannot exceed 2^24` when `num_beams * vocab_size > 16,777,216` during beam search with `do_sample=True`. **Root cause:** In `_get_top_k_continua\u2026", + "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/45347", - "created_at": "2026-04-09T15:31:34Z", - "deletions": 6, + "conversation_url": "https://github.com/huggingface/transformers/pull/45369", + "created_at": "2026-04-11T02:42:07Z", + "deletions": 3, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45347/files", - "html_url": "https://github.com/huggingface/transformers/pull/45347", + "files_url": "https://github.com/huggingface/transformers/pull/45369/files", + "html_url": "https://github.com/huggingface/transformers/pull/45369", "labels": [], - "merged": true, - "number": 45347, + "merged": false, + "number": 45369, "review_comments_count": 0, "state": "closed", - "title": "[gemma4] Fix device map auto", - "updated_at": "2026-04-09T15:45:15Z" + "title": "fix(generation): handle CUDA multinomial limit in beam search sampling", + "updated_at": "2026-04-13T12:49:42Z" }, { - "additions": 16, - "author": "ionut-anghelina", - "author_association": "FIRST_TIME_CONTRIBUTOR", - "body_excerpt": null, - "changed_files": 2, + "additions": 6, + "author": "sharziki", + "author_association": "CONTRIBUTOR", + "body_excerpt": "## Summary Fixes #45362 \u2014 `transformers chat` crashes with `AttributeError: 'Qwen3VLProcessor' object has no attribute '_tokenizer'` when streaming responses from Qwen models. **Root cause:** `GenerateManager.generate_streaming()` and `CBG\u2026", + "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 5, - "conversation_url": "https://github.com/huggingface/transformers/pull/45346", - "created_at": "2026-04-09T14:48:28Z", - "deletions": 0, + "comments_count": 3, + "conversation_url": "https://github.com/huggingface/transformers/pull/45368", + "created_at": "2026-04-11T02:34:32Z", + "deletions": 2, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45346/files", - "html_url": "https://github.com/huggingface/transformers/pull/45346", + "files_url": "https://github.com/huggingface/transformers/pull/45368/files", + "html_url": "https://github.com/huggingface/transformers/pull/45368", "labels": [], - "merged": false, - "number": 45346, - "review_comments_count": 1, - "state": "open", - "title": "Fix Double Application of Softmax for Router Logits in MoE models", - "updated_at": "2026-04-13T12:40:28Z" + "merged": true, + "number": 45368, + "review_comments_count": 0, + "state": "closed", + "title": "fix(serving): resolve rust tokenizer from ProcessorMixin in streaming generation", + "updated_at": "2026-04-13T15:01:51Z" }, { - "additions": 30, - "author": "ansley", - "author_association": "NONE", - "body_excerpt": "The `transformers` V5 \"rm slow tokenizers\" refactor (\\#40936) aliased `LlamaTokenizerFast` to `LlamaTokenizer`, whose `__init__` unconditionally installs a SentencePiece Metaspace pre-tokenizer. This is correct for classic Llama/Llama-2 mo\u2026", + "additions": 15, + "author": "jackcook", + "author_association": "CONTRIBUTOR", + "body_excerpt": "Update to the latest Four Over Six API by adding options to specify the data type of activations, weights, and gradients individually cc @SunMarc", "changed_files": 2, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 6, - "conversation_url": "https://github.com/huggingface/transformers/pull/45345", - "created_at": "2026-04-09T14:31:40Z", - "deletions": 14, + "comments_count": 0, + "conversation_url": "https://github.com/huggingface/transformers/pull/45367", + "created_at": "2026-04-11T01:47:51Z", + "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45345/files", - "html_url": "https://github.com/huggingface/transformers/pull/45345", + "files_url": "https://github.com/huggingface/transformers/pull/45367/files", + "html_url": "https://github.com/huggingface/transformers/pull/45367", "labels": [], "merged": false, - "number": 45345, + "number": 45367, "review_comments_count": 0, - "state": "closed", - "title": "Fix ByteLevel-BPE tokenizers silently breaking in `LlamaTokenizer`", - "updated_at": "2026-04-10T12:45:24Z" + "state": "open", + "title": "Add dtype config options for Four Over Six", + "updated_at": "2026-04-13T13:29:07Z" }, { - "additions": 6, - "author": "tarekziade", - "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? Simple hook to display test duration. This will append inline duration per test during the run, example: ``` tests/utils/test_configuration_utils.py::ConfigPushToHubTester::test_push_to_hub [gw1] [ 90%] PASSED tests\u2026", - "changed_files": 1, + "additions": 64, + "author": "owwll", + "author_association": "NONE", + "body_excerpt": "This PR addresses two separate issues: 1. **Fixes a bug in `Mistral4` RoPE dimension calculation.** The `Mistral4RotaryEmbedding` was incorrectly using the full `head_dim` to calculate the rotary dimension, instead of respecting the `parti\u2026", + "changed_files": 5, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/45344", - "created_at": "2026-04-09T14:22:46Z", - "deletions": 0, + "comments_count": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/45366", + "created_at": "2026-04-10T21:13:32Z", + "deletions": 11, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45344/files", - "html_url": "https://github.com/huggingface/transformers/pull/45344", + "files_url": "https://github.com/huggingface/transformers/pull/45366/files", + "html_url": "https://github.com/huggingface/transformers/pull/45366", "labels": [], - "merged": true, - "number": 45344, + "merged": false, + "number": 45366, "review_comments_count": 0, "state": "closed", - "title": "refactor: display test duration", - "updated_at": "2026-04-09T15:19:26Z" + "title": "Fix OLMoE routing and Mistral4 RoPE dimensions", + "updated_at": "2026-04-10T21:40:40Z" }, { - "additions": 8, - "author": "Cyrilvallez", + "additions": 46, + "author": "burtenshaw", "author_association": "MEMBER", - "body_excerpt": null, + "body_excerpt": "Applied the overlapping GPT-J refactor from the staged PRs: added `_can_record_outputs`, moved `GPTJModel.forward` to decorator-based output capture, switched wrapper forwards to `@can_return_tuple`, and removed manual hidden-state/attenti\u2026", "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/45342", - "created_at": "2026-04-09T14:13:15Z", - "deletions": 0, + "conversation_url": "https://github.com/huggingface/transformers/pull/45365", + "created_at": "2026-04-10T19:37:53Z", + "deletions": 101, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45342/files", - "html_url": "https://github.com/huggingface/transformers/pull/45342", + "files_url": "https://github.com/huggingface/transformers/pull/45365/files", + "html_url": "https://github.com/huggingface/transformers/pull/45365", "labels": [], "merged": false, - "number": 45342, + "number": 45365, "review_comments_count": 0, - "state": "open", - "title": "Use `_keys_to_ignore_on_load_unexpected/missing` recursively from children", - "updated_at": "2026-04-09T14:23:31Z" + "state": "closed", + "title": "Refactor GPT-J output tracing to use standardized decorators", + "updated_at": "2026-04-10T19:53:25Z" }, { - "additions": 17, - "author": "Cyrilvallez", - "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? Supersedes https://github.com/huggingface/transformers/pull/45314 with a better fix. Fixes https://github.com/huggingface/transformers/issues/45216 and https://github.com/huggingface/transformers/issues/45310 and ht\u2026", - "changed_files": 2, + "additions": 815, + "author": "caiovicentino", + "author_association": "NONE", + "body_excerpt": "### Summary Adds a third backend to `QuantizedCache`: `polarquant`. Joins the existing `quanto` and `hqq` options and implements a Walsh-Hadamard rotation plus Lloyd-Max scalar quantization scheme tuned for KV cache compression. Pure PyTor\u2026", + "changed_files": 5, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 7, - "conversation_url": "https://github.com/huggingface/transformers/pull/45340", - "created_at": "2026-04-09T12:02:14Z", - "deletions": 14, - "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45340/files", - "html_url": "https://github.com/huggingface/transformers/pull/45340", + "comments_count": 8, + "conversation_url": "https://github.com/huggingface/transformers/pull/45364", + "created_at": "2026-04-10T19:02:16Z", + "deletions": 2, + "draft": true, + "files_url": "https://github.com/huggingface/transformers/pull/45364/files", + "html_url": "https://github.com/huggingface/transformers/pull/45364", "labels": [], - "merged": true, - "number": 45340, + "merged": false, + "number": 45364, "review_comments_count": 0, "state": "closed", - "title": "Fix conversion mappings for vlms", - "updated_at": "2026-04-10T10:18:09Z" + "title": "Add PolarQuant backend to QuantizedCache (Hadamard-rotated Lloyd-Max)", + "updated_at": "2026-04-13T13:31:36Z" }, { - "additions": 156, - "author": "tarekziade", + "additions": 301, + "author": "michaelbenayoun", "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? The CircleCI config file is not ruff formatted, leading to unwanted changes when it's opened in an editor that follows our repository ruff configuration. This patch adds it and runs `make style` to update it", - "changed_files": 3, + "body_excerpt": "# What does this PR do? This PR adds support for fusing multiple modules into a single kernel \u2014 the motivating case being fused RMSNorm+MLP kernels, but the API is generic. ## What changed - `FusedModuleBase`, `fuse_modules`, `unfuse_modul\u2026", + "changed_files": 5, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 3, - "conversation_url": "https://github.com/huggingface/transformers/pull/45339", - "created_at": "2026-04-09T09:44:16Z", - "deletions": 58, - "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45339/files", - "html_url": "https://github.com/huggingface/transformers/pull/45339", + "comments_count": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/45363", + "created_at": "2026-04-10T18:53:22Z", + "deletions": 5, + "draft": true, + "files_url": "https://github.com/huggingface/transformers/pull/45363/files", + "html_url": "https://github.com/huggingface/transformers/pull/45363", "labels": [], - "merged": true, - "number": 45339, - "review_comments_count": 0, - "state": "closed", - "title": "chore: added circleci python script to ruff and ty checkers", - "updated_at": "2026-04-09T12:00:08Z" + "merged": false, + "number": 45363, + "review_comments_count": 3, + "state": "open", + "title": "n-to-1 kernel fusion via `KernelConfig`", + "updated_at": "2026-04-14T13:14:57Z" }, { - "additions": 37, - "author": "RudrenduPaul", - "author_association": "FIRST_TIME_CONTRIBUTOR", - "body_excerpt": "Closes #45162 ## What this PR does Expands the docstrings of `_can_set_attn_implementation` and `_can_set_experts_implementation` in `modeling_utils.py` to explicitly document the known limitations of their source-inspection heuristic. **C\u2026", - "changed_files": 1, + "additions": 98, + "author": "zucchini-nlp", + "author_association": "MEMBER", + "body_excerpt": "# What does this PR do? Fixes https://github.com/huggingface/trl/issues/5497, also fixes https://github.com/huggingface/transformers/issues/45390 TL;DR; the base model prefix is never appended if it is part of a bigger VLM, which was true\u2026", + "changed_files": 4, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/45338", - "created_at": "2026-04-09T09:35:52Z", - "deletions": 4, + "comments_count": 12, + "conversation_url": "https://github.com/huggingface/transformers/pull/45361", + "created_at": "2026-04-10T13:49:28Z", + "deletions": 13, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45338/files", - "html_url": "https://github.com/huggingface/transformers/pull/45338", + "files_url": "https://github.com/huggingface/transformers/pull/45361/files", + "html_url": "https://github.com/huggingface/transformers/pull/45361", "labels": [], "merged": false, - "number": 45338, + "number": 45361, "review_comments_count": 0, "state": "closed", - "title": "docs: document known limitations of _can_set_attn/experts_implementation source inspection", - "updated_at": "2026-04-09T13:43:04Z" + "title": "Add CLIP-like models in conversion to VLMs", + "updated_at": "2026-04-17T10:27:31Z" }, { - "additions": 13, - "author": "tarekziade", + "additions": 3, + "author": "hanouticelina", "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? Removing test_hub from CI for now", + "body_excerpt": "# What does this PR do? `huggingface-cli` is deprecated and no longer maintained. This PR updates the remaining references with `hf`", "changed_files": 3, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 3, - "conversation_url": "https://github.com/huggingface/transformers/pull/45337", - "created_at": "2026-04-09T08:54:45Z", - "deletions": 30, + "comments_count": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/45360", + "created_at": "2026-04-10T11:59:37Z", + "deletions": 3, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45337/files", - "html_url": "https://github.com/huggingface/transformers/pull/45337", + "files_url": "https://github.com/huggingface/transformers/pull/45360/files", + "html_url": "https://github.com/huggingface/transformers/pull/45360", "labels": [], - "merged": true, - "number": 45337, + "merged": false, + "number": 45360, "review_comments_count": 0, - "state": "closed", - "title": "chore: remove test_hub for now", - "updated_at": "2026-04-09T09:28:52Z" + "state": "open", + "title": "Replace deprecated `huggingface-cli` references with `hf`", + "updated_at": "2026-04-10T12:13:18Z" }, { - "additions": 84, - "author": "Cyrilvallez", + "additions": 3, + "author": "ArthurZucker", "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? As per the title. Follow-up of https://github.com/huggingface/transformers/pull/45312. This removes the unnecessary weights, and silently skip them during loading, so that the checkpoints on the hub do not have to b\u2026", - "changed_files": 3, + "body_excerpt": "Fixes #45356 ## Summary - Remove `kimi_k25` from `MODELS_WITH_INCORRECT_HUB_TOKENIZER_CLASS`: its remote `TikTokenTokenizer` is the only correct backend \u2014 the model has no `tokenizer.json`, and its `added_tokens_decoder` has non-sequential\u2026", + "changed_files": 2, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/45336", - "created_at": "2026-04-09T08:43:55Z", - "deletions": 26, + "conversation_url": "https://github.com/huggingface/transformers/pull/45359", + "created_at": "2026-04-10T10:42:32Z", + "deletions": 4, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45336/files", - "html_url": "https://github.com/huggingface/transformers/pull/45336", - "labels": [], + "files_url": "https://github.com/huggingface/transformers/pull/45359/files", + "html_url": "https://github.com/huggingface/transformers/pull/45359", + "labels": [ + "for patch" + ], "merged": true, - "number": 45336, - "review_comments_count": 7, + "number": 45359, + "review_comments_count": 0, "state": "closed", - "title": "[gemma4] Remove all shared weights, and silently skip them during loading", - "updated_at": "2026-04-09T13:23:33Z" + "title": "Fix Kimi-K2.5 tokenizer regression and _patch_mistral_regex AttributeError", + "updated_at": "2026-04-13T15:16:26Z" }, { - "additions": 1333, - "author": "kmswin1", - "author_association": "FIRST_TIME_CONTRIBUTOR", - "body_excerpt": "Add A.X K1 model architecture What does this PR do? This PR adds support for A.X K1, a large-scale Mixture-of-Experts (MoE) language model developed by [SK Telecom](https://huggingface.co/skt). A.X K1 contains 519B total parameters with 33\u2026", - "changed_files": 8, + "additions": 111, + "author": "Cyrilvallez", + "author_association": "MEMBER", + "body_excerpt": "# What does this PR do? Fix https://github.com/huggingface/transformers/issues/45357 finally. This was not catched in the previous fix, as the model can be reloaded correctly by `from_pretrained`, but keys are still wrongly serialized! Aft\u2026", + "changed_files": 24, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 0, - "conversation_url": "https://github.com/huggingface/transformers/pull/45334", - "created_at": "2026-04-09T06:21:43Z", - "deletions": 0, + "comments_count": 5, + "conversation_url": "https://github.com/huggingface/transformers/pull/45358", + "created_at": "2026-04-10T10:19:42Z", + "deletions": 42, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45334/files", - "html_url": "https://github.com/huggingface/transformers/pull/45334", - "labels": [], - "merged": false, - "number": 45334, - "review_comments_count": 0, + "files_url": "https://github.com/huggingface/transformers/pull/45358/files", + "html_url": "https://github.com/huggingface/transformers/pull/45358", + "labels": [ + "for patch" + ], + "merged": true, + "number": 45358, + "review_comments_count": 2, "state": "closed", - "title": "Feature/add axk1", - "updated_at": "2026-04-14T07:23:08Z" + "title": "Fix vlm weight mappings", + "updated_at": "2026-04-10T15:41:47Z" }, { - "additions": 471, - "author": "eladsegal", - "author_association": "CONTRIBUTOR", - "body_excerpt": "# What does this PR do? Adds heterogeneous model support - the ability for individual layers to differ from the global config (e.g., different `intermediate_size`, `num_key_value_heads`) and to skip sub-modules entirely (MLP, attention, et\u2026", - "changed_files": 5, + "additions": 2297, + "author": "Shikhar-S", + "author_association": "FIRST_TIME_CONTRIBUTOR", + "body_excerpt": "# What does this PR do? This PR introduces [PhoneticXeus](https://arxiv.org/abs/2603.29042), which is the state-of-the-art universal phone recognizer trained on 70+ languages and evaluated on ~100 languages. The model should have high util\u2026", + "changed_files": 16, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/45333", - "created_at": "2026-04-09T06:18:11Z", + "comments_count": 5, + "conversation_url": "https://github.com/huggingface/transformers/pull/45355", + "created_at": "2026-04-10T04:26:05Z", "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45333/files", - "html_url": "https://github.com/huggingface/transformers/pull/45333", - "labels": [], + "files_url": "https://github.com/huggingface/transformers/pull/45355/files", + "html_url": "https://github.com/huggingface/transformers/pull/45355", + "labels": [ + "New model", + "Audio" + ], "merged": false, - "number": 45333, + "number": 45355, "review_comments_count": 0, "state": "open", - "title": "Add heterogeneous config support (per-layer configuration)", - "updated_at": "2026-04-14T14:08:07Z" + "title": "Add universal phone recognition model - PhoneticXeus", + "updated_at": "2026-04-14T15:02:02Z" }, { - "additions": 2152, - "author": "eladsegal", - "author_association": "CONTRIBUTOR", - "body_excerpt": "# What does this PR do? Adds heterogeneous model support - the ability for individual layers to differ from the global config (e.g., different `intermediate_size`, `num_key_value_heads`) and to skip sub-modules entirely (MLP, attention, et\u2026", - "changed_files": 14, + "additions": 8, + "author": "winglian", + "author_association": "COLLABORATOR", + "body_excerpt": "# What does this PR do? Gemma 4 was calculating the CE loss incorrectly and not handling gradient accumulation steps properly, leading to losses scaled up by the value of the gradient accumulation steps rather than letting the built in HF\u2026", + "changed_files": 4, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/45332", - "created_at": "2026-04-09T05:56:31Z", - "deletions": 40, + "conversation_url": "https://github.com/huggingface/transformers/pull/45354", + "created_at": "2026-04-10T02:54:56Z", + "deletions": 76, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45332/files", - "html_url": "https://github.com/huggingface/transformers/pull/45332", + "files_url": "https://github.com/huggingface/transformers/pull/45354/files", + "html_url": "https://github.com/huggingface/transformers/pull/45354", "labels": [], - "merged": false, - "number": 45332, + "merged": true, + "number": 45354, "review_comments_count": 0, - "state": "open", - "title": "Add heterogeneous model support (per-layer config and modeling)", - "updated_at": "2026-04-15T04:50:09Z" + "state": "closed", + "title": "fix gemma4 gradient accumulation loss and last token incorrect labels", + "updated_at": "2026-04-10T10:08:12Z" }, { - "additions": 12, - "author": "Kash6", + "additions": 30, + "author": "wilnn", "author_association": "CONTRIBUTOR", - "body_excerpt": "get_rope_index unconditionally applies tokens_per_second temporal scaling to both images and videos. For still images (modality_type == 1), this shifts the temporal position origin to start_position * tokens_per_second instead of start_pos\u2026", - "changed_files": 2, + "body_excerpt": "# What does this PR do? Add `**kwargs` to all methods in the `CallbackHandler` class. Previously, only the `CallbackHandler.on_push_begin()` method accepted `**kwargs`, while all other methods did not. This forces users who want to customi\u2026", + "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 3, - "conversation_url": "https://github.com/huggingface/transformers/pull/45330", - "created_at": "2026-04-08T23:51:52Z", - "deletions": 2, + "comments_count": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/45353", + "created_at": "2026-04-09T23:14:20Z", + "deletions": 30, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45330/files", - "html_url": "https://github.com/huggingface/transformers/pull/45330", - "labels": [ - "for patch" - ], + "files_url": "https://github.com/huggingface/transformers/pull/45353/files", + "html_url": "https://github.com/huggingface/transformers/pull/45353", + "labels": [], "merged": true, - "number": 45330, + "number": 45353, "review_comments_count": 0, "state": "closed", - "title": "Fix Qwen2.5-VL temporal RoPE scaling applied to still images", - "updated_at": "2026-04-14T03:21:32Z" + "title": "add kwargs to all methods in the CallbackHandler class", + "updated_at": "2026-04-10T14:16:46Z" }, { - "additions": 152, - "author": "abidlabs", - "author_association": "MEMBER", - "body_excerpt": "Updates `TrackioCallback` and `TrainingArguments` for the latest version of Trackio using HF Buckets as the backend, and control over creating a static Space for the Trackio dashboard during or at the end of training. These are now the `Tr\u2026", - "changed_files": 2, + "additions": 4, + "author": "RudrenduPaul", + "author_association": "CONTRIBUTOR", + "body_excerpt": "## What does this PR do? Corrects an incorrect return type annotation on `Qwen3MoeSparseMoeBlock.forward`. The method is annotated as returning `tuple[torch.Tensor, torch.Tensor]` but actually returns a single reshaped `torch.Tensor` (see\u2026", + "changed_files": 4, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 6, - "conversation_url": "https://github.com/huggingface/transformers/pull/45329", - "created_at": "2026-04-08T22:36:08Z", - "deletions": 57, + "comments_count": 5, + "conversation_url": "https://github.com/huggingface/transformers/pull/45352", + "created_at": "2026-04-09T21:53:31Z", + "deletions": 4, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45329/files", - "html_url": "https://github.com/huggingface/transformers/pull/45329", + "files_url": "https://github.com/huggingface/transformers/pull/45352/files", + "html_url": "https://github.com/huggingface/transformers/pull/45352", "labels": [], "merged": true, - "number": 45329, - "review_comments_count": 21, + "number": 45352, + "review_comments_count": 0, "state": "closed", - "title": "Update `trackio` integration to use Buckets and \"freeze\" Space after training", - "updated_at": "2026-04-13T14:30:27Z" + "title": "fix(qwen3_moe): correct return type annotation on Qwen3MoeSparseMoeBlock.forward", + "updated_at": "2026-04-13T14:07:30Z" }, { - "additions": 9, - "author": "RyanMullins", + "additions": 15, + "author": "RudrenduPaul", "author_association": "CONTRIBUTOR", - "body_excerpt": "# What does this PR do? Fixes #45242 * Drops `k_proj`, `k_norm`, and `v_proj` weights for `Gemma4TextAttention` modules from the checkpoint if the layer shares KV cache values. These changes can also be adapted to Gemma 3n if that's desira\u2026", + "body_excerpt": "## What does this PR do? Fixes a crash in `get_device_properties()` in `testing_utils.py` when CUDA is installed on the system but no GPU device is present (e.g., a CPU-only cloud studio with CUDA libraries installed). The function called\u2026", "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 9, - "conversation_url": "https://github.com/huggingface/transformers/pull/45328", - "created_at": "2026-04-08T20:43:42Z", - "deletions": 6, + "comments_count": 5, + "conversation_url": "https://github.com/huggingface/transformers/pull/45351", + "created_at": "2026-04-09T21:51:42Z", + "deletions": 14, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45328/files", - "html_url": "https://github.com/huggingface/transformers/pull/45328", + "files_url": "https://github.com/huggingface/transformers/pull/45351/files", + "html_url": "https://github.com/huggingface/transformers/pull/45351", "labels": [], "merged": false, - "number": 45328, - "review_comments_count": 0, + "number": 45351, + "review_comments_count": 1, "state": "open", - "title": "Drop unused Gemma4TextAttention weights when sharing KV Cache", - "updated_at": "2026-04-09T18:31:13Z" + "title": "fix(testing_utils): guard get_device_capability with torch.cuda.is_available()", + "updated_at": "2026-04-17T02:42:53Z" }, { - "additions": 337, - "author": "stevhliu", - "author_association": "MEMBER", - "body_excerpt": "refactors the how to add a model with modular transformers doc: - structure: - flipped the order so you learn how to write the modular file first before generating it - remove the motivator examples with BERT/RoBERTa - merge the two `super\u2026", - "changed_files": 2, + "additions": 1090, + "author": "mrutkows", + "author_association": "FIRST_TIME_CONTRIBUTOR", + "body_excerpt": "# What does this PR do? Support new Granite 4 vision arch. Fixes # (issue) ## Code Agent Policy The Transformers repo is currently being overwhelmed by a large number of PRs and issue comments written by c\u2026", + "changed_files": 8, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/45327", - "created_at": "2026-04-08T20:23:28Z", - "deletions": 408, + "comments_count": 3, + "conversation_url": "https://github.com/huggingface/transformers/pull/45350", + "created_at": "2026-04-09T17:46:37Z", + "deletions": 0, "draft": true, - "files_url": "https://github.com/huggingface/transformers/pull/45327/files", - "html_url": "https://github.com/huggingface/transformers/pull/45327", + "files_url": "https://github.com/huggingface/transformers/pull/45350/files", + "html_url": "https://github.com/huggingface/transformers/pull/45350", "labels": [], "merged": false, - "number": 45327, - "review_comments_count": 1, + "number": 45350, + "review_comments_count": 0, "state": "open", - "title": "[docs] modular transformers", - "updated_at": "2026-04-15T18:15:34Z" + "title": "WIP: Add support for Granite4VisionForConditionalGeneration", + "updated_at": "2026-04-10T12:34:50Z" }, { - "additions": 19, - "author": "harshaljanjani", + "additions": 90, + "author": "florian6973", "author_association": "CONTRIBUTOR", - "body_excerpt": "### What does this PR do? \u2192 This PR introduces compat fixes across several audio models to ensure they can be loaded and used by a companion vLLM PR. These changes are deliberate and are blocking [this vLLM PR](https://github.co\u2026", - "changed_files": 11, + "body_excerpt": "# What does this PR do? Fixes #45305 Add a regression test in `TrainerGradientAccumulationTest` to avoid passing the GAS value to Accelerate by mistake Description: I force the value of the `num_steps` parameter to be 1, and the regression\u2026", + "changed_files": 2, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 4, - "conversation_url": "https://github.com/huggingface/transformers/pull/45326", - "created_at": "2026-04-08T18:28:35Z", - "deletions": 0, + "comments_count": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/45349", + "created_at": "2026-04-09T17:24:39Z", + "deletions": 2, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45326/files", - "html_url": "https://github.com/huggingface/transformers/pull/45326", - "labels": [], - "merged": false, - "number": 45326, - "review_comments_count": 0, - "state": "open", - "title": "feat[vLLM \u00d7 v5]: Add vLLM compatibility for audio models", - "updated_at": "2026-04-14T15:45:32Z" + "files_url": "https://github.com/huggingface/transformers/pull/45349/files", + "html_url": "https://github.com/huggingface/transformers/pull/45349", + "labels": [ + "for patch" + ], + "merged": true, + "number": 45349, + "review_comments_count": 6, + "state": "closed", + "title": "Fix #45305 + add regression test GAS", + "updated_at": "2026-04-13T14:41:43Z" }, { - "additions": 236, - "author": "zucchini-nlp", + "additions": 50, + "author": "qgallouedec", "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? Fixes https://github.com/huggingface/transformers/issues/45276 and https://github.com/huggingface/transformers/issues/45335 In gemma4 per-layer inputs have to be resized as long as they aren't part of soft multimoda\u2026", - "changed_files": 16, + "body_excerpt": "# What does this PR do? Fixes #45290 ## Code Agent Policy The Transformers repo is currently being overwhelmed by a large number of PRs and issue comments written by code agents. We are currently bottlenecked by our ability to review and r\u2026", + "changed_files": 5, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 4, - "conversation_url": "https://github.com/huggingface/transformers/pull/45324", - "created_at": "2026-04-08T17:06:26Z", - "deletions": 53, + "comments_count": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/45348", + "created_at": "2026-04-09T15:59:07Z", + "deletions": 19, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45324/files", - "html_url": "https://github.com/huggingface/transformers/pull/45324", + "files_url": "https://github.com/huggingface/transformers/pull/45348/files", + "html_url": "https://github.com/huggingface/transformers/pull/45348", "labels": [], "merged": true, - "number": 45324, - "review_comments_count": 5, + "number": 45348, + "review_comments_count": 7, "state": "closed", - "title": "Gemma4 resizing per layer inputs", - "updated_at": "2026-04-15T11:15:23Z" + "title": "Fix `apply_chat_template` crash on `tool_call` messages without content", + "updated_at": "2026-04-13T19:44:38Z" }, { - "additions": 225, - "author": "remi-or", + "additions": 35, + "author": "Cyrilvallez", "author_association": "MEMBER", - "body_excerpt": "# Summary This PR fixes the issue raised in https://github.com/huggingface/transformers/pull/45274 . CUDA graph reuse in continuous batching used (num_q_tokens, max_kv_read) as the graph cache key. However, FlashAttention varlen kernels al\u2026", - "changed_files": 7, + "body_excerpt": "# What does this PR do? As per the title. `accelerate` destroys the dict otherwise, if it's not BOTH passed as kwarg AND part of `_skip_keys_device_placement`.......... `per_layer_input` needs to stay as a positional arg, for gradient chec\u2026", + "changed_files": 3, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 7, - "conversation_url": "https://github.com/huggingface/transformers/pull/45323", - "created_at": "2026-04-08T16:30:18Z", - "deletions": 126, + "comments_count": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/45347", + "created_at": "2026-04-09T15:31:34Z", + "deletions": 6, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45323/files", - "html_url": "https://github.com/huggingface/transformers/pull/45323", + "files_url": "https://github.com/huggingface/transformers/pull/45347/files", + "html_url": "https://github.com/huggingface/transformers/pull/45347", "labels": [], - "merged": false, - "number": 45323, - "review_comments_count": 2, - "state": "open", - "title": "[CB] Fix capture of max_seqlen", - "updated_at": "2026-04-16T10:19:54Z" + "merged": true, + "number": 45347, + "review_comments_count": 0, + "state": "closed", + "title": "[gemma4] Fix device map auto", + "updated_at": "2026-04-09T15:45:15Z" }, { - "additions": 20, - "author": "andrewor14", - "author_association": "CONTRIBUTOR", - "body_excerpt": "**Summary:** TorchAO recently deprecated AffineQuantizedTensor and related classes (pytorch/ao#2752). These will be removed in the next release. We should remove references of these classes in transformers before then. **Test Plan:** ``` p\u2026", + "additions": 16, + "author": "ionut-anghelina", + "author_association": "FIRST_TIME_CONTRIBUTOR", + "body_excerpt": null, "changed_files": 2, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 4, - "conversation_url": "https://github.com/huggingface/transformers/pull/45321", - "created_at": "2026-04-08T15:42:16Z", - "deletions": 29, + "comments_count": 5, + "conversation_url": "https://github.com/huggingface/transformers/pull/45346", + "created_at": "2026-04-09T14:48:28Z", + "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45321/files", - "html_url": "https://github.com/huggingface/transformers/pull/45321", + "files_url": "https://github.com/huggingface/transformers/pull/45346/files", + "html_url": "https://github.com/huggingface/transformers/pull/45346", "labels": [], "merged": false, - "number": 45321, - "review_comments_count": 0, + "number": 45346, + "review_comments_count": 1, "state": "open", - "title": "Remove references to torchao's AffineQuantizedTensor", - "updated_at": "2026-04-09T12:21:03Z" + "title": "Fix Double Application of Softmax for Router Logits in MoE models", + "updated_at": "2026-04-13T12:40:28Z" }, { - "additions": 5, - "author": "Regata3010", - "author_association": "CONTRIBUTOR", - "body_excerpt": "## What does this PR do? Fixes a crash in assisted generation when using model pairs with different vocabulary sizes but the same tokenizer family (e.g., Qwen2.5-7B + Qwen2.5-0.5B). `map_input_embeddings` is only initialized when `len(self\u2026", - "changed_files": 1, + "additions": 30, + "author": "ansley", + "author_association": "NONE", + "body_excerpt": "The `transformers` V5 \"rm slow tokenizers\" refactor (\\#40936) aliased `LlamaTokenizerFast` to `LlamaTokenizer`, whose `__init__` unconditionally installs a SentencePiece Metaspace pre-tokenizer. This is correct for classic Llama/Llama-2 mo\u2026", + "changed_files": 2, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/45320", - "created_at": "2026-04-08T15:30:16Z", - "deletions": 1, + "comments_count": 6, + "conversation_url": "https://github.com/huggingface/transformers/pull/45345", + "created_at": "2026-04-09T14:31:40Z", + "deletions": 14, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45320/files", - "html_url": "https://github.com/huggingface/transformers/pull/45320", + "files_url": "https://github.com/huggingface/transformers/pull/45345/files", + "html_url": "https://github.com/huggingface/transformers/pull/45345", "labels": [], - "merged": true, - "number": 45320, + "merged": false, + "number": 45345, "review_comments_count": 0, "state": "closed", - "title": "Fix AttributeError in AssistantToTargetTranslator.unmap_input_ids with cross-vocab models", - "updated_at": "2026-04-10T17:46:37Z" + "title": "Fix ByteLevel-BPE tokenizers silently breaking in `LlamaTokenizer`", + "updated_at": "2026-04-10T12:45:24Z" }, { - "additions": 78, + "additions": 6, "author": "tarekziade", "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? - Removes `HUGGINGFACE_CO_STAGING` when downloading artifacts - adds a retry mechanism for external URLs (with partial file cleanup)", - "changed_files": 6, + "body_excerpt": "# What does this PR do? Simple hook to display test duration. This will append inline duration per test during the run, example: ``` tests/utils/test_configuration_utils.py::ConfigPushToHubTester::test_push_to_hub [gw1] [ 90%] PASSED tests\u2026", + "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/45319", - "created_at": "2026-04-08T14:51:48Z", - "deletions": 32, + "comments_count": 1, + "conversation_url": "https://github.com/huggingface/transformers/pull/45344", + "created_at": "2026-04-09T14:22:46Z", + "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45319/files", - "html_url": "https://github.com/huggingface/transformers/pull/45319", + "files_url": "https://github.com/huggingface/transformers/pull/45344/files", + "html_url": "https://github.com/huggingface/transformers/pull/45344", "labels": [], "merged": true, - "number": 45319, - "review_comments_count": 3, + "number": 45344, + "review_comments_count": 0, "state": "closed", - "title": "fix: dont download artifacts from the test hub", - "updated_at": "2026-04-15T16:52:10Z" + "title": "refactor: display test duration", + "updated_at": "2026-04-09T15:19:26Z" }, { - "additions": 5, - "author": "tarekziade", + "additions": 8, + "author": "Cyrilvallez", "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? AutoTokenizer.register() adds classes to the global `REGISTERED_TOKENIZER_CLASSES` dict and some tests did not clean up behind them, leading to leaky state between tests", + "body_excerpt": null, "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 5, - "conversation_url": "https://github.com/huggingface/transformers/pull/45318", - "created_at": "2026-04-08T13:46:47Z", + "comments_count": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/45342", + "created_at": "2026-04-09T14:13:15Z", "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45318/files", - "html_url": "https://github.com/huggingface/transformers/pull/45318", - "labels": [], - "merged": true, - "number": 45318, - "review_comments_count": 0, - "state": "closed", - "title": "fix: leak in tokenizer registry for `test_processors`", - "updated_at": "2026-04-09T10:12:46Z" - }, - { - "additions": 24, - "author": "mohdfaour03", - "author_association": "FIRST_TIME_CONTRIBUTOR", - "body_excerpt": "Fixes #45081 ## Problem Loading a Mistral tokenizer with `fix_mistral_regex=True` crashes because `_patch_mistral_regex` receives a raw `tokenizers.Tokenizer` but tries to access `.backend_tokenizer.pre_tokenizer` on it \u2014 that attribute on\u2026", - "changed_files": 2, - "cluster_id": "cluster-45081-3", - "cluster_ids": [ - "cluster-45081-3" - ], - "cluster_role": "canonical", - "comments_count": 3, - "conversation_url": "https://github.com/huggingface/transformers/pull/45317", - "created_at": "2026-04-08T13:38:46Z", - "deletions": 3, - "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45317/files", - "html_url": "https://github.com/huggingface/transformers/pull/45317", + "files_url": "https://github.com/huggingface/transformers/pull/45342/files", + "html_url": "https://github.com/huggingface/transformers/pull/45342", "labels": [], "merged": false, - "number": 45317, - "review_comments_count": 1, + "number": 45342, + "review_comments_count": 0, "state": "open", - "title": "Fix AttributeError in _patch_mistral_regex when fix_mistral_regex=True ", - "updated_at": "2026-04-09T13:52:30Z" + "title": "Use `_keys_to_ignore_on_load_unexpected/missing` recursively from children", + "updated_at": "2026-04-09T14:23:31Z" }, { - "additions": 9, - "author": "zucchini-nlp", + "additions": 17, + "author": "Cyrilvallez", "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? As per title and seems like there are no objections. Also added some colors in verbose logging cc @tarekziade @tomaarsen @yonigozlan if you have better ideas to style this (just tagging since you reacted \u2795 ) This is\u2026", - "changed_files": 1, + "body_excerpt": "# What does this PR do? Supersedes https://github.com/huggingface/transformers/pull/45314 with a better fix. Fixes https://github.com/huggingface/transformers/issues/45216 and https://github.com/huggingface/transformers/issues/45310 and ht\u2026", + "changed_files": 2, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 8, - "conversation_url": "https://github.com/huggingface/transformers/pull/45316", - "created_at": "2026-04-08T13:01:15Z", - "deletions": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/45340", + "created_at": "2026-04-09T12:02:14Z", + "deletions": 14, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45316/files", - "html_url": "https://github.com/huggingface/transformers/pull/45316", + "files_url": "https://github.com/huggingface/transformers/pull/45340/files", + "html_url": "https://github.com/huggingface/transformers/pull/45340", "labels": [], "merged": true, - "number": 45316, - "review_comments_count": 1, + "number": 45340, + "review_comments_count": 0, "state": "closed", - "title": "Logger has `[transformers]` prefix in non-verbose mode", - "updated_at": "2026-04-14T14:08:04Z" + "title": "Fix conversion mappings for vlms", + "updated_at": "2026-04-17T08:25:29Z" }, { - "additions": 46, - "author": "Rocketknight1", + "additions": 156, + "author": "tarekziade", "author_association": "MEMBER", - "body_excerpt": "Reusing a variable name meant that we returned a softmaxed value instead of the original logits in some MoE routers. This generally did not affect inference, but could affect the auxiliary loss on MoE logits in training when the coefficien\u2026", - "changed_files": 15, + "body_excerpt": "# What does this PR do? The CircleCI config file is not ruff formatted, leading to unwanted changes when it's opened in an editor that follows our repository ruff configuration. This patch adds it and runs `make style` to update it", + "changed_files": 3, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 6, - "conversation_url": "https://github.com/huggingface/transformers/pull/45315", - "created_at": "2026-04-08T12:54:52Z", - "deletions": 30, + "comments_count": 3, + "conversation_url": "https://github.com/huggingface/transformers/pull/45339", + "created_at": "2026-04-09T09:44:16Z", + "deletions": 58, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45315/files", - "html_url": "https://github.com/huggingface/transformers/pull/45315", + "files_url": "https://github.com/huggingface/transformers/pull/45339/files", + "html_url": "https://github.com/huggingface/transformers/pull/45339", "labels": [], - "merged": false, - "number": 45315, + "merged": true, + "number": 45339, "review_comments_count": 0, "state": "closed", - "title": "Fix softmaxing router logits", - "updated_at": "2026-04-10T13:25:20Z" + "title": "chore: added circleci python script to ruff and ty checkers", + "updated_at": "2026-04-09T12:00:08Z" }, { - "additions": 18, - "author": "zucchini-nlp", - "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? fixes https://github.com/huggingface/transformers/issues/45216 and https://github.com/huggingface/transformers/issues/45310 and https://github.com/huggingface/transformers/issues/45313 TBH load-save-load works for t\u2026", - "changed_files": 10, + "additions": 37, + "author": "RudrenduPaul", + "author_association": "FIRST_TIME_CONTRIBUTOR", + "body_excerpt": "Closes #45162 ## What this PR does Expands the docstrings of `_can_set_attn_implementation` and `_can_set_experts_implementation` in `modeling_utils.py` to explicitly document the known limitations of their source-inspection heuristic. **C\u2026", + "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/45314", - "created_at": "2026-04-08T11:54:53Z", - "deletions": 27, + "comments_count": 1, + "conversation_url": "https://github.com/huggingface/transformers/pull/45338", + "created_at": "2026-04-09T09:35:52Z", + "deletions": 4, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45314/files", - "html_url": "https://github.com/huggingface/transformers/pull/45314", + "files_url": "https://github.com/huggingface/transformers/pull/45338/files", + "html_url": "https://github.com/huggingface/transformers/pull/45338", "labels": [], "merged": false, - "number": 45314, + "number": 45338, "review_comments_count": 0, "state": "closed", - "title": "Conversion for LLM class loading with VLM ckpt ", - "updated_at": "2026-04-10T09:18:26Z" + "title": "docs: document known limitations of _can_set_attn/experts_implementation source inspection", + "updated_at": "2026-04-09T13:43:04Z" }, { - "additions": 61, + "additions": 13, + "author": "tarekziade", + "author_association": "MEMBER", + "body_excerpt": "# What does this PR do? Removing test_hub from CI for now", + "changed_files": 3, + "cluster_id": null, + "cluster_ids": [], + "cluster_role": null, + "comments_count": 3, + "conversation_url": "https://github.com/huggingface/transformers/pull/45337", + "created_at": "2026-04-09T08:54:45Z", + "deletions": 30, + "draft": false, + "files_url": "https://github.com/huggingface/transformers/pull/45337/files", + "html_url": "https://github.com/huggingface/transformers/pull/45337", + "labels": [], + "merged": true, + "number": 45337, + "review_comments_count": 0, + "state": "closed", + "title": "chore: remove test_hub for now", + "updated_at": "2026-04-09T09:28:52Z" + }, + { + "additions": 84, "author": "Cyrilvallez", "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? As per the title. It was confirmed that the weight matrices of shared layers are NEVER used, and that kv states should ALWAYS be shared, even during training or inference without Cache. I will fully remove them on a\u2026", + "body_excerpt": "# What does this PR do? As per the title. Follow-up of https://github.com/huggingface/transformers/pull/45312. This removes the unnecessary weights, and silently skip them during loading, so that the checkpoints on the hub do not have to b\u2026", "changed_files": 3, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/45312", - "created_at": "2026-04-08T11:33:33Z", - "deletions": 24, + "comments_count": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/45336", + "created_at": "2026-04-09T08:43:55Z", + "deletions": 26, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45312/files", - "html_url": "https://github.com/huggingface/transformers/pull/45312", + "files_url": "https://github.com/huggingface/transformers/pull/45336/files", + "html_url": "https://github.com/huggingface/transformers/pull/45336", "labels": [], "merged": true, - "number": 45312, + "number": 45336, + "review_comments_count": 7, + "state": "closed", + "title": "[gemma4] Remove all shared weights, and silently skip them during loading", + "updated_at": "2026-04-09T13:23:33Z" + }, + { + "additions": 1333, + "author": "kmswin1", + "author_association": "FIRST_TIME_CONTRIBUTOR", + "body_excerpt": "Add A.X K1 model architecture What does this PR do? This PR adds support for A.X K1, a large-scale Mixture-of-Experts (MoE) language model developed by [SK Telecom](https://huggingface.co/skt). A.X K1 contains 519B total parameters with 33\u2026", + "changed_files": 8, + "cluster_id": null, + "cluster_ids": [], + "cluster_role": null, + "comments_count": 0, + "conversation_url": "https://github.com/huggingface/transformers/pull/45334", + "created_at": "2026-04-09T06:21:43Z", + "deletions": 0, + "draft": false, + "files_url": "https://github.com/huggingface/transformers/pull/45334/files", + "html_url": "https://github.com/huggingface/transformers/pull/45334", + "labels": [], + "merged": false, + "number": 45334, "review_comments_count": 0, "state": "closed", - "title": "[gemma4] Dissociate kv states sharing from the Cache", - "updated_at": "2026-04-09T08:08:07Z" + "title": "Feature/add axk1", + "updated_at": "2026-04-14T07:23:08Z" }, { - "additions": 2, - "author": "KoichiYasuoka", + "additions": 471, + "author": "eladsegal", "author_association": "CONTRIBUTOR", - "body_excerpt": "# What does this PR do? Fixes #45292 (seems to come from #41580) ## Code Agent Policy The Transformers repo is currently being overwhelmed by a large number of PRs and issue comments written by code agents. We are currently bottlenecked by\u2026", - "changed_files": 1, + "body_excerpt": "# What does this PR do? Adds heterogeneous model support - the ability for individual layers to differ from the global config (e.g., different `intermediate_size`, `num_key_value_heads`) and to skip sub-modules entirely (MLP, attention, et\u2026", + "changed_files": 5, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 6, - "conversation_url": "https://github.com/huggingface/transformers/pull/45311", - "created_at": "2026-04-08T10:38:34Z", + "comments_count": 1, + "conversation_url": "https://github.com/huggingface/transformers/pull/45333", + "created_at": "2026-04-09T06:18:11Z", "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45311/files", - "html_url": "https://github.com/huggingface/transformers/pull/45311", + "files_url": "https://github.com/huggingface/transformers/pull/45333/files", + "html_url": "https://github.com/huggingface/transformers/pull/45333", "labels": [], "merged": false, - "number": 45311, + "number": 45333, "review_comments_count": 0, "state": "open", - "title": "resize_token_embeddings does not effect to output_embeddings", - "updated_at": "2026-04-16T15:04:43Z" + "title": "Add heterogeneous config support (per-layer configuration)", + "updated_at": "2026-04-14T14:08:07Z" }, { - "additions": 301, - "author": "agentspan", - "author_association": "NONE", - "body_excerpt": "## Summary Fixes #45290. `ProcessorMixin.apply_chat_template` and several related code paths assumed every message in a conversation has a `content` key. Assistant messages with `tool_calls` and no textual content (a valid shape per the Op\u2026", - "changed_files": 9, + "additions": 2152, + "author": "eladsegal", + "author_association": "CONTRIBUTOR", + "body_excerpt": "# What does this PR do? Adds heterogeneous model support - the ability for individual layers to differ from the global config (e.g., different `intermediate_size`, `num_key_value_heads`) and to skip sub-modules entirely (MLP, attention, et\u2026", + "changed_files": 14, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/45309", - "created_at": "2026-04-08T08:40:08Z", - "deletions": 23, + "conversation_url": "https://github.com/huggingface/transformers/pull/45332", + "created_at": "2026-04-09T05:56:31Z", + "deletions": 40, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45309/files", - "html_url": "https://github.com/huggingface/transformers/pull/45309", - "labels": [ - "Code agent slop" - ], + "files_url": "https://github.com/huggingface/transformers/pull/45332/files", + "html_url": "https://github.com/huggingface/transformers/pull/45332", + "labels": [], "merged": false, - "number": 45309, + "number": 45332, "review_comments_count": 0, - "state": "closed", - "title": "Fix KeyError in apply_chat_template when message has no content (#45290)", - "updated_at": "2026-04-08T11:30:37Z" + "state": "open", + "title": "Add heterogeneous model support (per-layer config and modeling)", + "updated_at": "2026-04-15T04:50:09Z" }, { - "additions": 10, - "author": "juliabush", - "author_association": "NONE", - "body_excerpt": "## What does this PR do? Fixes #29942 Flash Attention 2 inference equivalence tests for Whisper can fail due to higher numerical variance compared to the eager attention implementation. This PR increases the tolerance (`atol`, `rtol`) spec\u2026", - "changed_files": 1, + "additions": 12, + "author": "Kash6", + "author_association": "CONTRIBUTOR", + "body_excerpt": "get_rope_index unconditionally applies tokens_per_second temporal scaling to both images and videos. For still images (modality_type == 1), this shifts the temporal position origin to start_position * tokens_per_second instead of start_pos\u2026", + "changed_files": 2, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 0, - "conversation_url": "https://github.com/huggingface/transformers/pull/45303", - "created_at": "2026-04-07T21:37:00Z", + "comments_count": 3, + "conversation_url": "https://github.com/huggingface/transformers/pull/45330", + "created_at": "2026-04-08T23:51:52Z", "deletions": 2, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45303/files", - "html_url": "https://github.com/huggingface/transformers/pull/45303", + "files_url": "https://github.com/huggingface/transformers/pull/45330/files", + "html_url": "https://github.com/huggingface/transformers/pull/45330", "labels": [ - "Code agent slop" + "for patch" ], - "merged": false, - "number": 45303, + "merged": true, + "number": 45330, "review_comments_count": 0, "state": "closed", - "title": "Fix FA2 inference equivalence failures for Whisper (closes #29942)", - "updated_at": "2026-04-08T14:42:36Z" + "title": "Fix Qwen2.5-VL temporal RoPE scaling applied to still images", + "updated_at": "2026-04-14T03:21:32Z" }, { - "additions": 7, - "author": "jagwar", + "additions": 152, + "author": "abidlabs", "author_association": "MEMBER", - "body_excerpt": "## Security Fix Fixes a trust check bypass in `trl-ci-bot.yml` that allowed any GitHub user to trigger TRL CI on self-hosted GPU runners by commenting `/trl-ci` on any PR. ### The bug The \"Ignore untrusted commenter\" step used `exit 0`, wh\u2026", - "changed_files": 1, + "body_excerpt": "Updates `TrackioCallback` and `TrainingArguments` for the latest version of Trackio using HF Buckets as the backend, and control over creating a static Space for the Trackio dashboard during or at the end of training. These are now the `Tr\u2026", + "changed_files": 2, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/45302", - "created_at": "2026-04-07T21:35:38Z", - "deletions": 3, + "comments_count": 6, + "conversation_url": "https://github.com/huggingface/transformers/pull/45329", + "created_at": "2026-04-08T22:36:08Z", + "deletions": 57, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/45302/files", - "html_url": "https://github.com/huggingface/transformers/pull/45302", + "files_url": "https://github.com/huggingface/transformers/pull/45329/files", + "html_url": "https://github.com/huggingface/transformers/pull/45329", "labels": [], "merged": true, - "number": 45302, - "review_comments_count": 0, + "number": 45329, + "review_comments_count": 21, "state": "closed", - "title": "fix(security): prevent untrusted users from triggering TRL CI dispatch", - "updated_at": "2026-04-07T21:59:38Z" + "title": "Update `trackio` integration to use Buckets and \"freeze\" Space after training", + "updated_at": "2026-04-13T14:30:27Z" }, { - "additions": 0, - "author": "sahildando", - "author_association": "NONE", - "body_excerpt": "# What does this PR do? save locally --> local locally) ```\u2026", - "changed_files": 2, + "additions": 4, + "author": "BillionClaw", + "author_association": "FIRST_TIME_CONTRIBUTOR", + "body_excerpt": "The question-answering pipeline was removed in v5.0.0 per MIGRATION_GUIDE_V5.md, but the non-English task guides still referenced it. This updates the Arabic, Chinese, Japanese, and Korean question answering task guides to remove usage of\u2026", + "changed_files": 4, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 3, - "conversation_url": "https://github.com/huggingface/transformers/pull/44730", - "created_at": "2026-03-15T20:44:32Z", - "deletions": 4, + "comments_count": 1, + "conversation_url": "https://github.com/huggingface/transformers/pull/44787", + "created_at": "2026-03-17T08:24:09Z", + "deletions": 66, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44730/files", - "html_url": "https://github.com/huggingface/transformers/pull/44730", + "files_url": "https://github.com/huggingface/transformers/pull/44787/files", + "html_url": "https://github.com/huggingface/transformers/pull/44787", "labels": [], "merged": true, - "number": 44730, - "review_comments_count": 6, + "number": 44787, + "review_comments_count": 0, "state": "closed", - "title": "Fix `mlcd` auto config/model/mapping issues", - "updated_at": "2026-03-16T12:12:30Z" + "title": "docs(tasks): remove references to removed question-answering pipeline", + "updated_at": "2026-03-17T16:23:50Z" }, { - "additions": 214, - "author": "xenova", - "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? This PR introduces a helper utility function, `int_div_ceil`, which performs `math.ceil(a / b)` for non-negative integer operands. This is necessary as the current approach is both error-prone and imprecise (especia\u2026", - "changed_files": 58, + "additions": 25, + "author": "BillionClaw", + "author_association": "FIRST_TIME_CONTRIBUTOR", + "body_excerpt": "AMD Strix Halo APUs (gfx1151) experience OOM errors when loading large models via safetensors mmap due to unified memory architecture issues. This fix detects Strix Halo GPUs by checking the GPU architecture name (gfx1151) and forces a CPU\u2026", + "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 3, - "conversation_url": "https://github.com/huggingface/transformers/pull/44729", - "created_at": "2026-03-15T20:29:38Z", - "deletions": 225, + "comments_count": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/44786", + "created_at": "2026-03-17T08:17:32Z", + "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44729/files", - "html_url": "https://github.com/huggingface/transformers/pull/44729", + "files_url": "https://github.com/huggingface/transformers/pull/44786/files", + "html_url": "https://github.com/huggingface/transformers/pull/44786", "labels": [], "merged": false, - "number": 44729, + "number": 44786, "review_comments_count": 0, - "state": "open", - "title": "Avoid floating point math for ceil operations", - "updated_at": "2026-03-15T20:49:34Z" + "state": "closed", + "title": "fix(core_model_loading): disable mmap on Strix Halo to avoid OOM", + "updated_at": "2026-03-17T10:29:44Z" }, { - "additions": 88, - "author": "ajmeese7", - "author_association": "NONE", - "body_excerpt": "# What does this PR do? Fixes a GPU memory leak in `Bnb4bitQuantize.convert()` where float16 source tensors are never freed during 4-bit quantized model loading via `from_pretrained`, causing OOM on models whose float16 size exceeds GPU VR\u2026", - "changed_files": 2, + "additions": 307, + "author": "BillionClaw", + "author_association": "FIRST_TIME_CONTRIBUTOR", + "body_excerpt": "AMD Strix Halo APUs (e.g., Radeon 8060S) have issues with mmap-based tensor loading from safetensors, causing out-of-memory errors even when sufficient memory is available. This fix: - Adds `is_strix_halo()` helper to detect Strix Halo GPU\u2026", + "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 4, - "conversation_url": "https://github.com/huggingface/transformers/pull/44728", - "created_at": "2026-03-15T19:56:44Z", - "deletions": 1, + "comments_count": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/44785", + "created_at": "2026-03-17T06:55:31Z", + "deletions": 83, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44728/files", - "html_url": "https://github.com/huggingface/transformers/pull/44728", + "files_url": "https://github.com/huggingface/transformers/pull/44785/files", + "html_url": "https://github.com/huggingface/transformers/pull/44785", "labels": [], "merged": false, - "number": 44728, + "number": 44785, "review_comments_count": 0, "state": "closed", - "title": "Fix float16 memory leak during 4-bit quantized model loading", - "updated_at": "2026-03-16T20:53:54Z" + "title": "fix(model_loading): Disable mmap on Strix Halo to avoid OOM", + "updated_at": "2026-03-17T10:28:06Z" }, { - "additions": 202, - "author": "LincolnBurrows2017", + "additions": 2, + "author": "BillionClaw", "author_association": "FIRST_TIME_CONTRIBUTOR", - "body_excerpt": "Fixed issue where kwargs like force_download, proxies, token were not being passed to cached_file function.", - "changed_files": 11, + "body_excerpt": "This PR fixes the DeepSeek tokenizer issue where spaces were lost during decoding in Transformers v5. ## Problem DeepSeek V2 and V3 models use SentencePiece tokenization (like Llama) but were falling back to the generic TokenizersBackend i\u2026", + "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/44727", - "created_at": "2026-03-15T19:41:24Z", - "deletions": 33, + "comments_count": 3, + "conversation_url": "https://github.com/huggingface/transformers/pull/44783", + "created_at": "2026-03-17T05:58:54Z", + "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44727/files", - "html_url": "https://github.com/huggingface/transformers/pull/44727", - "labels": [ - "Code agent slop" - ], + "files_url": "https://github.com/huggingface/transformers/pull/44783/files", + "html_url": "https://github.com/huggingface/transformers/pull/44783", + "labels": [], "merged": false, - "number": 44727, + "number": 44783, "review_comments_count": 0, "state": "closed", - "title": "fix: AutoProcessor.from_pretrained not passing kwargs to cached_file", - "updated_at": "2026-03-18T13:15:46Z" + "title": "fix(auto): Map deepseek_v2 and deepseek_v3 to LlamaTokenizer", + "updated_at": "2026-04-16T10:47:47Z" }, { - "additions": 198, - "author": "LincolnBurrows2017", - "author_association": "FIRST_TIME_CONTRIBUTOR", - "body_excerpt": "Replaced bare except clause with except Exception in _safe_convert_tensor function to follow Python best practices (PEP 8).", - "changed_files": 10, + "additions": 6, + "author": "JiwaniZakir", + "author_association": "CONTRIBUTOR", + "body_excerpt": "Fixes #44737 `XLNetModel.relative_positional_encoding` was creating all `torch.arange` tensors on CPU by default, then calling `.to(output_h.device)` at the call site to move them. Adds a `device` parameter to `relative_positional_encoding\u2026", + "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/44725", - "created_at": "2026-03-15T17:41:18Z", - "deletions": 29, + "comments_count": 6, + "conversation_url": "https://github.com/huggingface/transformers/pull/44782", + "created_at": "2026-03-17T05:11:36Z", + "deletions": 7, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44725/files", - "html_url": "https://github.com/huggingface/transformers/pull/44725", - "labels": [ - "Code agent slop" - ], - "merged": false, - "number": 44725, + "files_url": "https://github.com/huggingface/transformers/pull/44782/files", + "html_url": "https://github.com/huggingface/transformers/pull/44782", + "labels": [], + "merged": true, + "number": 44782, "review_comments_count": 0, "state": "closed", - "title": "fix: replace bare except with Exception in Fuyu image processing", - "updated_at": "2026-03-18T13:16:22Z" + "title": "fix: XLNet: relative_positional_encoding computes on CPU every forward", + "updated_at": "2026-03-19T13:30:48Z" }, { - "additions": 6, - "author": "ydshieh", - "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? TO be explained.", - "changed_files": 5, + "additions": 5, + "author": "bensons", + "author_association": "FIRST_TIME_CONTRIBUTOR", + "body_excerpt": "# What does this PR do? Some model repos provide `extra_special_tokens` as a list in their tokenizer_config.json, which caused an `AttributeError: 'list' object has no attribute 'keys'`. This converts list inputs to a dict mapping each tok\u2026", + "changed_files": 2, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/44724", - "created_at": "2026-03-15T17:14:12Z", - "deletions": 5, - "draft": true, - "files_url": "https://github.com/huggingface/transformers/pull/44724/files", - "html_url": "https://github.com/huggingface/transformers/pull/44724", + "comments_count": 3, + "conversation_url": "https://github.com/huggingface/transformers/pull/44781", + "created_at": "2026-03-17T04:59:02Z", + "deletions": 2849, + "draft": false, + "files_url": "https://github.com/huggingface/transformers/pull/44781/files", + "html_url": "https://github.com/huggingface/transformers/pull/44781", "labels": [], "merged": false, - "number": 44724, - "review_comments_count": 1, + "number": 44781, + "review_comments_count": 0, "state": "open", - "title": "Fix some missing / incorrect entries in auto files", - "updated_at": "2026-03-16T09:59:56Z" + "title": "Fix `_set_model_specific_special_tokens` to accept list-format `extra_special_tokens`", + "updated_at": "2026-03-27T23:19:21Z" }, { - "additions": 12, - "author": "aashirpersonal", - "author_association": "NONE", - "body_excerpt": "## Summary This PR fixes #44716 by exposing and forwarding `interpolate_pos_encoding` through the Pixio embedding/model call chain so the option is actually usable from `PixioModel.forward()`. ### Changes - Added `interpolate_pos_encoding:\u2026", - "changed_files": 2, + "additions": 145, + "author": "LincolnBurrows2017", + "author_association": "FIRST_TIME_CONTRIBUTOR", + "body_excerpt": "Fixed logic error in is_tiktoken_available function. The original code `return with_blobfile and _is_package_available(\"blobfile\")[0] or True` would always return True due to operator precedence.", + "changed_files": 8, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/44723", - "created_at": "2026-03-15T16:52:03Z", - "deletions": 6, + "comments_count": 0, + "conversation_url": "https://github.com/huggingface/transformers/pull/44778", + "created_at": "2026-03-16T23:41:29Z", + "deletions": 28, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44723/files", - "html_url": "https://github.com/huggingface/transformers/pull/44723", + "files_url": "https://github.com/huggingface/transformers/pull/44778/files", + "html_url": "https://github.com/huggingface/transformers/pull/44778", "labels": [ "Code agent slop" ], "merged": false, - "number": 44723, + "number": 44778, "review_comments_count": 0, "state": "closed", - "title": "Fix: propagate interpolate_pos_encoding through PixioEmbeddings and PixioModel", - "updated_at": "2026-03-18T15:05:52Z" + "title": "fix: correct logic error in is_tiktoken_available function", + "updated_at": "2026-03-18T13:15:37Z" }, { - "additions": 38, - "author": "chandan11248", - "author_association": "FIRST_TIME_CONTRIBUTOR", - "body_excerpt": "## What does this PR do? Migrates the GPT-J model to use the new `@capture_outputs` and `@can_return_tuple` decorators for standardized output collection, as described in #43979. ### Changes - Added `_can_record_outputs` to `GPTJPreTrained\u2026", - "changed_files": 2, - "cluster_id": "cluster-43979-11", - "cluster_ids": [ - "cluster-43979-11" - ], - "cluster_role": "member", - "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/44722", - "created_at": "2026-03-15T15:33:25Z", - "deletions": 110, - "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44722/files", - "html_url": "https://github.com/huggingface/transformers/pull/44722", - "labels": [], - "merged": false, - "number": 44722, - "review_comments_count": 0, - "state": "open", - "title": "Refactor gptj output tracing to use standardized decorators", - "updated_at": "2026-03-19T18:12:59Z" - }, - { - "additions": 4, - "author": "rsmed31", - "author_association": "NONE", - "body_excerpt": "## Summary Fixes #44716 `PixioPatchEmbeddings.forward` already accepted `interpolate_pos_encoding` but it was silently dropped \u2014 never passed from `PixioEmbeddings.forward` or `PixioModel.forward`, making the parameter effectively unusable\u2026", + "additions": 35, + "author": "stevhliu", + "author_association": "MEMBER", + "body_excerpt": "adds docs for #43705 (enable bidirectional attention for decoder-only models)", "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/44718", - "created_at": "2026-03-14T23:57:14Z", - "deletions": 3, + "conversation_url": "https://github.com/huggingface/transformers/pull/44777", + "created_at": "2026-03-16T21:58:40Z", + "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44718/files", - "html_url": "https://github.com/huggingface/transformers/pull/44718", + "files_url": "https://github.com/huggingface/transformers/pull/44777/files", + "html_url": "https://github.com/huggingface/transformers/pull/44777", "labels": [], - "merged": false, - "number": 44718, - "review_comments_count": 0, + "merged": true, + "number": 44777, + "review_comments_count": 1, "state": "closed", - "title": "Fix: propagate interpolate_pos_encoding through PixioEmbeddings and PixioModel", - "updated_at": "2026-03-15T17:58:58Z" + "title": "[docs] is_causal feature", + "updated_at": "2026-03-17T19:50:43Z" }, { - "additions": 15, - "author": "ydshieh", + "additions": 0, + "author": "stevhliu", "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? As discussed internally, some component model classes didn't specify the correct config classes. This PR fixes them (those I could found - because the tiny model creation script fails due to those mistakes).", - "changed_files": 7, + "body_excerpt": "the doc-builder is breaking because it can't find `Mistral4ForQuestionAnswering`, which looks like it doesn't exist", + "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 4, - "conversation_url": "https://github.com/huggingface/transformers/pull/44715", - "created_at": "2026-03-14T21:11:52Z", - "deletions": 2, + "comments_count": 1, + "conversation_url": "https://github.com/huggingface/transformers/pull/44776", + "created_at": "2026-03-16T20:43:33Z", + "deletions": 4, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44715/files", - "html_url": "https://github.com/huggingface/transformers/pull/44715", + "files_url": "https://github.com/huggingface/transformers/pull/44776/files", + "html_url": "https://github.com/huggingface/transformers/pull/44776", "labels": [], "merged": true, - "number": 44715, + "number": 44776, "review_comments_count": 0, "state": "closed", - "title": "Fix missing / incorrect `config` class in some model class definitions", - "updated_at": "2026-03-15T11:19:51Z" + "title": "[fix] mistral 4 docs", + "updated_at": "2026-03-16T21:11:29Z" }, { - "additions": 181, - "author": "LincolnBurrows2017", - "author_association": "FIRST_TIME_CONTRIBUTOR", - "body_excerpt": "## Summary Fixes issue #44625: Qwen3.5 num_labels not propagating from core config to text_config. When calling `AutoConfig.from_pretrained(\"Qwen3.5\", num_labels=1)`, the main config gets `num_labels=1` but `text_config` still has default\u2026", - "changed_files": 8, + "additions": 177, + "author": "stevhliu", + "author_association": "MEMBER", + "body_excerpt": "refactors the current [Parallelism methods](https://huggingface.co/docs/transformers/main/en/perf_train_gpu_many#zero-data-parallelism-pipeline-parallelism-and-model-parallelism-3d-parallelism) doc to: - focus on practical examples of comb\u2026", + "changed_files": 2, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/44714", - "created_at": "2026-03-14T20:42:46Z", - "deletions": 26, - "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44714/files", - "html_url": "https://github.com/huggingface/transformers/pull/44714", + "comments_count": 0, + "conversation_url": "https://github.com/huggingface/transformers/pull/44775", + "created_at": "2026-03-16T20:23:29Z", + "deletions": 109, + "draft": true, + "files_url": "https://github.com/huggingface/transformers/pull/44775/files", + "html_url": "https://github.com/huggingface/transformers/pull/44775", "labels": [], "merged": false, - "number": 44714, + "number": 44775, "review_comments_count": 0, - "state": "closed", - "title": "fix: propagate num_labels to text_config for Qwen models", - "updated_at": "2026-03-18T12:56:27Z" + "state": "open", + "title": "[docs] n-d parallelism", + "updated_at": "2026-03-16T20:28:48Z" }, { - "additions": 15, - "author": "kulkarni-rohan", - "author_association": "FIRST_TIME_CONTRIBUTOR", - "body_excerpt": "Applies the output tracing refactor to ColQwen2ForRetrieval as part of the broader effort tracked in issue #43979 to modernize output handling across all models in the library. Changes in both modular_colqwen2.py and modeling_colqwen2.py:\u2026", + "additions": 0, + "author": "ydshieh", + "author_association": "MEMBER", + "body_excerpt": "# What does this PR do? Remove `is_causal` from `EuroBertConfig`", "changed_files": 2, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/44713", - "created_at": "2026-03-14T20:20:14Z", - "deletions": 28, + "comments_count": 5, + "conversation_url": "https://github.com/huggingface/transformers/pull/44774", + "created_at": "2026-03-16T18:56:19Z", + "deletions": 6, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44713/files", - "html_url": "https://github.com/huggingface/transformers/pull/44713", + "files_url": "https://github.com/huggingface/transformers/pull/44774/files", + "html_url": "https://github.com/huggingface/transformers/pull/44774", + "labels": [], + "merged": true, + "number": 44774, + "review_comments_count": 0, + "state": "closed", + "title": "Remove `is_causal` from `EuroBertConfig`", + "updated_at": "2026-03-17T09:33:21Z" + }, + { + "additions": 3, + "author": "githubnemo", + "author_association": "MEMBER", + "body_excerpt": "The links to the quantization offloading were outdated and 4-bit quantization also supports offloading which should be mentioned. cc @SunMarc", + "changed_files": 3, + "cluster_id": null, + "cluster_ids": [], + "cluster_role": null, + "comments_count": 4, + "conversation_url": "https://github.com/huggingface/transformers/pull/44772", + "created_at": "2026-03-16T18:46:13Z", + "deletions": 3, + "draft": false, + "files_url": "https://github.com/huggingface/transformers/pull/44772/files", + "html_url": "https://github.com/huggingface/transformers/pull/44772", "labels": [], "merged": false, - "number": 44713, + "number": 44772, "review_comments_count": 0, "state": "open", - "title": "[ColQwen2] Refactor output tracing (issue #43979)", - "updated_at": "2026-03-14T20:21:24Z" + "title": "bitsandbytes: Update links and docs", + "updated_at": "2026-03-17T15:57:56Z" }, { "additions": 2, "author": "ydshieh", "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? torch 2.11 is going to be released soon, but we still use 2.9. Let's update it to 2.10 so at least a run with torch 2.10, before we update to torch 2.11 later.", + "body_excerpt": "# What does this PR do? wtf", "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/44712", - "created_at": "2026-03-14T20:18:01Z", - "deletions": 2, + "comments_count": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/44771", + "created_at": "2026-03-16T18:45:11Z", + "deletions": 1, + "draft": true, + "files_url": "https://github.com/huggingface/transformers/pull/44771/files", + "html_url": "https://github.com/huggingface/transformers/pull/44771", + "labels": [], + "merged": false, + "number": 44771, + "review_comments_count": 0, + "state": "open", + "title": "wtf", + "updated_at": "2026-03-16T18:56:00Z" + }, + { + "additions": 203, + "author": "zucchini-nlp", + "author_association": "MEMBER", + "body_excerpt": "# What does this PR do? Fix tests failing because of `strict` type validation and decorate two missing configs, Nemotron and VibeVoice", + "changed_files": 12, + "cluster_id": null, + "cluster_ids": [], + "cluster_role": null, + "comments_count": 6, + "conversation_url": "https://github.com/huggingface/transformers/pull/44770", + "created_at": "2026-03-16T18:44:03Z", + "deletions": 268, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44712/files", - "html_url": "https://github.com/huggingface/transformers/pull/44712", + "files_url": "https://github.com/huggingface/transformers/pull/44770/files", + "html_url": "https://github.com/huggingface/transformers/pull/44770", "labels": [], "merged": true, - "number": 44712, - "review_comments_count": 0, + "number": 44770, + "review_comments_count": 1, "state": "closed", - "title": "Update Nvidia CI docker file to use torch 2.10", - "updated_at": "2026-03-14T20:29:04Z" + "title": "Fix configs with `@strict`", + "updated_at": "2026-03-17T15:39:43Z" }, { - "additions": 339, - "author": "anuq", - "author_association": "NONE", - "body_excerpt": "## What does this PR do? Fixes #35141. When `tie_word_embeddings=False`, calling `resize_token_embeddings()` creates a new `nn.Linear` for the LM head via `_get_resized_lm_head()`. The new module's weight and bias tensors do **not** carry\u2026", - "changed_files": 4, + "additions": 145, + "author": "LincolnBurrows2017", + "author_association": "FIRST_TIME_CONTRIBUTOR", + "body_excerpt": "## Summary The `is_batched_video()` and `convert_pil_frames_to_video()` functions in `src/transformers/video_utils.py` were accessing `videos[0]` without first checking if the list is empty, causing `IndexError` when empty lists are passed\u2026", + "changed_files": 8, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 3, - "conversation_url": "https://github.com/huggingface/transformers/pull/44711", - "created_at": "2026-03-14T19:21:21Z", - "deletions": 205, + "comments_count": 0, + "conversation_url": "https://github.com/huggingface/transformers/pull/44769", + "created_at": "2026-03-16T18:40:07Z", + "deletions": 28, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44711/files", - "html_url": "https://github.com/huggingface/transformers/pull/44711", + "files_url": "https://github.com/huggingface/transformers/pull/44769/files", + "html_url": "https://github.com/huggingface/transformers/pull/44769", "labels": [ "Code agent slop" ], "merged": false, - "number": 44711, + "number": 44769, "review_comments_count": 0, "state": "closed", - "title": "fix: mark new lm_head params as `_is_hf_initialized` after `resize_token_embeddings`", - "updated_at": "2026-03-20T13:36:58Z" + "title": "Fix: Handle empty lists in video_utils functions", + "updated_at": "2026-03-18T13:15:55Z" }, { - "additions": 12, - "author": "he-yufeng", - "author_association": "CONTRIBUTOR", - "body_excerpt": "## What does this PR do? Fixes `AutoProcessor.from_pretrained` silently dropping hub kwargs like `force_download`, `cache_dir`, `token`, `revision`, etc. ### The bug The existing code on line ~300 filters kwargs using `inspect.signature(ca\u2026", + "additions": 20, + "author": "michaelbenayoun", + "author_association": "MEMBER", + "body_excerpt": "The function `add_tensor_parallel_hooks_to_module` has unused parameters, in this PR we: - Remove `tp_plan`, which is not used. - Remove `parameter_name` which is not used - Remove `layer_name`. This parameter is only used for logging purp\u2026", "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 4, - "conversation_url": "https://github.com/huggingface/transformers/pull/44710", - "created_at": "2026-03-14T18:33:53Z", - "deletions": 2, + "comments_count": 1, + "conversation_url": "https://github.com/huggingface/transformers/pull/44768", + "created_at": "2026-03-16T18:29:52Z", + "deletions": 9, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44710/files", - "html_url": "https://github.com/huggingface/transformers/pull/44710", + "files_url": "https://github.com/huggingface/transformers/pull/44768/files", + "html_url": "https://github.com/huggingface/transformers/pull/44768", "labels": [], "merged": true, - "number": 44710, - "review_comments_count": 0, + "number": 44768, + "review_comments_count": 3, "state": "closed", - "title": "Fix AutoProcessor.from_pretrained silently dropping hub kwargs", - "updated_at": "2026-03-25T18:13:14Z" + "title": "Remove unused parameters and improve add_tensor_parallel_hooks_t\u2026", + "updated_at": "2026-04-09T17:11:55Z" }, { - "additions": 6778, - "author": "LucasMa2025", - "author_association": "FIRST_TIMER", - "body_excerpt": "# \ud83c\udf9b\ufe0f Add Configurable Generation Scheduler and State Machine for `generate()` ## Summary This PR introduces a **fully optional, zero-intrusion** Generation Scheduler (`GenerationScheduler`) and explicit state machine (`GenerationStateMachi\u2026", - "changed_files": 15, + "additions": 11, + "author": "tarekziade", + "author_association": "MEMBER", + "body_excerpt": "# What does this PR do? EuroBertConfig was missing `@strict(accept_kwargs=True)` unlike its parent LlamaConfig, causing failures when reloading saved configs that include extra keys like `architectures`. Also fixed the test helper passing\u2026", + "changed_files": 3, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 0, - "conversation_url": "https://github.com/huggingface/transformers/pull/44708", - "created_at": "2026-03-14T17:13:34Z", - "deletions": 7, - "draft": true, - "files_url": "https://github.com/huggingface/transformers/pull/44708/files", - "html_url": "https://github.com/huggingface/transformers/pull/44708", + "comments_count": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/44767", + "created_at": "2026-03-16T17:31:26Z", + "deletions": 5, + "draft": false, + "files_url": "https://github.com/huggingface/transformers/pull/44767/files", + "html_url": "https://github.com/huggingface/transformers/pull/44767", "labels": [], - "merged": false, - "number": 44708, - "review_comments_count": 0, + "merged": true, + "number": 44767, + "review_comments_count": 6, "state": "closed", - "title": "Add Configurable Generation Scheduler and State Machine for `generate()`", - "updated_at": "2026-03-14T19:19:11Z" + "title": "Fix: Eurobert model was missing @strict decorator and invalid test kwargs", + "updated_at": "2026-03-16T19:02:31Z" }, { - "additions": 3, - "author": "saivedant169", - "author_association": "NONE", - "body_excerpt": "Fixes part of #32937 ## What does this PR do? Adds `position_ids` as an explicit parameter to `MptForCausalLM.forward()` and `MptModel.forward()`, bringing MPT in line with other CausalLM models. Same rationale as the Bloom PR (#44706) \u2014 M\u2026", - "changed_files": 1, + "additions": 26, + "author": "itazap", + "author_association": "MEMBER", + "body_excerpt": "for when remote code tries to import from `tokenization_xxx_fast`", + "changed_files": 2, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/44707", - "created_at": "2026-03-14T17:12:16Z", - "deletions": 0, + "comments_count": 6, + "conversation_url": "https://github.com/huggingface/transformers/pull/44766", + "created_at": "2026-03-16T17:30:23Z", + "deletions": 1, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44707/files", - "html_url": "https://github.com/huggingface/transformers/pull/44707", - "labels": [ - "Code agent slop" - ], - "merged": false, - "number": 44707, + "files_url": "https://github.com/huggingface/transformers/pull/44766/files", + "html_url": "https://github.com/huggingface/transformers/pull/44766", + "labels": [], + "merged": true, + "number": 44766, "review_comments_count": 0, "state": "closed", - "title": "Add position_ids to MptForCausalLM forward pass", - "updated_at": "2026-03-18T13:39:36Z" + "title": "support xxxFast alias in v5 tokenizers", + "updated_at": "2026-03-18T13:40:05Z" }, { - "additions": 3, - "author": "saivedant169", - "author_association": "NONE", - "body_excerpt": "Fixes part of #32937 ## What does this PR do? Adds `position_ids` as an explicit parameter to `BloomForCausalLM.forward()` and `BloomModel.forward()`, bringing Bloom in line with other CausalLM models like Llama, Falcon, Gemma, and Mistral\u2026", - "changed_files": 1, + "additions": 19, + "author": "harshaljanjani", + "author_association": "CONTRIBUTOR", + "body_excerpt": "### What does this PR do? The following failing tests were identified and fixed in this PR: \u2192 **PaliGemma 2:** The [PaliGemma 1 test class](https://github.com/huggingface/transformers/blob/main/tests/models/paligemma/test_modeling_paligemm\u2026", + "changed_files": 2, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/44706", - "created_at": "2026-03-14T17:09:11Z", + "comments_count": 5, + "conversation_url": "https://github.com/huggingface/transformers/pull/44765", + "created_at": "2026-03-16T17:26:22Z", "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44706/files", - "html_url": "https://github.com/huggingface/transformers/pull/44706", - "labels": [ - "Code agent slop" - ], - "merged": false, - "number": 44706, + "files_url": "https://github.com/huggingface/transformers/pull/44765/files", + "html_url": "https://github.com/huggingface/transformers/pull/44765", + "labels": [], + "merged": true, + "number": 44765, "review_comments_count": 0, "state": "closed", - "title": "Add position_ids to BloomForCausalLM forward pass", - "updated_at": "2026-03-18T13:39:51Z" + "title": "fix(testing): Fix PaliGemma 2 and PaddleOCR-VL test failures on main", + "updated_at": "2026-04-18T08:24:53Z" }, { - "additions": 14, - "author": "saivedant169", - "author_association": "NONE", - "body_excerpt": "Fixes part of #32937 ## What does this PR do? RoFormer introduced rotary position embeddings, but its `ForCausalLM` forward method doesn't accept `position_ids` \u2014 which means callers can't specify custom positions for packed sequences or f\u2026", - "changed_files": 1, + "additions": 12, + "author": "tarekziade", + "author_association": "MEMBER", + "body_excerpt": "# What does this PR do? Fixes the siglip import. that was also crashing the test fetcher", + "changed_files": 3, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/44705", - "created_at": "2026-03-14T16:48:06Z", - "deletions": 1, + "comments_count": 6, + "conversation_url": "https://github.com/huggingface/transformers/pull/44764", + "created_at": "2026-03-16T17:15:40Z", + "deletions": 4, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44705/files", - "html_url": "https://github.com/huggingface/transformers/pull/44705", - "labels": [ - "Code agent slop" - ], - "merged": false, - "number": 44705, - "review_comments_count": 0, + "files_url": "https://github.com/huggingface/transformers/pull/44764/files", + "html_url": "https://github.com/huggingface/transformers/pull/44764", + "labels": [], + "merged": true, + "number": 44764, + "review_comments_count": 2, "state": "closed", - "title": "Add position_ids to RoFormerForCausalLM forward pass", - "updated_at": "2026-03-18T13:40:05Z" + "title": "fix: sig lip import", + "updated_at": "2026-03-16T17:38:41Z" }, { - "additions": 26, - "author": "vasqu", + "additions": 17, + "author": "xenova", "author_association": "MEMBER", - "body_excerpt": "As per title, it seems that the `cute` subfolder can be even distributed if you only install FA2 which implies something wrong. Now we check under the (normalized) distribution names", - "changed_files": 2, + "body_excerpt": "# What does this PR do? Adds support for MLP mixers, used by [nvidia/NVIDIA-Nemotron-3-Nano-4B-BF16](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-4B-BF16). Previously, it would crash because it would not recognize the `-` char in t\u2026", + "changed_files": 3, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/44703", - "created_at": "2026-03-14T14:46:02Z", - "deletions": 10, + "comments_count": 7, + "conversation_url": "https://github.com/huggingface/transformers/pull/44763", + "created_at": "2026-03-16T17:04:36Z", + "deletions": 5, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44703/files", - "html_url": "https://github.com/huggingface/transformers/pull/44703", + "files_url": "https://github.com/huggingface/transformers/pull/44763/files", + "html_url": "https://github.com/huggingface/transformers/pull/44763", "labels": [], - "merged": true, - "number": 44703, + "merged": false, + "number": 44763, "review_comments_count": 1, "state": "closed", - "title": "[`FA`] Fix fa detection", - "updated_at": "2026-03-14T17:19:07Z" + "title": "[nemotron_h] Add support for MLP mixers", + "updated_at": "2026-04-14T13:46:14Z" }, { - "additions": 148, - "author": "LincolnBurrows2017", + "additions": 4, + "author": "BillionClaw", "author_association": "FIRST_TIME_CONTRIBUTOR", - "body_excerpt": "## What does this PR fix? The `rms_norm_eps` parameter in `MistralConfig` was incorrectly typed as `int | None` but defaults to `1e-6` which is a float. This parameter is passed to `MistralRMSNorm` which expects `eps: float`. ### Bug Detai\u2026", - "changed_files": 8, + "body_excerpt": "XLNet.relative_positional_encoding creates intermediate tensors on CPU every forward pass because torch.arange was missing the device parameter. This causes unnecessary CPU-GPU transfers when running on CUDA. Added device=self.device to al\u2026", + "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/44702", - "created_at": "2026-03-14T14:41:15Z", - "deletions": 25, + "comments_count": 1, + "conversation_url": "https://github.com/huggingface/transformers/pull/44762", + "created_at": "2026-03-16T16:17:54Z", + "deletions": 4, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44702/files", - "html_url": "https://github.com/huggingface/transformers/pull/44702", + "files_url": "https://github.com/huggingface/transformers/pull/44762/files", + "html_url": "https://github.com/huggingface/transformers/pull/44762", "labels": [ "Code agent slop" ], "merged": false, - "number": 44702, + "number": 44762, "review_comments_count": 0, "state": "closed", - "title": "fix: Correct rms_norm_eps type hint from int to float in MistralConfig", - "updated_at": "2026-03-18T13:00:12Z" + "title": "fix: Cache XLNet relative_positional_encoding to avoid CPU computation", + "updated_at": "2026-03-18T15:16:14Z" }, { - "additions": 219, - "author": "hmellor", + "additions": 152, + "author": "tarekziade", "author_association": "MEMBER", - "body_excerpt": "These models have `base_model_pp_plan`s but currently do not work because the base model's forward pass depends on all the `layers` being `Qwen2VLDecoderLayer`. i.e. if one of the layers is removed/replaced with `Identity`, `decoder_layer.\u2026", - "changed_files": 52, + "body_excerpt": "# What does this PR do? This adds rule 10: ``` Direct config definitions must use @strict(accept_kwargs=True). ```", + "changed_files": 3, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/44699", - "created_at": "2026-03-14T11:44:24Z", - "deletions": 148, + "comments_count": 1, + "conversation_url": "https://github.com/huggingface/transformers/pull/44761", + "created_at": "2026-03-16T16:05:03Z", + "deletions": 7, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44699/files", - "html_url": "https://github.com/huggingface/transformers/pull/44699", + "files_url": "https://github.com/huggingface/transformers/pull/44761/files", + "html_url": "https://github.com/huggingface/transformers/pull/44761", "labels": [], "merged": true, - "number": 44699, - "review_comments_count": 0, + "number": 44761, + "review_comments_count": 7, "state": "closed", - "title": "Fix several based models' pipeline parallel support", - "updated_at": "2026-03-20T13:53:27Z" + "title": "model-linter: Added rule 10", + "updated_at": "2026-03-17T08:52:19Z" }, { - "additions": 1, - "author": "hmellor", - "author_association": "MEMBER", - "body_excerpt": "The typo in the `elif` chain meant that `image` and `video` modalidty encoders could not be set using this method. This PR fixes the typo so that they can.", - "changed_files": 1, + "additions": 2090, + "author": "juliendenize", + "author_association": "CONTRIBUTOR", + "body_excerpt": "# What does this PR do? save locally --> local locally) ```\u2026", "changed_files": 2, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/44682", - "created_at": "2026-03-13T18:52:41Z", - "deletions": 73, + "comments_count": 3, + "conversation_url": "https://github.com/huggingface/transformers/pull/44730", + "created_at": "2026-03-15T20:44:32Z", + "deletions": 4, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44682/files", - "html_url": "https://github.com/huggingface/transformers/pull/44682", + "files_url": "https://github.com/huggingface/transformers/pull/44730/files", + "html_url": "https://github.com/huggingface/transformers/pull/44730", "labels": [], - "merged": false, - "number": 44682, - "review_comments_count": 0, - "state": "open", - "title": "transformers serve + llamacpp", - "updated_at": "2026-03-14T07:05:29Z" + "merged": true, + "number": 44730, + "review_comments_count": 6, + "state": "closed", + "title": "Fix `mlcd` auto config/model/mapping issues", + "updated_at": "2026-03-16T12:12:30Z" }, { - "additions": 47, - "author": "dacorvo", + "additions": 214, + "author": "xenova", "author_association": "MEMBER", - "body_excerpt": "Fixes #44679 ## Summary - Custom attention kernels registered via `load_and_register_attn_kernel` currently get hardcoded `flash_attention_2` mask dispatch, which produces 2D or `None` masks - Kernels that need SDPA-style 4D boolean masks\u2026", - "changed_files": 2, + "body_excerpt": "# What does this PR do? This PR introduces a helper utility function, `int_div_ceil`, which performs `math.ceil(a / b)` for non-negative integer operands. This is necessary as the current approach is both error-prone and imprecise (especia\u2026", + "changed_files": 58, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 3, - "conversation_url": "https://github.com/huggingface/transformers/pull/44680", - "created_at": "2026-03-13T17:55:54Z", - "deletions": 1, + "conversation_url": "https://github.com/huggingface/transformers/pull/44729", + "created_at": "2026-03-15T20:29:38Z", + "deletions": 225, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44680/files", - "html_url": "https://github.com/huggingface/transformers/pull/44680", + "files_url": "https://github.com/huggingface/transformers/pull/44729/files", + "html_url": "https://github.com/huggingface/transformers/pull/44729", "labels": [], "merged": false, - "number": 44680, - "review_comments_count": 12, + "number": 44729, + "review_comments_count": 0, "state": "open", - "title": "Allow kernel modules to declare their preferred mask function", - "updated_at": "2026-04-14T19:29:06Z" + "title": "Avoid floating point math for ceil operations", + "updated_at": "2026-03-15T20:49:34Z" }, { - "additions": 9, - "author": "JokeYoonic", - "author_association": "FIRST_TIME_CONTRIBUTOR", - "body_excerpt": "Problem: - On macOS ARM64 + Python 3.13 + transformers 5.x, GPT-2 model's lm_head forward pass produces NaN/Inf values during inference - Root cause: lm_head.weight is tied to transformer.wte.weight, and the shared memory reference causes\u2026", - "changed_files": 1, + "additions": 88, + "author": "ajmeese7", + "author_association": "NONE", + "body_excerpt": "# What does this PR do? Fixes a GPU memory leak in `Bnb4bitQuantize.convert()` where float16 source tensors are never freed during 4-bit quantized model loading via `from_pretrained`, causing OOM on models whose float16 size exceeds GPU VR\u2026", + "changed_files": 2, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/44676", - "created_at": "2026-03-13T16:28:01Z", - "deletions": 2, + "comments_count": 4, + "conversation_url": "https://github.com/huggingface/transformers/pull/44728", + "created_at": "2026-03-15T19:56:44Z", + "deletions": 1, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44676/files", - "html_url": "https://github.com/huggingface/transformers/pull/44676", + "files_url": "https://github.com/huggingface/transformers/pull/44728/files", + "html_url": "https://github.com/huggingface/transformers/pull/44728", "labels": [], "merged": false, - "number": 44676, + "number": 44728, "review_comments_count": 0, - "state": "open", - "title": "fix(gpt2): Resolve NaN/Inf issue in lm_head on Python 3.13 with tied weights", - "updated_at": "2026-03-18T17:16:49Z" + "state": "closed", + "title": "Fix float16 memory leak during 4-bit quantized model loading", + "updated_at": "2026-03-16T20:53:54Z" }, { - "additions": 32, - "author": "stevhliu", - "author_association": "MEMBER", - "body_excerpt": "properly formats the `ContinuousBatchingConfig` below: \"Screenshot", - "changed_files": 1, + "additions": 202, + "author": "LincolnBurrows2017", + "author_association": "FIRST_TIME_CONTRIBUTOR", + "body_excerpt": "Fixed issue where kwargs like force_download, proxies, token were not being passed to cached_file function.", + "changed_files": 11, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/44675", - "created_at": "2026-03-13T16:10:28Z", - "deletions": 14, + "conversation_url": "https://github.com/huggingface/transformers/pull/44727", + "created_at": "2026-03-15T19:41:24Z", + "deletions": 33, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44675/files", - "html_url": "https://github.com/huggingface/transformers/pull/44675", - "labels": [], - "merged": true, - "number": 44675, + "files_url": "https://github.com/huggingface/transformers/pull/44727/files", + "html_url": "https://github.com/huggingface/transformers/pull/44727", + "labels": [ + "Code agent slop" + ], + "merged": false, + "number": 44727, "review_comments_count": 0, "state": "closed", - "title": "[docs] cb config", - "updated_at": "2026-03-13T23:15:04Z" + "title": "fix: AutoProcessor.from_pretrained not passing kwargs to cached_file", + "updated_at": "2026-03-18T13:15:46Z" }, { - "additions": 408, - "author": "Rocketknight1", - "author_association": "MEMBER", - "body_excerpt": "We've had `parse_response()` in the library for a while, but it's been a soft launch / prototype feature. This PR cleans it up and documents it, making it an official feature! The API is largely unchanged from the prototype, but we drop `x\u2026", - "changed_files": 5, + "additions": 198, + "author": "LincolnBurrows2017", + "author_association": "FIRST_TIME_CONTRIBUTOR", + "body_excerpt": "Replaced bare except clause with except Exception in _safe_convert_tensor function to follow Python best practices (PEP 8).", + "changed_files": 10, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 4, - "conversation_url": "https://github.com/huggingface/transformers/pull/44674", - "created_at": "2026-03-13T15:41:42Z", - "deletions": 34, + "comments_count": 1, + "conversation_url": "https://github.com/huggingface/transformers/pull/44725", + "created_at": "2026-03-15T17:41:18Z", + "deletions": 29, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44674/files", - "html_url": "https://github.com/huggingface/transformers/pull/44674", - "labels": [], - "merged": true, - "number": 44674, - "review_comments_count": 11, + "files_url": "https://github.com/huggingface/transformers/pull/44725/files", + "html_url": "https://github.com/huggingface/transformers/pull/44725", + "labels": [ + "Code agent slop" + ], + "merged": false, + "number": 44725, + "review_comments_count": 0, "state": "closed", - "title": "Officially launch parse_response", - "updated_at": "2026-03-24T15:55:05Z" + "title": "fix: replace bare except with Exception in Fuyu image processing", + "updated_at": "2026-03-18T13:16:22Z" }, { - "additions": 73, - "author": "remi-or", + "additions": 6, + "author": "ydshieh", "author_association": "MEMBER", - "body_excerpt": "This PR fixes a bug in continuous batching where non-CUDA devices cannot use the feature because some CUDA-exclusive objects are always instantiated. It also adds a test to make sure this will not break again in the future.", - "changed_files": 3, + "body_excerpt": "# What does this PR do? TO be explained.", + "changed_files": 5, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/44673", - "created_at": "2026-03-13T15:37:01Z", - "deletions": 15, - "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44673/files", - "html_url": "https://github.com/huggingface/transformers/pull/44673", + "conversation_url": "https://github.com/huggingface/transformers/pull/44724", + "created_at": "2026-03-15T17:14:12Z", + "deletions": 5, + "draft": true, + "files_url": "https://github.com/huggingface/transformers/pull/44724/files", + "html_url": "https://github.com/huggingface/transformers/pull/44724", "labels": [], - "merged": true, - "number": 44673, - "review_comments_count": 0, - "state": "closed", - "title": "[CB] [Bug] Fix crashes when running without cuda", - "updated_at": "2026-03-15T23:59:55Z" + "merged": false, + "number": 44724, + "review_comments_count": 1, + "state": "open", + "title": "Fix some missing / incorrect entries in auto files", + "updated_at": "2026-03-16T09:59:56Z" }, { - "additions": 1, - "author": "neo", - "author_association": "CONTRIBUTOR", - "body_excerpt": "# What does this PR do? modular doesn't properly convert some files (e.g. kyutai) Also fixes red CI on main", + "additions": 4, + "author": "harshaljanjani", + "author_association": "CONTRIBUTOR", + "body_excerpt": "### What does this PR do? The following failing Dia use case was identified and fixed in this PR: \u2192 [MIGRATION_GUIDE_V5.md](https://github.com/harshaljanjani/transformers/blob/main/MIGRATION_GUIDE_V5.md) states that v5 renamed `additional_\u2026", "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/44283", - "created_at": "2026-02-25T18:33:17Z", - "deletions": 1, + "comments_count": 3, + "conversation_url": "https://github.com/huggingface/transformers/pull/44362", + "created_at": "2026-02-28T20:04:05Z", + "deletions": 6, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44283/files", - "html_url": "https://github.com/huggingface/transformers/pull/44283", + "files_url": "https://github.com/huggingface/transformers/pull/44362/files", + "html_url": "https://github.com/huggingface/transformers/pull/44362", "labels": [], "merged": true, - "number": 44283, + "number": 44362, "review_comments_count": 0, "state": "closed", - "title": "[`Modular`] Fix file type regression", - "updated_at": "2026-02-25T20:04:41Z" + "title": "fix(tokenizer): Fix MLukeTokenizer AttributeError post-v5 refactor", + "updated_at": "2026-03-02T14:51:18Z" }, { - "additions": 5, - "author": "Rocketknight1", - "author_association": "MEMBER", - "body_excerpt": "Response schema save-loading was broken in #40936, this PR restores it! I did most of this in #42300 but missed an issue with loading/saving.", - "changed_files": 1, + "additions": 341, + "author": "sxu75374", + "author_association": "NONE", + "body_excerpt": "# What does this PR do? Adds a workaround for the PyTorch MPS `sdpa_vector_2pass_mps` correctness bug ([pytorch/pytorch#174861](https://github.com/pytorch/pytorch/issues/174861)). **The problem:** On Apple Silicon with MPS backend, `F.scal\u2026", + "changed_files": 2, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/44282", - "created_at": "2026-02-25T17:57:54Z", - "deletions": 0, + "comments_count": 0, + "conversation_url": "https://github.com/huggingface/transformers/pull/44359", + "created_at": "2026-02-28T17:47:01Z", + "deletions": 1, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44282/files", - "html_url": "https://github.com/huggingface/transformers/pull/44282", - "labels": [], - "merged": true, - "number": 44282, + "files_url": "https://github.com/huggingface/transformers/pull/44359/files", + "html_url": "https://github.com/huggingface/transformers/pull/44359", + "labels": [ + "Code agent slop" + ], + "merged": false, + "number": 44359, "review_comments_count": 0, "state": "closed", - "title": "Restore response_schema saving-loading", - "updated_at": "2026-02-25T18:27:22Z" + "title": "fix(sdpa): add workaround for MPS sdpa_vector_2pass_mps correctness bug", + "updated_at": "2026-03-02T13:54:58Z" }, { - "additions": 1, - "author": "ArthurZucker", - "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? Its a very small fix for #44062", + "additions": 6, + "author": "hardikmeisheri", + "author_association": "FIRST_TIMER", + "body_excerpt": "## Summary - `ShieldGemma2ForImageClassification` was missing `_tied_weights_keys`, so `model.lm_head.weight` was randomly re-initialized on every `from_pretrained` call instead of being tied to `embed_tokens.weight`. - This caused non-det\u2026", "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/44281", - "created_at": "2026-02-25T16:28:37Z", - "deletions": 0, + "comments_count": 14, + "conversation_url": "https://github.com/huggingface/transformers/pull/44358", + "created_at": "2026-02-28T16:49:27Z", + "deletions": 2, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44281/files", - "html_url": "https://github.com/huggingface/transformers/pull/44281", + "files_url": "https://github.com/huggingface/transformers/pull/44358/files", + "html_url": "https://github.com/huggingface/transformers/pull/44358", "labels": [], "merged": true, - "number": 44281, - "review_comments_count": 0, + "number": 44358, + "review_comments_count": 1, "state": "closed", - "title": "Fix special token maps BC", - "updated_at": "2026-02-26T10:34:17Z" + "title": "Fix ShieldGemma2 non-reproducible outputs by adding _tied_weights_keys", + "updated_at": "2026-03-16T20:02:09Z" }, { - "additions": 614, - "author": "RishabhMehra", - "author_association": "FIRST_TIMER", - "body_excerpt": "# What does this PR do? - Adds an opt-in use_fast_grouping flag to TokenClassificationPipeline to enable a NumPy-vectorised BIO grouping path (~5\u00d7 faster on long sequences) while keeping the legacy path as default. - Improves correctness:\u2026", - "changed_files": 3, + "additions": 482, + "author": "NabilMch", + "author_association": "NONE", + "body_excerpt": "# What does this PR do? needs a test", - "changed_files": 36, + "additions": 16, + "author": "manavshrivastavagit", + "author_association": "NONE", + "body_excerpt": "## Summary Fixes #44336 `utils/loading_report.py` was emitting ANSI codes for **bold** and *italic* via `PALETTE['bold']` and `PALETTE['italic']` without checking if stdout is connected to a terminal. `_color()` already respects `sys.stdou\u2026", + "changed_files": 2, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 4, - "conversation_url": "https://github.com/huggingface/transformers/pull/44264", - "created_at": "2026-02-24T18:06:58Z", - "deletions": 210, - "draft": true, - "files_url": "https://github.com/huggingface/transformers/pull/44264/files", - "html_url": "https://github.com/huggingface/transformers/pull/44264", - "labels": [], + "comments_count": 1, + "conversation_url": "https://github.com/huggingface/transformers/pull/44343", + "created_at": "2026-02-27T20:58:33Z", + "deletions": 9, + "draft": false, + "files_url": "https://github.com/huggingface/transformers/pull/44343/files", + "html_url": "https://github.com/huggingface/transformers/pull/44343", + "labels": [ + "Code agent slop" + ], "merged": false, - "number": 44264, - "review_comments_count": 3, - "state": "open", - "title": "[`Moe`] Enable aux loss automatically when in training + coef is not 0", - "updated_at": "2026-02-25T18:53:20Z" + "number": 44343, + "review_comments_count": 0, + "state": "closed", + "title": "Fix ANSI codes in loading_report when stdout is not a TTY (fixes #44336)", + "updated_at": "2026-03-02T13:44:43Z" }, { - "additions": 5882, - "author": "SunMarc", + "additions": 384, + "author": "stevhliu", "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? This PR refactor the common tests that we have in Trainer. I've mainly did the following: - Split the tests that we have in `test_trainer.py` into multiple files. - Fix common tests that were failing in the CI", - "changed_files": 18, + "body_excerpt": "- created a new performance section divided into memory and speed optimizations - model memory training anatomy [guide](https://huggingface.co/docs/transformers/main/en/model_memory_anatomy) is now the more descriptive and simplified GPU m\u2026", + "changed_files": 9, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/44260", - "created_at": "2026-02-24T15:51:11Z", - "deletions": 6147, + "conversation_url": "https://github.com/huggingface/transformers/pull/44342", + "created_at": "2026-02-27T20:10:49Z", + "deletions": 274, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44260/files", - "html_url": "https://github.com/huggingface/transformers/pull/44260", + "files_url": "https://github.com/huggingface/transformers/pull/44342/files", + "html_url": "https://github.com/huggingface/transformers/pull/44342", "labels": [], "merged": true, - "number": 44260, - "review_comments_count": 3, + "number": 44342, + "review_comments_count": 12, "state": "closed", - "title": "Update common tests Trainer", - "updated_at": "2026-02-27T17:31:59Z" + "title": "[docs] training performance", + "updated_at": "2026-04-09T20:43:32Z" }, { - "additions": 1830, - "author": "winglian", - "author_association": "COLLABORATOR", - "body_excerpt": "# What does this PR do? This PR supersedes #43985 to replace the dataset/sampler/dataloader with a data producer that should allow us to more easily get to the next step of async training for RL. modular doesn't properly convert some files (e.g. kyutai) Also fixes red CI on main", + "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/44197", - "created_at": "2026-02-21T04:47:32Z", - "deletions": 2, + "comments_count": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/44283", + "created_at": "2026-02-25T18:33:17Z", + "deletions": 1, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44197/files", - "html_url": "https://github.com/huggingface/transformers/pull/44197", + "files_url": "https://github.com/huggingface/transformers/pull/44283/files", + "html_url": "https://github.com/huggingface/transformers/pull/44283", "labels": [], - "merged": false, - "number": 44197, + "merged": true, + "number": 44283, "review_comments_count": 0, "state": "closed", - "title": "Fix #43937: [GLM-5] ValueError: GenerationConfig is invalid", - "updated_at": "2026-02-23T09:42:54Z" + "title": "[`Modular`] Fix file type regression", + "updated_at": "2026-02-25T20:04:41Z" }, { - "additions": 12, - "author": "danielalanbates", - "author_association": "NONE", - "body_excerpt": "Fixes #43881 ## Summary This PR fixes: glm-4v-9b loading failed ## Changes ``` src/transformers/configuration_utils.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) ``` ## Testing Please review the changes carefully. T\u2026", + "additions": 5, + "author": "Rocketknight1", + "author_association": "MEMBER", + "body_excerpt": "Response schema save-loading was broken in #40936, this PR restores it! I did most of this in #42300 but missed an issue with loading/saving.", "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 0, - "conversation_url": "https://github.com/huggingface/transformers/pull/44196", - "created_at": "2026-02-21T04:41:02Z", - "deletions": 1, + "comments_count": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/44282", + "created_at": "2026-02-25T17:57:54Z", + "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44196/files", - "html_url": "https://github.com/huggingface/transformers/pull/44196", - "labels": [ - "Code agent slop" - ], - "merged": false, - "number": 44196, + "files_url": "https://github.com/huggingface/transformers/pull/44282/files", + "html_url": "https://github.com/huggingface/transformers/pull/44282", + "labels": [], + "merged": true, + "number": 44282, "review_comments_count": 0, "state": "closed", - "title": "Fix #43881: glm-4v-9b loading failed", - "updated_at": "2026-02-23T09:45:03Z" + "title": "Restore response_schema saving-loading", + "updated_at": "2026-02-25T18:27:22Z" }, { - "additions": 2, - "author": "danielalanbates", - "author_association": "NONE", - "body_excerpt": "Fixes #44062 ## Summary This PR fixes: TypeError: tokenizers.AddedToken() got multiple values for keyword argument 'special' ## Changes ``` src/transformers/tokenization_utils_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-\u2026", + "additions": 1, + "author": "ArthurZucker", + "author_association": "MEMBER", + "body_excerpt": "# What does this PR do? Its a very small fix for #44062", "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 0, - "conversation_url": "https://github.com/huggingface/transformers/pull/44195", - "created_at": "2026-02-21T04:38:14Z", - "deletions": 2, + "comments_count": 1, + "conversation_url": "https://github.com/huggingface/transformers/pull/44281", + "created_at": "2026-02-25T16:28:37Z", + "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44195/files", - "html_url": "https://github.com/huggingface/transformers/pull/44195", - "labels": [ - "Code agent slop" - ], - "merged": false, - "number": 44195, + "files_url": "https://github.com/huggingface/transformers/pull/44281/files", + "html_url": "https://github.com/huggingface/transformers/pull/44281", + "labels": [], + "merged": true, + "number": 44281, "review_comments_count": 0, "state": "closed", - "title": "Fix #44062: TypeError: tokenizers.AddedToken() got multiple values for k", - "updated_at": "2026-02-23T14:10:30Z" + "title": "Fix special token maps BC", + "updated_at": "2026-02-26T10:34:17Z" }, { - "additions": 16, - "author": "danielalanbates", - "author_association": "NONE", - "body_excerpt": "Fixes #44075 ## Summary This PR fixes: Optimizer SGD args are not used ## Changes ``` src/transformers/trainer_optimizer.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) ``` ## Testing Please review the changes carefully. The fix\u2026", - "changed_files": 1, + "additions": 614, + "author": "RishabhMehra", + "author_association": "FIRST_TIMER", + "body_excerpt": "# What does this PR do? - Adds an opt-in use_fast_grouping flag to TokenClassificationPipeline to enable a NumPy-vectorised BIO grouping path (~5\u00d7 faster on long sequences) while keeping the legacy path as default. - Improves correctness:\u2026", + "changed_files": 3, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 0, - "conversation_url": "https://github.com/huggingface/transformers/pull/44194", - "created_at": "2026-02-21T04:35:53Z", - "deletions": 0, + "conversation_url": "https://github.com/huggingface/transformers/pull/44278", + "created_at": "2026-02-25T12:49:56Z", + "deletions": 63, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44194/files", - "html_url": "https://github.com/huggingface/transformers/pull/44194", + "files_url": "https://github.com/huggingface/transformers/pull/44278/files", + "html_url": "https://github.com/huggingface/transformers/pull/44278", "labels": [ "Code agent slop" ], "merged": false, - "number": 44194, + "number": 44278, "review_comments_count": 0, "state": "closed", - "title": "Fix #44075: Optimizer SGD args are not used", - "updated_at": "2026-02-23T14:10:20Z" + "title": "[FEAT] Pipelines - Faster group_entities", + "updated_at": "2026-02-25T13:54:58Z" }, { - "additions": 2, - "author": "danielalanbates", + "additions": 105, + "author": "tarekziade", + "author_association": "MEMBER", + "body_excerpt": "# What does this PR do? This patch makes the GLM-ASR doc example runnable by using `runnables` - see https://github.com/huggingface/doc-builder/blob/main/docs/runnable-code-blocks.md", + "changed_files": 5, + "cluster_id": null, + "cluster_ids": [], + "cluster_role": null, + "comments_count": 36, + "conversation_url": "https://github.com/huggingface/transformers/pull/44277", + "created_at": "2026-02-25T08:49:20Z", + "deletions": 19, + "draft": false, + "files_url": "https://github.com/huggingface/transformers/pull/44277/files", + "html_url": "https://github.com/huggingface/transformers/pull/44277", + "labels": [], + "merged": true, + "number": 44277, + "review_comments_count": 6, + "state": "closed", + "title": "Use doc-builder runnable example for GLM-ASR", + "updated_at": "2026-04-02T16:16:55Z" + }, + { + "additions": 0, + "author": "vishalpatil-45", "author_association": "NONE", - "body_excerpt": "Fixes #43986 ## Summary This PR fixes: Confusing crash when loading a video model through AutoProcessor without torchvision installed ## Changes ``` src/transformers/models/auto/video_processing_auto.py | 2 ++ 1 file changed, 2 insertions(\u2026", - "changed_files": 1, + "body_excerpt": "# What does this PR do? This PR addresses the performance regression where `import transformers` takes ~3.5s. The issue was caused by eager imports of heavy backend libraries (like torch/numpy) during the initial module load. By moving the\u2026", + "changed_files": 0, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 0, - "conversation_url": "https://github.com/huggingface/transformers/pull/44193", - "created_at": "2026-02-21T04:34:37Z", + "comments_count": 1, + "conversation_url": "https://github.com/huggingface/transformers/pull/44275", + "created_at": "2026-02-25T08:27:32Z", "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44193/files", - "html_url": "https://github.com/huggingface/transformers/pull/44193", + "files_url": "https://github.com/huggingface/transformers/pull/44275/files", + "html_url": "https://github.com/huggingface/transformers/pull/44275", "labels": [ "Code agent slop" ], "merged": false, - "number": 44193, + "number": 44275, "review_comments_count": 0, "state": "closed", - "title": "Fix #43986: Confusing crash when loading a video model through AutoProce", - "updated_at": "2026-02-23T09:46:15Z" + "title": "[Fix] Restore lazy loading to improve import performance (#44273)", + "updated_at": "2026-02-25T20:37:18Z" }, { - "additions": 3, - "author": "danielalanbates", + "additions": 559, + "author": "paipeline", "author_association": "NONE", - "body_excerpt": "Fixes #44079 ## Summary This PR fixes: `ModelOutput` keys aren't correctly assigned if key was previously None ## Changes ``` src/transformers/utils/generic.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) ``` ## Testing Please r\u2026", - "changed_files": 1, + "body_excerpt": "## Description Fixes #44242 This PR resolves an issue where the auxiliary load balancing loss was not computed when `output_router_logits=False`, even when `router_aux_loss_coef != 0`. ## Problem The auxiliary loss computation was incorrec\u2026", + "changed_files": 6, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/44192", - "created_at": "2026-02-21T04:33:52Z", + "conversation_url": "https://github.com/huggingface/transformers/pull/44274", + "created_at": "2026-02-25T06:38:02Z", "deletions": 1, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44192/files", - "html_url": "https://github.com/huggingface/transformers/pull/44192", + "files_url": "https://github.com/huggingface/transformers/pull/44274/files", + "html_url": "https://github.com/huggingface/transformers/pull/44274", "labels": [ "Code agent slop" ], "merged": false, - "number": 44192, + "number": 44274, "review_comments_count": 0, "state": "closed", - "title": "Fix #44079: `ModelOutput` keys aren't correctly assigned if key was prev", - "updated_at": "2026-02-23T14:10:14Z" + "title": "Fix auxiliary load balancing loss computation when output_router_logits=False", + "updated_at": "2026-02-25T13:36:03Z" }, { - "additions": 95, - "author": "danielalanbates", - "author_association": "NONE", - "body_excerpt": "Fixes #44155 ## Summary This PR fixes: [AudioFlamingo3] Batched inference produces incorrect results due to embedding/token leak between tracks ## Changes ``` .../audioflamingo3/modeling_audioflamingo3.py | 51 +++++++++++++++++++--- .../au\u2026", - "changed_files": 3, + "additions": 1, + "author": "hangjun-ezra", + "author_association": "CONTRIBUTOR", + "body_excerpt": "## What does this PR do? Fixes a `TypeError: unsupported operand type(s) for |: 'list' and 'set'` in `RotaryEmbeddingConfigMixin.convert_rope_params_to_dict` when `ignore_keys_at_rope_validation` is a `list` instead of a `set`. ### Root ca\u2026", + "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/44191", - "created_at": "2026-02-21T04:32:30Z", - "deletions": 11, + "comments_count": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/44272", + "created_at": "2026-02-25T03:52:04Z", + "deletions": 1, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44191/files", - "html_url": "https://github.com/huggingface/transformers/pull/44191", - "labels": [ - "Audio" - ], - "merged": false, - "number": 44191, + "files_url": "https://github.com/huggingface/transformers/pull/44272/files", + "html_url": "https://github.com/huggingface/transformers/pull/44272", + "labels": [], + "merged": true, + "number": 44272, "review_comments_count": 0, "state": "closed", - "title": "Fix #44155: [AudioFlamingo3] Batched inference produces incorrect result", - "updated_at": "2026-03-19T16:16:17Z" + "title": "Fix TypeError in convert_rope_params_to_dict when ignore_keys is a list", + "updated_at": "2026-02-25T14:38:36Z" }, { - "additions": 3, - "author": "excepshenal", - "author_association": "FIRST_TIME_CONTRIBUTOR", - "body_excerpt": "# What does this PR do? Under fp16_full_eval or bf16_full_eval, still don't move model to device if using another dist train backend. This is causing bugs with FSDP2 + bf16_full_eval. The dist train backend would still be in charge of movi\u2026", - "changed_files": 1, + "additions": 1272, + "author": "balak4", + "author_association": "CONTRIBUTOR", + "body_excerpt": "## Summary - Add GreedyLR, a metric-based adaptive learning rate scheduler that adjusts the learning rate during training based on the current loss - Based on [\"Dynamic Learning Rate Scheduling based on Loss Changes Leads to Faster Converg\u2026", + "changed_files": 10, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 0, - "conversation_url": "https://github.com/huggingface/transformers/pull/44189", - "created_at": "2026-02-21T00:06:16Z", - "deletions": 1, + "comments_count": 9, + "conversation_url": "https://github.com/huggingface/transformers/pull/44271", + "created_at": "2026-02-25T01:40:57Z", + "deletions": 7, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44189/files", - "html_url": "https://github.com/huggingface/transformers/pull/44189", + "files_url": "https://github.com/huggingface/transformers/pull/44271/files", + "html_url": "https://github.com/huggingface/transformers/pull/44271", "labels": [], - "merged": false, - "number": 44189, - "review_comments_count": 0, - "state": "open", - "title": "fix: don't move model to device under other dist train backends", - "updated_at": "2026-02-21T00:06:16Z" + "merged": true, + "number": 44271, + "review_comments_count": 3, + "state": "closed", + "title": "Add GreedyLR adaptive learning rate scheduler", + "updated_at": "2026-03-18T18:45:46Z" }, { - "additions": 3, - "author": "harshaljanjani", - "author_association": "CONTRIBUTOR", - "body_excerpt": "### What does this PR do? The following issues were identified and fixed in this PR: \u2192 The NER/token classification issue and the downstream bug uncovered in the batched preprocessing use case with `LayoutLMv2Tokenizer`. \u2192 **Reasoning:** T\u2026", - "changed_files": 1, + "additions": 88, + "author": "yonigozlan", + "author_association": "MEMBER", + "body_excerpt": "# What does this PR do? A lot of ProcessorsKwargs have incorrect/unspecified type hints in their ProcessorsKwargs TypedDict for their images_kwargs attribute. Functionnaly, this did not cause issues as \"_merge_kwargs\" automatically picks u\u2026", + "changed_files": 44, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 5, - "conversation_url": "https://github.com/huggingface/transformers/pull/44187", - "created_at": "2026-02-20T20:02:04Z", + "comments_count": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/44270", + "created_at": "2026-02-25T00:11:31Z", "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44187/files", - "html_url": "https://github.com/huggingface/transformers/pull/44187", + "files_url": "https://github.com/huggingface/transformers/pull/44270/files", + "html_url": "https://github.com/huggingface/transformers/pull/44270", "labels": [], - "merged": true, - "number": 44187, + "merged": false, + "number": 44270, "review_comments_count": 0, - "state": "closed", - "title": "fix(models): Fix LayoutLMv2 NER crash and broken batched truncation/padding", - "updated_at": "2026-02-23T10:30:51Z" + "state": "open", + "title": "Add correct typing to custom images_kwargs in ProcessorsKwargs", + "updated_at": "2026-02-25T01:12:06Z" }, { - "additions": 361, - "author": "stevhliu", + "additions": 30, + "author": "yonigozlan", "author_association": "MEMBER", - "body_excerpt": "part 1 of refactoring the `Trainer` docs - restructure the `toctree` a bit to accommodate new sections and docs - slim down `trainer.md` to be a clearer entry point (will expand the `## Next steps` section as we continue for better navigat\u2026", - "changed_files": 7, + "body_excerpt": "# What does this PR do? This is a follow-up to https://github.com/huggingface/transformers/pull/43748, and will allow to have clickable links to the full modality kwargs when present in the docstring of a processor or image processor Cc @s\u2026", + "changed_files": 3, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/44185", - "created_at": "2026-02-20T19:25:07Z", - "deletions": 578, + "comments_count": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/44269", + "created_at": "2026-02-25T00:05:47Z", + "deletions": 2, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44185/files", - "html_url": "https://github.com/huggingface/transformers/pull/44185", + "files_url": "https://github.com/huggingface/transformers/pull/44269/files", + "html_url": "https://github.com/huggingface/transformers/pull/44269", "labels": [], "merged": true, - "number": 44185, - "review_comments_count": 19, + "number": 44269, + "review_comments_count": 0, "state": "closed", - "title": "[docs] trainer part 1", - "updated_at": "2026-02-24T21:18:42Z" + "title": "Add `ProcessingKwargs` `ImagesKwargs` etc. to docs", + "updated_at": "2026-02-27T19:03:15Z" }, { - "additions": 191, - "author": "mariam851", + "additions": 5, + "author": "ethanknights", "author_association": "CONTRIBUTOR", - "body_excerpt": "This PR implements the initial architecture for CircuitGPT (based on OpenAI's research), as discussed in #44121. Key implementations: SparseLinear: Custom layer with Top-K weight sparsity logic. CircuitGpt Components: Attention, MLP, and C\u2026", - "changed_files": 3, + "body_excerpt": "# What does this PR do? Some improvements to the `trainer.py` docs. ## Before submitting - [x] This PR fixes a typo or improves the docs. ## Who can review? Documentation: @stevhliu", + "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 0, - "conversation_url": "https://github.com/huggingface/transformers/pull/44184", - "created_at": "2026-02-20T16:58:27Z", - "deletions": 0, + "comments_count": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/44268", + "created_at": "2026-02-24T23:20:16Z", + "deletions": 4, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44184/files", - "html_url": "https://github.com/huggingface/transformers/pull/44184", + "files_url": "https://github.com/huggingface/transformers/pull/44268/files", + "html_url": "https://github.com/huggingface/transformers/pull/44268", "labels": [], - "merged": false, - "number": 44184, + "merged": true, + "number": 44268, "review_comments_count": 0, - "state": "open", - "title": "feat: add OpenAI CircuitGPT core architecture and sparse linear layers", - "updated_at": "2026-02-20T17:18:44Z" + "state": "closed", + "title": "chore: fixes in `Trainer` class docs (`compute_loss` & `hyperparameter_search`)", + "updated_at": "2026-02-26T00:50:23Z" }, { - "additions": 1, - "author": "Rocketknight1", - "author_association": "MEMBER", - "body_excerpt": "Our code has some references to the `grouped_entities` arg to the token classification pipeline, but this is no longer usable. This PR cleans them up entirely! Fixes #44016", - "changed_files": 2, + "additions": 4, + "author": "manavshrivastavagit", + "author_association": "NONE", + "body_excerpt": "## Summary - Update the `DocumentQuestionAnsweringPipeline` docstring to explicitly mention the task summary in the Transformers documentation. - Remove the stale TODO comment now that document question answering is covered in the task sum\u2026", + "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/44182", - "created_at": "2026-02-20T15:28:26Z", + "comments_count": 1, + "conversation_url": "https://github.com/huggingface/transformers/pull/44267", + "created_at": "2026-02-24T20:35:18Z", "deletions": 4, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44182/files", - "html_url": "https://github.com/huggingface/transformers/pull/44182", - "labels": [], - "merged": true, - "number": 44182, + "files_url": "https://github.com/huggingface/transformers/pull/44267/files", + "html_url": "https://github.com/huggingface/transformers/pull/44267", + "labels": [ + "Code agent slop" + ], + "merged": false, + "number": 44267, "review_comments_count": 0, "state": "closed", - "title": "Remove refs to grouped_entities", - "updated_at": "2026-02-24T16:07:24Z" + "title": "Docs: point DocumentQuestionAnswering pipeline to task summary", + "updated_at": "2026-02-25T13:34:48Z" }, { - "additions": 898, - "author": "Cyrilvallez", - "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? As per the title! Follow up of https://github.com/huggingface/transformers/pull/44130 and https://github.com/huggingface/transformers/pull/44226. Finally remove the `cache_position` everywhere (not ALL models, but a\u2026", - "changed_files": 169, + "additions": 27, + "author": "harshaljanjani", + "author_association": "CONTRIBUTOR", + "body_excerpt": "### What does this PR do? The following issue was identified and fixed in this PR: \u2192 **Reasoning:** The impact of this fix goes beyond `Mask2Former` and `DeformableDetr` and should fix any model that uses `torch_compilable_check`. Most use\u2026", + "changed_files": 2, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 11, - "conversation_url": "https://github.com/huggingface/transformers/pull/44181", - "created_at": "2026-02-20T15:24:39Z", - "deletions": 2698, + "comments_count": 8, + "conversation_url": "https://github.com/huggingface/transformers/pull/44266", + "created_at": "2026-02-24T20:02:06Z", + "deletions": 1, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44181/files", - "html_url": "https://github.com/huggingface/transformers/pull/44181", + "files_url": "https://github.com/huggingface/transformers/pull/44266/files", + "html_url": "https://github.com/huggingface/transformers/pull/44266", "labels": [], "merged": true, - "number": 44181, - "review_comments_count": 32, + "number": 44266, + "review_comments_count": 0, "state": "closed", - "title": "[core] \ud83d\udea8 Completely remove cache positions", - "updated_at": "2026-03-04T18:08:42Z" + "title": "fix(utils): Make torch_compilable_check compatible with torch.export strict mode", + "updated_at": "2026-04-18T08:31:33Z" }, { - "additions": 28, - "author": "tarekziade", + "additions": 90, + "author": "vasqu", "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? Fixes a flaky test in IdeficsForVisionText2TextTest::test_generate_continue_from_inputs_embeds. The flakiness can be reproduced with: ``` pytest -q -p no:rerunfailures --flake-finder --flake-runs=20 \\ tests/models/i\u2026", - "changed_files": 1, + "body_excerpt": "As per title, WIP --> needs a test", + "changed_files": 36, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 10, - "conversation_url": "https://github.com/huggingface/transformers/pull/44180", - "created_at": "2026-02-20T14:30:46Z", - "deletions": 2, - "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44180/files", - "html_url": "https://github.com/huggingface/transformers/pull/44180", + "comments_count": 4, + "conversation_url": "https://github.com/huggingface/transformers/pull/44264", + "created_at": "2026-02-24T18:06:58Z", + "deletions": 210, + "draft": true, + "files_url": "https://github.com/huggingface/transformers/pull/44264/files", + "html_url": "https://github.com/huggingface/transformers/pull/44264", "labels": [], - "merged": true, - "number": 44180, - "review_comments_count": 0, - "state": "closed", - "title": "fix(flaky): idefics generate cache flake", - "updated_at": "2026-02-26T16:18:18Z" + "merged": false, + "number": 44264, + "review_comments_count": 3, + "state": "open", + "title": "[`Moe`] Enable aux loss automatically when in training + coef is not 0", + "updated_at": "2026-02-25T18:53:20Z" }, { - "additions": 27, - "author": "itazap", + "additions": 5882, + "author": "SunMarc", "author_association": "MEMBER", - "body_excerpt": "Models with incorrect tokenizer_class in tokenization_config.json that should use TokenziersBackend", - "changed_files": 1, + "body_excerpt": "# What does this PR do? This PR refactor the common tests that we have in Trainer. I've mainly did the following: - Split the tests that we have in `test_trainer.py` into multiple files. - Fix common tests that were failing in the CI", + "changed_files": 18, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 6, - "conversation_url": "https://github.com/huggingface/transformers/pull/44179", - "created_at": "2026-02-20T13:51:44Z", - "deletions": 0, + "comments_count": 1, + "conversation_url": "https://github.com/huggingface/transformers/pull/44260", + "created_at": "2026-02-24T15:51:11Z", + "deletions": 6147, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44179/files", - "html_url": "https://github.com/huggingface/transformers/pull/44179", + "files_url": "https://github.com/huggingface/transformers/pull/44260/files", + "html_url": "https://github.com/huggingface/transformers/pull/44260", "labels": [], "merged": true, - "number": 44179, - "review_comments_count": 1, + "number": 44260, + "review_comments_count": 3, "state": "closed", - "title": "Models with incorrect tokenizer_class in tokenization_config.json tha\u2026", - "updated_at": "2026-02-23T08:33:13Z" + "title": "Update common tests Trainer", + "updated_at": "2026-02-27T17:31:59Z" }, { - "additions": 2993, - "author": "ebezzam", - "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? Re-opening https://github.com/huggingface/transformers/pull/37868 TODO - [x] recompute expected outputs - [x] passthrough code given new conventions - [x] check for unused code paths / configuration parameters Origi\u2026", - "changed_files": 27, + "additions": 1830, + "author": "winglian", + "author_association": "COLLABORATOR", + "body_excerpt": "# What does this PR do? This PR supersedes #43985 to replace the dataset/sampler/dataloader with a data producer that should allow us to more easily get to the next step of async training for RL. \"\". Then we compare `\"\" != \"LlamaTokenizer\"` (the `tokenizer_class` in `tokenizer_config.json`). Since that's true we earl\u2026", + "body_excerpt": "# What does this PR do? main is currently failing with ``` FAILED tests/models/higgs_audio_v2/test_modeling_higgs_audio_v2.py::HiggsAudioV2ModelTest::test_generate_compilation_all_outputs - AssertionError: Lists differ: [torch.Size([2, 15,\u2026", "changed_files": 3, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 5, - "conversation_url": "https://github.com/huggingface/transformers/pull/44127", - "created_at": "2026-02-18T10:41:48Z", - "deletions": 8, + "comments_count": 9, + "conversation_url": "https://github.com/huggingface/transformers/pull/44201", + "created_at": "2026-02-21T10:03:41Z", + "deletions": 13, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44127/files", - "html_url": "https://github.com/huggingface/transformers/pull/44127", + "files_url": "https://github.com/huggingface/transformers/pull/44201/files", + "html_url": "https://github.com/huggingface/transformers/pull/44201", "labels": [], "merged": true, - "number": 44127, - "review_comments_count": 0, + "number": 44201, + "review_comments_count": 3, "state": "closed", - "title": "AutoTokenizer ignores config when model_type is None", - "updated_at": "2026-02-18T14:47:52Z" + "title": "fix: HiggsAudioV2 cached decode inputs in compiled generation", + "updated_at": "2026-02-23T12:39:19Z" }, { - "additions": 17, - "author": "Cyrilvallez", - "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? As per the title. Let's simplify after https://github.com/huggingface/transformers/pull/42848", - "changed_files": 2, + "additions": 3, + "author": "pragnyanramtha", + "author_association": "FIRST_TIME_CONTRIBUTOR", + "body_excerpt": "Fixes #43782 The `weights_only` parameter passed to `from_pretrained()` was not being forwarded to `load_state_dict()` when loading `.bin` checkpoint files in the non-DeepSpeed code path. This caused `weights_only` to always default to `Tr\u2026", + "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/44126", - "created_at": "2026-02-18T09:58:49Z", - "deletions": 40, + "comments_count": 0, + "conversation_url": "https://github.com/huggingface/transformers/pull/44200", + "created_at": "2026-02-21T06:24:17Z", + "deletions": 1, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44126/files", - "html_url": "https://github.com/huggingface/transformers/pull/44126", - "labels": [], - "merged": true, - "number": 44126, + "files_url": "https://github.com/huggingface/transformers/pull/44200/files", + "html_url": "https://github.com/huggingface/transformers/pull/44200", + "labels": [ + "Code agent slop" + ], + "merged": false, + "number": 44200, "review_comments_count": 0, "state": "closed", - "title": "Simplify input preparation in generate", - "updated_at": "2026-02-18T10:30:48Z" + "title": "fix: propagate `weights_only` param to `load_state_dict` in .bin loading path (#43782)", + "updated_at": "2026-02-23T14:20:12Z" }, { - "additions": 8, - "author": "zucchini-nlp", - "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? Fixes https://github.com/huggingface/transformers/issues/43986", + "additions": 3, + "author": "gowthamr-tech", + "author_association": "CONTRIBUTOR", + "body_excerpt": "## What does this PR do? This PR fixes an issue in `run_image_classification_no_trainer.py` where the script always loaded `dataset_name` (e.g., CIFAR10) even when `--train_dir` or `--validation_dir` was provided. Now, when local dataset d\u2026", "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/44125", - "created_at": "2026-02-18T09:34:54Z", - "deletions": 7, + "comments_count": 3, + "conversation_url": "https://github.com/huggingface/transformers/pull/44199", + "created_at": "2026-02-21T06:03:29Z", + "deletions": 2, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44125/files", - "html_url": "https://github.com/huggingface/transformers/pull/44125", + "files_url": "https://github.com/huggingface/transformers/pull/44199/files", + "html_url": "https://github.com/huggingface/transformers/pull/44199", "labels": [], "merged": true, - "number": 44125, - "review_comments_count": 2, + "number": 44199, + "review_comments_count": 0, "state": "closed", - "title": "Raise informative error when loading video processors", - "updated_at": "2026-02-20T08:23:35Z" + "title": "Fix local dataset loading priority in run_image_classification_no_tra\u2026", + "updated_at": "2026-02-24T15:10:17Z" }, { - "additions": 10, - "author": "mariam851", - "author_association": "CONTRIBUTOR", - "body_excerpt": "Description: Adds eval_on_end to TrainingArguments to force evaluation at the end of training, even if the last step doesn't align with eval_steps. Changes: training_args.py: Added eval_on_end field. trainer.py: Added logic to call evaluat\u2026", + "additions": 71, + "author": "danielalanbates", + "author_association": "NONE", + "body_excerpt": "Fixes #43975 ## Summary This PR fixes: `deepseek-ai/deepseek-coder-6.7b-instruct` incorrectly detokenizes in v5 ## Changes ``` src/transformers/tokenization_utils_tokenizers.py | 12 ++++- tests/models/llama/test_tokenization_llama.py | 60\u2026", "changed_files": 2, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 0, - "conversation_url": "https://github.com/huggingface/transformers/pull/44124", - "created_at": "2026-02-18T08:52:23Z", - "deletions": 0, + "comments_count": 1, + "conversation_url": "https://github.com/huggingface/transformers/pull/44198", + "created_at": "2026-02-21T04:54:47Z", + "deletions": 1, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44124/files", - "html_url": "https://github.com/huggingface/transformers/pull/44124", - "labels": [], + "files_url": "https://github.com/huggingface/transformers/pull/44198/files", + "html_url": "https://github.com/huggingface/transformers/pull/44198", + "labels": [ + "Code agent slop" + ], "merged": false, - "number": 44124, + "number": 44198, "review_comments_count": 0, "state": "closed", - "title": "feat: add eval_on_end to Trainer for final evaluation", - "updated_at": "2026-02-18T14:14:16Z" + "title": "Fix #43975: `deepseek-ai/deepseek-coder-6.7b-instruct` incorrectly detok", + "updated_at": "2026-02-23T14:10:47Z" }, { - "additions": 33, - "author": "cyyever", - "author_association": "CONTRIBUTOR", - "body_excerpt": "# What does this PR do? This PR avoids device sync in training loss accumulation by ```torch.where```. The `is_torch_xla_available` condition is also removed.", - "changed_files": 1, + "additions": 37, + "author": "danielalanbates", + "author_association": "NONE", + "body_excerpt": "Fixes #43937 ## Summary This PR fixes: [GLM-5] ValueError: GenerationConfig is invalid ## Changes ``` src/transformers/generation/configuration_utils.py | 13 +++++++++++- src/transformers/modeling_utils.py | 2 +- tests/generation/test_conf\u2026", + "changed_files": 3, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/44123", - "created_at": "2026-02-18T08:22:57Z", - "deletions": 22, + "conversation_url": "https://github.com/huggingface/transformers/pull/44197", + "created_at": "2026-02-21T04:47:32Z", + "deletions": 2, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44123/files", - "html_url": "https://github.com/huggingface/transformers/pull/44123", + "files_url": "https://github.com/huggingface/transformers/pull/44197/files", + "html_url": "https://github.com/huggingface/transformers/pull/44197", "labels": [], "merged": false, - "number": 44123, + "number": 44197, "review_comments_count": 0, - "state": "open", - "title": "Avoid device sync in training loss accumulation", - "updated_at": "2026-03-30T07:57:16Z" + "state": "closed", + "title": "Fix #43937: [GLM-5] ValueError: GenerationConfig is invalid", + "updated_at": "2026-02-23T09:42:54Z" }, { - "additions": 158, - "author": "adityuhkapoor", + "additions": 12, + "author": "danielalanbates", "author_association": "NONE", - "body_excerpt": "# What does this PR do? Adds 4-bit embedding quantization for BitsAndBytes, mirroring TorchAO's existing `include_input_output_embeddings` and `untie_embedding_weights` pattern (PRs #37802, #37905, #37935). Large-vocabulary models (Llama 3\u2026", - "changed_files": 4, + "body_excerpt": "Fixes #43881 ## Summary This PR fixes: glm-4v-9b loading failed ## Changes ``` src/transformers/configuration_utils.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) ``` ## Testing Please review the changes carefully. T\u2026", + "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/44122", - "created_at": "2026-02-18T06:35:09Z", - "deletions": 2, + "comments_count": 0, + "conversation_url": "https://github.com/huggingface/transformers/pull/44196", + "created_at": "2026-02-21T04:41:02Z", + "deletions": 1, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44122/files", - "html_url": "https://github.com/huggingface/transformers/pull/44122", + "files_url": "https://github.com/huggingface/transformers/pull/44196/files", + "html_url": "https://github.com/huggingface/transformers/pull/44196", "labels": [ "Code agent slop" ], "merged": false, - "number": 44122, + "number": 44196, "review_comments_count": 0, "state": "closed", - "title": "Add BnB 4-bit embedding quantization support", - "updated_at": "2026-02-18T14:27:25Z" + "title": "Fix #43881: glm-4v-9b loading failed", + "updated_at": "2026-02-23T09:45:03Z" }, { - "additions": 14, - "author": "tirth8205", + "additions": 2, + "author": "danielalanbates", "author_association": "NONE", - "body_excerpt": "Fixes #34920 After applying `normalize()`, images can have negative values. Calling `resize()` on such images fails because it internally converts to PIL, which requires values in [0, 1] or [0, 255]. ### Fix When the image has values outsi\u2026", + "body_excerpt": "Fixes #44062 ## Summary This PR fixes: TypeError: tokenizers.AddedToken() got multiple values for keyword argument 'special' ## Changes ``` src/transformers/tokenization_utils_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-\u2026", "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 0, - "conversation_url": "https://github.com/huggingface/transformers/pull/44120", - "created_at": "2026-02-17T23:56:48Z", - "deletions": 0, + "conversation_url": "https://github.com/huggingface/transformers/pull/44195", + "created_at": "2026-02-21T04:38:14Z", + "deletions": 2, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44120/files", - "html_url": "https://github.com/huggingface/transformers/pull/44120", + "files_url": "https://github.com/huggingface/transformers/pull/44195/files", + "html_url": "https://github.com/huggingface/transformers/pull/44195", "labels": [ "Code agent slop" ], "merged": false, - "number": 44120, + "number": 44195, "review_comments_count": 0, "state": "closed", - "title": "fix: allow image_transforms.resize to handle negative values after normalization", - "updated_at": "2026-02-18T14:08:54Z" + "title": "Fix #44062: TypeError: tokenizers.AddedToken() got multiple values for k", + "updated_at": "2026-02-23T14:10:30Z" }, { - "additions": 1, - "author": "tirth8205", + "additions": 16, + "author": "danielalanbates", "author_association": "NONE", - "body_excerpt": "Fixes #44117 `TOKENIZER_MAPPING_NAMES.get(config_model_type, \"\")` returns `None` when the key exists with value `None`, causing `AttributeError: 'NoneType' object has no attribute 'replace'` when loading models like `google/siglip2-so400m-\u2026", + "body_excerpt": "Fixes #44075 ## Summary This PR fixes: Optimizer SGD args are not used ## Changes ``` src/transformers/trainer_optimizer.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) ``` ## Testing Please review the changes carefully. The fix\u2026", "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/44119", - "created_at": "2026-02-17T23:53:20Z", - "deletions": 1, + "comments_count": 0, + "conversation_url": "https://github.com/huggingface/transformers/pull/44194", + "created_at": "2026-02-21T04:35:53Z", + "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44119/files", - "html_url": "https://github.com/huggingface/transformers/pull/44119", - "labels": [], + "files_url": "https://github.com/huggingface/transformers/pull/44194/files", + "html_url": "https://github.com/huggingface/transformers/pull/44194", + "labels": [ + "Code agent slop" + ], "merged": false, - "number": 44119, + "number": 44194, "review_comments_count": 0, "state": "closed", - "title": "fix: handle None value from TOKENIZER_MAPPING_NAMES.get() in AutoTokenizer", - "updated_at": "2026-02-18T14:04:47Z" + "title": "Fix #44075: Optimizer SGD args are not used", + "updated_at": "2026-02-23T14:10:20Z" }, { - "additions": 32, - "author": "tirth8205", + "additions": 2, + "author": "danielalanbates", "author_association": "NONE", - "body_excerpt": "## Fix Fixes #44079 When a `ModelOutput` dataclass field is initialized as `None`, it is correctly excluded from the OrderedDict keys. However, **subsequently setting that field to a non-None value** via attribute assignment (e.g. `outputs\u2026", - "changed_files": 2, + "body_excerpt": "Fixes #43986 ## Summary This PR fixes: Confusing crash when loading a video model through AutoProcessor without torchvision installed ## Changes ``` src/transformers/models/auto/video_processing_auto.py | 2 ++ 1 file changed, 2 insertions(\u2026", + "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 0, - "conversation_url": "https://github.com/huggingface/transformers/pull/44118", - "created_at": "2026-02-17T23:31:31Z", + "conversation_url": "https://github.com/huggingface/transformers/pull/44193", + "created_at": "2026-02-21T04:34:37Z", "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44118/files", - "html_url": "https://github.com/huggingface/transformers/pull/44118", + "files_url": "https://github.com/huggingface/transformers/pull/44193/files", + "html_url": "https://github.com/huggingface/transformers/pull/44193", "labels": [ "Code agent slop" ], "merged": false, - "number": 44118, + "number": 44193, "review_comments_count": 0, "state": "closed", - "title": "fix: ModelOutput keys not updated when setting previously-None dataclass fields", - "updated_at": "2026-02-18T14:18:12Z" + "title": "Fix #43986: Confusing crash when loading a video model through AutoProce", + "updated_at": "2026-02-23T09:46:15Z" }, { - "additions": 27, - "author": "dtiourine", - "author_association": "FIRST_TIME_CONTRIBUTOR", - "body_excerpt": "Migrate Flaubert to the @capture_outputs and @can_return_tuple decorator pattern for output handling, as part of #43979. # What does this PR do? - Add `_can_record_outputs = {\"attentions\": MultiHeadAttention}` on `FlaubertPreTrainedModel`\u2026", + "additions": 3, + "author": "danielalanbates", + "author_association": "NONE", + "body_excerpt": "Fixes #44079 ## Summary This PR fixes: `ModelOutput` keys aren't correctly assigned if key was previously None ## Changes ``` src/transformers/utils/generic.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) ``` ## Testing Please r\u2026", "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/44116", - "created_at": "2026-02-17T21:52:13Z", - "deletions": 102, + "conversation_url": "https://github.com/huggingface/transformers/pull/44192", + "created_at": "2026-02-21T04:33:52Z", + "deletions": 1, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44116/files", - "html_url": "https://github.com/huggingface/transformers/pull/44116", - "labels": [], + "files_url": "https://github.com/huggingface/transformers/pull/44192/files", + "html_url": "https://github.com/huggingface/transformers/pull/44192", + "labels": [ + "Code agent slop" + ], "merged": false, - "number": 44116, + "number": 44192, "review_comments_count": 0, - "state": "open", - "title": "[WIP] [Flaubert] Refactor output tracing to decorator-based interface", - "updated_at": "2026-02-17T21:53:23Z" + "state": "closed", + "title": "Fix #44079: `ModelOutput` keys aren't correctly assigned if key was prev", + "updated_at": "2026-02-23T14:10:14Z" }, { - "additions": 2, - "author": "Deep-unlearning", - "author_association": "MEMBER", - "body_excerpt": "## Summary - Fix broken `[chat template](./chat_templating)` links in `docs/source/en/tasks/` - `./chat_templating` resolves within `tasks/` (doesn't exist); corrected to `../chat_templating` - Affected files: `tasks/image_text_to_text.md`\u2026", - "changed_files": 2, + "additions": 95, + "author": "danielalanbates", + "author_association": "NONE", + "body_excerpt": "Fixes #44155 ## Summary This PR fixes: [AudioFlamingo3] Batched inference produces incorrect results due to embedding/token leak between tracks ## Changes ``` .../audioflamingo3/modeling_audioflamingo3.py | 51 +++++++++++++++++++--- .../au\u2026", + "changed_files": 3, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/44115", - "created_at": "2026-02-17T21:32:55Z", - "deletions": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/44191", + "created_at": "2026-02-21T04:32:30Z", + "deletions": 11, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44115/files", - "html_url": "https://github.com/huggingface/transformers/pull/44115", - "labels": [], - "merged": true, - "number": 44115, + "files_url": "https://github.com/huggingface/transformers/pull/44191/files", + "html_url": "https://github.com/huggingface/transformers/pull/44191", + "labels": [ + "Audio" + ], + "merged": false, + "number": 44191, "review_comments_count": 0, "state": "closed", - "title": "[docs] fix broken chat_templating links in tasks docs", - "updated_at": "2026-02-23T16:27:57Z" + "title": "Fix #44155: [AudioFlamingo3] Batched inference produces incorrect result", + "updated_at": "2026-03-19T16:16:17Z" }, { - "additions": 716, - "author": "23atharvaS", + "additions": 3, + "author": "excepshenal", "author_association": "FIRST_TIME_CONTRIBUTOR", - "body_excerpt": "## Summary This PR migrates the `wav2vec2` family to the standardized output-capturing interface (`@capture_outputs` + `@can_return_tuple`) and includes follow-up compatibility fixes required to make full CI green. ## What changed ### Core\u2026", - "changed_files": 19, + "body_excerpt": "# What does this PR do? Under fp16_full_eval or bf16_full_eval, still don't move model to device if using another dist train backend. This is causing bugs with FSDP2 + bf16_full_eval. The dist train backend would still be in charge of movi\u2026", + "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 3, - "conversation_url": "https://github.com/huggingface/transformers/pull/44114", - "created_at": "2026-02-17T21:17:35Z", - "deletions": 1237, + "comments_count": 0, + "conversation_url": "https://github.com/huggingface/transformers/pull/44189", + "created_at": "2026-02-21T00:06:16Z", + "deletions": 1, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44114/files", - "html_url": "https://github.com/huggingface/transformers/pull/44114", + "files_url": "https://github.com/huggingface/transformers/pull/44189/files", + "html_url": "https://github.com/huggingface/transformers/pull/44189", "labels": [], "merged": false, - "number": 44114, + "number": 44189, "review_comments_count": 0, "state": "open", - "title": "Migrate wav2vec2, wav2vec2_conformer, and wav2vec2_bert to standardized output collection decorators", - "updated_at": "2026-02-18T20:34:53Z" + "title": "fix: don't move model to device under other dist train backends", + "updated_at": "2026-02-21T00:06:16Z" }, { - "additions": 5, + "additions": 3, "author": "harshaljanjani", "author_association": "CONTRIBUTOR", - "body_excerpt": "### What does this PR do? The following issue was identified and fixed in this PR: \u2192 Updates the stale `test_device_override` in `test_processing_granite_speech.py` to verify that the device param controls where speech inputs are placed, r\u2026", + "body_excerpt": "### What does this PR do? The following issues were identified and fixed in this PR: \u2192 The NER/token classification issue and the downstream bug uncovered in the batched preprocessing use case with `LayoutLMv2Tokenizer`. \u2192 **Reasoning:** T\u2026", "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 3, - "conversation_url": "https://github.com/huggingface/transformers/pull/44113", - "created_at": "2026-02-17T20:01:32Z", - "deletions": 7, + "comments_count": 5, + "conversation_url": "https://github.com/huggingface/transformers/pull/44187", + "created_at": "2026-02-20T20:02:04Z", + "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44113/files", - "html_url": "https://github.com/huggingface/transformers/pull/44113", + "files_url": "https://github.com/huggingface/transformers/pull/44187/files", + "html_url": "https://github.com/huggingface/transformers/pull/44187", "labels": [], "merged": true, - "number": 44113, - "review_comments_count": 2, + "number": 44187, + "review_comments_count": 0, "state": "closed", - "title": "fix(testing): Update stale device override test in GraniteSpeech", - "updated_at": "2026-02-19T11:24:29Z" + "title": "fix(models): Fix LayoutLMv2 NER crash and broken batched truncation/padding", + "updated_at": "2026-04-18T08:32:02Z" }, { - "additions": 30, - "author": "fumadari", - "author_association": "NONE", - "body_excerpt": "## Summary - Part of #43979 \u2014 refactors `poolformer` to use the `capture_outputs`, `can_return_tuple`, and `merge_with_config_defaults` decorators - Simplifies `PoolFormerLayer` to return a single tensor instead of a 1-tuple - Simplifies `\u2026", - "changed_files": 1, + "additions": 361, + "author": "stevhliu", + "author_association": "MEMBER", + "body_excerpt": "part 1 of refactoring the `Trainer` docs - restructure the `toctree` a bit to accommodate new sections and docs - slim down `trainer.md` to be a clearer entry point (will expand the `## Next steps` section as we continue for better navigat\u2026", + "changed_files": 7, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 4, - "conversation_url": "https://github.com/huggingface/transformers/pull/44111", - "created_at": "2026-02-17T19:38:02Z", - "deletions": 59, + "comments_count": 1, + "conversation_url": "https://github.com/huggingface/transformers/pull/44185", + "created_at": "2026-02-20T19:25:07Z", + "deletions": 578, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44111/files", - "html_url": "https://github.com/huggingface/transformers/pull/44111", + "files_url": "https://github.com/huggingface/transformers/pull/44185/files", + "html_url": "https://github.com/huggingface/transformers/pull/44185", "labels": [], - "merged": false, - "number": 44111, - "review_comments_count": 0, + "merged": true, + "number": 44185, + "review_comments_count": 19, "state": "closed", - "title": "refactor(poolformer): use capture_outputs for output tracing", - "updated_at": "2026-02-18T21:19:22Z" + "title": "[docs] trainer part 1", + "updated_at": "2026-02-24T21:18:42Z" }, { - "additions": 28, - "author": "fumadari", - "author_association": "NONE", - "body_excerpt": "## Summary - Part of #43979 \u2014 refactors `tvp` to use the `capture_outputs`, `can_return_tuple`, and `merge_with_config_defaults` decorators - Simplifies `TvpAttention` to always return `(output, attention_probs)` (hooks decide what to capt\u2026", - "changed_files": 1, + "additions": 191, + "author": "mariam851", + "author_association": "CONTRIBUTOR", + "body_excerpt": "This PR implements the initial architecture for CircuitGPT (based on OpenAI's research), as discussed in #44121. Key implementations: SparseLinear: Custom layer with Top-K weight sparsity logic. CircuitGpt Components: Attention, MLP, and C\u2026", + "changed_files": 3, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 3, - "conversation_url": "https://github.com/huggingface/transformers/pull/44110", - "created_at": "2026-02-17T19:32:55Z", - "deletions": 101, + "comments_count": 0, + "conversation_url": "https://github.com/huggingface/transformers/pull/44184", + "created_at": "2026-02-20T16:58:27Z", + "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44110/files", - "html_url": "https://github.com/huggingface/transformers/pull/44110", + "files_url": "https://github.com/huggingface/transformers/pull/44184/files", + "html_url": "https://github.com/huggingface/transformers/pull/44184", "labels": [], "merged": false, - "number": 44110, + "number": 44184, "review_comments_count": 0, - "state": "closed", - "title": "refactor(tvp): use capture_outputs for output tracing", - "updated_at": "2026-02-18T21:19:24Z" + "state": "open", + "title": "feat: add OpenAI CircuitGPT core architecture and sparse linear layers", + "updated_at": "2026-02-20T17:18:44Z" }, { - "additions": 48, - "author": "fumadari", - "author_association": "NONE", - "body_excerpt": "## Summary - Part of #43979 \u2014 refactors `hgnet_v2` to use the `capture_outputs` and `merge_with_config_defaults` decorators - Simplifies `HGNetV2Encoder` by removing `return_dict` parameter (always returns `BaseModelOutputWithNoAttention`)\u2026", + "additions": 1, + "author": "Rocketknight1", + "author_association": "MEMBER", + "body_excerpt": "Our code has some references to the `grouped_entities` arg to the token classification pipeline, but this is no longer usable. This PR cleans them up entirely! Fixes #44016", "changed_files": 2, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 3, - "conversation_url": "https://github.com/huggingface/transformers/pull/44109", - "created_at": "2026-02-17T19:23:03Z", - "deletions": 87, + "comments_count": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/44182", + "created_at": "2026-02-20T15:28:26Z", + "deletions": 4, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44109/files", - "html_url": "https://github.com/huggingface/transformers/pull/44109", + "files_url": "https://github.com/huggingface/transformers/pull/44182/files", + "html_url": "https://github.com/huggingface/transformers/pull/44182", "labels": [], - "merged": false, - "number": 44109, + "merged": true, + "number": 44182, "review_comments_count": 0, "state": "closed", - "title": "refactor(hgnet_v2): use capture_outputs for output tracing", - "updated_at": "2026-02-18T21:19:25Z" + "title": "Remove refs to grouped_entities", + "updated_at": "2026-02-24T16:07:24Z" }, { - "additions": 33, - "author": "fumadari", - "author_association": "NONE", - "body_excerpt": "## Summary - Adds `@merge_with_config_defaults` and `@capture_outputs` to both `VitDetModel` and `VitDetBackbone`, removing manual `output_attentions`/`return_dict` resolution - Adds `_can_record_outputs = {\"attentions\": VitDetAttention}`\u2026", - "changed_files": 1, + "additions": 898, + "author": "Cyrilvallez", + "author_association": "MEMBER", + "body_excerpt": "# What does this PR do? As per the title! Follow up of https://github.com/huggingface/transformers/pull/44130 and https://github.com/huggingface/transformers/pull/44226. Finally remove the `cache_position` everywhere (not ALL models, but a\u2026", + "changed_files": 169, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 3, - "conversation_url": "https://github.com/huggingface/transformers/pull/44108", - "created_at": "2026-02-17T19:15:00Z", - "deletions": 82, + "comments_count": 11, + "conversation_url": "https://github.com/huggingface/transformers/pull/44181", + "created_at": "2026-02-20T15:24:39Z", + "deletions": 2698, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44108/files", - "html_url": "https://github.com/huggingface/transformers/pull/44108", + "files_url": "https://github.com/huggingface/transformers/pull/44181/files", + "html_url": "https://github.com/huggingface/transformers/pull/44181", "labels": [], - "merged": false, - "number": 44108, - "review_comments_count": 0, + "merged": true, + "number": 44181, + "review_comments_count": 32, "state": "closed", - "title": "refactor(vitdet): use output tracing decorators", - "updated_at": "2026-02-18T21:19:27Z" + "title": "[core] \ud83d\udea8 Completely remove cache positions", + "updated_at": "2026-03-04T18:08:42Z" }, { - "additions": 40, - "author": "fumadari", - "author_association": "NONE", - "body_excerpt": "## Summary - Replaces manual `output_hidden_states`/`return_dict` resolution in `MraModel` with `@merge_with_config_defaults` and `@capture_outputs` decorators - Simplifies `MraEncoder` to a plain loop returning a single tensor, removing `\u2026", + "additions": 28, + "author": "tarekziade", + "author_association": "MEMBER", + "body_excerpt": "# What does this PR do? Fixes a flaky test in IdeficsForVisionText2TextTest::test_generate_continue_from_inputs_embeds. The flakiness can be reproduced with: ``` pytest -q -p no:rerunfailures --flake-finder --flake-runs=20 \\ tests/models/i\u2026", "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 3, - "conversation_url": "https://github.com/huggingface/transformers/pull/44107", - "created_at": "2026-02-17T19:04:42Z", - "deletions": 112, + "comments_count": 10, + "conversation_url": "https://github.com/huggingface/transformers/pull/44180", + "created_at": "2026-02-20T14:30:46Z", + "deletions": 2, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44107/files", - "html_url": "https://github.com/huggingface/transformers/pull/44107", + "files_url": "https://github.com/huggingface/transformers/pull/44180/files", + "html_url": "https://github.com/huggingface/transformers/pull/44180", "labels": [], - "merged": false, - "number": 44107, + "merged": true, + "number": 44180, "review_comments_count": 0, "state": "closed", - "title": "refactor(mra): use output tracing decorators", - "updated_at": "2026-02-18T21:19:29Z" + "title": "fix(flaky): idefics generate cache flake", + "updated_at": "2026-02-26T16:18:18Z" }, { - "additions": 47, - "author": "fumadari", - "author_association": "NONE", - "body_excerpt": "## Summary - Replace manual `hidden_states`/`attentions` collection in `YosoEncoder` with the `@capture_outputs` decorator and forward hooks - Add `@can_return_tuple` to all 5 wrapper model classes, eliminating manual `return_dict` handlin\u2026", + "additions": 27, + "author": "itazap", + "author_association": "MEMBER", + "body_excerpt": "Models with incorrect tokenizer_class in tokenization_config.json that should use TokenziersBackend", "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 3, - "conversation_url": "https://github.com/huggingface/transformers/pull/44106", - "created_at": "2026-02-17T18:59:25Z", - "deletions": 132, + "comments_count": 6, + "conversation_url": "https://github.com/huggingface/transformers/pull/44179", + "created_at": "2026-02-20T13:51:44Z", + "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44106/files", - "html_url": "https://github.com/huggingface/transformers/pull/44106", + "files_url": "https://github.com/huggingface/transformers/pull/44179/files", + "html_url": "https://github.com/huggingface/transformers/pull/44179", "labels": [], - "merged": false, - "number": 44106, - "review_comments_count": 0, + "merged": true, + "number": 44179, + "review_comments_count": 1, "state": "closed", - "title": "Refactor yoso to use automatic output tracing", - "updated_at": "2026-02-18T21:19:30Z" + "title": "Models with incorrect tokenizer_class in tokenization_config.json tha\u2026", + "updated_at": "2026-02-23T08:33:13Z" }, { - "additions": 39, - "author": "fumadari", - "author_association": "NONE", - "body_excerpt": "## Summary - Replace manual `hidden_states`/`attentions` collection in `LiltEncoder` with the `@capture_outputs` decorator and forward hooks - Add `@can_return_tuple` to all 3 wrapper model classes, eliminating manual `return_dict` handlin\u2026", - "changed_files": 1, + "additions": 2993, + "author": "ebezzam", + "author_association": "MEMBER", + "body_excerpt": "# What does this PR do? Re-opening https://github.com/huggingface/transformers/pull/37868 TODO - [x] recompute expected outputs - [x] passthrough code given new conventions - [x] check for unused code paths / configuration parameters Origi\u2026", + "changed_files": 27, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 3, - "conversation_url": "https://github.com/huggingface/transformers/pull/44105", - "created_at": "2026-02-17T18:54:40Z", - "deletions": 127, + "comments_count": 6, + "conversation_url": "https://github.com/huggingface/transformers/pull/44178", + "created_at": "2026-02-20T12:36:21Z", + "deletions": 50, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44105/files", - "html_url": "https://github.com/huggingface/transformers/pull/44105", + "files_url": "https://github.com/huggingface/transformers/pull/44178/files", + "html_url": "https://github.com/huggingface/transformers/pull/44178", "labels": [], "merged": false, - "number": 44105, - "review_comments_count": 0, - "state": "closed", - "title": "Refactor lilt to use automatic output tracing", - "updated_at": "2026-02-18T21:19:32Z" + "number": 44178, + "review_comments_count": 9, + "state": "open", + "title": "Add xcodec2 model", + "updated_at": "2026-04-13T14:26:21Z" }, { - "additions": 66, - "author": "fumadari", - "author_association": "NONE", - "body_excerpt": "## Summary - Replace manual `hidden_states`/`attentions`/`cross_attentions` collection in `MegatronBertEncoder` with the `@capture_outputs` decorator and forward hooks - Add `@can_return_tuple` to all 8 wrapper model classes, eliminating m\u2026", - "changed_files": 1, + "additions": 41, + "author": "vasqu", + "author_association": "MEMBER", + "body_excerpt": "As per title, spiritual successor to #44081 Why? Because as is - Only defaults for fa2/fa3, not on other requested kernels - Limits implementations to one kernel/implementation while I suspect that there will be multiple viable versions (i\u2026", + "changed_files": 7, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 3, - "conversation_url": "https://github.com/huggingface/transformers/pull/44104", - "created_at": "2026-02-17T18:43:44Z", - "deletions": 207, + "comments_count": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/44177", + "created_at": "2026-02-20T12:13:30Z", + "deletions": 71, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44104/files", - "html_url": "https://github.com/huggingface/transformers/pull/44104", + "files_url": "https://github.com/huggingface/transformers/pull/44177/files", + "html_url": "https://github.com/huggingface/transformers/pull/44177", "labels": [], - "merged": false, - "number": 44104, - "review_comments_count": 0, + "merged": true, + "number": 44177, + "review_comments_count": 2, "state": "closed", - "title": "Refactor megatron_bert to use automatic output tracing", - "updated_at": "2026-02-18T21:19:34Z" + "title": "[`Flash Attn`] Enable compatible implementations", + "updated_at": "2026-02-20T12:43:35Z" }, { - "additions": 53, - "author": "engmohamedsalah", - "author_association": "NONE", - "body_excerpt": "Fixes #44052 Now and then, the indexer ran into trouble switching between masks and cache. Most of the test failures came from these hiccups: - Indexer cache: the old if seq_len > 1: reset cache heuristic broke assisted decoding (multi-tok\u2026", - "changed_files": 3, + "additions": 271, + "author": "vasqu", + "author_association": "MEMBER", + "body_excerpt": "Our kernel loading is incompatible with the original packages as they do not expose the same import structure: - Kernels seem to expose things in the init (and not in the original path) - Original packages seem to expose only within their\u2026", + "changed_files": 14, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/44103", - "created_at": "2026-02-17T18:04:48Z", - "deletions": 76, + "comments_count": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/44176", + "created_at": "2026-02-20T11:36:01Z", + "deletions": 124, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44103/files", - "html_url": "https://github.com/huggingface/transformers/pull/44103", + "files_url": "https://github.com/huggingface/transformers/pull/44176/files", + "html_url": "https://github.com/huggingface/transformers/pull/44176", "labels": [], - "merged": false, - "number": 44103, - "review_comments_count": 0, + "merged": true, + "number": 44176, + "review_comments_count": 2, "state": "closed", - "title": "Fix glm_moe_dsa", - "updated_at": "2026-02-18T19:38:11Z" + "title": "[`Mamba`] Fix kernel loading", + "updated_at": "2026-02-20T16:19:06Z" }, { - "additions": 42, - "author": "fumadari", - "author_association": "NONE", - "body_excerpt": "## Summary Refactors the `ibert` model to use the new `@capture_outputs` and `@can_return_tuple` decorators for output tracing, as part of the meta-issue #43979. **Key changes:** - Added `_can_record_outputs = {\"hidden_states\": IBertLayer,\u2026", + "additions": 1, + "author": "itazap", + "author_association": "MEMBER", + "body_excerpt": "add jamba tokenizer mapping to PreTrainedTokenizerFast (v4/v5 BC) for vllm: https://buildkite.com/vllm/ci/builds/52260/steps/canvas?sid=019c76ad-c8f2-4e59-a2f4-5f3b5bbc204c&tab=output", "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 3, - "conversation_url": "https://github.com/huggingface/transformers/pull/44102", - "created_at": "2026-02-17T17:21:32Z", - "deletions": 154, + "comments_count": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/44175", + "created_at": "2026-02-20T11:00:18Z", + "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44102/files", - "html_url": "https://github.com/huggingface/transformers/pull/44102", + "files_url": "https://github.com/huggingface/transformers/pull/44175/files", + "html_url": "https://github.com/huggingface/transformers/pull/44175", "labels": [], "merged": false, - "number": 44102, + "number": 44175, "review_comments_count": 0, "state": "closed", - "title": "Refactor ibert output tracing with capture_outputs", - "updated_at": "2026-02-18T21:19:35Z" + "title": "add jamba tokenizer mapping to PreTrainedTokenizerFast (v4/v5 BC)", + "updated_at": "2026-02-20T16:19:31Z" }, { - "additions": 210, - "author": "aman-coder03", - "author_association": "FIRST_TIME_CONTRIBUTOR", - "body_excerpt": "## What does this PR do? This PR refactors XLM's output tracing to align with the standardized output capturing patterns used across the codebase. ### Key changes: - Refactors transformer blocks into a dedicated `XLMLayer` module to enable\u2026", - "changed_files": 2, + "additions": 1367, + "author": "tarekziade", + "author_association": "MEMBER", + "body_excerpt": "This draft expands `utils/check_modeling_structure.py` into a rule-driven linter for model code, with new checks and tests, while keeping runtime very low. Key features: - The checker is intentionally AST-only (no heavy imports/execution),\u2026", + "changed_files": 4, "cluster_id": null, "cluster_ids": [], - "cluster_role": null, - "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/44101", - "created_at": "2026-02-17T17:15:06Z", - "deletions": 194, + "cluster_role": null, + "comments_count": 8, + "conversation_url": "https://github.com/huggingface/transformers/pull/44174", + "created_at": "2026-02-20T10:38:11Z", + "deletions": 24, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44101/files", - "html_url": "https://github.com/huggingface/transformers/pull/44101", + "files_url": "https://github.com/huggingface/transformers/pull/44174/files", + "html_url": "https://github.com/huggingface/transformers/pull/44174", "labels": [], - "merged": false, - "number": 44101, - "review_comments_count": 0, - "state": "open", - "title": "[XLM] Refactor output tracing to align with capture_outputs standardized architecture", - "updated_at": "2026-02-19T08:08:33Z" + "merged": true, + "number": 44174, + "review_comments_count": 38, + "state": "closed", + "title": "Expand model-structure lint rules with a fast AST-based, ruff-like framework", + "updated_at": "2026-03-12T06:42:21Z" }, { - "additions": 3, - "author": "qgallouedec", + "additions": 20, + "author": "tarekziade", "author_association": "MEMBER", - "body_excerpt": "In https://github.com/huggingface/trl/pull/5112 a user reported that `trl sft --help` fails It's because three inherited args from `TrainingArguments` (`torch_empty_cache_steps`, `gradient_checkpointing` and `use_liger_kernel`)help strings\u2026", + "body_excerpt": "Fixes flaky GLM OCR generation behavior when 2D `position_ids` are passed explicitly. Reproducible locally with: ``` pytest tests/models/glm_ocr/test_modeling_glm_ocr.py::GlmOcrModelTest::test_generate_with_and_without_position_ids --flake\u2026", "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 3, - "conversation_url": "https://github.com/huggingface/transformers/pull/44100", - "created_at": "2026-02-17T17:10:36Z", - "deletions": 3, + "comments_count": 14, + "conversation_url": "https://github.com/huggingface/transformers/pull/44173", + "created_at": "2026-02-20T09:28:48Z", + "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44100/files", - "html_url": "https://github.com/huggingface/transformers/pull/44100", + "files_url": "https://github.com/huggingface/transformers/pull/44173/files", + "html_url": "https://github.com/huggingface/transformers/pull/44173", "labels": [], "merged": true, - "number": 44100, - "review_comments_count": 0, + "number": 44173, + "review_comments_count": 10, "state": "closed", - "title": "Fix percentage formatting in help messages for gradient checkpointing, Liger Kernel, and empty cache steps", - "updated_at": "2026-02-20T09:57:51Z" + "title": "fix(flaky): `test_generate_with_and_without_position_ids` in GLM ORC", + "updated_at": "2026-02-20T19:06:19Z" }, { "additions": 2, - "author": "qgallouedec", + "author": "tarekziade", "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? \"\". Then we compare `\"\" != \"LlamaTokenizer\"` (the `tokenizer_class` in `tokenizer_config.json`). Since that's true we earl\u2026", + "changed_files": 3, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/44055", - "created_at": "2026-02-16T18:26:43Z", - "deletions": 3, + "comments_count": 5, + "conversation_url": "https://github.com/huggingface/transformers/pull/44127", + "created_at": "2026-02-18T10:41:48Z", + "deletions": 8, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44055/files", - "html_url": "https://github.com/huggingface/transformers/pull/44055", + "files_url": "https://github.com/huggingface/transformers/pull/44127/files", + "html_url": "https://github.com/huggingface/transformers/pull/44127", "labels": [], "merged": true, - "number": 44055, + "number": 44127, "review_comments_count": 0, "state": "closed", - "title": "Fix unprotected torch import", - "updated_at": "2026-02-16T18:43:01Z" - }, - { - "additions": 346, - "author": "ArthurZucker", - "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? Add flash MLA interface. - It does not work I get a segfault - we don't leverage the paged cache so it's not as efficient as that I reckon. ```bash Fetching 6 files: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2026", - "changed_files": 10, - "cluster_id": null, - "cluster_ids": [], - "cluster_role": null, - "comments_count": 3, - "conversation_url": "https://github.com/huggingface/transformers/pull/44054", - "created_at": "2026-02-16T18:07:14Z", - "deletions": 93, - "draft": true, - "files_url": "https://github.com/huggingface/transformers/pull/44054/files", - "html_url": "https://github.com/huggingface/transformers/pull/44054", - "labels": [], - "merged": false, - "number": 44054, - "review_comments_count": 0, - "state": "open", - "title": "Flash mla interface", - "updated_at": "2026-02-20T11:14:39Z" + "title": "AutoTokenizer ignores config when model_type is None", + "updated_at": "2026-02-18T14:47:52Z" }, { - "additions": 2, + "additions": 17, "author": "Cyrilvallez", "author_association": "MEMBER", - "body_excerpt": "# What does this PR do?", - "changed_files": 1, + "body_excerpt": "# What does this PR do? As per the title. Let's simplify after https://github.com/huggingface/transformers/pull/42848", + "changed_files": 2, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 3, - "conversation_url": "https://github.com/huggingface/transformers/pull/44053", - "created_at": "2026-02-16T17:59:48Z", - "deletions": 2, + "comments_count": 1, + "conversation_url": "https://github.com/huggingface/transformers/pull/44126", + "created_at": "2026-02-18T09:58:49Z", + "deletions": 40, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44053/files", - "html_url": "https://github.com/huggingface/transformers/pull/44053", + "files_url": "https://github.com/huggingface/transformers/pull/44126/files", + "html_url": "https://github.com/huggingface/transformers/pull/44126", "labels": [], "merged": true, - "number": 44053, + "number": 44126, "review_comments_count": 0, "state": "closed", - "title": "Fix peft conversion typo", - "updated_at": "2026-02-17T11:12:19Z" + "title": "Simplify input preparation in generate", + "updated_at": "2026-02-18T10:30:48Z" }, { - "additions": 2, - "author": "tomaarsen", + "additions": 8, + "author": "zucchini-nlp", "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? * Fix incorrect forward type hint for Gemma3n ## Details The type hint didn't match the actual returned class: https://github.com/huggingface/transformers/blob/349e00c1a367ce263624e525038250625dcf20c7/src/transforme\u2026", - "changed_files": 2, + "body_excerpt": "# What does this PR do? Fixes https://github.com/huggingface/transformers/issues/43986", + "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/44051", - "created_at": "2026-02-16T17:26:24Z", - "deletions": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/44125", + "created_at": "2026-02-18T09:34:54Z", + "deletions": 7, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44051/files", - "html_url": "https://github.com/huggingface/transformers/pull/44051", + "files_url": "https://github.com/huggingface/transformers/pull/44125/files", + "html_url": "https://github.com/huggingface/transformers/pull/44125", "labels": [], "merged": true, - "number": 44051, - "review_comments_count": 0, + "number": 44125, + "review_comments_count": 2, "state": "closed", - "title": "[`chore`] Fix incorrect forward type hint for Gemma3n", - "updated_at": "2026-02-20T09:08:07Z" + "title": "Raise informative error when loading video processors", + "updated_at": "2026-02-20T08:23:35Z" }, { - "additions": 15, - "author": "tomaarsen", - "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? * Patch `get_text_features` for ChineseCLIP ### Details The `get_text_features` assumes that the `text_model` returns a `BaseModelOutputWithPooling`, just like is done with many other models. Currently, the `get_tex\u2026", - "changed_files": 7, + "additions": 10, + "author": "mariam851", + "author_association": "CONTRIBUTOR", + "body_excerpt": "Description: Adds eval_on_end to TrainingArguments to force evaluation at the end of training, even if the last step doesn't align with eval_steps. Changes: training_args.py: Added eval_on_end field. trainer.py: Added logic to call evaluat\u2026", + "changed_files": 2, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/44050", - "created_at": "2026-02-16T17:23:31Z", - "deletions": 19, + "comments_count": 0, + "conversation_url": "https://github.com/huggingface/transformers/pull/44124", + "created_at": "2026-02-18T08:52:23Z", + "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44050/files", - "html_url": "https://github.com/huggingface/transformers/pull/44050", + "files_url": "https://github.com/huggingface/transformers/pull/44124/files", + "html_url": "https://github.com/huggingface/transformers/pull/44124", "labels": [], "merged": false, - "number": 44050, - "review_comments_count": 4, + "number": 44124, + "review_comments_count": 0, "state": "closed", - "title": "[`fix`] Patch `get_text_features` for ChineseCLIP", - "updated_at": "2026-02-17T09:55:17Z" + "title": "feat: add eval_on_end to Trainer for final evaluation", + "updated_at": "2026-02-18T14:14:16Z" }, { - "additions": 59, - "author": "ManasVardhan", - "author_association": "NONE", - "body_excerpt": "## What does this PR do? Refactors the `fnet` model to use the new `@capture_outputs` and `@can_return_tuple` decorators, as part of #43979. ### Changes: - Added `_can_record_outputs = {\"hidden_states\": FNetLayer}` to `FNetPreTrainedModel`\u2026", + "additions": 33, + "author": "cyyever", + "author_association": "CONTRIBUTOR", + "body_excerpt": "# What does this PR do? This PR avoids device sync in training loss accumulation by ```torch.where```. The `is_torch_xla_available` condition is also removed.", "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/44049", - "created_at": "2026-02-16T17:19:04Z", - "deletions": 112, + "conversation_url": "https://github.com/huggingface/transformers/pull/44123", + "created_at": "2026-02-18T08:22:57Z", + "deletions": 22, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44049/files", - "html_url": "https://github.com/huggingface/transformers/pull/44049", + "files_url": "https://github.com/huggingface/transformers/pull/44123/files", + "html_url": "https://github.com/huggingface/transformers/pull/44123", "labels": [], "merged": false, - "number": 44049, + "number": 44123, "review_comments_count": 0, - "state": "closed", - "title": "Refactor fnet model to use @capture_outputs and @can_return_tuple decorators", - "updated_at": "2026-03-03T00:30:13Z" + "state": "open", + "title": "Avoid device sync in training loss accumulation", + "updated_at": "2026-03-30T07:57:16Z" }, { - "additions": 4, - "author": "tomaarsen", - "author_association": "MEMBER", - "body_excerpt": "# What does this PR do? * Fix up `__repr__` whitespace/brackets ## Reproducer ```python from transformers import AutoTokenizer, PreTrainedTokenizerBase # __repr__ via PreTrainedTokenizerBase tokenizer = AutoTokenizer.from_pretrained(\"bert-\u2026", - "changed_files": 2, + "additions": 158, + "author": "adityuhkapoor", + "author_association": "NONE", + "body_excerpt": "# What does this PR do? Adds 4-bit embedding quantization for BitsAndBytes, mirroring TorchAO's existing `include_input_output_embeddings` and `untie_embedding_weights` pattern (PRs #37802, #37905, #37935). Large-vocabulary models (Llama 3\u2026", + "changed_files": 4, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/44048", - "created_at": "2026-02-16T17:18:10Z", + "comments_count": 2, + "conversation_url": "https://github.com/huggingface/transformers/pull/44122", + "created_at": "2026-02-18T06:35:09Z", "deletions": 2, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44048/files", - "html_url": "https://github.com/huggingface/transformers/pull/44048", - "labels": [], - "merged": true, - "number": 44048, + "files_url": "https://github.com/huggingface/transformers/pull/44122/files", + "html_url": "https://github.com/huggingface/transformers/pull/44122", + "labels": [ + "Code agent slop" + ], + "merged": false, + "number": 44122, "review_comments_count": 0, "state": "closed", - "title": "[`simple`] Fix up `__repr__` whitespace/brackets", - "updated_at": "2026-02-20T10:03:34Z" + "title": "Add BnB 4-bit embedding quantization support", + "updated_at": "2026-02-18T14:27:25Z" }, { - "additions": 35, - "author": "ManasVardhan", + "additions": 14, + "author": "tirth8205", "author_association": "NONE", - "body_excerpt": "## What does this PR do? Refactors the `bloom` model to use the new `@capture_outputs` and `@can_return_tuple` decorators, as part of the effort in #43979. ### Changes: - Add `_can_record_outputs` dict to `BloomPreTrainedModel` mapping `hi\u2026", + "body_excerpt": "Fixes #34920 After applying `normalize()`, images can have negative values. Calling `resize()` on such images fails because it internally converts to PIL, which requires values in [0, 1] or [0, 255]. ### Fix When the image has values outsi\u2026", "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, - "comments_count": 1, - "conversation_url": "https://github.com/huggingface/transformers/pull/44047", - "created_at": "2026-02-16T17:15:25Z", - "deletions": 104, + "comments_count": 0, + "conversation_url": "https://github.com/huggingface/transformers/pull/44120", + "created_at": "2026-02-17T23:56:48Z", + "deletions": 0, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44047/files", - "html_url": "https://github.com/huggingface/transformers/pull/44047", - "labels": [], + "files_url": "https://github.com/huggingface/transformers/pull/44120/files", + "html_url": "https://github.com/huggingface/transformers/pull/44120", + "labels": [ + "Code agent slop" + ], "merged": false, - "number": 44047, + "number": 44120, "review_comments_count": 0, "state": "closed", - "title": "Refactor bloom model to use @capture_outputs and @can_return_tuple decorators", - "updated_at": "2026-03-03T00:30:14Z" + "title": "fix: allow image_transforms.resize to handle negative values after normalization", + "updated_at": "2026-02-18T14:08:54Z" }, { - "additions": 24, - "author": "ManasVardhan", + "additions": 1, + "author": "tirth8205", "author_association": "NONE", - "body_excerpt": "## What does this PR do? Refactors the `codegen` model to use the `@capture_outputs` and `@can_return_tuple` decorators, replacing manual `output_attentions`/`output_hidden_states`/`return_dict` handling. ### Changes: - Add `_can_record_ou\u2026", + "body_excerpt": "Fixes #44117 `TOKENIZER_MAPPING_NAMES.get(config_model_type, \"\")` returns `None` when the key exists with value `None`, causing `AttributeError: 'NoneType' object has no attribute 'replace'` when loading models like `google/siglip2-so400m-\u2026", "changed_files": 1, "cluster_id": null, "cluster_ids": [], "cluster_role": null, "comments_count": 2, - "conversation_url": "https://github.com/huggingface/transformers/pull/44046", - "created_at": "2026-02-16T17:07:38Z", - "deletions": 70, + "conversation_url": "https://github.com/huggingface/transformers/pull/44119", + "created_at": "2026-02-17T23:53:20Z", + "deletions": 1, "draft": false, - "files_url": "https://github.com/huggingface/transformers/pull/44046/files", - "html_url": "https://github.com/huggingface/transformers/pull/44046", + "files_url": "https://github.com/huggingface/transformers/pull/44119/files", + "html_url": "https://github.com/huggingface/transformers/pull/44119", "labels": [], "merged": false, - "number": 44046, + "number": 44119, "review_comments_count": 0, "state": "closed", - "title": "Refactor codegen model to use @capture_outputs and @can_return_tuple decorators", - "updated_at": "2026-02-17T14:15:23Z" + "title": "fix: handle None value from TOKENIZER_MAPPING_NAMES.get() in AutoTokenizer", + "updated_at": "2026-02-18T14:04:47Z" }, { - "additions": 456215, - "author": "ArthurZucker", - "author_association": "MEMBER", - "body_excerpt": "# What does this PR do?