JinghuiLuAstronaut committed 10 days ago

Commit

8b31547

verified ·

1 Parent(s): 76bde08

Add files using upload-large-folder tool

Browse files

Files changed (50) hide show

LTA_openwebtext_dualt/experiments/nanogpt_tinyshakespeare_char/runs/char_ar_lta_4gpu_5k_20260507.pid +1 -0
LTA_openwebtext_dualt/experiments/nanogpt_tinyshakespeare_char/runs/char_ar_lta_4gpu_5k_20260507_lta.pid +1 -0
LTA_openwebtext_dualt/experiments/nanogpt_tinyshakespeare_char/runs/char_ar_lta_4gpu_5k_20260507_lta_rerun.log +61 -0
LTA_openwebtext_dualt/experiments/nanogpt_tinyshakespeare_char/sample_fully_coupled.py +146 -0
LTA_openwebtext_dualt/experiments/nanogpt_tinyshakespeare_char/train_char.py +618 -0
LTA_openwebtext_dualt/logs/elfopt_4gpu_debug_20260513/lta_owt_fast10k_len1024_elfopt_muon_ema_ddit768x12_4gpu_5epoch_20260513.log +94 -0
LTA_openwebtext_dualt/logs/elfopt_4gpu_debug_20260513/lta_owt_fast10k_len1024_elfopt_muon_ema_ddit768x12_4gpu_5epoch_20260513_trace.log +1 -0
LTA_openwebtext_dualt/logs/lm1b_classic_repro_infer_watch/infer_lta_lm1b_classic_c1024_fullvocab_len128_repro_save10k_gbs512_4gpu_1m_20260520_231005_step_0010000_t1p45.log +68 -0
LTA_openwebtext_dualt/logs/lm1b_classic_repro_infer_watch/infer_lta_lm1b_classic_c1024_fullvocab_len128_repro_save10k_gbs512_4gpu_1m_20260520_231005_step_0020000_t1p45.log +68 -0
LTA_openwebtext_dualt/logs/lm1b_classic_repro_infer_watch/infer_lta_lm1b_classic_c1024_fullvocab_len128_repro_save10k_gbs512_4gpu_1m_20260520_231005_step_0030000_t1p45.log +68 -0
LTA_openwebtext_dualt/logs/lm1b_classic_repro_infer_watch/processed_lta_lm1b_classic_c1024_fullvocab_len128_repro_save10k_gbs512_4gpu_1m_20260520_231005_steps128_c1024_t1p45_n1024.txt +4 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/bin/activate.bat +71 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/bin/activate.fish +124 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/bin/f2py +10 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/bin/pydoc.bat +22 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/audio_utils.py +1254 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/cli/__init__.py +13 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/cli/add_new_model_like.py +790 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/cli/chat.py +673 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/cli/download.py +40 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/cli/serve.py +241 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/cli/system.py +139 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/cli/transformers.py +41 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/distributed/__init__.py +33 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/distributed/configuration_utils.py +110 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/hyperparameter_search.py +123 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/image_transforms.py +1073 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/gemma3/__init__.py +30 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/gemma3/configuration_gemma3.py +225 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/gemma3/image_processing_gemma3.py +250 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/gemma3/image_processing_pil_gemma3.py +225 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/gemma3/modeling_gemma3.py +1118 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/gemma3/modular_gemma3.py +941 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/gemma3/processing_gemma3.py +165 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/youtu/__init__.py +27 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/youtu/configuration_youtu.py +107 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/youtu/modeling_youtu.py +607 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/youtu/modular_youtu.py +151 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/testing_utils.py +0 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/training_args.py +0 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/runs/owt_t5_elftokenized_full_len1024_C1_to_1024_pow1_d768_l12_h12_gbs512_2x8gpu_50ep_lr3e3_ema0p9999_elfopt_not5_bottleneck16_unfixed_norm_stateprobadd_selfcond_ce_fast_20260612_030202/step_070000.pt +3 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/runs/owt_t5_elftokenized_full_len1024_C1_to_1024_pow1_d768_l12_h12_gbs512_2x8gpu_50ep_lr3e3_ema0p9999_elfopt_not5_bottleneck16_unfixed_norm_stateprobadd_selfcond_ce_fast_20260612_030202/step_078000.pt +3 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/runs/owt_t5_elftokenized_full_len1024_C1_to_1024_pow1_d768_l12_h12_gbs512_2x8gpu_50ep_lr3e3_ema0p9999_elfopt_not5_bottleneck16_unfixed_norm_stateprobadd_selfcond_ce_fast_20260612_030202/step_079000.pt +3 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/runs/owt_t5_elftokenized_full_len1024_C1_to_1024_pow1_d768_l12_h12_gbs512_2x8gpu_50ep_lr3e3_ema0p9999_elfopt_not5_bottleneck16_unfixed_norm_stateprobadd_selfcond_ce_fast_20260612_030202/step_343000.pt +3 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/runs/owt_t5_elftokenized_full_len1024_C1_to_1024_pow1_d768_l12_h12_gbs512_2x8gpu_50ep_lr3e3_ema0p9999_elfopt_not5_bottleneck16_unfixed_norm_stateprobadd_selfcond_ce_fast_20260612_030202/step_352000.pt +3 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/runs/owt_t5_elftokenized_full_len1024_C1_to_1024_pow1_d768_l12_h12_gbs512_2x8gpu_50ep_lr3e3_ema0p9999_elfopt_not5_bottleneck16_unfixed_norm_stateprobadd_selfcond_ce_fast_20260612_030202/step_390000.pt +3 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/runs/owt_t5_elftokenized_full_len1024_C1_to_1024_pow1_d768_l12_h12_gbs512_2x8gpu_50ep_lr3e3_ema0p9999_elfopt_not5_bottleneck16_unfixed_norm_stateprobadd_selfcond_ce_fast_20260612_030202/step_433000.pt +3 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/runs/owt_t5_elftokenized_full_len1024_C1_to_1024_pow1_d768_l12_h12_gbs512_2x8gpu_50ep_lr3e3_ema0p9999_elfopt_not5_bottleneck16_unfixed_norm_stateprobadd_selfcond_ce_fast_20260612_030202/step_471000.pt +3 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/runs/owt_t5_elftokenized_full_len1024_C1_to_1024_pow1_d768_l12_h12_gbs512_2x8gpu_50ep_lr3e3_ema0p9999_elfopt_not5_bottleneck16_unfixed_norm_stateprobadd_selfcond_ce_fast_20260612_030202/step_565000.pt +3 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/runs/owt_t5_elftokenized_full_len1024_C1_to_1024_pow1_d768_l12_h12_gbs512_2x8gpu_50ep_lr3e3_ema0p9999_elfopt_not5_bottleneck16_unfixed_norm_stateprobadd_selfcond_ce_fast_20260612_030202/step_571000.pt +3 -0

LTA_openwebtext_dualt/experiments/nanogpt_tinyshakespeare_char/runs/char_ar_lta_4gpu_5k_20260507.pid ADDED Viewed

	@@ -0,0 +1 @@


1	+ 354158

LTA_openwebtext_dualt/experiments/nanogpt_tinyshakespeare_char/runs/char_ar_lta_4gpu_5k_20260507_lta.pid ADDED Viewed

	@@ -0,0 +1 @@


1	+ 354493

LTA_openwebtext_dualt/experiments/nanogpt_tinyshakespeare_char/runs/char_ar_lta_4gpu_5k_20260507_lta_rerun.log ADDED Viewed

	@@ -0,0 +1,61 @@

+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+[setup] device=cuda:0 rank=0 world_size=4
+[rank2]:[W507 18:31:20.830802063 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 2]  using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+[rank0]:[W507 18:31:20.831852183 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 0]  using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+[rank3]:[W507 18:31:20.833351576 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 3]  using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+[rank1]:[W507 18:31:20.834403717 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 1]  using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+[data] chars=1115394 vocab=65 train=1003854 val=111540
+[lta] step=1 loss=4.2253 elapsed=0.6s
+[lta] step=100 loss=1.8800 elapsed=1.6s
+[lta] step=200 loss=1.7154 elapsed=2.6s
+[lta] step=300 loss=1.8433 elapsed=3.7s
+[lta] step=400 loss=1.5804 elapsed=4.7s
+[lta] step=500 loss=1.3491 elapsed=5.7s
+[lta] step=600 loss=1.5344 elapsed=7.2s
+[lta] step=700 loss=1.2540 elapsed=8.3s
+[lta] step=800 loss=1.3757 elapsed=9.3s
+[lta] step=900 loss=1.4476 elapsed=10.3s
+[lta] step=1000 loss=1.4170 elapsed=11.3s
+[lta] step=1100 loss=1.5610 elapsed=12.8s
+[lta] step=1200 loss=1.4933 elapsed=13.8s
+[lta] step=1300 loss=1.5656 elapsed=14.9s
+[lta] step=1400 loss=1.5198 elapsed=15.9s
+[lta] step=1500 loss=1.4798 elapsed=17.0s
+[lta] step=1600 loss=1.5783 elapsed=18.5s
+[lta] step=1700 loss=1.1984 elapsed=19.5s
+[lta] step=1800 loss=1.2941 elapsed=20.5s
+[lta] step=1900 loss=1.5220 elapsed=21.5s
+[lta] step=2000 loss=1.2615 elapsed=22.6s
+[lta] step=2100 loss=1.3370 elapsed=24.1s
+[lta] step=2200 loss=1.1854 elapsed=25.1s
+[lta] step=2300 loss=0.9726 elapsed=26.1s
+[lta] step=2400 loss=1.4613 elapsed=27.1s
+[lta] step=2500 loss=1.3016 elapsed=28.2s
+[lta] step=2600 loss=1.3408 elapsed=29.7s
+[lta] step=2700 loss=1.3022 elapsed=30.7s
+[lta] step=2800 loss=1.4492 elapsed=31.7s
+[lta] step=2900 loss=1.1530 elapsed=32.7s
+[lta] step=3000 loss=1.4642 elapsed=33.8s
+[lta] step=3100 loss=1.2645 elapsed=35.3s
+[lta] step=3200 loss=1.4777 elapsed=36.3s
+[lta] step=3300 loss=1.0923 elapsed=37.4s
+[lta] step=3400 loss=1.1992 elapsed=38.5s
+[lta] step=3500 loss=1.4760 elapsed=39.6s
+[lta] step=3600 loss=1.5702 elapsed=41.0s
+[lta] step=3700 loss=1.5327 elapsed=42.1s
+[lta] step=3800 loss=1.5319 elapsed=43.1s
+[lta] step=3900 loss=1.3098 elapsed=44.3s
+[lta] step=4000 loss=1.6050 elapsed=45.3s
+[lta] step=4100 loss=1.2478 elapsed=46.8s
+[lta] step=4200 loss=1.3497 elapsed=47.8s
+[lta] step=4300 loss=1.3263 elapsed=48.8s
+[lta] step=4400 loss=1.3406 elapsed=49.9s
+[lta] step=4500 loss=1.3295 elapsed=50.9s
+[lta] step=4600 loss=1.5340 elapsed=52.4s
+[lta] step=4700 loss=1.4847 elapsed=53.4s
+[lta] step=4800 loss=1.1464 elapsed=54.4s
+[lta] step=4900 loss=1.4102 elapsed=55.5s
+[lta] step=5000 loss=1.4638 elapsed=56.5s

LTA_openwebtext_dualt/experiments/nanogpt_tinyshakespeare_char/sample_fully_coupled.py ADDED Viewed

	@@ -0,0 +1,146 @@

+#!/usr/bin/env python3
+from __future__ import annotations
+import argparse
+import csv
+import json
+import math
+import sys
+from pathlib import Path
+import torch
+import torch.nn.functional as F
+SCRIPT_DIR = Path(__file__).resolve().parent
+if str(SCRIPT_DIR) not in sys.path:
+    sys.path.insert(0, str(SCRIPT_DIR))
+from train_char import CharTokenizer, ModelConfig, TinyTransformer, standard_gamma, text_stats
def pick_device(name: str) -> torch.device:
    """Resolve a device name into a ``torch.device``.

    Any value other than ``"auto"`` is handed straight to ``torch.device``.
    ``"auto"`` prefers CUDA, then MPS (when this torch build exposes
    ``torch.backends.mps``), and finally falls back to CPU.
    """
    if name != "auto":
        return torch.device(name)
    if torch.cuda.is_available():
        chosen = "cuda"
    elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
        chosen = "mps"
    else:
        chosen = "cpu"
    return torch.device(chosen)
@torch.no_grad()
def decode(
    model: TinyTransformer,
    tokenizer: CharTokenizer,
    *,
    length: int,
    steps: int,
    c_min: float,
    c_max: float,
    temp: float,
    final_from: str,
    seed: int,
    device: torch.device,
) -> str:
    """Sample one string by iteratively refining a per-position simplex state.

    The state starts as a normalized gamma draw with concentration
    ``1 / vocab_size`` per entry.  Each of the ``steps`` iterations feeds the
    state and the time ``(step + 1) / steps`` to ``model``, turns the
    temperature-scaled logits into an ``endpoint`` distribution, blends state,
    endpoint and the uniform distribution into a mean, and redraws the state
    from a gamma with that mean times a concentration that is interpolated
    log-linearly from ``c_min`` to ``c_max``.

    Args:
        model: Network called as ``model(probs, t)`` and returning logits.
        tokenizer: Supplies ``vocab_size`` and ``decode``.
        length: Number of positions to generate.
        steps: Number of refinement iterations.
        c_min: Concentration at time 0 (must be positive for ``math.log``).
        c_max: Concentration at time 1.
        temp: Divisor applied to the logits before the softmax.
        final_from: ``"state"`` (last sampled state), ``"endpoint"`` (last
            model prediction) or ``"blend"`` (equal mix of both).
        seed: Value passed to ``torch.manual_seed`` before sampling.
        device: Device on which the tensors are created.

    Returns:
        The decoded argmax token sequence.

    Raises:
        ValueError: If ``final_from`` is not one of the three accepted values.
    """
    # Fail before running the (potentially long) sampling loop rather than after it.
    if final_from not in ("state", "endpoint", "blend"):
        raise ValueError(final_from)
    # Running under no_grad (like generate_lta in train_char.py) keeps autograd
    # from recording every model call of the sampling loop.
    torch.manual_seed(seed)
    eps = 1e-8
    vocab_size = tokenizer.vocab_size
    alpha = torch.full((1, length, vocab_size), 1.0 / vocab_size, device=device).clamp_min(eps)
    probs = standard_gamma(alpha).clamp_min(eps)
    probs = probs / probs.sum(dim=-1, keepdim=True).clamp_min(eps)
    # With steps == 0 the "endpoint" is simply the initial state.
    last_endpoint = probs
    for step in range(steps):
        t_value = (step + 1) / max(steps, 1)
        t = torch.full((1,), t_value, device=device)
        logits = model(probs, t) / temp
        endpoint = F.softmax(logits, dim=-1)
        last_endpoint = endpoint
        # Both schedules follow the same clock in this sampler.
        support_t = t_value
        semantic_t = t_value
        forward_endpoint = (1.0 - semantic_t) * probs + semantic_t * endpoint
        mean = (1.0 - support_t) / float(vocab_size) + support_t * forward_endpoint
        mean = mean.clamp_min(eps)
        mean = mean / mean.sum(dim=-1, keepdim=True).clamp_min(eps)
        conc = math.exp(math.log(c_min) + support_t * math.log(c_max / c_min))
        sample = standard_gamma((mean * conc).clamp_min(eps)).clamp_min(eps)
        probs = sample / sample.sum(dim=-1, keepdim=True).clamp_min(eps)
    if final_from == "state":
        final = probs
    elif final_from == "endpoint":
        final = last_endpoint
    else:  # "blend"
        final = 0.5 * probs + 0.5 * last_endpoint
    ids = final.argmax(dim=-1)[0]
    return tokenizer.decode(ids)
def main() -> None:
    """Sweep sampler settings for one checkpoint and write the results.

    Loads the checkpoint, rebuilds the tokenizer and model from it, then runs
    ``decode`` for every combination of step count, maximum concentration,
    temperature and final-readout mode (4 * 4 * 4 * 3 = 192 runs).  Each sample
    is written to ``<out_dir>/<name>.txt`` and the per-sample statistics are
    collected in ``<out_dir>/summary.tsv``.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--checkpoint", required=True)
    p.add_argument("--out_dir", required=True)
    p.add_argument("--length", type=int, default=128)
    p.add_argument("--seed", type=int, default=20260507)
    p.add_argument("--device", default="auto")
    args = p.parse_args()
    device = pick_device(args.device)
    print(f"[setup] device={device}", flush=True)
    # NOTE(review): weights_only=False unpickles arbitrary objects; only point
    # --checkpoint at files you trust.
    ckpt = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
    cfg = ModelConfig(**ckpt["model_config"])
    tok_data = ckpt["extra"]["tokenizer"]
    # Build a tokenizer from the stored alphabet, then overwrite its tables with
    # the exact ones saved in the checkpoint.
    tokenizer = CharTokenizer("".join(tok_data["itos"]))
    tokenizer.itos = tok_data["itos"]
    tokenizer.stoi = tok_data["stoi"]
    tokenizer.vocab_size = tok_data["vocab_size"]
    model = TinyTransformer(cfg).to(device)
    model.load_state_dict(ckpt["model"])
    model.eval()
    configs = [
        (steps, c_max, temp, final_from)
        for steps in [128, 256, 512, 1024]
        for c_max in [64.0, 16.0, 4.0, 1.0]
        for temp in [0.8, 1.0, 1.3, 1.8]
        for final_from in ["state", "endpoint", "blend"]
    ]
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    rows = []
    for i, (steps, c_max, temp, final_from) in enumerate(configs):
        name = f"steps{steps}_c{c_max:g}_temp{str(temp).replace('.', 'p')}_{final_from}"
        text = decode(
            model,
            tokenizer,
            length=args.length,
            steps=steps,
            c_min=1.0,
            c_max=c_max,
            temp=temp,
            final_from=final_from,
            seed=args.seed,
            device=device,
        )
        stats = text_stats(text)
        row = {
            "name": name,
            "steps": steps,
            "c_max": c_max,
            "temp": temp,
            "final_from": final_from,
            **stats,
        }
        rows.append(row)
        (out_dir / f"{name}.txt").write_text(text, encoding="utf-8")
        # Print a progress line for every 12th configuration only.
        if i % 12 == 0:
            print("[sample]", row, flush=True)
    keys = list(rows[0].keys())
    # The csv module expects files opened with newline="" so that it alone
    # controls line endings (otherwise "\r\r\n" can appear on Windows).
    with (out_dir / "summary.tsv").open("w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=keys, delimiter="\t")
        writer.writeheader()
        writer.writerows(rows)


if __name__ == "__main__":
    main()

LTA_openwebtext_dualt/experiments/nanogpt_tinyshakespeare_char/train_char.py ADDED Viewed

	@@ -0,0 +1,618 @@

+#!/usr/bin/env python3
+from __future__ import annotations
+import argparse
+import json
+import math
+import os
+import time
+import urllib.request
+from dataclasses import asdict, dataclass
+from pathlib import Path
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.parallel import DistributedDataParallel as DDP
+DATA_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
def setup_distributed(name: str) -> tuple[torch.device, int, int, int, bool]:
    """Pick the compute device and, when launched multi-process, start NCCL.

    ``WORLD_SIZE``, ``RANK`` and ``LOCAL_RANK`` are read from the environment
    (defaults ``1``, ``0``, ``0``).  With ``WORLD_SIZE > 1`` the process binds
    to CUDA device ``LOCAL_RANK`` and initialises the NCCL process group;
    otherwise ``name`` is honoured, with ``"auto"`` preferring CUDA, then MPS,
    then CPU.

    Returns:
        ``(device, rank, local_rank, world_size, is_ddp)``.

    Raises:
        RuntimeError: If ``WORLD_SIZE > 1`` but CUDA is unavailable.
    """
    env = os.environ
    world_size = int(env.get("WORLD_SIZE", "1"))
    rank = int(env.get("RANK", "0"))
    local_rank = int(env.get("LOCAL_RANK", "0"))

    if world_size > 1:
        if not torch.cuda.is_available():
            raise RuntimeError("DDP mode expects CUDA. Run single-process for CPU/MPS.")
        torch.cuda.set_device(local_rank)
        # NOTE(review): PyTorch's NCCL barrier warning recommends passing
        # device_id to init_process_group -- confirm against the installed torch.
        dist.init_process_group(backend="nccl")
        return torch.device("cuda", local_rank), rank, local_rank, world_size, True

    if name != "auto":
        device = torch.device(name)
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
    return device, rank, local_rank, world_size, False
def cleanup_distributed(is_ddp: bool) -> None:
    """Destroy the process group if this run used DDP and one is initialised."""
    if not is_ddp:
        return
    if dist.is_initialized():
        dist.destroy_process_group()
def is_main_process(rank: int) -> bool:
    """Return ``True`` only for rank 0."""
    main_rank = 0
    return rank == main_rank
class CharTokenizer:
    """Character-level tokenizer whose vocabulary is the sorted set of characters in a text."""

    def __init__(self, text: str):
        alphabet = sorted(set(text))
        self.itos = alphabet
        self.stoi = {symbol: index for index, symbol in enumerate(alphabet)}
        self.vocab_size = len(alphabet)

    def encode(self, text: str) -> list[int]:
        """Map each character to its id; unknown characters raise ``KeyError``."""
        lookup = self.stoi
        return [lookup[symbol] for symbol in text]

    def decode(self, ids: list[int] | torch.Tensor) -> str:
        """Turn a sequence of ids (list or tensor) back into a string."""
        tokens = ids.detach().cpu().tolist() if isinstance(ids, torch.Tensor) else ids
        table = self.itos
        pieces = [table[int(token)] for token in tokens]
        return "".join(pieces)

    def to_json(self) -> dict:
        """Return the lookup tables and vocabulary size as a plain dict."""
        return {"itos": self.itos, "stoi": self.stoi, "vocab_size": self.vocab_size}
def ensure_tinyshakespeare(data_dir: Path) -> None:
    """Make sure ``data_dir / "input.txt"`` exists, downloading it if needed.

    The download goes to a process-specific temporary file that is renamed
    into place only after it completes, so an interrupted transfer cannot
    leave behind a truncated ``input.txt`` that later runs would accept.

    Args:
        data_dir: Directory that should contain ``input.txt``; created if missing.
    """
    data_dir.mkdir(parents=True, exist_ok=True)
    path = data_dir / "input.txt"
    if path.exists():
        return
    print(f"[data] downloading {DATA_URL}", flush=True)
    # The pid keeps concurrent processes from writing to the same temp file.
    partial = path.with_name(f"{path.name}.{os.getpid()}.part")
    try:
        urllib.request.urlretrieve(DATA_URL, partial)
        partial.replace(path)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        partial.unlink(missing_ok=True)
def load_tinyshakespeare(data_dir: Path) -> tuple[str, CharTokenizer, torch.Tensor, torch.Tensor]:
    """Read ``input.txt`` from ``data_dir`` and split its token ids 90/10.

    Returns:
        ``(text, tokenizer, train_ids, val_ids)`` where the tokenizer is built
        from the full text and the id tensors have dtype ``torch.long``.
    """
    text = (data_dir / "input.txt").read_text(encoding="utf-8")
    tokenizer = CharTokenizer(text)
    ids = torch.tensor(tokenizer.encode(text), dtype=torch.long)
    boundary = int(0.9 * len(ids))
    train_ids, val_ids = ids[:boundary], ids[boundary:]
    return text, tokenizer, train_ids, val_ids
def get_batch(data: torch.Tensor, *, batch_size: int, block_size: int, device: torch.device) -> tuple[torch.Tensor, torch.Tensor]:
    """Draw ``batch_size`` random windows and their one-step-shifted targets.

    Returns:
        ``(x, y)`` on ``device``; ``y`` is the window starting one position
        after the corresponding ``x`` window.
    """
    starts = torch.randint(0, len(data) - block_size - 1, (batch_size,)).tolist()
    inputs = torch.stack([data[s : s + block_size] for s in starts])
    targets = torch.stack([data[s + 1 : s + 1 + block_size] for s in starts])
    return inputs.to(device), targets.to(device)
def get_block_batch(data: torch.Tensor, *, batch_size: int, block_size: int, device: torch.device) -> torch.Tensor:
    """Draw ``batch_size`` random windows of ``block_size`` ids and move them to ``device``."""
    starts = torch.randint(0, len(data) - block_size, (batch_size,)).tolist()
    windows = [data[s : s + block_size] for s in starts]
    return torch.stack(windows).to(device)
+def _wrapped_window(data: torch.Tensor, start: int, width: int) -> torch.Tensor:
+    end = start + width
+    if end <= len(data):
+        return data[start:end]
+    return torch.cat([data[start:], data[: end % len(data)]], dim=0)
def get_stream_batch(
    data: torch.Tensor,
    *,
    batch_size: int,
    block_size: int,
    device: torch.device,
    cursor: int,
    rank: int = 0,
    world_size: int = 1,
) -> tuple[torch.Tensor, torch.Tensor, int]:
    """Read the next sequential (input, target) batch for this rank.

    Each sample is a window of ``block_size + 1`` ids taken with wrap-around;
    consecutive samples start one window apart, and each rank starts
    ``rank * batch_size`` windows after ``cursor``.

    Returns:
        ``(x, y, next_cursor)`` where ``x`` is the first ``block_size`` ids of
        every window, ``y`` the last ``block_size`` ids, and ``next_cursor``
        is ``cursor`` advanced by the windows consumed by all ranks.
    """
    total = len(data)
    width = block_size + 1
    per_rank = batch_size * width
    base = (cursor + rank * per_rank) % total
    windows = [_wrapped_window(data, (base + row * width) % total, width) for row in range(batch_size)]
    samples = torch.stack(windows)
    next_cursor = (cursor + world_size * per_rank) % total
    inputs = samples[:, :block_size].to(device)
    targets = samples[:, 1:].to(device)
    return inputs, targets, next_cursor
def get_stream_block_batch(
    data: torch.Tensor,
    *,
    batch_size: int,
    block_size: int,
    device: torch.device,
    cursor: int,
    rank: int = 0,
    world_size: int = 1,
) -> tuple[torch.Tensor, int]:
    """Read the next sequential batch of ``block_size``-wide windows for this rank.

    Windows are taken with wrap-around, one window apart, starting
    ``rank * batch_size`` windows after ``cursor``.

    Returns:
        ``(samples, next_cursor)`` with ``samples`` on ``device`` and
        ``next_cursor`` advanced by the windows consumed by all ranks.
    """
    total = len(data)
    width = block_size
    per_rank = batch_size * width
    base = (cursor + rank * per_rank) % total
    windows = [_wrapped_window(data, (base + row * width) % total, width) for row in range(batch_size)]
    next_cursor = (cursor + world_size * per_rank) % total
    return torch.stack(windows).to(device), next_cursor
@dataclass
class ModelConfig:
    """Hyper-parameters consumed by ``TinyTransformer`` and its sub-modules."""

    # Size of the token vocabulary (required).
    vocab_size: int
    # Side length of the causal mask and number of learned positions.
    block_size: int = 128
    # Number of transformer blocks.
    n_layer: int = 4
    # Attention heads per block; must divide n_embd.
    n_head: int = 4
    # Width of the hidden representation.
    n_embd: int = 128
    # Dropout probability used throughout the model.
    dropout: float = 0.1
    # Whether attention applies the lower-triangular mask.
    causal: bool = True
    # "tokens" (integer ids) or "probs" (per-position distributions plus time).
    input_kind: str = "tokens"
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with an optional lower-triangular mask."""

    def __init__(self, cfg: ModelConfig):
        super().__init__()
        assert cfg.n_embd % cfg.n_head == 0
        width, span = cfg.n_embd, cfg.block_size
        self.n_head = cfg.n_head
        self.dropout = cfg.dropout
        # One projection produces queries, keys and values side by side.
        self.c_attn = nn.Linear(width, 3 * width)
        self.c_proj = nn.Linear(width, width)
        self.attn_drop = nn.Dropout(cfg.dropout)
        self.resid_drop = nn.Dropout(cfg.dropout)
        self.causal = cfg.causal
        mask = torch.tril(torch.ones(span, span))
        self.register_buffer("tril", mask.view(1, 1, span, span))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Attend over the second axis of ``x`` and return a tensor of the same shape."""
        batch, steps, width = x.shape
        head_dim = width // self.n_head

        def to_heads(part: torch.Tensor) -> torch.Tensor:
            # (batch, steps, width) -> (batch, heads, steps, head_dim)
            return part.view(batch, steps, self.n_head, head_dim).transpose(1, 2)

        query, key, value = (to_heads(part) for part in self.c_attn(x).split(width, dim=2))
        scores = (query @ key.transpose(-2, -1)) * (1.0 / math.sqrt(key.size(-1)))
        if self.causal:
            # Positions above the diagonal may not be attended to.
            blocked = self.tril[:, :, :steps, :steps] == 0
            scores = scores.masked_fill(blocked, float("-inf"))
        weights = self.attn_drop(F.softmax(scores, dim=-1))
        merged = (weights @ value).transpose(1, 2).contiguous().view(batch, steps, width)
        return self.resid_drop(self.c_proj(merged))
class Block(nn.Module):
    """Pre-LayerNorm transformer block: attention then a 4x-wide GELU MLP, each with a residual."""

    def __init__(self, cfg: ModelConfig):
        super().__init__()
        width = cfg.n_embd
        hidden = 4 * width
        self.ln1 = nn.LayerNorm(width)
        self.attn = CausalSelfAttention(cfg)
        self.ln2 = nn.LayerNorm(width)
        self.mlp = nn.Sequential(
            nn.Linear(width, hidden),
            nn.GELU(),
            nn.Linear(hidden, width),
            nn.Dropout(cfg.dropout),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the attention and MLP sub-layers with residual connections."""
        after_attn = x + self.attn(self.ln1(x))
        return after_attn + self.mlp(self.ln2(after_attn))
class TinyTransformer(nn.Module):
    """Small transformer that accepts either token ids or per-position distributions.

    With ``cfg.input_kind == "tokens"`` the input is embedded through
    ``token_emb`` and the output head shares that embedding's weight.  With
    ``"probs"`` the input is projected through ``prob_proj`` and a time
    embedding from ``time_mlp`` is added at every position.
    """

    def __init__(self, cfg: ModelConfig):
        super().__init__()
        self.cfg = cfg
        width, vocab = cfg.n_embd, cfg.vocab_size
        self.token_emb = nn.Embedding(vocab, width)
        self.prob_proj = nn.Linear(vocab, width, bias=False)
        self.pos_emb = nn.Embedding(cfg.block_size, width)
        self.time_mlp = nn.Sequential(nn.Linear(1, width), nn.SiLU(), nn.Linear(width, width))
        self.drop = nn.Dropout(cfg.dropout)
        self.blocks = nn.ModuleList([Block(cfg) for _ in range(cfg.n_layer)])
        self.ln_f = nn.LayerNorm(width)
        self.lm_head = nn.Linear(width, vocab, bias=False)
        if cfg.input_kind == "tokens":
            # Weight tying: the output head reuses the token embedding matrix.
            self.lm_head.weight = self.token_emb.weight
        self.apply(self._init_weights)

    def _init_weights(self, module: nn.Module) -> None:
        """Draw Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if not isinstance(module, (nn.Linear, nn.Embedding)):
            return
        nn.init.normal_(module.weight, mean=0.0, std=0.02)
        if isinstance(module, nn.Linear) and module.bias is not None:
            nn.init.zeros_(module.bias)

    def forward(self, x: torch.Tensor, t: torch.Tensor | None = None) -> torch.Tensor:
        """Return logits over the vocabulary for every position.

        Args:
            x: Token ids (``"tokens"``) or distributions (``"probs"``); the
                first two axes are batch and sequence.
            t: One time value per batch element; required for ``"probs"``.

        Raises:
            ValueError: If ``t`` is missing in ``"probs"`` mode or
                ``cfg.input_kind`` is unknown.
        """
        b, seq_len = x.shape[:2]
        positions = torch.arange(seq_len, device=x.device)
        kind = self.cfg.input_kind
        if kind == "tokens":
            h = self.token_emb(x)
        elif kind == "probs":
            h = self.prob_proj(x.float())
            if t is None:
                raise ValueError("LTA/prob model requires time t")
            time_feat = self.time_mlp(t.float().view(b, 1))
            h = h + time_feat.view(b, 1, -1)
        else:
            raise ValueError(f"unknown input_kind: {self.cfg.input_kind}")
        h = self.drop(h + self.pos_emb(positions).view(1, seq_len, -1))
        for layer in self.blocks:
            h = layer(h)
        return self.lm_head(self.ln_f(h))
@dataclass
class LTAConfig:
    """Settings for the simplex corruption used by the LTA objective."""

    # Concentration at time 0.
    c_min: float = 1.0
    # Concentration at time 1.
    c_max: float = 64.0
    # NOTE(review): not read by any code visible in this part of the file.
    endpoint_mode: str = "full_vocab_wrong"
    # "same": one time drives both schedules; "independent": drawn separately.
    t_mode: str = "same"
    # Lower clamp applied before normalisations and gamma sampling.
    eps: float = 1e-8
def concentration(t: torch.Tensor, c_min: float, c_max: float) -> torch.Tensor:
    """Interpolate log-linearly from ``c_min`` at ``t = 0`` to ``c_max`` at ``t = 1``."""
    log_floor = torch.log(torch.tensor(c_min, device=t.device))
    log_span = math.log(c_max / c_min)
    return torch.exp(log_floor + t * log_span)
def standard_gamma(alpha: torch.Tensor) -> torch.Tensor:
    """Draw Gamma(alpha, 1) samples with the same shape and device as ``alpha``."""
    home = alpha.device
    if home.type != "mps":
        return torch._standard_gamma(alpha)
    # aten::_standard_gamma is not implemented on MPS yet, so sample on the CPU
    # and move the result back.  That is fast enough for this tiny char-level
    # experiment, and the Transformer itself stays on the accelerator.
    return torch._standard_gamma(alpha.cpu()).to(home)
def corrupt_categorical_simplex(ids: torch.Tensor, vocab_size: int, cfg: LTAConfig) -> tuple[torch.Tensor, torch.Tensor]:
    """Turn clean token ids into noisy points on the probability simplex.

    One time ``support_t`` is drawn per batch element.  The mean of the noisy
    state mixes the uniform distribution with an endpoint that itself mixes
    the gold one-hot with a random one-hot; the state is a normalized gamma
    draw around that mean with concentration ``concentration(support_t, ...)``.

    Args:
        ids: 2-D tensor of token ids (batch, sequence).
        vocab_size: Number of classes for the one-hot encodings.
        cfg: Corruption settings (``t_mode``, ``c_min``, ``c_max``, ``eps``).

    Returns:
        ``(state, support_t)``; the semantic time is not returned, even when
        it is drawn independently.

    Raises:
        ValueError: If ``cfg.t_mode`` is neither ``"same"`` nor ``"independent"``.
    """
    batch, _ = ids.shape
    device = ids.device

    def per_example(values: torch.Tensor) -> torch.Tensor:
        # Shape (batch,) -> (batch, 1, 1) so it broadcasts over positions and classes.
        return values.view(batch, 1, 1)

    support_t = torch.rand(batch, device=device)
    if cfg.t_mode == "same":
        semantic_t = support_t
    elif cfg.t_mode == "independent":
        semantic_t = torch.rand(batch, device=device)
    else:
        raise ValueError(f"unknown t_mode: {cfg.t_mode}")

    gold = F.one_hot(ids, vocab_size).float()
    wrong = F.one_hot(torch.randint(0, vocab_size, ids.shape, device=device), vocab_size).float()
    endpoint = per_example(semantic_t) * gold + per_example(1.0 - semantic_t) * wrong

    support = per_example(support_t)
    mean = ((1.0 - support) / float(vocab_size) + support * endpoint).clamp_min(cfg.eps)
    mean = mean / mean.sum(dim=-1, keepdim=True).clamp_min(cfg.eps)

    strength = per_example(concentration(support_t, cfg.c_min, cfg.c_max))
    draw = standard_gamma((mean * strength).clamp_min(cfg.eps)).clamp_min(cfg.eps)
    state = draw / draw.sum(dim=-1, keepdim=True).clamp_min(cfg.eps)
    return state, support_t
@torch.no_grad()
def estimate_ar_loss(model: TinyTransformer, data: torch.Tensor, args, device: torch.device, eval_iters: int) -> float:
    """Average next-token cross-entropy over ``eval_iters`` random batches.

    The model is switched to eval mode for the measurement and to train mode
    afterwards.
    """

    def one_batch_loss() -> float:
        inputs, targets = get_batch(data, batch_size=args.batch_size, block_size=args.block_size, device=device)
        logits = model(inputs)
        flat_logits = logits.view(-1, logits.size(-1))
        return F.cross_entropy(flat_logits, targets.reshape(-1)).item()

    model.eval()
    losses = [one_batch_loss() for _ in range(eval_iters)]
    model.train()
    return float(sum(losses) / len(losses))
@torch.no_grad()
def estimate_lta_loss(model: TinyTransformer, data: torch.Tensor, args, lta_cfg: LTAConfig, device: torch.device, eval_iters: int) -> float:
    """Average denoising cross-entropy over ``eval_iters`` random batches.

    Each batch of clean ids is corrupted with ``corrupt_categorical_simplex``
    and the model is scored on recovering the clean ids.  The model is
    switched to eval mode for the measurement and to train mode afterwards.
    """

    def one_batch_loss() -> float:
        clean = get_block_batch(data, batch_size=args.batch_size, block_size=args.block_size, device=device)
        noisy, times = corrupt_categorical_simplex(clean, model.cfg.vocab_size, lta_cfg)
        logits = model(noisy, times)
        flat_logits = logits.view(-1, logits.size(-1))
        return F.cross_entropy(flat_logits, clean.reshape(-1)).item()

    model.eval()
    losses = [one_batch_loss() for _ in range(eval_iters)]
    model.train()
    return float(sum(losses) / len(losses))
@torch.no_grad()
def generate_ar(model: TinyTransformer, tokenizer: CharTokenizer, *, length: int, temp: float, device: torch.device, seed: str = "\n") -> str:
    """Extend ``seed`` by ``length`` tokens, one at a time.

    The model only sees the most recent ``model.cfg.block_size`` tokens.  A
    temperature of zero or less selects the argmax; otherwise the next token
    is sampled from the temperature-scaled softmax.  The model is left in
    eval mode.

    Returns:
        The decoded sequence, seed included.
    """
    model.eval()
    context = model.cfg.block_size
    greedy = temp <= 0
    sequence = torch.tensor([tokenizer.encode(seed)], dtype=torch.long, device=device)
    for _ in range(length):
        last_logits = model(sequence[:, -context:])[:, -1, :]
        if greedy:
            choice = last_logits.argmax(dim=-1, keepdim=True)
        else:
            choice = torch.multinomial(F.softmax(last_logits / temp, dim=-1), num_samples=1)
        sequence = torch.cat([sequence, choice], dim=1)
    return tokenizer.decode(sequence[0])
@torch.no_grad()
def generate_lta(
    model: TinyTransformer,
    tokenizer: CharTokenizer,
    *,
    length: int,
    steps: int,
    c_min: float,
    c_max: float,
    temp: float,
    device: torch.device,
) -> str:
    """Generate ``length`` tokens by iteratively refining a simplex state.

    The state starts as a normalized gamma draw with concentration
    ``1 / vocab_size`` per entry.  Every step feeds the state and the time
    ``(step + 1) / steps`` to the model, blends the predicted endpoint with
    the state and the uniform distribution, and redraws the state from a gamma
    whose concentration grows log-linearly from ``c_min`` to ``c_max``.  The
    output is the argmax of an equal mix of the last state and last endpoint.
    The model is left in eval mode.
    """
    model.eval()
    eps = 1e-8
    vocab_size = tokenizer.vocab_size

    def renormalize(values: torch.Tensor) -> torch.Tensor:
        return values / values.sum(dim=-1, keepdim=True).clamp_min(eps)

    start_alpha = torch.full((1, length, vocab_size), 1.0 / vocab_size, device=device).clamp_min(eps)
    probs = renormalize(standard_gamma(start_alpha).clamp_min(eps))
    # With steps == 0 the endpoint is simply the initial state.
    last_endpoint = probs
    for step in range(steps):
        t_value = (step + 1) / max(steps, 1)
        # The support and semantic schedules share one clock here.
        support_t = semantic_t = t_value
        t = torch.full((1,), t_value, device=device)
        last_endpoint = F.softmax(model(probs, t) / temp, dim=-1)
        forward_endpoint = (1.0 - semantic_t) * probs + semantic_t * last_endpoint
        mean = (1.0 - support_t) / float(vocab_size) + support_t * forward_endpoint
        mean = renormalize(mean.clamp_min(eps))
        conc = math.exp(math.log(c_min) + support_t * math.log(c_max / c_min))
        probs = renormalize(standard_gamma((mean * conc).clamp_min(eps)).clamp_min(eps))
    final = 0.5 * probs + 0.5 * last_endpoint
    return tokenizer.decode(final.argmax(dim=-1)[0])
def text_stats(text: str) -> dict[str, float]:
    """Compute simple diversity statistics for a string.

    Returns:
        A dict with ``char_entropy`` (Shannon entropy of the character
        distribution, in nats), ``distinct_2`` (distinct adjacent character
        pairs divided by the number of pairs) and ``length``.
    """
    frequency: dict[str, int] = {}
    for symbol in text:
        if symbol in frequency:
            frequency[symbol] += 1
        else:
            frequency[symbol] = 1
    total = max(len(text), 1)
    entropy = -sum((count / total) * math.log(count / total) for count in frequency.values())

    pair_count = max(len(text) - 1, 0)
    distinct_pairs = {text[i : i + 2] for i in range(pair_count)}
    distinct_2 = len(distinct_pairs) / max(pair_count, 1)
    return {"char_entropy": entropy, "distinct_2": distinct_2, "length": float(len(text))}
def save_checkpoint(path: Path, model: TinyTransformer, optimizer: torch.optim.Optimizer, step: int, cfg: ModelConfig, extra: dict) -> None:
    """Write a training checkpoint to ``path`` without exposing partial files.

    The checkpoint is a dict with the keys ``model`` and ``optimizer`` (their
    ``state_dict()``), ``step``, ``model_config`` (``asdict(cfg)``) and
    ``extra`` (stored as given).

    Args:
        path: Destination file; its parent directory is created if missing.
        model: Model whose weights are saved.
        optimizer: Optimizer whose state is saved.
        step: Training step recorded in the checkpoint.
        cfg: Model configuration dataclass, stored as a plain dict.
        extra: Additional metadata stored under ``"extra"``.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = {
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "step": step,
        "model_config": asdict(cfg),
        "extra": extra,
    }
    # Serialise to a sibling temp file and rename it over the target: a crash
    # or kill mid-save then leaves the previous checkpoint intact instead of a
    # truncated file that can no longer be resumed from.
    # NOTE(review): assumes a single writer per path (the visible callers only
    # save from the main process) -- confirm for other call sites.
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        torch.save(payload, tmp_path)
        tmp_path.replace(path)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        tmp_path.unlink(missing_ok=True)
def maybe_resume(path: str | None, model: TinyTransformer, optimizer: torch.optim.Optimizer, device: torch.device) -> int:
    """Optionally restore model/optimizer state and report the saved step.

    Args:
        path: Checkpoint file to load; a falsy value (``None`` or ``""``)
            means "start from scratch".
        model: Receives the checkpoint's ``"model"`` state dict.
        optimizer: Receives the ``"optimizer"`` state dict when the checkpoint
            contains one.
        device: Passed to ``torch.load`` as ``map_location``.

    Returns:
        The checkpoint's ``"step"`` entry as an int, or ``0`` when nothing was
        loaded or the entry is absent.
    """
    resumed_step = 0
    if path:
        # NOTE(review): torch.load may unpickle arbitrary objects depending on
        # the installed PyTorch version's defaults; only resume from trusted
        # checkpoint files.
        checkpoint = torch.load(path, map_location=device)
        model.load_state_dict(checkpoint["model"])
        has_optimizer_state = "optimizer" in checkpoint
        if has_optimizer_state:
            optimizer.load_state_dict(checkpoint["optimizer"])
        resumed_step = int(checkpoint.get("step", 0))
    return resumed_step
def train_ar(
    args,
    tokenizer: CharTokenizer,
    train_data: torch.Tensor,
    val_data: torch.Tensor,
    device: torch.device,
    *,
    rank: int,
    local_rank: int,
    is_ddp: bool,
) -> None:
    """Train the causal, token-input character model.

    Builds a ``TinyTransformer`` with ``causal=True`` / ``input_kind="tokens"``,
    optionally wraps it in DDP, resumes from ``args.resume_path`` if set, and
    runs next-token cross-entropy training up to ``args.steps``. Every
    ``args.eval_interval`` steps (and at the final step) the main process
    estimates validation loss, generates a sample, appends a row to
    ``<out_dir>/ar/metrics.jsonl``, writes the sample text and refreshes
    ``latest.pt``. A ``step_<steps>.pt`` checkpoint is written at the end.

    Args:
        args: Parsed CLI namespace (model sizes, optimiser settings, intervals,
            ``out_dir``, ``data_mode``, ``resume_path``, ...).
        tokenizer: Character tokenizer; supplies ``vocab_size`` and ``to_json``.
        train_data: Training token tensor handed to the batch helpers.
        val_data: Validation token tensor handed to ``estimate_ar_loss``.
        device: Device for the model and batches.
        rank: Global process rank; used for main-process gating and streaming.
        local_rank: Device index passed to DDP.
        is_ddp: Whether to wrap the model in ``DistributedDataParallel``.
    """
    cfg = ModelConfig(
        vocab_size=tokenizer.vocab_size,
        block_size=args.block_size,
        n_layer=args.n_layer,
        n_head=args.n_head,
        n_embd=args.n_embd,
        dropout=args.dropout,
        causal=True,
        input_kind="tokens",
    )
    raw_model = TinyTransformer(cfg).to(device)
    model = DDP(raw_model, device_ids=[local_rank], find_unused_parameters=True) if is_ddp else raw_model
    optimizer = torch.optim.AdamW(raw_model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    start_step = maybe_resume(args.resume_path, raw_model, optimizer, device)
    out_dir = Path(args.out_dir) / "ar"
    if is_main_process(rank):
        out_dir.mkdir(parents=True, exist_ok=True)
    log_path = out_dir / "metrics.jsonl"

    def checkpoint_extra() -> dict:
        # Single source for the metadata stored with every checkpoint.
        return {"mode": "ar", "data_mode": args.data_mode, "tokenizer": tokenizer.to_json()}

    t0 = time.time()
    stream_world_size = dist.get_world_size() if is_ddp and dist.is_initialized() else 1
    # On resume, advance the stream cursor past the data consumed so far.
    stream_cursor = (start_step * args.batch_size * (args.block_size + 1) * stream_world_size) % len(train_data)
    for step in range(start_step + 1, args.steps + 1):
        if args.data_mode == "stream":
            x, y, stream_cursor = get_stream_batch(
                train_data,
                batch_size=args.batch_size,
                block_size=args.block_size,
                device=device,
                cursor=stream_cursor,
                rank=rank,
                world_size=stream_world_size,
            )
        else:
            x, y = get_batch(train_data, batch_size=args.batch_size, block_size=args.block_size, device=device)
        logits = model(x)
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y.reshape(-1))
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
        optimizer.step()
        if is_main_process(rank) and (step % args.log_interval == 0 or step == 1):
            print(f"[ar] step={step} loss={loss.item():.4f} elapsed={time.time() - t0:.1f}s", flush=True)
        if step % args.eval_interval == 0 or step == args.steps:
            # Barriers keep the other ranks waiting while the main process
            # evaluates, samples and checkpoints.
            if is_ddp:
                dist.barrier()
            if is_main_process(rank):
                val_loss = estimate_ar_loss(raw_model, val_data, args, device, args.eval_iters)
                sample = generate_ar(raw_model, tokenizer, length=args.sample_len, temp=args.ar_temp, device=device)
                # The eval/sampling helpers may switch the model to eval mode.
                # Put it back in train mode so dropout stays active for the
                # following steps and this rank does not diverge from the
                # ranks that never evaluated.
                raw_model.train()
                row = {"step": step, "train_loss": float(loss.item()), "val_loss": val_loss, **text_stats(sample)}
                with log_path.open("a", encoding="utf-8") as f:
                    f.write(json.dumps(row, ensure_ascii=False) + "\n")
                (out_dir / f"sample_step{step:05d}.txt").write_text(sample, encoding="utf-8")
                save_checkpoint(out_dir / "latest.pt", raw_model, optimizer, step, cfg, checkpoint_extra())
            if is_ddp:
                dist.barrier()
    if is_main_process(rank):
        save_checkpoint(out_dir / f"step_{args.steps:05d}.pt", raw_model, optimizer, args.steps, cfg, checkpoint_extra())
def train_lta(
    args,
    tokenizer: CharTokenizer,
    train_data: torch.Tensor,
    val_data: torch.Tensor,
    device: torch.device,
    *,
    rank: int,
    local_rank: int,
    is_ddp: bool,
) -> None:
    """Train the non-causal, probability-input (LTA) character model.

    Builds a ``TinyTransformer`` with ``causal=False`` / ``input_kind="probs"``,
    optionally wraps it in DDP, resumes from ``args.resume_path`` if set, and
    trains it to predict the clean token ids from a corrupted simplex state
    produced by ``corrupt_categorical_simplex``. Every ``args.eval_interval``
    steps (and at the final step) the main process estimates validation loss,
    decodes a sample with ``generate_lta``, appends a row to ``metrics.jsonl``,
    writes the sample text and refreshes ``latest.pt``. Outputs go to
    ``<out_dir>/fully_coupled`` when ``args.mode == "fully_coupled"`` and to
    ``<out_dir>/lta`` otherwise. A ``step_<steps>.pt`` checkpoint is written at
    the end.

    Args:
        args: Parsed CLI namespace (model sizes, optimiser settings, intervals,
            LTA/decoding settings, ``out_dir``, ``data_mode``, ...).
        tokenizer: Character tokenizer; supplies ``vocab_size`` and ``to_json``.
        train_data: Training token tensor handed to the batch helpers.
        val_data: Validation token tensor handed to ``estimate_lta_loss``.
        device: Device for the model and batches.
        rank: Global process rank; used for main-process gating and streaming.
        local_rank: Device index passed to DDP.
        is_ddp: Whether to wrap the model in ``DistributedDataParallel``.
    """
    cfg = ModelConfig(
        vocab_size=tokenizer.vocab_size,
        block_size=args.block_size,
        n_layer=args.n_layer,
        n_head=args.n_head,
        n_embd=args.n_embd,
        dropout=args.dropout,
        causal=False,
        input_kind="probs",
    )
    lta_cfg = LTAConfig(c_min=args.c_min, c_max=args.c_max, t_mode=args.t_mode)
    raw_model = TinyTransformer(cfg).to(device)
    model = DDP(raw_model, device_ids=[local_rank], find_unused_parameters=True) if is_ddp else raw_model
    optimizer = torch.optim.AdamW(raw_model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    start_step = maybe_resume(args.resume_path, raw_model, optimizer, device)
    out_dir = Path(args.out_dir) / ("fully_coupled" if args.mode == "fully_coupled" else "lta")
    if is_main_process(rank):
        out_dir.mkdir(parents=True, exist_ok=True)
    log_path = out_dir / "metrics.jsonl"

    def checkpoint_extra() -> dict:
        # Single source for the metadata stored with every checkpoint.
        return {"mode": "lta", "data_mode": args.data_mode, "lta_config": asdict(lta_cfg), "tokenizer": tokenizer.to_json()}

    t0 = time.time()
    stream_world_size = dist.get_world_size() if is_ddp and dist.is_initialized() else 1
    # On resume, advance the stream cursor past the data consumed so far.
    stream_cursor = (start_step * args.batch_size * args.block_size * stream_world_size) % len(train_data)
    for step in range(start_step + 1, args.steps + 1):
        if args.data_mode == "stream":
            ids, stream_cursor = get_stream_block_batch(
                train_data,
                batch_size=args.batch_size,
                block_size=args.block_size,
                device=device,
                cursor=stream_cursor,
                rank=rank,
                world_size=stream_world_size,
            )
        else:
            ids = get_block_batch(train_data, batch_size=args.batch_size, block_size=args.block_size, device=device)
        # Corrupt the clean ids into a simplex state plus its time value; the
        # model is trained to recover the original ids from that state.
        state, t = corrupt_categorical_simplex(ids, tokenizer.vocab_size, lta_cfg)
        logits = model(state, t)
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), ids.reshape(-1))
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
        optimizer.step()
        if is_main_process(rank) and (step % args.log_interval == 0 or step == 1):
            print(f"[lta] step={step} loss={loss.item():.4f} elapsed={time.time() - t0:.1f}s", flush=True)
        if step % args.eval_interval == 0 or step == args.steps:
            # Barriers keep the other ranks waiting while the main process
            # evaluates, samples and checkpoints.
            if is_ddp:
                dist.barrier()
            if is_main_process(rank):
                val_loss = estimate_lta_loss(raw_model, val_data, args, lta_cfg, device, args.eval_iters)
                sample = generate_lta(
                    raw_model,
                    tokenizer,
                    # The non-causal model sees the whole sequence at once, so
                    # a sample cannot exceed the trained block size.
                    length=min(args.sample_len, args.block_size),
                    steps=args.decode_steps,
                    c_min=args.c_min,
                    c_max=args.decode_c_max,
                    temp=args.endpoint_temp,
                    device=device,
                )
                # Sampling switches the model to eval mode. Put it back in
                # train mode so dropout stays active for the following steps
                # and this rank does not diverge from the ranks that never
                # sampled.
                raw_model.train()
                row = {"step": step, "train_loss": float(loss.item()), "val_loss": val_loss, **text_stats(sample)}
                with log_path.open("a", encoding="utf-8") as f:
                    f.write(json.dumps(row, ensure_ascii=False) + "\n")
                (out_dir / f"sample_step{step:05d}.txt").write_text(sample, encoding="utf-8")
                save_checkpoint(out_dir / "latest.pt", raw_model, optimizer, step, cfg, checkpoint_extra())
            if is_ddp:
                dist.barrier()
    if is_main_process(rank):
        save_checkpoint(out_dir / f"step_{args.steps:05d}.pt", raw_model, optimizer, args.steps, cfg, checkpoint_extra())
def _parse_cli_args() -> argparse.Namespace:
    """Declare the script's command-line options and parse them."""
    # (flag, add_argument keyword arguments), in the order they are registered.
    options = (
        ("--mode", {"choices": ["ar", "fully_coupled", "lta", "both"], "default": "both"}),
        ("--data_dir", {"default": "experiments/nanogpt_tinyshakespeare_char/data"}),
        ("--out_dir", {"default": "experiments/nanogpt_tinyshakespeare_char/runs/char_5k"}),
        ("--device", {"default": "auto"}),
        ("--steps", {"type": int, "default": 5000}),
        ("--data_mode", {"choices": ["random", "stream"], "default": "random"}),
        ("--batch_size", {"type": int, "default": 64}),
        ("--block_size", {"type": int, "default": 128}),
        ("--n_layer", {"type": int, "default": 4}),
        ("--n_head", {"type": int, "default": 4}),
        ("--n_embd", {"type": int, "default": 128}),
        ("--dropout", {"type": float, "default": 0.1}),
        ("--lr", {"type": float, "default": 3e-4}),
        ("--weight_decay", {"type": float, "default": 0.1}),
        ("--grad_clip", {"type": float, "default": 1.0}),
        ("--log_interval", {"type": int, "default": 100}),
        ("--eval_interval", {"type": int, "default": 500}),
        ("--eval_iters", {"type": int, "default": 20}),
        ("--sample_len", {"type": int, "default": 512}),
        ("--ar_temp", {"type": float, "default": 0.8}),
        ("--c_min", {"type": float, "default": 1.0}),
        ("--c_max", {"type": float, "default": 64.0}),
        ("--decode_c_max", {"type": float, "default": 16.0}),
        ("--endpoint_temp", {"type": float, "default": 1.3}),
        ("--decode_steps", {"type": int, "default": 256}),
        ("--t_mode", {"choices": ["same", "independent"], "default": "same"}),
        ("--resume_path", {"default": ""}),
        ("--seed", {"type": int, "default": 1337}),
    )
    parser = argparse.ArgumentParser()
    for flag, spec in options:
        parser.add_argument(flag, **spec)
    return parser.parse_args()


def main() -> None:
    """Entry point: parse options, prepare data and outputs, run the trainers.

    The main process downloads/validates the dataset and writes
    ``tokenizer.json`` and ``args.json`` into the output directory; under DDP
    the other ranks wait at barriers around those steps. ``--mode`` selects
    which trainer(s) run; ``"both"`` runs the AR trainer and then the LTA
    trainer. Distributed state is torn down even if training raises.
    """
    args = _parse_cli_args()
    device, rank, local_rank, world_size, is_ddp = setup_distributed(args.device)
    # Offset the seed by rank so each process gets its own random stream.
    torch.manual_seed(args.seed + rank)

    if is_main_process(rank):
        print(f"[setup] device={device} rank={rank} world_size={world_size}", flush=True)
        ensure_tinyshakespeare(Path(args.data_dir))
    if is_ddp:
        # Other ranks must not read the dataset before the main process has
        # finished preparing it.
        dist.barrier()

    text, tokenizer, train_data, val_data = load_tinyshakespeare(Path(args.data_dir))
    out_dir = Path(args.out_dir)
    if is_main_process(rank):
        out_dir.mkdir(parents=True, exist_ok=True)
        tokenizer_dump = json.dumps(tokenizer.to_json(), ensure_ascii=False, indent=2)
        (out_dir / "tokenizer.json").write_text(tokenizer_dump, encoding="utf-8")
        args_dump = json.dumps(vars(args), ensure_ascii=False, indent=2)
        (out_dir / "args.json").write_text(args_dump, encoding="utf-8")
        print(f"[data] chars={len(text)} vocab={tokenizer.vocab_size} train={len(train_data)} val={len(val_data)}", flush=True)
    if is_ddp:
        dist.barrier()

    trainer_kwargs = {"rank": rank, "local_rank": local_rank, "is_ddp": is_ddp}
    try:
        if args.mode in ("ar", "both"):
            train_ar(args, tokenizer, train_data, val_data, device, **trainer_kwargs)
        if args.mode in ("fully_coupled", "lta", "both"):
            train_lta(args, tokenizer, train_data, val_data, device, **trainer_kwargs)
    finally:
        cleanup_distributed(is_ddp)


if __name__ == "__main__":
    main()

LTA_openwebtext_dualt/logs/elfopt_4gpu_debug_20260513/lta_owt_fast10k_len1024_elfopt_muon_ema_ddit768x12_4gpu_5epoch_20260513.log ADDED Viewed

	@@ -0,0 +1,94 @@

+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+[rank0]:[W513 02:20:46.100148121 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 0]  using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+NCCL version 2.25.1+cuda12.8
+[rank3]:[W513 02:20:46.141519466 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 3]  using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+[rank1]:[W513 02:20:46.143177080 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 1]  using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+[rank2]:[W513 02:20:46.172962616 ProcessGroupNCCL.cpp:4571] [PG ID 0 PG GUID 0 Rank 2]  using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
+{
+  "device": "cuda:0",
+  "rank": 0,
+  "world_size": 4,
+  "samples": "owt_cached_chunks:10904",
+  "vocab_size": 50257,
+  "tokenizer_vocab_size": 50257,
+  "save_dir": "runs/lta_owt_fast10k_len1024_elfopt_muon_ema_ddit768x12_4gpu_5epoch_20260513",
+  "batch_size": 8,
+  "grad_accum": 16,
+  "effective_batch_size": 512,
+  "global_batch_size": 512,
+  "lr_schedule": "constant_warmup",
+  "optimizer": "muon",
+  "warmup_steps": 11,
+  "min_lr": 0.0,
+  "weight_decay": 0.0,
+  "adamw_param_groups": "nanogpt",
+  "adam_beta1": 0.9,
+  "adam_beta2": 0.95,
+  "adam_eps": 1e-08,
+  "muon_momentum": 0.95,
+  "muon_ns_steps": 5,
+  "muon_update_scale": 1.0,
+  "ema_decay": 0.9999,
+  "ema_start_step": 0,
+  "model_type": "ddit",
+  "dual_t": true,
+  "corrupt_t_mode": "independent",
+  "corrupt_min_t": null,
+  "corrupt_max_t": null,
+  "prefix_block_prob": 0.0,
+  "prefix_block_len": 128,
+  "dirichlet_endpoint_mode": "categorical_dual_t",
+  "dirichlet_semantic_t_mode": "same",
+  "dirichlet_semantic_t_value": 0.0,
+  "categorical_wrong_from_full_vocab": true,
+  "categorical_wrong_from_batch_valid_tokens": false,
+  "mask_mixture_original_prob": 0.0,
+  "mask_mixture_lowk_prob": 0.0,
+  "mask_mixture_lowcorrupt_prob": 0.0,
+  "mask_mixture_block_prob": 0.0,
+  "mask_mixture_all_prob": 0.0,
+  "mask_mixture_lowk_clean_tokens": "1,2,4,8,16,32,64",
+  "mask_mixture_lowcorrupt_tokens": "1,2,4,8,16,32,64",
+  "mask_mixture_block_tokens": "64,128",
+  "simplex_bridge_sampler": "dirichlet",
+  "logistic_normal_sigma_min": 0.18,
+  "logistic_normal_sigma_max": 2.2,
+  "logistic_normal_tau_min": 0.65,
+  "logistic_normal_tau_max": 1.15,
+  "torch_compile": false,
+  "compile_mode": "max-autotune",
+  "state_format": "prob",
+  "target_loss": "hard_ce",
+  "meanflow_weight": 0.0,
+  "bridge_noise_init": "logistic_normal",
+  "noise_sigma": -1.0,
+  "wrap": true,
+  "wrap_mode": "stream",
+  "wrap_record_buffer_size": 200,
+  "owt_cached_chunks": true,
+  "owt_chunk_cache_dir": "/e2e-data/evad-tech-vla/wanghan58/data/small_benchmarks/langflow_2604_11748/openwebtext_lta_cached_chunks/gpt2_len1024_train_minus_100k_fast10k",
+  "owt_chunk_cache_rebuild": false,
+  "owt_chunk_cache_write_batch": 4096,
+  "owt_exact_repeat_per_chunk": 0,
+  "online_chunk_shuffle": false,
+  "online_chunk_shuffle_buffer": 10000,
+  "openwebtext_split": "all",
+  "detokenizer": "auto",
+  "resolved_detokenizer": null,
+  "num_workers": 0,
+  "latest_every": 25,
+  "resume_path": ""
+}
+step=10 micro_steps=160 elapsed=35.3s lr=2.000000e-03 loss_all=10.7819 acc_all=0.5062 loss_corrupt=10.7918 acc_corrupt=0.3523 corrupt_frac=0.5581 loss=10.7918 loss_recon=10.7918 loss_meanflow=0.0000 mean_model_t=0.5021 mean_corrupt_t=0.5155 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4797 init_acc_corrupt=0.4881 init_gold_top10=0.5148 init_gold_top100=0.5444
+step=20 micro_steps=320 elapsed=43.2s lr=2.000000e-03 loss_all=10.5714 acc_all=0.5577 loss_corrupt=10.6476 acc_corrupt=0.3828 corrupt_frac=0.5535 loss=10.6476 loss_recon=10.6476 loss_meanflow=0.0000 mean_model_t=0.4831 mean_corrupt_t=0.5005 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4958 init_acc_corrupt=0.4692 init_gold_top10=0.4982 init_gold_top100=0.5296
+step=30 micro_steps=480 elapsed=49.2s lr=2.000000e-03 loss_all=10.2797 acc_all=0.5418 loss_corrupt=10.4402 acc_corrupt=0.3646 corrupt_frac=0.5515 loss=10.4402 loss_recon=10.4402 loss_meanflow=0.0000 mean_model_t=0.5005 mean_corrupt_t=0.4971 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.5053 init_acc_corrupt=0.4602 init_gold_top10=0.4890 init_gold_top100=0.5193
+step=40 micro_steps=640 elapsed=47.3s lr=2.000000e-03 loss_all=9.9466 acc_all=0.5278 loss_corrupt=10.1895 acc_corrupt=0.3629 corrupt_frac=0.5560 loss=10.1895 loss_recon=10.1895 loss_meanflow=0.0000 mean_model_t=0.4889 mean_corrupt_t=0.5037 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4913 init_acc_corrupt=0.4763 init_gold_top10=0.5031 init_gold_top100=0.5324
+step=50 micro_steps=800 elapsed=48.1s lr=2.000000e-03 loss_all=9.5849 acc_all=0.5083 loss_corrupt=9.9274 acc_corrupt=0.3452 corrupt_frac=0.5483 loss=9.9274 loss_recon=9.9274 loss_meanflow=0.0000 mean_model_t=0.5005 mean_corrupt_t=0.5071 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4913 init_acc_corrupt=0.4737 init_gold_top10=0.5031 init_gold_top100=0.5327
+step=60 micro_steps=960 elapsed=52.1s lr=2.000000e-03 loss_all=9.2104 acc_all=0.4910 loss_corrupt=9.6379 acc_corrupt=0.3375 corrupt_frac=0.5677 loss=9.6379 loss_recon=9.6379 loss_meanflow=0.0000 mean_model_t=0.5078 mean_corrupt_t=0.4997 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4952 init_acc_corrupt=0.4702 init_gold_top10=0.4995 init_gold_top100=0.5282
+step=70 micro_steps=1120 elapsed=49.0s lr=2.000000e-03 loss_all=8.7828 acc_all=0.4820 loss_corrupt=9.3226 acc_corrupt=0.3293 corrupt_frac=0.5563 loss=9.3226 loss_recon=9.3226 loss_meanflow=0.0000 mean_model_t=0.5141 mean_corrupt_t=0.5078 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4938 init_acc_corrupt=0.4720 init_gold_top10=0.5007 init_gold_top100=0.5290
+step=80 micro_steps=1280 elapsed=51.9s lr=2.000000e-03 loss_all=8.3273 acc_all=0.4771 loss_corrupt=8.9524 acc_corrupt=0.3331 corrupt_frac=0.5579 loss=8.9524 loss_recon=8.9524 loss_meanflow=0.0000 mean_model_t=0.5173 mean_corrupt_t=0.5132 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4842 init_acc_corrupt=0.4812 init_gold_top10=0.5102 init_gold_top100=0.5391
+step=90 micro_steps=1440 elapsed=49.3s lr=2.000000e-03 loss_all=7.8580 acc_all=0.4804 loss_corrupt=8.5915 acc_corrupt=0.3368 corrupt_frac=0.5610 loss=8.5915 loss_recon=8.5915 loss_meanflow=0.0000 mean_model_t=0.5097 mean_corrupt_t=0.5138 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4881 init_acc_corrupt=0.4782 init_gold_top10=0.5062 init_gold_top100=0.5356
+step=100 micro_steps=1600 elapsed=49.2s lr=2.000000e-03 loss_all=7.3653 acc_all=0.4879 loss_corrupt=8.2388 acc_corrupt=0.3383 corrupt_frac=0.5443 loss=8.2388 loss_recon=8.2388 loss_meanflow=0.0000 mean_model_t=0.4959 mean_corrupt_t=0.5087 mean_loss_t_weight=1.0000 prior_center_loss_beta=0.0000 wrong_frac=0.4917 init_acc_corrupt=0.4737 init_gold_top10=0.5032 init_gold_top100=0.5311

LTA_openwebtext_dualt/logs/elfopt_4gpu_debug_20260513/lta_owt_fast10k_len1024_elfopt_muon_ema_ddit768x12_4gpu_5epoch_20260513_trace.log ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"out_json": "docs/lta_samples/metrics_20260513/lta_owt_fast10k_len1024_elfopt_muon_ema_ddit768x12_4gpu_5epoch_20260513/trace_latest_ema_steps64_c48_t1p45.json", "records": 10, "step": 107}

LTA_openwebtext_dualt/logs/lm1b_classic_repro_infer_watch/infer_lta_lm1b_classic_c1024_fullvocab_len128_repro_save10k_gbs512_4gpu_1m_20260520_231005_step_0010000_t1p45.log ADDED Viewed

	@@ -0,0 +1,68 @@

+[watch-classic] 2026-05-21_01:37:41 infer runs/lta_lm1b_classic_c1024_fullvocab_len128_repro_save10k_gbs512_4gpu_1m_20260520_231005/step_0010000.pt -> docs/lta_samples/metrics_20260520/lm1b_classic_repro_every10k_normal_steps_state_t1p45_c1024_n1024/lta_lm1b_classic_c1024_fullvocab_len128_repro_save10k_gbs512_4gpu_1m_20260520_231005/step_0010000
+[ckpt] runs/lta_lm1b_classic_c1024_fullvocab_len128_repro_save10k_gbs512_4gpu_1m_20260520_231005/step_0010000.pt step=10000
+[decode] steps128_c1024_t1p45 generated 16/1024
+[decode] steps128_c1024_t1p45 generated 32/1024
+[decode] steps128_c1024_t1p45 generated 48/1024
+[decode] steps128_c1024_t1p45 generated 64/1024
+[decode] steps128_c1024_t1p45 generated 80/1024
+[decode] steps128_c1024_t1p45 generated 96/1024
+[decode] steps128_c1024_t1p45 generated 112/1024
+[decode] steps128_c1024_t1p45 generated 128/1024
+[decode] steps128_c1024_t1p45 generated 144/1024
+[decode] steps128_c1024_t1p45 generated 160/1024
+[decode] steps128_c1024_t1p45 generated 176/1024
+[decode] steps128_c1024_t1p45 generated 192/1024
+[decode] steps128_c1024_t1p45 generated 208/1024
+[decode] steps128_c1024_t1p45 generated 224/1024
+[decode] steps128_c1024_t1p45 generated 240/1024
+[decode] steps128_c1024_t1p45 generated 256/1024
+[decode] steps128_c1024_t1p45 generated 272/1024
+[decode] steps128_c1024_t1p45 generated 288/1024
+[decode] steps128_c1024_t1p45 generated 304/1024
+[decode] steps128_c1024_t1p45 generated 320/1024
+[decode] steps128_c1024_t1p45 generated 336/1024
+[decode] steps128_c1024_t1p45 generated 352/1024
+[decode] steps128_c1024_t1p45 generated 368/1024
+[decode] steps128_c1024_t1p45 generated 384/1024
+[decode] steps128_c1024_t1p45 generated 400/1024
+[decode] steps128_c1024_t1p45 generated 416/1024
+[decode] steps128_c1024_t1p45 generated 432/1024
+[decode] steps128_c1024_t1p45 generated 448/1024
+[decode] steps128_c1024_t1p45 generated 464/1024
+[decode] steps128_c1024_t1p45 generated 480/1024
+[decode] steps128_c1024_t1p45 generated 496/1024
+[decode] steps128_c1024_t1p45 generated 512/1024
+[decode] steps128_c1024_t1p45 generated 528/1024
+[decode] steps128_c1024_t1p45 generated 544/1024
+[decode] steps128_c1024_t1p45 generated 560/1024
+[decode] steps128_c1024_t1p45 generated 576/1024
+[decode] steps128_c1024_t1p45 generated 592/1024
+[decode] steps128_c1024_t1p45 generated 608/1024
+[decode] steps128_c1024_t1p45 generated 624/1024
+[decode] steps128_c1024_t1p45 generated 640/1024
+[decode] steps128_c1024_t1p45 generated 656/1024
+[decode] steps128_c1024_t1p45 generated 672/1024
+[decode] steps128_c1024_t1p45 generated 688/1024
+[decode] steps128_c1024_t1p45 generated 704/1024
+[decode] steps128_c1024_t1p45 generated 720/1024
+[decode] steps128_c1024_t1p45 generated 736/1024
+[decode] steps128_c1024_t1p45 generated 752/1024
+[decode] steps128_c1024_t1p45 generated 768/1024
+[decode] steps128_c1024_t1p45 generated 784/1024
+[decode] steps128_c1024_t1p45 generated 800/1024
+[decode] steps128_c1024_t1p45 generated 816/1024
+[decode] steps128_c1024_t1p45 generated 832/1024
+[decode] steps128_c1024_t1p45 generated 848/1024
+[decode] steps128_c1024_t1p45 generated 864/1024
+[decode] steps128_c1024_t1p45 generated 880/1024
+[decode] steps128_c1024_t1p45 generated 896/1024
+[decode] steps128_c1024_t1p45 generated 912/1024
+[decode] steps128_c1024_t1p45 generated 928/1024
+[decode] steps128_c1024_t1p45 generated 944/1024
+[decode] steps128_c1024_t1p45 generated 960/1024
+[decode] steps128_c1024_t1p45 generated 976/1024
+[decode] steps128_c1024_t1p45 generated 992/1024
+[decode] steps128_c1024_t1p45 generated 1008/1024
+[decode] steps128_c1024_t1p45 generated 1024/1024
+[summary] {"name": "steps128_c1024_t1p45", "step": 10000, "decode_steps": 128, "concentration_max": 1024.0, "raw_genppl": 30.13676440721101, "stripped_genppl": 34.68526771869161, "sample_entropy": 3.465849114229341, "distinct_1": 0.03083038330078125, "distinct_2": 0.19376691683070865, "top_token_mass": 0.232330322265625, "raw_kept": 1024, "stripped_kept": 1024}
+[watch-classic] 2026-05-21_01:46:53 done step_0010000

LTA_openwebtext_dualt/logs/lm1b_classic_repro_infer_watch/infer_lta_lm1b_classic_c1024_fullvocab_len128_repro_save10k_gbs512_4gpu_1m_20260520_231005_step_0020000_t1p45.log ADDED Viewed

	@@ -0,0 +1,68 @@

+[watch-classic] 2026-05-21_04:16:54 infer runs/lta_lm1b_classic_c1024_fullvocab_len128_repro_save10k_gbs512_4gpu_1m_20260520_231005/step_0020000.pt -> docs/lta_samples/metrics_20260520/lm1b_classic_repro_every10k_normal_steps_state_t1p45_c1024_n1024/lta_lm1b_classic_c1024_fullvocab_len128_repro_save10k_gbs512_4gpu_1m_20260520_231005/step_0020000
+[ckpt] runs/lta_lm1b_classic_c1024_fullvocab_len128_repro_save10k_gbs512_4gpu_1m_20260520_231005/step_0020000.pt step=20000
+[decode] steps128_c1024_t1p45 generated 16/1024
+[decode] steps128_c1024_t1p45 generated 32/1024
+[decode] steps128_c1024_t1p45 generated 48/1024
+[decode] steps128_c1024_t1p45 generated 64/1024
+[decode] steps128_c1024_t1p45 generated 80/1024
+[decode] steps128_c1024_t1p45 generated 96/1024
+[decode] steps128_c1024_t1p45 generated 112/1024
+[decode] steps128_c1024_t1p45 generated 128/1024
+[decode] steps128_c1024_t1p45 generated 144/1024
+[decode] steps128_c1024_t1p45 generated 160/1024
+[decode] steps128_c1024_t1p45 generated 176/1024
+[decode] steps128_c1024_t1p45 generated 192/1024
+[decode] steps128_c1024_t1p45 generated 208/1024
+[decode] steps128_c1024_t1p45 generated 224/1024
+[decode] steps128_c1024_t1p45 generated 240/1024
+[decode] steps128_c1024_t1p45 generated 256/1024
+[decode] steps128_c1024_t1p45 generated 272/1024
+[decode] steps128_c1024_t1p45 generated 288/1024
+[decode] steps128_c1024_t1p45 generated 304/1024
+[decode] steps128_c1024_t1p45 generated 320/1024
+[decode] steps128_c1024_t1p45 generated 336/1024
+[decode] steps128_c1024_t1p45 generated 352/1024
+[decode] steps128_c1024_t1p45 generated 368/1024
+[decode] steps128_c1024_t1p45 generated 384/1024
+[decode] steps128_c1024_t1p45 generated 400/1024
+[decode] steps128_c1024_t1p45 generated 416/1024
+[decode] steps128_c1024_t1p45 generated 432/1024
+[decode] steps128_c1024_t1p45 generated 448/1024
+[decode] steps128_c1024_t1p45 generated 464/1024
+[decode] steps128_c1024_t1p45 generated 480/1024
+[decode] steps128_c1024_t1p45 generated 496/1024
+[decode] steps128_c1024_t1p45 generated 512/1024
+[decode] steps128_c1024_t1p45 generated 528/1024
+[decode] steps128_c1024_t1p45 generated 544/1024
+[decode] steps128_c1024_t1p45 generated 560/1024
+[decode] steps128_c1024_t1p45 generated 576/1024
+[decode] steps128_c1024_t1p45 generated 592/1024
+[decode] steps128_c1024_t1p45 generated 608/1024
+[decode] steps128_c1024_t1p45 generated 624/1024
+[decode] steps128_c1024_t1p45 generated 640/1024
+[decode] steps128_c1024_t1p45 generated 656/1024
+[decode] steps128_c1024_t1p45 generated 672/1024
+[decode] steps128_c1024_t1p45 generated 688/1024
+[decode] steps128_c1024_t1p45 generated 704/1024
+[decode] steps128_c1024_t1p45 generated 720/1024
+[decode] steps128_c1024_t1p45 generated 736/1024
+[decode] steps128_c1024_t1p45 generated 752/1024
+[decode] steps128_c1024_t1p45 generated 768/1024
+[decode] steps128_c1024_t1p45 generated 784/1024
+[decode] steps128_c1024_t1p45 generated 800/1024
+[decode] steps128_c1024_t1p45 generated 816/1024
+[decode] steps128_c1024_t1p45 generated 832/1024
+[decode] steps128_c1024_t1p45 generated 848/1024
+[decode] steps128_c1024_t1p45 generated 864/1024
+[decode] steps128_c1024_t1p45 generated 880/1024
+[decode] steps128_c1024_t1p45 generated 896/1024
+[decode] steps128_c1024_t1p45 generated 912/1024
+[decode] steps128_c1024_t1p45 generated 928/1024
+[decode] steps128_c1024_t1p45 generated 944/1024
+[decode] steps128_c1024_t1p45 generated 960/1024
+[decode] steps128_c1024_t1p45 generated 976/1024
+[decode] steps128_c1024_t1p45 generated 992/1024
+[decode] steps128_c1024_t1p45 generated 1008/1024
+[decode] steps128_c1024_t1p45 generated 1024/1024
+[summary] {"name": "steps128_c1024_t1p45", "step": 20000, "decode_steps": 128, "concentration_max": 1024.0, "raw_genppl": 46.858096679153796, "stripped_genppl": 50.24137870832906, "sample_entropy": 3.638899175986386, "distinct_1": 0.0443572998046875, "distinct_2": 0.28157295767716534, "top_token_mass": 0.1688232421875, "raw_kept": 1024, "stripped_kept": 1024}
+[watch-classic] 2026-05-21_04:26:05 done step_0020000

LTA_openwebtext_dualt/logs/lm1b_classic_repro_infer_watch/infer_lta_lm1b_classic_c1024_fullvocab_len128_repro_save10k_gbs512_4gpu_1m_20260520_231005_step_0030000_t1p45.log ADDED Viewed

	@@ -0,0 +1,68 @@

+[watch-classic] 2026-05-21_06:55:06 infer runs/lta_lm1b_classic_c1024_fullvocab_len128_repro_save10k_gbs512_4gpu_1m_20260520_231005/step_0030000.pt -> docs/lta_samples/metrics_20260520/lm1b_classic_repro_every10k_normal_steps_state_t1p45_c1024_n1024/lta_lm1b_classic_c1024_fullvocab_len128_repro_save10k_gbs512_4gpu_1m_20260520_231005/step_0030000
+[ckpt] runs/lta_lm1b_classic_c1024_fullvocab_len128_repro_save10k_gbs512_4gpu_1m_20260520_231005/step_0030000.pt step=30000
+[decode] steps128_c1024_t1p45 generated 16/1024
+[decode] steps128_c1024_t1p45 generated 32/1024
+[decode] steps128_c1024_t1p45 generated 48/1024
+[decode] steps128_c1024_t1p45 generated 64/1024
+[decode] steps128_c1024_t1p45 generated 80/1024
+[decode] steps128_c1024_t1p45 generated 96/1024
+[decode] steps128_c1024_t1p45 generated 112/1024
+[decode] steps128_c1024_t1p45 generated 128/1024
+[decode] steps128_c1024_t1p45 generated 144/1024
+[decode] steps128_c1024_t1p45 generated 160/1024
+[decode] steps128_c1024_t1p45 generated 176/1024
+[decode] steps128_c1024_t1p45 generated 192/1024
+[decode] steps128_c1024_t1p45 generated 208/1024
+[decode] steps128_c1024_t1p45 generated 224/1024
+[decode] steps128_c1024_t1p45 generated 240/1024
+[decode] steps128_c1024_t1p45 generated 256/1024
+[decode] steps128_c1024_t1p45 generated 272/1024
+[decode] steps128_c1024_t1p45 generated 288/1024
+[decode] steps128_c1024_t1p45 generated 304/1024
+[decode] steps128_c1024_t1p45 generated 320/1024
+[decode] steps128_c1024_t1p45 generated 336/1024
+[decode] steps128_c1024_t1p45 generated 352/1024
+[decode] steps128_c1024_t1p45 generated 368/1024
+[decode] steps128_c1024_t1p45 generated 384/1024
+[decode] steps128_c1024_t1p45 generated 400/1024
+[decode] steps128_c1024_t1p45 generated 416/1024
+[decode] steps128_c1024_t1p45 generated 432/1024
+[decode] steps128_c1024_t1p45 generated 448/1024
+[decode] steps128_c1024_t1p45 generated 464/1024
+[decode] steps128_c1024_t1p45 generated 480/1024
+[decode] steps128_c1024_t1p45 generated 496/1024
+[decode] steps128_c1024_t1p45 generated 512/1024
+[decode] steps128_c1024_t1p45 generated 528/1024
+[decode] steps128_c1024_t1p45 generated 544/1024
+[decode] steps128_c1024_t1p45 generated 560/1024
+[decode] steps128_c1024_t1p45 generated 576/1024
+[decode] steps128_c1024_t1p45 generated 592/1024
+[decode] steps128_c1024_t1p45 generated 608/1024
+[decode] steps128_c1024_t1p45 generated 624/1024
+[decode] steps128_c1024_t1p45 generated 640/1024
+[decode] steps128_c1024_t1p45 generated 656/1024
+[decode] steps128_c1024_t1p45 generated 672/1024
+[decode] steps128_c1024_t1p45 generated 688/1024
+[decode] steps128_c1024_t1p45 generated 704/1024
+[decode] steps128_c1024_t1p45 generated 720/1024
+[decode] steps128_c1024_t1p45 generated 736/1024
+[decode] steps128_c1024_t1p45 generated 752/1024
+[decode] steps128_c1024_t1p45 generated 768/1024
+[decode] steps128_c1024_t1p45 generated 784/1024
+[decode] steps128_c1024_t1p45 generated 800/1024
+[decode] steps128_c1024_t1p45 generated 816/1024
+[decode] steps128_c1024_t1p45 generated 832/1024
+[decode] steps128_c1024_t1p45 generated 848/1024
+[decode] steps128_c1024_t1p45 generated 864/1024
+[decode] steps128_c1024_t1p45 generated 880/1024
+[decode] steps128_c1024_t1p45 generated 896/1024
+[decode] steps128_c1024_t1p45 generated 912/1024
+[decode] steps128_c1024_t1p45 generated 928/1024
+[decode] steps128_c1024_t1p45 generated 944/1024
+[decode] steps128_c1024_t1p45 generated 960/1024
+[decode] steps128_c1024_t1p45 generated 976/1024
+[decode] steps128_c1024_t1p45 generated 992/1024
+[decode] steps128_c1024_t1p45 generated 1008/1024
+[decode] steps128_c1024_t1p45 generated 1024/1024
+[summary] {"name": "steps128_c1024_t1p45", "step": 30000, "decode_steps": 128, "concentration_max": 1024.0, "raw_genppl": 42.11245193528327, "stripped_genppl": 57.07543476214156, "sample_entropy": 4.026958025870078, "distinct_1": 0.04402923583984375, "distinct_2": 0.3001584030511811, "top_token_mass": 0.0693817138671875, "raw_kept": 1024, "stripped_kept": 1024}
+[watch-classic] 2026-05-21_07:04:13 done step_0030000

LTA_openwebtext_dualt/logs/lm1b_classic_repro_infer_watch/processed_lta_lm1b_classic_c1024_fullvocab_len128_repro_save10k_gbs512_4gpu_1m_20260520_231005_steps128_c1024_t1p45_n1024.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+runs/lta_lm1b_classic_c1024_fullvocab_len128_repro_save10k_gbs512_4gpu_1m_20260520_231005/step_0010000.pt
+runs/lta_lm1b_classic_c1024_fullvocab_len128_repro_save10k_gbs512_4gpu_1m_20260520_231005/step_0020000.pt
+runs/lta_lm1b_classic_c1024_fullvocab_len128_repro_save10k_gbs512_4gpu_1m_20260520_231005/step_0030000.pt
+runs/lta_lm1b_classic_c1024_fullvocab_len128_repro_save10k_gbs512_4gpu_1m_20260520_231005/step_0040000.pt

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/bin/activate.bat ADDED Viewed

	@@ -0,0 +1,71 @@

+@REM Copyright (c) 2020-202x The virtualenv developers
+@REM
+@REM Permission is hereby granted, free of charge, to any person obtaining
+@REM a copy of this software and associated documentation files (the
+@REM "Software"), to deal in the Software without restriction, including
+@REM without limitation the rights to use, copy, modify, merge, publish,
+@REM distribute, sublicense, and/or sell copies of the Software, and to
+@REM permit persons to whom the Software is furnished to do so, subject to
+@REM the following conditions:
+@REM
+@REM The above copyright notice and this permission notice shall be
+@REM included in all copies or substantial portions of the Software.
+@REM
+@REM THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+@REM EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+@REM MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+@REM NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+@REM LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+@REM OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+@REM WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+@REM This file is UTF-8 encoded, so we need to update the current code page while executing it
+@for /f "tokens=2 delims=:." %%a in ('"%SystemRoot%\System32\chcp.com"') do @set _OLD_CODEPAGE=%%a
+@if defined _OLD_CODEPAGE (
+    "%SystemRoot%\System32\chcp.com" 65001 > nul
+)
+@for %%i in ("/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv") do @set "VIRTUAL_ENV=%%~fi"
+@set "VIRTUAL_ENV_PROMPT="
+@if NOT DEFINED VIRTUAL_ENV_PROMPT (
+    @for %%d in ("%VIRTUAL_ENV%") do @set "VIRTUAL_ENV_PROMPT=%%~nxd"
+)
+@if defined _OLD_VIRTUAL_PROMPT (
+    @set "PROMPT=%_OLD_VIRTUAL_PROMPT%"
+) else (
+    @if not defined PROMPT (
+        @set "PROMPT=$P$G"
+    )
+    @if not defined VIRTUAL_ENV_DISABLE_PROMPT (
+        @set "_OLD_VIRTUAL_PROMPT=%PROMPT%"
+    )
+)
+@if not defined VIRTUAL_ENV_DISABLE_PROMPT (
+    @set "PROMPT=(%VIRTUAL_ENV_PROMPT%) %PROMPT%"
+)
+@REM Don't use () to avoid problems with them in %PATH%
+@if defined _OLD_VIRTUAL_PYTHONHOME @goto ENDIFVHOME
+    @set "_OLD_VIRTUAL_PYTHONHOME=%PYTHONHOME%"
+:ENDIFVHOME
+@set PYTHONHOME=
+@REM if defined _OLD_VIRTUAL_PATH (
+@if not defined _OLD_VIRTUAL_PATH @goto ENDIFVPATH1
+    @set "PATH=%_OLD_VIRTUAL_PATH%"
+:ENDIFVPATH1
+@REM ) else (
+@if defined _OLD_VIRTUAL_PATH @goto ENDIFVPATH2
+    @set "_OLD_VIRTUAL_PATH=%PATH%"
+:ENDIFVPATH2
+@set "PATH=%VIRTUAL_ENV%\bin;%PATH%"
+@if defined _OLD_CODEPAGE (
+    "%SystemRoot%\System32\chcp.com" %_OLD_CODEPAGE% > nul
+    @set _OLD_CODEPAGE=
+)

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/bin/activate.fish ADDED Viewed

	@@ -0,0 +1,124 @@

+# Copyright (c) 2020-202x The virtualenv developers
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+# This file must be used using `source bin/activate.fish` *within a running fish ( http://fishshell.com ) session*.
+# Do not run it directly.
+function _bashify_path -d "Converts a fish path to something bash can recognize"
+    set fishy_path $argv
+    set bashy_path $fishy_path[1]
+    for path_part in $fishy_path[2..-1]
+        set bashy_path "$bashy_path:$path_part"
+    end
+    echo $bashy_path
+end
+function _fishify_path -d "Converts a bash path to something fish can recognize"
+    echo $argv | tr ':' '\n'
+end
+function deactivate -d 'Exit virtualenv mode and return to the normal environment.'
+    # reset old environment variables
+    if test -n "$_OLD_VIRTUAL_PATH"
+        # https://github.com/fish-shell/fish-shell/issues/436 altered PATH handling
+        if test (string sub -s 1 -l 1 $FISH_VERSION) -lt 3
+            set -gx PATH (_fishify_path "$_OLD_VIRTUAL_PATH")
+        else
+            set -gx PATH $_OLD_VIRTUAL_PATH
+        end
+        set -e _OLD_VIRTUAL_PATH
+    end
+    if test -n "$_OLD_VIRTUAL_PYTHONHOME"
+        set -gx PYTHONHOME "$_OLD_VIRTUAL_PYTHONHOME"
+        set -e _OLD_VIRTUAL_PYTHONHOME
+    end
+    if test -n "$_OLD_FISH_PROMPT_OVERRIDE"
+       and functions -q _old_fish_prompt
+        # Set an empty local `$fish_function_path` to allow the removal of `fish_prompt` using `functions -e`.
+        set -l fish_function_path
+        # Erase virtualenv's `fish_prompt` and restore the original.
+        functions -e fish_prompt
+        functions -c _old_fish_prompt fish_prompt
+        functions -e _old_fish_prompt
+        set -e _OLD_FISH_PROMPT_OVERRIDE
+    end
+    set -e VIRTUAL_ENV
+    set -e VIRTUAL_ENV_PROMPT
+    if test "$argv[1]" != 'nondestructive'
+        # Self-destruct!
+        functions -e pydoc
+        functions -e deactivate
+        functions -e _bashify_path
+        functions -e _fishify_path
+    end
+end
+# Unset irrelevant variables.
+deactivate nondestructive
+set -gx VIRTUAL_ENV '/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv'
+# https://github.com/fish-shell/fish-shell/issues/436 altered PATH handling
+if test (string sub -s 1 -l 1 $FISH_VERSION) -lt 3
+    set -gx _OLD_VIRTUAL_PATH (_bashify_path $PATH)
+else
+    set -gx _OLD_VIRTUAL_PATH $PATH
+end
+set -gx PATH "$VIRTUAL_ENV"'/bin' $PATH
+# Prompt override provided?
+# If not, just use the environment name.
+if test -n ''
+    set -gx VIRTUAL_ENV_PROMPT ''
+else
+    set -gx VIRTUAL_ENV_PROMPT (basename "$VIRTUAL_ENV")
+end
+# Unset `$PYTHONHOME` if set.
+if set -q PYTHONHOME
+    set -gx _OLD_VIRTUAL_PYTHONHOME $PYTHONHOME
+    set -e PYTHONHOME
+end
+function pydoc
+    python -m pydoc $argv
+end
+if test -z "$VIRTUAL_ENV_DISABLE_PROMPT"
+    # Copy the current `fish_prompt` function as `_old_fish_prompt`.
+    functions -c fish_prompt _old_fish_prompt
+    function fish_prompt
+        # Run the user's prompt first; it might depend on (pipe)status.
+        set -l prompt (_old_fish_prompt)
+        printf '(%s) ' $VIRTUAL_ENV_PROMPT
+        string join -- \n $prompt # handle multi-line prompts
+    end
+    set -gx _OLD_FISH_PROMPT_OVERRIDE "$VIRTUAL_ENV"
+end

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/bin/f2py ADDED Viewed

	@@ -0,0 +1,10 @@

+#!/e2e-data/evad-tech-vla/wanghan58/workspace/LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/bin/python3
+# -*- coding: utf-8 -*-
+import sys
+from numpy.f2py.f2py2e import main
+if __name__ == "__main__":
+    if sys.argv[0].endswith("-script.pyw"):
+        sys.argv[0] = sys.argv[0][:-11]
+    elif sys.argv[0].endswith(".exe"):
+        sys.argv[0] = sys.argv[0][:-4]
+    sys.exit(main())

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/bin/pydoc.bat ADDED Viewed

	@@ -0,0 +1,22 @@

+@REM Copyright (c) 2020-202x The virtualenv developers
+@REM
+@REM Permission is hereby granted, free of charge, to any person obtaining
+@REM a copy of this software and associated documentation files (the
+@REM "Software"), to deal in the Software without restriction, including
+@REM without limitation the rights to use, copy, modify, merge, publish,
+@REM distribute, sublicense, and/or sell copies of the Software, and to
+@REM permit persons to whom the Software is furnished to do so, subject to
+@REM the following conditions:
+@REM
+@REM The above copyright notice and this permission notice shall be
+@REM included in all copies or substantial portions of the Software.
+@REM
+@REM THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+@REM EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+@REM MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+@REM NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+@REM LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+@REM OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+@REM WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+python.exe -m pydoc %*

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/audio_utils.py ADDED Viewed

	@@ -0,0 +1,1254 @@

+# Copyright 2023 The HuggingFace Inc. team and the librosa & torchaudio authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Audio processing functions to extract features from audio waveforms. This code is pure numpy to support all frameworks
+and remove unnecessary dependencies.
+"""
+import base64
+import importlib
+import io
+import os
+import warnings
+from collections.abc import Sequence
+from io import BytesIO
+from typing import TYPE_CHECKING, Any, Union
+import httpx
+import numpy as np
+from packaging import version
+from .utils import (
+    is_librosa_available,
+    is_numpy_array,
+    is_soundfile_available,
+    is_torch_tensor,
+    is_torchcodec_available,
+    requires_backends,
+)
+from .utils.generic import retry
+if TYPE_CHECKING:
+    import torch
+if is_soundfile_available():
+    import soundfile as sf
+if is_librosa_available():
+    import librosa
+    # TODO: @eustlb, we actually don't need librosa but soxr is installed with librosa
+    import soxr
+if is_torchcodec_available():
+    TORCHCODEC_VERSION = version.parse(importlib.metadata.version("torchcodec"))
+AudioInput = Union[np.ndarray, "torch.Tensor", Sequence[np.ndarray], Sequence["torch.Tensor"]]
+@retry(exceptions=(httpx.HTTPError,))
+def _fetch_audio_bytes(url: str, timeout: float | None = 10.0) -> bytes:
+    """Fetch audio bytes from a URL with automatic retry and exponential backoff."""
+    response = httpx.get(url, follow_redirects=True, timeout=timeout)
+    response.raise_for_status()
+    return response.content
+def load_audio(audio: str | np.ndarray, sampling_rate=16000, timeout=None) -> np.ndarray:
+    """
+    Loads `audio` to an np.ndarray object.
+    Args:
+        audio (`str` or `np.ndarray`):
+            The audio to be loaded to the numpy array format.
+        sampling_rate (`int`, *optional*, defaults to 16000):
+            The sampling rate to be used when loading the audio. It should be the same as the
+            sampling rate that the model you will be using was trained with.
+        timeout (`float`, *optional*):
+            The timeout value in seconds for the URL request.
+    Returns:
+        `np.ndarray`: A numpy array representing the audio.
+    """
+    if isinstance(audio, str):
+        # Try to load with `torchcodec`, but do not force users to install it. If it is not found,
+        # fall back to `librosa`. If using an audio-only model, most probably `torchcodec` won't be
+        # needed. Do not raise any errors if it is not installed or the versions do not match.
+        if is_torchcodec_available() and version.parse("0.3.0") <= TORCHCODEC_VERSION:
+            audio = load_audio_torchcodec(audio, sampling_rate=sampling_rate, timeout=timeout)
+        elif audio.rsplit("?", 1)[0].lower().endswith((".mp4", ".mkv", ".avi", ".mov", ".webm", ".flv", ".wmv")):
+            raise RuntimeError(
+                f"The audio source appears to be a video file ('{audio.split('/')[-1]}'). "
+                "librosa cannot decode video containers. "
+                "Install torchcodec>=0.3.0 (`pip install torchcodec`) to load audio from video files."
+            )
+        else:
+            audio = load_audio_librosa(audio, sampling_rate=sampling_rate, timeout=timeout)
+    elif not isinstance(audio, np.ndarray):
+        raise TypeError(
+            "Incorrect format used for `audio`. Should be an url linking to an audio, a local path, or numpy array."
+        )
+    return audio
+def load_audio_torchcodec(audio: str | np.ndarray, sampling_rate=16000, timeout=None) -> np.ndarray:
+    """
+    Loads `audio` to an np.ndarray object using `torchcodec`.
+    Args:
+        audio (`str` or `np.ndarray`):
+            The audio to be loaded to the numpy array format.
+        sampling_rate (`int`, *optional*, defaults to 16000):
+            The sampling rate to be used when loading the audio. It should be the same as the
+            sampling rate that the model you will be using was trained with.
+        timeout (`float`, *optional*):
+            The timeout value in seconds for the URL request.
+    Returns:
+        `np.ndarray`: A numpy array representing the audio.
+    """
+    # Lazy import so that issues in torchcodec compatibility don't crash the whole library
+    requires_backends(load_audio_torchcodec, ["torchcodec"])
+    from torchcodec.decoders import AudioDecoder
+    # Fetch bytes for URLs ourselves so we get retry logic; torchcodec does not expose ffmpeg's network retry options
+    if isinstance(audio, str) and audio.startswith(("http://", "https://")):
+        audio = _fetch_audio_bytes(audio, timeout=timeout)
+    # Set `num_channels` to `1`, which is what most models expect and is the default in librosa
+    decoder = AudioDecoder(audio, sample_rate=sampling_rate, num_channels=1)
+    audio = decoder.get_all_samples().data[0].numpy()  # NOTE: feature extractors don't accept torch tensors
+    return audio
+def load_audio_librosa(audio: str | np.ndarray, sampling_rate=16000, timeout=None) -> np.ndarray:
+    """
+    Loads `audio` to an np.ndarray object using `librosa`.
+    Args:
+        audio (`str` or `np.ndarray`):
+            The audio to be loaded to the numpy array format.
+        sampling_rate (`int`, *optional*, defaults to 16000):
+            The sampling rate to be used when loading the audio. It should be the same as the
+            sampling rate that the model you will be using was trained with.
+        timeout (`float`, *optional*):
+            The timeout value in seconds for the URL request.
+    Returns:
+        `np.ndarray`: A numpy array representing the audio.
+    """
+    requires_backends(load_audio_librosa, ["librosa"])
+    # Load audio from URL (e.g https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav)
+    if audio.startswith("http://") or audio.startswith("https://"):
+        audio = librosa.load(BytesIO(_fetch_audio_bytes(audio, timeout=timeout)), sr=sampling_rate)[0]
+    elif os.path.isfile(audio):
+        audio = librosa.load(audio, sr=sampling_rate)[0]
+    return audio
+def load_audio_as(
+    audio: str,
+    return_format: str,
+    timeout: int | None = None,
+    force_mono: bool = False,
+    sampling_rate: int | None = None,
+) -> str | dict[str, Any] | io.BytesIO | None:
+    """
+    Load audio from either a local file path or URL and return in specified format.
+    Args:
+        audio (`str`): Either a local file path or a URL to an audio file
+        return_format (`str`): Format to return the audio in:
+            - "base64": Base64 encoded string
+            - "dict": Dictionary with data and format
+            - "buffer": BytesIO object
+        timeout (`int`, *optional*): Timeout for URL requests in seconds
+        force_mono (`bool`): Whether to convert stereo audio to mono
+        sampling_rate (`int`, *optional*): If provided, the audio will be resampled to the specified sampling rate.
+    Returns:
+        `Union[str, Dict[str, Any], io.BytesIO, None]`:
+            - `str`: Base64 encoded audio data (if return_format="base64")
+            - `dict`: Dictionary with 'data' (base64 encoded audio data) and 'format' keys (if return_format="dict")
+            - `io.BytesIO`: BytesIO object containing audio data (if return_format="buffer")
+    """
+    requires_backends(load_audio_as, ["librosa"])
+    if return_format not in ["base64", "dict", "buffer"]:
+        raise ValueError(f"Invalid return_format: {return_format}. Must be 'base64', 'dict', or 'buffer'")
+    try:
+        # Load audio bytes from URL or file
+        audio_bytes = None
+        if audio.startswith(("http://", "https://")):
+            audio_bytes = _fetch_audio_bytes(audio, timeout=timeout)
+        elif os.path.isfile(audio):
+            with open(audio, "rb") as audio_file:
+                audio_bytes = audio_file.read()
+        else:
+            raise ValueError(f"File not found: {audio}")
+        # Process audio data
+        with io.BytesIO(audio_bytes) as audio_file:
+            with sf.SoundFile(audio_file) as f:
+                audio_array = f.read(dtype="float32")
+                original_sr = f.samplerate
+                audio_format = f.format
+                if sampling_rate is not None and sampling_rate != original_sr:
+                    # Resample audio to target sampling rate
+                    audio_array = soxr.resample(audio_array, original_sr, sampling_rate, quality="HQ")
+                else:
+                    sampling_rate = original_sr
+        # Convert to mono if needed
+        if force_mono and audio_array.ndim != 1:
+            audio_array = audio_array.mean(axis=1)
+        buffer = io.BytesIO()
+        sf.write(buffer, audio_array, sampling_rate, format=audio_format.upper())
+        buffer.seek(0)
+        if return_format == "buffer":
+            return buffer
+        elif return_format == "base64":
+            return base64.b64encode(buffer.read()).decode("utf-8")
+        elif return_format == "dict":
+            return {
+                "data": base64.b64encode(buffer.read()).decode("utf-8"),
+                "format": audio_format.lower(),
+            }
+    except Exception as e:
+        raise ValueError(f"Error loading audio: {e}")
+def conv1d_output_length(module: "torch.nn.Conv1d", input_length: int) -> int:
+    """
+    Computes the output length of a 1D convolution layer according to torch's documentation:
+    https://docs.pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+    """
+    return int(
+        (input_length + 2 * module.padding[0] - module.dilation[0] * (module.kernel_size[0] - 1) - 1)
+        / module.stride[0]
+        + 1
+    )
+def is_valid_audio(audio):
+    return is_numpy_array(audio) or is_torch_tensor(audio)
+def is_valid_list_of_audio(audio):
+    return audio and all(is_valid_audio(audio_i) for audio_i in audio)
+def make_list_of_audio(
+    audio: list[AudioInput] | AudioInput,
+) -> AudioInput:
+    """
+    Ensure that the output is a list of audio.
+    Args:
+        audio (`Union[list[AudioInput], AudioInput]`):
+            The input audio.
+    Returns:
+        list: A list of audio.
+    """
+    # If it's a list of audios, it's already in the right format
+    if isinstance(audio, (list, tuple)) and is_valid_list_of_audio(audio):
+        return audio
+    # If it's a single audio, wrap it in a one-element list
+    if is_valid_audio(audio):
+        return [audio]
+    raise ValueError("Invalid input type. Must be a single audio or a list of audio")
+def hertz_to_mel(freq: float | np.ndarray, mel_scale: str = "htk") -> float | np.ndarray:
+    """
+    Convert frequency from hertz to mels.
+    Args:
+        freq (`float` or `np.ndarray`):
+            The frequency, or multiple frequencies, in hertz (Hz).
+        mel_scale (`str`, *optional*, defaults to `"htk"`):
+            The mel frequency scale to use, `"htk"`, `"kaldi"` or `"slaney"`.
+    Returns:
+        `float` or `np.ndarray`: The frequencies on the mel scale.
+    """
+    if mel_scale not in ["slaney", "htk", "kaldi"]:
+        raise ValueError('mel_scale should be one of "htk", "slaney" or "kaldi".')
+    if mel_scale == "htk":
+        return 2595.0 * np.log10(1.0 + (freq / 700.0))
+    elif mel_scale == "kaldi":
+        return 1127.0 * np.log(1.0 + (freq / 700.0))
+    min_log_hertz = 1000.0
+    min_log_mel = 15.0
+    logstep = 27.0 / np.log(6.4)
+    mels = 3.0 * freq / 200.0
+    if isinstance(freq, np.ndarray):
+        log_region = freq >= min_log_hertz
+        mels[log_region] = min_log_mel + np.log(freq[log_region] / min_log_hertz) * logstep
+    elif freq >= min_log_hertz:
+        mels = min_log_mel + np.log(freq / min_log_hertz) * logstep
+    return mels
+def mel_to_hertz(mels: float | np.ndarray, mel_scale: str = "htk") -> float | np.ndarray:
+    """
+    Convert frequency from mels to hertz.
+    Args:
+        mels (`float` or `np.ndarray`):
+            The frequency, or multiple frequencies, in mels.
+        mel_scale (`str`, *optional*, defaults to `"htk"`):
+            The mel frequency scale to use, `"htk"`, `"kaldi"` or `"slaney"`.
+    Returns:
+        `float` or `np.ndarray`: The frequencies in hertz.
+    """
+    if mel_scale not in ["slaney", "htk", "kaldi"]:
+        raise ValueError('mel_scale should be one of "htk", "slaney" or "kaldi".')
+    if mel_scale == "htk":
+        return 700.0 * (np.power(10, mels / 2595.0) - 1.0)
+    elif mel_scale == "kaldi":
+        return 700.0 * (np.exp(mels / 1127.0) - 1.0)
+    min_log_hertz = 1000.0
+    min_log_mel = 15.0
+    logstep = np.log(6.4) / 27.0
+    freq = 200.0 * mels / 3.0
+    if isinstance(mels, np.ndarray):
+        log_region = mels >= min_log_mel
+        freq[log_region] = min_log_hertz * np.exp(logstep * (mels[log_region] - min_log_mel))
+    elif mels >= min_log_mel:
+        freq = min_log_hertz * np.exp(logstep * (mels - min_log_mel))
+    return freq
+def hertz_to_octave(freq: float | np.ndarray, tuning: float = 0.0, bins_per_octave: int = 12):
+    """
+    Convert frequency from hertz to fractional octave numbers.
+    Adapted from *librosa*.
+    Args:
+        freq (`float` or `np.ndarray`):
+            The frequency, or multiple frequencies, in hertz (Hz).
+        tuning (`float`, defaults to `0.`):
+            Tuning deviation from the Stuttgart pitch (A440) in (fractional) bins per octave.
+        bins_per_octave (`int`, defaults to `12`):
+            Number of bins per octave.
+    Returns:
+        `float` or `np.ndarray`: The frequencies on the octave scale.
+    """
+    stuttgart_pitch = 440.0 * 2.0 ** (tuning / bins_per_octave)
+    octave = np.log2(freq / (float(stuttgart_pitch) / 16))
+    return octave
+def _create_triangular_filter_bank(fft_freqs: np.ndarray, filter_freqs: np.ndarray) -> np.ndarray:
+    """
+    Creates a triangular filter bank.
+    Adapted from *torchaudio* and *librosa*.
+    Args:
+        fft_freqs (`np.ndarray` of shape `(num_frequency_bins,)`):
+            Discrete frequencies of the FFT bins in Hz.
+        filter_freqs (`np.ndarray` of shape `(num_mel_filters,)`):
+            Center frequencies of the triangular filters to create, in Hz.
+    Returns:
+        `np.ndarray` of shape `(num_frequency_bins, num_mel_filters)`
+    """
+    filter_diff = np.diff(filter_freqs)
+    slopes = np.expand_dims(filter_freqs, 0) - np.expand_dims(fft_freqs, 1)
+    down_slopes = -slopes[:, :-2] / filter_diff[:-1]
+    up_slopes = slopes[:, 2:] / filter_diff[1:]
+    return np.maximum(np.zeros(1), np.minimum(down_slopes, up_slopes))
+def chroma_filter_bank(
+    num_frequency_bins: int,
+    num_chroma: int,
+    sampling_rate: int,
+    tuning: float = 0.0,
+    power: float | None = 2.0,
+    weighting_parameters: tuple[float, float] | None = (5.0, 2.0),
+    start_at_c_chroma: bool = True,
+):
+    """
+    Creates a chroma filter bank, i.e. a linear transformation to project spectrogram bins onto chroma bins.
+    Adapted from *librosa*.
+    Args:
+        num_frequency_bins (`int`):
+            Number of frequencies used to compute the spectrogram (should be the same as in `stft`).
+        num_chroma (`int`):
+            Number of chroma bins (i.e. pitch classes).
+        sampling_rate (`int`):
+            Sample rate of the audio waveform.
+        tuning (`float`):
+            Tuning deviation from A440 in fractions of a chroma bin.
+        power (`float`, *optional*, defaults to 2.0):
+            If 2.0, normalizes each column by its L2 norm. If 1.0, normalizes each column by its L1 norm. If `None`,
+            no normalization is applied.
+        weighting_parameters (`tuple[float, float]`, *optional*, defaults to `(5., 2.)`):
+            If specified, apply a Gaussian weighting parameterized by the first element of the tuple being the center and
+            the second element being the Gaussian half-width.
+        start_at_c_chroma (`bool`, *optional*, defaults to `True`):
+            If True, the filter bank will start at the 'C' pitch class. Otherwise, it will start at 'A'.
+    Returns:
+        `np.ndarray` of shape `(num_frequency_bins, num_chroma)`
+    """
+    # Get the FFT bins, not counting the DC component
+    frequencies = np.linspace(0, sampling_rate, num_frequency_bins, endpoint=False)[1:]
+    freq_bins = num_chroma * hertz_to_octave(frequencies, tuning=tuning, bins_per_octave=num_chroma)
+    # make up a value for the 0 Hz bin = 1.5 octaves below bin 1
+    # (so chroma is 50% rotated from bin 1, and bin width is broad)
+    freq_bins = np.concatenate(([freq_bins[0] - 1.5 * num_chroma], freq_bins))
+    bins_width = np.concatenate((np.maximum(freq_bins[1:] - freq_bins[:-1], 1.0), [1]))
+    chroma_filters = np.subtract.outer(freq_bins, np.arange(0, num_chroma, dtype="d")).T
+    num_chroma2 = np.round(float(num_chroma) / 2)
+    # Project into range -num_chroma/2 .. num_chroma/2
+    # add on fixed offset of 10*num_chroma to ensure all values passed to
+    # rem are positive
+    chroma_filters = np.remainder(chroma_filters + num_chroma2 + 10 * num_chroma, num_chroma) - num_chroma2
+    # Gaussian bumps - 2*D to make them narrower
+    chroma_filters = np.exp(-0.5 * (2 * chroma_filters / np.tile(bins_width, (num_chroma, 1))) ** 2)
+    # normalize each column
+    if power is not None:
+        chroma_filters = chroma_filters / np.sum(chroma_filters**power, axis=0, keepdims=True) ** (1.0 / power)
+    # Maybe apply scaling for fft bins
+    if weighting_parameters is not None:
+        center, half_width = weighting_parameters
+        chroma_filters *= np.tile(
+            np.exp(-0.5 * (((freq_bins / num_chroma - center) / half_width) ** 2)),
+            (num_chroma, 1),
+        )
+    if start_at_c_chroma:
+        chroma_filters = np.roll(chroma_filters, -3 * (num_chroma // 12), axis=0)
+    # remove aliasing columns, copy to ensure row-contiguity
+    return np.ascontiguousarray(chroma_filters[:, : int(1 + num_frequency_bins / 2)])
+def mel_filter_bank(
+    num_frequency_bins: int,
+    num_mel_filters: int,
+    min_frequency: float,
+    max_frequency: float,
+    sampling_rate: int,
+    norm: str | None = None,
+    mel_scale: str = "htk",
+    triangularize_in_mel_space: bool = False,
+) -> np.ndarray:
+    """
+    Creates a frequency bin conversion matrix used to obtain a mel spectrogram. This is called a *mel filter bank*, and
+    various implementations exist, which differ in the number of filters, the shape of the filters, the way the filters
+    are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The goal of these
+    features is to approximate the non-linear human perception of the variation in pitch with respect to the frequency.
+    Different banks of mel filters were introduced in the literature. The following variations are supported:
+    - MFCC FB-20: introduced in 1980 by Davis and Mermelstein, it assumes a sampling frequency of 10 kHz and a speech
+      bandwidth of `[0, 4600]` Hz.
+    - MFCC FB-24 HTK: from the Cambridge HMM Toolkit (HTK) (1995) uses a filter bank of 24 filters for a speech
+      bandwidth of `[0, 8000]` Hz. This assumes sampling rate ≥ 16 kHz.
+    - MFCC FB-40: from the Auditory Toolbox for MATLAB written by Slaney in 1998, assumes a sampling rate of 16 kHz and
+      speech bandwidth of `[133, 6854]` Hz. This version also includes area normalization.
+    - HFCC-E FB-29 (Human Factor Cepstral Coefficients) of Skowronski and Harris (2004), assumes a sampling rate of
+      12.5 kHz and speech bandwidth of `[0, 6250]` Hz.
+    This code is adapted from *torchaudio* and *librosa*. Note that the default parameters of torchaudio's
+    `melscale_fbanks` implement the `"htk"` filters while librosa uses the `"slaney"` implementation.
+    Args:
+        num_frequency_bins (`int`):
+            Number of frequency bins (should be the same as `n_fft // 2 + 1` where `n_fft` is the size of the Fourier Transform used to compute the spectrogram).
+        num_mel_filters (`int`):
+            Number of mel filters to generate.
+        min_frequency (`float`):
+            Lowest frequency of interest in Hz.
+        max_frequency (`float`):
+            Highest frequency of interest in Hz. This should not exceed `sampling_rate / 2`.
+        sampling_rate (`int`):
+            Sample rate of the audio waveform.
+        norm (`str`, *optional*):
+            If `"slaney"`, divide the triangular mel weights by the width of the mel band (area normalization).
+        mel_scale (`str`, *optional*, defaults to `"htk"`):
+            The mel frequency scale to use, `"htk"`, `"kaldi"` or `"slaney"`.
+        triangularize_in_mel_space (`bool`, *optional*, defaults to `False`):
+            If this option is enabled, the triangular filter is applied in mel space rather than frequency space. This
+            should be set to `true` in order to get the same results as `torchaudio` when computing mel filters.
+    Returns:
+        `np.ndarray` of shape (`num_frequency_bins`, `num_mel_filters`): Triangular filter bank matrix. This is a
+        projection matrix to go from a spectrogram to a mel spectrogram.
+    """
+    if norm is not None and norm != "slaney":
+        raise ValueError('norm must be one of None or "slaney"')
+    if num_frequency_bins < 2:
+        raise ValueError(f"Require num_frequency_bins: {num_frequency_bins} >= 2")
+    if min_frequency > max_frequency:
+        raise ValueError(f"Require min_frequency: {min_frequency} <= max_frequency: {max_frequency}")
+    # center points of the triangular mel filters
+    mel_min = hertz_to_mel(min_frequency, mel_scale=mel_scale)
+    mel_max = hertz_to_mel(max_frequency, mel_scale=mel_scale)
+    mel_freqs = np.linspace(mel_min, mel_max, num_mel_filters + 2)
+    filter_freqs = mel_to_hertz(mel_freqs, mel_scale=mel_scale)
+    if triangularize_in_mel_space:
+        # frequencies of FFT bins in Hz, but filters triangularized in mel space
+        fft_bin_width = sampling_rate / ((num_frequency_bins - 1) * 2)
+        fft_freqs = hertz_to_mel(fft_bin_width * np.arange(num_frequency_bins), mel_scale=mel_scale)
+        filter_freqs = mel_freqs
+    else:
+        # frequencies of FFT bins in Hz
+        fft_freqs = np.linspace(0, sampling_rate // 2, num_frequency_bins)
+    mel_filters = _create_triangular_filter_bank(fft_freqs, filter_freqs)
+    if norm is not None and norm == "slaney":
+        # Slaney-style mel is scaled to be approx constant energy per channel
+        enorm = 2.0 / (filter_freqs[2 : num_mel_filters + 2] - filter_freqs[:num_mel_filters])
+        mel_filters *= np.expand_dims(enorm, 0)
+    if (mel_filters.max(axis=0) == 0.0).any():
+        warnings.warn(
+            "At least one mel filter has all zero values. "
+            f"The value for `num_mel_filters` ({num_mel_filters}) may be set too high. "
+            f"Or, the value for `num_frequency_bins` ({num_frequency_bins}) may be set too low."
+        )
+    return mel_filters
+def optimal_fft_length(window_length: int) -> int:
+    """
+    Finds the best FFT input size for a given `window_length`. This function takes a given window length and, if not
+    already a power of two, rounds it up to the next power of two.
+    The FFT algorithm works fastest when the length of the input is a power of two, which may be larger than the size
+    of the window or analysis frame. For example, if the window is 400 samples, using an FFT input size of 512 samples
+    is more optimal than an FFT size of 400 samples. Using a larger FFT size does not affect the detected frequencies,
+    it simply gives a higher frequency resolution (i.e. the frequency bins are smaller).
+    Args:
+        window_length (`int`):
+            The length of the window in samples. Must be at least 1.
+    Returns:
+        `int`: The smallest power of two that is greater than or equal to `window_length`.
+    Raises:
+        ValueError: If `window_length` is smaller than 1.
+    """
+    # A logarithm of zero or of a negative number is undefined, so reject those inputs with a clear message instead
+    # of letting the conversion of `-inf` / `nan` to `int` fail further down.
+    if window_length < 1:
+        raise ValueError(f"window_length must be at least 1, got {window_length}")
+    # `(n - 1).bit_length()` is the exponent of the smallest power of two that is >= n, computed with exact integer
+    # arithmetic rather than a floating-point `log2` round trip.
+    return 1 << (int(np.ceil(window_length)) - 1).bit_length()
+def window_function(
+    window_length: int,
+    name: str = "hann",
+    periodic: bool = True,
+    frame_length: int | None = None,
+    center: bool = True,
+) -> np.ndarray:
+    """
+    Builds a window array of the requested type, optionally zero-padded to the analysis frame size. The result is
+    meant to be passed to `spectrogram` / `spectrogram_batch`.
+    Supported window types:
+        - `"boxcar"`: a rectangular window (all ones)
+        - `"hamming"` (alias `"hamming_window"`): the Hamming window
+        - `"hann"` (alias `"hann_window"`): the Hann window
+        - `"povey"`: the Povey window, i.e. the Hann window raised to the power 0.85
+    Args:
+        window_length (`int`):
+            The length of the window in samples.
+        name (`str`, *optional*, defaults to `"hann"`):
+            The name of the window function.
+        periodic (`bool`, *optional*, defaults to `True`):
+            Whether the window is periodic or symmetric. A periodic window is computed as a symmetric window of
+            `window_length + 1` samples with the last sample removed.
+        frame_length (`int`, *optional*):
+            The length of the analysis frames in samples. Provide a value for `frame_length` if the window is smaller
+            than the frame length, so that it will be zero-padded.
+        center (`bool`, *optional*, defaults to `True`):
+            Whether to center the window inside the padded frame (otherwise it is placed at the start). Only used
+            when `frame_length` is provided.
+    Returns:
+        `np.ndarray` of shape `(window_length,)` or `(frame_length,)` containing the window.
+    Raises:
+        ValueError: If `name` is not a supported window type, or if `window_length` exceeds `frame_length`.
+    """
+    generators = {
+        "boxcar": np.ones,
+        "hamming": np.hamming,
+        "hamming_window": np.hamming,
+        "hann": np.hanning,
+        "hann_window": np.hanning,
+        "povey": lambda num_samples: np.power(np.hanning(num_samples), 0.85),
+    }
+    generator = generators.get(name)
+    if generator is None:
+        raise ValueError(f"Unknown window function '{name}'")
+    # The NumPy generators are symmetric; a periodic window needs one extra point that is dropped afterwards.
+    num_points = window_length + 1 if periodic else window_length
+    samples = generator(num_points)
+    if periodic:
+        samples = samples[:-1]
+    if frame_length is None:
+        return samples
+    if window_length > frame_length:
+        raise ValueError(
+            f"Length of the window ({window_length}) may not be larger than frame_length ({frame_length})"
+        )
+    # Embed the window into a zero buffer of the frame size, either centered or left-aligned.
+    start = (frame_length - window_length) // 2 if center else 0
+    framed = np.zeros(frame_length)
+    framed[start : start + window_length] = samples
+    return framed
+# Note: This method processes a single waveform. For batch processing, use spectrogram_batch().
+def spectrogram(
+    waveform: np.ndarray,
+    window: np.ndarray,
+    frame_length: int,
+    hop_length: int,
+    fft_length: int | None = None,
+    power: float | None = 1.0,
+    center: bool = True,
+    pad_mode: str = "reflect",
+    onesided: bool = True,
+    dither: float = 0.0,
+    preemphasis: float | None = None,
+    mel_filters: np.ndarray | None = None,
+    mel_floor: float = 1e-10,
+    log_mel: str | None = None,
+    reference: float = 1.0,
+    min_value: float = 1e-10,
+    db_range: float | None = None,
+    remove_dc_offset: bool = False,
+    dtype: np.dtype = np.float32,
+) -> np.ndarray:
+    """
+    Calculates a spectrogram over one waveform using the Short-Time Fourier Transform.
+    This function can create the following kinds of spectrograms:
+      - amplitude spectrogram (`power = 1.0`)
+      - power spectrogram (`power = 2.0`)
+      - complex-valued spectrogram (`power = None`)
+      - log spectrogram (use `log_mel` argument)
+      - mel spectrogram (provide `mel_filters`)
+      - log-mel spectrogram (provide `mel_filters` and `log_mel`)
+    How this works:
+      1. The input waveform is split into frames of size `frame_length` that are partially overlapping by `frame_length
+         - hop_length` samples.
+      2. Each frame is multiplied by the window and placed into a buffer of size `fft_length`.
+      3. The DFT is taken of each windowed frame.
+      4. The results are stacked into a spectrogram.
+    We make a distinction between the following "blocks" of sample data, each of which may have a different length:
+      - The analysis frame. This is the size of the time slices that the input waveform is split into.
+      - The window. Each analysis frame is multiplied by the window to avoid spectral leakage.
+      - The FFT input buffer. The length of this determines how many frequency bins are in the spectrogram.
+    In this implementation, the window is assumed to be zero-padded to have the same size as the analysis frame. A
+    padded window can be obtained from `window_function()`. The FFT input buffer may be larger than the analysis frame,
+    typically the next power of two.
+    Note: This function is not optimized for speed yet. It should be mostly compatible with `librosa.stft` and
+    `torchaudio.functional.transforms.Spectrogram`, although it is more flexible due to the different ways spectrograms
+    can be constructed.
+    Args:
+        waveform (`np.ndarray` of shape `(length,)`):
+            The input waveform. This must be a single real-valued, mono waveform.
+        window (`np.ndarray` of shape `(frame_length,)`):
+            The windowing function to apply, including zero-padding if necessary. The actual window length may be
+            shorter than `frame_length`, but we're assuming the array has already been zero-padded.
+        frame_length (`int`):
+            The length of the analysis frames in samples. With librosa this is always equal to `fft_length` but we also
+            allow smaller sizes.
+        hop_length (`int`):
+            The stride between successive analysis frames in samples.
+        fft_length (`int`, *optional*):
+            The size of the FFT buffer in samples. This determines how many frequency bins the spectrogram will have.
+            For optimal speed, this should be a power of two. If `None`, uses `frame_length`.
+        power (`float`, *optional*, defaults to 1.0):
+            If 1.0, returns the amplitude spectrogram. If 2.0, returns the power spectrogram. If `None`, returns
+            complex numbers.
+        center (`bool`, *optional*, defaults to `True`):
+            Whether to pad the waveform so that frame `t` is centered around time `t * hop_length`. If `False`, frame
+            `t` will start at time `t * hop_length`.
+        pad_mode (`str`, *optional*, defaults to `"reflect"`):
+            Padding mode used when `center` is `True`. Possible values are: `"constant"` (pad with zeros), `"edge"`
+            (pad with edge values), `"reflect"` (pads with mirrored values).
+        onesided (`bool`, *optional*, defaults to `True`):
+            If True, only computes the positive frequencies and returns a spectrogram containing `fft_length // 2 + 1`
+            frequency bins. If False, also computes the negative frequencies and returns `fft_length` frequency bins.
+        dither (`float`, *optional*, defaults to 0.0):
+            Adds dithering. In other words, adds a small Gaussian noise to each frame.
+            E.g. use 4.0 to add dithering with a normal distribution centered
+            around 0.0 with standard deviation 4.0, 0.0 means no dithering.
+            Dithering has similar effect as `mel_floor`. It reduces the high log_mel_fbank
+            values for signals with hard-zero sections, when VAD cutoff is present in the signal.
+            The noise is drawn from NumPy's global random state, so results are not reproducible unless it is seeded.
+        preemphasis (`float`, *optional*):
+            Coefficient of the first-order pre-emphasis filter applied to each frame before windowing: every sample
+            has `preemphasis` times the previous sample of the frame subtracted from it, and the first sample of the
+            frame is scaled by `1 - preemphasis`.
+        mel_filters (`np.ndarray` of shape `(num_freq_bins, num_mel_filters)`, *optional*):
+            The mel filter bank. If supplied, applies this filter bank to create a mel spectrogram.
+        mel_floor (`float`, *optional*, defaults to 1e-10):
+            Minimum value of mel frequency banks.
+        log_mel (`str`, *optional*):
+            How to convert the spectrogram to log scale. Possible options are: `None` (don't convert), `"log"` (take
+            the natural logarithm) `"log10"` (take the base-10 logarithm), `"dB"` (convert to decibels, which requires
+            `power` to be 1.0 or 2.0). Ignored when `power` is `None`.
+        reference (`float`, *optional*, defaults to 1.0):
+            Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set
+            the loudest part to 0 dB. Must be greater than zero.
+        min_value (`float`, *optional*, defaults to `1e-10`):
+            The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking
+            `log(0)`. For a power spectrogram, the default of `1e-10` corresponds to a minimum of -100 dB. For an
+            amplitude spectrogram, the value `1e-5` corresponds to -100 dB. Must be greater than zero.
+        db_range (`float`, *optional*):
+            Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the
+            peak value and the smallest value will never be more than 80 dB. Must be greater than zero.
+        remove_dc_offset (`bool`, *optional*):
+            Subtract mean from waveform on each frame, applied before pre-emphasis. This should be set to `true` in
+            order to get the same results as `torchaudio.compliance.kaldi.fbank` when computing mel filters.
+        dtype (`np.dtype`, *optional*, defaults to `np.float32`):
+            Data type the spectrogram is cast to after the log conversion. The cast is only performed when both
+            `power` and `log_mel` are set; otherwise the array is returned without casting. If `power` is None the
+            spectrogram is `np.complex64`.
+    Returns:
+        `np.ndarray` containing a spectrogram of shape `(num_frequency_bins, length)` for a regular spectrogram or
+        shape `(num_mel_filters, length)` for a mel spectrogram.
+    Raises:
+        ValueError: If the frame, window, FFT or hop lengths are inconsistent, if the waveform is not a real-valued
+            1-D array, if `mel_filters` is combined with `power=None`, or if `log_mel` is unknown or `"dB"` is
+            requested with a `power` other than 1.0 or 2.0.
+    """
+    window_length = len(window)
+    if fft_length is None:
+        fft_length = frame_length
+    if frame_length > fft_length:
+        raise ValueError(f"frame_length ({frame_length}) may not be larger than fft_length ({fft_length})")
+    if window_length != frame_length:
+        raise ValueError(f"Length of the window ({window_length}) must equal frame_length ({frame_length})")
+    if hop_length <= 0:
+        raise ValueError("hop_length must be greater than zero")
+    if waveform.ndim != 1:
+        raise ValueError(f"Input waveform must have only one dimension, shape is {waveform.shape}")
+    if np.iscomplexobj(waveform):
+        raise ValueError("Complex-valued input waveforms are not currently supported")
+    if power is None and mel_filters is not None:
+        raise ValueError(
+            "You have provided `mel_filters` but `power` is `None`. Mel spectrogram computation is not yet supported "
+            "for complex-valued spectrogram. Specify `power` to fix this issue."
+        )
+    # Validate the log conversion request before the (comparatively expensive) STFT loop so that a typo in `log_mel`
+    # fails immediately. The log conversion is skipped entirely for complex spectrograms (`power is None`).
+    apply_log = power is not None and log_mel is not None
+    if apply_log:
+        if log_mel not in ("log", "log10", "dB"):
+            raise ValueError(f"Unknown log_mel option: {log_mel}")
+        if log_mel == "dB" and power not in (1.0, 2.0):
+            raise ValueError(f"Cannot use log_mel option '{log_mel}' with power {power}")
+    # center pad the waveform
+    if center:
+        half_frame = int(frame_length // 2)
+        waveform = np.pad(waveform, [(half_frame, half_frame)], mode=pad_mode)
+    # promote to float64, since np.fft uses float64 internally
+    waveform = waveform.astype(np.float64)
+    window = window.astype(np.float64)
+    # split waveform into frames of frame_length size
+    num_frames = int(1 + np.floor((waveform.size - frame_length) / hop_length))
+    num_frequency_bins = (fft_length // 2) + 1 if onesided else fft_length
+    stft = np.empty((num_frames, num_frequency_bins), dtype=np.complex64)
+    # rfft is faster than fft
+    fft_func = np.fft.rfft if onesided else np.fft.fft
+    # The tail of the buffer beyond `frame_length` is never written, so it stays zero (FFT zero-padding).
+    buffer = np.zeros(fft_length)
+    for frame_idx in range(num_frames):
+        start = frame_idx * hop_length
+        buffer[:frame_length] = waveform[start : start + frame_length]
+        if dither != 0.0:
+            buffer[:frame_length] += dither * np.random.randn(frame_length)
+        if remove_dc_offset:
+            buffer[:frame_length] = buffer[:frame_length] - buffer[:frame_length].mean()
+        if preemphasis is not None:
+            # The right-hand side is evaluated into a temporary first, so every sample is filtered with the
+            # *unfiltered* previous sample.
+            buffer[1:frame_length] -= preemphasis * buffer[: frame_length - 1]
+            buffer[0] *= 1 - preemphasis
+        buffer[:frame_length] *= window
+        stft[frame_idx] = fft_func(buffer)
+    # note: ** is much faster than np.power
+    if power is not None:
+        stft = np.abs(stft, dtype=np.float64) ** power
+    # (num_frames, num_frequency_bins) -> (num_frequency_bins, num_frames)
+    stft = stft.T
+    if mel_filters is not None:
+        stft = np.maximum(mel_floor, np.dot(mel_filters.T, stft))
+    if apply_log:
+        if log_mel == "log":
+            stft = np.log(stft)
+        elif log_mel == "log10":
+            stft = np.log10(stft)
+        elif power == 1.0:
+            stft = amplitude_to_db(stft, reference, min_value, db_range)
+        else:
+            # `power == 2.0` is the only remaining possibility after the validation above.
+            stft = power_to_db(stft, reference, min_value, db_range)
+        stft = np.asarray(stft, dtype)
+    return stft
+def spectrogram_batch(
+    waveform_list: list[np.ndarray],
+    window: np.ndarray,
+    frame_length: int,
+    hop_length: int,
+    fft_length: int | None = None,
+    power: float | None = 1.0,
+    center: bool = True,
+    pad_mode: str = "reflect",
+    onesided: bool = True,
+    dither: float = 0.0,
+    preemphasis: float | None = None,
+    mel_filters: np.ndarray | None = None,
+    mel_floor: float = 1e-10,
+    log_mel: str | None = None,
+    reference: float = 1.0,
+    min_value: float = 1e-10,
+    db_range: float | None = None,
+    remove_dc_offset: bool = False,
+    dtype: np.dtype = np.float32,
+) -> list[np.ndarray]:
+    """
+    Calculates spectrograms for a list of waveforms using the Short-Time Fourier Transform, optimized for batch processing.
+    This function extends the capabilities of the `spectrogram` function to handle multiple waveforms efficiently by leveraging broadcasting.
+    It supports generating various types of spectrograms:
+        - amplitude spectrogram (`power = 1.0`)
+        - power spectrogram (`power = 2.0`)
+        - complex-valued spectrogram (`power = None`)
+        - log spectrogram (use `log_mel` argument)
+        - mel spectrogram (provide `mel_filters`)
+        - log-mel spectrogram (provide `mel_filters` and `log_mel`)
+    How this works:
+        1. Every waveform is (optionally) center-padded, then zero-padded at the end to the length of the longest
+           waveform so that the whole batch fits into one 2-D array.
+        2. The batch is split into frames of size `frame_length` that are partially overlapping by `frame_length
+            - hop_length` samples.
+        3. Each frame is multiplied by the window and placed into a buffer of size `fft_length`.
+        4. The DFT is taken of each windowed frame, for all waveforms at once.
+        5. The frames that only exist because of the batch padding are dropped again for each waveform.
+    We make a distinction between the following "blocks" of sample data, each of which may have a different length:
+      - The analysis frame. This is the size of the time slices that the input waveform is split into.
+      - The window. Each analysis frame is multiplied by the window to avoid spectral leakage.
+      - The FFT input buffer. The length of this determines how many frequency bins are in the spectrogram.
+    In this implementation, the window is assumed to be zero-padded to have the same size as the analysis frame. A
+    padded window can be obtained from `window_function()`. The FFT input buffer may be larger than the analysis frame,
+    typically the next power of two.
+    Note: This function is designed for efficient batch processing of multiple waveforms but retains compatibility with individual waveform processing methods like `librosa.stft`.
+    Args:
+        waveform_list (`list[np.ndarray]` with arrays of shape `(length,)`):
+            The list of input waveforms, each a single-channel (mono) signal. An empty list yields an empty list.
+        window (`np.ndarray` of shape `(frame_length,)`):
+            The windowing function to apply, including zero-padding if necessary.
+        frame_length (`int`):
+            The length of each frame for analysis.
+        hop_length (`int`):
+            The step size between successive frames.
+        fft_length (`int`, *optional*):
+            The size of the FFT buffer, defining frequency bin resolution. If `None`, uses `frame_length`.
+        power (`float`, *optional*, defaults to 1.0):
+            Determines the type of spectrogram: 1.0 for amplitude, 2.0 for power, None for complex.
+        center (`bool`, *optional*, defaults to `True`):
+            Whether to center-pad the waveform frames.
+        pad_mode (`str`, *optional*, defaults to `"reflect"`):
+            The padding strategy when `center` is `True`.
+        onesided (`bool`, *optional*, defaults to `True`):
+            If True, returns a one-sided spectrogram for real input signals.
+        dither (`float`, *optional*, defaults to 0.0):
+            Adds dithering. In other words, adds a small Gaussian noise to each frame.
+            E.g. use 4.0 to add dithering with a normal distribution centered
+            around 0.0 with standard deviation 4.0, 0.0 means no dithering.
+        preemphasis (`float`, *optional*):
+            Applies a pre-emphasis filter to each frame.
+        mel_filters (`np.ndarray`, *optional*):
+            Mel filter bank for converting to mel spectrogram.
+        mel_floor (`float`, *optional*, defaults to 1e-10):
+            Floor value for mel spectrogram to avoid log(0).
+        log_mel (`str`, *optional*):
+            Specifies log scaling strategy; options are None, "log", "log10", "dB". Ignored when `power` is `None`.
+        reference (`float`, *optional*, defaults to 1.0):
+            Reference value for dB conversion in log_mel.
+        min_value (`float`, *optional*, defaults to 1e-10):
+            Minimum floor value for log scale conversions.
+        db_range (`float`, *optional*):
+            Dynamic range for dB scale spectrograms.
+        remove_dc_offset (`bool`, *optional*):
+            Whether to remove the DC offset from each frame.
+        dtype (`np.dtype`, *optional*, defaults to `np.float32`):
+            Data type the spectrograms are cast to after the log conversion. The cast is only performed when both
+            `power` and `log_mel` are set. It never affects the precision of the input waveforms.
+    Returns:
+        list[`np.ndarray`]: A list of spectrogram arrays, one for each input waveform.
+    Raises:
+        ValueError: If the frame, window, FFT or hop lengths are inconsistent, if any waveform is not a real-valued
+            1-D array, or if `log_mel` is unknown or `"dB"` is requested with a `power` other than 1.0 or 2.0.
+    """
+    window_length = len(window)
+    if fft_length is None:
+        fft_length = frame_length
+    if frame_length > fft_length:
+        raise ValueError(f"frame_length ({frame_length}) may not be larger than fft_length ({fft_length})")
+    if window_length != frame_length:
+        raise ValueError(f"Length of the window ({window_length}) must equal frame_length ({frame_length})")
+    if hop_length <= 0:
+        raise ValueError("hop_length must be greater than zero")
+    # Check the dimensions of the waveform , and if waveform is complex
+    for waveform in waveform_list:
+        if waveform.ndim != 1:
+            raise ValueError(f"Input waveform must have only one dimension, shape is {waveform.shape}")
+        if np.iscomplexobj(waveform):
+            raise ValueError("Complex-valued input waveforms are not currently supported")
+    # Validate the log conversion request before the STFT loop so that a bad `log_mel` fails immediately.
+    apply_log = power is not None and log_mel is not None
+    if apply_log:
+        if log_mel not in ("log", "log10", "dB"):
+            raise ValueError(f"Unknown log_mel option: {log_mel}")
+        if log_mel == "dB" and power not in (1.0, 2.0):
+            raise ValueError(f"Cannot use log_mel option '{log_mel}' with power {power}")
+    # Nothing to transform: avoid calling `max()` on an empty sequence below.
+    if len(waveform_list) == 0:
+        return []
+    # Center pad the waveform
+    if center:
+        half_frame = int(frame_length // 2)
+        waveform_list = [np.pad(waveform, [(half_frame, half_frame)], mode=pad_mode) for waveform in waveform_list]
+    # these lengths will be used to remove padding later
+    original_waveform_lengths = [len(waveform) for waveform in waveform_list]
+    # Batch pad the waveform. The batch is built directly in float64 (np.fft uses float64 internally); casting it to
+    # the *output* `dtype` first would silently truncate float64 inputs and make the result differ from `spectrogram`.
+    max_length = max(original_waveform_lengths)
+    padded_waveform_batch = np.array(
+        [
+            np.pad(waveform, (0, max_length - len(waveform)), mode="constant", constant_values=0)
+            for waveform in waveform_list
+        ],
+        dtype=np.float64,
+    )
+    window = window.astype(np.float64)
+    # Split waveform into frames of frame_length size
+    num_frames = int(1 + np.floor((padded_waveform_batch.shape[1] - frame_length) / hop_length))
+    # Number of frames each waveform would have produced on its own; used to remove padding later
+    true_num_frames = [int(1 + np.floor((length - frame_length) / hop_length)) for length in original_waveform_lengths]
+    num_batches = padded_waveform_batch.shape[0]
+    num_frequency_bins = (fft_length // 2) + 1 if onesided else fft_length
+    stft = np.empty((num_batches, num_frames, num_frequency_bins), dtype=np.complex64)
+    # rfft is faster than fft
+    fft_func = np.fft.rfft if onesided else np.fft.fft
+    # The tail of the buffer beyond `frame_length` is never written, so it stays zero (FFT zero-padding).
+    buffer = np.zeros((num_batches, fft_length))
+    for frame_idx in range(num_frames):
+        timestep = frame_idx * hop_length
+        buffer[:, :frame_length] = padded_waveform_batch[:, timestep : timestep + frame_length]
+        if dither != 0.0:
+            buffer[:, :frame_length] += dither * np.random.randn(*buffer[:, :frame_length].shape)
+        if remove_dc_offset:
+            buffer[:, :frame_length] -= buffer[:, :frame_length].mean(axis=1, keepdims=True)
+        if preemphasis is not None:
+            buffer[:, 1:frame_length] -= preemphasis * buffer[:, : frame_length - 1]
+            buffer[:, 0] *= 1 - preemphasis
+        buffer[:, :frame_length] *= window
+        stft[:, frame_idx] = fft_func(buffer)
+    # Note: ** is much faster than np.power
+    if power is not None:
+        stft = np.abs(stft, dtype=np.float64) ** power
+    # Apply mel filters if provided: (batch, frames, bins) x (bins, mels) -> (batch, frames, mels)
+    if mel_filters is not None:
+        stft = np.maximum(mel_floor, np.tensordot(stft, mel_filters.T, axes=([2], [1])))
+    # Convert to log scale if specified
+    if apply_log:
+        if log_mel == "log":
+            stft = np.log(stft)
+        elif log_mel == "log10":
+            stft = np.log10(stft)
+        elif power == 1.0:
+            stft = amplitude_to_db_batch(stft, reference, min_value, db_range)
+        else:
+            # `power == 2.0` is the only remaining possibility after the validation above.
+            stft = power_to_db_batch(stft, reference, min_value, db_range)
+        stft = np.asarray(stft, dtype)
+    # Drop the frames that stem from batch padding and return each item as (bins, frames)
+    return [item[:frame_count, :].T for item, frame_count in zip(stft, true_num_frames)]
+def power_to_db(
+    spectrogram: np.ndarray,
+    reference: float = 1.0,
+    min_value: float = 1e-10,
+    db_range: float | None = None,
+) -> np.ndarray:
+    """
+    Converts a power spectrogram to the decibel scale. This computes `10 * log10(spectrogram / reference)`, written
+    as a difference of logarithms for numerical stability.
+    Humans do not hear loudness on a linear scale, so applying the logarithm compresses the (mel) spectrogram into
+    features that match more closely what is actually perceived.
+    Based on the implementation of `librosa.power_to_db`.
+    Args:
+        spectrogram (`np.ndarray`):
+            The input power (mel) spectrogram. Note that a power spectrogram has the amplitudes squared!
+        reference (`float`, *optional*, defaults to 1.0):
+            Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set
+            the loudest part to 0 dB. Must be greater than zero. Values below `min_value` are raised to `min_value`.
+        min_value (`float`, *optional*, defaults to `1e-10`):
+            The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking
+            `log(0)`. The default of `1e-10` corresponds to a minimum of -100 dB. Must be greater than zero.
+        db_range (`float`, *optional*):
+            Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the
+            peak value and the smallest value will never be more than 80 dB. Must be greater than zero.
+    Returns:
+        `np.ndarray`: the spectrogram in decibels
+    Raises:
+        ValueError: If `reference`, `min_value` or (when given) `db_range` is not greater than zero.
+    """
+    if reference <= 0.0:
+        raise ValueError("reference must be greater than zero")
+    if min_value <= 0.0:
+        raise ValueError("min_value must be greater than zero")
+    # The reference is floored the same way as the data so that the quotient stays well defined.
+    log_reference = np.log10(max(min_value, reference))
+    floored = np.maximum(spectrogram, min_value)
+    decibels = 10.0 * (np.log10(floored) - log_reference)
+    if db_range is None:
+        return decibels
+    if db_range <= 0.0:
+        raise ValueError("db_range must be greater than zero")
+    # Limit the dynamic range relative to the global peak of the converted spectrogram.
+    lower_bound = decibels.max() - db_range
+    return np.maximum(decibels, lower_bound)
+def power_to_db_batch(
+    spectrogram: np.ndarray,
+    reference: float = 1.0,
+    min_value: float = 1e-10,
+    db_range: float | None = None,
+) -> np.ndarray:
+    """
+    Converts a batch of power spectrograms to the decibel scale. This computes `10 * log10(spectrogram / reference)`,
+    using basic logarithm properties for numerical stability.
+    This function supports batch processing, where each item in the batch is an individual power (mel) spectrogram.
+    Args:
+        spectrogram (`np.ndarray`):
+            The input batch of power (mel) spectrograms. Expected shape is (batch_size, *spectrogram_shape); the
+            first axis is the batch axis and all remaining axes belong to the individual spectrogram.
+            Note that a power spectrogram has the amplitudes squared!
+        reference (`float`, *optional*, defaults to 1.0):
+            Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set
+            the loudest part to 0 dB. Must be greater than zero.
+        min_value (`float`, *optional*, defaults to `1e-10`):
+            The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking
+            `log(0)`. The default of `1e-10` corresponds to a minimum of -100 dB. Must be greater than zero.
+        db_range (`float`, *optional*):
+            Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the
+            peak value and the smallest value will never be more than 80 dB. The peak is determined separately for
+            every item of the batch. Must be greater than zero.
+    Returns:
+        `np.ndarray`: the batch of spectrograms in decibels
+    Raises:
+        ValueError: If `reference`, `min_value` or (when given) `db_range` is not greater than zero.
+    """
+    if reference <= 0.0:
+        raise ValueError("reference must be greater than zero")
+    if min_value <= 0.0:
+        raise ValueError("min_value must be greater than zero")
+    reference = max(min_value, reference)
+    spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None)
+    spectrogram = 10.0 * (np.log10(spectrogram) - np.log10(reference))
+    if db_range is not None:
+        if db_range <= 0.0:
+            raise ValueError("db_range must be greater than zero")
+        # Apply db_range clipping per batch item: reduce over every non-batch axis so that any
+        # (batch_size, *spectrogram_shape) input works, not only 3-D batches.
+        item_axes = tuple(range(1, spectrogram.ndim))
+        max_values = spectrogram.max(axis=item_axes, keepdims=True)
+        spectrogram = np.clip(spectrogram, a_min=max_values - db_range, a_max=None)
+    return spectrogram
+def amplitude_to_db(
+    spectrogram: np.ndarray,
+    reference: float = 1.0,
+    min_value: float = 1e-5,
+    db_range: float | None = None,
+) -> np.ndarray:
+    """
+    Converts an amplitude spectrogram to the decibel scale. This computes `20 * log10(spectrogram / reference)`,
+    written as a difference of logarithms for numerical stability.
+    Humans do not hear loudness on a linear scale, so applying the logarithm compresses the (mel) spectrogram into
+    features that match more closely what is actually perceived.
+    Args:
+        spectrogram (`np.ndarray`):
+            The input amplitude (mel) spectrogram.
+        reference (`float`, *optional*, defaults to 1.0):
+            Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set
+            the loudest part to 0 dB. Must be greater than zero. Values below `min_value` are raised to `min_value`.
+        min_value (`float`, *optional*, defaults to `1e-5`):
+            The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking
+            `log(0)`. The default of `1e-5` corresponds to a minimum of -100 dB. Must be greater than zero.
+        db_range (`float`, *optional*):
+            Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the
+            peak value and the smallest value will never be more than 80 dB. Must be greater than zero.
+    Returns:
+        `np.ndarray`: the spectrogram in decibels
+    Raises:
+        ValueError: If `reference`, `min_value` or (when given) `db_range` is not greater than zero.
+    """
+    if reference <= 0.0:
+        raise ValueError("reference must be greater than zero")
+    if min_value <= 0.0:
+        raise ValueError("min_value must be greater than zero")
+    # The reference is floored the same way as the data so that the quotient stays well defined.
+    log_reference = np.log10(max(min_value, reference))
+    floored = np.maximum(spectrogram, min_value)
+    # Amplitudes use a factor of 20 (instead of 10) because power is proportional to amplitude squared.
+    decibels = 20.0 * (np.log10(floored) - log_reference)
+    if db_range is None:
+        return decibels
+    if db_range <= 0.0:
+        raise ValueError("db_range must be greater than zero")
+    # Limit the dynamic range relative to the global peak of the converted spectrogram.
+    lower_bound = decibels.max() - db_range
+    return np.maximum(decibels, lower_bound)
+def amplitude_to_db_batch(
+    spectrogram: np.ndarray,
+    reference: float = 1.0,
+    min_value: float = 1e-5,
+    db_range: float | None = None,
+) -> np.ndarray:
+    """
+    Batched counterpart of the amplitude-to-decibel conversion: computes `20 * log10(spectrogram / reference)` for
+    a stack of amplitude (mel) spectrograms, evaluating the ratio as a difference of logarithms.
+    Args:
+        spectrogram (`np.ndarray`):
+            The batch of amplitude (mel) spectrograms, with the batch on the first axis. The `db_range` clipping
+            reduces over axes 1 and 2, so a 3-dimensional input is expected when `db_range` is used.
+        reference (`float`, *optional*, defaults to 1.0):
+            The amplitude that is mapped to 0 dB. Must be greater than zero; a value smaller than `min_value` is
+            raised to `min_value`.
+        min_value (`float`, *optional*, defaults to `1e-5`):
+            Lower bound applied to the spectrograms before the logarithm, so that `log(0)` is never taken. Must be
+            greater than zero. With the default reference, `1e-5` maps to -100 dB.
+        db_range (`float`, *optional*):
+            When given, each batch item is clipped from below at its own peak minus `db_range`. Must be greater
+            than zero.
+    Returns:
+        `np.ndarray`: the batch of spectrograms in decibels
+    Raises:
+        `ValueError`: if `reference`, `min_value` or (when provided) `db_range` is not greater than zero.
+    """
+    if reference <= 0.0:
+        raise ValueError("reference must be greater than zero")
+    if min_value <= 0.0:
+        raise ValueError("min_value must be greater than zero")
+    floored_reference = max(min_value, reference)
+    floored_amplitudes = np.clip(spectrogram, a_min=min_value, a_max=None)
+    decibels = 20.0 * (np.log10(floored_amplitudes) - np.log10(floored_reference))
+    if db_range is None:
+        return decibels
+    if db_range <= 0.0:
+        raise ValueError("db_range must be greater than zero")
+    # One peak per batch item; `keepdims` lets the per-item lower bound broadcast against the batch
+    per_item_peak = decibels.max(axis=(1, 2), keepdims=True)
+    return np.clip(decibels, a_min=per_item_peak - db_range, a_max=None)

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/cli/__init__.py ADDED Viewed

	@@ -0,0 +1,13 @@

+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/cli/add_new_model_like.py ADDED Viewed

	@@ -0,0 +1,790 @@

+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import difflib
+import os
+import re
+import subprocess
+import textwrap
+from collections.abc import Callable
+from datetime import date
+from pathlib import Path
+from typing import Annotated, Any, cast
+import typer
+from ..utils import is_libcst_available
+# We protect this import to avoid requiring it for all `transformers` CLI commands - however it is actually
+# strictly required for this one (we need it both for modular and for the following Visitor)
+if is_libcst_available():
+    import libcst as cst
+    from libcst import CSTVisitor
+    from libcst import matchers as m
+    class ClassFinder(CSTVisitor):
+        """
+        Visitor collecting the names of the classes defined in a python module, together with the raw entries of
+        its `__all__` assignment.
+        """
+        def __init__(self):
+            # Names of every visited class definition, in visiting order
+            self.classes: list = []
+            # Source text of each `__all__` element (the string literals are stored as written in the module)
+            self.public_classes: list = []
+            # True between entering and leaving a class definition
+            self.is_in_class = False
+        def visit_ClassDef(self, node: cst.ClassDef) -> None:
+            """Store the class name. Classes are assumed to be defined at top-level only (never inside a function or similar)."""
+            self.is_in_class = True
+            self.classes.append(node.name.value)
+        def leave_ClassDef(self, node: cst.ClassDef):
+            self.is_in_class = False
+        def visit_SimpleStatementLine(self, node: cst.SimpleStatementLine):
+            """Store the elements of the `__all__` assignment when the statement is one, outside of any class."""
+            if self.is_in_class:
+                return
+            # Shape of a statement line holding a single assignment to one plain name
+            single_name_assignment = m.SimpleStatementLine(
+                body=[m.Assign(targets=[m.AssignTarget(target=m.Name())])]
+            )
+            if not m.matches(node, single_name_assignment):
+                return
+            assignment = cast(cst.Assign, node.body[0])
+            target_name = cast(cst.Name, assignment.targets[0].target).value
+            if target_name != "__all__":
+                return
+            exported = []
+            for element in cast(cst.Tuple, assignment.value).elements:
+                exported.append(cast(cst.SimpleString, element.value).value)
+            self.public_classes = exported
+# Year inserted in the copyright header below (evaluated once, when this module is imported)
+CURRENT_YEAR = date.today().year
+# `parents[3]` of this file's path; used as the repository root when `add_new_model_like` receives no `repo_path`
+REPO_PATH = Path(__file__).parents[3]
+# License header prepended to the generated modular, `__init__` and test files, and converted to a markdown comment
+# for the doc file. The `.lstrip()` removes the newline that follows the opening triple quotes.
+COPYRIGHT = f"""
+# coding=utf-8
+# Copyright {CURRENT_YEAR} the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""".lstrip()
+### Entrypoint
+def add_new_model_like(
+    repo_path: Annotated[
+        str | None, typer.Argument(help="When not using an editable install, the path to the Transformers repo.")
+    ] = None,
+):
+    """
+    CLI entry point: interactively collect the user's choices, then create a new model based on an existing one.
+    Args:
+        repo_path (`str`, *optional*):
+            Path to the Transformers repository. When omitted, `REPO_PATH` (derived from this file's location) is
+            used.
+    """
+    # The questions are asked first; the repository path is only resolved afterwards
+    old_model_infos, new_lowercase_name, new_model_paper_name, filenames_to_add = get_user_input()
+    target_repo = REPO_PATH if repo_path is None else Path(repo_path)
+    _add_new_model_like_internal(
+        repo_path=target_repo,
+        old_model_infos=old_model_infos,
+        new_lowercase_name=new_lowercase_name,
+        new_model_paper_name=new_model_paper_name,
+        filenames_to_add=filenames_to_add,
+    )
+### Core logic
+class ModelInfos:
+    """
+    Retrieve the basic information about the classes of an existing model, looked up in the auto mappings.
+    Args:
+        lowercase_name (`str`):
+            The model type. It is lowercased and spaces/dashes are replaced by underscores before the lookup; if
+            that spelling is not a key of `CONFIG_MAPPING_NAMES`, the dashed spelling is tried as well.
+    Raises:
+        `ValueError`: if neither spelling is a key of `CONFIG_MAPPING_NAMES`.
+    """
+    def __init__(self, lowercase_name: str):
+        # Imported lazily, inside the constructor, rather than at module import time
+        from ..models.auto.configuration_auto import CONFIG_MAPPING_NAMES
+        from ..models.auto.feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING_NAMES
+        from ..models.auto.image_processing_auto import IMAGE_PROCESSOR_MAPPING_NAMES
+        from ..models.auto.processing_auto import PROCESSOR_MAPPING_NAMES
+        from ..models.auto.tokenization_auto import TOKENIZER_MAPPING_NAMES
+        from ..models.auto.video_processing_auto import VIDEO_PROCESSOR_MAPPING_NAMES
+        # Just to make sure it's indeed lowercase
+        self.lowercase_name = lowercase_name.lower().replace(" ", "_").replace("-", "_")
+        if self.lowercase_name not in CONFIG_MAPPING_NAMES:
+            # `str.replace` returns a new string: the result must be assigned back, otherwise the dashed spelling
+            # is never actually tried and every dashed model type is rejected below.
+            # NOTE(review): other functions of this file build folder and file names from `lowercase_name`;
+            # confirm that they cope with a dashed value.
+            self.lowercase_name = self.lowercase_name.replace("_", "-")
+        if self.lowercase_name not in CONFIG_MAPPING_NAMES:
+            raise ValueError(f"{lowercase_name} is not a valid model name")
+        self.config_class = CONFIG_MAPPING_NAMES[self.lowercase_name]
+        # Config class name without its "Config" part
+        self.camelcase_name = self.config_class.replace("Config", "")
+        # Get tokenizer class: `tokenizer_class` is always None here, and the generic "PreTrainedTokenizerFast"
+        # entry is treated as "no model-specific tokenizer"
+        self.tokenizer_class = None
+        if self.lowercase_name in TOKENIZER_MAPPING_NAMES:
+            self.fast_tokenizer_class = TOKENIZER_MAPPING_NAMES[self.lowercase_name]
+            if self.fast_tokenizer_class == "PreTrainedTokenizerFast":
+                self.fast_tokenizer_class = None
+        else:
+            self.fast_tokenizer_class = None
+        # The remaining mappings are optional: None when the model has no entry
+        self.image_processor_classes = IMAGE_PROCESSOR_MAPPING_NAMES.get(self.lowercase_name, None)
+        self.video_processor_class = VIDEO_PROCESSOR_MAPPING_NAMES.get(self.lowercase_name, None)
+        self.feature_extractor_class = FEATURE_EXTRACTOR_MAPPING_NAMES.get(self.lowercase_name, None)
+        self.processor_class = PROCESSOR_MAPPING_NAMES.get(self.lowercase_name, None)
+def add_content_to_file(file_name: str | os.PathLike, new_content: str, add_after: str):
+    """
+    A utility to add some content inside a given file. The file is read and rewritten as UTF-8.
+    Args:
+        file_name (`str` or `os.PathLike`):
+            The name of the file in which we want to insert some content.
+        new_content (`str`):
+            The content to add.
+        add_after (`str`):
+            The new content is added just after the first instance matching it.
+    Raises:
+        `ValueError`: if `add_after` is empty or does not appear in the file. The file is left untouched in that
+        case.
+    """
+    with open(file_name, "r", encoding="utf-8") as f:
+        old_content = f.read()
+    # `partition` splits around the first occurrence only and reports a missing anchor through an empty
+    # separator, which allows a readable error instead of a tuple-unpacking failure
+    before, anchor, after = old_content.partition(add_after)
+    if not anchor:
+        raise ValueError(f"Could not find {add_after!r} in {file_name}, so the new content cannot be inserted")
+    with open(file_name, "w", encoding="utf-8") as f:
+        f.write(before + anchor + new_content + after)
+def add_model_to_auto_mappings(
+    repo_path: Path,
+    old_model_infos: ModelInfos,
+    new_lowercase_name: str,
+    new_model_paper_name: str,
+    filenames_to_add: list[tuple[str, bool]],
+):
+    """
+    Add a model to all the relevant mappings in the auto module. The files under
+    `src/transformers/models/auto` are edited in place, by duplicating the entries of the old model with the new
+    names.
+    Args:
+        repo_path (`Path`):
+            The path to the root of the Transformers repository.
+        old_model_infos (`ModelInfos`):
+            The structure containing the class information of the old model.
+        new_lowercase_name (`str`):
+            The new lowercase model name.
+        new_model_paper_name (`str`):
+            The fully cased name (as in the official paper name) of the new model. It is not used by this function.
+        filenames_to_add (`list[tuple[str, bool]]`):
+            A list of tuples of all potential filenames to add for a new model, along a boolean flag describing if we
+            should add this file or not. For example, [(`modeling_xxx.px`, True), (`configuration_xxx.py`, True), (`tokenization_xxx.py`, False),...]
+            The first entry of the list is skipped.
+    """
+    # Cased name of the new model: each "_"/"-" separated part is title-cased and the parts are concatenated
+    new_cased_name = "".join(x.title() for x in new_lowercase_name.replace("-", "_").split("_"))
+    old_lowercase_name = old_model_infos.lowercase_name
+    old_cased_name = old_model_infos.camelcase_name
+    # Turn the per-model filenames into the corresponding auto filenames (e.g. `modeling_xxx.py` -> `modeling_auto.py`)
+    filenames_to_add = [
+        (filename.replace(old_lowercase_name, "auto"), to_add) for filename, to_add in filenames_to_add[1:]
+    ]
+    # fast tokenizer has the same auto mappings as normal ones
+    corrected_filenames_to_add = []
+    has_image_processor = has_video_processor = False
+    for file, to_add in filenames_to_add:
+        # NOTE(review): the two flags below are set whatever the value of `to_add` - confirm this is intended
+        if "image_processing" in file:
+            has_image_processor = True
+        elif "video_processing" in file:
+            has_video_processor = True
+        # NOTE(review): `|` has the lowest precedence, so this pattern reads as "tokenization" OR
+        # "image_processing_auto_fast.py": any filename containing "tokenization" takes this branch, not only the
+        # `_fast` one. Presumably `(?:tokenization|image_processing)_auto_fast\.py` was intended - TODO confirm.
+        # Also note that `[-1]` raises an IndexError if no entry was appended yet.
+        elif re.search(r"(?:tokenization)|(?:image_processing)_auto_fast.py", file):
+            previous_file, previous_to_add = corrected_filenames_to_add[-1]
+            corrected_filenames_to_add[-1] = (previous_file, previous_to_add or to_add)
+        else:
+            corrected_filenames_to_add.append((file, to_add))
+    # Add the config and image/video processor mappings directly as the handling is a bit different
+    add_content_to_file(
+        repo_path / "src" / "transformers" / "models" / "auto" / "auto_mappings.py",
+        new_content=f'("{new_lowercase_name}", "{new_cased_name}Config"),\n        ',
+        add_after="CONFIG_MAPPING_NAMES = OrderedDict(\n    [\n        ",
+    )
+    # Snapshot of the file taken after the config entry was inserted; it is only used to look up the old entries
+    autofile = (repo_path / "src" / "transformers" / "models" / "auto" / "auto_mappings.py").read_text()
+    if has_image_processor:
+        # Single-line entry of the form `("old_name", {...}),` - nothing is added when the old model has none
+        matching_lines = re.findall(rf'^\s+\("{old_lowercase_name}",\s+{{[^}}]+}}\),?$', autofile, re.MULTILINE)
+        if matching_lines:
+            match = matching_lines[0]
+            add_content_to_file(
+                repo_path / "src" / "transformers" / "models" / "auto" / "auto_mappings.py",
+                new_content=match.replace(old_lowercase_name, new_lowercase_name).replace(
+                    old_cased_name, new_cased_name
+                )
+                + "\n",
+                add_after="IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(\n    [\n",
+            )
+    if has_video_processor:
+        # Extract the VIDEO_PROCESSOR_MAPPING_NAMES block first
+        block_match = re.search(
+            r"VIDEO_PROCESSOR_MAPPING_NAMES\s*=\s*OrderedDict\(\s*\[(.*?)\]\s*\)", autofile, re.DOTALL
+        )
+        # NOTE(review): `block_match` is None when the block is not found, in which case the next line raises an
+        # AttributeError
+        block = block_match.group(1)  # type: ignore
+        matching_lines = re.findall(rf'^\s+\("{old_lowercase_name}",\s+"[^"]+"\),?$', block, re.MULTILINE)
+        if matching_lines:
+            match = matching_lines[0]
+            add_content_to_file(
+                repo_path / "src" / "transformers" / "models" / "auto" / "auto_mappings.py",
+                new_content=match.replace(old_lowercase_name, new_lowercase_name).replace(
+                    old_cased_name, new_cased_name
+                )
+                + "\n",
+                add_after="VIDEO_PROCESSOR_MAPPING_NAMES = OrderedDict(\n    [\n",
+            )
+    for filename, to_add in corrected_filenames_to_add:
+        if to_add:
+            # The auto mapping
+            filename = filename.replace("_fast.py", ".py")
+            file = (repo_path / "src" / "transformers" / "models" / "auto" / filename).read_text()
+            # The regex has to be a bit complex like this as the tokenizer mapping has new lines everywhere
+            matching_lines = re.findall(
+                rf'( {{8,12}}\(\s*"{old_lowercase_name}",.*?\),\n)(?: {{4,12}}\(|\])', file, re.DOTALL
+            )
+            # Each old entry found is duplicated right after itself, with the old names replaced by the new ones
+            for match in matching_lines:
+                add_content_to_file(
+                    repo_path / "src" / "transformers" / "models" / "auto" / filename,
+                    new_content=match.replace(old_lowercase_name, new_lowercase_name).replace(
+                        old_cased_name, new_cased_name
+                    ),
+                    add_after=match,
+                )
+def create_doc_file(new_paper_name: str, public_classes: list[str]):
+    """
+    Build the content of the documentation page of the new model: a license header, a template to fill, and one
+    autodoc section per public class.
+    Args:
+        new_paper_name (`str`):
+            The fully cased name (as in the official paper name) of the new model.
+        public_classes (`list[str]`):
+            A list of all the public classes that the model will have in the library.
+    Returns:
+        `str`: the content of the doc file.
+    """
+    added_note = (
+        "\n\n⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that "
+        "may not be rendered properly in your Markdown viewer.\n\n-->\n\n"
+    )
+    # Turn the python license header into a markdown comment: drop the leading "#" of each line, open the comment
+    # where the coding line was, and let the note above close it
+    uncommented_license = re.sub(r"# ?", "", COPYRIGHT)
+    copyright_for_markdown = uncommented_license.replace("coding=utf-8\n", "<!--") + added_note
+    doc_template = textwrap.dedent(
+        f"""
+        # {new_paper_name}
+        ## Overview
+        The {new_paper_name} model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
+        <INSERT SHORT SUMMARY HERE>
+        The abstract from the paper is the following:
+        <INSERT PAPER ABSTRACT HERE>
+        Tips:
+        <INSERT TIPS ABOUT MODEL HERE>
+        This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/<INSERT YOUR HF USERNAME HERE>).
+        The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
+        ## Usage examples
+        <INSERT SOME NICE EXAMPLES HERE>
+        """
+    )
+    # One autodoc section per public class; classes with "Model" in their name also document `forward`
+    class_doc = "\n\n".join(
+        f"## {class_}\n\n[[autodoc]] {class_}" + ("\n    - forward" if "Model" in class_ else "")
+        for class_ in public_classes
+    )
+    return copyright_for_markdown + doc_template + class_doc
+def insert_model_in_doc_toc(
+    repo_path: Path, old_lowercase_name: str, new_lowercase_name: str, new_model_paper_name: str
+):
+    """
+    Insert the new model in the doc `_toctree.yml`, right after the entry of the old model.
+    Args:
+        repo_path (`Path`):
+            The path to the root of the Transformers repository.
+        old_lowercase_name (`str`):
+            The old lowercase model name.
+        new_lowercase_name (`str`):
+            The new lowercase model name.
+        new_model_paper_name (`str`):
+            The fully cased name (as in the official paper name) of the new model.
+    Raises:
+        `ValueError`: if the toc has no entry for the old model.
+    """
+    toc_file = repo_path / "docs" / "source" / "en" / "_toctree.yml"
+    # Read as UTF-8 explicitly: `add_content_to_file` reads and rewrites this same file as UTF-8, and both reads
+    # have to agree for the matched entry to be found again
+    with open(toc_file, "r", encoding="utf-8") as f:
+        content = f.read()
+    # The name is escaped so that it is always matched literally
+    toc_match = re.search(rf"- local: model_doc/{re.escape(old_lowercase_name)}\n {{8}}title: .*?\n", content)
+    if toc_match is None:
+        raise ValueError(f"Could not find TOC entry for {old_lowercase_name}")
+    old_model_toc = toc_match.group(0)
+    new_toc = f"      - local: model_doc/{new_lowercase_name}\n        title: {new_model_paper_name}\n"
+    add_content_to_file(toc_file, new_content=new_toc, add_after=old_model_toc)
+def create_init_file(old_lowercase_name: str, new_lowercase_name: str, filenames_to_add: list[tuple[str, bool]]):
+    """
+    Build the content of the `__init__.py` file of the new model folder.
+    Args:
+        old_lowercase_name (`str`):
+            The old lowercase model name.
+        new_lowercase_name (`str`):
+            The new lowercase model name.
+        filenames_to_add (`list[tuple[str, bool]]`):
+            A list of tuples of all potential filenames to add for a new model, along a boolean flag describing if we
+            should add this file or not. For example, [(`modeling_xxx.px`, True), (`configuration_xxx.py`, True), (`tokenization_xxx.py`, False),...]
+    Returns:
+        `str`: the content of the init file (license header included).
+    """
+    # Module names of the files that are actually added: old name swapped for the new one, ".py" removed
+    selected_modules = [
+        filename.replace(old_lowercase_name, new_lowercase_name).replace(".py", "")
+        for filename, to_add in filenames_to_add
+        if to_add
+    ]
+    # The separator carries the indentation that the import lines need inside the template below
+    imports = "\n            ".join(f"from .{module} import *" for module in selected_modules)
+    init_body = textwrap.dedent(
+        f"""
+        from typing import TYPE_CHECKING
+        from ...utils import _LazyModule
+        from ...utils.import_utils import define_import_structure
+        if TYPE_CHECKING:
+            {imports}
+        else:
+            import sys
+            _file = globals()["__file__"]
+            sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
+        """
+    )
+    return COPYRIGHT + init_body
+def find_all_classes_from_file(module_name: str | os.PathLike) -> tuple[list, list]:
+    """
+    Find the name of all classes defined in `module_name`, as well as the public ones (defined in `__all__`).
+    Args:
+        module_name (`str` or `os.PathLike`):
+            The full path to the python module from which to extract classes.
+    Returns:
+        `tuple[list, list]`: the names of all the classes found in the module, and the entries of its `__all__`
+        assignment as collected by `ClassFinder`.
+    """
+    with open(module_name, "r", encoding="utf-8") as file:
+        source_code = file.read()
+    module = cst.parse_module(source_code)
+    visitor = ClassFinder()
+    module.visit(visitor)
+    return visitor.classes, visitor.public_classes
+def find_modular_structure(
+    module_name: Path, old_model_infos: ModelInfos, new_cased_name: str
+) -> tuple[str, str, list]:
+    """
+    Build the pieces of a modular file that re-creates every class of `module_name` under the new model name.
+    Args:
+        module_name (`Path`):
+            The full path to the python module to copy with modular.
+        old_model_infos (`ModelInfos`):
+            The structure containing the class information of the old model.
+        new_cased_name (`str`):
+            The new cased model name.
+    Returns:
+        `tuple[str, str, list]`: the import statement bringing in all the old classes, the source of the new
+        classes (each one subclassing its old counterpart with an empty body), and the renamed public classes.
+    """
+    all_classes, public_classes = find_all_classes_from_file(module_name)
+    old_cased_name = old_model_infos.camelcase_name
+    # `<folder>.<file without .py>`, i.e. the module location relative to the models package
+    import_location = ".".join(module_name.parts[-2:]).replace(".py", "")
+    imports = f"from ..{import_location} import {', '.join(all_classes)}"
+    subclass_definitions = []
+    for class_ in all_classes:
+        renamed = class_.replace(old_cased_name, new_cased_name)
+        subclass_definitions.append(f"class {renamed}({class_}):\n    pass")
+    modular_classes = "\n\n".join(subclass_definitions)
+    renamed_public_classes = [class_.replace(old_cased_name, new_cased_name) for class_ in public_classes]
+    return imports, modular_classes, renamed_public_classes
+def create_modular_file(
+    repo_path: Path,
+    old_model_infos: ModelInfos,
+    new_lowercase_name: str,
+    filenames_to_add: list[tuple[str, bool]],
+) -> tuple[str, list[str]]:
+    """
+    Create a new modular file which will copy the old model, based on the new name and the different filenames
+    (modules) to add.
+    Args:
+        repo_path (`Path`):
+            The path to the root of the Transformers repository.
+        old_model_infos (`ModelInfos`):
+            The structure containing the class information of the old model.
+        new_lowercase_name (`str`):
+            The new lowercase model name.
+        filenames_to_add (`list[tuple[str, bool]]`):
+            A list of tuples of all potential filenames to add for a new model, along a boolean flag describing if we
+            should add this file or not. For example, [(`modeling_xxx.px`, True), (`configuration_xxx.py`, True), (`tokenization_xxx.py`, False),...]
+    Returns:
+        `tuple[str, list[str]]`: the content of the modular file, and the names of the public classes of the new
+        model (without surrounding quotes).
+    """
+    new_cased_name = "".join(x.title() for x in new_lowercase_name.replace("-", "_").split("_"))
+    old_lowercase_name = old_model_infos.lowercase_name
+    old_folder_root = repo_path / "src" / "transformers" / "models" / old_lowercase_name
+    # Construct the modular file from the original (old) model, by subclassing each class
+    import_lines = []
+    class_bodies = []
+    all_public_classes = []
+    for filename, to_add in filenames_to_add:
+        if not to_add:
+            continue
+        imports, body, public_classes = find_modular_structure(
+            old_folder_root / filename, old_model_infos, new_cased_name
+        )
+        import_lines.append(f"\n{imports}")
+        class_bodies.append(f"\n\n{body}")
+        all_public_classes.extend(public_classes)
+    # Create the __all__ assignment (the separator carries the indentation needed inside the template)
+    public_classes_formatted = "\n            ".join(f"{public_class}," for public_class in all_public_classes)
+    all_statement = textwrap.dedent(
+        f"""
+        __all__ = [
+            {public_classes_formatted}
+        ]
+        """
+    )
+    # Create the whole modular file
+    modular_file = COPYRIGHT + "".join(import_lines) + "".join(class_bodies) + all_statement
+    # Remove outer explicit quotes "" around the public class names before returning them
+    all_public_classes = [public_class.replace('"', "") for public_class in all_public_classes]
+    return modular_file, all_public_classes
+def create_test_files(
+    repo_path: Path, old_model_infos: ModelInfos, new_lowercase_name: str, filenames_to_add: list[tuple[str, bool]]
+) -> dict[str, str]:
+    """
+    Create the test files for the new model. It basically copies over the old test files and adjust the class names.
+    Args:
+        repo_path (`Path`):
+            The path to the root of the Transformers repository.
+        old_model_infos (`ModelInfos`):
+            The structure containing the class information of the old model.
+        new_lowercase_name (`str`):
+            The new lowercase model name.
+        filenames_to_add (`list[tuple[str, bool]]`):
+            A list of tuples of all potential filenames to add for a new model, along a boolean flag describing if we
+            should add this file or not. For example, [(`modeling_xxx.px`, True), (`configuration_xxx.py`, True), (`tokenization_xxx.py`, False),...]
+            The first entry of the list is skipped.
+    Returns:
+        `dict[str, str]`: the content of each new test file, keyed by its filename. Files whose original test does
+        not exist are left out.
+    """
+    new_cased_name = "".join(x.title() for x in new_lowercase_name.replace("-", "_").split("_"))
+    old_lowercase_name = old_model_infos.lowercase_name
+    old_cased_name = old_model_infos.camelcase_name
+    filenames_to_add = [
+        ("test_" + filename.replace(old_lowercase_name, new_lowercase_name), to_add)
+        for filename, to_add in filenames_to_add[1:]
+    ]
+    # fast tokenizer/image processor have the same test files as normal ones.
+    # The alternation is grouped so that only the `_fast` variants match: written as
+    # `test_(?:tokenization)|(?:image_processing)_..._fast.py`, the `|` split the whole pattern in two and every
+    # filename containing "test_tokenization" was merged into the previous entry.
+    fast_variant = re.compile(rf"test_(?:tokenization|image_processing)_{re.escape(new_lowercase_name)}_fast\.py")
+    corrected_filenames_to_add = []
+    for file, to_add in filenames_to_add:
+        # Without a previous entry there is nothing to merge into, so the file is kept as its own entry
+        if corrected_filenames_to_add and fast_variant.search(file):
+            previous_file, previous_to_add = corrected_filenames_to_add[-1]
+            corrected_filenames_to_add[-1] = (previous_file, previous_to_add or to_add)
+        else:
+            corrected_filenames_to_add.append((file, to_add))
+    test_files = {}
+    for new_file, to_add in corrected_filenames_to_add:
+        if not to_add:
+            continue
+        original_test_file = new_file.replace(new_lowercase_name, old_lowercase_name)
+        original_test_path = repo_path / "tests" / "models" / old_lowercase_name / original_test_file
+        # Sometimes, tests may not exist
+        if not original_test_path.is_file():
+            continue
+        with open(original_test_path, "r", encoding="utf-8") as f:
+            test_code = f.read()
+        # Remove old copyright and add new one. The bound check protects against a file made of comments only.
+        test_lines = test_code.split("\n")
+        idx = 0
+        while idx < len(test_lines) and test_lines[idx].startswith("#"):
+            idx += 1
+        test_code = COPYRIGHT + "\n".join(test_lines[idx:])
+        test_files[new_file] = test_code.replace(old_cased_name, new_cased_name)
+    return test_files
+def _add_new_model_like_internal(
+    repo_path: Path,
+    old_model_infos: ModelInfos,
+    new_lowercase_name: str,
+    new_model_paper_name: str,
+    filenames_to_add: list[tuple[str, bool]],
+):
+    """
+    Creates a new model module like a given model of the Transformers library. Files are created and edited inside
+    `repo_path`, then the linters and the modular converter are run from there.
+    Args:
+        repo_path (`Path`):
+            The path to the root of the Transformers repository.
+        old_model_infos (`ModelInfos`):
+            The structure containing the class information of the old model.
+        new_lowercase_name (`str`):
+            The new lowercase model name.
+        new_model_paper_name (`str`):
+            The fully cased name (as in the official paper name) of the new model.
+        filenames_to_add (`list[tuple[str, bool]]`):
+            A list of tuples of all potential filenames to add for a new model, along a boolean flag describing if we
+            should add this file or not. For example, [(`modeling_xxx.px`, True), (`configuration_xxx.py`, True), (`tokenization_xxx.py`, False),...]
+    Raises:
+        `ValueError`: if `libcst` is not installed.
+    """
+    # As the import was protected, raise if not present (as it's actually a hard dependency for this command)
+    if not is_libcst_available():
+        raise ValueError("You need to install `libcst` to run this command -> `pip install libcst`")
+    old_lowercase_name = old_model_infos.lowercase_name
+    # 1. We create the folder for our new model
+    new_module_folder = repo_path / "src" / "transformers" / "models" / new_lowercase_name
+    os.makedirs(new_module_folder, exist_ok=True)
+    # 2. Create and add the modular file
+    # Every generated text file is written as UTF-8 explicitly, so that the result does not depend on the locale
+    # (the doc file of step 7 contains a non-ASCII character)
+    modular_file, public_classes = create_modular_file(
+        repo_path, old_model_infos, new_lowercase_name, filenames_to_add
+    )
+    with open(new_module_folder / f"modular_{new_lowercase_name}.py", "w", encoding="utf-8") as f:
+        f.write(modular_file)
+    # 3. Create and add the __init__.py
+    init_file = create_init_file(old_lowercase_name, new_lowercase_name, filenames_to_add)
+    with open(new_module_folder / "__init__.py", "w", encoding="utf-8") as f:
+        f.write(init_file)
+    # 4. Add new model to the models init
+    add_content_to_file(
+        repo_path / "src" / "transformers" / "models" / "__init__.py",
+        new_content=f"    from .{new_lowercase_name} import *\n",
+        add_after="if TYPE_CHECKING:\n",
+    )
+    # 5. Add model to auto mappings
+    add_model_to_auto_mappings(repo_path, old_model_infos, new_lowercase_name, new_model_paper_name, filenames_to_add)
+    # 6. Add test files
+    tests_folder = repo_path / "tests" / "models" / new_lowercase_name
+    os.makedirs(tests_folder, exist_ok=True)
+    # Add empty __init__.py
+    with open(tests_folder / "__init__.py", "w"):
+        pass
+    test_files = create_test_files(repo_path, old_model_infos, new_lowercase_name, filenames_to_add)
+    for filename, content in test_files.items():
+        with open(tests_folder / filename, "w", encoding="utf-8") as f:
+            f.write(content)
+    # 7. Add doc file
+    doc_file = create_doc_file(new_model_paper_name, public_classes)
+    with open(
+        repo_path / "docs" / "source" / "en" / "model_doc" / f"{new_lowercase_name}.md", "w", encoding="utf-8"
+    ) as f:
+        f.write(doc_file)
+    insert_model_in_doc_toc(repo_path, old_lowercase_name, new_lowercase_name, new_model_paper_name)
+    # 8. Run linters (the return codes of the commands below are not checked)
+    model_init_file = repo_path / "src" / "transformers" / "models" / "__init__.py"
+    subprocess.run(
+        ["ruff", "check", new_module_folder, tests_folder, model_init_file, "--fix"],
+        cwd=repo_path,
+        stdout=subprocess.DEVNULL,
+    )
+    subprocess.run(
+        ["ruff", "format", new_module_folder, tests_folder, model_init_file],
+        cwd=repo_path,
+        stdout=subprocess.DEVNULL,
+    )
+    subprocess.run(
+        ["python", "utils/check_doc_toc.py", "--fix_and_overwrite"], cwd=repo_path, stdout=subprocess.DEVNULL
+    )
+    subprocess.run(["python", "utils/sort_auto_mappings.py"], cwd=repo_path, stdout=subprocess.DEVNULL)
+    # 9. Run the modular conversion
+    subprocess.run(
+        ["python", "utils/modular_model_converter.py", new_lowercase_name], cwd=repo_path, stdout=subprocess.DEVNULL
+    )
+def get_user_field(
+    question: str,
+    default_value: str | None = None,
+    convert_to: Callable | None = None,
+    fallback_message: str | None = None,
+) -> Any:
+    """
+    A utility function that asks a question to the user to get an answer, potentially looping until it gets a valid
+    answer.
+    Args:
+        question (`str`):
+            The question to ask the user.
+        default_value (`str`, *optional*):
+            A potential default value that will be used when the answer is empty.
+        convert_to (`Callable`, *optional*):
+            If set, the answer will be passed to this function. If this function raises an error on the provided
+            answer, the question will be asked again.
+        fallback_message (`str`, *optional*):
+            A message that will be displayed each time the question is asked again to the user. Nothing is displayed
+            when it is not provided.
+    Returns:
+        `Any`: The answer provided by the user (or the default), passed through the potential conversion function.
+    """
+    if not question.endswith(" "):
+        question = question + " "
+    if default_value is not None:
+        question = f"{question} [{default_value}] "
+    while True:
+        answer = input(question)
+        if default_value is not None and len(answer) == 0:
+            answer = default_value
+        # Without a conversion function, any answer is valid
+        if convert_to is None:
+            return answer
+        try:
+            return convert_to(answer)
+        except Exception:
+            # Any error raised by the conversion means the answer is invalid: ask again. The message is only
+            # displayed when there is one, instead of printing the literal "None".
+            if fallback_message is not None:
+                print(fallback_message)
def convert_to_bool(x: str) -> bool:
    """
    Converts a string to a bool.

    Matching is case-insensitive and ignores surrounding whitespace, so answers typed interactively such as `"Y "`
    are accepted.

    Args:
        x (`str`): The string to convert.

    Returns:
        `bool`: `True` for `1`/`y`/`yes`/`true`, `False` for `0`/`n`/`no`/`false`.

    Raises:
        `ValueError`: If `x` is none of the recognized values.
    """
    normalized = x.strip().lower()
    if normalized in ("1", "y", "yes", "true"):
        return True
    if normalized in ("0", "n", "no", "false"):
        return False
    raise ValueError(f"{x} is not a value that can be converted to a bool.")
def get_user_input():
    """
    Ask the user for the necessary inputs to add the new model.

    Returns:
        A 4-tuple `(old_model_infos, new_lowercase_name, new_model_paper_name, filenames_to_add)` where
        `filenames_to_add` is a tuple of `(filename, should_copy)` pairs for the files of the model being duplicated.
    """
    from transformers.models.auto.configuration_auto import CONFIG_MAPPING_NAMES

    model_types = list(CONFIG_MAPPING_NAMES.keys())

    # Get old model type: keep asking until the answer is a known model type
    while True:
        old_model_type = input(
            "What model would you like to duplicate? Please provide it as lowercase (e.g. `llama`): "
        ).strip()
        if old_model_type in model_types:
            break
        print(f"{old_model_type} is not a valid model type.")
        near_choices = difflib.get_close_matches(old_model_type, model_types)
        if near_choices:
            # Always join, so that a single suggestion is not displayed as a list repr
            print(f"Did you mean {' or '.join(near_choices)}?")

    old_model_infos = ModelInfos(old_model_type)

    # Ask for the new model name
    new_lowercase_name = get_user_field(
        "What is the new model name? Please provide it as snake lowercase, e.g. `new_model`?"
    )
    new_model_paper_name = get_user_field(
        "What is the fully cased name you would like to appear in the doc (e.g. `NeW ModEl`)? ",
        default_value="".join(x.title() for x in new_lowercase_name.split("_")),
    )

    def _ask_whether_to_create(component: str) -> bool:
        # One yes/no question per optional component, all sharing the same wording
        return get_user_field(
            f"Do you want to create a new {component}? If `no`, it will use the same as {old_model_type} (y/n)?",
            convert_to=convert_to_bool,
            fallback_message="Please answer yes/no, y/n, true/false or 1/0. ",
        )

    # Ask if we want to add individual processor classes as well. A question is only asked when the old model
    # has the corresponding class; otherwise the component is never added.
    add_tokenizer = False
    add_fast_tokenizer = False
    add_image_processor = False
    add_video_processor = False
    add_feature_extractor = False
    add_processor = False
    if old_model_infos.tokenizer_class is not None:
        add_tokenizer = _ask_whether_to_create("tokenizer")
    if old_model_infos.fast_tokenizer_class is not None:
        add_fast_tokenizer = _ask_whether_to_create("fast tokenizer")
    if old_model_infos.image_processor_classes is not None:
        add_image_processor = _ask_whether_to_create("image processor")
    if old_model_infos.video_processor_class is not None:
        add_video_processor = _ask_whether_to_create("video processor")
    if old_model_infos.feature_extractor_class is not None:
        add_feature_extractor = _ask_whether_to_create("feature extractor")
    if old_model_infos.processor_class is not None:
        add_processor = _ask_whether_to_create("processor")

    old_lowercase_name = old_model_infos.lowercase_name
    # A list of the old filenames, along whether we should copy them or not
    filenames_to_add = (
        (f"configuration_{old_lowercase_name}.py", True),
        (f"modeling_{old_lowercase_name}.py", True),
        (f"tokenization_{old_lowercase_name}.py", add_tokenizer),
        (f"tokenization_{old_lowercase_name}_fast.py", add_fast_tokenizer),
        (f"image_processing_{old_lowercase_name}.py", add_image_processor),
        (f"video_processing_{old_lowercase_name}.py", add_video_processor),
        (f"feature_extraction_{old_lowercase_name}.py", add_feature_extractor),
        (f"processing_{old_lowercase_name}.py", add_processor),
    )
    return old_model_infos, new_lowercase_name, new_model_paper_name, filenames_to_add

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/cli/chat.py ADDED Viewed

	@@ -0,0 +1,673 @@

+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import asyncio
+import json
+import os
+import platform
+import re
+import string
+import time
+from collections.abc import AsyncIterator, Awaitable
+from typing import Annotated, Any
+from urllib.parse import urljoin, urlparse
+import httpx
+import requests
+import typer
+import yaml
+from huggingface_hub import AsyncInferenceClient, ChatCompletionStreamOutput
+from transformers import GenerationConfig
+from transformers.utils import is_rich_available
+try:
+    import readline  # noqa importing this enables GNU readline capabilities
+except ImportError:
+    # some platforms may not support readline: https://docs.python.org/3/library/readline.html
+    pass
+if platform.system() != "Windows":
+    import pwd
+if is_rich_available():
+    from rich import filesize
+    from rich.console import Console
+    from rich.live import Live
+    from rich.markdown import Markdown
+    from rich.progress import BarColumn, Progress, ProgressColumn, TextColumn, TimeElapsedColumn
+    from rich.text import Text
# Host and port used to build the default `base_url` of the chat command
DEFAULT_HTTP_ENDPOINT = {"hostname": "localhost", "port": 8000}

# Character whitelists (presumably for user-provided `key=value` settings -- confirm against the call sites)
ALLOWED_KEY_CHARS = set(string.ascii_letters + string.whitespace)
ALLOWED_VALUE_CHARS = set(
    string.ascii_letters + string.digits + string.whitespace + r".!\"#$%&'()*+,\-/:<=>?@[]^_`{|}~"
)

# Built-in prompts that can be loaded with `!example {NAME}`, keyed by example name
DEFAULT_EXAMPLES = {
    "llama": {"text": "There is a Llama in my lawn, how can I get rid of it?"},
    "code": {
        "text": (
            "Write a Python function that integrates any Python function f(x) numerically over an arbitrary "
            "interval [x_start, x_end]."
        ),
    },
    "helicopter": {"text": "How many helicopters can a human eat in one sitting?"},
    "numbers": {"text": "Count to 10 but skip every number ending with an 'e'"},
    "birds": {"text": "Why aren't birds real?"},
    "socks": {"text": "Why is it important to eat socks after meditating?"},
    "numbers2": {"text": "Which number is larger, 9.9 or 9.11?"},
}

# Printed at the start of a chat session
HELP_STRING_MINIMAL = """
**TRANSFORMERS CHAT INTERFACE**
Chat interface to try out a model. Besides chatting with the model, here are some basic commands:
- **!help**: shows all available commands (set generation settings, save chat, etc.)
- **!status**: shows the current status of the model and generation settings
- **!clear**: clears the current conversation and starts a new one
- **!exit**: closes the interface
"""

# Printed when the user types `!help` in the chat session. The default save path documented here must stay in sync
# with the path built by the `!save` command, which writes a `.json` file.
HELP_STRING = f"""
**TRANSFORMERS CHAT INTERFACE HELP**
Full command list:
- **!help**: shows this help message
- **!clear**: clears the current conversation and starts a new one
- **!status**: shows the current status of the model and generation settings
- **!example {{NAME}}**: loads example named `{{NAME}}` from the config and uses it as the user input.
Available example names: `{"`, `".join(DEFAULT_EXAMPLES.keys())}`
- **!set {{ARG_1}}={{VALUE_1}} {{ARG_2}}={{VALUE_2}}** ...: changes the system prompt or generation settings (multiple
settings are separated by a space). Accepts the same flags and format as the `generate_flags` CLI argument.
If you're a new user, check this basic flag guide: https://huggingface.co/docs/transformers/llm_tutorial#common-options
- **!save {{SAVE_NAME}} (optional)**: saves the current chat and settings to file by default to
`./chat_history/{{MODEL_ID}}/chat_{{DATETIME}}.json` or `{{SAVE_NAME}}` if provided
- **!exit**: closes the interface
"""
class RichInterface:
    """Console front-end of the chat session, rendering everything through a `rich` console."""

    def __init__(self, model_id: str, user_id: str, base_url: str):
        self._console = Console()
        self.model_id = model_id
        self.user_id = user_id
        self.base_url = base_url

    async def stream_output(
        self, stream: Awaitable[AsyncIterator[ChatCompletionStreamOutput]]
    ) -> tuple[str, str | Any | None]:
        """
        Renders a streamed completion live as Markdown and returns it once the stream is exhausted.

        Args:
            stream: Awaitable resolving to an async iterator of streamed chat completion chunks.

        Returns:
            A `(text, finish_reason)` tuple: the accumulated (Markdown-escaped) text and the last finish reason
            reported by the stream, or `None` if none was reported.
        """
        self._console.print(f"[bold blue]<{self.model_id}>:")
        with Live(console=self._console, refresh_per_second=4) as live:
            text = ""
            completion_tokens = 0
            start_time = time.time()
            finish_reason: str | None = None
            async for token in await stream:
                usage = getattr(token, "usage", None)
                if usage is not None:
                    reported_tokens = getattr(usage, "completion_tokens", None)
                    if reported_tokens is not None:
                        completion_tokens = reported_tokens
                # A chunk may carry no choices (e.g. a usage-only chunk): skip it instead of indexing an empty list
                if not token.choices:
                    continue
                choice = token.choices[0]
                # Only overwrite with an actual value, so that a later chunk without a finish reason does not
                # erase the one already received
                chunk_finish_reason = getattr(choice, "finish_reason", None)
                if chunk_finish_reason is not None:
                    finish_reason = chunk_finish_reason
                outputs = choice.delta.content
                if not outputs:
                    continue
                # Escapes single words encased in <>, e.g. <think> -> \<think\>, for proper rendering in Markdown.
                # It only escapes single words that may have `_`, optionally following a `/` (e.g. </think>)
                outputs = re.sub(r"<(/*)(\w*)>", r"\<\1\2\>", outputs)
                text += outputs
                # Render the accumulated text as Markdown
                # NOTE: this is a workaround for the rendering "unstandard markdown"
                #  in rich. The chatbots output treat "\n" as a new line for
                #  better compatibility with real-world text. However, rendering
                #  in markdown would break the format. It is because standard markdown
                #  treat a single "\n" in normal text as a space.
                #  Our workaround is adding two spaces at the end of each line.
                #  This is not a perfect solution, as it would
                #  introduce trailing spaces (only) in code block, but it works well
                #  especially for console output, because in general the console does not
                #  care about trailing spaces.
                lines = []
                for line in text.splitlines():
                    lines.append(line)
                    if line.startswith("```"):
                        # Code block marker - do not add trailing spaces, as it would
                        #  break the syntax highlighting
                        lines.append("\n")
                    else:
                        lines.append("  \n")
                markdown = Markdown("".join(lines).strip(), code_theme="github-dark")
                # Update the Live console output
                live.update(markdown, refresh=True)
        elapsed = time.time() - start_time
        if elapsed > 0 and completion_tokens > 0:
            tok_per_sec = completion_tokens / elapsed
            self._console.print()
            self._console.print(f"[dim]{completion_tokens} tokens in {elapsed:.1f}s ({tok_per_sec:.1f} tok/s)[/dim]")
        self._console.print()
        return text, finish_reason

    def input(self) -> str:
        """Gets user input from the console."""
        # Local name chosen so that it does not shadow the `input` builtin
        user_text = self._console.input(f"[bold red]<{self.user_id}>:\n")
        self._console.print()
        return user_text

    def clear(self):
        """Clears the console."""
        self._console.clear()

    def print_user_message(self, text: str):
        """Prints a user message to the console."""
        self._console.print(f"[bold red]<{self.user_id}>:[/ bold red]\n{text}")
        self._console.print()

    def print_color(self, text: str, color: str):
        """Prints text in a given color to the console."""
        self._console.print(f"[bold {color}]{text}")
        self._console.print()

    def confirm(self, message: str, default: bool = False) -> bool:
        """Displays a yes/no prompt to the user, returning True for confirmation."""
        default_hint = "Y/n" if default else "y/N"
        response = self._console.input(f"[bold yellow]{message} ({default_hint}): ")
        self._console.print()
        response = response.strip().lower()
        if not response:
            return default
        return response in {"y", "yes"}

    def print_help(self, minimal: bool = False):
        """Prints the help message to the console."""
        self._console.print(Markdown(HELP_STRING_MINIMAL if minimal else HELP_STRING))
        self._console.print()

    def print_model_load(self, model: str):
        """
        Asks the server to load `model` and displays the loading progress until the server reports it is ready.

        The server answers `POST {base_url}/load_model` with a stream of `data: {json}` lines. Each event has a
        `status` (`loading`, `ready` or `error`); `loading` events carry a `stage` and an optional `progress`.

        Raises:
            `requests.HTTPError`: If the server answers with an error status code.
            `RuntimeError`: If the server reports an `error` event while loading.
        """

        class StatsColumn(ProgressColumn):
            def render(self, task):
                if not task.total:
                    return Text("")
                if task.fields.get("unit") == "bytes":
                    done = filesize.decimal(int(task.completed))
                    tot = filesize.decimal(int(task.total))
                    speed = f"  {filesize.decimal(int(task.speed))}/s" if task.speed else ""
                    if task.time_remaining is not None:
                        eta = f"  {int(task.time_remaining // 60)}:{int(task.time_remaining % 60):02d}"
                    else:
                        eta = ""
                    return Text(f"{done}/{tot}{speed}{eta}", style="progress.download")
                return Text(f"{int(task.completed)}/{int(task.total)}")

        stage_labels = {
            "processor": "Loading processor",
            "config": "Loading config",
            "download": "Downloading files",
            "weights": "Loading into memory",
        }
        # Include the model name prefix in descriptions only when the terminal is wide enough.
        # The bar, stats, and elapsed columns need ~70 chars; the model prefix needs len(model)+5.
        show_model_prefix = self._console.width >= len(model) + 5 + 70

        def _label(stage_key):
            stage_text = stage_labels.get(stage_key, stage_key)
            if show_model_prefix:
                return f"{model}  →  {stage_text}"
            return stage_text

        progress = Progress(
            TextColumn("[bold]{task.description}"),
            BarColumn(bar_width=40),
            StatsColumn(),
            TimeElapsedColumn(),
            console=self._console,
        )
        task_id = progress.add_task(_label("processor"), total=None)
        cached = False
        # The response is streamed and we stop reading as soon as the model is ready: use it as a context manager
        # so that the underlying connection is always released.
        with requests.post(
            f"{self.base_url.rstrip('/')}/load_model", json={"model": model}, stream=True
        ) as response:
            response.raise_for_status()
            with Live(progress, console=self._console, transient=True):
                for line in response.iter_lines():
                    if not line or not line.startswith(b"data: "):
                        continue
                    # Strip the `data: ` prefix (6 bytes) to get the JSON payload
                    event = json.loads(line[6:])
                    status = event.get("status")
                    if status == "ready":
                        cached = event.get("cached", False)
                        break
                    if status == "error":
                        raise RuntimeError(event.get("message", "Unknown error"))
                    if status == "loading":
                        stage = event.get("stage")
                        prog = event.get("progress")
                        label = _label(stage)
                        if prog:
                            unit = "bytes" if stage == "download" else "items"
                            progress.update(
                                task_id,
                                description=label,
                                completed=prog["current"],
                                total=prog.get("total"),
                                unit=unit,
                            )
                        else:
                            progress.update(task_id, description=label, completed=0, total=None)
        if cached:
            self._console.print(Markdown(f"_*{model} was already loaded.*_"))
        else:
            self._console.print(Markdown(f"_*{model} is warm.*_"))
        self._console.print()

    def print_status(self, config: GenerationConfig):
        """Prints the status of the model and generation settings to the console."""
        self._console.print(f"[bold blue]Model: {self.model_id}\n")
        self._console.print(f"[bold blue]{config}")
        self._console.print()
class Chat:
    """Chat with a model from the command line."""

    # Defining a class to help with internal state but in practice it's just a method to call
    # TODO: refactor into a proper module with helpers + 1 main method
    def __init__(
        self,
        model_id: Annotated[str, typer.Argument(help="ID of the model to use (e.g. 'HuggingFaceTB/SmolLM3-3B').")],
        base_url: Annotated[
            str | None, typer.Argument(help="Base url to connect to (e.g. http://localhost:8000/v1).")
        ] = f"http://{DEFAULT_HTTP_ENDPOINT['hostname']}:{DEFAULT_HTTP_ENDPOINT['port']}",
        generate_flags: Annotated[
            list[str] | None,
            typer.Argument(
                help=(
                    "Flags to pass to `generate`, using a space as a separator between flags. Accepts booleans, numbers, "
                    "and lists of integers, more advanced parameterization should be set through --generation-config. "
                    "Example: `transformers chat <base_url> <model_id> max_new_tokens=100 do_sample=False eos_token_id=[1,2]`. "
                    "If you're a new user, check this basic flag guide: "
                    "https://huggingface.co/docs/transformers/llm_tutorial#common-options"
                )
            ),
        ] = None,
        # General settings
        user: Annotated[
            str | None,
            typer.Option(help="Username to display in chat interface. Defaults to the current user's name."),
        ] = None,
        system_prompt: Annotated[str | None, typer.Option(help="System prompt.")] = None,
        save_folder: Annotated[str, typer.Option(help="Folder to save chat history.")] = "./chat_history/",
        examples_path: Annotated[str | None, typer.Option(help="Path to a yaml file with examples.")] = None,
        # Generation settings
        generation_config: Annotated[
            str | None,
            typer.Option(
                help="Path to a local generation config file or to a HuggingFace repo containing a `generation_config.json` file. Other generation settings passed as CLI arguments will be applied on top of this generation config."
            ),
        ] = None,
    ) -> None:
        """Chat with a model from the command line."""
        # Check requirements first: the whole interface is built on rich, so fail before doing any network call
        if not is_rich_available():
            raise ImportError("You need to install rich to use the chat interface. (`pip install rich`)")

        self.base_url = base_url
        # Only the default local endpoint is health-checked up front
        parsed = urlparse(self.base_url)
        if parsed.hostname == DEFAULT_HTTP_ENDPOINT["hostname"] and parsed.port == DEFAULT_HTTP_ENDPOINT["port"]:
            self.check_health(self.base_url)
        self.model_id = model_id
        self.system_prompt = system_prompt
        self.save_folder = save_folder

        # Generation settings: config file first, then our defaults, then the flags passed by the user
        config = load_generation_config(generation_config)
        config.update(do_sample=True, max_new_tokens=256)  # some default values
        config.update(**parse_generate_flags(generate_flags))
        self.config = config
        self.settings = {"base_url": base_url, "model_id": model_id, "config": self.config.to_dict()}

        # User settings
        self.user = user if user is not None else get_username()

        # Load examples
        if examples_path:
            with open(examples_path) as f:
                self.examples = yaml.safe_load(f)
        else:
            self.examples = DEFAULT_EXAMPLES

        # Run chat session
        asyncio.run(self._inner_run())

    @staticmethod
    def check_health(url):
        """
        Checks that a server answers with status code 200 on `{url}/health`.

        Returns:
            `True` when the server is healthy.

        Raises:
            `ValueError`: If no server can be reached on `url`, or if the health check returns another status code.
        """
        health_url = urljoin(url + "/", "health")
        try:
            output = httpx.get(health_url)
        except httpx.ConnectError as err:
            raise ValueError(
                f"No server currently running on {url}. To run a local server, please run `transformers serve` in a "
                "separate shell. Find more information here: https://huggingface.co/docs/transformers/serving"
            ) from err
        if output.status_code != 200:
            raise ValueError(
                f"The server running on {url} returned status code {output.status_code} on health check (/health)."
            )
        return True

    def handle_non_exit_user_commands(
        self,
        user_input: str,
        interface: RichInterface,
        examples: dict[str, dict[str, str]],
        config: GenerationConfig,
        chat: list[dict],
    ) -> tuple[list[dict], bool, GenerationConfig]:
        """
        Handles all user commands except for `!exit`. May update the chat history (e.g. reset it) or the
        generation config (e.g. set a new flag).

        Returns:
            A `(chat, valid_command, config)` tuple, where `valid_command` is `False` when `user_input` did not
            match any known command.
        """
        valid_command = True
        if user_input == "!clear":
            chat = new_chat_history(self.system_prompt)
            interface.clear()
        elif user_input == "!help":
            interface.print_help()
        # `!save` takes an optional file name, i.e. one or two whitespace-separated tokens
        elif user_input.startswith("!save") and len(user_input.split()) <= 2:
            split_input = user_input.split()
            filename = (
                split_input[1]
                if len(split_input) == 2
                else os.path.join(self.save_folder, self.model_id, f"chat_{time.strftime('%Y-%m-%d_%H-%M-%S')}.json")
            )
            save_chat(filename=filename, chat=chat, settings=self.settings)
            interface.print_color(text=f"Chat saved to {filename}!", color="green")
        elif user_input.startswith("!set"):
            # splits the new args into a list of strings, each string being a `flag=value` pair (same format as
            # `generate_flags`)
            new_generate_flags = user_input[4:].strip()
            new_generate_flags = new_generate_flags.split()
            # sanity check: each member in the list must have an =
            for flag in new_generate_flags:
                if "=" not in flag:
                    interface.print_color(
                        text=(
                            f"Invalid flag format, missing `=` after `{flag}`. Please use the format "
                            "`arg_1=value_1 arg_2=value_2 ...`."
                        ),
                        color="red",
                    )
                    break
            else:
                # Update config from user flags (only reached when every flag has an `=`)
                config.update(**parse_generate_flags(new_generate_flags))
        elif user_input.startswith("!example") and len(user_input.split()) == 2:
            example_name = user_input.split()[1]
            if example_name in examples:
                interface.clear()
                chat = []
                interface.print_user_message(examples[example_name]["text"])
                chat.append({"role": "user", "content": examples[example_name]["text"]})
            else:
                example_error = (
                    f"Example {example_name} not found in list of available examples: {list(examples.keys())}."
                )
                interface.print_color(text=example_error, color="red")
        elif user_input == "!status":
            interface.print_status(config=config)
        else:
            valid_command = False
            interface.print_color(text=f"'{user_input}' is not a valid command. Showing help message.", color="red")
            interface.print_help()
        return chat, valid_command, config

    async def _inner_run(self):
        """Runs the interactive chat loop until the user types `!exit` or interrupts with Ctrl+C."""
        interface = RichInterface(model_id=self.model_id, user_id=self.user, base_url=self.base_url)
        interface.clear()
        chat = new_chat_history(self.system_prompt)
        # Starts the session with a minimal help message at the top, so that a user doesn't get stuck
        interface.print_help(minimal=True)
        interface.print_model_load(self.model_id)
        config = self.config
        async with AsyncInferenceClient(base_url=self.base_url) as client:
            # Message queued by the interface itself (e.g. to continue a truncated generation), sent on the next
            # iteration instead of prompting the user
            pending_user_input: str | None = None
            while True:
                try:
                    if pending_user_input is not None:
                        user_input = pending_user_input
                        pending_user_input = None
                        interface.print_user_message(user_input)
                    else:
                        user_input = interface.input()
                    # User commands. Every branch but a successful `!example` and a regular message goes back to
                    # the prompt without calling the model.
                    if user_input == "!exit":
                        break
                    elif user_input == "!clear":
                        chat = new_chat_history(self.system_prompt)
                        interface.clear()
                        continue
                    elif user_input == "!help":
                        interface.print_help()
                        continue
                    # `!save` takes an optional file name, i.e. one or two whitespace-separated tokens
                    elif user_input.startswith("!save") and len(user_input.split()) <= 2:
                        split_input = user_input.split()
                        filename = (
                            split_input[1]
                            if len(split_input) == 2
                            else os.path.join(
                                self.save_folder, self.model_id, f"chat_{time.strftime('%Y-%m-%d_%H-%M-%S')}.json"
                            )
                        )
                        save_chat(filename=filename, chat=chat, settings=self.settings)
                        interface.print_color(text=f"Chat saved to {filename}!", color="green")
                        continue
                    elif user_input.startswith("!set"):
                        # splits the new args into a list of strings, each string being a `flag=value` pair (same format as
                        # `generate_flags`)
                        new_generate_flags = user_input[4:].strip()
                        new_generate_flags = new_generate_flags.split()
                        # sanity check: each member in the list must have an =
                        for flag in new_generate_flags:
                            if "=" not in flag:
                                interface.print_color(
                                    text=(
                                        f"Invalid flag format, missing `=` after `{flag}`. Please use the format "
                                        "`arg_1=value_1 arg_2=value_2 ...`."
                                    ),
                                    color="red",
                                )
                                break
                        else:
                            # Update config from user flags (only reached when every flag has an `=`)
                            config.update(**parse_generate_flags(new_generate_flags))
                        continue
                    elif user_input.startswith("!example") and len(user_input.split()) == 2:
                        example_name = user_input.split()[1]
                        if example_name in self.examples:
                            interface.clear()
                            chat = []
                            interface.print_user_message(self.examples[example_name]["text"])
                            chat.append({"role": "user", "content": self.examples[example_name]["text"]})
                        else:
                            example_error = f"Example {example_name} not found in list of available examples: {list(self.examples.keys())}."
                            interface.print_color(text=example_error, color="red")
                            # Nothing was added to the chat: do not send the unchanged history to the model
                            continue
                    elif user_input == "!status":
                        interface.print_status(config=config)
                        continue
                    elif user_input.startswith("!"):
                        interface.print_color(
                            text=f"'{user_input}' is not a valid command. Showing help message.", color="red"
                        )
                        interface.print_help()
                        continue
                    else:
                        chat.append({"role": "user", "content": user_input})

                    extra_body = {
                        "generation_config": config.to_json_string(),
                        "model": self.model_id,
                    }
                    stream = client.chat_completion(
                        chat,
                        stream=True,
                        model=self.model_id,
                        extra_body=extra_body,
                    )
                    model_output, finish_reason = await interface.stream_output(stream)
                    chat.append({"role": "assistant", "content": model_output})
                    if finish_reason == "length":
                        interface.print_color("Generation stopped after reaching the token limit.", "yellow")
                        if interface.confirm("Continue generating?"):
                            pending_user_input = "Please continue. Do not repeat text."
                            continue
                except KeyboardInterrupt:
                    break
def load_generation_config(generation_config: str | None) -> GenerationConfig:
    """
    Loads the generation config to use as a base for the chat session.

    Args:
        generation_config (`str`, *optional*):
            Path to a local `.json` generation config file, or anything else `GenerationConfig.from_pretrained`
            accepts as its first argument. If unset, a default `GenerationConfig` is returned.

    Returns:
        `GenerationConfig`: The loaded (or default) generation config.
    """
    if generation_config is None:
        return GenerationConfig()
    if ".json" in generation_config:  # is a local file
        dirname, filename = os.path.split(generation_config)
        # A bare file name (e.g. `generation_config.json`) has an empty dirname: point to the current folder
        # rather than passing an empty model path to `from_pretrained`
        return GenerationConfig.from_pretrained(dirname or ".", filename)
    return GenerationConfig.from_pretrained(generation_config)
+def parse_generate_flags(generate_flags: list[str] | None) -> dict:
+    """Parses the generate flags from the user input into a dictionary of `generate` kwargs."""
+    if generate_flags is None or len(generate_flags) == 0:
+        return {}
+    # Assumption: `generate_flags` is a list of strings, each string being a `flag=value` pair, that can be parsed
+    # into a json string if we:
+    # 1. Add quotes around each flag name
+    generate_flags_as_dict = {'"' + flag.split("=")[0] + '"': flag.split("=")[1] for flag in generate_flags}
+    # 2. Handle types:
+    # 2. a. booleans should be lowercase, None should be null
+    generate_flags_as_dict = {
+        k: v.lower() if v.lower() in ["true", "false"] else v for k, v in generate_flags_as_dict.items()
+    }
+    generate_flags_as_dict = {k: "null" if v == "None" else v for k, v in generate_flags_as_dict.items()}
+    # 2. b. strings should be quoted
+    def is_number(s: str) -> bool:
+        # handle negative numbers
+        s = s.removeprefix("-")
+        return s.replace(".", "", 1).isdigit()
+    generate_flags_as_dict = {k: f'"{v}"' if not is_number(v) else v for k, v in generate_flags_as_dict.items()}
+    # 2. c. [no processing needed] lists are lists of ints because `generate` doesn't take lists of strings :)
+    # We also mention in the help message that we only accept lists of ints for now.
+    # 3. Join the result into a comma separated string
+    generate_flags_string = ", ".join([f"{k}: {v}" for k, v in generate_flags_as_dict.items()])
+    # 4. Add the opening/closing brackets
+    generate_flags_string = "{" + generate_flags_string + "}"
+    # 5. Remove quotes around boolean/null and around lists
+    generate_flags_string = generate_flags_string.replace('"null"', "null")
+    generate_flags_string = generate_flags_string.replace('"true"', "true")
+    generate_flags_string = generate_flags_string.replace('"false"', "false")
+    generate_flags_string = generate_flags_string.replace('"[', "[")
+    generate_flags_string = generate_flags_string.replace(']"', "]")
+    # 6. Replace the `=` with `:`
+    generate_flags_string = generate_flags_string.replace("=", ":")
+    try:
+        processed_generate_flags = json.loads(generate_flags_string)
+    except json.JSONDecodeError:
+        raise ValueError(
+            "Failed to convert `generate_flags` into a valid JSON object."
+            "\n`generate_flags` = {generate_flags}"
+            "\nConverted JSON string = {generate_flags_string}"
+        )
+    return processed_generate_flags
+def new_chat_history(system_prompt: str | None = None) -> list[dict]:
+    """Returns a new chat conversation."""
+    return [{"role": "system", "content": system_prompt}] if system_prompt else []
+def save_chat(filename: str, chat: list[dict], settings: dict) -> str:
+    """Saves the chat history to a file."""
+    os.makedirs(os.path.dirname(filename), exist_ok=True)
+    with open(filename, "w") as f:
+        json.dump({"settings": settings, "chat_history": chat}, f, indent=4)
+    return os.path.abspath(filename)
+def get_username() -> str:
+    """Returns the username of the current user."""
+    if platform.system() == "Windows":
+        return os.getlogin()
+    else:
+        return pwd.getpwuid(os.getuid()).pw_name
+if __name__ == "__main__":
+    # When the module is executed directly, construct `Chat` with a fixed model id.
+    Chat(model_id="meta-llama/Llama-3.2-3b-Instruct")

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/cli/download.py ADDED Viewed

	@@ -0,0 +1,40 @@

+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Annotated
+import typer
+def download(
+    model_id: Annotated[str, typer.Argument(help="The model ID to download")],
+    cache_dir: Annotated[str | None, typer.Option(help="Directory where to save files.")] = None,
+    force_download: Annotated[
+        bool, typer.Option(help="If set, the files will be downloaded even if they are already cached locally.")
+    ] = False,
+    trust_remote_code: Annotated[
+        bool,
+        typer.Option(
+            help="Whether or not to allow for custom models defined on the Hub in their own modeling files. Use only if you've reviewed the code as it will execute on your local machine"
+        ),
+    ] = False,
+):
+    """Download a model and its tokenizer from the Hub."""
+    from ..models.auto import AutoModel, AutoTokenizer
+    AutoModel.from_pretrained(
+        model_id, cache_dir=cache_dir, force_download=force_download, trust_remote_code=trust_remote_code
+    )
+    AutoTokenizer.from_pretrained(
+        model_id, cache_dir=cache_dir, force_download=force_download, trust_remote_code=trust_remote_code
+    )

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/cli/serve.py ADDED Viewed

	@@ -0,0 +1,241 @@

+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+CLI entry point for `transformers serve`.
+"""
+import asyncio
+import enum
+import json
+import threading
+from typing import Annotated
+import typer
+from transformers.utils import logging
+from transformers.utils.import_utils import is_serve_available
+from .serving.utils import set_torch_seed
+logger = logging.get_logger(__name__)
+class ReasoningMode(str, enum.Enum):
+    """Allowed values for the `reasoning` option of `Serve`."""
+    # `Serve.__init__` sets chat_template_kwargs["enable_thinking"] to True for ON and to False for OFF.
+    ON = "on"
+    OFF = "off"
+    # AUTO leaves `enable_thinking` untouched in `Serve.__init__`.
+    AUTO = "auto"
+class Serve:
+    def __init__(
+        self,
+        force_model: Annotated[str | None, typer.Argument(help="Model to preload and use for all requests.")] = None,
+        # Model options
+        continuous_batching: Annotated[
+            bool,
+            typer.Option(help="Enable continuous batching with paged attention. Configure with --cb-* flags."),
+        ] = False,
+        attn_implementation: Annotated[
+            str | None, typer.Option(help="Attention implementation (e.g. flash_attention_2).")
+        ] = None,
+        compile: Annotated[bool, typer.Option(help="Enable torch.compile for faster inference.")] = False,
+        quantization: Annotated[
+            str | None, typer.Option(help="Quantization method: 'bnb-4bit' or 'bnb-8bit'.")
+        ] = None,
+        reasoning: Annotated[
+            ReasoningMode,
+            typer.Option(
+                help=(
+                    "Reasoning mode. 'auto' uses the chat template default. Only applies to models that "
+                    "support reasoning via their chat template (e.g. Qwen3, Gemma 4) — for other models "
+                    "this flag has no effect."
+                )
+            ),
+        ] = ReasoningMode.AUTO,
+        chat_template_kwargs: Annotated[
+            str | None,
+            typer.Option(
+                help=(
+                    "Default JSON kwargs forwarded to apply_chat_template "
+                    "(e.g. '{\"enable_thinking\": true}'); per-request chat_template_kwargs override these."
+                )
+            ),
+        ] = None,
+        device: Annotated[str, typer.Option(help="Device for inference (e.g. 'auto', 'cuda:0', 'cpu').")] = "auto",
+        dtype: Annotated[str | None, typer.Option(help="Override model dtype. 'auto' derives from weights.")] = "auto",
+        trust_remote_code: Annotated[bool, typer.Option(help="Trust remote code when loading.")] = False,
+        model_timeout: Annotated[
+            int, typer.Option(help="Seconds before idle model is unloaded. Ignored when force_model is set.")
+        ] = 300,
+        # Continuous batching tuning
+        cb_block_size: Annotated[
+            int | None, typer.Option(help="KV cache block size in tokens for continuous batching.")
+        ] = None,
+        cb_num_blocks: Annotated[
+            int | None, typer.Option(help="Number of KV cache blocks for continuous batching.")
+        ] = None,
+        cb_max_batch_tokens: Annotated[
+            int | None, typer.Option(help="Maximum tokens per batch for continuous batching.")
+        ] = None,
+        cb_max_memory_percent: Annotated[
+            float | None, typer.Option(help="Max GPU memory fraction for KV cache (0.0-1.0).")
+        ] = None,
+        cb_use_cuda_graph: Annotated[
+            bool | None, typer.Option(help="Enable CUDA graphs for continuous batching.")
+        ] = None,
+        # Server options
+        host: Annotated[str, typer.Option(help="Server listen address.")] = "localhost",
+        port: Annotated[int, typer.Option(help="Server listen port.")] = 8000,
+        enable_cors: Annotated[bool, typer.Option(help="Enable permissive CORS.")] = False,
+        log_level: Annotated[str, typer.Option(help="Logging level (e.g. 'info', 'warning').")] = "warning",
+        default_seed: Annotated[int | None, typer.Option(help="Default torch seed.")] = None,
+        non_blocking: Annotated[
+            bool, typer.Option(hidden=True, help="Run server in a background thread. Used by tests.")
+        ] = False,
+    ) -> None:
+        if not is_serve_available():
+            raise ImportError("Missing dependencies for serving. Install with `pip install transformers[serving]`")
+        import uvicorn
+        from .serving.chat_completion import ChatCompletionHandler
+        from .serving.completion import CompletionHandler
+        from .serving.model_manager import ModelManager
+        from .serving.response import ResponseHandler
+        from .serving.server import build_server
+        from .serving.transcription import TranscriptionHandler
+        from .serving.utils import GenerationState
+        # Seed
+        if default_seed is not None:
+            set_torch_seed(default_seed)
+        # Logging
+        transformers_logger = logging.get_logger("transformers")
+        transformers_logger.setLevel(logging.log_levels[log_level.lower()])
+        self._model_manager = ModelManager(
+            device=device,
+            dtype=dtype,
+            trust_remote_code=trust_remote_code,
+            attn_implementation=attn_implementation,
+            quantization=quantization,
+            model_timeout=model_timeout,
+            force_model=force_model,
+        )
+        from transformers import ContinuousBatchingConfig
+        cb_kwargs = {
+            k: v
+            for k, v in {
+                "block_size": cb_block_size,
+                "num_blocks": cb_num_blocks,
+                "max_batch_tokens": cb_max_batch_tokens,
+                "max_memory_percent": cb_max_memory_percent,
+                "use_cuda_graph": cb_use_cuda_graph,
+            }.items()
+            if v is not None
+        }
+        cb_config = ContinuousBatchingConfig(**cb_kwargs) if cb_kwargs else None
+        self._generation_state = GenerationState(
+            continuous_batching=continuous_batching,
+            compile=compile,
+            cb_config=cb_config,
+        )
+        if chat_template_kwargs:
+            chat_template_kwargs = json.loads(chat_template_kwargs)
+            if not isinstance(chat_template_kwargs, dict):
+                raise typer.BadParameter("--chat-template-kwargs must be a JSON object")
+        else:
+            chat_template_kwargs = {}
+        if reasoning == ReasoningMode.ON:
+            chat_template_kwargs["enable_thinking"] = True
+        elif reasoning == ReasoningMode.OFF:
+            chat_template_kwargs["enable_thinking"] = False
+        self._chat_handler = ChatCompletionHandler(
+            model_manager=self._model_manager,
+            generation_state=self._generation_state,
+            chat_template_kwargs=chat_template_kwargs,
+        )
+        self._completion_handler = CompletionHandler(
+            model_manager=self._model_manager,
+            generation_state=self._generation_state,
+        )
+        self._response_handler = ResponseHandler(
+            model_manager=self._model_manager,
+            generation_state=self._generation_state,
+            chat_template_kwargs=chat_template_kwargs,
+        )
+        self._transcription_handler = TranscriptionHandler(self._model_manager, self._generation_state)
+        app = build_server(
+            self._model_manager,
+            self._chat_handler,
+            completion_handler=self._completion_handler,
+            response_handler=self._response_handler,
+            transcription_handler=self._transcription_handler,
+            generation_state=self._generation_state,
+            enable_cors=enable_cors,
+        )
+        config = uvicorn.Config(app, host=host, port=port, log_level="info")
+        self.server = uvicorn.Server(config)
+        if non_blocking:
+            self.start_server()
+        else:
+            self.server.run()
+    def start_server(self):
+        def _run():
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+            loop.run_until_complete(self.server.serve())
+        self._thread = threading.Thread(target=_run, name="uvicorn-thread", daemon=False)
+        self._thread.start()
+    def reset_loaded_models(self):
+        """Clear all loaded models from memory."""
+        self._model_manager.shutdown()
+    def kill_server(self):
+        self._generation_state.shutdown()
+        self._model_manager.shutdown()
+        if not self._thread or not self._thread.is_alive():
+            return
+        self.server.should_exit = True
+        self._thread.join(timeout=2)
+# The help text is attached after the class body instead of being written as a class docstring.
+# NOTE(review): the lone `\b` line looks like the Click/typer marker that keeps the following paragraph from being
+# re-wrapped in --help output — confirm.
+Serve.__doc__ = """
+Run a FastAPI server to serve models on-demand with an OpenAI compatible API.
+Models will be loaded and unloaded automatically based on usage and a timeout.
+\b
+Endpoints:
+    POST /v1/chat/completions — Chat completions (streaming + non-streaming).
+    POST /v1/completions      — Legacy text completions from a prompt.
+    GET  /v1/models           — Lists available models.
+    GET  /health              — Health check.
+Requires FastAPI and Uvicorn: pip install transformers[serving]
+"""

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/cli/system.py ADDED Viewed

	@@ -0,0 +1,139 @@

+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains commands to print information about the environment and version.
+Usage:
+    transformers env
+    transformers version
+"""
+import contextlib
+import io
+import os
+import platform
+from typing import Annotated
+import huggingface_hub
+import typer
+from .. import __version__
+from ..integrations.deepspeed import is_deepspeed_available
+from ..utils import (
+    is_accelerate_available,
+    is_torch_available,
+    is_torch_hpu_available,
+    is_torch_npu_available,
+    is_torch_xpu_available,
+)
+def env(
+    accelerate_config_file: Annotated[
+        str | None,
+        typer.Argument(help="The accelerate config file to use for the default values in the launching script."),
+    ] = None,
+) -> dict:
+    """Print information about the environment."""
+    # Besides printing, the collected `info` mapping is also returned to the caller (see the last line).
+    import safetensors
+    # TODO: remove hasattr guard once safetensors >= 0.8.0 is released (adds __version__)
+    safetensors_version = safetensors.__version__ if hasattr(safetensors, "__version__") else "unknown"
+    # Placeholders reported when accelerate is missing or no config file is found.
+    accelerate_version = "not installed"
+    accelerate_config = accelerate_config_str = "not found"
+    if is_accelerate_available():
+        import accelerate
+        from accelerate.commands.config import default_config_file, load_config_from_file
+        accelerate_version = accelerate.__version__
+        # Get the default from the config file.
+        if accelerate_config_file is not None or os.path.isfile(default_config_file):
+            accelerate_config = load_config_from_file(accelerate_config_file).to_dict()
+        # A loaded config is a dict and is rendered one property per line; otherwise the "not found"
+        # placeholder string is shown as-is.
+        accelerate_config_str = (
+            "\n".join([f"\t- {prop}: {val}" for prop, val in accelerate_config.items()])
+            if isinstance(accelerate_config, dict)
+            else f"\t{accelerate_config}"
+        )
+    pt_version = "not installed"
+    pt_cuda_available = "NA"
+    pt_accelerator = "NA"
+    if is_torch_available():
+        import torch
+        pt_version = torch.__version__
+        pt_cuda_available = torch.cuda.is_available()
+        pt_xpu_available = is_torch_xpu_available()
+        pt_npu_available = is_torch_npu_available()
+        pt_hpu_available = is_torch_hpu_available()
+        # The first available accelerator wins, checked in the order CUDA, XPU, NPU, HPU.
+        if pt_cuda_available:
+            pt_accelerator = "CUDA"
+        elif pt_xpu_available:
+            pt_accelerator = "XPU"
+        elif pt_npu_available:
+            pt_accelerator = "NPU"
+        elif pt_hpu_available:
+            pt_accelerator = "HPU"
+    deepspeed_version = "not installed"
+    if is_deepspeed_available():
+        # Redirect command line output to silence deepspeed import output.
+        with contextlib.redirect_stdout(io.StringIO()):
+            import deepspeed
+        deepspeed_version = deepspeed.__version__
+    info = {
+        "`transformers` version": __version__,
+        "Platform": platform.platform(),
+        "Python version": platform.python_version(),
+        "Huggingface_hub version": huggingface_hub.__version__,
+        "Safetensors version": f"{safetensors_version}",
+        "Accelerate version": f"{accelerate_version}",
+        "Accelerate config": f"{accelerate_config_str}",
+        "DeepSpeed version": f"{deepspeed_version}",
+        "PyTorch version (accelerator?)": f"{pt_version} ({pt_accelerator})",
+        "Using distributed or parallel set-up in script?": "<fill in>",
+    }
+    # The `pt_*_available` names below are only defined when torch is installed, hence the repeated guard.
+    if is_torch_available():
+        if pt_cuda_available:
+            info["Using GPU in script?"] = "<fill in>"
+            info["GPU type"] = torch.cuda.get_device_name()
+        elif pt_xpu_available:
+            info["Using XPU in script?"] = "<fill in>"
+            info["XPU type"] = torch.xpu.get_device_name()
+        elif pt_hpu_available and hasattr(torch, "hpu"):
+            info["Using HPU in script?"] = "<fill in>"
+            info["HPU type"] = torch.hpu.get_device_name()
+        elif pt_npu_available and hasattr(torch, "npu"):
+            info["Using NPU in script?"] = "<fill in>"
+            info["NPU type"] = torch.npu.get_device_name()
+            if hasattr(torch.version, "cann"):
+                info["CANN version"] = torch.version.cann
+    print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n")
+    print(_format_dict(info))
+    return info
+def version() -> None:
+    """Print CLI version."""
+    # `__version__` is the package version imported at the top of this module.
+    print(__version__)
+def _format_dict(d: dict) -> str:
+    return "\n".join([f"- {prop}: {val}" for prop, val in d.items()]) + "\n"

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/cli/transformers.py ADDED Viewed

	@@ -0,0 +1,41 @@

+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Transformers CLI."""
+from huggingface_hub import check_cli_update, typer_factory
+from transformers.cli.add_new_model_like import add_new_model_like
+from transformers.cli.chat import Chat
+from transformers.cli.download import download
+from transformers.cli.serve import Serve
+from transformers.cli.system import env, version
+# Build the CLI application and register every sub-command on it.
+app = typer_factory(help="Transformers CLI")
+# Function commands are registered without an explicit name.
+app.command()(add_new_model_like)
+# The `Chat` and `Serve` classes are registered under explicit lowercase command names.
+app.command(name="chat")(Chat)
+app.command()(download)
+app.command()(env)
+app.command(name="serve")(Serve)
+app.command()(version)
+def main():
+    """Entry point of the CLI: call `check_cli_update` for "transformers", then run the app."""
+    # NOTE(review): `check_cli_update` comes from huggingface_hub; presumably it notifies about newer releases —
+    # confirm against its documentation.
+    check_cli_update("transformers")
+    app()
+if __name__ == "__main__":
+    main()

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/distributed/__init__.py ADDED Viewed

	@@ -0,0 +1,33 @@

+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+from ..utils import _LazyModule
+# Maps each submodule name to the public names it provides; handed to `_LazyModule` below.
+_import_structure = {
+    "configuration_utils": ["DistributedConfig"],
+}
+if TYPE_CHECKING:
+    # Real import, executed only when TYPE_CHECKING is true.
+    from .configuration_utils import (
+        DistributedConfig,
+    )
+else:
+    import sys
+    # Otherwise this module's entry in `sys.modules` is replaced with a `_LazyModule` built from the mapping above.
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/distributed/configuration_utils.py ADDED Viewed

	@@ -0,0 +1,110 @@

+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+import json
+import os
+from dataclasses import dataclass
+from typing import Any
+@dataclass
+class DistributedConfig:
+    """
+    Base class for distributed configs
+    """
+    enable_expert_parallel: bool = False
+    # TODO: add tp_plan, pp_plan, device_mesh etc..
+    @classmethod
+    def from_dict(cls, config_dict, **kwargs):
+        """
+        Constructs a DistributedConfig instance from a dictionary of parameters.
+        Args:
+            config_dict (Dict[str, Any]): Dictionary containing configuration parameters.
+            **kwargs: Additional keyword arguments to override dictionary values.
+        Returns:
+            DistributedConfig: Instance of DistributedConfig constructed from the dictionary.
+        """
+        config = cls(**config_dict)
+        to_remove = []
+        for key, value in kwargs.items():
+            if hasattr(config, key):
+                setattr(config, key, value)
+                to_remove.append(key)
+        for key in to_remove:
+            kwargs.pop(key, None)
+        return config
+    # Copied from transformers.utils.quantization_config.QuantizationConfigMixin.to_json_file
+    def to_json_file(self, json_file_path: str | os.PathLike):
+        """
+        Save this instance to a JSON file.
+        Args:
+            json_file_path (`str` or `os.PathLike`):
+                Path to the JSON file in which this configuration instance's parameters will be saved.
+            use_diff (`bool`, *optional*, defaults to `True`):
+                If set to `True`, only the difference between the config instance and the default
+                `QuantizationConfig()` is serialized to JSON file.
+        """
+        with open(json_file_path, "w", encoding="utf-8") as writer:
+            config_dict = self.to_dict()
+            json_string = json.dumps(config_dict, indent=2, sort_keys=True) + "\n"
+            writer.write(json_string)
+    def to_dict(self) -> dict[str, Any]:
+        """
+        Serializes this instance to a Python dictionary. Returns:
+            `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
+        """
+        return copy.deepcopy(self.__dict__)
+    # Copied from transformers.utils.quantization_config.QuantizationConfigMixin.__iter__
+    def __iter__(self):
+        """allows `dict(obj)` for situations where obj may be a dict or QuantizationConfigMixin"""
+        yield from copy.deepcopy(self.__dict__).items()
+    # Copied from transformers.utils.quantization_config.QuantizationConfigMixin.__repr__
+    def __repr__(self):
+        return f"{self.__class__.__name__} {self.to_json_string()}"
+    def to_json_string(self):
+        """
+        Serializes this instance to a JSON formatted string.
+        Returns:
+            str: JSON formatted string representing the configuration instance.
+        """
+        return json.dumps(self.__dict__, indent=2) + "\n"
+    def update(self, **kwargs):
+        """
+        Updates attributes of this class instance with attributes from `kwargs` if they match existing attributes,
+        returning all the unused kwargs.
+        Args:
+            kwargs (`Dict[str, Any]`):
+                Dictionary of attributes to tentatively update this class.
+        Returns:
+            `Dict[str, Any]`: Dictionary containing all the key-value pairs that were not used to update the instance.
+        """
+        to_remove = []
+        for key, value in kwargs.items():
+            if hasattr(self, key):
+                setattr(self, key, value)
+                to_remove.append(key)
+        # Remove all the attributes that were updated, without modifying the input dict
+        unused_kwargs = {key: value for key, value in kwargs.items() if key not in to_remove}
+        return unused_kwargs

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/hyperparameter_search.py ADDED Viewed

	@@ -0,0 +1,123 @@

+# Copyright 2023-present the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .integrations import (
+    is_optuna_available,
+    is_ray_tune_available,
+    is_wandb_available,
+    run_hp_search_optuna,
+    run_hp_search_ray,
+    run_hp_search_wandb,
+)
+from .trainer_utils import (
+    HPSearchBackend,
+    default_hp_space_optuna,
+    default_hp_space_ray,
+    default_hp_space_wandb,
+)
+from .utils import logging
+logger = logging.get_logger(__name__)
+class HyperParamSearchBackendBase:
+    name: str
+    pip_package: str | None = None
+    @staticmethod
+    def is_available():
+        raise NotImplementedError
+    def run(self, trainer, n_trials: int, direction: str, **kwargs):
+        raise NotImplementedError
+    def default_hp_space(self, trial):
+        raise NotImplementedError
+    def ensure_available(self):
+        if not self.is_available():
+            raise RuntimeError(
+                f"You picked the {self.name} backend, but it is not installed. Run {self.pip_install()}."
+            )
+    @classmethod
+    def pip_install(cls):
+        return f"`pip install {cls.pip_package or cls.name}`"
+class OptunaBackend(HyperParamSearchBackendBase):
+    """Backend that delegates to `is_optuna_available`, `run_hp_search_optuna` and `default_hp_space_optuna`."""
+    name = "optuna"
+    @staticmethod
+    def is_available():
+        return is_optuna_available()
+    def run(self, trainer, n_trials: int, direction: str, **kwargs):
+        return run_hp_search_optuna(trainer, n_trials, direction, **kwargs)
+    def default_hp_space(self, trial):
+        return default_hp_space_optuna(trial)
+class RayTuneBackend(HyperParamSearchBackendBase):
+    """Backend that delegates to `is_ray_tune_available`, `run_hp_search_ray` and `default_hp_space_ray`."""
+    name = "ray"
+    # The install hint uses this quoted spec rather than `name`.
+    pip_package = "'ray[tune]'"
+    @staticmethod
+    def is_available():
+        return is_ray_tune_available()
+    def run(self, trainer, n_trials: int, direction: str, **kwargs):
+        return run_hp_search_ray(trainer, n_trials, direction, **kwargs)
+    def default_hp_space(self, trial):
+        return default_hp_space_ray(trial)
+class WandbBackend(HyperParamSearchBackendBase):
+    """Backend that delegates to `is_wandb_available`, `run_hp_search_wandb` and `default_hp_space_wandb`."""
+    name = "wandb"
+    @staticmethod
+    def is_available():
+        return is_wandb_available()
+    def run(self, trainer, n_trials: int, direction: str, **kwargs):
+        return run_hp_search_wandb(trainer, n_trials, direction, **kwargs)
+    def default_hp_space(self, trial):
+        return default_hp_space_wandb(trial)
+# Registry keyed by the `HPSearchBackend` member built from each backend's `name`; the values are the backend
+# classes themselves (not instances). The list order is the preference order used by `default_hp_search_backend`.
+ALL_HYPERPARAMETER_SEARCH_BACKENDS = {
+    HPSearchBackend(backend.name): backend for backend in [OptunaBackend, RayTuneBackend, WandbBackend]
+}
+def default_hp_search_backend() -> str:
+    available_backends = [backend for backend in ALL_HYPERPARAMETER_SEARCH_BACKENDS.values() if backend.is_available()]
+    if len(available_backends) > 0:
+        name = available_backends[0].name
+        if len(available_backends) > 1:
+            logger.info(
+                f"{len(available_backends)} hyperparameter search backends available. Using {name} as the default."
+            )
+        return name
+    raise RuntimeError(
+        "No hyperparameter search backend available.\n"
+        + "\n".join(
+            f" - To install {backend.name} run {backend.pip_install()}"
+            for backend in ALL_HYPERPARAMETER_SEARCH_BACKENDS.values()
+        )
+    )

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/image_transforms.py ADDED Viewed

	@@ -0,0 +1,1073 @@

+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from collections import defaultdict
+from collections.abc import Collection, Iterable
+from math import ceil
+from typing import Optional, Union
+import numpy as np
+from .image_utils import (
+    ChannelDimension,
+    ImageInput,
+    get_channel_dimension_axis,
+    get_image_size,
+    infer_channel_dimension_format,
+)
+from .utils import ExplicitEnum, TensorType, is_torch_tensor
+from .utils.import_utils import (
+    is_torch_available,
+    is_vision_available,
+    requires_backends,
+)
+if is_vision_available():
+    import PIL
+    from .image_utils import PILImageResampling
+if is_torch_available():
+    import torch
+def to_channel_dimension_format(
+    image: np.ndarray,
+    channel_dim: ChannelDimension | str,
+    input_channel_dim: ChannelDimension | str | None = None,
+) -> np.ndarray:
+    """
+    Moves the channel axis of `image` so that it matches `channel_dim`.
+    Any number of leading (e.g. batch) dimensions is allowed: only the last three axes are permuted, every axis
+    before them keeps its position.
+    Args:
+        image (`numpy.ndarray`):
+            The image whose channel axis should be moved.
+        channel_dim (`ChannelDimension` or `str`):
+            The channel dimension format wanted for the output.
+        input_channel_dim (`ChannelDimension` or `str`, *optional*):
+            The channel dimension format of `image`. Inferred with `infer_channel_dimension_format` when omitted.
+    Returns:
+        `np.ndarray`: `image` itself when it is already in the requested format, otherwise a transposed view.
+    Raises:
+        TypeError: If `image` is not a `np.ndarray`.
+        ValueError: If the requested format is neither `ChannelDimension.FIRST` nor `ChannelDimension.LAST`.
+    """
+    if not isinstance(image, np.ndarray):
+        raise TypeError(f"Input image must be of type np.ndarray, got {type(image)}")
+    if input_channel_dim is None:
+        input_channel_dim = infer_channel_dimension_format(image)
+    target_channel_dim = ChannelDimension(channel_dim)
+    # Nothing to do when the layout already matches.
+    if input_channel_dim == target_channel_dim:
+        return image
+    ndim = image.ndim
+    # Positions of the last three axes; everything before them is left untouched.
+    third_last, second_last, last = ndim - 3, ndim - 2, ndim - 1
+    if target_channel_dim == ChannelDimension.FIRST:
+        # (..., H, W, C) -> (..., C, H, W)
+        trailing_axes = [last, third_last, second_last]
+    elif target_channel_dim == ChannelDimension.LAST:
+        # (..., C, H, W) -> (..., H, W, C)
+        trailing_axes = [second_last, last, third_last]
+    else:
+        raise ValueError(f"Unsupported channel dimension format: {channel_dim}")
+    return image.transpose(list(range(third_last)) + trailing_axes)
+def rescale(
+    image: np.ndarray,
+    scale: float,
+    data_format: ChannelDimension | None = None,
+    dtype: np.dtype = np.float32,
+    input_data_format: str | ChannelDimension | None = None,
+) -> np.ndarray:
+    """
+    Multiplies every value of `image` by `scale`.
+    Args:
+        image (`np.ndarray`):
+            The image to rescale.
+        scale (`float`):
+            The multiplicative factor applied to the image.
+        data_format (`ChannelDimension`, *optional*):
+            The channel dimension format of the output. When omitted the layout of the input is kept.
+        dtype (`np.dtype`, *optional*, defaults to `np.float32`):
+            The dtype of the returned array. Kept for backwards compatibility with feature extractors.
+        input_data_format (`ChannelDimension`, *optional*):
+            The channel dimension format of the input image. Only used when `data_format` is given; inferred
+            from the image when omitted.
+    Returns:
+        `np.ndarray`: The rescaled image, cast to `dtype`.
+    Raises:
+        TypeError: If `image` is not a `np.ndarray`.
+    """
+    if not isinstance(image, np.ndarray):
+        raise TypeError(f"Input image must be of type np.ndarray, got {type(image)}")
+    # Numpy type promotion rules have changed, so do the multiplication in float64 regardless of the input dtype.
+    scaled = image.astype(np.float64) * scale
+    if data_format is not None:
+        scaled = to_channel_dimension_format(scaled, data_format, input_data_format)
+    # Only downcast to the requested dtype once all the arithmetic is done.
+    return scaled.astype(dtype)
+def _rescale_for_pil_conversion(image):
+    """
+    Decides whether `image` has to be multiplied by 255 before it can be turned into a PIL image.
+    - `uint8` input, or input whose values are all (close to) integers within [0, 255]: no rescaling.
+    - Non-integer input with all values within [0, 1]: rescaling is needed.
+    Returns:
+        `bool`: `True` if the image should be rescaled, `False` otherwise.
+    Raises:
+        ValueError: If the values are integer-like but outside [0, 255], or non-integer and outside [0, 1].
+    """
+    if image.dtype == np.uint8:
+        return False
+    # Integer-valued data (possibly stored as floats) is taken to already be on the 0-255 scale.
+    if np.allclose(image, image.astype(int)):
+        fits_uint8 = np.all(image >= 0) and np.all(image <= 255)
+        if not fits_uint8:
+            raise ValueError(
+                "The image to be converted to a PIL image contains values outside the range [0, 255], "
+                f"got [{image.min()}, {image.max()}] which cannot be converted to uint8."
+            )
+        return False
+    # Fractional data must live in the unit interval to be mapped onto 0-255.
+    if np.all(image >= 0) and np.all(image <= 1):
+        return True
+    raise ValueError(
+        "The image to be converted to a PIL image contains values outside the range [0, 1], "
+        f"got [{image.min()}, {image.max()}] which cannot be converted to uint8."
+    )
+def to_pil_image(
+    image: Union[np.ndarray, "PIL.Image.Image", "torch.Tensor"],
+    do_rescale: bool | None = None,
+    image_mode: str | None = None,
+    input_data_format: str | ChannelDimension | None = None,
+) -> "PIL.Image.Image":
+    """
+    Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if
+    needed.
+    Args:
+        image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`):
+            The image to convert to the `PIL.Image` format. A PIL image is returned as is. Torch tensors are
+            detached and copied to the CPU first, so tensors that require grad or live on an accelerator are
+            accepted as well.
+        do_rescale (`bool`, *optional*):
+            Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will default
+            to `True` if the image type is a floating type and casting to `int` would result in a loss of precision,
+            and `False` otherwise.
+        image_mode (`str`, *optional*):
+            The mode to use for the PIL image. If unset, will use the default mode for the input image type.
+        input_data_format (`ChannelDimension`, *optional*):
+            The channel dimension format of the input image. If unset, will use the inferred format from the input.
+    Returns:
+        `PIL.Image.Image`: The converted image.
+    Raises:
+        ValueError: If `image` is neither a PIL image, a torch tensor nor a numpy array, or if `do_rescale` is unset
+            and the value range of the image does not allow a conversion to `uint8`.
+    """
+    requires_backends(to_pil_image, ["vision"])
+    if isinstance(image, PIL.Image.Image):
+        return image
+    # Convert all tensors to numpy arrays before converting to PIL image
+    if is_torch_tensor(image):
+        # `Tensor.numpy()` raises for tensors that require grad or that are not on the CPU, so detach and move
+        # the data to host memory first. Both calls are no-ops for a plain CPU tensor.
+        image = image.detach().cpu().numpy()
+    elif not isinstance(image, np.ndarray):
+        raise ValueError(f"Input image type not supported: {type(image)}")
+    # If the channel has been moved to first dim, we put it back at the end.
+    image = to_channel_dimension_format(image, ChannelDimension.LAST, input_data_format)
+    # If there is a single channel, we squeeze it, as otherwise PIL can't handle it.
+    image = np.squeeze(image, axis=-1) if image.shape[-1] == 1 else image
+    # PIL.Image can only store uint8 values so we rescale the image to be between 0 and 255 if needed.
+    do_rescale = _rescale_for_pil_conversion(image) if do_rescale is None else do_rescale
+    if do_rescale:
+        image = rescale(image, 255)
+    image = image.astype(np.uint8)
+    return PIL.Image.fromarray(image, mode=image_mode)
+def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]:
+    """
+    Computes the aspect-ratio preserving output size whose shorter edge is `size`, optionally capping the longer
+    edge at `max_size`.
+    Args:
+        image_size (`tuple[int, int]`):
+            The input image size as `(height, width)`.
+        size (`int`):
+            The desired length of the shorter edge.
+        max_size (`int`, *optional*):
+            The maximum allowed length of the longer edge. When the scaled longer edge would exceed it, the
+            shorter edge target is reduced accordingly.
+    Returns:
+        `tuple[int, int]`: The output size as `(height, width)`.
+    """
+    height, width = image_size
+    # Un-rounded shorter-edge target; only set when `max_size` forces a smaller target than `size`.
+    raw_size = None
+    if max_size is not None:
+        shorter_edge = float(min((height, width)))
+        longer_edge = float(max((height, width)))
+        if longer_edge / shorter_edge * size > max_size:
+            raw_size = max_size * shorter_edge / longer_edge
+            size = int(round(raw_size))
+    # The shorter edge already has the target length: keep the image size unchanged.
+    shorter_edge_matches = (height <= width and height == size) or (width <= height and width == size)
+    if shorter_edge_matches:
+        return (height, width)
+    # Scale the longer edge from the un-rounded target when there is one, to avoid compounding rounding errors.
+    scale_base = size if raw_size is None else raw_size
+    if width < height:
+        return (int(scale_base * height / width), size)
+    return (size, int(scale_base * width / height))
+# Logic adapted from torchvision resizing logic: https://github.com/pytorch/vision/blob/511924c1ced4ce0461197e5caa64ce5b9e558aab/torchvision/transforms/functional.py#L366
+def get_resize_output_image_size(
+    input_image: np.ndarray,
+    size: int | tuple[int, int] | list[int] | tuple[int, ...],
+    default_to_square: bool = True,
+    max_size: int | None = None,
+    input_data_format: str | ChannelDimension | None = None,
+) -> tuple:
+    """
+    Computes the target `(height, width)` of an image after resizing, given the image and the requested size.
+    Args:
+        input_image (`np.ndarray`):
+            The image to resize. Only inspected when the shorter edge has to be matched.
+        size (`int` or `tuple[int, int]` or `list[int]` or `tuple[int]`):
+            The requested size. A two-element sequence `(h, w)` is returned unchanged (as a tuple). A one-element
+            sequence is treated like the int it contains. For an int, the result is `(size, size)` when
+            `default_to_square` is `True`; otherwise the shorter edge of the image is matched to `size` and the
+            longer edge is scaled to keep the aspect ratio, e.g. if height > width the result is
+            `(size * height / width, size)`.
+        default_to_square (`bool`, *optional*, defaults to `True`):
+            How to interpret a single int `size`. If `True`, it is expanded to the square `(size, size)`. If
+            `False`, the behaviour replicates
+            [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize)
+            with support for resizing only the smallest edge and providing an optional `max_size`.
+        max_size (`int`, *optional*):
+            Upper bound for the longer edge of the resized image. If the longer edge would exceed `max_size`
+            after matching the shorter edge to `size`, the image is scaled down further so that the longer edge
+            equals `max_size`; the shorter edge may then end up shorter than `size`. Only used if
+            `default_to_square` is `False`.
+        input_data_format (`ChannelDimension`, *optional*):
+            The channel dimension format of the input image. If unset, will use the inferred format from the input.
+    Returns:
+        `tuple`: The target (height, width) dimension of the output image after resizing.
+    Raises:
+        ValueError: If `size` is a sequence with a length other than 1 or 2, or if `max_size` is not strictly
+            greater than the requested shorter edge.
+    """
+    if isinstance(size, (tuple, list)):
+        num_elements = len(size)
+        if num_elements == 2:
+            return tuple(size)
+        if num_elements != 1:
+            raise ValueError("size must have 1 or 2 elements if it is a list or tuple")
+        # A single-element sequence is handled exactly like a bare int.
+        size = size[0]
+    if default_to_square:
+        return (size, size)
+    height, width = get_image_size(input_image, input_data_format)
+    # `width_is_short` also covers square images, in which case both edges are interchangeable.
+    width_is_short = width <= height
+    short, long = (width, height) if width_is_short else (height, width)
+    new_short = size
+    new_long = int(size * long / short)
+    if max_size is not None:
+        if max_size <= size:
+            raise ValueError(
+                f"max_size = {max_size} must be strictly greater than the requested "
+                f"size for the smaller edge size = {size}"
+            )
+        if new_long > max_size:
+            # Shrink both edges so that the longer one lands exactly on `max_size`.
+            new_short, new_long = int(max_size * new_short / new_long), max_size
+    # Map (short, long) back onto (height, width).
+    return (new_long, new_short) if width_is_short else (new_short, new_long)
+def resize(
+    image: np.ndarray,
+    size: tuple[int, int],
+    resample: Optional["PILImageResampling"] = None,
+    reducing_gap: int | None = None,
+    data_format: ChannelDimension | None = None,
+    return_numpy: bool = True,
+    input_data_format: str | ChannelDimension | None = None,
+) -> np.ndarray:
+    """
+    Resizes `image` to the `(height, width)` given by `size`, using the PIL library.
+    Args:
+        image (`np.ndarray`):
+            The image to resize.
+        size (`tuple[int, int]`):
+            The target size as `(height, width)`.
+        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+            The filter to use for resampling.
+        reducing_gap (`int`, *optional*):
+            Apply optimization by resizing the image in two steps. The bigger `reducing_gap`, the closer the result to
+            the fair resampling. See corresponding Pillow documentation for more details.
+        data_format (`ChannelDimension`, *optional*):
+            The channel dimension format of the output image. If unset, the format of the input is used.
+        return_numpy (`bool`, *optional*, defaults to `True`):
+            Whether to return the resized image as a numpy array. If `False`, a `PIL.Image.Image` is returned.
+        input_data_format (`ChannelDimension`, *optional*):
+            The channel dimension format of the input image. If unset, will use the inferred format from the input.
+    Returns:
+        `np.ndarray`: The resized image (or a `PIL.Image.Image` when `return_numpy` is `False`).
+    Raises:
+        ValueError: If `size` does not have exactly 2 elements.
+    """
+    requires_backends(resize, ["vision"])
+    if resample is None:
+        resample = PILImageResampling.BILINEAR
+    if len(size) != 2:
+        raise ValueError("size must have 2 elements")
+    # Unless told otherwise, the output keeps the data format of the input. PIL always hands back channels-last
+    # data, so the input format has to be known before the conversion.
+    if input_data_format is None:
+        input_data_format = infer_channel_dimension_format(image)
+    if data_format is None:
+        data_format = input_data_format
+    # The resize itself goes through pillow to stay backwards compatible with the previous feature extractors.
+    scaled_for_pil = False
+    if not isinstance(image, PIL.Image.Image):
+        scaled_for_pil = _rescale_for_pil_conversion(image)
+        image = to_pil_image(image, do_rescale=scaled_for_pil, input_data_format=input_data_format)
+    target_height, target_width = size
+    # PIL expects sizes as (width, height).
+    resized_image = image.resize((target_width, target_height), resample=resample, reducing_gap=reducing_gap)
+    if not return_numpy:
+        return resized_image
+    resized_image = np.array(resized_image)
+    # A single-channel axis is dropped on the way into PIL, so put it back for 2D results.
+    if resized_image.ndim == 2:
+        resized_image = np.expand_dims(resized_image, axis=-1)
+    # Coming out of PIL the array is always channels-last.
+    resized_image = to_channel_dimension_format(resized_image, data_format, input_channel_dim=ChannelDimension.LAST)
+    # Undo the [0, 1] -> [0, 255] scaling that was applied for the PIL conversion, if any.
+    if scaled_for_pil:
+        resized_image = rescale(resized_image, 1 / 255)
+    return resized_image
+def normalize(
+    image: np.ndarray,
+    mean: float | Collection[float],
+    std: float | Collection[float],
+    data_format: ChannelDimension | None = None,
+    input_data_format: str | ChannelDimension | None = None,
+) -> np.ndarray:
+    """
+    Normalizes `image` channel-wise: `image = (image - mean) / std`.
+    Args:
+        image (`np.ndarray`):
+            The image to normalize.
+        mean (`float` or `Collection[float]`):
+            The mean to subtract. A scalar is used for every channel; a collection must hold one value per channel.
+        std (`float` or `Collection[float]`):
+            The standard deviation to divide by. Same conventions as `mean`.
+        data_format (`ChannelDimension`, *optional*):
+            The channel dimension format of the output image. If unset, the format of the input is kept.
+        input_data_format (`ChannelDimension`, *optional*):
+            The channel dimension format of the input image. If unset, will use the inferred format from the input.
+    Returns:
+        `np.ndarray`: The normalized image. Non-floating inputs are returned as `float32`; floating inputs keep
+        their dtype.
+    Raises:
+        TypeError: If `image` is not a numpy array.
+        ValueError: If `mean` or `std` is a collection whose length differs from the number of channels.
+    """
+    if not isinstance(image, np.ndarray):
+        raise TypeError("image must be a numpy array")
+    if input_data_format is None:
+        input_data_format = infer_channel_dimension_format(image)
+    channel_axis = get_channel_dimension_axis(image, input_data_format=input_data_format)
+    num_channels = image.shape[channel_axis]
+    # Integer images (e.g. uint8) are cast to float32 so that the subtraction cannot wrap around.
+    # Floating images keep their dtype, which avoids silently upcasting float16.
+    if not np.issubdtype(image.dtype, np.floating):
+        image = image.astype(np.float32)
+    # Broadcast scalars to one value per channel; validate the length of anything that is already a collection.
+    if not isinstance(mean, Collection):
+        mean = [mean] * num_channels
+    elif len(mean) != num_channels:
+        raise ValueError(f"mean must have {num_channels} elements if it is an iterable, got {len(mean)}")
+    mean = np.array(mean, dtype=image.dtype)
+    if not isinstance(std, Collection):
+        std = [std] * num_channels
+    elif len(std) != num_channels:
+        raise ValueError(f"std must have {num_channels} elements if it is an iterable, got {len(std)}")
+    std = np.array(std, dtype=image.dtype)
+    if input_data_format == ChannelDimension.LAST:
+        normalized = (image - mean) / std
+    else:
+        # Transposing puts the channel axis last so that the per-channel statistics broadcast, then restore.
+        normalized = ((image.T - mean) / std).T
+    if data_format is None:
+        return normalized
+    return to_channel_dimension_format(normalized, data_format, input_data_format)
+def center_crop(
+    image: np.ndarray,
+    size: tuple[int, int],
+    data_format: str | ChannelDimension | None = None,
+    input_data_format: str | ChannelDimension | None = None,
+) -> np.ndarray:
+    """
+    Crops the `image` to the specified `size` using a center crop. Note that if the image is too small to be cropped to
+    the size given, it will be padded (so the returned result will always be of size `size`).
+    Args:
+        image (`np.ndarray`):
+            The image to crop.
+        size (`tuple[int, int]`):
+            The target size for the cropped image, as `(height, width)`.
+        data_format (`str` or `ChannelDimension`, *optional*):
+            The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            If unset, will use the inferred format of the input image.
+        input_data_format (`str` or `ChannelDimension`, *optional*):
+            The channel dimension format for the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            If unset, will use the inferred format of the input image.
+    Returns:
+        `np.ndarray`: The cropped image.
+    Raises:
+        TypeError: If `image` is not a `np.ndarray`.
+        ValueError: If `size` is not an iterable of exactly 2 elements.
+    """
+    requires_backends(center_crop, ["vision"])
+    if not isinstance(image, np.ndarray):
+        raise TypeError(f"Input image must be of type np.ndarray, got {type(image)}")
+    if not isinstance(size, Iterable) or len(size) != 2:
+        raise ValueError("size must have 2 elements representing the height and width of the output image")
+    if input_data_format is None:
+        input_data_format = infer_channel_dimension_format(image)
+    output_data_format = data_format if data_format is not None else input_data_format
+    # We perform the crop in (C, H, W) format and then convert to the output format
+    image = to_channel_dimension_format(image, ChannelDimension.FIRST, input_data_format)
+    orig_height, orig_width = get_image_size(image, ChannelDimension.FIRST)
+    crop_height, crop_width = size
+    crop_height, crop_width = int(crop_height), int(crop_width)
+    # In case size is odd, (image_shape[0] + size[0]) // 2 won't give the proper result.
+    # `top` is negative when the crop is taller than the image; that case is handled by the padding path below.
+    top = (orig_height - crop_height) // 2
+    bottom = top + crop_height
+    # In case size is odd, (image_shape[1] + size[1]) // 2 won't give the proper result.
+    # `left` is negative when the crop is wider than the image; that case is handled by the padding path below.
+    left = (orig_width - crop_width) // 2
+    right = left + crop_width
+    # Check if cropped area is within image boundaries
+    if top >= 0 and bottom <= orig_height and left >= 0 and right <= orig_width:
+        # Fast path: a plain slice of the two trailing (height, width) axes.
+        image = image[..., top:bottom, left:right]
+        image = to_channel_dimension_format(image, output_data_format, ChannelDimension.FIRST)
+        return image
+    # Otherwise, we may need to pad if the image is too small. Oh joy...
+    # The canvas is at least as large as both the image and the requested crop along each axis.
+    new_height = max(crop_height, orig_height)
+    new_width = max(crop_width, orig_width)
+    new_shape = image.shape[:-2] + (new_height, new_width)
+    # `zeros_like` with an explicit shape keeps the dtype of `image`.
+    new_image = np.zeros_like(image, shape=new_shape)
+    # If the image is too small, pad it with zeros
+    # The original image is pasted into the middle of the zero canvas.
+    top_pad = ceil((new_height - orig_height) / 2)
+    bottom_pad = top_pad + orig_height
+    left_pad = ceil((new_width - orig_width) / 2)
+    right_pad = left_pad + orig_width
+    new_image[..., top_pad:bottom_pad, left_pad:right_pad] = image
+    # Shift the crop window from image coordinates into canvas coordinates.
+    top += top_pad
+    bottom += top_pad
+    left += left_pad
+    right += left_pad
+    # Clamp the window to the canvas before slicing.
+    new_image = new_image[..., max(0, top) : min(new_height, bottom), max(0, left) : min(new_width, right)]
+    new_image = to_channel_dimension_format(new_image, output_data_format, ChannelDimension.FIRST)
+    return new_image
+def _center_to_corners_format_torch(bboxes_center: "torch.Tensor") -> "torch.Tensor":
+    """
+    Torch implementation of the center -> corners conversion. The last dimension of `bboxes_center` holds
+    `(center_x, center_y, width, height)`; the last dimension of the result holds
+    `(top_left_x, top_left_y, bottom_right_x, bottom_right_y)`.
+    """
+    center_x, center_y, width, height = bboxes_center.unbind(-1)
+    half_width = 0.5 * width
+    half_height = 0.5 * height
+    corners = [
+        center_x - half_width,  # top left x
+        center_y - half_height,  # top left y
+        center_x + half_width,  # bottom right x
+        center_y + half_height,  # bottom right y
+    ]
+    return torch.stack(corners, dim=-1)
+def _center_to_corners_format_numpy(bboxes_center: np.ndarray) -> np.ndarray:
+    """
+    Numpy implementation of the center -> corners conversion.
+    Args:
+        bboxes_center (`np.ndarray`):
+            Boxes whose last dimension holds `(center_x, center_y, width, height)`. Any number of leading
+            dimensions is supported.
+    Returns:
+        `np.ndarray`: Array of the same shape whose last dimension holds
+        `(top_left_x, top_left_y, bottom_right_x, bottom_right_y)`.
+    """
+    # Move only the coordinate axis to the front. A plain `.T` reverses *all* axes, which permutes the leading
+    # dimensions of inputs with more than 2 dims and disagrees with the torch implementation (`unbind(-1)`).
+    center_x, center_y, width, height = np.moveaxis(bboxes_center, -1, 0)
+    bboxes_corners = np.stack(
+        # top left x, top left y, bottom right x, bottom right y
+        [center_x - 0.5 * width, center_y - 0.5 * height, center_x + 0.5 * width, center_y + 0.5 * height],
+        axis=-1,
+    )
+    return bboxes_corners
+# 2 functions below inspired by https://github.com/facebookresearch/detr/blob/master/util/box_ops.py
+def center_to_corners_format(bboxes_center: TensorType) -> TensorType:
+    """
+    Converts bounding boxes from center format to corners format.
+    center format: the center of the box followed by its dimensions
+        (center_x, center_y, width, height)
+    corners format: the top-left corner followed by the bottom-right corner of the box
+        (top_left_x, top_left_y, bottom_right_x, bottom_right_y)
+    Raises:
+        ValueError: If `bboxes_center` is neither a torch tensor nor a numpy array.
+    """
+    # This runs inside model forward passes, so torch inputs stay in torch instead of going through numpy.
+    if is_torch_tensor(bboxes_center):
+        return _center_to_corners_format_torch(bboxes_center)
+    if isinstance(bboxes_center, np.ndarray):
+        return _center_to_corners_format_numpy(bboxes_center)
+    raise ValueError(f"Unsupported input type {type(bboxes_center)}")
+def _corners_to_center_format_torch(bboxes_corners: "torch.Tensor") -> "torch.Tensor":
+    """
+    Torch implementation of the corners -> center conversion. The last dimension of `bboxes_corners` holds
+    `(top_left_x, top_left_y, bottom_right_x, bottom_right_y)`; the last dimension of the result holds
+    `(center_x, center_y, width, height)`.
+    """
+    left, top, right, bottom = bboxes_corners.unbind(-1)
+    center_x = (left + right) / 2
+    center_y = (top + bottom) / 2
+    box_width = right - left
+    box_height = bottom - top
+    return torch.stack([center_x, center_y, box_width, box_height], dim=-1)
+def _corners_to_center_format_numpy(bboxes_corners: np.ndarray) -> np.ndarray:
+    """
+    Numpy implementation of the corners -> center conversion.
+    Args:
+        bboxes_corners (`np.ndarray`):
+            Boxes whose last dimension holds `(top_left_x, top_left_y, bottom_right_x, bottom_right_y)`. Any
+            number of leading dimensions is supported.
+    Returns:
+        `np.ndarray`: Array of the same shape whose last dimension holds `(center_x, center_y, width, height)`.
+    """
+    # Move only the coordinate axis to the front. A plain `.T` reverses *all* axes, which permutes the leading
+    # dimensions of inputs with more than 2 dims and disagrees with the torch implementation (`unbind(-1)`).
+    top_left_x, top_left_y, bottom_right_x, bottom_right_y = np.moveaxis(bboxes_corners, -1, 0)
+    bboxes_center = np.stack(
+        [
+            (top_left_x + bottom_right_x) / 2,  # center x
+            (top_left_y + bottom_right_y) / 2,  # center y
+            (bottom_right_x - top_left_x),  # width
+            (bottom_right_y - top_left_y),  # height
+        ],
+        axis=-1,
+    )
+    return bboxes_center
+def corners_to_center_format(bboxes_corners: TensorType) -> TensorType:
+    """
+    Converts bounding boxes from corners format to center format.
+    corners format: the top-left corner followed by the bottom-right corner of the box
+        (top_left_x, top_left_y, bottom_right_x, bottom_right_y)
+    center format: the center of the box followed by its width and height
+        (center_x, center_y, width, height)
+    Raises:
+        ValueError: If `bboxes_corners` is neither a torch tensor nor a numpy array.
+    """
+    # The inverse conversion accepts the same input types, so it dispatches the same way.
+    if is_torch_tensor(bboxes_corners):
+        return _corners_to_center_format_torch(bboxes_corners)
+    if isinstance(bboxes_corners, np.ndarray):
+        return _corners_to_center_format_numpy(bboxes_corners)
+    raise ValueError(f"Unsupported input type {type(bboxes_corners)}")
+def safe_squeeze(
+    tensor: Union[np.ndarray, "torch.Tensor"], axis: int | None = None
+) -> Union[np.ndarray, "torch.Tensor"]:
+    """
+    Squeezes `tensor` without failing when the requested axis cannot be squeezed.
+    With `axis=None` every squeezable dimension is removed. Otherwise only `axis` is squeezed, and if that raises
+    a `ValueError` the tensor is returned unchanged.
+    """
+    if axis is not None:
+        try:
+            return tensor.squeeze(axis=axis)
+        except ValueError:
+            # The axis is not of size 1: leave the input as it is.
+            return tensor
+    return tensor.squeeze()
+# 2 functions below copied from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py
+# Copyright (c) 2018, Alexander Kirillov
+# All rights reserved.
+def rgb_to_id(color):
+    """
+    Converts an RGB color to a unique ID, computed as `R + 256 * G + 256 * 256 * B`.
+    A 3-dimensional numpy array is converted element-wise over its last axis and yields a 2-dimensional array of
+    IDs. Any other indexable input is treated as a single color and yields a Python `int`.
+    """
+    is_color_map = isinstance(color, np.ndarray) and len(color.shape) == 3
+    if not is_color_map:
+        return int(color[0] + 256 * color[1] + 256 * 256 * color[2])
+    if color.dtype == np.uint8:
+        # uint8 cannot hold the weighted sum, so widen before doing the arithmetic.
+        color = color.astype(np.int32)
+    red, green, blue = color[:, :, 0], color[:, :, 1], color[:, :, 2]
+    return red + 256 * green + 256 * 256 * blue
+def id_to_rgb(id_map):
+    """
+    Converts a unique ID back to an RGB color by splitting it into three base-256 digits (least significant first).
+    A numpy array of IDs yields a `uint8` array with an extra trailing axis of size 3; the input array is not
+    modified. Any other input is treated as a single ID and yields a list `[R, G, B]`.
+    """
+    if isinstance(id_map, np.ndarray):
+        # Work on a copy: the digits are peeled off with in-place floor divisions.
+        remaining = id_map.copy()
+        rgb_map = np.zeros((*id_map.shape, 3), dtype=np.uint8)
+        for channel in range(3):
+            rgb_map[..., channel] = remaining % 256
+            remaining //= 256
+        return rgb_map
+    channels = []
+    for _ in range(3):
+        id_map, channel_value = divmod(id_map, 256)
+        channels.append(channel_value)
+    return channels
+class PaddingMode(ExplicitEnum):
+    """
+    Enum class for the different padding modes to use when padding images.
+    """
+    # `pad` forwards this as `np.pad(..., mode="constant", constant_values=...)`.
+    CONSTANT = "constant"
+    # `pad` forwards this as `np.pad(..., mode="reflect")`.
+    REFLECT = "reflect"
+    # `pad` forwards this as `np.pad(..., mode="edge")` -- note the different numpy name.
+    REPLICATE = "replicate"
+    # `pad` forwards this as `np.pad(..., mode="symmetric")`.
+    SYMMETRIC = "symmetric"
+def pad(
+    image: np.ndarray,
+    padding: int | tuple[int, int] | Iterable[tuple[int, int]],
+    mode: PaddingMode = PaddingMode.CONSTANT,
+    constant_values: float | Iterable[float] = 0.0,
+    data_format: str | ChannelDimension | None = None,
+    input_data_format: str | ChannelDimension | None = None,
+) -> np.ndarray:
+    """
+    Pads the height and width axes of `image` according to `padding` and `mode`. The channel axis (and the batch
+    axis of a 4-dimensional input) is never padded.
+    Args:
+        image (`np.ndarray`):
+            The image to pad.
+        padding (`int` or `tuple[int, int]` or `Iterable[tuple[int, int]]`):
+            Padding to apply to the edges of the height, width axes. Accepted formats:
+            - `((before_height, after_height), (before_width, after_width))`: unique pad widths for each axis.
+            - `(before, after)`: the same before and after pad for height and width.
+            - `(pad,)` or int: shortcut for before = after = pad width for both axes.
+        mode (`PaddingMode`):
+            The padding mode to use. Can be one of:
+                - `"constant"`: pads with a constant value.
+                - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the
+                  vector along each axis.
+                - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis.
+                - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array.
+        constant_values (`float` or `Iterable[float]`, *optional*):
+            The value to use for the padding if `mode` is `"constant"`. Accepts the same formats as `padding`.
+        data_format (`str` or `ChannelDimension`, *optional*):
+            The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            If unset, will use same as the input image.
+        input_data_format (`str` or `ChannelDimension`, *optional*):
+            The channel dimension format for the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            If unset, will use the inferred format of the input image.
+    Returns:
+        `np.ndarray`: The padded image.
+    Raises:
+        ValueError: If `padding` (or `constant_values` in constant mode) has an unsupported format, or if `mode`
+            is not a known padding mode.
+    """
+    if input_data_format is None:
+        input_data_format = infer_channel_dimension_format(image)
+    def _as_np_pad_spec(values):
+        """
+        Expand `values` into the per-axis `((before, after), ...)` structure that `np.pad` expects for `image`.
+        """
+        is_tuple = isinstance(values, tuple)
+        if isinstance(values, (int, float)):
+            spatial = ((values, values), (values, values))
+        elif is_tuple and len(values) == 1:
+            spatial = ((values[0], values[0]), (values[0], values[0]))
+        elif is_tuple and len(values) == 2 and isinstance(values[0], int):
+            spatial = (values, values)
+        elif is_tuple and len(values) == 2 and isinstance(values[0], tuple):
+            spatial = values
+        else:
+            raise ValueError(f"Unsupported format: {values}")
+        # The channel axis gets a (0, 0) entry, placed before or after the spatial axes depending on the layout.
+        if input_data_format == ChannelDimension.FIRST:
+            spec = ((0, 0), *spatial)
+        else:
+            spec = (*spatial, (0, 0))
+        # A 4-dimensional input carries a leading batch axis, which is left unpadded too.
+        if image.ndim == 4:
+            spec = ((0, 0), *spec)
+        return spec
+    padding = _as_np_pad_spec(padding)
+    if mode == PaddingMode.CONSTANT:
+        fill_values = _as_np_pad_spec(constant_values)
+        padded = np.pad(image, padding, mode="constant", constant_values=fill_values)
+    elif mode == PaddingMode.REFLECT:
+        padded = np.pad(image, padding, mode="reflect")
+    elif mode == PaddingMode.REPLICATE:
+        # numpy calls this mode "edge".
+        padded = np.pad(image, padding, mode="edge")
+    elif mode == PaddingMode.SYMMETRIC:
+        padded = np.pad(image, padding, mode="symmetric")
+    else:
+        raise ValueError(f"Invalid padding mode: {mode}")
+    if data_format is None:
+        return padded
+    return to_channel_dimension_format(padded, data_format, input_data_format)
+# TODO (Amy): Accept 1/3/4 channel numpy array as input and return np.array as default
def convert_to_rgb(image: ImageInput) -> ImageInput:
    """
    Return an RGB version of `image` when it is a PIL image; every other input is handed back untouched.

    Args:
        image (Image):
            The image to convert.

    Returns:
        The input object itself when it is not a `PIL.Image.Image` or is already in `"RGB"` mode, otherwise the
        result of `image.convert("RGB")`.
    """
    requires_backends(convert_to_rgb, ["vision"])

    # Only PIL images in a non-RGB mode need any work.
    needs_conversion = isinstance(image, PIL.Image.Image) and image.mode != "RGB"
    return image.convert("RGB") if needs_conversion else image
def flip_channel_order(
    image: np.ndarray,
    data_format: ChannelDimension | None = None,
    input_data_format: str | ChannelDimension | None = None,
) -> np.ndarray:
    """
    Flips the channel order of the image.
    If the image is in RGB format, it will be converted to BGR and vice versa.

    Args:
        image (`np.ndarray`):
            The image to flip. A leading batch dimension is supported for both channel layouts.
        data_format (`ChannelDimension`, *optional*):
            The channel dimension format for the output image. Can be one of:
                - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            If unset, will use same as the input image.
        input_data_format (`ChannelDimension`, *optional*):
            The channel dimension format for the input image. Can be one of:
                - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            If unset, will use the inferred format of the input image.

    Returns:
        `np.ndarray`: The image with its channel axis reversed, converted to `data_format` when one is given.

    Raises:
        ValueError: If `input_data_format` is neither `ChannelDimension.FIRST` nor `ChannelDimension.LAST`.
    """
    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(image)

    if input_data_format == ChannelDimension.LAST:
        image = image[..., ::-1]
    elif input_data_format == ChannelDimension.FIRST:
        # Address the channel axis from the end (third-to-last) instead of axis 0, so that a leading batch
        # dimension is left alone. For an unbatched (num_channels, height, width) image this is the same axis.
        image = image[..., ::-1, :, :]
    else:
        raise ValueError(f"Unsupported channel dimension: {input_data_format}")

    if data_format is not None:
        image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
    return image
def split_to_tiles(images: "torch.Tensor", num_tiles_height: int, num_tiles_width: int) -> "torch.Tensor":
    """
    Cut every image of a batch into a `num_tiles_height` x `num_tiles_width` grid of equally sized tiles.

    Args:
        images (`torch.Tensor`):
            Batch of images of shape (batch_size, num_channels, height, width).
        num_tiles_height (`int`):
            Number of tiles along the height axis.
        num_tiles_width (`int`):
            Number of tiles along the width axis.

    Returns:
        `torch.Tensor`: Tensor of shape
        (batch_size, num_tiles_height * num_tiles_width, num_channels, height // num_tiles_height,
        width // num_tiles_width). Tiles are ordered row by row (tile row index varies slowest).
    """
    batch_size, num_channels, height, width = images.size()
    tile_height = height // num_tiles_height
    tile_width = width // num_tiles_width

    # Expose the grid position and the in-tile offset as separate axes.
    grid = images.view(batch_size, num_channels, num_tiles_height, tile_height, num_tiles_width, tile_width)
    # Move both grid axes in front of the channel axis, then merge them into a single tile axis.
    tiles_first = grid.permute(0, 2, 4, 1, 3, 5).contiguous()
    return tiles_first.view(
        batch_size,
        num_tiles_width * num_tiles_height,
        num_channels,
        tile_height,
        tile_width,
    )
def divide_to_patches(
    image: Union[np.ndarray, "torch.Tensor"], patch_size: int | tuple[int, int]
) -> list[Union[np.ndarray, "torch.Tensor"]]:
    """
    Slice an image into a row-by-row list of patches.

    Args:
        image (`np.array | "torch.Tensor"`):
            The input image. Its size is read with `channel_dim=ChannelDimension.FIRST`, and the patches are cut
            along the last two axes.
        patch_size (`int` or `tuple[int, int]`):
            The size of each patch. If an int, patches are square. If a tuple,
            it is interpreted as `(patch_height, patch_width)`.

    Returns:
        list: A list of `np.array | "torch.Tensor"` representing the patches. Patches that touch the bottom or right
        border are smaller when the image size is not a multiple of the patch size.
    """
    if isinstance(patch_size, int):
        patch_h = patch_w = patch_size
    else:
        patch_h, patch_w = patch_size

    height, width = get_image_size(image, channel_dim=ChannelDimension.FIRST)
    return [
        image[..., top : top + patch_h, left : left + patch_w]
        for top in range(0, height, patch_h)
        for left in range(0, width, patch_w)
    ]
+def _group_images_by_shape(nested_images, *paired_inputs, is_nested: bool = False):
+    """
+    Helper function to flatten a single level of nested image and batch structures and group by shape.
+    Args:
+        nested_images (list):
+            A list of images or a single tensor
+        paired_inputs (Any, *optional*):
+            Zero or more lists that mirror the structure of `nested_images` (flat list, or list of lists when
+            `is_nested=True`). Each element is paired 1:1 with the corresponding image so it can be grouped by the
+            same shape key. These paired values are grouped alongside `nested_images` but are not stacked in the output, so
+            they do not need to be tensors.
+        is_nested (bool, *optional*, defaults to False):
+            Whether the images are nested.
+    Returns:
+        tuple[dict, ...]:
+            - A dictionary with shape as key and list of images with that shape as value
+            - A dictionary with shape as key and list of paired values with that shape as value
+            - A dictionary mapping original indices to (shape, index) tuples
+            - A dictionary mapping original indices to (shape, index) tuples for each paired input
+    """
+    grouped_images = defaultdict(list)
+    grouped_images_index = {}
+    paired_grouped_values = [defaultdict(list) for _ in paired_inputs]
+    # Normalize inputs to consistent nested structure
+    normalized_images = [nested_images] if not is_nested else nested_images
+    normalized_paired = []
+    for paired_input in paired_inputs:
+        normalized_paired.append([paired_input] if not is_nested else paired_input)
+    # Process each image and group by shape
+    for i, (sublist, *paired_sublists) in enumerate(zip(normalized_images, *normalized_paired)):
+        for j, (image, *paired_values) in enumerate(zip(sublist, *paired_sublists)):
+            key = (i, j) if is_nested else j
+            shape = image.shape[1:]
+            # Add to grouped structures
+            grouped_images[shape].append(image)
+            for paired_index, paired_value in enumerate(paired_values):
+                paired_grouped_values[paired_index][shape].append(paired_value)
+            grouped_images_index[key] = (shape, len(grouped_images[shape]) - 1)
+    # Store structure size for nested inputs to handle empty sublists during reconstruction
+    if is_nested:
+        grouped_images_index["_num_sublists"] = len(normalized_images)
+    return grouped_images, *paired_grouped_values, grouped_images_index
+def _reconstruct_nested_structure(indices, processed_images):
+    """Helper function to reconstruct a single level nested structure."""
+    # Get the number of sublists (handles empty sublists like in [[], [image]])
+    num_sublists = indices.pop("_num_sublists", None)
+    # Group indices by outer index
+    nested_indices = defaultdict(list)
+    for i, j in indices:
+        nested_indices[i].append(j)
+    # Determine the number of outer sublists
+    if num_sublists is not None:
+        max_outer_idx = num_sublists - 1
+    elif nested_indices:
+        max_outer_idx = max(nested_indices.keys())
+    else:
+        return []
+    # Create the result structure
+    result = []
+    for i in range(max_outer_idx + 1):
+        if i not in nested_indices:
+            result.append([])
+        else:
+            inner_max_idx = max(nested_indices[i])
+            inner_list = [None] * (inner_max_idx + 1)
+            for j in nested_indices[i]:
+                shape, idx = indices[(i, j)]
+                inner_list[j] = processed_images[shape][idx]
+            result.append(inner_list)
+    return result
+def _iterate_items(items, is_nested: bool):
+    """
+    Helper function to iterate over items yielding (key, item) pairs.
+    For nested structures, yields ((row_index, col_index), item).
+    For flat structures, yields (index, item).
+    """
+    if is_nested:
+        for i, row in enumerate(items):
+            for j, item in enumerate(row):
+                yield (i, j), item
+    else:
+        for i, item in enumerate(items):
+            yield i, item
+def _get_device_from_images(images, is_nested: bool) -> "torch.device":
+    """
+    Get the device from the first non-empty element in a (potentially nested) list of images.
+    Handles cases like `images = [[], [image]]` where the first sublist may be empty.
+    """
+    if is_nested:
+        for row in images:
+            if isinstance(row, torch.Tensor):
+                return row.device
+            if isinstance(row, list) and len(row) > 0:
+                return row[0].device
+    return images[0].device
def group_images_by_shape(
    images: Union[list["torch.Tensor"], "torch.Tensor"],
    *paired_inputs,
    disable_grouping: bool | None,
    is_nested: bool = False,
) -> tuple[dict, ...]:
    """
    Groups images by shape.
    Returns a dictionary with the shape as key and a list of images with that shape as value,
    and a dictionary with the index of the image in the original list as key and the shape and index in the grouped list as value.

    The function supports both flat lists of tensors and nested structures.
    The input must be either all flat or all nested, not a mix of both.

    Args:
        images (Union[list["torch.Tensor"], "torch.Tensor"]):
            A list of images or a single tensor
        paired_inputs (Any, *optional*):
            Zero or more lists that mirror the structure of `images` (flat list, or list of lists when
            `is_nested=True`). Each element is paired 1:1 with the corresponding image so it can be grouped by the
            same shape key. These paired values are grouped alongside `images` but are not stacked in the output, so
            they do not need to be tensors.
        disable_grouping (bool):
            Whether to disable grouping. If None, will be set to True if the images are on CPU, and False otherwise.
            This choice is based on empirical observations, as detailed here: https://github.com/huggingface/transformers/pull/38157
        is_nested (bool, *optional*, defaults to False):
            Whether the images are nested.

    Returns:
        tuple[dict, ...]:
            - A dictionary with shape as key and list/batch of images with that shape as value. When grouping is
              disabled the key is the original index instead and every value is a batch of one image.
            - Zero or more dictionaries (one per argument in `*paired_inputs`) grouped consistently with `images`; these carry
              the corresponding per-item values and are not stacked. When grouping is disabled, tensor values get a
              leading dimension of size 1 and any other value is wrapped in a one-element list.
            - A dictionary mapping original indices to (shape, index) tuples
    """
    # If disable grouping is not explicitly provided, we favor disabling it if the images are on CPU, and enabling it otherwise.
    if disable_grouping is None:
        device = _get_device_from_images(images, is_nested)
        # Compare the device *type*: a `torch.device` object never compares equal to the plain string "cpu".
        disable_grouping = device.type == "cpu"

    if disable_grouping:
        grouped_images_index = {key: (key, 0) for key, _ in _iterate_items(images, is_nested)}
        if is_nested:
            grouped_images_index["_num_sublists"] = len(images)
        return (
            {key: img.unsqueeze(0) for key, img in _iterate_items(images, is_nested)},
            *[
                {
                    # Paired values are not required to be tensors; only tensors support `unsqueeze`.
                    key: item.unsqueeze(0) if isinstance(item, torch.Tensor) else [item]
                    for key, item in _iterate_items(paired_list, is_nested)
                }
                for paired_list in paired_inputs
            ],
            grouped_images_index,
        )

    # Handle single level nested structure
    grouped_images, *paired_grouped_values, grouped_images_index = _group_images_by_shape(
        images, *paired_inputs, is_nested=is_nested
    )

    # Stack images with the same shape
    grouped_images = {shape: torch.stack(images_list, dim=0) for shape, images_list in grouped_images.items()}

    return grouped_images, *paired_grouped_values, grouped_images_index
+def reorder_images(
+    processed_images: dict[tuple[int, int], "torch.Tensor"],
+    grouped_images_index: dict[int | tuple[int, int], tuple[tuple[int, int], int]],
+    is_nested: bool = False,
+) -> Union[list["torch.Tensor"], "torch.Tensor"]:
+    """
+    Reconstructs images in the original order, preserving the original structure (nested or not).
+    The input structure is either all flat or all nested.
+    Args:
+        processed_images (dict[tuple[int, int], "torch.Tensor"]):
+            Dictionary mapping shapes to batched processed images.
+        grouped_images_index (dict[Union[int, tuple[int, int]], tuple[tuple[int, int], int]]):
+            Dictionary mapping original indices to (shape, index) tuples.
+        is_nested (bool, *optional*, defaults to False):
+            Whether the images are nested. Cannot be inferred from the input, as some processing functions outputs nested images.
+            even with non nested images,e.g functions splitting images into patches. We thus can't deduce is_nested from the input.
+    Returns:
+        Union[list["torch.Tensor"], "torch.Tensor"]:
+            Images in the original structure.
+    """
+    if not is_nested:
+        return [
+            processed_images[grouped_images_index[i][0]][grouped_images_index[i][1]]
+            for i in range(len(grouped_images_index))
+        ]
+    return _reconstruct_nested_structure(grouped_images_index, processed_images)

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/gemma3/__init__.py ADDED Viewed

	@@ -0,0 +1,30 @@

+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
if TYPE_CHECKING:
    # Seen by static type checkers only: expose the public names of every Gemma3 submodule.
    from .configuration_gemma3 import *
    from .image_processing_gemma3 import *
    from .image_processing_pil_gemma3 import *
    from .modeling_gemma3 import *
    from .processing_gemma3 import *
else:
    import sys
    _file = globals()["__file__"]
    # At runtime, replace this module's entry in `sys.modules` with a `_LazyModule` built from the import structure
    # of this file (presumably so that submodules are only imported on first attribute access - TODO confirm).
    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/gemma3/configuration_gemma3.py ADDED Viewed

	@@ -0,0 +1,225 @@

+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/gemma3/modular_gemma3.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_gemma3.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any
+from huggingface_hub.dataclasses import strict
+from ...configuration_utils import PreTrainedConfig
+from ...utils import auto_docstring, logging
+from ..siglip import SiglipVisionConfig
+logger = logging.get_logger(__name__)
@auto_docstring(checkpoint="google/gemma-3-4b-it")
@strict
class Gemma3TextConfig(PreTrainedConfig):
    r"""
    query_pre_attn_scalar (`int`, *optional*, defaults to 256):
        scaling factor used on the attention scores
    final_logit_softcapping (`float`, *optional*):
        Scaling factor when applying tanh softcapping on the logits.
    attn_logit_softcapping (`float`, *optional*):
        Scaling factor when applying tanh softcapping on the attention scores.
    use_bidirectional_attention (`bool`, *optional*, defaults to `False`):
        If True, the model will attend to all text tokens instead of using a causal mask. This does not change
        behavior for vision tokens.

    ```python
    >>> from transformers import Gemma3TextModel, Gemma3TextConfig

    >>> # Initializing a Gemma3Text gemma3_text-7b style configuration
    >>> configuration = Gemma3TextConfig()

    >>> # Initializing a model from the gemma3_text-7b style configuration
    >>> model = Gemma3TextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "gemma3_text"
    keys_to_ignore_at_inference = ["past_key_values"]
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.q_norm": "replicated_with_grad_allreduce",
        "layers.*.self_attn.k_norm": "replicated_with_grad_allreduce",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    vocab_size: int = 262_208
    hidden_size: int = 2304
    intermediate_size: int = 9216
    num_hidden_layers: int = 26
    num_attention_heads: int = 8
    num_key_value_heads: int = 4
    head_dim: int = 256
    hidden_activation: str = "gelu_pytorch_tanh"
    max_position_embeddings: int = 131_072
    initializer_range: float = 0.02
    rms_norm_eps: float = 1e-6
    use_cache: bool = True
    pad_token_id: int | None = 0
    eos_token_id: int | list[int] | None = 1
    bos_token_id: int | None = 2
    tie_word_embeddings: bool = True
    rope_parameters: dict | None = None
    attention_bias: bool = False
    attention_dropout: int | float | None = 0.0
    query_pre_attn_scalar: int = 256
    sliding_window: int | None = 4096
    layer_types: list[str] | None = None
    final_logit_softcapping: float | None = None
    attn_logit_softcapping: float | None = None
    use_bidirectional_attention: bool | None = False
    default_theta = {"global": 1_000_000.0, "local": 10_000.0}

    def __post_init__(self, **kwargs):
        """Derive `sliding_window` and `layer_types` defaults, then defer to the parent post-init."""
        # `sliding_window` is declared as `int | None`; only halve it when a window is actually configured.
        if self.use_bidirectional_attention and self.sliding_window is not None:
            self.sliding_window = (self.sliding_window // 2) + 1  # due to fa we set exclusive bounds

        # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
        self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 6)
        if self.layer_types is None:
            # Every `_sliding_window_pattern`-th layer uses full attention, all others sliding attention.
            self.layer_types = [
                "sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention"
                for i in range(self.num_hidden_layers)
            ]

        super().__post_init__(**kwargs)

    def validate_architecture(self):
        """Part of `@strict`-powered validation. Validates the architecture of the config."""
        if self.hidden_size % self.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention "
                f"heads ({self.num_attention_heads})."
            )

    def convert_rope_params_to_dict(self, **kwargs):
        """
        Fill `self.rope_parameters` with one entry per attention type and return the remaining keyword arguments.

        The legacy keys `rope_scaling`, `rope_theta` and `rope_local_base_freq` are removed from `kwargs`:
        `rope_scaling` is merged into the `"full_attention"` entry, while the other two only provide a `rope_theta`
        for `"full_attention"` / `"sliding_attention"` when that entry does not define one already.
        """
        rope_scaling = kwargs.pop("rope_scaling", None)

        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters`
        # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format
        default_rope_params = {
            "sliding_attention": {"rope_type": "default"},
            "full_attention": {"rope_type": "default"},
        }
        self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params

        # Make sure both per-attention-type entries exist *before* anything is merged into them; user supplied
        # `rope_parameters` may miss one of them.
        if self.rope_parameters.get("full_attention") is None:
            self.rope_parameters["full_attention"] = {"rope_type": "default"}
        if self.rope_parameters.get("sliding_attention") is None:
            self.rope_parameters["sliding_attention"] = {"rope_type": "default"}

        if rope_scaling is not None:
            self.rope_parameters["full_attention"].update(rope_scaling)

        # Set default values if not present
        self.rope_parameters["full_attention"].setdefault(
            "rope_theta", kwargs.pop("rope_theta", self.default_theta["global"])
        )
        self.rope_parameters["sliding_attention"].setdefault(
            "rope_theta", kwargs.pop("rope_local_base_freq", self.default_theta["local"])
        )

        # Standardize and validate the correctness of rotary position embeddings parameters
        self.standardize_rope_params()
        return kwargs
@auto_docstring(checkpoint="google/gemma-3-4b-it")
@strict
class Gemma3Config(PreTrainedConfig):
    r"""
    mm_tokens_per_image (`int`, *optional*, defaults to 256):
        The number of tokens per image embedding.
    boi_token_index (`int`, *optional*, defaults to 255999):
        The begin-of-image token index to wrap the image prompt.
    eoi_token_index (`int`, *optional*, defaults to 256000):
        The end-of-image token index to wrap the image prompt.

    Example:

    ```python
    >>> from transformers import Gemma3ForConditionalGeneration, Gemma3Config, SiglipVisionConfig, Gemma3TextConfig

    >>> # Initializing a Siglip-like vision config
    >>> vision_config = SiglipVisionConfig()

    >>> # Initializing a Gemma3 Text config
    >>> text_config = Gemma3TextConfig()

    >>> # Initializing a Gemma3 gemma-3-4b style configuration
    >>> configuration = Gemma3Config(text_config=text_config, vision_config=vision_config)

    >>> # Initializing a model from the gemma-3-4b style configuration
    >>> model = Gemma3ForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "gemma3"
    attribute_map = {
        "image_token_id": "image_token_index",
        "boi_token_id": "boi_token_index",
        "eoi_token_id": "eoi_token_index",
    }
    sub_configs = {
        "text_config": Gemma3TextConfig,
        "vision_config": SiglipVisionConfig,
    }

    text_config: Gemma3TextConfig | dict[str, Any] | None = None
    vision_config: SiglipVisionConfig | dict[str, Any] | None = None
    mm_tokens_per_image: int | None = 256
    boi_token_index: int | None = 255_999
    eoi_token_index: int | None = 256_000
    image_token_index: int | None = 262_144
    initializer_range: float | None = 0.02
    tie_word_embeddings: bool | None = True

    def __post_init__(self, **kwargs):
        """Turn missing or dict-valued sub-configs into config objects, then defer to the parent post-init."""
        text_config = self.text_config
        if isinstance(text_config, dict):
            self.text_config = Gemma3TextConfig(**text_config)
        elif text_config is None:
            self.text_config = Gemma3TextConfig()
            logger.info("text_config is None, using default Gemma3TextConfig text config.")

        vision_config = self.vision_config
        if vision_config is None:
            self.vision_config = SiglipVisionConfig()
            logger.info("vision_config is None, using default SiglipVisionConfig vision config.")
        elif isinstance(vision_config, dict):
            self.vision_config = SiglipVisionConfig(**vision_config)

        super().__post_init__(**kwargs)
+__all__ = ["Gemma3Config", "Gemma3TextConfig"]

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/gemma3/image_processing_gemma3.py ADDED Viewed

	@@ -0,0 +1,250 @@

+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for Gemma3."""
+import itertools
+import math
+import torch
+from torchvision.transforms.v2 import functional as tvF
+from ...image_processing_backends import TorchvisionBackend
+from ...image_processing_utils import BatchFeature
+from ...image_transforms import group_images_by_shape, reorder_images
+from ...image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ImageInput,
+    PILImageResampling,
+    SizeDict,
+)
+from ...processing_utils import ImagesKwargs, Unpack
+from ...utils import (
+    TensorType,
+    auto_docstring,
+)
class Gemma3ImageProcessorKwargs(ImagesKwargs, total=False):
    """
    do_pan_and_scan (`bool`, *optional*):
        Whether to apply `pan_and_scan` to images.
    pan_and_scan_min_crop_size (`int`, *optional*):
        Minimum size of each crop in pan and scan.
    pan_and_scan_max_num_crops (`int`, *optional*):
        Maximum number of crops per image in pan and scan.
    pan_and_scan_min_ratio_to_activate (`float`, *optional*):
        Minimum aspect ratio to activate pan and scan.
    """
    # Gemma3-specific keyword options, declared on top of those inherited from `ImagesKwargs`.
    # NOTE(review): `total=False` suggests a TypedDict whose keys are all optional - confirm against `ImagesKwargs`.
    do_pan_and_scan: bool
    pan_and_scan_min_crop_size: int
    pan_and_scan_max_num_crops: int
    pan_and_scan_min_ratio_to_activate: float
+@auto_docstring
+class Gemma3ImageProcessor(TorchvisionBackend):
+    resample = PILImageResampling.BILINEAR
+    image_mean = IMAGENET_STANDARD_MEAN
+    image_std = IMAGENET_STANDARD_STD
+    size = {"height": 224, "width": 224}
+    default_to_square = True
+    do_convert_rgb = True
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    do_pan_and_scan = None
+    pan_and_scan_min_crop_size = None
+    pan_and_scan_max_num_crops = None
+    pan_and_scan_min_ratio_to_activate = None
+    valid_kwargs = Gemma3ImageProcessorKwargs
+    model_input_names = ["pixel_values", "num_crops"]
    def __init__(self, **kwargs: Unpack[Gemma3ImageProcessorKwargs]):
        # No Gemma3-specific state is set up here: all keyword options are forwarded to the parent initializer.
        # The override only narrows the type of `**kwargs` to the Gemma3 keyword set.
        super().__init__(**kwargs)
    @auto_docstring
    def preprocess(self, images: ImageInput, **kwargs: Unpack[Gemma3ImageProcessorKwargs]) -> BatchFeature:
        # Thin override: delegates unchanged to the parent implementation. The signature is redeclared so that
        # `**kwargs` is typed with the Gemma3-specific keyword set.
        return super().preprocess(images, **kwargs)
+    def pan_and_scan_batched(
+        self,
+        images: "torch.Tensor",
+        pan_and_scan_min_crop_size: int,
+        pan_and_scan_max_num_crops: int,
+        pan_and_scan_min_ratio_to_activate: float,
+    ):
+        """
+        Pan and Scan an image, by cropping into smaller images when the aspect ratio exceeds
+        minimum allowed ratio.
+        Args:
+            images (`torch.Tensor`):
+                Image to resize.
+            pan_and_scan_min_crop_size (`int`, *optional*):
+                Minimum size of each crop in pan and scan.
+            pan_and_scan_max_num_crops (`int`, *optional*):
+                Maximum number of crops per image in pan and scan.
+            pan_and_scan_min_ratio_to_activate (`float`, *optional*):
+                Minimum aspect ratio to activate pan and scan.
+        """
+        height, width = images.shape[-2:]
+        # Square or landscape image.
+        if width >= height:
+            # Only apply PaS if the image is sufficiently exaggerated
+            if width / height < pan_and_scan_min_ratio_to_activate:
+                return []
+            # Select ideal number of crops close to the image aspect ratio and such that crop_size > min_crop_size.
+            num_crops_w = int(math.floor(width / height + 0.5))  # Half round up rounding.
+            num_crops_w = min(int(math.floor(width / pan_and_scan_min_crop_size)), num_crops_w)
+            # Make sure the number of crops is in range [2, pan_and_scan_max_num_crops].
+            num_crops_w = max(2, num_crops_w)
+            num_crops_w = min(pan_and_scan_max_num_crops, num_crops_w)
+            num_crops_h = 1
+        # Portrait image.
+        else:
+            # Only apply PaS if the image is sufficiently exaggerated
+            if height / width < pan_and_scan_min_ratio_to_activate:
+                return []
+            # Select ideal number of crops close to the image aspect ratio and such that crop_size > min_crop_size.
+            num_crops_h = int(math.floor(height / width + 0.5))
+            num_crops_h = min(int(math.floor(height / pan_and_scan_min_crop_size)), num_crops_h)
+            # Make sure the number of crops is in range [2, pan_and_scan_max_num_crops].
+            num_crops_h = max(2, num_crops_h)
+            num_crops_h = min(pan_and_scan_max_num_crops, num_crops_h)
+            num_crops_w = 1
+        crop_size_w = int(math.ceil(width / num_crops_w))
+        crop_size_h = int(math.ceil(height / num_crops_h))
+        # Don't apply PaS if crop size is too small.
+        if min(crop_size_w, crop_size_h) < pan_and_scan_min_crop_size:
+            return []
+        crop_positions_w = [crop_size_w * i for i in range(num_crops_w)]
+        crop_positions_h = [crop_size_h * i for i in range(num_crops_h)]
+        return [
+            images[..., pos_h : pos_h + crop_size_h, pos_w : pos_w + crop_size_w]
+            for pos_h, pos_w in itertools.product(crop_positions_h, crop_positions_w)
+        ]
+    def _process_images_for_pan_and_scan(
+        self,
+        images: list["torch.Tensor"],
+        do_pan_and_scan: bool,
+        pan_and_scan_min_crop_size: int,
+        pan_and_scan_max_num_crops: int,
+        pan_and_scan_min_ratio_to_activate: float,
+    ):
+        pas_images = self.pan_and_scan_batched(
+            images=images,
+            pan_and_scan_min_crop_size=pan_and_scan_min_crop_size,
+            pan_and_scan_max_num_crops=pan_and_scan_max_num_crops,
+            pan_and_scan_min_ratio_to_activate=pan_and_scan_min_ratio_to_activate,
+        )
+        num_crops = [len(pas_images) for _ in images]
+        return pas_images, num_crops
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        size: SizeDict,
+        resample: "PILImageResampling | tvF.InterpolationMode | int | None",
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: float | list[float] | None,
+        image_std: float | list[float] | None,
+        disable_grouping: bool | None,
+        return_tensors: str | TensorType | None,
+        do_pan_and_scan: bool | None = None,
+        pan_and_scan_min_crop_size: int | None = None,
+        pan_and_scan_max_num_crops: int | None = None,
+        pan_and_scan_min_ratio_to_activate: float | None = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Resize (optionally adding pan-and-scan crops), then rescale and normalize a list of images.
+        Images are processed in same-shape groups. With `do_pan_and_scan` each group contributes the original
+        images plus their crops, all resized to `size`; otherwise the group is resized only when `do_resize`
+        is set. The result is regrouped by shape before the fused rescale/normalize step.
+        Returns:
+            `BatchFeature` with `pixel_values` (processed images) and `num_crops` (crop count per input
+            image, `0` when pan-and-scan is disabled), converted according to `return_tensors`.
+        """
+        # Group images by size for batched processing
+        processed_images_grouped = {}
+        num_crops_grouped = {}
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        for shape_images, stacked_images in grouped_images.items():
+            if do_pan_and_scan:
+                pas_images, num_crops = self._process_images_for_pan_and_scan(
+                    images=stacked_images,
+                    do_pan_and_scan=do_pan_and_scan,
+                    pan_and_scan_min_crop_size=pan_and_scan_min_crop_size,
+                    pan_and_scan_max_num_crops=pan_and_scan_max_num_crops,
+                    pan_and_scan_min_ratio_to_activate=pan_and_scan_min_ratio_to_activate,
+                )
+                # Add the thumbnails to the image patches
+                stacked_images = [stacked_images] + pas_images
+                # Group images by size for batched resizing (this will typically group thumbnails together and cropped patches together)
+                processed_image_patches_grouped = {}
+                grouped_image_patches, grouped_image_patches_index = group_images_by_shape(
+                    stacked_images, disable_grouping=disable_grouping
+                )
+                # NOTE(review): this branch resizes unconditionally; `do_resize` is only consulted in the
+                # non pan-and-scan branch below.
+                for shape, stacked_image_patches in grouped_image_patches.items():
+                    stacked_image_patches = self.resize(
+                        image=stacked_image_patches,
+                        size=size,
+                        resample=resample,
+                    )
+                    processed_image_patches_grouped[shape] = stacked_image_patches
+                processed_image_patches = reorder_images(processed_image_patches_grouped, grouped_image_patches_index)
+                # Transpose to have the thumbnails with their corresponding patches
+                # (assumes every entry is a batch whose first axis indexes the images of this group — TODO confirm)
+                stacked_images = torch.stack(processed_image_patches, dim=0).transpose(0, 1).contiguous()
+            else:
+                # No crops are generated when pan-and-scan is off.
+                num_crops = [0 for _ in stacked_images]
+                if do_resize:
+                    stacked_images = self.resize(
+                        image=stacked_images,
+                        size=size,
+                        resample=resample,
+                    )
+            num_crops_grouped[shape_images] = num_crops
+            processed_images_grouped[shape_images] = stacked_images
+        # Restore the caller's image order for both the pixels and the crop counts.
+        resized_images = reorder_images(processed_images_grouped, grouped_images_index)
+        # If pan and scan is enabled, we need to flatten the list of images
+        if do_pan_and_scan:
+            resized_images = [image for images_list in resized_images for image in images_list]
+        num_crops = reorder_images(num_crops_grouped, grouped_images_index)
+        # Group images by size for further processing
+        # Needed in case do_resize is False, or resize returns images with different sizes
+        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            # Fused rescale and normalize
+            stacked_images = self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            processed_images_grouped[shape] = stacked_images
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
+        return BatchFeature(
+            data={"pixel_values": processed_images, "num_crops": num_crops}, tensor_type=return_tensors
+        )
+# Names exported by `from <module> import *`.
+__all__ = ["Gemma3ImageProcessor"]

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/gemma3/image_processing_pil_gemma3.py ADDED Viewed

	@@ -0,0 +1,225 @@

+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for Gemma3."""
+import itertools
+import math
+import numpy as np
+from ...image_processing_backends import PilBackend
+from ...image_processing_utils import BatchFeature
+from ...image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ImageInput,
+    PILImageResampling,
+    SizeDict,
+    get_image_size,
+)
+from ...processing_utils import ImagesKwargs, Unpack
+from ...utils import (
+    TensorType,
+    auto_docstring,
+)
+# Adapted from transformers.models.gemma3.image_processing_gemma3.Gemma3ImageProcessorKwargs
+class Gemma3ImageProcessorKwargs(ImagesKwargs, total=False):
+    """
+    do_pan_and_scan (`bool`, *optional*):
+        Whether to apply `pan_and_scan` to images.
+    pan_and_scan_min_crop_size (`int`, *optional*):
+        Minimum size of each crop in pan and scan.
+    pan_and_scan_max_num_crops (`int`, *optional*):
+        Maximum number of crops per image in pan and scan.
+    pan_and_scan_min_ratio_to_activate (`float`, *optional*):
+        Minimum aspect ratio to activate pan and scan.
+    """
+    # Pan-and-scan keys accepted in addition to those inherited from `ImagesKwargs`.
+    # NOTE(review): `total=False` presumably makes every key optional (TypedDict semantics) — confirm
+    # against the definition of `ImagesKwargs`.
+    do_pan_and_scan: bool
+    pan_and_scan_min_crop_size: int
+    pan_and_scan_max_num_crops: int
+    pan_and_scan_min_ratio_to_activate: float
+# PIL/numpy-backed Gemma3 image processor: per-image resize, rescale and normalize, with optional
+# pan-and-scan cropping of images whose aspect ratio is far from square.
+@auto_docstring
+class Gemma3ImageProcessorPil(PilBackend):
+    # Class-level defaults for the preprocessing options.
+    resample = PILImageResampling.BILINEAR
+    image_mean = IMAGENET_STANDARD_MEAN
+    image_std = IMAGENET_STANDARD_STD
+    size = {"height": 224, "width": 224}
+    default_to_square = True
+    do_convert_rgb = True
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    # Pan-and-scan has no class-level default configuration.
+    do_pan_and_scan = None
+    pan_and_scan_min_crop_size = None
+    pan_and_scan_max_num_crops = None
+    pan_and_scan_min_ratio_to_activate = None
+    valid_kwargs = Gemma3ImageProcessorKwargs
+    model_input_names = ["pixel_values", "num_crops"]
+    def __init__(self, **kwargs: Unpack[Gemma3ImageProcessorKwargs]):
+        super().__init__(**kwargs)
+    # Thin override that only narrows the accepted keyword arguments; the work happens in the base class.
+    @auto_docstring
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[Gemma3ImageProcessorKwargs]) -> BatchFeature:
+        return super().preprocess(images, **kwargs)
+    def pan_and_scan(
+        self,
+        image: np.ndarray,
+        pan_and_scan_min_crop_size: int,
+        pan_and_scan_max_num_crops: int,
+        pan_and_scan_min_ratio_to_activate: float,
+    ):
+        """
+        Pan and Scan an image, by cropping into smaller images when the aspect ratio exceeds
+        minimum allowed ratio.
+        Args:
+            image (`np.ndarray`):
+                Image to crop, in channels-first layout.
+            pan_and_scan_min_crop_size (`int`):
+                Minimum size of each crop in pan and scan.
+            pan_and_scan_max_num_crops (`int`):
+                Maximum number of crops per image in pan and scan.
+            pan_and_scan_min_ratio_to_activate (`float`):
+                Minimum aspect ratio to activate pan and scan.
+        Returns:
+            `list[np.ndarray]`: The crops, ordered by row then column of the crop grid. The list is empty
+            when the aspect ratio is below the activation threshold or the crops would be smaller than
+            `pan_and_scan_min_crop_size`.
+        """
+        height, width = get_image_size(image, channel_dim="channels_first")
+        # Square or landscape image.
+        if width >= height:
+            # Only apply PaS if the image is sufficiently exaggerated
+            if width / height < pan_and_scan_min_ratio_to_activate:
+                return []
+            # Select ideal number of crops close to the image aspect ratio and such that crop_size > min_crop_size.
+            num_crops_w = int(math.floor(width / height + 0.5))  # Half round up rounding.
+            num_crops_w = min(int(math.floor(width / pan_and_scan_min_crop_size)), num_crops_w)
+            # Make sure the number of crops is in range [2, pan_and_scan_max_num_crops].
+            num_crops_w = max(2, num_crops_w)
+            num_crops_w = min(pan_and_scan_max_num_crops, num_crops_w)
+            num_crops_h = 1
+        # Portrait image.
+        else:
+            # Only apply PaS if the image is sufficiently exaggerated
+            if height / width < pan_and_scan_min_ratio_to_activate:
+                return []
+            # Select ideal number of crops close to the image aspect ratio and such that crop_size > min_crop_size.
+            num_crops_h = int(math.floor(height / width + 0.5))
+            num_crops_h = min(int(math.floor(height / pan_and_scan_min_crop_size)), num_crops_h)
+            # Make sure the number of crops is in range [2, pan_and_scan_max_num_crops].
+            num_crops_h = max(2, num_crops_h)
+            num_crops_h = min(pan_and_scan_max_num_crops, num_crops_h)
+            num_crops_w = 1
+        # Ceil so the crop grid covers the full extent; the last crop may be smaller after slicing.
+        crop_size_w = int(math.ceil(width / num_crops_w))
+        crop_size_h = int(math.ceil(height / num_crops_h))
+        # Don't apply PaS if crop size is too small.
+        if min(crop_size_w, crop_size_h) < pan_and_scan_min_crop_size:
+            return []
+        crop_positions_w = [crop_size_w * i for i in range(num_crops_w)]
+        crop_positions_h = [crop_size_h * i for i in range(num_crops_h)]
+        # Images are channels-first (CHW format)
+        return [
+            image[:, pos_h : pos_h + crop_size_h, pos_w : pos_w + crop_size_w]
+            for pos_h, pos_w in itertools.product(crop_positions_h, crop_positions_w)
+        ]
+    def _process_images_for_pan_and_scan(
+        self,
+        images: list[np.ndarray],
+        do_pan_and_scan: bool,
+        pan_and_scan_min_crop_size: int,
+        pan_and_scan_max_num_crops: int,
+        pan_and_scan_min_ratio_to_activate: float,
+    ):
+        """
+        Apply `pan_and_scan` to every image and flatten the results.
+        Returns a `(images_and_crops, num_crops)` pair: each original image immediately followed by its
+        crops, and the number of crops produced per input image. `do_pan_and_scan` is not read here.
+        """
+        # NOTE: `_preprocess` in this class does not call this helper; it inlines the same per-image logic.
+        pas_images_list = []
+        num_crops = []
+        for image in images:
+            pas_images = self.pan_and_scan(
+                image=image,
+                pan_and_scan_min_crop_size=pan_and_scan_min_crop_size,
+                pan_and_scan_max_num_crops=pan_and_scan_max_num_crops,
+                pan_and_scan_min_ratio_to_activate=pan_and_scan_min_ratio_to_activate,
+            )
+            pas_images_list.extend([image] + pas_images)
+            num_crops.append(len(pas_images))
+        return pas_images_list, num_crops
+    def _preprocess(
+        self,
+        images: list[np.ndarray],
+        do_resize: bool,
+        size: SizeDict,
+        resample: "PILImageResampling | None",
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: float | list[float] | None,
+        image_std: float | list[float] | None,
+        return_tensors: str | TensorType | None,
+        do_pan_and_scan: bool | None = None,
+        pan_and_scan_min_crop_size: int | None = None,
+        pan_and_scan_max_num_crops: int | None = None,
+        pan_and_scan_min_ratio_to_activate: float | None = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Process images one at a time: optional pan-and-scan, then resize, rescale and normalize.
+        Returns a `BatchFeature` with `pixel_values` (each original image followed by its crops) and
+        `num_crops` (crops per input image, `0` when pan-and-scan is disabled).
+        """
+        processed_images = []
+        num_crops = []
+        for image in images:
+            if do_pan_and_scan:
+                pas_images = self.pan_and_scan(
+                    image=image,
+                    pan_and_scan_min_crop_size=pan_and_scan_min_crop_size,
+                    pan_and_scan_max_num_crops=pan_and_scan_max_num_crops,
+                    pan_and_scan_min_ratio_to_activate=pan_and_scan_min_ratio_to_activate,
+                )
+                # Add the original image and its crops
+                image_list = [image] + pas_images
+                num_crops.append(len(pas_images))
+            else:
+                image_list = [image]
+                num_crops.append(0)
+            # Process each image (original + crops if pan_and_scan)
+            processed_image_list = []
+            for img in image_list:
+                # Each step is individually gated, so crops skip resizing when `do_resize` is false.
+                if do_resize:
+                    img = self.resize(image=img, size=size, resample=resample)
+                if do_rescale:
+                    img = self.rescale(image=img, scale=rescale_factor)
+                if do_normalize:
+                    img = self.normalize(image=img, mean=image_mean, std=image_std)
+                processed_image_list.append(img)
+            processed_images.extend(processed_image_list)
+        return BatchFeature(
+            data={"pixel_values": processed_images, "num_crops": num_crops}, tensor_type=return_tensors
+        )
+# Names exported by `from <module> import *`.
+__all__ = ["Gemma3ImageProcessorPil"]

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/gemma3/modeling_gemma3.py ADDED Viewed

	@@ -0,0 +1,1118 @@

+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/gemma3/modular_gemma3.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_gemma3.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from collections.abc import Callable
+from dataclasses import dataclass
+from typing import Optional
+import torch
+import torch.nn as nn
+from ... import initialization as init
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...configuration_utils import PreTrainedConfig
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_func_from_hub, use_kernelized_func
+from ...masking_utils import create_causal_mask, create_masks_for_generate, create_sliding_window_causal_mask
+from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutputWithPast,
+    BaseModelOutputWithPooling,
+    CausalLMOutputWithPast,
+    SequenceClassifierOutputWithPast,
+)
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, torch_compilable_check
+from ...utils.generic import maybe_autocast, merge_with_config_defaults
+from ...utils.output_capturing import capture_outputs
+from ..auto import AutoModel
+from .configuration_gemma3 import Gemma3Config, Gemma3TextConfig
+@auto_docstring(
+    custom_intro="""
+    Base class for Gemma3 outputs, with hidden states and attentions.
+    """
+)
+@dataclass
+class Gemma3ModelOutputWithPast(BaseModelOutputWithPast):
+    r"""
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    """
+    # The only field added on top of `BaseModelOutputWithPast`; defaults to None.
+    image_hidden_states: torch.FloatTensor | None = None
+@auto_docstring(
+    custom_intro="""
+    Base class for Gemma3 causal language model (or autoregressive) outputs.
+    """
+)
+@dataclass
+class Gemma3CausalLMOutputWithPast(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
+    """
+    # Every field defaults to None; this class derives from `ModelOutput` directly, so all fields are
+    # declared here rather than inherited.
+    loss: torch.FloatTensor | None = None
+    logits: torch.FloatTensor | None = None
+    past_key_values: Cache | None = None
+    hidden_states: tuple[torch.FloatTensor] | None = None
+    attentions: tuple[torch.FloatTensor] | None = None
+    image_hidden_states: torch.FloatTensor | None = None
+class Gemma3TextScaledWordEmbedding(nn.Embedding):
+    """
+    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
+    """
+    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: float = 1.0):
+        super().__init__(num_embeddings, embedding_dim, padding_idx)
+        self.scalar_embed_scale = embed_scale
+        self.register_buffer("embed_scale", torch.tensor(embed_scale), persistent=False)
+    def forward(self, input_ids: torch.Tensor):
+        return super().forward(input_ids) * self.embed_scale.to(self.weight.dtype)
+class Gemma3MLP(nn.Module):
+    def __init__(self, config: Gemma3TextConfig):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_activation]
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+class Gemma3RMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.zeros(dim))
+    def _norm(self, x):
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+    def forward(self, x):
+        output = self._norm(x.float())
+        # Llama does x.to(float16) * w whilst Gemma3 is (x * w).to(float16)
+        # See https://github.com/huggingface/transformers/pull/29402
+        output = output * (1.0 + self.weight.float())
+        return output.type_as(x)
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.eps}"
+class Gemma3RotaryEmbedding(nn.Module):
+    # Rotary position embedding that keeps one inverse-frequency buffer (and attention scaling) per
+    # distinct entry of `config.layer_types`, stored under names prefixed with the layer type.
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+    def __init__(self, config: Gemma3TextConfig):
+        super().__init__()
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+        self.config = config
+        # De-duplicate the per-layer types; one set of buffers is built per distinct type.
+        self.layer_types = list(set(config.layer_types))
+        self.rope_type = {}
+        for layer_type in self.layer_types:
+            rope_params = self.config.rope_parameters[layer_type]
+            # Layer types without RoPE parameters get no buffers.
+            if rope_params is None:
+                continue
+            self.rope_type[layer_type] = rope_params["rope_type"]
+            # "default" uses the local implementation; any other type is looked up in ROPE_INIT_FUNCTIONS.
+            rope_init_fn: Callable = self.compute_default_rope_parameters
+            if self.rope_type[layer_type] != "default":
+                rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type[layer_type]]
+            curr_inv_freq, curr_attention_scaling = rope_init_fn(self.config, layer_type=layer_type)
+            # Non-persistent buffers: not saved in the state dict. A clone is kept as the "original" copy.
+            self.register_buffer(f"{layer_type}_inv_freq", curr_inv_freq, persistent=False)
+            self.register_buffer(f"{layer_type}_original_inv_freq", curr_inv_freq.clone(), persistent=False)
+            setattr(self, f"{layer_type}_attention_scaling", curr_attention_scaling)
+    @staticmethod
+    def compute_default_rope_parameters(
+        config: Gemma3TextConfig | None = None,
+        device: Optional["torch.device"] = None,
+        seq_len: int | None = None,
+        layer_type: str | None = None,
+    ) -> tuple["torch.Tensor", float]:
+        """
+        Computes the inverse frequencies according to the original RoPE implementation
+        Args:
+            config ([`~transformers.PreTrainedConfig`]):
+                The model configuration.
+            device (`torch.device`):
+                The device to use for initialization of the inverse frequencies.
+            seq_len (`int`, *optional*):
+                The current sequence length. Unused for this type of RoPE.
+            layer_type (`str`, *optional*):
+                The current layer type if the model has different RoPE parameters per type.
+                Should not be used unless `config.layer_types is not None`
+        Returns:
+            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
+        """
+        # RoPE base for this layer type, read from the per-layer-type parameter dict.
+        base = config.rope_parameters[layer_type]["rope_theta"]
+        # Use `config.head_dim` when set to a truthy value, else derive it from the hidden size.
+        dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
+        attention_factor = 1.0  # Unused in this type of RoPE
+        # Compute the inverse frequencies
+        inv_freq = 1.0 / (
+            base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)
+        )
+        return inv_freq, attention_factor
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids, layer_type=None):
+        # Returns `(cos, sin)` for `position_ids`, cast to the dtype of `x`; `x` itself is only read for
+        # its device and dtype.
+        inv_freq = getattr(self, f"{layer_type}_inv_freq")
+        attention_scaling = getattr(self, f"{layer_type}_attention_scaling")
+        # Broadcast the frequencies over the first axis of `position_ids` so they can be matmul-ed below.
+        inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+        # "mps" (or a non-string device type) falls back to "cpu" for the autocast context.
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            # Duplicate the angles along the last axis before taking cos/sin.
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * attention_scaling
+            sin = emb.sin() * attention_scaling
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+@use_kernel_func_from_hub("rotary_pos_emb")
+def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: torch.Tensor | None,
+    dropout: float | int = 0.0,
+    scaling: float | None = None,
+    softcap: float | None = None,
+    **kwargs,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    if scaling is None:
+        scaling = module.head_dim**-0.5
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if softcap is not None:
+        attn_weights = attn_weights / softcap
+        attn_weights = torch.tanh(attn_weights)
+        attn_weights = attn_weights * softcap
+    if attention_mask is not None:
+        attn_weights = attn_weights + attention_mask
+    # upcast attention to fp32
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    return attn_output, attn_weights
+@use_kernelized_func(apply_rotary_pos_emb)
+class Gemma3Attention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+    def __init__(self, config: Gemma3TextConfig, layer_idx: int):
+        super().__init__()
+        self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = config.query_pre_attn_scalar**-0.5
+        self.attention_dropout = self.config.attention_dropout
+        self.is_causal = not self.config.use_bidirectional_attention
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+        self.attn_logit_softcapping = self.config.attn_logit_softcapping
+        self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None
+        self.is_sliding = self.layer_type == "sliding_attention"
+        self.q_norm = Gemma3RMSNorm(dim=config.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = Gemma3RMSNorm(dim=config.head_dim, eps=config.rms_norm_eps)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: torch.Tensor = None,
+        attention_mask: torch.Tensor | None = None,
+        past_key_values: Cache | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        query_states = self.q_norm(query_states)
+        key_states = self.k_norm(key_states)
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        if past_key_values is not None:
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=self.attention_dropout if self.training else 0.0,
+            scaling=self.scaling,
+            sliding_window=self.sliding_window,
+            **kwargs,
+        )
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+# A single Gemma3 transformer block. Self-attention and the MLP are each wrapped by two
+# RMSNorm layers (one before, one after) and their result is added onto the residual stream.
+class Gemma3DecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: Gemma3TextConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        # Index of this block in the stack; also handed to the attention module below.
+        self.layer_idx = layer_idx
+        self.self_attn = Gemma3Attention(config=config, layer_idx=layer_idx)
+        self.mlp = Gemma3MLP(config)
+        # Four norms over `hidden_size`, all sharing `config.rms_norm_eps`. forward() applies them as:
+        # input_layernorm -> attention -> post_attention_layernorm,
+        # pre_feedforward_layernorm -> mlp -> post_feedforward_layernorm.
+        self.input_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)
+        self.pre_feedforward_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)
+        self.post_feedforward_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)
+    # Runs the attention sub-block and then the MLP sub-block, returning the updated hidden states.
+    # All arguments other than `hidden_states` (and any extra `kwargs`) are passed straight to `self.self_attn`.
+    # NOTE(review): the return annotation advertises a tuple, but only the hidden-states tensor is returned.
+    # NOTE(review): `position_embeddings` defaults to None although it is annotated as a plain `torch.Tensor`.
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: torch.Tensor = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.FloatTensor, tuple[torch.FloatTensor, torch.FloatTensor] | None]:
+        # --- Attention sub-block: norm -> attention -> norm -> residual add ---
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # The second value returned by the attention module (its attention weights) is discarded here.
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            position_embeddings=position_embeddings,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            **kwargs,
+        )
+        # The post-attention norm is applied to the attention output *before* the residual is added back.
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+        # --- MLP sub-block: norm -> mlp -> norm -> residual add ---
+        residual = hidden_states
+        hidden_states = self.pre_feedforward_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_feedforward_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+# Common base class for the Gemma3 models in this file: it declares the class-level capability
+# flags read by `PreTrainedModel` and implements the Gemma3-specific weight initialisation.
+@auto_docstring
+class Gemma3PreTrainedModel(PreTrainedModel):
+    config: Gemma3Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    # Module class names (text decoder layer plus Siglip vision modules).
+    # Presumably these are the modules that must not be split across devices — confirm against PreTrainedModel.
+    _no_split_modules = [
+        "Gemma3DecoderLayer",
+        "SiglipVisionEmbeddings",
+        "SiglipEncoderLayer",
+        "SiglipMultiheadAttentionPoolingHead",
+    ]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    # Maps an output name to the module class whose outputs are recorded under that name
+    # (presumably consumed by the output-capturing decorator on the model forward — confirm).
+    _can_record_outputs = {
+        "hidden_states": Gemma3DecoderLayer,
+        "attentions": Gemma3Attention,
+    }
+    input_modalities = ("image", "text")
+    # Initialises one module: first the generic base-class initialisation, then Gemma3-specific overrides
+    # selected by module type. Runs without gradient tracking.
+    @torch.no_grad()
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, Gemma3MultiModalProjector):
+            # The vision -> text projection matrix starts at zero.
+            init.zeros_(module.mm_input_projection_weight)
+        # We initialize with 0s to be 1 centered as the RMSNorm here does (1 + weight)
+        # This branch matches by class *name*, so any module whose class name contains "RMSNorm" is zeroed.
+        elif "RMSNorm" in module.__class__.__name__:
+            init.zeros_(module.weight)
+        elif isinstance(module, Gemma3TextScaledWordEmbedding):
+            # Reset the embedding scale to the scalar value stored on the module.
+            init.constant_(module.embed_scale, module.scalar_embed_scale)
+        elif isinstance(module, Gemma3RotaryEmbedding):
+            # Recompute the inverse frequencies separately for every layer type the module tracks.
+            for layer_type in module.layer_types:
+                # Use the module's default RoPE computation unless this layer type asks for another variant.
+                rope_init_fn = module.compute_default_rope_parameters
+                if module.rope_type[layer_type] != "default":
+                    rope_init_fn = ROPE_INIT_FUNCTIONS[module.rope_type[layer_type]]
+                # Only the first returned value (the inverse frequencies) is used; the second is ignored.
+                curr_inv_freq, _ = rope_init_fn(module.config, layer_type=layer_type)
+                # Both the working copy and the "original" copy receive the same freshly computed values.
+                init.copy_(getattr(module, f"{layer_type}_inv_freq"), curr_inv_freq)
+                init.copy_(getattr(module, f"{layer_type}_original_inv_freq"), curr_inv_freq)
+# Factory for a mask predicate; `Gemma3TextModel.forward` passes the result as `or_mask_function`
+# for the sliding-window mask when bidirectional attention is enabled.
+def _bidirectional_window_overlay(sliding_window: int) -> Callable[[int, int, int, int], bool]:
+    """
+    Enables a bidirectional mask within the sliding window.
+    """
+    # `batch_idx` and `head_idx` are accepted to match the predicate signature but are not used.
+    # The test is symmetric in `q_idx` and `kv_idx`, so both past and future positions qualify.
+    def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
+        """A token can attend to any other token if their absolute distance is within
+        the (exclusive) sliding window size (distance < sliding_window)."""
+        return abs(q_idx - kv_idx) < sliding_window
+    return inner_mask
+# Text-only Gemma3 backbone: scaled token embedding, a stack of `Gemma3DecoderLayer`s, a final
+# RMSNorm and a shared rotary-embedding module. It has no language-modelling head.
+@auto_docstring
+class Gemma3TextModel(Gemma3PreTrainedModel):
+    config: Gemma3TextConfig
+    input_modalities = ("text",)
+    def __init__(self, config: Gemma3TextConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        # Gemma3 downcasts the below to bfloat16, causing sqrt(3072)=55.4256 to become 55.5. See https://github.com/huggingface/transformers/pull/29402
+        # The embedding scale passed here is sqrt(hidden_size).
+        self.embed_tokens = Gemma3TextScaledWordEmbedding(
+            config.vocab_size, config.hidden_size, self.padding_idx, embed_scale=self.config.hidden_size**0.5
+        )
+        # One decoder layer per hidden layer; each layer is told its own index.
+        self.layers = nn.ModuleList(
+            [Gemma3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = Gemma3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = Gemma3RotaryEmbedding(config)
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+    # Embeds the input (unless embeddings are supplied), builds the attention masks and rotary
+    # embeddings, runs every decoder layer, applies the final norm and returns a `BaseModelOutputWithPast`.
+    # Raises ValueError unless exactly one of `input_ids` / `inputs_embeds` is provided.
+    @merge_with_config_defaults
+    @capture_outputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        use_cache: bool | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        # The XOR is true when both inputs are given or both are missing; either case is rejected.
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        # Create a fresh cache only when caching is requested and the caller did not supply one.
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+        if position_ids is None:
+            # Default positions continue from the number of tokens already held in the cache (0 without a cache).
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            position_ids = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device) + past_seen_tokens
+            # Add a leading dimension of size 1 in front of the sequence dimension.
+            position_ids = position_ids.unsqueeze(0)
+        # It may already have been prepared by e.g. `generate`
+        # The walrus binds `causal_mask_mapping` to `attention_mask`; when that is already a dict it is used as-is.
+        if not isinstance(causal_mask_mapping := attention_mask, dict):
+            # Prepare mask arguments
+            mask_kwargs = {
+                "config": self.config,
+                "inputs_embeds": inputs_embeds,
+                "attention_mask": attention_mask,
+                "past_key_values": past_key_values,
+                "position_ids": position_ids,
+            }
+            # Shallow copy taken before any `or_mask_function` is added, so the two masks can get different ones.
+            sliding_mask_kwargs = mask_kwargs.copy()
+            if self.config.use_bidirectional_attention:
+                # Full-attention layers: an overlay that is always True.
+                mask_kwargs["or_mask_function"] = lambda *args: torch.tensor(True, dtype=torch.bool)
+                # Sliding-window layers: an overlay that is True within `sliding_window` positions in either direction.
+                sliding_mask_kwargs["or_mask_function"] = _bidirectional_window_overlay(self.config.sliding_window)
+            # Create the masks
+            causal_mask_mapping = {
+                "full_attention": create_causal_mask(**mask_kwargs),
+                "sliding_attention": create_sliding_window_causal_mask(**sliding_mask_kwargs),
+            }
+        # embed positions
+        hidden_states = inputs_embeds
+        # Rotary embeddings are computed once per *distinct* layer type and then shared by all layers of that type.
+        position_embeddings = {}
+        for layer_type in set(self.config.layer_types):
+            position_embeddings[layer_type] = self.rotary_emb(hidden_states, position_ids, layer_type)
+        # Each layer picks its mask and rotary embeddings by looking up its own entry in `config.layer_types`.
+        for i, decoder_layer in enumerate(self.layers[: self.config.num_hidden_layers]):
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask_mapping[self.config.layer_types[i]],
+                position_embeddings=position_embeddings[self.config.layer_types[i]],
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                **kwargs,
+            )
+        hidden_states = self.norm(hidden_states)
+        # Only these two fields are set here; per-layer hidden states / attentions are not collected in this body
+        # (presumably the `capture_outputs` decorator fills them in — confirm).
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+# Text-only causal language model: a `Gemma3TextModel` backbone followed by a bias-free linear head
+# that maps hidden states to vocabulary logits, with optional tanh soft-capping of those logits.
+@auto_docstring
+class Gemma3ForCausalLM(Gemma3PreTrainedModel, GenerationMixin):
+    # Declares that the head weight and the input embedding weight are tied.
+    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
+    _tp_plan = {"lm_head": "colwise_gather_output"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+    config: Gemma3TextConfig
+    def __init__(self, config: Gemma3TextConfig):
+        super().__init__(config)
+        self.model = Gemma3TextModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    # Runs the backbone, projects the selected positions to logits, optionally soft-caps them and,
+    # when `labels` is given, computes the loss through `self.loss_function`.
+    # NOTE(review): the docstring example below loads "google/gemma-2-9b" into Gemma3ForCausalLM;
+    # that looks like a leftover from the Gemma 2 implementation — confirm and update the checkpoint name.
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        labels: torch.LongTensor | None = None,
+        use_cache: bool | None = None,
+        logits_to_keep: int | torch.Tensor = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        Example:
+        ```python
+        >>> from transformers import AutoTokenizer, Gemma3ForCausalLM
+        >>> model = Gemma3ForCausalLM.from_pretrained("google/gemma-2-9b")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
+        >>> prompt = "What is your favorite condiment?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "What is your favorite condiment?"
+        ```"""
+        # Run the text backbone; its last hidden state feeds the head, the rest is passed through to the result.
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            **kwargs,
+        )
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        # An int keeps the last `logits_to_keep` positions; the default 0 gives slice(0, None), i.e. every position.
+        # A tensor is used directly as the index along the sequence dimension.
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        if self.config.final_logit_softcapping is not None:
+            # Soft-capping: cap * tanh(logits / cap), which bounds every logit to (-cap, cap).
+            logits = logits / self.config.final_logit_softcapping
+            logits = torch.tanh(logits)
+            logits = logits * self.config.final_logit_softcapping
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+# Turns vision-tower patch features into language-model inputs: average-pools the patch grid,
+# applies an RMSNorm and multiplies by a learned (vision_hidden, text_hidden) matrix.
+class Gemma3MultiModalProjector(nn.Module):
+    def __init__(self, config: Gemma3Config):
+        super().__init__()
+        # Projection matrix of shape (vision hidden size, text hidden size), created as all zeros.
+        self.mm_input_projection_weight = nn.Parameter(
+            torch.zeros(config.vision_config.hidden_size, config.text_config.hidden_size)
+        )
+        self.mm_soft_emb_norm = Gemma3RMSNorm(
+            config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps
+        )
+        # Patches along one side of the image (integer division).
+        self.patches_per_image = int(config.vision_config.image_size // config.vision_config.patch_size)
+        # Output tokens along one side; int() truncates if `mm_tokens_per_image` is not a perfect square.
+        self.tokens_per_side = int(config.mm_tokens_per_image**0.5)
+        # Pooling window chosen so the patch grid shrinks to roughly tokens_per_side per side (floor division).
+        self.kernel_size = self.patches_per_image // self.tokens_per_side
+        # Kernel size equals stride, so the pooling windows do not overlap.
+        self.avg_pool = nn.AvgPool2d(kernel_size=self.kernel_size, stride=self.kernel_size)
+    # Takes a rank-3 tensor (unpacked below as batch, patches, hidden) and returns the pooled,
+    # normalised and projected features, cast back to the dtype/device type of the input.
+    def forward(self, vision_outputs: torch.Tensor):
+        batch_size, _, hidden_size = vision_outputs.shape
+        # Move the hidden dimension in front of the patch dimension, then unfold patches into a square grid.
+        # The reshape only succeeds when the patch count equals patches_per_image ** 2.
+        reshaped_vision_outputs = vision_outputs.transpose(1, 2)
+        reshaped_vision_outputs = reshaped_vision_outputs.reshape(
+            batch_size, hidden_size, self.patches_per_image, self.patches_per_image
+        )
+        reshaped_vision_outputs = reshaped_vision_outputs.contiguous()
+        pooled_vision_outputs = self.avg_pool(reshaped_vision_outputs)
+        # Collapse the pooled grid back into a token axis and restore (batch, tokens, hidden) ordering.
+        pooled_vision_outputs = pooled_vision_outputs.flatten(2)
+        pooled_vision_outputs = pooled_vision_outputs.transpose(1, 2)
+        normed_vision_outputs = self.mm_soft_emb_norm(pooled_vision_outputs)
+        # Matrix product over the last dimension: vision hidden size -> text hidden size.
+        projected_vision_outputs = torch.matmul(normed_vision_outputs, self.mm_input_projection_weight)
+        return projected_vision_outputs.type_as(vision_outputs)
+# Labels every token with the index of the contiguous image block it belongs to.
+# Tokens whose type id equals 1 are treated as image tokens; each run of consecutive image tokens gets
+# the next zero-based id (0, 1, 2, ... along dim 1) and every non-image token gets -1.
+# The indexing below (`[:, :-1]`, `dim=1`) treats `token_type_ids` as (batch, sequence).
+# `device` is where the intermediate tensors (and hence the result) are placed; None leaves it unchanged.
+def get_block_sequence_ids_for_mask(token_type_ids: torch.Tensor, device: torch.device | None = None) -> torch.Tensor:
+    # First find where a new image block starts: 1 if image and previous not image
+    # The images cannot attend to future images, but can attend to all prev images and to itself bidirectionally
+    is_image = (token_type_ids == 1).to(device=device)
+    # Shift right by one along the sequence: pad one False on the left, drop the last column.
+    is_previous_image = nn.functional.pad(is_image, (1, 0), value=0)[:, :-1]
+    new_image_start = is_image & ~is_previous_image
+    # Running count of block starts, minus one, so the first block is numbered 0.
+    group_ids = torch.cumsum(new_image_start.int(), dim=1) - 1
+    # Keep the block id on image tokens only; everything else is marked -1.
+    block_sequence_ids = torch.where(is_image, group_ids, -1)
+    return block_sequence_ids
+# Multimodal Gemma3 backbone: a vision tower, a projector that maps image features into the
+# language model's embedding space, and a language model, all built from the sub-configs.
+@auto_docstring(
+    custom_intro="""
+    The Base Gemma3 model which consists of a vision backbone and a language model without language modeling head.,
+    """
+)
+class Gemma3Model(Gemma3PreTrainedModel):
+    # we are filtering the logits/labels so we shouldn't divide the loss based on num_items_in_batch
+    accepts_loss_kwargs = False
+    def __init__(self, config: Gemma3Config):
+        super().__init__(config)
+        self.vision_tower = AutoModel.from_config(config=config.vision_config)
+        self.multi_modal_projector = Gemma3MultiModalProjector(config)
+        self.vocab_size = config.text_config.vocab_size
+        language_model = AutoModel.from_config(config=config.text_config)
+        self.language_model = language_model
+        self.post_init()
+    # Runs the vision tower on `pixel_values` and returns its output object with `pooler_output`
+    # overwritten by the projected last hidden state.
+    @can_return_tuple
+    @auto_docstring(custom_intro="Projects the last hidden state from the vision model into language model space.")
+    def get_image_features(
+        self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
+    ) -> tuple | BaseModelOutputWithPooling:
+        vision_outputs = self.vision_tower(pixel_values=pixel_values, return_dict=True, **kwargs)
+        last_hidden_state = vision_outputs.last_hidden_state
+        vision_outputs.pooler_output = self.multi_modal_projector(last_hidden_state)
+        return vision_outputs
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            # Without token ids, a position counts as an image placeholder when its whole embedding vector
+            # equals the embedding of the image token.
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.config.image_token_id
+        # These two counts are only used in the error message below.
+        n_image_tokens = special_image_mask.sum()
+        n_image_features = image_features.shape[0] * image_features.shape[1]
+        # Broadcast the per-position mask over the embedding dimension so it can index `inputs_embeds`.
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        # The actual check compares element counts: masked embedding elements vs. image feature elements.
+        torch_compilable_check(
+            inputs_embeds[special_image_mask].numel() == image_features.numel(),
+            f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}",
+        )
+        return special_image_mask
+    # Embeds the text, scatters projected image features into the image-placeholder positions,
+    # builds the attention masks and runs the language model on the merged embeddings.
+    # `labels` is accepted but not used anywhere in this method body.
+    # Raises ValueError unless exactly one of `input_ids` / `inputs_embeds` is provided.
+    # NOTE(review): the docstring example loads "google/gemma32-3b-mix-224", which looks like a mangled
+    # leftover from another model's example — confirm the intended Gemma 3 checkpoint name.
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        pixel_values: torch.FloatTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        token_type_ids: torch.LongTensor | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        labels: torch.LongTensor | None = None,
+        use_cache: bool | None = None,
+        **lm_kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | Gemma3ModelOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
+        Example:
+        ```python
+        >>> from PIL import Image
+        >>> import httpx
+        >>> from io import BytesIO
+        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration
+        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma32-3b-mix-224")
+        >>> processor = AutoProcessor.from_pretrained("google/gemma32-3b-mix-224")
+        >>> prompt = "Where is the cat standing?"
+        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+        >>> with httpx.stream("GET", url) as response:
+        ...     image = Image.open(BytesIO(response.read()))
+        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs,)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Where is the cat standing?\nsnow"
+        ```"""
+        # The XOR is true when both inputs are given or both are missing; either case is rejected.
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        # Replace the image token id with id 0 if the image token is OOV, to avoid index errors
+        if input_ids is not None and self.config.image_token_id >= self.vocab_size:
+            special_image_mask = input_ids == self.config.image_token_id
+            # Work on a copy so the caller's `input_ids` are left untouched.
+            llm_input_ids = input_ids.clone()
+            llm_input_ids[special_image_mask] = 0
+        else:
+            llm_input_ids = input_ids
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(llm_input_ids)
+        # Merge text and images
+        if pixel_values is not None:
+            image_features = self.get_image_features(pixel_values, return_dict=True).pooler_output
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            # The original `input_ids` (not the OOV-safe copy) are used so image positions can still be found.
+            special_image_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            # Overwrite the placeholder embeddings with the image features, in order.
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+        # It may already have been prepared by e.g. `generate`
+        # The walrus binds `causal_mask_mapping` to `attention_mask`; when that is already a dict it is used as-is.
+        if not isinstance(causal_mask_mapping := attention_mask, dict):
+            mask_kwargs = {
+                "config": self.config.get_text_config(),
+                "inputs_embeds": inputs_embeds,
+                "attention_mask": attention_mask,
+                "past_key_values": past_key_values,
+                "position_ids": position_ids,
+            }
+            if token_type_ids is not None:
+                # Per-token image-block ids (-1 for non-image tokens), handed to the mask builders.
+                mask_kwargs["block_sequence_ids"] = get_block_sequence_ids_for_mask(
+                    token_type_ids, device=inputs_embeds.device
+                )
+            # Create the masks
+            # The copy is taken after `block_sequence_ids` is set, so both masks receive identical arguments.
+            sliding_mask_kwargs = mask_kwargs.copy()
+            causal_mask_mapping = {
+                "full_attention": create_causal_mask(**mask_kwargs),
+                "sliding_attention": create_sliding_window_causal_mask(**sliding_mask_kwargs),
+            }
+        # The language model only receives the merged embeddings (no `input_ids`) and the prepared mask dict.
+        outputs = self.language_model(
+            attention_mask=causal_mask_mapping,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            return_dict=True,
+            **lm_kwargs,
+        )
+        return Gemma3ModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            # `image_features` only exists when pixel values were supplied.
+            image_hidden_states=image_features if pixel_values is not None else None,
+        )
+# Multimodal Gemma3 with a language-modelling head: wraps `Gemma3Model` and adds a bias-free linear
+# layer from the text hidden size to the text vocabulary, plus the generation-specific input handling.
+# NOTE(review): the `custom_intro` below is the same text used for `Gemma3Model` and says
+# "without language modeling head", yet this class defines `self.lm_head` — confirm and reword.
+@auto_docstring(
+    custom_intro="""
+    The Base Gemma3 model which consists of a vision backbone and a language model without language modeling head.,
+    """
+)
+class Gemma3ForConditionalGeneration(Gemma3PreTrainedModel, GenerationMixin):
+    # Declares that the head weight and the language model's input embedding weight are tied.
+    _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"}
+    # we are filtering the logits/labels so we shouldn't divide the loss based on num_items_in_batch
+    # Fix: https://github.com/huggingface/transformers/issues/40564
+    accepts_loss_kwargs = False
+    def __init__(self, config: Gemma3Config):
+        super().__init__(config)
+        self.model = Gemma3Model(config)
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+        self.post_init()
+    # Thin delegate to `Gemma3Model.get_image_features`.
+    @auto_docstring
+    def get_image_features(self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]):
+        return self.model.get_image_features(pixel_values, **kwargs)
+    # Runs the multimodal backbone, projects the selected positions to logits and, when `labels` is
+    # given, computes a next-token cross-entropy loss restricted to positions the attention mask keeps.
+    # Unlike `Gemma3ForCausalLM.forward`, no logit soft-capping is applied here.
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        pixel_values: torch.FloatTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        token_type_ids: torch.LongTensor | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        labels: torch.LongTensor | None = None,
+        use_cache: bool | None = None,
+        logits_to_keep: int | torch.Tensor = 0,
+        **lm_kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | Gemma3CausalLMOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
+        Example:
+        ```python
+        >>> from PIL import Image
+        >>> import httpx
+        >>> from io import BytesIO
+        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration
+        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
+        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")
+        >>> messages = [
+        ...     {
+        ...         "role": "system",
+        ...         "content": [
+        ...             {"type": "text", "text": "You are a helpful assistant."}
+        ...         ]
+        ...     },
+        ...     {
+        ...         "role": "user", "content": [
+        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
+        ...             {"type": "text", "text": "Where is the cat standing?"},
+        ...         ]
+        ...     },
+        ... ]
+        >>> inputs = processor.apply_chat_template(
+        ...     messages,
+        ...     tokenize=True,
+        ...     return_dict=True,
+        ...     return_tensors="pt",
+        ...     add_generation_prompt=True
+        ... )
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
+        ```
+        """
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            token_type_ids=token_type_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            labels=labels,
+            return_dict=True,
+            **lm_kwargs,
+        )
+        # First entry of the backbone output (its last hidden state).
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        # An int keeps the last `logits_to_keep` positions; the default 0 gives slice(0, None), i.e. every position.
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        loss = None
+        if labels is not None:
+            # Upcast to float if we need to compute the loss to avoid potential precision issues
+            logits = logits.float()
+            # Next-token alignment: the logits at position t are scored against the label at position t + 1.
+            # NOTE(review): this presumably expects logits for the full sequence (logits_to_keep=0) — confirm.
+            shift_logits = logits[..., :-1, :]
+            shift_labels = labels[..., 1:]
+            if attention_mask is not None:
+                # we use the input attention mask to shift the logits and labels, because it is 2D.
+                # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
+                shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(logits.device)
+                # Boolean indexing keeps only positions whose mask value is non-zero (and flattens batch/sequence).
+                shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous()
+                shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
+            else:
+                shift_logits = shift_logits.contiguous()
+                shift_labels = shift_labels.contiguous()
+            # Flatten the tokens
+            loss_fct = nn.CrossEntropyLoss()
+            flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
+            flat_labels = shift_labels.view(-1).to(shift_logits.device)
+            loss = loss_fct(flat_logits, flat_labels)
+        return Gemma3CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+        )
+    # Builds the per-step model inputs for generation. Delegates to the parent implementation
+    # (without `pixel_values` and `labels`) and then decides whether image inputs belong in this step.
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        position_ids=None,
+        pixel_values=None,
+        attention_mask=None,
+        token_type_ids=None,
+        use_cache=True,
+        logits_to_keep=None,
+        labels=None,
+        is_first_iteration=False,
+        **kwargs,
+    ):
+        # Overwritten -- custom `pixel_values` handling
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            use_cache=use_cache,
+            logits_to_keep=logits_to_keep,
+            token_type_ids=token_type_ids,
+            is_first_iteration=is_first_iteration,
+            **kwargs,
+        )
+        # Pixel values are used only in the first iteration if available
+        # In subsequent iterations, they are already merged with text and cached
+        # NOTE: first iteration doesn't have to be prefill, it can be the first
+        # iteration with a question and cached system prompt (continue generate from cache). NOTE: use_cache=False needs pixel_values always
+        if is_first_iteration or not use_cache:
+            model_inputs["pixel_values"] = pixel_values
+        else:
+            # Don't pass to not apply bidirectional mask on top
+            model_inputs["token_type_ids"] = None
+        return model_inputs
+    # Builds the attention masks used during generation. Adds image-block ids when `token_type_ids`
+    # is given, then calls the module-level `create_masks_for_generate` (inside this body the name
+    # resolves to the module-level function, not to this method).
+    # `is_first_iteration` and the extra `kwargs` are accepted but not used here.
+    @staticmethod
+    def create_masks_for_generate(
+        config: PreTrainedConfig,
+        inputs_embeds: torch.Tensor,
+        attention_mask: torch.Tensor | None,
+        past_key_values: Cache | None,
+        position_ids: torch.Tensor | None,
+        token_type_ids: torch.Tensor | None = None,
+        is_first_iteration: bool | None = False,
+        **kwargs,
+    ) -> dict:
+        mask_kwargs = {
+            "config": config.get_text_config(),
+            "inputs_embeds": inputs_embeds,
+            "attention_mask": attention_mask,
+            "past_key_values": past_key_values,
+            "position_ids": position_ids,
+        }
+        if token_type_ids is not None:
+            # Per-token image-block ids (-1 for non-image tokens), handed to the mask builder.
+            mask_kwargs["block_sequence_ids"] = get_block_sequence_ids_for_mask(
+                token_type_ids, device=inputs_embeds.device
+            )
+        return create_masks_for_generate(**mask_kwargs)
+# Text-only sequence classifier. The class body adds no behaviour of its own: everything comes from
+# `GenericForSequenceClassification` and `Gemma3PreTrainedModel`; it only narrows the config type
+# and the declared input modalities.
+@auto_docstring(
+    custom_intro="""
+Gemma3TextForSequenceClassification is a text-only sequence classification model that works with Gemma3TextConfig.
+It uses the generic sequence classification implementation for efficiency and consistency."""
+)
+class Gemma3TextForSequenceClassification(GenericForSequenceClassification, Gemma3PreTrainedModel):
+    config: Gemma3TextConfig
+    # Overrides the ("image", "text") modalities declared on Gemma3PreTrainedModel.
+    input_modalities = ("text",)
+# Sequence classifier that also accepts image inputs. It declares no `config` type or
+# `input_modalities` of its own, so those come from its base classes.
+class Gemma3ForSequenceClassification(GenericForSequenceClassification, Gemma3PreTrainedModel):
+    # Declares the multimodal arguments (`pixel_values`, `token_type_ids`) explicitly in the signature
+    # and forwards everything, unchanged, to the parent `forward` as keyword arguments.
+    def forward(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        pixel_values: torch.FloatTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        token_type_ids: torch.LongTensor | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        labels: torch.LongTensor | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> SequenceClassifierOutputWithPast:
+        return super().forward(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            pixel_values=pixel_values,
+            token_type_ids=token_type_ids,
+            labels=labels,
+            **kwargs,
+        )
+# Public names exported by this module. Helper classes and functions defined above
+# (e.g. Gemma3DecoderLayer, Gemma3MultiModalProjector) are deliberately not listed.
+__all__ = [
+    "Gemma3PreTrainedModel",
+    "Gemma3TextModel",
+    "Gemma3ForCausalLM",
+    "Gemma3ForConditionalGeneration",
+    "Gemma3Model",
+    "Gemma3ForSequenceClassification",
+    "Gemma3TextForSequenceClassification",
+]

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/gemma3/modular_gemma3.py ADDED Viewed

	@@ -0,0 +1,941 @@

+# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from collections.abc import Callable
+from typing import Any, Optional
+import torch
+import torch.nn as nn
+from huggingface_hub.dataclasses import strict
+from ... import initialization as init
+from ...cache_utils import Cache, DynamicCache
+from ...configuration_utils import PreTrainedConfig
+from ...masking_utils import create_causal_mask, create_masks_for_generate, create_sliding_window_causal_mask
+from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling, SequenceClassifierOutputWithPast
+from ...modeling_rope_utils import (
+    ROPE_INIT_FUNCTIONS,
+    dynamic_rope_update,
+)
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ...utils.generic import maybe_autocast
+from ..gemma2.configuration_gemma2 import Gemma2Config
+from ..gemma2.modeling_gemma2 import (
+    Gemma2Attention,
+    Gemma2ForCausalLM,
+    Gemma2MLP,
+    Gemma2Model,
+    Gemma2PreTrainedModel,
+    Gemma2RMSNorm,
+    Gemma2RotaryEmbedding,
+    apply_rotary_pos_emb,
+    eager_attention_forward,
+)
+from ..paligemma.modeling_paligemma import (
+    PaliGemmaCausalLMOutputWithPast,
+    PaliGemmaForConditionalGeneration,
+    PaliGemmaModel,
+    PaligemmaModelOutputWithPast,
+)
+from ..siglip import SiglipVisionConfig
+# Module-level logger, named after this module.
+logger = logging.get_logger(__name__)
+@auto_docstring(checkpoint="google/gemma-3-4b-it")
+@strict
+class Gemma3TextConfig(Gemma2Config, PreTrainedConfig):
+    r"""
+    query_pre_attn_scalar (`float`, *optional*, defaults to 256):
+        scaling factor used on the attention scores
+    final_logit_softcapping (`float`, *optional*):
+        Scaling factor when applying tanh softcapping on the logits.
+    attn_logit_softcapping (`float`, *optional*):
+        Scaling factor when applying tanh softcapping on the attention scores.
+    use_bidirectional_attention (`bool`, *optional*, defaults to `False`):
+        If True, the model will attend to all text tokens instead of using a causal mask. This does not change
+        behavior for vision tokens.
+    ```python
+    >>> from transformers import Gemma3TextModel, Gemma3TextConfig
+    >>> # Initializing a Gemma3Text gemma3_text-7b style configuration
+    >>> configuration = Gemma3TextConfig()
+    >>> # Initializing a model from the gemma3_text-7b style configuration
+    >>> model = Gemma3TextModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+    model_type = "gemma3_text"
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.q_norm": "replicated_with_grad_allreduce",
+        "layers.*.self_attn.k_norm": "replicated_with_grad_allreduce",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    default_theta = {"global": 1_000_000.0, "local": 10_000.0}
+    vocab_size: int = 262_208
+    max_position_embeddings: int = 131_072
+    layer_types: list[str] | None = None
+    final_logit_softcapping: float | None = None
+    attn_logit_softcapping: float | None = None
+    rope_parameters: dict | None = None
+    use_bidirectional_attention: bool | None = False
+    def __post_init__(self, **kwargs):
+        if self.use_bidirectional_attention:
+            self.sliding_window = (self.sliding_window // 2) + 1  # due to fa we set exclusive bounds
+        # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub
+        self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 6)
+        if self.layer_types is None:
+            self.layer_types = [
+                "sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
+        PreTrainedConfig.__post_init__(**kwargs)
+    def convert_rope_params_to_dict(self, **kwargs):
+        rope_scaling = kwargs.pop("rope_scaling", None)
+        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters`
+        # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format
+        default_rope_params = {
+            "sliding_attention": {"rope_type": "default"},
+            "full_attention": {"rope_type": "default"},
+        }
+        self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params
+        if rope_scaling is not None:
+            self.rope_parameters["full_attention"].update(rope_scaling)
+        # Set default values if not present
+        if self.rope_parameters.get("full_attention") is None:
+            self.rope_parameters["full_attention"] = {"rope_type": "default"}
+        self.rope_parameters["full_attention"].setdefault(
+            "rope_theta", kwargs.pop("rope_theta", self.default_theta["global"])
+        )
+        if self.rope_parameters.get("sliding_attention") is None:
+            self.rope_parameters["sliding_attention"] = {"rope_type": "default"}
+        self.rope_parameters["sliding_attention"].setdefault(
+            "rope_theta", kwargs.pop("rope_local_base_freq", self.default_theta["local"])
+        )
+        # Standardize and validate the correctness of rotary position embeddings parameters
+        self.standardize_rope_params()
+        return kwargs
+@auto_docstring(checkpoint="google/gemma-3-4b-it")
+@strict
+class Gemma3Config(PreTrainedConfig):
+    r"""
+    mm_tokens_per_image (`int`, *optional*, defaults to 256):
+        The number of tokens per image embedding.
+    boi_token_index (`int`, *optional*, defaults to 255999):
+        The begin-of-image token index to wrap the image prompt.
+    eoi_token_index (`int`, *optional*, defaults to 256000):
+        The end-of-image token index to wrap the image prompt.
+    Example:
+    ```python
+    >>> from transformers import Gemma3ForConditionalGeneration, Gemma3Config, SiglipVisionConfig, Gemma3TextConfig
+    >>> # Initializing a Siglip-like vision config
+    >>> vision_config = SiglipVisionConfig()
+    >>> # Initializing a Gemma3 Text config
+    >>> text_config = Gemma3TextConfig()
+    >>> # Initializing a Gemma3 gemma-3-4b style configuration
+    >>> configuration = Gemma3Config(vision_config, text_config)
+    >>> # Initializing a model from the gemma-3-4b style configuration
+    >>> model = Gemma3TextConfig(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "gemma3"
+    attribute_map = {
+        "image_token_id": "image_token_index",
+        "boi_token_id": "boi_token_index",
+        "eoi_token_id": "eoi_token_index",
+    }
+    sub_configs = {
+        "text_config": Gemma3TextConfig,
+        "vision_config": SiglipVisionConfig,
+    }
+    text_config: Gemma3TextConfig | dict[str, Any] | None = None
+    vision_config: SiglipVisionConfig | dict[str, Any] | None = None
+    mm_tokens_per_image: int | None = 256
+    boi_token_index: int | None = 255_999
+    eoi_token_index: int | None = 256_000
+    image_token_index: int | None = 262_144
+    initializer_range: float | None = 0.02
+    tie_word_embeddings: bool | None = True
+    def __post_init__(self, **kwargs):
+        if self.text_config is None:
+            self.text_config = Gemma3TextConfig()
+            logger.info("text_config is None, using default Gemma3TextConfig text config.")
+        elif isinstance(self.text_config, dict):
+            self.text_config = Gemma3TextConfig(**self.text_config)
+        if isinstance(self.vision_config, dict):
+            self.vision_config = SiglipVisionConfig(**self.vision_config)
+        elif self.vision_config is None:
+            self.vision_config = SiglipVisionConfig()
+            logger.info("vision_config is None, using default SiglipVisionConfig vision config.")
+        super().__post_init__(**kwargs)
+# Gemma3-named subclass of `PaligemmaModelOutputWithPast`; adds no fields or behavior here.
+class Gemma3ModelOutputWithPast(PaligemmaModelOutputWithPast):
+    pass
+# Gemma3-named subclass of `PaliGemmaCausalLMOutputWithPast`; adds no fields or behavior here.
+class Gemma3CausalLMOutputWithPast(PaliGemmaCausalLMOutputWithPast):
+    pass
+class Gemma3TextScaledWordEmbedding(nn.Embedding):
+    """
+    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
+    """
+    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: float = 1.0):
+        super().__init__(num_embeddings, embedding_dim, padding_idx)
+        self.scalar_embed_scale = embed_scale
+        self.register_buffer("embed_scale", torch.tensor(embed_scale), persistent=False)
+    def forward(self, input_ids: torch.Tensor):
+        return super().forward(input_ids) * self.embed_scale.to(self.weight.dtype)
+# Gemma3 MLP: reuses `Gemma2MLP` unchanged; only the config type annotation differs.
+class Gemma3MLP(Gemma2MLP):
+    def __init__(self, config: Gemma3TextConfig):
+        super().__init__(config)
+# Gemma3 RMSNorm: forwards `dim` and `eps` to `Gemma2RMSNorm` without further changes.
+class Gemma3RMSNorm(Gemma2RMSNorm):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__(dim=dim, eps=eps)
+# Rotary position embedding that keeps a separate set of inverse frequencies (and a
+# separate attention scaling) for every layer type listed in `config.layer_types`.
+class Gemma3RotaryEmbedding(Gemma2RotaryEmbedding, nn.Module):
+    def __init__(self, config: Gemma3TextConfig):
+        # Only `nn.Module.__init__` is invoked here; all state is set up below.
+        nn.Module.__init__(self)
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+        self.config = config
+        # De-duplicated layer types; iteration order of a set is not guaranteed.
+        self.layer_types = list(set(config.layer_types))
+        self.rope_type = {}
+        for layer_type in self.layer_types:
+            rope_params = self.config.rope_parameters[layer_type]
+            if rope_params is None:
+                # Nothing configured for this layer type: no buffers are registered for it.
+                continue
+            self.rope_type[layer_type] = rope_params["rope_type"]
+            # "default" uses the static method below; any other type is looked up in ROPE_INIT_FUNCTIONS.
+            rope_init_fn: Callable = self.compute_default_rope_parameters
+            if self.rope_type[layer_type] != "default":
+                rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type[layer_type]]
+            curr_inv_freq, curr_attention_scaling = rope_init_fn(self.config, layer_type=layer_type)
+            # Non-persistent buffers: `<type>_inv_freq` plus a cloned `<type>_original_inv_freq` copy.
+            self.register_buffer(f"{layer_type}_inv_freq", curr_inv_freq, persistent=False)
+            self.register_buffer(f"{layer_type}_original_inv_freq", curr_inv_freq.clone(), persistent=False)
+            setattr(self, f"{layer_type}_attention_scaling", curr_attention_scaling)
+    @staticmethod
+    def compute_default_rope_parameters(
+        config: Gemma3TextConfig | None = None,
+        device: Optional["torch.device"] = None,
+        seq_len: int | None = None,
+        layer_type: str | None = None,
+    ) -> tuple["torch.Tensor", float]:
+        """
+        Computes the inverse frequencies according to the original RoPE implementation
+        Args:
+            config ([`~transformers.PreTrainedConfig`]):
+                The model configuration.
+            device (`torch.device`):
+                The device to use for initialization of the inverse frequencies.
+            seq_len (`int`, *optional*):
+                The current sequence length. Unused for this type of RoPE.
+            layer_type (`str`, *optional*):
+                The current layer type if the model has different RoPE parameters per type.
+                Should not be used unless `config.layer_types is not None`
+        Returns:
+            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
+        """
+        # Read the base frequency (`rope_theta`) configured for this layer type.
+        base = config.rope_parameters[layer_type]["rope_theta"]
+        # Prefer an explicit `head_dim`; otherwise derive it from hidden size and head count.
+        dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
+        attention_factor = 1.0  # Unused in this type of RoPE
+        # Compute the inverse frequencies
+        inv_freq = 1.0 / (
+            base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)
+        )
+        return inv_freq, attention_factor
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids, layer_type=None):
+        # Returns `(cos, sin)` for `layer_type`, both cast to the dtype of `x`.
+        # `x` is only read for its device and dtype.
+        inv_freq = getattr(self, f"{layer_type}_inv_freq")
+        attention_scaling = getattr(self, f"{layer_type}_attention_scaling")
+        # (n_freq,) -> (batch, n_freq, 1), moved to the device of `x`.
+        inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        # (batch, seq) -> (batch, 1, seq)
+        position_ids_expanded = position_ids[:, None, :].float()
+        # "mps" (and any non-string device type) falls back to "cpu" for the autocast context.
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
+            # Outer product of frequencies and positions, then (batch, seq, n_freq).
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            # Duplicate along the last axis so the angle table has 2 * n_freq entries per position.
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * attention_scaling
+            sin = emb.sin() * attention_scaling
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+# Weird way to inherit but otherwise the sliding window gets defined first and can't access `is_sliding`
+class Gemma3Attention(Gemma2Attention):
+    def __init__(self, config: Gemma3TextConfig, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None
+        self.is_sliding = self.layer_type == "sliding_attention"
+        self.is_causal = not self.config.use_bidirectional_attention
+        self.q_norm = Gemma3RMSNorm(dim=config.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = Gemma3RMSNorm(dim=config.head_dim, eps=config.rms_norm_eps)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: torch.Tensor = None,
+        attention_mask: torch.Tensor | None = None,
+        past_key_values: Cache | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        query_states = self.q_norm(query_states)
+        key_states = self.k_norm(key_states)
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        if past_key_values is not None:
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=self.attention_dropout if self.training else 0.0,
+            scaling=self.scaling,
+            sliding_window=self.sliding_window,
+            **kwargs,
+        )
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+class Gemma3DecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: Gemma3TextConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.layer_idx = layer_idx
+        self.self_attn = Gemma3Attention(config=config, layer_idx=layer_idx)
+        self.mlp = Gemma3MLP(config)
+        self.input_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)
+        self.pre_feedforward_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)
+        self.post_feedforward_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: torch.Tensor = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.FloatTensor, tuple[torch.FloatTensor, torch.FloatTensor] | None]:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            position_embeddings=position_embeddings,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            **kwargs,
+        )
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+        residual = hidden_states
+        hidden_states = self.pre_feedforward_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_feedforward_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+# NOTE(review): not referenced anywhere in the visible part of this file; presumably a
+# placeholder that blanks out an inherited docstring constant — confirm before removing.
+GEMMA3_START_DOCSTRING = None
+# Shared base class for the Gemma3 models: declares loading metadata and the
+# Gemma3-specific weight initialisation.
+class Gemma3PreTrainedModel(Gemma2PreTrainedModel):
+    base_model_prefix = "model"
+    input_modalities = ("image", "text")
+    _no_split_modules = [
+        "Gemma3DecoderLayer",
+        "SiglipVisionEmbeddings",
+        "SiglipEncoderLayer",
+        "SiglipMultiheadAttentionPoolingHead",
+    ]
+    @torch.no_grad()
+    def _init_weights(self, module):
+        # Run the generic `PreTrainedModel` initialisation first, then apply the
+        # module-specific overrides below.
+        PreTrainedModel._init_weights(self, module)
+        if isinstance(module, Gemma3MultiModalProjector):
+            init.zeros_(module.mm_input_projection_weight)
+        # We initialize with 0s to be 1 centered as the RMSNorm here does (1 + weight)
+        elif "RMSNorm" in module.__class__.__name__:
+            init.zeros_(module.weight)
+        elif isinstance(module, Gemma3TextScaledWordEmbedding):
+            # Re-fill the scale buffer from the plain Python value stored on the module.
+            init.constant_(module.embed_scale, module.scalar_embed_scale)
+        elif isinstance(module, Gemma3RotaryEmbedding):
+            # Recompute the inverse frequencies for each layer type, using the same
+            # init-function selection as `Gemma3RotaryEmbedding.__init__`.
+            for layer_type in module.layer_types:
+                rope_init_fn = module.compute_default_rope_parameters
+                if module.rope_type[layer_type] != "default":
+                    rope_init_fn = ROPE_INIT_FUNCTIONS[module.rope_type[layer_type]]
+                curr_inv_freq, _ = rope_init_fn(module.config, layer_type=layer_type)
+                init.copy_(getattr(module, f"{layer_type}_inv_freq"), curr_inv_freq)
+                init.copy_(getattr(module, f"{layer_type}_original_inv_freq"), curr_inv_freq)
+def _bidirectional_window_overlay(sliding_window: int) -> Callable[[int, int, int, int], bool]:
+    """
+    Enables a bidirectional mask within the sliding window.
+    """
+    def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
+        """A token can attend to any other token if their absolute distance is within
+        the (exclusive) sliding window size (distance < sliding_window)."""
+        return abs(q_idx - kv_idx) < sliding_window
+    return inner_mask
+# Text-only Gemma3 decoder stack: scaled token embeddings, one attention mask and one
+# set of rotary embeddings per layer type, then the decoder layers and a final norm.
+class Gemma3TextModel(Gemma2Model):
+    config: Gemma3TextConfig
+    input_modalities = ("text",)
+    def __init__(self, config: Gemma3TextConfig):
+        super().__init__(config)
+        # Gemma3 downcasts the below to bfloat16, causing sqrt(3072)=55.4256 to become 55.5. See https://github.com/huggingface/transformers/pull/29402
+        self.embed_tokens = Gemma3TextScaledWordEmbedding(
+            config.vocab_size, config.hidden_size, self.padding_idx, embed_scale=self.config.hidden_size**0.5
+        )
+    def forward(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        use_cache: bool | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        # The XOR is true when both inputs are missing or both are given; either case is rejected.
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        # Create a cache lazily when caching is requested but none was passed in.
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+        if position_ids is None:
+            # Default positions continue from the number of tokens already in the cache.
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            position_ids = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device) + past_seen_tokens
+            position_ids = position_ids.unsqueeze(0)
+        # It may already have been prepared by e.g. `generate`
+        # (a dict `attention_mask` is used as the per-layer-type mask mapping as-is).
+        if not isinstance(causal_mask_mapping := attention_mask, dict):
+            # Prepare mask arguments
+            mask_kwargs = {
+                "config": self.config,
+                "inputs_embeds": inputs_embeds,
+                "attention_mask": attention_mask,
+                "past_key_values": past_key_values,
+                "position_ids": position_ids,
+            }
+            sliding_mask_kwargs = mask_kwargs.copy()
+            if self.config.use_bidirectional_attention:
+                # Full-attention layers: OR the mask with an always-true predicate.
+                # Sliding layers: OR it with a symmetric window predicate.
+                mask_kwargs["or_mask_function"] = lambda *args: torch.tensor(True, dtype=torch.bool)
+                sliding_mask_kwargs["or_mask_function"] = _bidirectional_window_overlay(self.config.sliding_window)
+            # Create the masks
+            causal_mask_mapping = {
+                "full_attention": create_causal_mask(**mask_kwargs),
+                "sliding_attention": create_sliding_window_causal_mask(**sliding_mask_kwargs),
+            }
+        # embed positions
+        hidden_states = inputs_embeds
+        # One rotary-embedding result per distinct layer type.
+        position_embeddings = {}
+        for layer_type in set(self.config.layer_types):
+            position_embeddings[layer_type] = self.rotary_emb(hidden_states, position_ids, layer_type)
+        # Each layer receives the mask and rotary embeddings that match its own layer type.
+        for i, decoder_layer in enumerate(self.layers[: self.config.num_hidden_layers]):
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask_mapping[self.config.layer_types[i]],
+                position_embeddings=position_embeddings[self.config.layer_types[i]],
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                **kwargs,
+            )
+        hidden_states = self.norm(hidden_states)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+# Causal-LM wrapper: same as `Gemma2ForCausalLM`, with the backbone replaced by `Gemma3TextModel`.
+class Gemma3ForCausalLM(Gemma2ForCausalLM):
+    config: Gemma3TextConfig
+    def __init__(self, config: Gemma3TextConfig):
+        super().__init__(config)
+        # Overwrite the backbone created by the parent constructor.
+        self.model = Gemma3TextModel(config)
+class Gemma3MultiModalProjector(nn.Module):
+    def __init__(self, config: Gemma3Config):
+        super().__init__()
+        self.mm_input_projection_weight = nn.Parameter(
+            torch.zeros(config.vision_config.hidden_size, config.text_config.hidden_size)
+        )
+        self.mm_soft_emb_norm = Gemma3RMSNorm(
+            config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps
+        )
+        self.patches_per_image = int(config.vision_config.image_size // config.vision_config.patch_size)
+        self.tokens_per_side = int(config.mm_tokens_per_image**0.5)
+        self.kernel_size = self.patches_per_image // self.tokens_per_side
+        self.avg_pool = nn.AvgPool2d(kernel_size=self.kernel_size, stride=self.kernel_size)
+    def forward(self, vision_outputs: torch.Tensor):
+        batch_size, _, hidden_size = vision_outputs.shape
+        reshaped_vision_outputs = vision_outputs.transpose(1, 2)
+        reshaped_vision_outputs = reshaped_vision_outputs.reshape(
+            batch_size, hidden_size, self.patches_per_image, self.patches_per_image
+        )
+        reshaped_vision_outputs = reshaped_vision_outputs.contiguous()
+        pooled_vision_outputs = self.avg_pool(reshaped_vision_outputs)
+        pooled_vision_outputs = pooled_vision_outputs.flatten(2)
+        pooled_vision_outputs = pooled_vision_outputs.transpose(1, 2)
+        normed_vision_outputs = self.mm_soft_emb_norm(pooled_vision_outputs)
+        projected_vision_outputs = torch.matmul(normed_vision_outputs, self.mm_input_projection_weight)
+        return projected_vision_outputs.type_as(vision_outputs)
+def get_block_sequence_ids_for_mask(token_type_ids: torch.Tensor, device: torch.device | None = None) -> torch.Tensor:
+    # First find where a new image block starts: 1 if image and previous not image
+    # The images cannot attend to future images, but can attend to all prev images and to itself bidirectionally
+    is_image = (token_type_ids == 1).to(device=device)
+    is_previous_image = nn.functional.pad(is_image, (1, 0), value=0)[:, :-1]
+    new_image_start = is_image & ~is_previous_image
+    group_ids = torch.cumsum(new_image_start.int(), dim=1) - 1
+    block_sequence_ids = torch.where(is_image, group_ids, -1)
+    return block_sequence_ids
+# Multimodal Gemma3 backbone: merges projected image features into the token
+# embeddings, builds the per-layer-type attention masks and runs the language model.
+class Gemma3Model(PaliGemmaModel):
+    # we are filtering the logits/labels so we shouldn't divide the loss based on num_items_in_batch
+    accepts_loss_kwargs = False
+    def __init__(self, config: Gemma3Config):
+        super().__init__(config)
+        # NOTE(review): presumably set by the parent constructor and unused here — confirm.
+        del self.text_config_dtype
+    @can_return_tuple
+    @auto_docstring(custom_intro="Projects the last hidden state from the vision model into language model space.")
+    def get_image_features(
+        self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
+    ) -> tuple | BaseModelOutputWithPooling:
+        vision_outputs = self.vision_tower(pixel_values=pixel_values, return_dict=True, **kwargs)
+        last_hidden_state = vision_outputs.last_hidden_state
+        # The projected features are stored on (and overwrite) `pooler_output` of the vision output.
+        vision_outputs.pooler_output = self.multi_modal_projector(last_hidden_state)
+        return vision_outputs
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        pixel_values: torch.FloatTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        token_type_ids: torch.LongTensor | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        labels: torch.LongTensor | None = None,
+        use_cache: bool | None = None,
+        **lm_kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | Gemma3ModelOutputWithPast:
+        # NOTE: `labels` is accepted but not read anywhere in this method.
+        # The XOR is true when both inputs are missing or both are given; either case is rejected.
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        # Replace the image token id with 0 (PAD) if the image token is OOV, to avoid index errors
+        if input_ids is not None and self.config.image_token_id >= self.vocab_size:
+            special_image_mask = input_ids == self.config.image_token_id
+            # Work on a copy so the caller's `input_ids` is left untouched.
+            llm_input_ids = input_ids.clone()
+            llm_input_ids[special_image_mask] = 0
+        else:
+            llm_input_ids = input_ids
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(llm_input_ids)
+        # Merge text and images
+        if pixel_values is not None:
+            image_features = self.get_image_features(pixel_values, return_dict=True).pooler_output
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            # The mask is computed from the original `input_ids`, not the PAD-substituted copy.
+            special_image_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            # Overwrite the placeholder positions with the projected image features.
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+        # It may already have been prepared by e.g. `generate`
+        # (a dict `attention_mask` is forwarded to the language model as-is).
+        if not isinstance(causal_mask_mapping := attention_mask, dict):
+            mask_kwargs = {
+                "config": self.config.get_text_config(),
+                "inputs_embeds": inputs_embeds,
+                "attention_mask": attention_mask,
+                "past_key_values": past_key_values,
+                "position_ids": position_ids,
+            }
+            if token_type_ids is not None:
+                # Per-token image-block ids, passed to the mask builders as `block_sequence_ids`.
+                mask_kwargs["block_sequence_ids"] = get_block_sequence_ids_for_mask(
+                    token_type_ids, device=inputs_embeds.device
+                )
+            # Create the masks
+            sliding_mask_kwargs = mask_kwargs.copy()
+            causal_mask_mapping = {
+                "full_attention": create_causal_mask(**mask_kwargs),
+                "sliding_attention": create_sliding_window_causal_mask(**sliding_mask_kwargs),
+            }
+        # The language model is always called with embeddings and the mask mapping, never `input_ids`.
+        outputs = self.language_model(
+            attention_mask=causal_mask_mapping,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            return_dict=True,
+            **lm_kwargs,
+        )
+        return Gemma3ModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            # `image_features` only exists when `pixel_values` was given.
+            image_hidden_states=image_features if pixel_values is not None else None,
+        )
+class Gemma3ForConditionalGeneration(PaliGemmaForConditionalGeneration):
+    # we are filtering the logits/labels so we shouldn't divide the loss based on num_items_in_batch
+    # Fix: https://github.com/huggingface/transformers/issues/40564
+    accepts_loss_kwargs = False
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        pixel_values: torch.FloatTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        token_type_ids: torch.LongTensor | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        labels: torch.LongTensor | None = None,
+        use_cache: bool | None = None,
+        logits_to_keep: int | torch.Tensor = 0,
+        **lm_kwargs: Unpack[TransformersKwargs],
+    ) -> tuple | Gemma3CausalLMOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
+        Example:
+        ```python
+        >>> from PIL import Image
+        >>> import httpx
+        >>> from io import BytesIO
+        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration
+        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
+        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")
+        >>> messages = [
+        ...     {
+        ...         "role": "system",
+        ...         "content": [
+        ...             {"type": "text", "text": "You are a helpful assistant."}
+        ...         ]
+        ...     },
+        ...     {
+        ...         "role": "user", "content": [
+        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
+        ...             {"type": "text", "text": "Where is the cat standing?"},
+        ...         ]
+        ...     },
+        ... ]
+        >>> inputs = processor.apply_chat_template(
+        ...     messages,
+        ...     tokenize=True,
+        ...     return_dict=True,
+        ...     return_tensors="pt",
+        ...     add_generation_prompt=True
+        ... )
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
+        ```
+        """
+        # Delegate to the wrapped backbone. `return_dict=True` is forced so the result can be read by
+        # attribute below (`past_key_values`, `hidden_states`, `attentions`, `image_hidden_states`).
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            token_type_ids=token_type_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            labels=labels,
+            return_dict=True,
+            **lm_kwargs,
+        )
+        # The first output entry is what gets projected by the LM head.
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        # An int `logits_to_keep` selects the trailing N positions (0 gives `slice(0, None)`, i.e. every
+        # position); anything else is used directly as the index along the sequence dimension.
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        loss = None
+        if labels is not None:
+            # Upcast to float if we need to compute the loss to avoid potential precision issues
+            logits = logits.float()
+            # Next-token objective: the logits at position i are scored against the label at position i + 1.
+            shift_logits = logits[..., :-1, :]
+            shift_labels = labels[..., 1:]
+            if attention_mask is not None:
+                # we use the input attention mask to shift the logits and labels, because it is 2D.
+                # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
+                shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(logits.device)
+                # Boolean indexing drops masked-out positions and flattens batch and sequence together.
+                shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous()
+                shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
+            else:
+                shift_logits = shift_logits.contiguous()
+                shift_labels = shift_labels.contiguous()
+            # Flatten the tokens
+            # Default-constructed cross entropy; the class-level `accepts_loss_kwargs = False` goes with
+            # this choice since the loss is not rescaled by an externally supplied item count here.
+            loss_fct = nn.CrossEntropyLoss()
+            flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
+            flat_labels = shift_labels.view(-1).to(shift_logits.device)
+            loss = loss_fct(flat_logits, flat_labels)
+        # Note: when `labels` is given, the returned `logits` are the float-upcast, unshifted logits.
+        return Gemma3CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+        )
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        position_ids=None,
+        pixel_values=None,
+        attention_mask=None,
+        token_type_ids=None,
+        use_cache=True,
+        logits_to_keep=None,
+        labels=None,
+        is_first_iteration=False,
+        **kwargs,
+    ):
+        # Overwritten -- custom `pixel_values` handling
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            use_cache=use_cache,
+            logits_to_keep=logits_to_keep,
+            token_type_ids=token_type_ids,
+            is_first_iteration=is_first_iteration,
+            **kwargs,
+        )
+        # Pixel values are used only in the first iteration if available
+        # In subsequent iterations, they are already merged with text and cached
+        # NOTE: first iteration doesn't have to be prefill, it can be the first
+        # iteration with a question and cached system prompt (continue generate from cache). NOTE: use_cache=False needs pixel_values always
+        if is_first_iteration or not use_cache:
+            model_inputs["pixel_values"] = pixel_values
+        else:
+            # Don't pass to not apply bidirectional mask on top
+            model_inputs["token_type_ids"] = None
+        return model_inputs
+    def create_masks_for_generate(
+        config: PreTrainedConfig,
+        inputs_embeds: torch.Tensor,
+        attention_mask: torch.Tensor | None,
+        past_key_values: Cache | None,
+        position_ids: torch.Tensor | None,
+        token_type_ids: torch.Tensor | None = None,
+        is_first_iteration: bool | None = False,
+        **kwargs,
+    ) -> dict:
+        mask_kwargs = {
+            "config": config.get_text_config(),
+            "inputs_embeds": inputs_embeds,
+            "attention_mask": attention_mask,
+            "past_key_values": past_key_values,
+            "position_ids": position_ids,
+        }
+        if token_type_ids is not None:
+            mask_kwargs["block_sequence_ids"] = get_block_sequence_ids_for_mask(
+                token_type_ids, device=inputs_embeds.device
+            )
+        return create_masks_for_generate(**mask_kwargs)
+@auto_docstring(
+    custom_intro="""
+Gemma3TextForSequenceClassification is a text-only sequence classification model that works with Gemma3TextConfig.
+It uses the generic sequence classification implementation for efficiency and consistency."""
+)
+class Gemma3TextForSequenceClassification(GenericForSequenceClassification, Gemma3PreTrainedModel):
+    # All behaviour comes from the two base classes; this subclass only pins the config type and
+    # the supported input modalities.
+    config: Gemma3TextConfig
+    # Text is the only declared input modality for this variant.
+    input_modalities = ("text",)
+class Gemma3ForSequenceClassification(GenericForSequenceClassification, Gemma3PreTrainedModel):
+    """
+    Sequence classification model whose `forward` additionally accepts `pixel_values` and
+    `token_type_ids` and hands them to the generic sequence classification implementation.
+    """
+    def forward(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        pixel_values: torch.FloatTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        token_type_ids: torch.LongTensor | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        labels: torch.LongTensor | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> SequenceClassifierOutputWithPast:
+        """
+        Forward every argument, by keyword, to the parent `forward`.
+        The explicit signature exists so that `pixel_values` and `token_type_ids` are named parameters
+        of this model; no extra computation happens here.
+        """
+        return super().forward(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            pixel_values=pixel_values,
+            token_type_ids=token_type_ids,
+            labels=labels,
+            **kwargs,
+        )
+__all__ = [
+    "Gemma3Config",
+    "Gemma3TextConfig",
+    "Gemma3PreTrainedModel",
+    "Gemma3TextModel",
+    "Gemma3ForCausalLM",
+    "Gemma3ForConditionalGeneration",
+    "Gemma3Model",
+    "Gemma3ForSequenceClassification",
+    "Gemma3TextForSequenceClassification",
+]

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/gemma3/processing_gemma3.py ADDED Viewed

	@@ -0,0 +1,165 @@

+# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import re
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput, make_nested_list_of_images
+from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import auto_docstring, to_py_obj
+class Gemma3ProcessorKwargs(ProcessingKwargs, total=False):
+    # Default keyword arguments for the processor, grouped by the component that consumes them.
+    _defaults = {
+        "text_kwargs": {
+            # No padding unless the caller asks for it.
+            "padding": False,
+            # By default the processor output includes `token_type_ids` (see `Gemma3Processor.__call__`).
+            "return_mm_token_type_ids": True,
+        },
+        "images_kwargs": {
+            "do_convert_rgb": True,
+            # Pan-and-scan cropping is off by default; the three values below only configure it.
+            "do_pan_and_scan": False,
+            "pan_and_scan_min_crop_size": 256,
+            "pan_and_scan_max_num_crops": 4,
+            "pan_and_scan_min_ratio_to_activate": 1.2,
+        },
+    }
+@auto_docstring
+class Gemma3Processor(ProcessorMixin):
+    def __init__(
+        self,
+        image_processor,
+        tokenizer,
+        chat_template=None,
+        image_seq_length: int = 256,
+        **kwargs,
+    ):
+        self.image_seq_length = image_seq_length
+        self.image_token_id = tokenizer.image_token_id
+        self.boi_token = tokenizer.boi_token
+        self.image_token = tokenizer.image_token
+        image_tokens_expanded = "".join([tokenizer.image_token] * image_seq_length)
+        self.full_image_sequence = f"\n\n{tokenizer.boi_token}{image_tokens_expanded}{tokenizer.eoi_token}\n\n"
+        super().__init__(
+            image_processor=image_processor,
+            tokenizer=tokenizer,
+            chat_template=chat_template,
+            **kwargs,
+        )
+    @auto_docstring
+    def __call__(
+        self,
+        images: ImageInput | None = None,
+        text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
+        **kwargs: Unpack[Gemma3ProcessorKwargs],
+    ) -> BatchFeature:
+        if text is None and images is None:
+            raise ValueError("Provide at least one of `text` or `images`.")
+        output_kwargs = self._merge_kwargs(
+            Gemma3ProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        if isinstance(text, str):
+            text = [text]
+        elif not isinstance(text, list) and not isinstance(text[0], str):
+            raise TypeError("Invalid input text. Please provide a string, or a list of strings")
+        image_inputs = {}
+        if images is not None:
+            images = self.image_processor.fetch_images(images)
+            batched_images = make_nested_list_of_images(images)
+            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
+            # Create empty text to be replaced with placeholders
+            if not text:
+                text = [" ".join([self.boi_token] * len(images)) for images in batched_images]
+            if len(batched_images) != len(text):
+                raise ValueError(
+                    f"Received inconsistently sized batches of images ({len(batched_images)}) and text ({len(text)})."
+                )
+            # Replace image tokens by the full expanded sequence
+            num_crops = to_py_obj(image_inputs.pop("num_crops"))
+            batch_num_crops = [[num_crops.pop(0) for _ in range(len(images))] for images in batched_images]
+            for batch_idx, (prompt, images, num_crops) in enumerate(zip(text, batched_images, batch_num_crops)):
+                image_indexes = [m.start() for m in re.finditer(self.boi_token, prompt)]
+                if len(images) != len(image_indexes):
+                    raise ValueError(
+                        f"Prompt contained {len(image_indexes)} image tokens but received {len(images)} images."
+                    )
+                # Insert additional image tokens for Pan-and-Scan crops
+                for num, idx in reversed(list(zip(num_crops, image_indexes))):
+                    if num:
+                        formatted_image_text = (
+                            f"Here is the original image {self.boi_token} and here are some crops to help you see better "
+                            + " ".join([self.boi_token] * num)
+                        )
+                        prompt = prompt[:idx] + formatted_image_text + prompt[idx + len(self.boi_token) :]
+                        text[batch_idx] = prompt
+            # Expand placeholder image tokens to the full image token sequence
+            text = [prompt.replace(self.boi_token, self.full_image_sequence) for prompt in text]
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
+        text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"])
+        self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
+        if return_mm_token_type_ids:
+            text_inputs["token_type_ids"] = self.create_mm_token_type_ids(text_inputs["input_ids"])
+        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
+    def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
+        """
+        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
+        Args:
+            image_sizes (`list[list[int]]`, *optional*):
+                The input sizes formatted as (height, width) per each image.
+        Returns:
+            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
+            input modalities, along with other useful data.
+        """
+        vision_data = {}
+        if image_sizes is not None:
+            # NOTE: no image cropping supported yet
+            num_image_tokens = [self.image_seq_length] * len(image_sizes)
+            num_image_patches = [1] * len(image_sizes)
+            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
+        return MultiModalData(**vision_data)
+    @property
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names + ["token_type_ids"]
+        image_processor_input_names = self.image_processor.model_input_names
+        image_processor_input_names = [name for name in image_processor_input_names if name != "num_crops"]
+        return list(tokenizer_input_names + image_processor_input_names)
+__all__ = ["Gemma3Processor"]

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/youtu/__init__.py ADDED Viewed

	@@ -0,0 +1,27 @@

+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+if TYPE_CHECKING:
+    # Only static type checkers take this branch; it exposes the submodule symbols to them.
+    from .configuration_youtu import *
+    from .modeling_youtu import *
+else:
+    import sys
+    # At runtime the module object registered under this package name is swapped for a `_LazyModule`
+    # built from this file's path (presumably so submodules are imported on first attribute access).
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/youtu/configuration_youtu.py ADDED Viewed

	@@ -0,0 +1,107 @@

+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/youtu/modular_youtu.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_youtu.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2026 the Tencent and HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from huggingface_hub.dataclasses import strict
+from ...configuration_utils import PreTrainedConfig
+from ...modeling_rope_utils import RopeParameters
+from ...utils import auto_docstring
+@auto_docstring(checkpoint="tencent/Youtu-LLM-2B")
+@strict
+class YoutuConfig(PreTrainedConfig):
+    r"""
+    rope_interleave (`bool`, *optional*, defaults to `True`):
+        Whether to interleave the rotary position embeddings.
+    embedding_initializer_range (`float`, *optional*):
+        The standard deviation of the truncated_normal_initializer for initializing all embedding matrices.
+        When unset, it defaults to twice `initializer_range`.
+    ```python
+    >>> from transformers import YoutuModel, YoutuConfig
+    >>> # Initializing a Youtu-LLM-2B style configuration
+    >>> configuration = YoutuConfig()
+    >>> # Initializing a model from the configuration
+    >>> model = YoutuModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "youtu"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Tensor-parallel plan: only the MLP projections are listed here.
+    base_model_tp_plan = {
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    # Pipeline-parallel plan: (input names, output names) per top-level submodule.
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+    attribute_map = {}
+    vocab_size: int = 128256
+    hidden_size: int = 2048
+    intermediate_size: int = 6144
+    num_hidden_layers: int = 32
+    num_attention_heads: int = 16
+    num_key_value_heads: int = 16
+    kv_lora_rank: int = 512
+    q_lora_rank: int | None = 1536
+    qk_rope_head_dim: int = 64
+    v_head_dim: int | None = 128
+    qk_nope_head_dim: int = 128
+    hidden_act: str = "silu"
+    max_position_embeddings: int = 131072
+    initializer_range: float | None = None
+    rms_norm_eps: float = 1e-6
+    use_cache: bool = True
+    pad_token_id: int | None = None
+    bos_token_id: int | None = 128000
+    eos_token_id: int | list[int] | None = 128001
+    tie_word_embeddings: bool = True
+    rope_parameters: RopeParameters | dict | None = None
+    rope_interleave: bool | None = True
+    attention_bias: bool = False
+    attention_dropout: float | int | None = 0.0
+    embedding_initializer_range: float | None = None
+    def __post_init__(self, **kwargs):
+        # Derive the weight init std from the model width when it is not given explicitly:
+        # 2 / sqrt(5 * hidden_size), with 0.02 as the fallback for a zero hidden size.
+        if self.initializer_range is None:
+            if self.hidden_size != 0:
+                self.initializer_range = 2.0 / (5.0 * self.hidden_size) ** 0.5
+            else:
+                self.initializer_range = 0.02
+        # Embeddings default to twice the general init std. Because of the `or`, any falsy value
+        # (including an explicit 0.0) is replaced by the default as well.
+        self.embedding_initializer_range = self.embedding_initializer_range or 2.0 * self.initializer_range
+        if self.num_key_value_heads is None:
+            self.num_key_value_heads = self.num_attention_heads
+        # Derived sizes: the full query/key head width is the non-rotary part plus the rotary part,
+        # while `head_dim` is set to the rotary part only.
+        self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim
+        self.head_dim = self.qk_rope_head_dim
+        super().__post_init__(**kwargs)
+__all__ = ["YoutuConfig"]

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/youtu/modeling_youtu.py ADDED Viewed

	@@ -0,0 +1,607 @@

+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/youtu/modular_youtu.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_youtu.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2026 the Tencent and HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from collections.abc import Callable
+from typing import Optional
+import torch
+import torch.nn.functional as F
+from torch import nn
+from ... import initialization as init
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
+from ...masking_utils import create_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.generic import is_flash_attention_requested, maybe_autocast, merge_with_config_defaults
+from ...utils.output_capturing import capture_outputs
+from .configuration_youtu import YoutuConfig
+@use_kernel_forward_from_hub("RMSNorm")
+class YoutuRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps: float = 1e-6) -> None:
+        """
+        YoutuRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+class YoutuRotaryEmbedding(nn.Module):
+    """
+    Produces the cos/sin tables for rotary position embeddings.
+    The inverse frequencies are computed once at construction (by the default rule below or by the
+    function registered for the configured `rope_type`) and stored as non-persistent buffers.
+    """
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+    def __init__(self, config: YoutuConfig, device=None):
+        super().__init__()
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+        self.config = config
+        self.rope_type = self.config.rope_parameters["rope_type"]
+        # "default" uses the static method below; any other type is looked up in the shared registry.
+        rope_init_fn: Callable = self.compute_default_rope_parameters
+        if self.rope_type != "default":
+            rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
+        # Not saved in checkpoints (`persistent=False`); a pristine copy is kept alongside the working one.
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
+    @staticmethod
+    def compute_default_rope_parameters(
+        config: YoutuConfig | None = None,
+        device: Optional["torch.device"] = None,
+        seq_len: int | None = None,
+    ) -> tuple["torch.Tensor", float]:
+        """
+        Computes the inverse frequencies according to the original RoPE implementation
+        Args:
+            config ([`~transformers.PreTrainedConfig`]):
+                The model configuration.
+            device (`torch.device`):
+                The device to use for initialization of the inverse frequencies.
+            seq_len (`int`, *optional*):
+                The current sequence length. Unused for this type of RoPE.
+        Returns:
+            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
+        """
+        # NOTE(review): `config` defaults to None in the signature but is dereferenced unconditionally.
+        base = config.rope_parameters["rope_theta"]
+        # Prefer an explicit `head_dim` on the config; otherwise fall back to hidden_size / num_heads.
+        dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
+        attention_factor = 1.0  # Unused in this type of RoPE
+        # Compute the inverse frequencies
+        # One frequency per channel pair: 1 / base^(2i / dim) for i in [0, dim / 2).
+        inv_freq = 1.0 / (
+            base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)
+        )
+        return inv_freq, attention_factor
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        # `x` is only used for its device and dtype. `position_ids` is indexed as a 2-D tensor whose
+        # first dimension sets the batch size of the expanded frequencies.
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+        # Device type for the autocast context; "mps" (or a non-string type) falls back to "cpu".
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
+            # Outer product of frequencies and positions, then moved so the frequency axis is last.
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            # Duplicate the angles so the last dimension is twice the number of frequencies.
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+class YoutuMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+@use_kernel_func_from_hub("rotary_pos_emb")
+def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: torch.Tensor | None,
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        attn_weights = attn_weights + attention_mask
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    return attn_output, attn_weights
+def apply_rotary_pos_emb_interleave(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    r"""
+    Applies Rotary Position Embedding to query and key tensors whose last dimension stores the two components of
+    each rotary pair next to each other (interleaved layout).
+    Before rotating, the last dimension of `q` and `k` is regrouped from `[x0, x1, x2, x3, ...]` to
+    `[x0, x2, ..., x1, x3, ...]`, i.e. into the two-halves layout that `rotate_half` is applied to.
+    TODO let's just use the original freqcis computation to not have the view
+    transpose + reshape! This is not optimized!
+    Args:
+        q (`torch.Tensor`): The query tensor. Must be 4-dimensional with an even last dimension.
+        k (`torch.Tensor`): The key tensor. Must be 4-dimensional with an even last dimension.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Not read by this function; `cos` and `sin` are used exactly as passed in. The parameter is kept in the
+            signature only.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The dimension along which `cos` and `sin` are unsqueezed so that they broadcast against `q` and `k`.
+            For example, if `cos` and `sin` have the shape [batch_size, seq_len, head_dim] and `q` and `k` have the
+            shape [batch_size, heads, seq_len, head_dim], then `unsqueeze_dim=1` makes them broadcastable.
+            Similarly, if `q` and `k` have the shape [batch_size, seq_len, heads, head_dim], then set
+            `unsqueeze_dim=2`.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    # De-interleave: view the last dim as (d // 2, 2) pairs, swap the two axes, and flatten back to d, so that
+    # all first components come first and all second components follow.
+    b, h, s, d = q.shape
+    q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
+    # The shape is unpacked again because `k` may differ from `q` (e.g. in its head dimension).
+    b, h, s, d = k.shape
+    k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+def yarn_get_mscale(scale=1, mscale=1):
+    """
+    Return the magnitude-scaling factor `0.1 * mscale * ln(scale) + 1.0`, or `1.0` when `scale <= 1`.
+    `YoutuAttention.__init__` multiplies its softmax scaling by the square of this value when a non-default rope type
+    with a non-zero `mscale_all_dim` is configured.
+    """
+    # No correction is applied unless the context is actually being extended (scale > 1).
+    if scale <= 1:
+        return 1.0
+    return 0.1 * mscale * math.log(scale) + 1.0
+class YoutuAttention(nn.Module):
+    """
+    Multi-head attention with low-rank compressed key/value (and optionally query) projections.
+    Every query/key head is split into a `qk_nope_head_dim` slice that bypasses the rotary embedding and a
+    `qk_rope_head_dim` slice that receives it. The rotary key slice is projected once per token and then broadcast
+    to all heads.
+    """
+    def __init__(self, config: YoutuConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        # Read by `eager_attention_forward` (as `module.num_key_value_groups`) to repeat key/value heads.
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.attention_dropout = config.attention_dropout
+        self.num_heads = config.num_attention_heads
+        self.q_lora_rank = config.q_lora_rank
+        self.qk_rope_head_dim = config.qk_rope_head_dim
+        self.kv_lora_rank = config.kv_lora_rank
+        self.v_head_dim = config.v_head_dim
+        self.qk_nope_head_dim = config.qk_nope_head_dim
+        self.qk_head_dim = config.qk_head_dim
+        self.is_causal = True
+        # Queries: either one dense projection, or a low-rank factorization (down-project -> RMSNorm -> up-project).
+        if self.q_lora_rank is None:
+            self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.qk_head_dim, bias=False)
+        else:
+            self.q_a_proj = nn.Linear(config.hidden_size, config.q_lora_rank, bias=config.attention_bias)
+            self.q_a_layernorm = YoutuRMSNorm(config.q_lora_rank)
+            self.q_b_proj = nn.Linear(config.q_lora_rank, self.num_heads * self.qk_head_dim, bias=False)
+        # One projection yields both the compressed key/value latent (`kv_lora_rank` features) and the single
+        # rotary key (`qk_rope_head_dim` features); `forward` splits the two apart.
+        self.kv_a_proj_with_mqa = nn.Linear(
+            config.hidden_size,
+            self.kv_lora_rank + self.qk_rope_head_dim,
+            bias=config.attention_bias,
+        )
+        self.kv_a_layernorm = YoutuRMSNorm(self.kv_lora_rank)
+        # Expands the normalized latent into, per head, the non-rotary key slice and the value vector.
+        self.kv_b_proj = nn.Linear(
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+            bias=False,
+        )
+        self.o_proj = nn.Linear(
+            self.num_heads * self.v_head_dim,
+            config.hidden_size,
+            bias=config.attention_bias,
+        )
+        self.scaling = self.qk_head_dim ** (-0.5)
+        # For non-default rope types, the scaling is additionally multiplied by mscale**2 when `mscale_all_dim`
+        # is set to a non-zero value.
+        if self.config.rope_parameters.get("rope_type", "default") != "default":
+            mscale_all_dim = self.config.rope_parameters.get("mscale_all_dim", 0)
+            scaling_factor = self.config.rope_parameters["factor"]
+            if mscale_all_dim:
+                mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
+                self.scaling = self.scaling * mscale * mscale
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: torch.Tensor | None,
+        past_key_values: Cache | None = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
+        """
+        Run attention over `hidden_states`.
+        Args:
+            hidden_states (`torch.Tensor`): Input of shape `[batch_size, seq_length, hidden_size]`.
+            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`): The `(cos, sin)` rotary tensors.
+            attention_mask (`torch.Tensor`, *optional*): Mask forwarded unchanged to the attention backend.
+            past_key_values (`Cache`, *optional*): When given, it is updated with this layer's keys/values and the
+                tensors it returns are the ones attended over.
+            **kwargs: Forwarded unchanged to the attention backend.
+        Returns:
+            The output of `o_proj` and the attention weights reported by the attention backend.
+        """
+        # NOTE(review): the return annotation lists three elements, but the method returns the pair
+        # `(attn_output, attn_weights)`.
+        batch_size, seq_length = hidden_states.shape[:-1]
+        query_shape = (batch_size, seq_length, -1, self.qk_head_dim)
+        # Per head, `kv_b_proj` emits the non-rotary key slice followed by the value vector.
+        key_shape = (batch_size, seq_length, -1, self.qk_nope_head_dim + self.v_head_dim)
+        if self.q_lora_rank is None:
+            q_states = self.q_proj(hidden_states)
+        else:
+            q_states = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
+        q_states = q_states.view(query_shape).transpose(1, 2)
+        # `*_pass` slices skip the rotary embedding; `*_rot` slices receive it.
+        q_pass, q_rot = torch.split(q_states, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
+        k_pass, k_rot = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+        k_pass = self.kv_b_proj(self.kv_a_layernorm(k_pass)).view(key_shape).transpose(1, 2)
+        k_pass, value_states = torch.split(k_pass, [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
+        # The rotary key has a single head here; it is broadcast to every head after the rotation below.
+        k_rot = k_rot.view(batch_size, 1, seq_length, self.qk_rope_head_dim)
+        cos, sin = position_embeddings
+        if self.config.rope_interleave:  # support using interleaved weights for efficiency
+            q_rot, k_rot = apply_rotary_pos_emb_interleave(q_rot, k_rot, cos, sin)
+        else:
+            q_rot, k_rot = apply_rotary_pos_emb(q_rot, k_rot, cos, sin)
+        k_rot = k_rot.expand(*k_pass.shape[:-1], -1)
+        query_states = torch.cat((q_pass, q_rot), dim=-1)
+        key_states = torch.cat((k_pass, k_rot), dim=-1)
+        if past_key_values is not None:
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)
+        # When flash attention is requested and the value head dim differs from the query/key head dim, the values
+        # are right-padded up to `qk_head_dim`; the padding is sliced off the output again after the attention call.
+        if is_flash_attention_requested(self.config) and self.qk_head_dim != self.v_head_dim:
+            value_states = F.pad(value_states, [0, self.qk_head_dim - self.v_head_dim])
+        # Look up the configured attention backend, falling back to the eager implementation.
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+        if is_flash_attention_requested(self.config) and self.qk_head_dim != self.v_head_dim:
+            attn_output = attn_output[:, :, :, : self.v_head_dim]
+        # Merge the per-head outputs back into one feature axis before the output projection.
+        attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+class YoutuDecoderLayer(GradientCheckpointingLayer):
+    """
+    One decoder block: RMSNorm -> self-attention -> residual add, followed by RMSNorm -> MLP -> residual add.
+    """
+    def __init__(self, config: YoutuConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = YoutuAttention(config=config, layer_idx=layer_idx)
+        self.mlp = YoutuMLP(config)
+        self.input_layernorm = YoutuRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = YoutuRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        use_cache: bool | None = False,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        """
+        Apply the attention and MLP sub-blocks to `hidden_states` and return the updated hidden states.
+        All arguments other than `hidden_states` are forwarded to the attention module; the attention weights it
+        returns are discarded here.
+        """
+        # Normalization is applied before each sub-block; the un-normalized input is added back afterwards.
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+@auto_docstring
+class YoutuPreTrainedModel(PreTrainedModel):
+    # Shared base class for the Youtu models: declares the class-level settings below and customizes how
+    # embedding weights are initialized.
+    config: YoutuConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["YoutuDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    # Capability flags; their exact semantics are defined by `PreTrainedModel`, not in this file.
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    # Maps an output name to the module class associated with it (presumably consumed by the output-capturing
+    # decorator used on `YoutuModel.forward` -- confirm).
+    _can_record_outputs = {
+        "hidden_states": YoutuDecoderLayer,
+        "attentions": YoutuAttention,
+    }
+    @torch.no_grad()
+    def _init_weights(self, module):
+        """
+        Initialize `module` with the parent implementation, then re-initialize `nn.Embedding` weights from a normal
+        distribution whose std is `config.embedding_initializer_range` and zero the padding row, if any.
+        """
+        super()._init_weights(module)
+        std = getattr(self.config, "initializer_range", 0.02)
+        # Falls back to twice the general initializer std when the config has no embedding-specific attribute.
+        embed_std = getattr(self.config, "embedding_initializer_range", 2 * std)
+        if isinstance(module, nn.Embedding):
+            init.normal_(module.weight, mean=0.0, std=embed_std)
+            if module.padding_idx is not None:
+                init.zeros_(module.weight.data[module.padding_idx])
+@auto_docstring
+class YoutuModel(YoutuPreTrainedModel):
+    # Decoder stack: token embedding -> `num_hidden_layers` decoder layers -> final RMSNorm.
+    def __init__(self, config: YoutuConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [YoutuDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = YoutuRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = YoutuRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+    @merge_with_config_defaults
+    @capture_outputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        use_cache: bool | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        # Exactly one of `input_ids` / `inputs_embeds` must be supplied: the XOR is true when both or neither are.
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        if inputs_embeds is None:
+            inputs_embeds: torch.Tensor = self.embed_tokens(input_ids)
+        # Create a fresh cache when caching is requested but the caller did not provide one.
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+        # Default positions continue from the number of tokens already held in the cache (0 without a cache).
+        if position_ids is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            position_ids = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device) + past_seen_tokens
+            position_ids = position_ids.unsqueeze(0)
+        causal_mask = create_causal_mask(
+            config=self.config,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+        hidden_states = inputs_embeds
+        # The rotary (cos, sin) tensors are computed once here and handed to every decoder layer.
+        position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids)
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_embeddings=position_embeddings,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                **kwargs,
+            )
+        hidden_states = self.norm(hidden_states)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+@auto_docstring
+class YoutuForCausalLM(YoutuPreTrainedModel, GenerationMixin):
+    # `YoutuModel` backbone plus a bias-free linear head that maps hidden states to vocabulary logits.
+    # The head weight is declared as tied to the input embedding matrix.
+    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
+    _tp_plan = {"lm_head": "colwise_gather_output"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = YoutuModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        labels: torch.LongTensor | None = None,
+        use_cache: bool | None = None,
+        logits_to_keep: int | torch.Tensor = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        Example:
+        ```python
+        >>> from transformers import AutoTokenizer, YoutuForCausalLM
+        >>> model = YoutuForCausalLM.from_pretrained("tencent/Youtu-LLM-2B")
+        >>> tokenizer = AutoTokenizer.from_pretrained("tencent/Youtu-LLM-2B")
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            **kwargs,
+        )
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        # An int `logits_to_keep` keeps the last N positions; the default 0 gives `slice(0, None)`, i.e. every
+        # position. A tensor value is used directly as the index along the sequence axis.
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        # The loss is only computed when labels are supplied.
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+# Names exported by `from <module> import *`.
+__all__ = ["YoutuPreTrainedModel", "YoutuModel", "YoutuForCausalLM"]

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/youtu/modular_youtu.py ADDED Viewed

	@@ -0,0 +1,151 @@

+# Copyright 2026 the Tencent and HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from huggingface_hub.dataclasses import strict
+from torch import nn
+from ... import initialization as init
+from ...modeling_utils import PreTrainedModel
+from ...utils import auto_docstring, logging
+from ..deepseek_v3.configuration_deepseek_v3 import DeepseekV3Config
+from ..deepseek_v3.modeling_deepseek_v3 import DeepseekV3Attention
+from ..llama.modeling_llama import (
+    LlamaDecoderLayer,
+    LlamaForCausalLM,
+    LlamaModel,
+    LlamaPreTrainedModel,
+    LlamaRMSNorm,
+    LlamaRotaryEmbedding,
+)
+from ..qwen3.modeling_qwen3 import Qwen3MLP
+logger = logging.get_logger(__name__)
+@auto_docstring(checkpoint="tencent/Youtu-LLM-2B")
+@strict
+class YoutuConfig(DeepseekV3Config):
+    r"""
+    rope_interleave (`bool`, *optional*, defaults to `True`):
+        Whether to interleave the rotary position embeddings.
+    embedding_initializer_range (`float`, *optional*):
+        The standard deviation of the truncated_normal_initializer for initializing all embedding matrices. When
+        unset, it defaults to twice `initializer_range`.
+    ```python
+    >>> from transformers import YoutuModel, YoutuConfig
+    >>> # Initializing a Youtu-LLM-2B style configuration
+    >>> configuration = YoutuConfig()
+    >>> # Initializing a model from the configuration
+    >>> model = YoutuModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "youtu"
+    base_model_tp_plan = {
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    # Overrides whatever mapping the parent config declares with an empty one.
+    attribute_map = {}
+    vocab_size: int = 128256
+    hidden_size: int = 2048
+    intermediate_size: int = 6144
+    num_hidden_layers: int = 32
+    num_attention_heads: int = 16
+    num_key_value_heads: int = 16
+    max_position_embeddings: int = 131072
+    # Both initializer ranges default to None and are resolved in `__post_init__`.
+    initializer_range: float | None = None
+    embedding_initializer_range: float | None = None
+    pad_token_id: int | None = None
+    bos_token_id: int | None = 128000
+    eos_token_id: int | list[int] | None = 128001
+    tie_word_embeddings: bool = True
+    # remove unused attribute
+    # NOTE(review): assigning `AttributeError()` is presumably the marker the modular-to-modeling converter uses to
+    # drop attributes inherited from `DeepseekV3Config` -- confirm against the converter documentation.
+    n_shared_experts = AttributeError()
+    n_routed_experts = AttributeError()
+    routed_scaling_factor = AttributeError()
+    n_group = AttributeError()
+    topk_group = AttributeError()
+    num_experts_per_tok = AttributeError()
+    first_k_dense_replace = AttributeError()
+    norm_topk_prob = AttributeError()
+    pretraining_tp = AttributeError()
+    moe_intermediate_size = AttributeError()
+    def __post_init__(self, **kwargs):
+        # Resolve an unset `initializer_range` to 2 / sqrt(5 * hidden_size); 0.02 is used when `hidden_size` is 0
+        # so that the expression never divides by zero.
+        if self.initializer_range is None:
+            if self.hidden_size != 0:
+                self.initializer_range = 2.0 / (5.0 * self.hidden_size) ** 0.5
+            else:
+                self.initializer_range = 0.02
+        # NOTE(review): `or` replaces any falsy value, so an explicit 0.0 is overridden just like None.
+        self.embedding_initializer_range = self.embedding_initializer_range or 2.0 * self.initializer_range
+        super().__post_init__(**kwargs)
+class YoutuRMSNorm(LlamaRMSNorm):
+    # No overrides: identical to `LlamaRMSNorm`, exposed under a Youtu-specific class name.
+    pass
+class YoutuRotaryEmbedding(LlamaRotaryEmbedding):
+    # No overrides: identical to `LlamaRotaryEmbedding`, exposed under a Youtu-specific class name.
+    pass
+class YoutuMLP(Qwen3MLP):
+    # No overrides: identical to `Qwen3MLP`, exposed under a Youtu-specific class name.
+    pass
+class YoutuAttention(DeepseekV3Attention):
+    # No overrides: the attention module is taken from DeepSeek-V3 as-is, unlike the Llama/Qwen3-derived
+    # classes around it.
+    pass
+class YoutuDecoderLayer(LlamaDecoderLayer):
+    # No overrides: identical to `LlamaDecoderLayer`, exposed under a Youtu-specific class name.
+    pass
+class YoutuPreTrainedModel(LlamaPreTrainedModel, PreTrainedModel):
+    # Reuses the Llama base class but replaces weight initialization so that embeddings get their own std.
+    @torch.no_grad()
+    def _init_weights(self, module):
+        """
+        Initialize `module` with `PreTrainedModel._init_weights`, then re-initialize `nn.Embedding` weights from a
+        normal distribution whose std is `config.embedding_initializer_range` and zero the padding row, if any.
+        """
+        # Called on `PreTrainedModel` explicitly rather than through `super()`, so any `_init_weights` defined on
+        # `LlamaPreTrainedModel` is not invoked.
+        PreTrainedModel._init_weights(self, module)
+        std = getattr(self.config, "initializer_range", 0.02)
+        # Falls back to twice the general initializer std when the config has no embedding-specific attribute.
+        embed_std = getattr(self.config, "embedding_initializer_range", 2 * std)
+        if isinstance(module, nn.Embedding):
+            init.normal_(module.weight, mean=0.0, std=embed_std)
+            if module.padding_idx is not None:
+                init.zeros_(module.weight.data[module.padding_idx])
+class YoutuModel(LlamaModel):
+    # No overrides: identical to `LlamaModel`, exposed under a Youtu-specific class name.
+    pass
+class YoutuForCausalLM(LlamaForCausalLM):
+    # No overrides: identical to `LlamaForCausalLM`, exposed under a Youtu-specific class name.
+    pass
+# Names exported by `from <module> import *`; unlike the modeling file's list, this one also exports the config.
+__all__ = [
+    "YoutuConfig",
+    "YoutuPreTrainedModel",
+    "YoutuModel",
+    "YoutuForCausalLM",
+]

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/testing_utils.py ADDED Viewed

The diff for this file is too large to render. See raw diff

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/training_args.py ADDED Viewed

The diff for this file is too large to render. See raw diff

LTA_openwebtext_dualt/mini_owt_logdirichlet/runs/owt_t5_elftokenized_full_len1024_C1_to_1024_pow1_d768_l12_h12_gbs512_2x8gpu_50ep_lr3e3_ema0p9999_elfopt_not5_bottleneck16_unfixed_norm_stateprobadd_selfcond_ce_fast_20260612_030202/step_070000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8f9d782f9b8c989c4bdaccc104827d9b426cdf6c42c2bd671b6e40541ceb24c2
+size 897562466

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:68bdb7c05a6e6a3e90081c66c23125a3951666f75f5171b90f4ca8e23da89f69
+size 897562466

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0c31ea7165f98543157bb82e8ff15f8183ee06eaa5cfa6901c6ed884fbe5e0ec
+size 897562466

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5a4d7035ba11f7286021fe81bbe41d62ea19e278f5466a9c6506e495e025e2e6
+size 897562466

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8cf82b122d41a279824fef01b76fe887328610fe616cccbda5186d6bed45dd29
+size 897562466

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:84de046c4742e1f4294ba4ee53edbadd466c91ff6003748fbc7e2bcfc56e1d32
+size 897562466

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:71cd93a38ffcecccd3b1e054151774919038ec40a6aec162ebcfa5db27fa7b46
+size 897562466

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8b1caa92ac5cea91c492b668a52dbce1d34b2cf72db6edbd2ee768d2608cf203
+size 897562466

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9ebd372ab29ab0dde7b3aab27de2e20487c5fce1628a03df9d46b59795702246
+size 897562466

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:477f5f24e284fcf6630d7a64ed678211de0b09b4ab11f149bc02dd43c78678da
+size 897562466