JinghuiLuAstronaut committed 10 days ago

Commit

11a66f3

verified ·

1 Parent(s): 00a21bd

Add files using upload-large-folder tool

Browse files

Files changed (20) hide show

LTA_openwebtext_dualt/logs/eval_fixedwrong70/ema_s128_finalfreq_topp.log +12 -0
LTA_openwebtext_dualt/logs/eval_fixedwrong70/ema_s128_topp_t2p0.log +12 -0
LTA_openwebtext_dualt/logs/eval_fixedwrong70/fixedwrong70_step54000_ema_dirres_n32_s256.log +20 -0
LTA_openwebtext_dualt/logs/eval_fixedwrong70/fixedwrong70_step91000_online_dirres_n16_s128.log +12 -0
LTA_openwebtext_dualt/logs/eval_fixedwrong70/step91000_ema_base_argmax.log +12 -0
LTA_openwebtext_dualt/logs/eval_fixedwrong70/step91000_ema_topp_t1p5.log +5 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35/lib/python3.12/site-packages/pip/_vendor/rich/_emoji_codes.py +0 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35/lib/python3.12/site-packages/pip/_vendor/rich/cells.py +154 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35/lib/python3.12/site-packages/pip/_vendor/rich/file_proxy.py +57 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35/lib/python3.12/site-packages/pip/_vendor/rich/protocol.py +42 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35/lib/python3.12/site-packages/pip/_vendor/rich/screen.py +54 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35/lib/python3.12/site-packages/pip/_vendor/rich/segment.py +739 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/arcee/__init__.py +27 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/arcee/configuration_arcee.py +100 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/arcee/modeling_arcee.py +520 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/arcee/modular_arcee.py +117 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/__init__.py +27 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +93 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +0 -0
LTA_openwebtext_dualt/mini_owt_logdirichlet/cache/owt_t5_llmclean_qwen36_35b_articlefull_pack1023_10k_rejected_docs.txt +0 -0

LTA_openwebtext_dualt/logs/eval_fixedwrong70/ema_s128_finalfreq_topp.log ADDED Viewed

	@@ -0,0 +1,12 @@

+[ckpt] runs/lta_owt_gpt2cached_len1024_fixedwrong70_c1024_ddit768x12_muon_ema_gbs512_8gpu_1m_20260513_171557/eval_snapshot_step_0054000_ema.pt step=54000
+[decode-base] n=16 max_len=1024 steps=128 model_t=flow
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/1.3/k64/p0.95 freq_penalty=0.8/0/0.002 start_t=0 start_init=noise generated 2/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/1.3/k64/p0.95 freq_penalty=0.8/0/0.002 start_t=0 start_init=noise generated 4/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/1.3/k64/p0.95 freq_penalty=0.8/0/0.002 start_t=0 start_init=noise generated 6/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/1.3/k64/p0.95 freq_penalty=0.8/0/0.002 start_t=0 start_init=noise generated 8/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/1.3/k64/p0.95 freq_penalty=0.8/0/0.002 start_t=0 start_init=noise generated 10/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/1.3/k64/p0.95 freq_penalty=0.8/0/0.002 start_t=0 start_init=noise generated 12/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/1.3/k64/p0.95 freq_penalty=0.8/0/0.002 start_t=0 start_init=noise generated 14/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/1.3/k64/p0.95 freq_penalty=0.8/0/0.002 start_t=0 start_init=noise generated 16/16
+[summary] {"type": "summary", "checkpoint": "runs/lta_owt_gpt2cached_len1024_fixedwrong70_c1024_ddit768x12_muon_ema_gbs512_8gpu_1m_20260513_171557/eval_snapshot_step_0054000_ema.pt", "step": 54000, "decode": {"steps": 128, "model_t_mode": "flow", "decode_rule": "dirichlet_resample", "support_power": 1.0, "semantic_power": 1.5, "anchor_mode": "state", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 1024.0, "target_prob": 1.0, "endpoint_temp": 1.45, "final_from": "blend", "final_sample_mode": "topp", "final_sample_temp": 1.3, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.8, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.002, "lock_bos": false, "n_samples": 16, "seed": 20260514}, "raw_genppl": {"ppl": 338.15488842203416, "nll_per_token": 5.8235040402879905, "tokens": 4080, "kept_samples": 16, "total_samples": 16, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 354.25428820863135, "nll_per_token": 5.870014983532475, "tokens": 4080, "kept_samples": 16, "total_samples": 16, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 4.685037354092487, "unique_tokens": 3644, "token_count": 16384, "distinct_1": 0.222412109375, "distinct_2": 0.5967741935483871, "top_token_mass": 0.06591796875}}
+[done] docs/lta_samples/metrics_20260514/fixedwrong70_decode_sweep_fast/ema_s128_finalfreq_topp.jsonl

LTA_openwebtext_dualt/logs/eval_fixedwrong70/ema_s128_topp_t2p0.log ADDED Viewed

	@@ -0,0 +1,12 @@

+[ckpt] runs/lta_owt_gpt2cached_len1024_fixedwrong70_c1024_ddit768x12_muon_ema_gbs512_8gpu_1m_20260513_171557/eval_snapshot_step_0054000_ema.pt step=54000
+[decode-base] n=16 max_len=1024 steps=128 model_t=flow
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/2/k64/p0.97 freq_penalty=0/0/0 start_t=0 start_init=noise generated 2/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/2/k64/p0.97 freq_penalty=0/0/0 start_t=0 start_init=noise generated 4/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/2/k64/p0.97 freq_penalty=0/0/0 start_t=0 start_init=noise generated 6/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/2/k64/p0.97 freq_penalty=0/0/0 start_t=0 start_init=noise generated 8/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/2/k64/p0.97 freq_penalty=0/0/0 start_t=0 start_init=noise generated 10/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/2/k64/p0.97 freq_penalty=0/0/0 start_t=0 start_init=noise generated 12/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/2/k64/p0.97 freq_penalty=0/0/0 start_t=0 start_init=noise generated 14/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/2/k64/p0.97 freq_penalty=0/0/0 start_t=0 start_init=noise generated 16/16
+[summary] {"type": "summary", "checkpoint": "runs/lta_owt_gpt2cached_len1024_fixedwrong70_c1024_ddit768x12_muon_ema_gbs512_8gpu_1m_20260513_171557/eval_snapshot_step_0054000_ema.pt", "step": 54000, "decode": {"steps": 128, "model_t_mode": "flow", "decode_rule": "dirichlet_resample", "support_power": 1.0, "semantic_power": 1.5, "anchor_mode": "state", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 1024.0, "target_prob": 1.0, "endpoint_temp": 1.45, "final_from": "blend", "final_sample_mode": "topp", "final_sample_temp": 2.0, "final_top_k": 64, "final_top_p": 0.97, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 16, "seed": 20260514}, "raw_genppl": {"ppl": 78517.19514691229, "nll_per_token": 11.271072926240809, "tokens": 4080, "kept_samples": 16, "total_samples": 16, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 76752.70527879382, "nll_per_token": 11.248343912760417, "tokens": 4080, "kept_samples": 16, "total_samples": 16, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 6.619591836259636, "unique_tokens": 11908, "token_count": 16384, "distinct_1": 0.726806640625, "distinct_2": 0.9923020527859238, "top_token_mass": 0.0205078125}}
+[done] docs/lta_samples/metrics_20260514/fixedwrong70_decode_sweep_fast/ema_s128_topp_t2p0.jsonl

LTA_openwebtext_dualt/logs/eval_fixedwrong70/fixedwrong70_step54000_ema_dirres_n32_s256.log ADDED Viewed

	@@ -0,0 +1,20 @@

+[ckpt] runs/lta_owt_gpt2cached_len1024_fixedwrong70_c1024_ddit768x12_muon_ema_gbs512_8gpu_1m_20260513_171557/eval_snapshot_step_0054000_ema.pt step=54000
+[decode-base] n=32 max_len=1024 steps=256 model_t=flow
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 2/32
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 4/32
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 6/32
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 8/32
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 10/32
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 12/32
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 14/32
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 16/32
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 18/32
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 20/32
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 22/32
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 24/32
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 26/32
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 28/32
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 30/32
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 32/32
+[summary] {"type": "summary", "checkpoint": "runs/lta_owt_gpt2cached_len1024_fixedwrong70_c1024_ddit768x12_muon_ema_gbs512_8gpu_1m_20260513_171557/eval_snapshot_step_0054000_ema.pt", "step": 54000, "decode": {"steps": 256, "model_t_mode": "flow", "decode_rule": "dirichlet_resample", "support_power": 1.0, "semantic_power": 1.5, "anchor_mode": "state", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 1024.0, "target_prob": 1.0, "endpoint_temp": 1.45, "final_from": "blend", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 32, "seed": 20260514}, "raw_genppl": {"ppl": 7.839044812727041, "nll_per_token": 2.0591169918284695, "tokens": 8160, "kept_samples": 32, "total_samples": 32, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 11.212826752402886, "nll_per_token": 2.4170583687576594, "tokens": 8160, "kept_samples": 32, "total_samples": 32, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 2.419525574410852, "unique_tokens": 236, "token_count": 32768, "distinct_1": 0.0072021484375, "distinct_2": 0.07138929618768329, "top_token_mass": 0.23388671875}}
+[done] docs/lta_samples/metrics_20260514/fixedwrong70_latest_quick/fixedwrong70_step54000_ema_dirres_n32_s256.jsonl

LTA_openwebtext_dualt/logs/eval_fixedwrong70/fixedwrong70_step91000_online_dirres_n16_s128.log ADDED Viewed

	@@ -0,0 +1,12 @@

+[ckpt] runs/lta_owt_gpt2cached_len1024_fixedwrong70_c1024_ddit768x12_muon_ema_gbs512_8gpu_1m_20260513_171557/eval_snapshot_step_0091000_online.pt step=91000
+[decode-base] n=16 max_len=1024 steps=128 model_t=flow
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 2/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 4/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 6/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 8/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 10/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 12/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 14/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 16/16
+[summary] {"type": "summary", "checkpoint": "runs/lta_owt_gpt2cached_len1024_fixedwrong70_c1024_ddit768x12_muon_ema_gbs512_8gpu_1m_20260513_171557/eval_snapshot_step_0091000_online.pt", "step": 91000, "decode": {"steps": 128, "model_t_mode": "flow", "decode_rule": "dirichlet_resample", "support_power": 1.0, "semantic_power": 1.5, "anchor_mode": "state", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 1024.0, "target_prob": 1.0, "endpoint_temp": 1.45, "final_from": "blend", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 16, "seed": 20260514}, "raw_genppl": {"ppl": 6.635170922139129, "nll_per_token": 1.892384428136489, "tokens": 4080, "kept_samples": 16, "total_samples": 16, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 8.932796631820791, "nll_per_token": 2.189729518516391, "tokens": 4080, "kept_samples": 16, "total_samples": 16, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 2.0834674523881906, "unique_tokens": 143, "token_count": 16384, "distinct_1": 0.00872802734375, "distinct_2": 0.05663489736070381, "top_token_mass": 0.27642822265625}}
+[done] docs/lta_samples/metrics_20260514/fixedwrong70_latest_step91000/fixedwrong70_step91000_online_dirres_n16_s128.jsonl

LTA_openwebtext_dualt/logs/eval_fixedwrong70/step91000_ema_base_argmax.log ADDED Viewed

	@@ -0,0 +1,12 @@

+[ckpt] runs/lta_owt_gpt2cached_len1024_fixedwrong70_c1024_ddit768x12_muon_ema_gbs512_8gpu_1m_20260513_171557/eval_snapshot_step_0091000_ema.pt step=91000
+[decode-base] n=16 max_len=1024 steps=128 model_t=flow
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 2/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 4/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 6/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 8/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 10/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 12/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 14/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 16/16
+[summary] {"type": "summary", "checkpoint": "runs/lta_owt_gpt2cached_len1024_fixedwrong70_c1024_ddit768x12_muon_ema_gbs512_8gpu_1m_20260513_171557/eval_snapshot_step_0091000_ema.pt", "step": 91000, "decode": {"steps": 128, "model_t_mode": "flow", "decode_rule": "dirichlet_resample", "support_power": 1.0, "semantic_power": 1.5, "anchor_mode": "state", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 1024.0, "target_prob": 1.0, "endpoint_temp": 1.45, "final_from": "blend", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 16, "seed": 20260514}, "raw_genppl": {"ppl": 9.15153130996079, "nll_per_token": 2.2139212215647976, "tokens": 4080, "kept_samples": 16, "total_samples": 16, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 11.897193857793793, "nll_per_token": 2.4763025620404413, "tokens": 4080, "kept_samples": 16, "total_samples": 16, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 2.4467596776773615, "unique_tokens": 209, "token_count": 16384, "distinct_1": 0.01275634765625, "distinct_2": 0.09567448680351906, "top_token_mass": 0.2486572265625}}
+[done] docs/lta_samples/metrics_20260514/fixedwrong70_step91000_decode_sweep_fast/step91000_ema_base_argmax.jsonl

LTA_openwebtext_dualt/logs/eval_fixedwrong70/step91000_ema_topp_t1p5.log ADDED Viewed

	@@ -0,0 +1,5 @@

+[ckpt] runs/lta_owt_gpt2cached_len1024_fixedwrong70_c1024_ddit768x12_muon_ema_gbs512_8gpu_1m_20260513_171557/eval_snapshot_step_0091000_ema.pt step=91000
+[decode-base] n=16 max_len=1024 steps=128 model_t=flow
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/1.5/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 2/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/1.5/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 4/16
+[decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/1.5/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 6/16

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35/lib/python3.12/site-packages/pip/_vendor/rich/_emoji_codes.py ADDED Viewed

The diff for this file is too large to render. See raw diff

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35/lib/python3.12/site-packages/pip/_vendor/rich/cells.py ADDED Viewed

	@@ -0,0 +1,154 @@

+import re
+from functools import lru_cache
+from typing import Callable, List
+from ._cell_widths import CELL_WIDTHS
# Fast-path matcher used by set_cell_size: it succeeds only when every character is in
# U+0020-U+006F, is exactly U+00A0 or U+02FF, or is in U+0370-U+0482.
# NOTE(review): "\u00a0\u02ff" is two individual code points rather than a range, and
# the first range stops at "o" (U+006F) -- confirm against upstream whether that is
# intentional. Text outside the class simply takes the slower measuring path.
_is_single_cell_widths = re.compile(
    "^[\u0020-\u006f\u00a0\u02ff\u0370-\u0482]*$"
).match
@lru_cache(4096)
def cached_cell_len(text: str) -> int:
    """Return the number of terminal cells needed to display ``text``.

    Every call is memoised by ``lru_cache`` (up to 4096 distinct strings), which can
    hold a lot of memory; prefer ``cell_len`` unless caching is really wanted.

    Args:
        text (str): Text to display.

    Returns:
        int: Sum of the cell sizes of the characters in ``text``.
    """
    measure = get_character_cell_size
    width = 0
    for char in text:
        width += measure(char)
    return width
def cell_len(text: str, _cell_len: Callable[[str], int] = cached_cell_len) -> int:
    """Return the number of terminal cells needed to display ``text``.

    Args:
        text (str): Text to display.
        _cell_len: Measuring function used for strings shorter than 512 characters;
            defaults to the memoised ``cached_cell_len``.

    Returns:
        int: Number of cells required to display ``text``.
    """
    if len(text) >= 512:
        # Long strings are measured directly instead of going through ``_cell_len``.
        measure = get_character_cell_size
        return sum(map(measure, text))
    return _cell_len(text)
@lru_cache(maxsize=4096)
def get_character_cell_size(character: str) -> int:
    """Return the cell size of a single character.

    Args:
        character (str): A single character (it is passed to ``ord``).

    Returns:
        int: Number of cells (0, 1 or 2) occupied by that character.
    """
    codepoint = ord(character)
    return _get_codepoint_cell_size(codepoint)
@lru_cache(maxsize=4096)
def _get_codepoint_cell_size(codepoint: int) -> int:
    """Look up the cell size of a code point in ``CELL_WIDTHS``.

    The table is binary-searched as ``(start, end, width)`` triples, so it is
    presumably sorted by code point with non-overlapping ranges (not verifiable from
    this module alone).

    Args:
        codepoint (int): Codepoint of a character.

    Returns:
        int: The width stored for the range containing ``codepoint`` (a stored width
            of -1 is reported as 0), or 1 when no range contains it.
    """
    table = CELL_WIDTHS
    low, high = 0, len(table) - 1
    while True:
        middle = (low + high) // 2
        first, last, cells = table[middle]
        if first <= codepoint <= last:
            return cells if cells != -1 else 0
        if codepoint < first:
            high = middle - 1
        else:
            low = middle + 1
        if high < low:
            # Search window is empty: the code point is not listed in the table.
            return 1
def set_cell_size(text: str, total: int) -> str:
    """Set the length of a string to fit within given number of cells.

    Text that is too short is padded on the right with spaces; text that is too long
    is truncated. When truncation would split a two-cell character, that character is
    replaced by a single space.

    Args:
        text (str): Text to pad or truncate.
        total (int): Target width in cells.

    Returns:
        str: The adjusted text, or "" when ``total`` is zero or negative.
    """
    # Check the width first so both paths below agree. Previously the single-width
    # fast path ran before this guard and sliced with a negative index, e.g.
    # set_cell_size("hello", -1) returned "hell" instead of "".
    if total <= 0:
        return ""
    if _is_single_cell_widths(text):
        # Every character occupies one cell, so character count equals cell count.
        size = len(text)
        if size < total:
            return text + " " * (total - size)
        return text[:total]
    cell_size = cell_len(text)
    if cell_size == total:
        return text
    if cell_size < total:
        return text + " " * (total - cell_size)
    start = 0
    end = len(text)
    # Binary search until we find the right size
    while True:
        pos = (start + end) // 2
        before = text[: pos + 1]
        before_len = cell_len(before)
        if before_len == total + 1 and cell_len(before[-1]) == 2:
            # The last character is two cells wide and overshoots by one cell:
            # drop it and pad with a space to keep the requested width.
            return before[:-1] + " "
        if before_len == total:
            return before
        if before_len > total:
            end = pos
        else:
            start = pos
+# TODO: This is inefficient
+# TODO: This might not work with CWJ type characters
def chop_cells(text: str, max_size: int, position: int = 0) -> List[str]:
    """Break text into strings of at most ``max_size`` cells, in reverse order.

    The text is walked from its last character to its first, so the characters in
    each returned string (and the strings themselves) appear in reverse order.

    Args:
        text (str): Text to split.
        max_size (int): Maximum number of cells per returned string.
        position (int): Cells already counted against the first string.

    Returns:
        List[str]: The chopped pieces.
    """
    measure = get_character_cell_size
    widths = [measure(char) for char in text]
    rows: List[List[str]] = [[]]
    used = position
    for char, width in zip(reversed(text), reversed(widths)):
        if used + width > max_size:
            # Character does not fit: start a new piece with it.
            rows.append([char])
            used = width
        else:
            used += width
            rows[-1].append(char)
    return ["".join(row) for row in rows]
if __name__ == "__main__":  # pragma: no cover
    # Manual demo: print one emoji width, then chop and resize a CJK sample string.
    sample = """这是对亚洲语言支持的测试。面对模棱两可的想法，拒绝猜测的诱惑。"""
    print(get_character_cell_size("😽"))
    for line in chop_cells(sample, 8):
        print(line)
    for n in range(80, 1, -1):
        print(set_cell_size(sample, n) + "|")
        print("x" * n)

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35/lib/python3.12/site-packages/pip/_vendor/rich/file_proxy.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import io
+from typing import IO, TYPE_CHECKING, Any, List
+from .ansi import AnsiDecoder
+from .text import Text
+if TYPE_CHECKING:
+    from .console import Console
class FileProxy(io.TextIOBase):
    """Wraps a file (e.g. sys.stdout) and redirects writes to a console."""

    def __init__(self, console: "Console", file: IO[str]) -> None:
        self.__console = console
        self.__file = file
        # Text received since the last newline; emitted on the next newline or flush().
        self.__buffer: List[str] = []
        self.__ansi_decoder = AnsiDecoder()

    @property
    def rich_proxied_file(self) -> IO[str]:
        """Get proxied file."""
        return self.__file

    def __getattr__(self, name: str) -> Any:
        # Only reached when normal lookup fails: defer to the wrapped file.
        return getattr(self.__file, name)

    def write(self, text: str) -> int:
        """Buffer ``text`` and print every completed line to the console.

        Text up to each newline is combined with anything already buffered, decoded
        with the ANSI decoder and printed; a trailing partial line stays buffered.

        Args:
            text (str): Text to write.

        Returns:
            int: Number of characters accepted, i.e. ``len(text)``.

        Raises:
            TypeError: If ``text`` is not a ``str``.
        """
        if not isinstance(text, str):
            raise TypeError(f"write() argument must be str, not {type(text).__name__}")
        # Record the size up front: the loop below consumes ``text``, so taking
        # len(text) afterwards (as before) always reported 0 characters written.
        written = len(text)
        buffer = self.__buffer
        lines: List[str] = []
        while text:
            line, new_line, text = text.partition("\n")
            if new_line:
                lines.append("".join(buffer) + line)
                buffer.clear()
            else:
                # No newline left: keep the remainder for a later write()/flush().
                buffer.append(line)
                break
        if lines:
            console = self.__console
            with console:
                output = Text("\n").join(
                    self.__ansi_decoder.decode_line(line) for line in lines
                )
                console.print(output)
        return written

    def flush(self) -> None:
        """Print any buffered partial line to the console and empty the buffer."""
        output = "".join(self.__buffer)
        if output:
            self.__console.print(output)
        del self.__buffer[:]

    def fileno(self) -> int:
        """Return the file descriptor of the wrapped file."""
        return self.__file.fileno()

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35/lib/python3.12/site-packages/pip/_vendor/rich/protocol.py ADDED Viewed

	@@ -0,0 +1,42 @@

+from typing import Any, cast, Set, TYPE_CHECKING
+from inspect import isclass
+if TYPE_CHECKING:
+    from pip._vendor.rich.console import RenderableType
+# Deliberately nonsensical attribute name: rich_cast probes for it to detect
+# objects which claim to have every attribute.
+_GIBBERISH = """aihwerij235234ljsdnp34ksodfipwoe234234jlskjdf"""
+def is_renderable(check_object: Any) -> bool:
+    """Check if an object may be rendered by Rich."""
+    return (
+        isinstance(check_object, str)
+        or hasattr(check_object, "__rich__")
+        or hasattr(check_object, "__rich_console__")
+    )
+def rich_cast(renderable: object) -> "RenderableType":
+    """Cast an object to a renderable by calling __rich__ if present.
+    Args:
+        renderable (object): A potentially renderable object
+    Returns:
+        object: The result of recursively calling __rich__.
+    """
+    from pip._vendor.rich.console import RenderableType
+    rich_visited_set: Set[type] = set()  # Prevent potential infinite loop
+    while hasattr(renderable, "__rich__") and not isclass(renderable):
+        # Detect object which claim to have all the attributes
+        if hasattr(renderable, _GIBBERISH):
+            return repr(renderable)
+        cast_method = getattr(renderable, "__rich__")
+        renderable = cast_method()
+        renderable_type = type(renderable)
+        if renderable_type in rich_visited_set:
+            break
+        rich_visited_set.add(renderable_type)
+    return cast(RenderableType, renderable)

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35/lib/python3.12/site-packages/pip/_vendor/rich/screen.py ADDED Viewed

	@@ -0,0 +1,54 @@

+from typing import Optional, TYPE_CHECKING
+from .segment import Segment
+from .style import StyleType
+from ._loop import loop_last
+if TYPE_CHECKING:
+    from .console import (
+        Console,
+        ConsoleOptions,
+        RenderResult,
+        RenderableType,
+        Group,
+    )
+class Screen:
+    """A renderable that fills the terminal screen and crops excess.
+    Args:
+        renderable (RenderableType): Child renderable.
+        style (StyleType, optional): Optional background style. Defaults to None.
+    """
+    renderable: "RenderableType"
+    def __init__(
+        self,
+        *renderables: "RenderableType",
+        style: Optional[StyleType] = None,
+        application_mode: bool = False,
+    ) -> None:
+        from pip._vendor.rich.console import Group
+        self.renderable = Group(*renderables)
+        self.style = style
+        self.application_mode = application_mode
+    def __rich_console__(
+        self, console: "Console", options: "ConsoleOptions"
+    ) -> "RenderResult":
+        width, height = options.size
+        style = console.get_style(self.style) if self.style else None
+        render_options = options.update(width=width, height=height)
+        lines = console.render_lines(
+            self.renderable or "", render_options, style=style, pad=True
+        )
+        lines = Segment.set_shape(lines, width, height, style=style)
+        new_line = Segment("\n\r") if self.application_mode else Segment.line()
+        for last, line in loop_last(lines):
+            yield from line
+            if not last:
+                yield new_line

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35/lib/python3.12/site-packages/pip/_vendor/rich/segment.py ADDED Viewed

	@@ -0,0 +1,739 @@

+from enum import IntEnum
+from functools import lru_cache
+from itertools import filterfalse
+from logging import getLogger
+from operator import attrgetter
+from typing import (
+    TYPE_CHECKING,
+    Dict,
+    Iterable,
+    List,
+    NamedTuple,
+    Optional,
+    Sequence,
+    Tuple,
+    Type,
+    Union,
+)
+from .cells import (
+    _is_single_cell_widths,
+    cached_cell_len,
+    cell_len,
+    get_character_cell_size,
+    set_cell_size,
+)
+from .repr import Result, rich_repr
+from .style import Style
+if TYPE_CHECKING:
+    from .console import Console, ConsoleOptions, RenderResult
+# Module-level logger obtained under the name "rich".
+log = getLogger("rich")
+class ControlType(IntEnum):
+    """Non-printable control codes which typically translate to ANSI codes.
+    Each member is a distinct integer identifier. A member forms the first item of
+    the ``ControlCode`` tuples declared after this class.
+    """
+    BELL = 1
+    CARRIAGE_RETURN = 2
+    HOME = 3
+    CLEAR = 4
+    SHOW_CURSOR = 5
+    HIDE_CURSOR = 6
+    ENABLE_ALT_SCREEN = 7
+    DISABLE_ALT_SCREEN = 8
+    CURSOR_UP = 9
+    CURSOR_DOWN = 10
+    CURSOR_FORWARD = 11
+    CURSOR_BACKWARD = 12
+    CURSOR_MOVE_TO_COLUMN = 13
+    CURSOR_MOVE_TO = 14
+    ERASE_IN_LINE = 15
+    SET_WINDOW_TITLE = 16
+# A control code is a tuple starting with a ControlType, followed by nothing,
+# by one int-or-str parameter, or by two int parameters.
+ControlCode = Union[
+    Tuple[ControlType],
+    Tuple[ControlType, Union[int, str]],
+    Tuple[ControlType, int, int],
+]
+@rich_repr()
+class Segment(NamedTuple):
+    """A piece of text with associated style. Segments are produced by the Console render process and
+    are ultimately converted in to strings to be written to the terminal.
+    Args:
+        text (str): A piece of text.
+        style (:class:`~rich.style.Style`, optional): An optional style to apply to the text.
+        control (Tuple[ControlCode], optional): Optional sequence of control codes.
+    Attributes:
+        cell_length (int): The cell length of this Segment.
+    """
+    text: str
+    style: Optional[Style] = None
+    control: Optional[Sequence[ControlCode]] = None
+    @property
+    def cell_length(self) -> int:
+        """The number of terminal cells required to display self.text.
+        Returns:
+            int: A number of cells.
+        """
+        text, _style, control = self
+        return 0 if control else cell_len(text)
+    def __rich_repr__(self) -> Result:
+        yield self.text
+        if self.control is None:
+            if self.style is not None:
+                yield self.style
+        else:
+            yield self.style
+            yield self.control
+    def __bool__(self) -> bool:
+        """Check if the segment contains text."""
+        return bool(self.text)
+    @property
+    def is_control(self) -> bool:
+        """Check if the segment contains control codes."""
+        return self.control is not None
+    @classmethod
+    @lru_cache(1024 * 16)
+    def _split_cells(cls, segment: "Segment", cut: int) -> Tuple["Segment", "Segment"]:
+        text, style, control = segment
+        _Segment = Segment
+        cell_length = segment.cell_length
+        if cut >= cell_length:
+            return segment, _Segment("", style, control)
+        cell_size = get_character_cell_size
+        pos = int((cut / cell_length) * (len(text) - 1))
+        before = text[:pos]
+        cell_pos = cell_len(before)
+        if cell_pos == cut:
+            return (
+                _Segment(before, style, control),
+                _Segment(text[pos:], style, control),
+            )
+        while pos < len(text):
+            char = text[pos]
+            pos += 1
+            cell_pos += cell_size(char)
+            before = text[:pos]
+            if cell_pos == cut:
+                return (
+                    _Segment(before, style, control),
+                    _Segment(text[pos:], style, control),
+                )
+            if cell_pos > cut:
+                return (
+                    _Segment(before[: pos - 1] + " ", style, control),
+                    _Segment(" " + text[pos:], style, control),
+                )
+        raise AssertionError("Will never reach here")
+    def split_cells(self, cut: int) -> Tuple["Segment", "Segment"]:
+        """Split segment in to two segments at the specified column.
+        If the cut point falls in the middle of a 2-cell wide character then it is replaced
+        by two spaces, to preserve the display width of the parent segment.
+        Returns:
+            Tuple[Segment, Segment]: Two segments.
+        """
+        text, style, control = self
+        if _is_single_cell_widths(text):
+            # Fast path with all 1 cell characters
+            if cut >= len(text):
+                return self, Segment("", style, control)
+            return (
+                Segment(text[:cut], style, control),
+                Segment(text[cut:], style, control),
+            )
+        return self._split_cells(self, cut)
+    @classmethod
+    def line(cls) -> "Segment":
+        """Make a new line segment."""
+        return cls("\n")
+    @classmethod
+    def apply_style(
+        cls,
+        segments: Iterable["Segment"],
+        style: Optional[Style] = None,
+        post_style: Optional[Style] = None,
+    ) -> Iterable["Segment"]:
+        """Apply style(s) to an iterable of segments.
+        Returns an iterable of segments where the style is replaced by ``style + segment.style + post_style``.
+        Args:
+            segments (Iterable[Segment]): Segments to process.
+            style (Style, optional): Base style. Defaults to None.
+            post_style (Style, optional): Style to apply on top of segment style. Defaults to None.
+        Returns:
+            Iterable[Segments]: A new iterable of segments (possibly the same iterable).
+        """
+        result_segments = segments
+        if style:
+            apply = style.__add__
+            result_segments = (
+                cls(text, None if control else apply(_style), control)
+                for text, _style, control in result_segments
+            )
+        if post_style:
+            result_segments = (
+                cls(
+                    text,
+                    (
+                        None
+                        if control
+                        else (_style + post_style if _style else post_style)
+                    ),
+                    control,
+                )
+                for text, _style, control in result_segments
+            )
+        return result_segments
+    @classmethod
+    def filter_control(
+        cls, segments: Iterable["Segment"], is_control: bool = False
+    ) -> Iterable["Segment"]:
+        """Filter segments by ``is_control`` attribute.
+        Args:
+            segments (Iterable[Segment]): An iterable of Segment instances.
+            is_control (bool, optional): is_control flag to match in search.
+        Returns:
+            Iterable[Segment]: And iterable of Segment instances.
+        """
+        if is_control:
+            return filter(attrgetter("control"), segments)
+        else:
+            return filterfalse(attrgetter("control"), segments)
+    @classmethod
+    def split_lines(cls, segments: Iterable["Segment"]) -> Iterable[List["Segment"]]:
+        """Split a sequence of segments in to a list of lines.
+        Args:
+            segments (Iterable[Segment]): Segments potentially containing line feeds.
+        Yields:
+            Iterable[List[Segment]]: Iterable of segment lists, one per line.
+        """
+        line: List[Segment] = []
+        append = line.append
+        for segment in segments:
+            if "\n" in segment.text and not segment.control:
+                text, style, _ = segment
+                while text:
+                    _text, new_line, text = text.partition("\n")
+                    if _text:
+                        append(cls(_text, style))
+                    if new_line:
+                        yield line
+                        line = []
+                        append = line.append
+            else:
+                append(segment)
+        if line:
+            yield line
+    @classmethod
+    def split_and_crop_lines(
+        cls,
+        segments: Iterable["Segment"],
+        length: int,
+        style: Optional[Style] = None,
+        pad: bool = True,
+        include_new_lines: bool = True,
+    ) -> Iterable[List["Segment"]]:
+        """Split segments in to lines, and crop lines greater than a given length.
+        Args:
+            segments (Iterable[Segment]): An iterable of segments, probably
+                generated from console.render.
+            length (int): Desired line length.
+            style (Style, optional): Style to use for any padding.
+            pad (bool): Enable padding of lines that are less than `length`.
+        Returns:
+            Iterable[List[Segment]]: An iterable of lines of segments.
+        """
+        line: List[Segment] = []
+        append = line.append
+        adjust_line_length = cls.adjust_line_length
+        new_line_segment = cls("\n")
+        for segment in segments:
+            if "\n" in segment.text and not segment.control:
+                text, segment_style, _ = segment
+                while text:
+                    _text, new_line, text = text.partition("\n")
+                    if _text:
+                        append(cls(_text, segment_style))
+                    if new_line:
+                        cropped_line = adjust_line_length(
+                            line, length, style=style, pad=pad
+                        )
+                        if include_new_lines:
+                            cropped_line.append(new_line_segment)
+                        yield cropped_line
+                        line.clear()
+            else:
+                append(segment)
+        if line:
+            yield adjust_line_length(line, length, style=style, pad=pad)
+    @classmethod
+    def adjust_line_length(
+        cls,
+        line: List["Segment"],
+        length: int,
+        style: Optional[Style] = None,
+        pad: bool = True,
+    ) -> List["Segment"]:
+        """Adjust a line to a given width (cropping or padding as required).
+        Args:
+            segments (Iterable[Segment]): A list of segments in a single line.
+            length (int): The desired width of the line.
+            style (Style, optional): The style of padding if used (space on the end). Defaults to None.
+            pad (bool, optional): Pad lines with spaces if they are shorter than `length`. Defaults to True.
+        Returns:
+            List[Segment]: A line of segments with the desired length.
+        """
+        line_length = sum(segment.cell_length for segment in line)
+        new_line: List[Segment]
+        if line_length < length:
+            if pad:
+                new_line = line + [cls(" " * (length - line_length), style)]
+            else:
+                new_line = line[:]
+        elif line_length > length:
+            new_line = []
+            append = new_line.append
+            line_length = 0
+            for segment in line:
+                segment_length = segment.cell_length
+                if line_length + segment_length < length or segment.control:
+                    append(segment)
+                    line_length += segment_length
+                else:
+                    text, segment_style, _ = segment
+                    text = set_cell_size(text, length - line_length)
+                    append(cls(text, segment_style))
+                    break
+        else:
+            new_line = line[:]
+        return new_line
+    @classmethod
+    def get_line_length(cls, line: List["Segment"]) -> int:
+        """Get the length of list of segments.
+        Args:
+            line (List[Segment]): A line encoded as a list of Segments (assumes no '\\\\n' characters),
+        Returns:
+            int: The length of the line.
+        """
+        _cell_len = cell_len
+        return sum(_cell_len(text) for text, style, control in line if not control)
+    @classmethod
+    def get_shape(cls, lines: List[List["Segment"]]) -> Tuple[int, int]:
+        """Get the shape (enclosing rectangle) of a list of lines.
+        Args:
+            lines (List[List[Segment]]): A list of lines (no '\\\\n' characters).
+        Returns:
+            Tuple[int, int]: Width and height in characters.
+        """
+        get_line_length = cls.get_line_length
+        max_width = max(get_line_length(line) for line in lines) if lines else 0
+        return (max_width, len(lines))
+    @classmethod
+    def set_shape(
+        cls,
+        lines: List[List["Segment"]],
+        width: int,
+        height: Optional[int] = None,
+        style: Optional[Style] = None,
+        new_lines: bool = False,
+    ) -> List[List["Segment"]]:
+        """Set the shape of a list of lines (enclosing rectangle).
+        Args:
+            lines (List[List[Segment]]): A list of lines.
+            width (int): Desired width.
+            height (int, optional): Desired height or None for no change.
+            style (Style, optional): Style of any padding added.
+            new_lines (bool, optional): Padded lines should include "\n". Defaults to False.
+        Returns:
+            List[List[Segment]]: New list of lines.
+        """
+        _height = height or len(lines)
+        blank = (
+            [cls(" " * width + "\n", style)] if new_lines else [cls(" " * width, style)]
+        )
+        adjust_line_length = cls.adjust_line_length
+        shaped_lines = lines[:_height]
+        shaped_lines[:] = [
+            adjust_line_length(line, width, style=style) for line in lines
+        ]
+        if len(shaped_lines) < _height:
+            shaped_lines.extend([blank] * (_height - len(shaped_lines)))
+        return shaped_lines
+    @classmethod
+    def align_top(
+        cls: Type["Segment"],
+        lines: List[List["Segment"]],
+        width: int,
+        height: int,
+        style: Style,
+        new_lines: bool = False,
+    ) -> List[List["Segment"]]:
+        """Aligns lines to top (adds extra lines to bottom as required).
+        Args:
+            lines (List[List[Segment]]): A list of lines.
+            width (int): Desired width.
+            height (int, optional): Desired height or None for no change.
+            style (Style): Style of any padding added.
+            new_lines (bool, optional): Padded lines should include "\n". Defaults to False.
+        Returns:
+            List[List[Segment]]: New list of lines.
+        """
+        extra_lines = height - len(lines)
+        if not extra_lines:
+            return lines[:]
+        lines = lines[:height]
+        blank = cls(" " * width + "\n", style) if new_lines else cls(" " * width, style)
+        lines = lines + [[blank]] * extra_lines
+        return lines
+    @classmethod
+    def align_bottom(
+        cls: Type["Segment"],
+        lines: List[List["Segment"]],
+        width: int,
+        height: int,
+        style: Style,
+        new_lines: bool = False,
+    ) -> List[List["Segment"]]:
+        """Aligns render to bottom (adds extra lines above as required).
+        Args:
+            lines (List[List[Segment]]): A list of lines.
+            width (int): Desired width.
+            height (int, optional): Desired height or None for no change.
+            style (Style): Style of any padding added. Defaults to None.
+            new_lines (bool, optional): Padded lines should include "\n". Defaults to False.
+        Returns:
+            List[List[Segment]]: New list of lines.
+        """
+        extra_lines = height - len(lines)
+        if not extra_lines:
+            return lines[:]
+        lines = lines[:height]
+        blank = cls(" " * width + "\n", style) if new_lines else cls(" " * width, style)
+        lines = [[blank]] * extra_lines + lines
+        return lines
+    @classmethod
+    def align_middle(
+        cls: Type["Segment"],
+        lines: List[List["Segment"]],
+        width: int,
+        height: int,
+        style: Style,
+        new_lines: bool = False,
+    ) -> List[List["Segment"]]:
+        """Aligns lines to middle (adds extra lines to above and below as required).
+        Args:
+            lines (List[List[Segment]]): A list of lines.
+            width (int): Desired width.
+            height (int, optional): Desired height or None for no change.
+            style (Style): Style of any padding added.
+            new_lines (bool, optional): Padded lines should include "\n". Defaults to False.
+        Returns:
+            List[List[Segment]]: New list of lines.
+        """
+        extra_lines = height - len(lines)
+        if not extra_lines:
+            return lines[:]
+        lines = lines[:height]
+        blank = cls(" " * width + "\n", style) if new_lines else cls(" " * width, style)
+        top_lines = extra_lines // 2
+        bottom_lines = extra_lines - top_lines
+        lines = [[blank]] * top_lines + lines + [[blank]] * bottom_lines
+        return lines
+    @classmethod
+    def simplify(cls, segments: Iterable["Segment"]) -> Iterable["Segment"]:
+        """Simplify an iterable of segments by combining contiguous segments with the same style.
+        Args:
+            segments (Iterable[Segment]): An iterable of segments.
+        Returns:
+            Iterable[Segment]: A possibly smaller iterable of segments that will render the same way.
+        """
+        iter_segments = iter(segments)
+        try:
+            last_segment = next(iter_segments)
+        except StopIteration:
+            return
+        _Segment = Segment
+        for segment in iter_segments:
+            if last_segment.style == segment.style and not segment.control:
+                last_segment = _Segment(
+                    last_segment.text + segment.text, last_segment.style
+                )
+            else:
+                yield last_segment
+                last_segment = segment
+        yield last_segment
+    @classmethod
+    def strip_links(cls, segments: Iterable["Segment"]) -> Iterable["Segment"]:
+        """Remove all links from an iterable of styles.
+        Args:
+            segments (Iterable[Segment]): An iterable segments.
+        Yields:
+            Segment: Segments with link removed.
+        """
+        for segment in segments:
+            if segment.control or segment.style is None:
+                yield segment
+            else:
+                text, style, _control = segment
+                yield cls(text, style.update_link(None) if style else None)
+    @classmethod
+    def strip_styles(cls, segments: Iterable["Segment"]) -> Iterable["Segment"]:
+        """Remove all styles from an iterable of segments.
+        Args:
+            segments (Iterable[Segment]): An iterable segments.
+        Yields:
+            Segment: Segments with styles replace with None
+        """
+        for text, _style, control in segments:
+            yield cls(text, None, control)
+    @classmethod
+    def remove_color(cls, segments: Iterable["Segment"]) -> Iterable["Segment"]:
+        """Remove all color from an iterable of segments.
+        Args:
+            segments (Iterable[Segment]): An iterable segments.
+        Yields:
+            Segment: Segments with colorless style.
+        """
+        cache: Dict[Style, Style] = {}
+        for text, style, control in segments:
+            if style:
+                colorless_style = cache.get(style)
+                if colorless_style is None:
+                    colorless_style = style.without_color
+                    cache[style] = colorless_style
+                yield cls(text, colorless_style, control)
+            else:
+                yield cls(text, None, control)
+    @classmethod
+    def divide(
+        cls, segments: Iterable["Segment"], cuts: Iterable[int]
+    ) -> Iterable[List["Segment"]]:
+        """Divides an iterable of segments in to portions.
+        Args:
+            segments (Iterable[Segment]): Segments to divide.
+            cuts (Iterable[int]): Cell positions where to divide.
+        Yields:
+            [Iterable[List[Segment]]]: An iterable of Segments in List.
+        """
+        # Accumulates the segments of the portion currently being built.
+        split_segments: List["Segment"] = []
+        add_segment = split_segments.append
+        iter_cuts = iter(cuts)
+        # Consume the leading cuts: each cut at position 0 yields an empty portion,
+        # and running out of cuts (the -1 sentinel) ends the generator.
+        while True:
+            cut = next(iter_cuts, -1)
+            if cut == -1:
+                return []
+            if cut != 0:
+                break
+            yield []
+        # Cell position reached so far across all processed segments.
+        pos = 0
+        segments_clear = split_segments.clear
+        segments_copy = split_segments.copy
+        _cell_len = cached_cell_len
+        for segment in segments:
+            text, _style, control = segment
+            while text:
+                # Segments with control codes do not advance the cell position.
+                end_pos = pos if control else pos + _cell_len(text)
+                if end_pos < cut:
+                    # Ends before the cut: the whole segment joins the portion.
+                    add_segment(segment)
+                    pos = end_pos
+                    break
+                if end_pos == cut:
+                    # Ends exactly on the cut: emit the portion, move to the next cut.
+                    add_segment(segment)
+                    yield segments_copy()
+                    segments_clear()
+                    pos = end_pos
+                    cut = next(iter_cuts, -1)
+                    if cut == -1:
+                        if split_segments:
+                            yield segments_copy()
+                        return
+                    break
+                else:
+                    # Extends past the cut: split it, emit the portion, and keep
+                    # processing the remainder against the following cut.
+                    before, segment = segment.split_cells(cut - pos)
+                    text, _style, control = segment
+                    add_segment(before)
+                    yield segments_copy()
+                    segments_clear()
+                    pos = cut
+                cut = next(iter_cuts, -1)
+                if cut == -1:
+                    if split_segments:
+                        yield segments_copy()
+                    return
+        # Segments exhausted with cuts still pending: emit what was accumulated.
+        yield segments_copy()
+class Segments:
+    """A simple renderable to render an iterable of segments. This class may be useful if
+    you want to print segments outside of a __rich_console__ method.
+    Args:
+        segments (Iterable[Segment]): An iterable of segments.
+        new_lines (bool, optional): Add new lines between segments. Defaults to False.
+    """
+    def __init__(self, segments: Iterable[Segment], new_lines: bool = False) -> None:
+        self.segments = list(segments)
+        self.new_lines = new_lines
+    def __rich_console__(
+        self, console: "Console", options: "ConsoleOptions"
+    ) -> "RenderResult":
+        if self.new_lines:
+            line = Segment.line()
+            for segment in self.segments:
+                yield segment
+                yield line
+        else:
+            yield from self.segments
+class SegmentLines:
+    def __init__(self, lines: Iterable[List[Segment]], new_lines: bool = False) -> None:
+        """A simple renderable containing a number of lines of segments. May be used as an intermediate
+        in rendering process.
+        Args:
+            lines (Iterable[List[Segment]]): Lists of segments forming lines.
+            new_lines (bool, optional): Insert new lines after each line. Defaults to False.
+        """
+        self.lines = list(lines)
+        self.new_lines = new_lines
+    def __rich_console__(
+        self, console: "Console", options: "ConsoleOptions"
+    ) -> "RenderResult":
+        if self.new_lines:
+            new_line = Segment.line()
+            for line in self.lines:
+                yield from line
+                yield new_line
+        else:
+            for line in self.lines:
+                yield from line
+# Demo shown when the module is run as a script: prints a short explanation of
+# what a Segment is, alongside the segments rendered from a sample Text.
+if __name__ == "__main__":  # pragma: no cover
+    from pip._vendor.rich.console import Console
+    from pip._vendor.rich.syntax import Syntax
+    from pip._vendor.rich.text import Text
+    # Sample source displayed in the demo; it is shown with Syntax, never executed.
+    code = """from rich.console import Console
+console = Console()
+text = Text.from_markup("Hello, [bold magenta]World[/]!")
+console.print(text)"""
+    text = Text.from_markup("Hello, [bold magenta]World[/]!")
+    console = Console()
+    console.rule("rich.Segment")
+    console.print(
+        "A Segment is the last step in the Rich render process before generating text with ANSI codes."
+    )
+    console.print("\nConsider the following code:\n")
+    console.print(Syntax(code, "python", line_numbers=True))
+    console.print()
+    console.print(
+        "When you call [b]print()[/b], Rich [i]renders[/i] the object in to the following:\n"
+    )
+    # Materialize what the console renders for the text so it can be printed.
+    fragments = list(console.render(text))
+    console.print(fragments)
+    console.print()
+    console.print("The Segments are then processed to produce the following output:\n")
+    console.print(text)
+    console.print(
+        "\nYou will only need to know this if you are implementing your own Rich renderables."
+    )

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/arcee/__init__.py ADDED Viewed

	@@ -0,0 +1,27 @@

+# Copyright 2025 Arcee AI and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+# Type checkers get the real star imports; at runtime the module object is swapped
+# for a _LazyModule built from the import structure derived from this file.
+if TYPE_CHECKING:
+    from .configuration_arcee import *
+    from .modeling_arcee import *
+else:
+    import sys
+    _file = globals()["__file__"]
+    # Replace this module's entry in sys.modules with the lazy wrapper.
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/arcee/configuration_arcee.py ADDED Viewed

	@@ -0,0 +1,100 @@

+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/arcee/modular_arcee.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_arcee.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2025 Arcee AI and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from huggingface_hub.dataclasses import strict
+from transformers.utils import auto_docstring
+from ...configuration_utils import PreTrainedConfig
+from ...modeling_rope_utils import RopeParameters
+@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
+@strict
+class ArceeConfig(PreTrainedConfig):
+    r"""
+    ```python
+    >>> from transformers import ArceeModel, ArceeConfig
+    >>> # Initializing an Arcee AFM-4.5B-Base style configuration
+    >>> configuration = ArceeConfig()
+    >>> # Initializing a model from the AFM-4.5B-Base style configuration
+    >>> model = ArceeModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "arcee"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Maps module-name patterns to a "colwise"/"rowwise" tag.
+    # NOTE(review): presumably the tensor-parallel sharding plan; confirm against PreTrainedConfig.
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    # Maps each top-level submodule to a pair of name lists.
+    # NOTE(review): presumably (inputs, outputs) for pipeline parallelism; confirm.
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+    # Configuration fields with their default values.
+    vocab_size: int = 32000
+    hidden_size: int = 2560
+    intermediate_size: int = 18432
+    num_hidden_layers: int = 32
+    num_attention_heads: int = 32
+    # None is replaced by num_attention_heads in __post_init__.
+    num_key_value_heads: int | None = None
+    hidden_act: str = "relu2"
+    max_position_embeddings: int = 4096
+    initializer_range: float = 0.02
+    rms_norm_eps: float = 1e-5
+    use_cache: bool = True
+    pad_token_id: int | None = None
+    bos_token_id: int | None = 128000
+    eos_token_id: int | list[int] | None = 128001
+    tie_word_embeddings: bool = False
+    rope_parameters: RopeParameters | dict | None = None
+    attention_bias: bool = False
+    attention_dropout: float | int = 0.0
+    mlp_bias: bool = False
+    # None is replaced by hidden_size // num_attention_heads in __post_init__.
+    head_dim: int | None = None
+    def __post_init__(self, **kwargs):
+        """Fill in the derived defaults for ``head_dim`` and ``num_key_value_heads``,
+        then delegate to the parent ``__post_init__``."""
+        if self.head_dim is None:
+            self.head_dim = self.hidden_size // self.num_attention_heads
+        if self.num_key_value_heads is None:
+            self.num_key_value_heads = self.num_attention_heads
+        super().__post_init__(**kwargs)
+    def validate_architecture(self):
+        """Part of `@strict`-powered validation. Validates the architecture of the config.
+        Raises:
+            ValueError: If ``hidden_size`` is not a multiple of ``num_attention_heads``.
+        """
+        if self.hidden_size % self.num_attention_heads != 0:
+            raise ValueError(
+                f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({self.num_attention_heads})."
+            )
+__all__ = ["ArceeConfig"]

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/arcee/modeling_arcee.py ADDED Viewed

	@@ -0,0 +1,520 @@

+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/arcee/modular_arcee.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_arcee.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2025 Arcee AI and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from collections.abc import Callable
+from typing import Optional
+import torch
+from torch import nn
+from transformers.utils import auto_docstring
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
+from ...masking_utils import create_causal_mask
+from ...modeling_layers import (
+    GenericForQuestionAnswering,
+    GenericForSequenceClassification,
+    GenericForTokenClassification,
+    GradientCheckpointingLayer,
+)
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, can_return_tuple
+from ...utils.generic import maybe_autocast, merge_with_config_defaults
+from ...utils.output_capturing import capture_outputs
+from .configuration_arcee import ArceeConfig
+class ArceeMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
+        self.act_fn = ACT2FN[config.hidden_act]
+    def forward(self, x):
+        return self.down_proj(self.act_fn(self.up_proj(x)))
+@use_kernel_forward_from_hub("RMSNorm")
+class ArceeRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps: float = 1e-6) -> None:
+        """
+        ArceeRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+class ArceeRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+    def __init__(self, config: ArceeConfig, device=None):
+        super().__init__()
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+        self.config = config
+        self.rope_type = self.config.rope_parameters["rope_type"]
+        rope_init_fn: Callable = self.compute_default_rope_parameters
+        if self.rope_type != "default":
+            rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
+    @staticmethod
+    def compute_default_rope_parameters(
+        config: ArceeConfig | None = None,
+        device: Optional["torch.device"] = None,
+        seq_len: int | None = None,
+    ) -> tuple["torch.Tensor", float]:
+        """
+        Computes the inverse frequencies according to the original RoPE implementation
+        Args:
+            config ([`~transformers.PreTrainedConfig`]):
+                The model configuration.
+            device (`torch.device`):
+                The device to use for initialization of the inverse frequencies.
+            seq_len (`int`, *optional*):
+                The current sequence length. Unused for this type of RoPE.
+        Returns:
+            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
+        """
+        base = config.rope_parameters["rope_theta"]
+        dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
+        attention_factor = 1.0  # Unused in this type of RoPE
+        # Compute the inverse frequencies
+        inv_freq = 1.0 / (
+            base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)
+        )
+        return inv_freq, attention_factor
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+@use_kernel_func_from_hub("rotary_pos_emb")
+def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: torch.Tensor | None,
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        attn_weights = attn_weights + attention_mask
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    return attn_output, attn_weights
+@use_kernelized_func(apply_rotary_pos_emb)
+class ArceeAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+    def __init__(self, config: ArceeConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
+        attention_mask: torch.Tensor | None = None,
+        past_key_values: Cache | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        if past_key_values is not None:
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)
+        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
+            self.config._attn_implementation, eager_attention_forward
+        )
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+class ArceeDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: ArceeConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = ArceeAttention(config=config, layer_idx=layer_idx)
+        self.mlp = ArceeMLP(config)
+        self.input_layernorm = ArceeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = ArceeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        use_cache: bool | None = False,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+@auto_docstring
+class ArceePreTrainedModel(PreTrainedModel):
+    # Shared base for every Arcee model class in this file. It declares only class-level settings;
+    # all behavior comes from PreTrainedModel.
+    config: ArceeConfig
+    # Attribute name under which head models store the backbone (ArceeForCausalLM uses `self.model`).
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    # presumably tells device-map placement to keep each decoder layer on a single device -- TODO confirm
+    _no_split_modules = ["ArceeDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    # Capability flags read by the base class.
+    # NOTE(review): the exact gating each flag performs is not visible here; confirm against PreTrainedModel.
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    # Maps an output name to the module class whose outputs are recorded under that name.
+    _can_record_outputs = {
+        "hidden_states": ArceeDecoderLayer,
+        "attentions": ArceeAttention,
+    }
+@auto_docstring
+class ArceeModel(ArceePreTrainedModel):
+    def __init__(self, config: ArceeConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [ArceeDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = ArceeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = ArceeRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+    @merge_with_config_defaults
+    @capture_outputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        use_cache: bool | None = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        if inputs_embeds is None:
+            inputs_embeds: torch.Tensor = self.embed_tokens(input_ids)
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+        if position_ids is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            position_ids = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device) + past_seen_tokens
+            position_ids = position_ids.unsqueeze(0)
+        causal_mask = create_causal_mask(
+            config=self.config,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+        hidden_states = inputs_embeds
+        position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids)
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_embeddings=position_embeddings,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                **kwargs,
+            )
+        hidden_states = self.norm(hidden_states)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
+class ArceeForCausalLM(ArceePreTrainedModel, GenerationMixin):
+    # Language-model head on top of ArceeModel: a bias-free linear layer from hidden_size to vocab_size.
+    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
+    _tp_plan = {"lm_head": "colwise_gather_output"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = ArceeModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: Cache | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        labels: torch.LongTensor | None = None,
+        use_cache: bool | None = None,
+        logits_to_keep: int | torch.Tensor = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        Example:
+        ```python
+        >>> from transformers import AutoTokenizer, ArceeForCausalLM
+        >>> model = ArceeForCausalLM.from_pretrained("arcee-ai/AFM-4.5B")
+        >>> tokenizer = AutoTokenizer.from_pretrained("arcee-ai/AFM-4.5B")
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> # The output below is illustrative; the exact continuation depends on the checkpoint.
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            **kwargs,
+        )
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        # An int keeps the last `logits_to_keep` positions; 0 yields slice(0, None), i.e. every position.
+        # Any non-int value is used directly as the index along the sequence dimension.
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        loss = None
+        # The loss is computed only when labels are supplied; otherwise `loss` stays None.
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
+class ArceeForSequenceClassification(GenericForSequenceClassification, ArceePreTrainedModel):
+    # No overrides: behavior comes from GenericForSequenceClassification combined with the Arcee base class.
+    pass
+@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
+class ArceeForQuestionAnswering(GenericForQuestionAnswering, ArceePreTrainedModel):
+    # Only override: the backbone attribute name differs from the "model" prefix set on ArceePreTrainedModel.
+    base_model_prefix = "transformer"  # For BC, where `transformer` was used instead of `model`
+@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
+class ArceeForTokenClassification(GenericForTokenClassification, ArceePreTrainedModel):
+    # No overrides: behavior comes from GenericForTokenClassification combined with the Arcee base class.
+    pass
+__all__ = [
+    "ArceeForCausalLM",
+    "ArceeForQuestionAnswering",
+    "ArceeForSequenceClassification",
+    "ArceeForTokenClassification",
+    "ArceeModel",
+    "ArceePreTrainedModel",
+]

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/arcee/modular_arcee.py ADDED Viewed

	@@ -0,0 +1,117 @@

+# Copyright 2025 Arcee AI and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Arcee model."""
+from huggingface_hub.dataclasses import strict
+from transformers.utils import auto_docstring, logging
+from ...modeling_rope_utils import RopeParameters
+from ..llama.configuration_llama import LlamaConfig
+from ..llama.modeling_llama import (
+    LlamaForCausalLM,
+    LlamaForQuestionAnswering,
+    LlamaForSequenceClassification,
+    LlamaForTokenClassification,
+)
+from ..nemotron.modeling_nemotron import NemotronMLP
+logger = logging.get_logger(__name__)
+@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
+@strict
+class ArceeConfig(LlamaConfig):
+    r"""
+    ```python
+    >>> from transformers import ArceeModel, ArceeConfig
+    >>> # Initializing an Arcee AFM-4.5B-Base style configuration
+    >>> configuration = ArceeConfig()
+    >>> # Initializing a model from the AFM-4.5B-Base style configuration
+    >>> model = ArceeModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    # Modular source of the Arcee configuration: LlamaConfig with Arcee-specific defaults.
+    model_type = "arcee"
+    # Sharding style per module-name pattern; the MLP entries list only `up_proj` and `down_proj`.
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    vocab_size: int = 32000
+    hidden_size: int = 2560
+    intermediate_size: int = 18432
+    num_hidden_layers: int = 32
+    num_attention_heads: int = 32
+    num_key_value_heads: int | None = None
+    hidden_act: str = "relu2"
+    max_position_embeddings: int = 4096
+    initializer_range: float = 0.02
+    rms_norm_eps: float = 1e-5
+    use_cache: bool = True
+    pad_token_id: int | None = None
+    # NOTE(review): the default bos/eos ids (128000 / 128001) are larger than the default vocab_size (32000);
+    # confirm this mismatch between defaults is intended.
+    bos_token_id: int | None = 128000
+    eos_token_id: int | list[int] | None = 128001
+    tie_word_embeddings: bool = False
+    rope_parameters: RopeParameters | dict | None = None
+    attention_bias: bool = False
+    attention_dropout: float | int = 0.0
+    mlp_bias: bool = False
+    head_dim: int | None = None
+    # presumably the modular converter's marker for dropping the attribute inherited from LlamaConfig;
+    # the generated configuration_arcee.py has no `pretraining_tp` field -- TODO confirm
+    pretraining_tp = AttributeError()
+class ArceeMLP(NemotronMLP):
+    # Reuses NemotronMLP unchanged under the Arcee name.
+    pass
+@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
+class ArceeForCausalLM(LlamaForCausalLM):
+    # Reuses LlamaForCausalLM unchanged; only the name and the documented checkpoint differ.
+    pass
+@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
+class ArceeForSequenceClassification(LlamaForSequenceClassification):
+    # Reuses LlamaForSequenceClassification unchanged; only the name and the documented checkpoint differ.
+    pass
+@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
+class ArceeForQuestionAnswering(LlamaForQuestionAnswering):
+    # Reuses LlamaForQuestionAnswering unchanged; only the name and the documented checkpoint differ.
+    pass
+@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
+class ArceeForTokenClassification(LlamaForTokenClassification):
+    # Reuses LlamaForTokenClassification unchanged; only the name and the documented checkpoint differ.
+    pass
+__all__ = [
+    "ArceeConfig",
+    "ArceeForCausalLM",
+    "ArceeForQuestionAnswering",
+    "ArceeForSequenceClassification",
+    "ArceeForTokenClassification",
+    "ArceeModel",  # noqa: F822
+    "ArceePreTrainedModel",  # noqa: F822
+]

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/__init__.py ADDED Viewed

	@@ -0,0 +1,27 @@

+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+# Under static type checking, import both submodules directly so their public names are visible to the checker.
+if TYPE_CHECKING:
+    from .configuration_bigbird_pegasus import *
+    from .modeling_bigbird_pegasus import *
+else:
+    import sys
+    _file = globals()["__file__"]
+    # At runtime, replace this package's entry in sys.modules with a _LazyModule built from the import
+    # structure derived from this file (presumably deferring submodule imports until first use -- TODO confirm).
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py ADDED Viewed

	@@ -0,0 +1,93 @@

+# Copyright Google Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BigBirdPegasus model configuration"""
+from huggingface_hub.dataclasses import strict
+from ...configuration_utils import PreTrainedConfig
+from ...utils import auto_docstring
+@auto_docstring(checkpoint="google/bigbird-pegasus-large-arxiv")
+@strict
+class BigBirdPegasusConfig(PreTrainedConfig):
+    r"""
+    attention_type (`str`, *optional*, defaults to `"block_sparse"`):
+        Whether to use block sparse attention (with n complexity) as introduced in paper or original attention
+        layer (with n^2 complexity). Possible values are `"original_full"` and `"block_sparse"`.
+    block_size (`int`, *optional*, defaults to 64):
+        Size of each block. Useful only when `attention_type == "block_sparse"`.
+    num_random_blocks (`int`, *optional*, defaults to 3):
+        Each query is going to attend these many number of random blocks. Useful only when `attention_type ==
+        "block_sparse"`.
+    use_bias (`bool`, *optional*, defaults to `False`):
+        Whether to use bias in query, key, value.
+    Example:
+    ```python
+    >>> from transformers import BigBirdPegasusConfig, BigBirdPegasusModel
+    >>> # Initializing a BigBirdPegasus bigbird-pegasus-base style configuration
+    >>> configuration = BigBirdPegasusConfig()
+    >>> # Initializing a model (with random weights) from the bigbird-pegasus-base style configuration
+    >>> model = BigBirdPegasusModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "bigbird_pegasus"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # presumably aliases generic attribute names (keys) to this config's own field names (values) -- TODO confirm
+    attribute_map = {
+        "num_attention_heads": "encoder_attention_heads",
+        "hidden_size": "d_model",
+        "attention_probs_dropout_prob": "attention_dropout",
+        "num_hidden_layers": "encoder_layers",
+    }
+    vocab_size: int = 96103
+    max_position_embeddings: int = 4096
+    # Encoder and decoder stacks are sized independently.
+    encoder_layers: int = 16
+    encoder_ffn_dim: int = 4096
+    encoder_attention_heads: int = 16
+    decoder_layers: int = 16
+    decoder_ffn_dim: int = 4096
+    decoder_attention_heads: int = 16
+    encoder_layerdrop: float | int = 0.0
+    decoder_layerdrop: float | int = 0.0
+    use_cache: bool = True
+    is_encoder_decoder: bool = True
+    activation_function: str = "gelu_new"
+    d_model: int = 1024
+    dropout: float | int = 0.1
+    attention_dropout: float | int = 0.0
+    activation_dropout: float | int = 0.0
+    init_std: float = 0.02
+    decoder_start_token_id: int = 2
+    classifier_dropout: float | int = 0.0
+    scale_embedding: bool = True
+    pad_token_id: int | None = 0
+    bos_token_id: int | None = 2
+    eos_token_id: int | list[int] | None = 1
+    attention_type: str = "block_sparse"  # only for encoder
+    block_size: int = 64
+    num_random_blocks: int = 3
+    use_bias: bool = False
+    is_decoder: bool = False
+    tie_word_embeddings: bool = True
+__all__ = ["BigBirdPegasusConfig"]

LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py ADDED Viewed

The diff for this file is too large to render. See raw diff

LTA_openwebtext_dualt/mini_owt_logdirichlet/cache/owt_t5_llmclean_qwen36_35b_articlefull_pack1023_10k_rejected_docs.txt ADDED Viewed

The diff for this file is too large to render. See raw diff