JinghuiLuAstronaut committed on
Commit
11a66f3
·
verified ·
1 Parent(s): 00a21bd

Add files using upload-large-folder tool

Browse files
Files changed (20) hide show
  1. LTA_openwebtext_dualt/logs/eval_fixedwrong70/ema_s128_finalfreq_topp.log +12 -0
  2. LTA_openwebtext_dualt/logs/eval_fixedwrong70/ema_s128_topp_t2p0.log +12 -0
  3. LTA_openwebtext_dualt/logs/eval_fixedwrong70/fixedwrong70_step54000_ema_dirres_n32_s256.log +20 -0
  4. LTA_openwebtext_dualt/logs/eval_fixedwrong70/fixedwrong70_step91000_online_dirres_n16_s128.log +12 -0
  5. LTA_openwebtext_dualt/logs/eval_fixedwrong70/step91000_ema_base_argmax.log +12 -0
  6. LTA_openwebtext_dualt/logs/eval_fixedwrong70/step91000_ema_topp_t1p5.log +5 -0
  7. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35/lib/python3.12/site-packages/pip/_vendor/rich/_emoji_codes.py +0 -0
  8. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35/lib/python3.12/site-packages/pip/_vendor/rich/cells.py +154 -0
  9. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35/lib/python3.12/site-packages/pip/_vendor/rich/file_proxy.py +57 -0
  10. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35/lib/python3.12/site-packages/pip/_vendor/rich/protocol.py +42 -0
  11. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35/lib/python3.12/site-packages/pip/_vendor/rich/screen.py +54 -0
  12. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35/lib/python3.12/site-packages/pip/_vendor/rich/segment.py +739 -0
  13. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/arcee/__init__.py +27 -0
  14. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/arcee/configuration_arcee.py +100 -0
  15. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/arcee/modeling_arcee.py +520 -0
  16. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/arcee/modular_arcee.py +117 -0
  17. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/__init__.py +27 -0
  18. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +93 -0
  19. LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +0 -0
  20. LTA_openwebtext_dualt/mini_owt_logdirichlet/cache/owt_t5_llmclean_qwen36_35b_articlefull_pack1023_10k_rejected_docs.txt +0 -0
LTA_openwebtext_dualt/logs/eval_fixedwrong70/ema_s128_finalfreq_topp.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [ckpt] runs/lta_owt_gpt2cached_len1024_fixedwrong70_c1024_ddit768x12_muon_ema_gbs512_8gpu_1m_20260513_171557/eval_snapshot_step_0054000_ema.pt step=54000
2
+ [decode-base] n=16 max_len=1024 steps=128 model_t=flow
3
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/1.3/k64/p0.95 freq_penalty=0.8/0/0.002 start_t=0 start_init=noise generated 2/16
4
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/1.3/k64/p0.95 freq_penalty=0.8/0/0.002 start_t=0 start_init=noise generated 4/16
5
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/1.3/k64/p0.95 freq_penalty=0.8/0/0.002 start_t=0 start_init=noise generated 6/16
6
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/1.3/k64/p0.95 freq_penalty=0.8/0/0.002 start_t=0 start_init=noise generated 8/16
7
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/1.3/k64/p0.95 freq_penalty=0.8/0/0.002 start_t=0 start_init=noise generated 10/16
8
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/1.3/k64/p0.95 freq_penalty=0.8/0/0.002 start_t=0 start_init=noise generated 12/16
9
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/1.3/k64/p0.95 freq_penalty=0.8/0/0.002 start_t=0 start_init=noise generated 14/16
10
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/1.3/k64/p0.95 freq_penalty=0.8/0/0.002 start_t=0 start_init=noise generated 16/16
11
+ [summary] {"type": "summary", "checkpoint": "runs/lta_owt_gpt2cached_len1024_fixedwrong70_c1024_ddit768x12_muon_ema_gbs512_8gpu_1m_20260513_171557/eval_snapshot_step_0054000_ema.pt", "step": 54000, "decode": {"steps": 128, "model_t_mode": "flow", "decode_rule": "dirichlet_resample", "support_power": 1.0, "semantic_power": 1.5, "anchor_mode": "state", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 1024.0, "target_prob": 1.0, "endpoint_temp": 1.45, "final_from": "blend", "final_sample_mode": "topp", "final_sample_temp": 1.3, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.8, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.002, "lock_bos": false, "n_samples": 16, "seed": 20260514}, "raw_genppl": {"ppl": 338.15488842203416, "nll_per_token": 5.8235040402879905, "tokens": 4080, "kept_samples": 16, "total_samples": 16, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 354.25428820863135, "nll_per_token": 5.870014983532475, "tokens": 4080, "kept_samples": 16, "total_samples": 16, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 4.685037354092487, "unique_tokens": 3644, "token_count": 16384, "distinct_1": 0.222412109375, "distinct_2": 0.5967741935483871, "top_token_mass": 0.06591796875}}
12
+ [done] docs/lta_samples/metrics_20260514/fixedwrong70_decode_sweep_fast/ema_s128_finalfreq_topp.jsonl
LTA_openwebtext_dualt/logs/eval_fixedwrong70/ema_s128_topp_t2p0.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [ckpt] runs/lta_owt_gpt2cached_len1024_fixedwrong70_c1024_ddit768x12_muon_ema_gbs512_8gpu_1m_20260513_171557/eval_snapshot_step_0054000_ema.pt step=54000
2
+ [decode-base] n=16 max_len=1024 steps=128 model_t=flow
3
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/2/k64/p0.97 freq_penalty=0/0/0 start_t=0 start_init=noise generated 2/16
4
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/2/k64/p0.97 freq_penalty=0/0/0 start_t=0 start_init=noise generated 4/16
5
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/2/k64/p0.97 freq_penalty=0/0/0 start_t=0 start_init=noise generated 6/16
6
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/2/k64/p0.97 freq_penalty=0/0/0 start_t=0 start_init=noise generated 8/16
7
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/2/k64/p0.97 freq_penalty=0/0/0 start_t=0 start_init=noise generated 10/16
8
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/2/k64/p0.97 freq_penalty=0/0/0 start_t=0 start_init=noise generated 12/16
9
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/2/k64/p0.97 freq_penalty=0/0/0 start_t=0 start_init=noise generated 14/16
10
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/2/k64/p0.97 freq_penalty=0/0/0 start_t=0 start_init=noise generated 16/16
11
+ [summary] {"type": "summary", "checkpoint": "runs/lta_owt_gpt2cached_len1024_fixedwrong70_c1024_ddit768x12_muon_ema_gbs512_8gpu_1m_20260513_171557/eval_snapshot_step_0054000_ema.pt", "step": 54000, "decode": {"steps": 128, "model_t_mode": "flow", "decode_rule": "dirichlet_resample", "support_power": 1.0, "semantic_power": 1.5, "anchor_mode": "state", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 1024.0, "target_prob": 1.0, "endpoint_temp": 1.45, "final_from": "blend", "final_sample_mode": "topp", "final_sample_temp": 2.0, "final_top_k": 64, "final_top_p": 0.97, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 16, "seed": 20260514}, "raw_genppl": {"ppl": 78517.19514691229, "nll_per_token": 11.271072926240809, "tokens": 4080, "kept_samples": 16, "total_samples": 16, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 76752.70527879382, "nll_per_token": 11.248343912760417, "tokens": 4080, "kept_samples": 16, "total_samples": 16, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 6.619591836259636, "unique_tokens": 11908, "token_count": 16384, "distinct_1": 0.726806640625, "distinct_2": 0.9923020527859238, "top_token_mass": 0.0205078125}}
12
+ [done] docs/lta_samples/metrics_20260514/fixedwrong70_decode_sweep_fast/ema_s128_topp_t2p0.jsonl
LTA_openwebtext_dualt/logs/eval_fixedwrong70/fixedwrong70_step54000_ema_dirres_n32_s256.log ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [ckpt] runs/lta_owt_gpt2cached_len1024_fixedwrong70_c1024_ddit768x12_muon_ema_gbs512_8gpu_1m_20260513_171557/eval_snapshot_step_0054000_ema.pt step=54000
2
+ [decode-base] n=32 max_len=1024 steps=256 model_t=flow
3
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 2/32
4
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 4/32
5
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 6/32
6
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 8/32
7
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 10/32
8
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 12/32
9
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 14/32
10
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 16/32
11
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 18/32
12
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 20/32
13
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 22/32
14
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 24/32
15
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 26/32
16
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 28/32
17
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 30/32
18
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 32/32
19
+ [summary] {"type": "summary", "checkpoint": "runs/lta_owt_gpt2cached_len1024_fixedwrong70_c1024_ddit768x12_muon_ema_gbs512_8gpu_1m_20260513_171557/eval_snapshot_step_0054000_ema.pt", "step": 54000, "decode": {"steps": 256, "model_t_mode": "flow", "decode_rule": "dirichlet_resample", "support_power": 1.0, "semantic_power": 1.5, "anchor_mode": "state", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 1024.0, "target_prob": 1.0, "endpoint_temp": 1.45, "final_from": "blend", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 32, "seed": 20260514}, "raw_genppl": {"ppl": 7.839044812727041, "nll_per_token": 2.0591169918284695, "tokens": 8160, "kept_samples": 32, "total_samples": 32, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 11.212826752402886, "nll_per_token": 2.4170583687576594, "tokens": 8160, "kept_samples": 32, "total_samples": 32, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 2.419525574410852, "unique_tokens": 236, "token_count": 32768, "distinct_1": 0.0072021484375, "distinct_2": 0.07138929618768329, "top_token_mass": 0.23388671875}}
20
+ [done] docs/lta_samples/metrics_20260514/fixedwrong70_latest_quick/fixedwrong70_step54000_ema_dirres_n32_s256.jsonl
LTA_openwebtext_dualt/logs/eval_fixedwrong70/fixedwrong70_step91000_online_dirres_n16_s128.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [ckpt] runs/lta_owt_gpt2cached_len1024_fixedwrong70_c1024_ddit768x12_muon_ema_gbs512_8gpu_1m_20260513_171557/eval_snapshot_step_0091000_online.pt step=91000
2
+ [decode-base] n=16 max_len=1024 steps=128 model_t=flow
3
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 2/16
4
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 4/16
5
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 6/16
6
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 8/16
7
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 10/16
8
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 12/16
9
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 14/16
10
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 16/16
11
+ [summary] {"type": "summary", "checkpoint": "runs/lta_owt_gpt2cached_len1024_fixedwrong70_c1024_ddit768x12_muon_ema_gbs512_8gpu_1m_20260513_171557/eval_snapshot_step_0091000_online.pt", "step": 91000, "decode": {"steps": 128, "model_t_mode": "flow", "decode_rule": "dirichlet_resample", "support_power": 1.0, "semantic_power": 1.5, "anchor_mode": "state", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 1024.0, "target_prob": 1.0, "endpoint_temp": 1.45, "final_from": "blend", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 16, "seed": 20260514}, "raw_genppl": {"ppl": 6.635170922139129, "nll_per_token": 1.892384428136489, "tokens": 4080, "kept_samples": 16, "total_samples": 16, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 8.932796631820791, "nll_per_token": 2.189729518516391, "tokens": 4080, "kept_samples": 16, "total_samples": 16, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 2.0834674523881906, "unique_tokens": 143, "token_count": 16384, "distinct_1": 0.00872802734375, "distinct_2": 0.05663489736070381, "top_token_mass": 0.27642822265625}}
12
+ [done] docs/lta_samples/metrics_20260514/fixedwrong70_latest_step91000/fixedwrong70_step91000_online_dirres_n16_s128.jsonl
LTA_openwebtext_dualt/logs/eval_fixedwrong70/step91000_ema_base_argmax.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [ckpt] runs/lta_owt_gpt2cached_len1024_fixedwrong70_c1024_ddit768x12_muon_ema_gbs512_8gpu_1m_20260513_171557/eval_snapshot_step_0091000_ema.pt step=91000
2
+ [decode-base] n=16 max_len=1024 steps=128 model_t=flow
3
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 2/16
4
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 4/16
5
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 6/16
6
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 8/16
7
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 10/16
8
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 12/16
9
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 14/16
10
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=argmax/1/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 16/16
11
+ [summary] {"type": "summary", "checkpoint": "runs/lta_owt_gpt2cached_len1024_fixedwrong70_c1024_ddit768x12_muon_ema_gbs512_8gpu_1m_20260513_171557/eval_snapshot_step_0091000_ema.pt", "step": 91000, "decode": {"steps": 128, "model_t_mode": "flow", "decode_rule": "dirichlet_resample", "support_power": 1.0, "semantic_power": 1.5, "anchor_mode": "state", "cfg_scale": 0.0, "cfg_power": 1.0, "cfg_start": 0.0, "cfg_prior": "uniform", "decode_freq_penalty_alpha": 0.0, "decode_freq_penalty_beta": 0.0, "decode_freq_penalty_floor": 0.0, "decode_freq_penalty_start": 0.0, "decode_freq_penalty_end": 1.0, "decode_freq_penalty_power": 1.0, "start_t": 0.0, "start_init": "noise", "noise_init": "dirichlet", "noise_sigma": -1.0, "dirichlet_concentration": 1.0, "concentration_min": 1.0, "concentration_max": 1024.0, "target_prob": 1.0, "endpoint_temp": 1.45, "final_from": "blend", "final_sample_mode": "argmax", "final_sample_temp": 1.0, "final_top_k": 64, "final_top_p": 0.95, "final_freq_penalty_alpha": 0.0, "final_freq_penalty_beta": 0.0, "final_freq_penalty_floor": 0.0, "lock_bos": false, "n_samples": 16, "seed": 20260514}, "raw_genppl": {"ppl": 9.15153130996079, "nll_per_token": 2.2139212215647976, "tokens": 4080, "kept_samples": 16, "total_samples": 16, "empty_rate": 0.0, "skipped_samples": 0}, "stripped_genppl": {"ppl": 11.897193857793793, "nll_per_token": 2.4763025620404413, "tokens": 4080, "kept_samples": 16, "total_samples": 16, "empty_rate": 0.0, "skipped_samples": 0}, "diversity": {"sample_entropy": 2.4467596776773615, "unique_tokens": 209, "token_count": 16384, "distinct_1": 0.01275634765625, "distinct_2": 0.09567448680351906, "top_token_mass": 0.2486572265625}}
12
+ [done] docs/lta_samples/metrics_20260514/fixedwrong70_step91000_decode_sweep_fast/step91000_ema_base_argmax.jsonl
LTA_openwebtext_dualt/logs/eval_fixedwrong70/step91000_ema_topp_t1p5.log ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ [ckpt] runs/lta_owt_gpt2cached_len1024_fixedwrong70_c1024_ddit768x12_muon_ema_gbs512_8gpu_1m_20260513_171557/eval_snapshot_step_0091000_ema.pt step=91000
2
+ [decode-base] n=16 max_len=1024 steps=128 model_t=flow
3
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/1.5/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 2/16
4
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/1.5/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 4/16
5
+ [decode] temp=1.45 final=blend rule=dirichlet_resample support=1 semantic=1.5 anchor=state cfg=0/1@0:uniform decode_freq_penalty=0/0/0-1^1 final_sample=topp/1.5/k64/p0.95 freq_penalty=0/0/0 start_t=0 start_init=noise generated 6/16
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35/lib/python3.12/site-packages/pip/_vendor/rich/_emoji_codes.py ADDED
The diff for this file is too large to render. See raw diff
 
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35/lib/python3.12/site-packages/pip/_vendor/rich/cells.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from functools import lru_cache
3
+ from typing import Callable, List
4
+
5
+ from ._cell_widths import CELL_WIDTHS
6
+
7
+ # Regex to match sequence of the most common character ranges
8
+ _is_single_cell_widths = re.compile("^[\u0020-\u006f\u00a0\u02ff\u0370-\u0482]*$").match
9
+
10
+
11
@lru_cache(4096)
def cached_cell_len(text: str) -> int:
    """Get the number of cells required to display text.

    This function always caches (up to 4096 distinct strings), which may use
    up a lot of memory. It is recommended to use `cell_len` over this function.

    Args:
        text (str): Text to display.

    Returns:
        int: The number of cells required to display text.
    """
    # Bind the lookup to a local name once; it is called for every character.
    _get_size = get_character_cell_size
    return sum(_get_size(character) for character in text)
27
+
28
+
29
def cell_len(text: str, _cell_len: Callable[[str], int] = cached_cell_len) -> int:
    """Get the number of cells required to display text.

    Strings shorter than 512 characters are measured through ``_cell_len``
    (by default the caching implementation); longer strings are measured
    directly so they do not occupy the cache.

    Args:
        text (str): Text to display.
        _cell_len (Callable[[str], int]): Measuring function used for short
            strings. Defaults to ``cached_cell_len``.

    Returns:
        int: The number of cells required to display text.
    """
    if len(text) < 512:
        return _cell_len(text)
    _get_size = get_character_cell_size
    return sum(_get_size(character) for character in text)
43
+
44
+
45
@lru_cache(maxsize=4096)
def get_character_cell_size(character: str) -> int:
    """Get the cell size of a character.

    Args:
        character (str): A single character.

    Returns:
        int: Number of cells (0, 1 or 2) occupied by that character.
    """
    return _get_codepoint_cell_size(ord(character))
56
+
57
+
58
@lru_cache(maxsize=4096)
def _get_codepoint_cell_size(codepoint: int) -> int:
    """Get the cell size of a character.

    Performs a binary search over ``CELL_WIDTHS``, whose entries are unpacked
    as ``(start, end, width)`` ranges.

    Args:
        codepoint (int): Codepoint of a character.

    Returns:
        int: Number of cells (0, 1 or 2) occupied by that character.
    """
    # NOTE(review): the search assumes CELL_WIDTHS is sorted by codepoint and
    # that its ranges do not overlap -- confirm against _cell_widths.
    table = CELL_WIDTHS
    lower_bound = 0
    upper_bound = len(table) - 1
    # A bounded loop also handles an empty table, where the first probe would
    # otherwise index table[-1].
    while lower_bound <= upper_bound:
        index = (lower_bound + upper_bound) // 2
        start, end, width = table[index]
        if codepoint < start:
            upper_bound = index - 1
        elif codepoint > end:
            lower_bound = index + 1
        else:
            # A table width of -1 is reported as zero cells.
            return 0 if width == -1 else width
    # Codepoints not covered by any range occupy a single cell.
    return 1
85
+
86
+
87
def set_cell_size(text: str, total: int) -> str:
    """Set the length of a string to fit within given number of cells.

    Text narrower than ``total`` is padded on the right with spaces; wider
    text is truncated. When truncation would split a double-width character,
    that character is replaced by a single space.

    Args:
        text (str): Text to pad or crop.
        total (int): Desired width in cells.

    Returns:
        str: Text occupying exactly ``total`` cells, or an empty string when
            ``total`` is zero or negative.
    """
    # Check this first: a negative ``total`` in the fast path below would
    # slice from the end of the string (``text[:-1]``) instead of yielding "".
    if total <= 0:
        return ""

    if _is_single_cell_widths(text):
        # Every character is one cell wide, so len() is the cell width.
        size = len(text)
        if size < total:
            return text + " " * (total - size)
        return text[:total]

    cell_size = cell_len(text)
    if cell_size == total:
        return text
    if cell_size < total:
        return text + " " * (total - cell_size)

    start = 0
    end = len(text)

    # Binary search until we find the right size
    while True:
        pos = (start + end) // 2
        before = text[: pos + 1]
        before_len = cell_len(before)
        if before_len == total + 1 and cell_len(before[-1]) == 2:
            # The cut lands in the middle of a double-width character:
            # drop it and pad with one space to keep the requested width.
            return before[:-1] + " "
        if before_len == total:
            return before
        if before_len > total:
            end = pos
        else:
            start = pos
120
+
121
+
122
# TODO: This is inefficient
# TODO: This might not work with CWJ type characters
def chop_cells(text: str, max_size: int, position: int = 0) -> List[str]:
    """Break text in to equal (cell) length strings, returning the characters in reverse
    order.

    Args:
        text (str): Text to split.
        max_size (int): Maximum number of cells per returned string.
        position (int): Number of cells already counted against the first
            returned string. Defaults to 0.

    Returns:
        List[str]: Chunks built by walking ``text`` from its last character
            to its first; each chunk holds its characters in that reversed
            order.
    """
    _get_character_cell_size = get_character_cell_size
    characters = [
        (character, _get_character_cell_size(character)) for character in text
    ]
    total_size = position
    lines: List[List[str]] = [[]]
    append = lines[-1].append

    for character, size in reversed(characters):
        if total_size + size > max_size:
            # Character does not fit: start a new chunk seeded with it.
            lines.append([character])
            append = lines[-1].append
            total_size = size
        else:
            total_size += size
            append(character)

    return ["".join(line) for line in lines]
145
+
146
+
147
if __name__ == "__main__":  # pragma: no cover
    # Manual demo: print the width of an emoji, then chopped and resized
    # versions of a sample string of wide characters.
    print(get_character_cell_size("😽"))
    for line in chop_cells("""这是对亚洲语言支持的测试。面对模棱两可的想法,拒绝猜测的诱惑。""", 8):
        print(line)
    for n in range(80, 1, -1):
        print(set_cell_size("""这是对亚洲语言支持的测试。面对模棱两可的想法,拒绝猜测的诱惑。""", n) + "|")
        # Ruler of the same width, for visual comparison with the line above.
        print("x" * n)
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35/lib/python3.12/site-packages/pip/_vendor/rich/file_proxy.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ from typing import IO, TYPE_CHECKING, Any, List
3
+
4
+ from .ansi import AnsiDecoder
5
+ from .text import Text
6
+
7
+ if TYPE_CHECKING:
8
+ from .console import Console
9
+
10
+
11
class FileProxy(io.TextIOBase):
    """Wraps a file (e.g. sys.stdout) and redirects writes to a console.

    Complete lines are decoded by an ``AnsiDecoder`` and printed through the
    console; a trailing partial line is buffered until a newline arrives or
    ``flush`` is called.
    """

    def __init__(self, console: "Console", file: IO[str]) -> None:
        """Create a proxy.

        Args:
            console (Console): Console that receives the written text.
            file (IO[str]): The file being proxied; unknown attributes and
                ``fileno`` are delegated to it.
        """
        self.__console = console
        self.__file = file
        # Pieces of the current, not yet newline-terminated, line.
        self.__buffer: List[str] = []
        self.__ansi_decoder = AnsiDecoder()

    @property
    def rich_proxied_file(self) -> IO[str]:
        """Get proxied file."""
        return self.__file

    def __getattr__(self, name: str) -> Any:
        # Only reached when normal lookup fails: defer to the wrapped file.
        return getattr(self.__file, name)

    def write(self, text: str) -> int:
        """Write text, printing every completed line to the console.

        Args:
            text (str): Text to write.

        Returns:
            int: Number of characters accepted, i.e. ``len(text)``.

        Raises:
            TypeError: If ``text`` is not a ``str``.
        """
        if not isinstance(text, str):
            raise TypeError(f"write() argument must be str, not {type(text).__name__}")
        # Capture the length now: the loop below consumes ``text``, so
        # measuring it afterwards would always report 0.
        written = len(text)
        buffer = self.__buffer
        lines: List[str] = []
        while text:
            line, new_line, text = text.partition("\n")
            if new_line:
                # A newline completes whatever was buffered so far.
                lines.append("".join(buffer) + line)
                buffer.clear()
            else:
                buffer.append(line)
                break
        if lines:
            console = self.__console
            with console:
                output = Text("\n").join(
                    self.__ansi_decoder.decode_line(line) for line in lines
                )
                console.print(output)
        return written

    def flush(self) -> None:
        """Print any buffered partial line to the console and clear the buffer."""
        output = "".join(self.__buffer)
        if output:
            self.__console.print(output)
        del self.__buffer[:]

    def fileno(self) -> int:
        """Return the file descriptor of the proxied file."""
        return self.__file.fileno()
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35/lib/python3.12/site-packages/pip/_vendor/rich/protocol.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, cast, Set, TYPE_CHECKING
2
+ from inspect import isclass
3
+
4
+ if TYPE_CHECKING:
5
+ from pip._vendor.rich.console import RenderableType
6
+
7
# An attribute name no real object should define; used to spot objects that
# claim to have every attribute.
_GIBBERISH = """aihwerij235234ljsdnp34ksodfipwoe234234jlskjdf"""


def is_renderable(check_object: Any) -> bool:
    """Check if an object may be rendered by Rich.

    Strings are renderable, as is any object exposing ``__rich__`` or
    ``__rich_console__``.
    """
    if isinstance(check_object, str):
        return True
    return any(
        hasattr(check_object, protocol_method)
        for protocol_method in ("__rich__", "__rich_console__")
    )
17
+
18
+
19
def rich_cast(renderable: object) -> "RenderableType":
    """Resolve an object to a renderable by following its ``__rich__`` chain.

    Args:
        renderable (object): A potentially renderable object

    Returns:
        object: The result of repeatedly calling ``__rich__`` until the value
            no longer provides it (or is a class), a result type repeats, or
            the object looks like it fakes every attribute, in which case its
            ``repr`` is returned.
    """
    from pip._vendor.rich.console import RenderableType

    seen_types: Set[type] = set()  # Guards against an endless __rich__ cycle
    while True:
        if not hasattr(renderable, "__rich__") or isclass(renderable):
            break
        # Objects that answer to a nonsense attribute claim to have everything;
        # their __rich__ cannot be trusted, so fall back to repr.
        if hasattr(renderable, _GIBBERISH):
            return repr(renderable)
        renderable = getattr(renderable, "__rich__")()
        produced_type = type(renderable)
        if produced_type in seen_types:
            break
        seen_types.add(produced_type)

    return cast(RenderableType, renderable)
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35/lib/python3.12/site-packages/pip/_vendor/rich/screen.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, TYPE_CHECKING
2
+
3
+ from .segment import Segment
4
+ from .style import StyleType
5
+ from ._loop import loop_last
6
+
7
+
8
+ if TYPE_CHECKING:
9
+ from .console import (
10
+ Console,
11
+ ConsoleOptions,
12
+ RenderResult,
13
+ RenderableType,
14
+ Group,
15
+ )
16
+
17
+
18
class Screen:
    """A renderable that fills the terminal screen and crops excess.

    Args:
        renderable (RenderableType): Child renderable.
        style (StyleType, optional): Optional background style. Defaults to None.
        application_mode (bool, optional): Separate lines with ``"\\n\\r"``
            instead of a plain new line. Defaults to False.
    """

    renderable: "RenderableType"

    def __init__(
        self,
        *renderables: "RenderableType",
        style: Optional[StyleType] = None,
        application_mode: bool = False,
    ) -> None:
        from pip._vendor.rich.console import Group

        self.renderable = Group(*renderables)
        self.style = style
        self.application_mode = application_mode

    def __rich_console__(
        self, console: "Console", options: "ConsoleOptions"
    ) -> "RenderResult":
        width, height = options.size

        # Only resolve a style when one was supplied.
        style = None
        if self.style:
            style = console.get_style(self.style)

        screen_options = options.update(width=width, height=height)
        rendered = console.render_lines(
            self.renderable or "", screen_options, style=style, pad=True
        )
        shaped = Segment.set_shape(rendered, width, height, style=style)

        if self.application_mode:
            separator = Segment("\n\r")
        else:
            separator = Segment.line()

        # Emit a separator between lines, but not after the final one.
        for is_last, row in loop_last(shaped):
            yield from row
            if not is_last:
                yield separator
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35/lib/python3.12/site-packages/pip/_vendor/rich/segment.py ADDED
@@ -0,0 +1,739 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import IntEnum
2
+ from functools import lru_cache
3
+ from itertools import filterfalse
4
+ from logging import getLogger
5
+ from operator import attrgetter
6
+ from typing import (
7
+ TYPE_CHECKING,
8
+ Dict,
9
+ Iterable,
10
+ List,
11
+ NamedTuple,
12
+ Optional,
13
+ Sequence,
14
+ Tuple,
15
+ Type,
16
+ Union,
17
+ )
18
+
19
+ from .cells import (
20
+ _is_single_cell_widths,
21
+ cached_cell_len,
22
+ cell_len,
23
+ get_character_cell_size,
24
+ set_cell_size,
25
+ )
26
+ from .repr import Result, rich_repr
27
+ from .style import Style
28
+
29
+ if TYPE_CHECKING:
30
+ from .console import Console, ConsoleOptions, RenderResult
31
+
32
log = getLogger("rich")


class ControlType(IntEnum):
    """Non-printable control codes which typically translate to ANSI codes."""

    # Terminal-wide actions
    BELL = 1
    CARRIAGE_RETURN = 2
    HOME = 3
    CLEAR = 4

    # Cursor visibility and alternate screen
    SHOW_CURSOR = 5
    HIDE_CURSOR = 6
    ENABLE_ALT_SCREEN = 7
    DISABLE_ALT_SCREEN = 8

    # Cursor movement
    CURSOR_UP = 9
    CURSOR_DOWN = 10
    CURSOR_FORWARD = 11
    CURSOR_BACKWARD = 12
    CURSOR_MOVE_TO_COLUMN = 13
    CURSOR_MOVE_TO = 14

    # Miscellaneous
    ERASE_IN_LINE = 15
    SET_WINDOW_TITLE = 16


# A control code is a ControlType followed by zero, one or two parameters.
ControlCode = Union[
    Tuple[ControlType],
    Tuple[ControlType, Union[int, str]],
    Tuple[ControlType, int, int],
]
61
+
62
+
63
@rich_repr()
class Segment(NamedTuple):
    """A piece of text with associated style. Segments are produced by the Console render process and
    are ultimately converted in to strings to be written to the terminal.

    Args:
        text (str): A piece of text.
        style (:class:`~rich.style.Style`, optional): An optional style to apply to the text.
        control (Tuple[ControlCode], optional): Optional sequence of control codes.

    Attributes:
        cell_length (int): The cell length of this Segment.
    """

    text: str
    style: Optional[Style] = None
    control: Optional[Sequence[ControlCode]] = None

    @property
    def cell_length(self) -> int:
        """The number of terminal cells required to display self.text.

        Returns:
            int: A number of cells (always 0 for control segments).
        """
        text, _style, control = self
        return 0 if control else cell_len(text)

    def __rich_repr__(self) -> Result:
        yield self.text
        if self.control is None:
            if self.style is not None:
                yield self.style
        else:
            # With control codes present the style is always shown, so the
            # control value stays in its positional slot.
            yield self.style
            yield self.control

    def __bool__(self) -> bool:
        """Check if the segment contains text."""
        return bool(self.text)

    @property
    def is_control(self) -> bool:
        """Check if the segment contains control codes."""
        return self.control is not None

    @classmethod
    @lru_cache(1024 * 16)
    def _split_cells(cls, segment: "Segment", cut: int) -> Tuple["Segment", "Segment"]:
        """Split a segment at a cell offset, one character at a time.

        Results are memoised with ``lru_cache``. A character that straddles the
        cut is replaced by a space on each side so the total width is kept.

        Args:
            segment (Segment): The segment to split.
            cut (int): Cell offset at which to split.

        Returns:
            Tuple[Segment, Segment]: The text before and after the cut.
        """
        text, style, control = segment
        _Segment = Segment

        cell_length = segment.cell_length
        if cut >= cell_length:
            return segment, _Segment("", style, control)

        cell_size = get_character_cell_size

        # Proportional first guess at the character index of the cut.
        pos = int((cut / cell_length) * (len(text) - 1))
        cell_pos = cell_len(text[:pos])

        # When wide characters are concentrated at the start of the text the
        # guess can land *past* the cut. Walk back until we are at or before
        # it; otherwise the forward scan below would split at the wrong column.
        while cell_pos > cut and pos > 0:
            pos -= 1
            cell_pos -= cell_size(text[pos])

        if cell_pos == cut:
            return (
                _Segment(text[:pos], style, control),
                _Segment(text[pos:], style, control),
            )

        while pos < len(text):
            char = text[pos]
            pos += 1
            cell_pos += cell_size(char)
            if cell_pos == cut:
                return (
                    _Segment(text[:pos], style, control),
                    _Segment(text[pos:], style, control),
                )
            if cell_pos > cut:
                # ``char`` straddles the cut: swap it for two spaces.
                return (
                    _Segment(text[: pos - 1] + " ", style, control),
                    _Segment(" " + text[pos:], style, control),
                )

        raise AssertionError("Will never reach here")

    def split_cells(self, cut: int) -> Tuple["Segment", "Segment"]:
        """Split segment in to two segments at the specified column.

        If the cut point falls in the middle of a 2-cell wide character then it is replaced
        by two spaces, to preserve the display width of the parent segment.

        Args:
            cut (int): Cell offset at which to split.

        Returns:
            Tuple[Segment, Segment]: Two segments.
        """
        text, style, control = self

        if _is_single_cell_widths(text):
            # Fast path with all 1 cell characters
            if cut >= len(text):
                return self, Segment("", style, control)
            return (
                Segment(text[:cut], style, control),
                Segment(text[cut:], style, control),
            )

        return self._split_cells(self, cut)

    @classmethod
    def line(cls) -> "Segment":
        """Make a new line segment."""
        return cls("\n")

    @classmethod
    def apply_style(
        cls,
        segments: Iterable["Segment"],
        style: Optional[Style] = None,
        post_style: Optional[Style] = None,
    ) -> Iterable["Segment"]:
        """Apply style(s) to an iterable of segments.

        Returns an iterable of segments where the style is replaced by ``style + segment.style + post_style``.
        Control segments always end up with a style of None.

        Args:
            segments (Iterable[Segment]): Segments to process.
            style (Style, optional): Base style. Defaults to None.
            post_style (Style, optional): Style to apply on top of segment style. Defaults to None.

        Returns:
            Iterable[Segments]: A new iterable of segments (possibly the same iterable).
        """
        result_segments = segments
        if style:
            apply = style.__add__
            result_segments = (
                cls(text, None if control else apply(_style), control)
                for text, _style, control in result_segments
            )
        if post_style:
            result_segments = (
                cls(
                    text,
                    (
                        None
                        if control
                        else (_style + post_style if _style else post_style)
                    ),
                    control,
                )
                for text, _style, control in result_segments
            )
        return result_segments

    @classmethod
    def filter_control(
        cls, segments: Iterable["Segment"], is_control: bool = False
    ) -> Iterable["Segment"]:
        """Filter segments by ``is_control`` attribute.

        Args:
            segments (Iterable[Segment]): An iterable of Segment instances.
            is_control (bool, optional): is_control flag to match in search.

        Returns:
            Iterable[Segment]: An iterable of Segment instances.

        """
        if is_control:
            return filter(attrgetter("control"), segments)
        else:
            return filterfalse(attrgetter("control"), segments)

    @classmethod
    def split_lines(cls, segments: Iterable["Segment"]) -> Iterable[List["Segment"]]:
        """Split a sequence of segments in to a list of lines.

        Args:
            segments (Iterable[Segment]): Segments potentially containing line feeds.

        Yields:
            Iterable[List[Segment]]: Iterable of segment lists, one per line.
        """
        line: List[Segment] = []
        append = line.append

        for segment in segments:
            if "\n" in segment.text and not segment.control:
                text, style, _ = segment
                while text:
                    _text, new_line, text = text.partition("\n")
                    if _text:
                        append(cls(_text, style))
                    if new_line:
                        yield line
                        # Start a fresh list; the yielded one belongs to the caller.
                        line = []
                        append = line.append
            else:
                append(segment)
        if line:
            yield line

    @classmethod
    def split_and_crop_lines(
        cls,
        segments: Iterable["Segment"],
        length: int,
        style: Optional[Style] = None,
        pad: bool = True,
        include_new_lines: bool = True,
    ) -> Iterable[List["Segment"]]:
        """Split segments in to lines, and crop lines greater than a given length.

        Args:
            segments (Iterable[Segment]): An iterable of segments, probably
                generated from console.render.
            length (int): Desired line length.
            style (Style, optional): Style to use for any padding.
            pad (bool): Enable padding of lines that are less than `length`.
            include_new_lines (bool): Append a new line segment to every line
                that was terminated by a line feed. Defaults to True.

        Returns:
            Iterable[List[Segment]]: An iterable of lines of segments.
        """
        line: List[Segment] = []
        append = line.append

        adjust_line_length = cls.adjust_line_length
        new_line_segment = cls("\n")

        for segment in segments:
            if "\n" in segment.text and not segment.control:
                text, segment_style, _ = segment
                while text:
                    _text, new_line, text = text.partition("\n")
                    if _text:
                        append(cls(_text, segment_style))
                    if new_line:
                        cropped_line = adjust_line_length(
                            line, length, style=style, pad=pad
                        )
                        if include_new_lines:
                            cropped_line.append(new_line_segment)
                        yield cropped_line
                        # adjust_line_length returned a new list, so the
                        # working list can be reused for the next line.
                        line.clear()
            else:
                append(segment)
        if line:
            yield adjust_line_length(line, length, style=style, pad=pad)

    @classmethod
    def adjust_line_length(
        cls,
        line: List["Segment"],
        length: int,
        style: Optional[Style] = None,
        pad: bool = True,
    ) -> List["Segment"]:
        """Adjust a line to a given width (cropping or padding as required).

        Args:
            line (List[Segment]): A list of segments in a single line.
            length (int): The desired width of the line.
            style (Style, optional): The style of padding if used (space on the end). Defaults to None.
            pad (bool, optional): Pad lines with spaces if they are shorter than `length`. Defaults to True.

        Returns:
            List[Segment]: A new list of segments with the desired length.
        """
        line_length = sum(segment.cell_length for segment in line)
        new_line: List[Segment]

        if line_length < length:
            if pad:
                new_line = line + [cls(" " * (length - line_length), style)]
            else:
                new_line = line[:]
        elif line_length > length:
            new_line = []
            append = new_line.append
            line_length = 0
            for segment in line:
                segment_length = segment.cell_length
                if line_length + segment_length < length or segment.control:
                    append(segment)
                    line_length += segment_length
                else:
                    # This segment reaches the limit: crop it and stop.
                    text, segment_style, _ = segment
                    text = set_cell_size(text, length - line_length)
                    append(cls(text, segment_style))
                    break
        else:
            new_line = line[:]
        return new_line

    @classmethod
    def get_line_length(cls, line: List["Segment"]) -> int:
        """Get the length of list of segments.

        Args:
            line (List[Segment]): A line encoded as a list of Segments (assumes no ``"\\n"`` characters),

        Returns:
            int: The length of the line in cells; control segments count as 0.
        """
        _cell_len = cell_len
        return sum(_cell_len(text) for text, _style, control in line if not control)

    @classmethod
    def get_shape(cls, lines: List[List["Segment"]]) -> Tuple[int, int]:
        """Get the shape (enclosing rectangle) of a list of lines.

        Args:
            lines (List[List[Segment]]): A list of lines (no ``"\\n"`` characters).

        Returns:
            Tuple[int, int]: Width and height in characters.
        """
        get_line_length = cls.get_line_length
        max_width = max(get_line_length(line) for line in lines) if lines else 0
        return (max_width, len(lines))

    @classmethod
    def set_shape(
        cls,
        lines: List[List["Segment"]],
        width: int,
        height: Optional[int] = None,
        style: Optional[Style] = None,
        new_lines: bool = False,
    ) -> List[List["Segment"]]:
        """Set the shape of a list of lines (enclosing rectangle).

        Lines are cropped or padded to ``width``; the list itself is cropped
        or extended with blank lines to ``height``.

        Args:
            lines (List[List[Segment]]): A list of lines.
            width (int): Desired width.
            height (int, optional): Desired height or None for no change.
            style (Style, optional): Style of any padding added.
            new_lines (bool, optional): Padded lines should include ``"\\n"``. Defaults to False.

        Returns:
            List[List[Segment]]: New list of lines.
        """
        # NOTE: a height of 0 is falsy and therefore also means "no change".
        _height = height or len(lines)

        blank = (
            [cls(" " * width + "\n", style)] if new_lines else [cls(" " * width, style)]
        )

        adjust_line_length = cls.adjust_line_length
        # Only the first ``_height`` lines are kept. Iterating over all of
        # ``lines`` here would silently skip the vertical crop.
        shaped_lines = [
            adjust_line_length(line, width, style=style) for line in lines[:_height]
        ]
        if len(shaped_lines) < _height:
            shaped_lines.extend([blank] * (_height - len(shaped_lines)))
        return shaped_lines

    @classmethod
    def align_top(
        cls: Type["Segment"],
        lines: List[List["Segment"]],
        width: int,
        height: int,
        style: Style,
        new_lines: bool = False,
    ) -> List[List["Segment"]]:
        """Aligns lines to top (adds extra lines to bottom as required).

        Args:
            lines (List[List[Segment]]): A list of lines.
            width (int): Desired width.
            height (int): Desired height.
            style (Style): Style of any padding added.
            new_lines (bool, optional): Padded lines should include ``"\\n"``. Defaults to False.

        Returns:
            List[List[Segment]]: New list of lines.
        """
        extra_lines = height - len(lines)
        if not extra_lines:
            return lines[:]
        lines = lines[:height]
        blank = cls(" " * width + "\n", style) if new_lines else cls(" " * width, style)
        lines = lines + [[blank]] * extra_lines
        return lines

    @classmethod
    def align_bottom(
        cls: Type["Segment"],
        lines: List[List["Segment"]],
        width: int,
        height: int,
        style: Style,
        new_lines: bool = False,
    ) -> List[List["Segment"]]:
        """Aligns render to bottom (adds extra lines above as required).

        Args:
            lines (List[List[Segment]]): A list of lines.
            width (int): Desired width.
            height (int): Desired height.
            style (Style): Style of any padding added.
            new_lines (bool, optional): Padded lines should include ``"\\n"``. Defaults to False.

        Returns:
            List[List[Segment]]: New list of lines.
        """
        extra_lines = height - len(lines)
        if not extra_lines:
            return lines[:]
        lines = lines[:height]
        blank = cls(" " * width + "\n", style) if new_lines else cls(" " * width, style)
        lines = [[blank]] * extra_lines + lines
        return lines

    @classmethod
    def align_middle(
        cls: Type["Segment"],
        lines: List[List["Segment"]],
        width: int,
        height: int,
        style: Style,
        new_lines: bool = False,
    ) -> List[List["Segment"]]:
        """Aligns lines to middle (adds extra lines to above and below as required).

        Args:
            lines (List[List[Segment]]): A list of lines.
            width (int): Desired width.
            height (int): Desired height.
            style (Style): Style of any padding added.
            new_lines (bool, optional): Padded lines should include ``"\\n"``. Defaults to False.

        Returns:
            List[List[Segment]]: New list of lines.
        """
        extra_lines = height - len(lines)
        if not extra_lines:
            return lines[:]
        lines = lines[:height]
        blank = cls(" " * width + "\n", style) if new_lines else cls(" " * width, style)
        # An odd remainder goes to the bottom.
        top_lines = extra_lines // 2
        bottom_lines = extra_lines - top_lines
        lines = [[blank]] * top_lines + lines + [[blank]] * bottom_lines
        return lines

    @classmethod
    def simplify(cls, segments: Iterable["Segment"]) -> Iterable["Segment"]:
        """Simplify an iterable of segments by combining contiguous segments with the same style.

        Args:
            segments (Iterable[Segment]): An iterable of segments.

        Returns:
            Iterable[Segment]: A possibly smaller iterable of segments that will render the same way.
        """
        iter_segments = iter(segments)
        try:
            last_segment = next(iter_segments)
        except StopIteration:
            return

        _Segment = Segment
        for segment in iter_segments:
            if last_segment.style == segment.style and not segment.control:
                last_segment = _Segment(
                    last_segment.text + segment.text, last_segment.style
                )
            else:
                yield last_segment
                last_segment = segment
        yield last_segment

    @classmethod
    def strip_links(cls, segments: Iterable["Segment"]) -> Iterable["Segment"]:
        """Remove all links from an iterable of styles.

        Args:
            segments (Iterable[Segment]): An iterable segments.

        Yields:
            Segment: Segments with link removed.
        """
        for segment in segments:
            if segment.control or segment.style is None:
                yield segment
            else:
                text, style, _control = segment
                yield cls(text, style.update_link(None) if style else None)

    @classmethod
    def strip_styles(cls, segments: Iterable["Segment"]) -> Iterable["Segment"]:
        """Remove all styles from an iterable of segments.

        Args:
            segments (Iterable[Segment]): An iterable segments.

        Yields:
            Segment: Segments with styles replace with None
        """
        for text, _style, control in segments:
            yield cls(text, None, control)

    @classmethod
    def remove_color(cls, segments: Iterable["Segment"]) -> Iterable["Segment"]:
        """Remove all color from an iterable of segments.

        Args:
            segments (Iterable[Segment]): An iterable segments.

        Yields:
            Segment: Segments with colorless style.
        """

        # Each distinct style is converted only once per call.
        cache: Dict[Style, Style] = {}
        for text, style, control in segments:
            if style:
                colorless_style = cache.get(style)
                if colorless_style is None:
                    colorless_style = style.without_color
                    cache[style] = colorless_style
                yield cls(text, colorless_style, control)
            else:
                yield cls(text, None, control)

    @classmethod
    def divide(
        cls, segments: Iterable["Segment"], cuts: Iterable[int]
    ) -> Iterable[List["Segment"]]:
        """Divides an iterable of segments in to portions.

        Args:
            segments (Iterable[Segment]): The segments to divide.
            cuts (Iterable[int]): Cell positions where to divide.

        Yields:
            [Iterable[List[Segment]]]: An iterable of Segments in List.
        """
        split_segments: List["Segment"] = []
        add_segment = split_segments.append

        iter_cuts = iter(cuts)

        # Each leading cut at position 0 produces an empty portion.
        while True:
            cut = next(iter_cuts, -1)
            if cut == -1:
                return
            if cut != 0:
                break
            yield []
        pos = 0

        segments_clear = split_segments.clear
        segments_copy = split_segments.copy

        _cell_len = cached_cell_len
        for segment in segments:
            text, _style, control = segment
            while text:
                end_pos = pos if control else pos + _cell_len(text)
                if end_pos < cut:
                    # Segment fits entirely before the next cut.
                    add_segment(segment)
                    pos = end_pos
                    break

                if end_pos == cut:
                    # Segment ends exactly on the cut: close this portion.
                    add_segment(segment)
                    yield segments_copy()
                    segments_clear()
                    pos = end_pos

                    cut = next(iter_cuts, -1)
                    if cut == -1:
                        if split_segments:
                            yield segments_copy()
                        return

                    break

                else:
                    # Segment crosses the cut: split it and keep processing
                    # the remainder against the following cut.
                    before, segment = segment.split_cells(cut - pos)
                    text, _style, control = segment
                    add_segment(before)
                    yield segments_copy()
                    segments_clear()
                    pos = cut

                cut = next(iter_cuts, -1)
                if cut == -1:
                    if split_segments:
                        yield segments_copy()
                    return

        yield segments_copy()
656
+
657
+
658
class Segments:
    """A minimal renderable wrapping an iterable of segments. Handy when
    segments need to be printed from outside a __rich_console__ method.

    Args:
        segments (Iterable[Segment]): An iterable of segments.
        new_lines (bool, optional): Add new lines between segments. Defaults to False.
    """

    def __init__(self, segments: Iterable[Segment], new_lines: bool = False) -> None:
        self.segments = list(segments)
        self.new_lines = new_lines

    def __rich_console__(
        self, console: "Console", options: "ConsoleOptions"
    ) -> "RenderResult":
        if not self.new_lines:
            yield from self.segments
            return

        # Follow every segment with the same new line segment.
        line_break = Segment.line()
        for item in self.segments:
            yield item
            yield line_break
681
+
682
+
683
class SegmentLines:
    def __init__(self, lines: Iterable[List[Segment]], new_lines: bool = False) -> None:
        """A minimal renderable holding lines of segments. Can serve as an
        intermediate step in the rendering process.

        Args:
            lines (Iterable[List[Segment]]): Lists of segments forming lines.
            new_lines (bool, optional): Insert new lines after each line. Defaults to False.
        """
        self.lines = list(lines)
        self.new_lines = new_lines

    def __rich_console__(
        self, console: "Console", options: "ConsoleOptions"
    ) -> "RenderResult":
        if not self.new_lines:
            for row in self.lines:
                yield from row
            return

        # Terminate every line with the same new line segment.
        line_break = Segment.line()
        for row in self.lines:
            yield from row
            yield line_break
707
+
708
if __name__ == "__main__":  # pragma: no cover
    # Small demo: show the segments Rich produces for a piece of markup.
    from pip._vendor.rich.console import Console
    from pip._vendor.rich.syntax import Syntax
    from pip._vendor.rich.text import Text

    code = """from rich.console import Console
console = Console()
text = Text.from_markup("Hello, [bold magenta]World[/]!")
console.print(text)"""

    text = Text.from_markup("Hello, [bold magenta]World[/]!")

    console = Console()

    # Introduction and the sample program.
    console.rule("rich.Segment")
    console.print(
        "A Segment is the last step in the Rich render process before generating text with ANSI codes."
    )
    console.print("\nConsider the following code:\n")
    highlighted_code = Syntax(code, "python", line_numbers=True)
    console.print(highlighted_code)
    console.print()

    # The intermediate segments for the sample text.
    console.print(
        "When you call [b]print()[/b], Rich [i]renders[/i] the object in to the following:\n"
    )
    rendered_segments = list(console.render(text))
    console.print(rendered_segments)
    console.print()

    # The final output.
    console.print("The Segments are then processed to produce the following output:\n")
    console.print(text)
    console.print(
        "\nYou will only need to know this if you are implementing your own Rich renderables."
    )
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/arcee/__init__.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 Arcee AI and the HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import TYPE_CHECKING
15
+
16
+ from ...utils import _LazyModule
17
+ from ...utils.import_utils import define_import_structure
18
+
19
+
20
+ if TYPE_CHECKING:
21
+ from .configuration_arcee import *
22
+ from .modeling_arcee import *
23
+ else:
24
+ import sys
25
+
26
+ _file = globals()["__file__"]
27
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/arcee/configuration_arcee.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
2
+ # This file was automatically generated from src/transformers/models/arcee/modular_arcee.py.
3
+ # Do NOT edit this file manually as any edits will be overwritten by the generation of
4
+ # the file from the modular. If any change should be done, please apply the change to the
5
+ # modular_arcee.py file directly. One of our CI enforces this.
6
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
7
+ # Copyright 2025 Arcee AI and the HuggingFace Inc. team. All rights reserved.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+
21
+ from huggingface_hub.dataclasses import strict
22
+
23
+ from transformers.utils import auto_docstring
24
+
25
+ from ...configuration_utils import PreTrainedConfig
26
+ from ...modeling_rope_utils import RopeParameters
27
+
28
+
29
@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
@strict
class ArceeConfig(PreTrainedConfig):
    r"""
    ```python
    >>> from transformers import ArceeModel, ArceeConfig

    >>> # Initializing an Arcee AFM-4.5B-Base style configuration
    >>> configuration = ArceeConfig()

    >>> # Initializing a model from the AFM-4.5B-Base style configuration
    >>> model = ArceeModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "arcee"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Tensor-parallel layout keyed by module-name pattern.
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    # Pipeline-parallel stages: (input names, output names) per top-level module.
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    vocab_size: int = 32000
    hidden_size: int = 2560
    intermediate_size: int = 18432
    num_hidden_layers: int = 32
    num_attention_heads: int = 32
    num_key_value_heads: int | None = None
    hidden_act: str = "relu2"
    max_position_embeddings: int = 4096
    initializer_range: float = 0.02
    rms_norm_eps: float = 1e-5
    use_cache: bool = True
    pad_token_id: int | None = None
    bos_token_id: int | None = 128000
    eos_token_id: int | list[int] | None = 128001
    tie_word_embeddings: bool = False
    rope_parameters: RopeParameters | dict | None = None
    attention_bias: bool = False
    attention_dropout: float | int = 0.0
    mlp_bias: bool = False
    head_dim: int | None = None

    def __post_init__(self, **kwargs):
        # Derive the per-head width from the hidden size when not given.
        if self.head_dim is None:
            self.head_dim = self.hidden_size // self.num_attention_heads
        # Without an explicit value, use as many key/value heads as attention heads.
        if self.num_key_value_heads is None:
            self.num_key_value_heads = self.num_attention_heads

        super().__post_init__(**kwargs)

    def validate_architecture(self):
        """Part of `@strict`-powered validation. Validates the architecture of the config."""
        if self.hidden_size % self.num_attention_heads == 0:
            return
        raise ValueError(
            f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention "
            f"heads ({self.num_attention_heads})."
        )


__all__ = ["ArceeConfig"]
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/arcee/modeling_arcee.py ADDED
@@ -0,0 +1,520 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
2
+ # This file was automatically generated from src/transformers/models/arcee/modular_arcee.py.
3
+ # Do NOT edit this file manually as any edits will be overwritten by the generation of
4
+ # the file from the modular. If any change should be done, please apply the change to the
5
+ # modular_arcee.py file directly. One of our CI enforces this.
6
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
7
+ # Copyright 2025 Arcee AI and the HuggingFace Inc. team. All rights reserved.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+
21
+ from collections.abc import Callable
22
+ from typing import Optional
23
+
24
+ import torch
25
+ from torch import nn
26
+
27
+ from transformers.utils import auto_docstring
28
+
29
+ from ...activations import ACT2FN
30
+ from ...cache_utils import Cache, DynamicCache
31
+ from ...generation import GenerationMixin
32
+ from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func
33
+ from ...masking_utils import create_causal_mask
34
+ from ...modeling_layers import (
35
+ GenericForQuestionAnswering,
36
+ GenericForSequenceClassification,
37
+ GenericForTokenClassification,
38
+ GradientCheckpointingLayer,
39
+ )
40
+ from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
41
+ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
42
+ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
43
+ from ...processing_utils import Unpack
44
+ from ...utils import TransformersKwargs, can_return_tuple
45
+ from ...utils.generic import maybe_autocast, merge_with_config_defaults
46
+ from ...utils.output_capturing import capture_outputs
47
+ from .configuration_arcee import ArceeConfig
48
+
49
+
50
class ArceeMLP(nn.Module):
    """Feed-forward block: linear up-projection, activation, linear down-projection."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        hidden, inner = config.hidden_size, config.intermediate_size
        self.hidden_size = hidden
        self.intermediate_size = inner
        # Both projections share the same bias switch from the config.
        self.up_proj = nn.Linear(hidden, inner, bias=config.mlp_bias)
        self.down_proj = nn.Linear(inner, hidden, bias=config.mlp_bias)
        # Activation is looked up by name in the shared activation registry.
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        """Return `down_proj(act_fn(up_proj(x)))`."""
        widened = self.up_proj(x)
        activated = self.act_fn(widened)
        return self.down_proj(activated)
62
+
63
+
64
@use_kernel_forward_from_hub("RMSNorm")
class ArceeRMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with a learnable scale."""

    def __init__(self, hidden_size, eps: float = 1e-6) -> None:
        """Create the scale vector (initialised to ones) and store the epsilon."""
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Normalise in float32, cast back to the input dtype, then apply the scale."""
        original_dtype = hidden_states.dtype
        upcast = hidden_states.to(torch.float32)
        # Mean of squares along the last axis; epsilon guards the reciprocal square root.
        mean_square = upcast.pow(2).mean(-1, keepdim=True)
        normalised = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * normalised.to(original_dtype)

    def extra_repr(self):
        """Show the weight shape and epsilon in the module repr."""
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
83
+
84
+
85
class ArceeRotaryEmbedding(nn.Module):
    """Builds the cos/sin tables consumed by `apply_rotary_pos_emb`."""

    inv_freq: torch.Tensor  # fix linting for `register_buffer`

    def __init__(self, config: ArceeConfig, device=None):
        super().__init__()
        self.config = config
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        # "default" uses the static method below; any other type is looked up
        # in the shared RoPE initialiser registry.
        self.rope_type = self.config.rope_parameters["rope_type"]
        if self.rope_type == "default":
            rope_init_fn: Callable = self.compute_default_rope_parameters
        else:
            rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
        inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

        # Non-persistent buffers: they are not written to the state dict.
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

    @staticmethod
    def compute_default_rope_parameters(
        config: ArceeConfig | None = None,
        device: Optional["torch.device"] = None,
        seq_len: int | None = None,
    ) -> tuple["torch.Tensor", float]:
        """
        Computes the inverse frequencies for the default (unscaled) RoPE variant.
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration; `rope_parameters["rope_theta"]` is the base.
            device (`torch.device`):
                The device on which the inverse frequencies are created.
            seq_len (`int`, *optional*):
                Accepted for signature compatibility; not read here.
        Returns:
            Tuple of (`torch.Tensor`, `float`): the inverse frequencies and the scaling factor applied to
            cos/sin, which is always `1.0` for this variant.
        """
        rope_theta = config.rope_parameters["rope_theta"]
        rotary_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads

        # Even indices 0, 2, ... < rotary_dim, built as int64 and then cast to float.
        even_indices = torch.arange(0, rotary_dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float)
        inv_freq = 1.0 / (rope_theta ** (even_indices / rotary_dim))
        return inv_freq, 1.0  # scaling factor is unused in this type of RoPE

    @torch.no_grad()
    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
    def forward(self, x, position_ids):
        """Return `(cos, sin)` for `position_ids`, cast to the dtype of `x`."""
        batch = position_ids.shape[0]
        inv_freq_column = self.inv_freq[None, :, None].float().expand(batch, -1, 1).to(x.device)
        positions_row = position_ids[:, None, :].float()

        raw_device_type = x.device.type
        if isinstance(raw_device_type, str) and raw_device_type != "mps":
            device_type = raw_device_type
        else:
            device_type = "cpu"
        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
            angles = (inv_freq_column.float() @ positions_row.float()).transpose(1, 2)
            # The angle table is duplicated along the last axis before cos/sin.
            table = torch.cat((angles, angles), dim=-1)
            cos = table.cos() * self.attention_scaling
            sin = table.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
148
+
149
+
150
def rotate_half(x):
    """Swap the two halves of the last dimension, negating the second half.

    For a last dimension split as `[front, back]` the result is `[-back, front]`.
    """
    midpoint = x.shape[-1] // 2
    front, back = x[..., :midpoint], x[..., midpoint:]
    return torch.cat((-back, front), dim=-1)
155
+
156
+
157
@use_kernel_func_from_hub("rotary_pos_emb")
def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
    """Rotate the query and key tensors with the given cos/sin tables.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Axis inserted into `cos` and `sin` so they broadcast against `q` and `k`. With `cos`/`sin` of shape
            [batch_size, seq_len, head_dim], use 1 when `q`/`k` are [batch_size, heads, seq_len, head_dim] and
            2 when they are [batch_size, seq_len, heads, head_dim].
    Returns:
        `tuple(torch.Tensor)`: the rotated query and key tensors, in that order.
    """
    cos_b = cos.unsqueeze(unsqueeze_dim)
    sin_b = sin.unsqueeze(unsqueeze_dim)

    def _rotate(states):
        # Same formula for queries and keys.
        return (states * cos_b) + (rotate_half(states) * sin_b)

    return _rotate(q), _rotate(k)
181
+
182
+
183
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeat every key/value head `n_rep` times along the head axis, matching
    torch.repeat_interleave(x, dim=1, repeats=n_rep): (batch, num_key_value_heads, seqlen, head_dim)
    becomes (batch, num_key_value_heads * n_rep, seqlen, head_dim). With `n_rep == 1` the input
    tensor itself is returned.
    """
    batch, kv_heads, seq_len, dim = hidden_states.shape
    if n_rep != 1:
        # Insert a repeat axis after the head axis, broadcast it, then fold it into the heads.
        widened = hidden_states[:, :, None, :, :].expand(batch, kv_heads, n_rep, seq_len, dim)
        hidden_states = widened.reshape(batch, kv_heads * n_rep, seq_len, dim)
    return hidden_states
193
+
194
+
195
def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor | None,
    scaling: float,
    dropout: float = 0.0,
    **kwargs: Unpack[TransformersKwargs],
):
    """Plain matmul/softmax attention.

    Keys and values are first expanded with `repeat_kv` by `module.num_key_value_groups`.
    `attention_mask`, when given, is added to the scaled scores before the softmax.
    Returns `(attn_output, attn_weights)`; the output has its axes 1 and 2 swapped
    relative to the query layout and is made contiguous.
    """
    groups = module.num_key_value_groups
    expanded_keys = repeat_kv(key, groups)
    expanded_values = repeat_kv(value, groups)

    scores = torch.matmul(query, expanded_keys.transpose(2, 3)) * scaling
    if attention_mask is not None:
        scores = scores + attention_mask

    # Softmax runs in float32 and is cast back to the query dtype.
    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
    context = torch.matmul(probs, expanded_values).transpose(1, 2).contiguous()

    return context, probs
218
+
219
+
220
@use_kernelized_func(apply_rotary_pos_emb)
class ArceeAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper.

    Queries use `config.num_attention_heads` heads; keys and values use
    `config.num_key_value_heads` heads. Rotary position embeddings are applied
    to queries and keys before the attention backend is called.
    """

    def __init__(self, config: ArceeConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        # `getattr(config, "head_dim", fallback)` only falls back when the attribute is
        # missing; a present-but-None `head_dim` would leak through and break the
        # `**-0.5` below. Use the same `None`-aware form as
        # `ArceeRotaryEmbedding.compute_default_rope_parameters`.
        self.head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True

        self.q_proj = nn.Linear(
            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
        )
        self.k_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.v_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.o_proj = nn.Linear(
            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
        attention_mask: torch.Tensor | None = None,
        past_key_values: Cache | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Project, rotate, optionally extend with the cache, attend, and project back.

        Args:
            hidden_states: input whose last axis is projected by q/k/v.
            position_embeddings: `(cos, sin)` pair; it is unpacked unconditionally,
                so callers must supply it despite the `None` default.
            attention_mask: forwarded to the attention backend.
            past_key_values: when given, its `update` is called with this layer's
                keys/values and `layer_idx`, and the returned tensors are used.
            **kwargs: forwarded to the attention backend.

        Returns:
            `(attn_output, attn_weights)` as returned by the backend, with
            `attn_output` passed through `o_proj`.
        """
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        # Split the projections into heads and move the head axis in front of the sequence axis.
        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)

        # Backend chosen from the config; the eager implementation is the fallback.
        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
            self.config._attn_implementation, eager_attention_forward
        )

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        # Merge the head axes back into a single feature axis before the output projection.
        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights
286
+
287
+
288
class ArceeDecoderLayer(GradientCheckpointingLayer):
    """One decoder block: normalised self-attention and normalised MLP, each with a residual add."""

    def __init__(self, config: ArceeConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = ArceeAttention(config=config, layer_idx=layer_idx)

        self.mlp = ArceeMLP(config)
        norm_eps = config.rms_norm_eps
        self.input_layernorm = ArceeRMSNorm(config.hidden_size, eps=norm_eps)
        self.post_attention_layernorm = ArceeRMSNorm(config.hidden_size, eps=norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        use_cache: bool | None = False,
        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        """Run the attention sub-block then the MLP sub-block and return the new hidden states."""
        # Attention sub-block: normalise, attend, add the un-normalised input back.
        attn_out, _ = self.self_attn(
            hidden_states=self.input_layernorm(hidden_states),
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        after_attention = hidden_states + attn_out

        # MLP sub-block with the same normalise / transform / add pattern.
        mlp_out = self.mlp(self.post_attention_layernorm(after_attention))
        return after_attention + mlp_out
329
+
330
+
331
@auto_docstring
class ArceePreTrainedModel(PreTrainedModel):
    # Declarative flags only; no methods are defined here.
    config: ArceeConfig
    base_model_prefix = "model"

    # Placement / checkpointing declarations.
    supports_gradient_checkpointing = True
    _no_split_modules = ["ArceeDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]

    # Attention backends this model declares support for.
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_attention_backend = True

    _can_compile_fullgraph = True

    # Output name -> module class whose outputs are recorded under that name.
    _can_record_outputs = {
        "hidden_states": ArceeDecoderLayer,
        "attentions": ArceeAttention,
    }
348
+
349
+
350
@auto_docstring
class ArceeModel(ArceePreTrainedModel):
    # Embedding, a stack of ArceeDecoderLayer blocks, a final RMS norm, and a
    # shared rotary-embedding module.
    def __init__(self, config: ArceeConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        decoder_stack = []
        for layer_idx in range(config.num_hidden_layers):
            decoder_stack.append(ArceeDecoderLayer(config, layer_idx))
        self.layers = nn.ModuleList(decoder_stack)
        self.norm = ArceeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = ArceeRotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    @merge_with_config_defaults
    @capture_outputs
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        use_cache: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPast:
        # Exactly one of the two inputs must be given: both-missing and
        # both-present are rejected alike.
        if (input_ids is None) == (inputs_embeds is None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds: torch.Tensor = self.embed_tokens(input_ids)

        if past_key_values is None and use_cache:
            past_key_values = DynamicCache(config=self.config)

        if position_ids is None:
            # Positions continue from however many tokens the cache already holds.
            already_seen = 0 if past_key_values is None else past_key_values.get_seq_length()
            steps = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device)
            position_ids = (steps + already_seen).unsqueeze(0)

        mask = create_causal_mask(
            config=self.config,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )

        hidden_states = inputs_embeds
        # cos/sin are computed once here and handed to every layer.
        rope = self.rotary_emb(hidden_states, position_ids=position_ids)

        active_layers = self.layers[: self.config.num_hidden_layers]
        for decoder_layer in active_layers:
            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=mask,
                position_embeddings=rope,
                position_ids=position_ids,
                past_key_values=past_key_values,
                use_cache=use_cache,
                **kwargs,
            )

        return BaseModelOutputWithPast(
            last_hidden_state=self.norm(hidden_states),
            past_key_values=past_key_values,
        )
422
+
423
+
424
@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
class ArceeForCausalLM(ArceePreTrainedModel, GenerationMixin):
    # Maps the LM head weight to the embedding weight it is tied to.
    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
    _tp_plan = {"lm_head": "colwise_gather_output"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    def __init__(self, config):
        super().__init__(config)
        self.model = ArceeModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        logits_to_keep: int | torch.Tensor = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        Example:

        ```python
        >>> from transformers import AutoTokenizer, ArceeForCausalLM

        >>> model = ArceeForCausalLM.from_pretrained("arcee-ai/AFM-4.5B")
        >>> tokenizer = AutoTokenizer.from_pretrained("arcee-ai/AFM-4.5B")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss.
        # An int keeps the last `logits_to_keep` positions (0 keeps all, since `slice(0, None)` is the
        # full range); a tensor is used directly as the index along the sequence axis.
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
496
+
497
+
498
@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
class ArceeForSequenceClassification(GenericForSequenceClassification, ArceePreTrainedModel):
    # No overrides: everything comes from the two base classes.
    pass
501
+
502
+
503
@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
class ArceeForQuestionAnswering(GenericForQuestionAnswering, ArceePreTrainedModel):
    # Overrides the "model" prefix declared on ArceePreTrainedModel; this is the only customisation.
    base_model_prefix = "transformer"  # For BC, where `transformer` was used instead of `model`
506
+
507
+
508
@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
class ArceeForTokenClassification(GenericForTokenClassification, ArceePreTrainedModel):
    # No overrides: everything comes from the two base classes.
    pass
511
+
512
+
513
+ __all__ = [
514
+ "ArceeForCausalLM",
515
+ "ArceeForQuestionAnswering",
516
+ "ArceeForSequenceClassification",
517
+ "ArceeForTokenClassification",
518
+ "ArceeModel",
519
+ "ArceePreTrainedModel",
520
+ ]
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/arcee/modular_arcee.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 Arcee AI and the HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """PyTorch Arcee model."""
15
+
16
+ from huggingface_hub.dataclasses import strict
17
+
18
+ from transformers.utils import auto_docstring, logging
19
+
20
+ from ...modeling_rope_utils import RopeParameters
21
+ from ..llama.configuration_llama import LlamaConfig
22
+ from ..llama.modeling_llama import (
23
+ LlamaForCausalLM,
24
+ LlamaForQuestionAnswering,
25
+ LlamaForSequenceClassification,
26
+ LlamaForTokenClassification,
27
+ )
28
+ from ..nemotron.modeling_nemotron import NemotronMLP
29
+
30
+
31
+ logger = logging.get_logger(__name__)
32
+
33
+
34
@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
@strict
class ArceeConfig(LlamaConfig):
    r"""
    ```python
    >>> from transformers import ArceeModel, ArceeConfig

    >>> # Initializing an Arcee AFM-4.5B-Base style configuration
    >>> configuration = ArceeConfig()

    >>> # Initializing a model from the AFM-4.5B-Base style configuration
    >>> model = ArceeModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "arcee"
    # Tensor-parallel plan keyed by parameter-name pattern. The MLP has only
    # `up_proj` and `down_proj` entries (no `gate_proj`).
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }

    # NOTE(review): the default bos/eos ids below (128000 / 128001) are larger
    # than the default `vocab_size` (32000) — confirm these defaults are meant
    # to be overridden together by real checkpoints.
    vocab_size: int = 32000
    hidden_size: int = 2560
    intermediate_size: int = 18432
    num_hidden_layers: int = 32
    num_attention_heads: int = 32
    num_key_value_heads: int | None = None
    hidden_act: str = "relu2"
    max_position_embeddings: int = 4096
    initializer_range: float = 0.02
    rms_norm_eps: float = 1e-5
    use_cache: bool = True
    pad_token_id: int | None = None
    bos_token_id: int | None = 128000
    eos_token_id: int | list[int] | None = 128001
    tie_word_embeddings: bool = False
    rope_parameters: RopeParameters | dict | None = None
    attention_bias: bool = False
    attention_dropout: float | int = 0.0
    mlp_bias: bool = False
    head_dim: int | None = None

    # NOTE(review): presumably the modular-converter marker for dropping the
    # inherited `pretraining_tp` field — the generated configuration_arcee.py
    # in this commit has no such field. Confirm against the converter docs.
    pretraining_tp = AttributeError()
83
+
84
+
85
class ArceeMLP(NemotronMLP):
    # Reuses NemotronMLP unchanged under the Arcee name.
    pass
87
+
88
+
89
@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
class ArceeForCausalLM(LlamaForCausalLM):
    # Reuses LlamaForCausalLM unchanged under the Arcee name.
    pass
92
+
93
+
94
@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
class ArceeForSequenceClassification(LlamaForSequenceClassification):
    # Reuses LlamaForSequenceClassification unchanged under the Arcee name.
    pass
97
+
98
+
99
@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
class ArceeForQuestionAnswering(LlamaForQuestionAnswering):
    # Reuses LlamaForQuestionAnswering unchanged under the Arcee name.
    pass
102
+
103
+
104
@auto_docstring(checkpoint="arcee-ai/AFM-4.5B")
class ArceeForTokenClassification(LlamaForTokenClassification):
    # Reuses LlamaForTokenClassification unchanged under the Arcee name.
    pass
107
+
108
+
109
+ __all__ = [
110
+ "ArceeConfig",
111
+ "ArceeForCausalLM",
112
+ "ArceeForQuestionAnswering",
113
+ "ArceeForSequenceClassification",
114
+ "ArceeForTokenClassification",
115
+ "ArceeModel", # noqa: F822
116
+ "ArceePreTrainedModel", # noqa: F822
117
+ ]
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/__init__.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import TYPE_CHECKING
15
+
16
+ from ...utils import _LazyModule
17
+ from ...utils.import_utils import define_import_structure
18
+
19
+
20
if TYPE_CHECKING:
    # Seen only by static type checkers (`TYPE_CHECKING` is False at runtime):
    # expose the submodules' public names directly.
    from .configuration_bigbird_pegasus import *
    from .modeling_bigbird_pegasus import *
else:
    import sys

    # At runtime, replace this package's entry in `sys.modules` with a
    # `_LazyModule` built from the import structure derived from this file.
    _file = globals()["__file__"]
    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright Google Research and The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """BigBirdPegasus model configuration"""
15
+
16
+ from huggingface_hub.dataclasses import strict
17
+
18
+ from ...configuration_utils import PreTrainedConfig
19
+ from ...utils import auto_docstring
20
+
21
+
22
+ @auto_docstring(checkpoint="google/bigbird-pegasus-large-arxiv")
23
+ @strict
24
+ class BigBirdPegasusConfig(PreTrainedConfig):
25
+ r"""
26
+ attention_type (`str`, *optional*, defaults to `"block_sparse"`):
27
+ Whether to use block sparse attention (with O(n) complexity), as introduced in the paper, or the original
28
+ attention layer (with O(n^2) complexity). Possible values are `"original_full"` and `"block_sparse"`.
29
+ block_size (`int`, *optional*, defaults to 64):
30
+ Size of each block. Useful only when `attention_type == "block_sparse"`.
31
+ num_random_blocks (`int`, *optional*, defaults to 3):
32
+ Number of random blocks each query attends to. Useful only when `attention_type ==
33
+ "block_sparse"`.
34
+ use_bias (`bool`, *optional*, defaults to `False`):
35
+ Whether to use bias in query, key, value.
36
+
37
+ Example:
38
+
39
+ ```python
40
+ >>> from transformers import BigBirdPegasusConfig, BigBirdPegasusModel
41
+
42
+ >>> # Initializing a BigBirdPegasus bigbird-pegasus-base style configuration
43
+ >>> configuration = BigBirdPegasusConfig()
44
+
45
+ >>> # Initializing a model (with random weights) from the bigbird-pegasus-base style configuration
46
+ >>> model = BigBirdPegasusModel(configuration)
47
+
48
+ >>> # Accessing the model configuration
49
+ >>> configuration = model.config
50
+ ```"""
51
+
52
+ model_type = "bigbird_pegasus"
53
+ keys_to_ignore_at_inference = ["past_key_values"]
54
+ attribute_map = {
55
+ "num_attention_heads": "encoder_attention_heads",
56
+ "hidden_size": "d_model",
57
+ "attention_probs_dropout_prob": "attention_dropout",
58
+ "num_hidden_layers": "encoder_layers",
59
+ }
60
+
61
+ vocab_size: int = 96103
62
+ max_position_embeddings: int = 4096
63
+ encoder_layers: int = 16
64
+ encoder_ffn_dim: int = 4096
65
+ encoder_attention_heads: int = 16
66
+ decoder_layers: int = 16
67
+ decoder_ffn_dim: int = 4096
68
+ decoder_attention_heads: int = 16
69
+ encoder_layerdrop: float | int = 0.0
70
+ decoder_layerdrop: float | int = 0.0
71
+ use_cache: bool = True
72
+ is_encoder_decoder: bool = True
73
+ activation_function: str = "gelu_new"
74
+ d_model: int = 1024
75
+ dropout: float | int = 0.1
76
+ attention_dropout: float | int = 0.0
77
+ activation_dropout: float | int = 0.0
78
+ init_std: float = 0.02
79
+ decoder_start_token_id: int = 2
80
+ classifier_dropout: float | int = 0.0
81
+ scale_embedding: bool = True
82
+ pad_token_id: int | None = 0
83
+ bos_token_id: int | None = 2
84
+ eos_token_id: int | list[int] | None = 1
85
+ attention_type: str = "block_sparse" # only for encoder
86
+ block_size: int = 64
87
+ num_random_blocks: int = 3
88
+ use_bias: bool = False
89
+ is_decoder: bool = False
90
+ tie_word_embeddings: bool = True
91
+
92
+
93
+ __all__ = ["BigBirdPegasusConfig"]
LTA_openwebtext_dualt/mini_owt_logdirichlet/.venv_qwen35_uv/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py ADDED
The diff for this file is too large to render. See raw diff
 
LTA_openwebtext_dualt/mini_owt_logdirichlet/cache/owt_t5_llmclean_qwen36_35b_articlefull_pack1023_10k_rejected_docs.txt ADDED
The diff for this file is too large to render. See raw diff