| | |
"""
AETHER-Micro Configuration (Hugging Face Standard)

Defines all of AETHER-Micro's hyperparameters in the HF PretrainedConfig format.
Preserves every custom feature: Wu-Xing, Heterogeneous MoE, Latent Thought Loop, RLP, etc.
"""
| |
|
| | from transformers import PretrainedConfig |
| | from typing import Optional |
| |
|
| |
|
class AETHERMicroConfig(PretrainedConfig):
    """
    Configuration class for the AETHER-Micro model.

    Inherits from Hugging Face ``PretrainedConfig`` and stores every
    AETHER-Micro hyperparameter, including the custom Wu-Xing router,
    Heterogeneous MoE, Latent Thought Loop and RLP settings.  Fully
    compatible with the standard ``save_pretrained`` / ``from_pretrained``
    machinery (and therefore with Unsloth).

    Args:
        vocab_size (int, defaults to 64000):
            Vocabulary size (64K tokenizer).
        hidden_size (int, defaults to 1024):
            Hidden dimension of the transformer.
        intermediate_size (int, defaults to 4096):
            Dense FFN intermediate dimension (hidden_size x 4).
        num_hidden_layers (int, defaults to 24):
            Number of transformer layers.
        num_attention_heads (int, defaults to 16):
            Number of attention heads.
        num_key_value_heads (int, defaults to 4):
            Number of K/V heads for GQA (grouped query attention).
        max_position_embeddings (int, defaults to 8192):
            Maximum sequence length.
        rms_norm_eps (float, defaults to 1e-6):
            RMSNorm epsilon.
        rope_theta (float, defaults to 10000.0):
            RoPE (rotary position embedding) theta.
        attention_dropout (float, defaults to 0.0):
            Attention dropout probability.
        use_cache (bool, defaults to True):
            Whether to use the KV cache during generation.
        pad_token_id (int, defaults to 0):
            Padding token id (forwarded to ``PretrainedConfig``).
        bos_token_id (int, defaults to 1):
            Beginning-of-sequence token id (forwarded to ``PretrainedConfig``).
        eos_token_id (int, defaults to 2):
            End-of-sequence token id (forwarded to ``PretrainedConfig``).
        tie_word_embeddings (bool, defaults to False):
            Whether to tie input/output embeddings (forwarded to
            ``PretrainedConfig``, whose own default is True).
        enable_hetero_moe (bool, defaults to True):
            Enable the Heterogeneous MoE layers.
        num_大_experts (int, defaults to 5):
            Number of large experts (one per Wu-Xing agent:
            metal/water/wood/fire/earth).
        num_小_experts (int, defaults to 15):
            Number of small experts (three controlled by each agent).
        num_shared_experts (int, defaults to 2):
            Number of always-active shared experts.
        大_intermediate_size (int, defaults to 2048):
            FFN intermediate dimension of a large expert.
        小_intermediate_size (int, defaults to 1024):
            FFN intermediate dimension of a small expert.
        shared_intermediate_size (int, defaults to 1536):
            FFN intermediate dimension of a shared expert.
        top_k (int, defaults to 2):
            Top-K routing fan-out of the MoE router.
        num_experts_per_tok (int, defaults to 2):
            Experts activated per token (HF-conventional alias of ``top_k``).
        enable_wuxing (bool, defaults to True):
            Enable the Wu-Xing (five elements) router.
        enable_magic_init (bool, defaults to True):
            Enable magic-square initialization of the router.
        enable_annealing (bool, defaults to True):
            Enable the Wu-Xing annealing scheduler.
        alpha_start (float, defaults to 0.5):
            Warmup-phase weight of the magic square.
        beta_start (float, defaults to 0.3):
            Warmup-phase weight of the learned matrix.
        gamma_start (float, defaults to 0.2):
            Warmup-phase weight of the history buffer.
        alpha_end (float, defaults to 0.1):
            Mature-phase weight of the magic square.
        beta_end (float, defaults to 0.2):
            Mature-phase weight of the learned matrix.
        gamma_end (float, defaults to 0.7):
            Mature-phase weight of the history buffer.
        enable_latent_thought (bool, defaults to True):
            Enable the Latent Thought Loop.
        num_latents (int, defaults to 8):
            Number of latent tokens.
        latent_dim (int, defaults to 512):
            Latent dimension (smaller than ``hidden_size``).
        max_k (int, defaults to 2):
            Maximum thinking depth (0: direct, 1: shallow, 2: deep).
        enable_rlp (bool, defaults to False):
            Enable RLP (Reinforcement as Pretraining) training.  Off by
            default; enable explicitly for RLP runs.
        rlp_quality_weight (float, defaults to 1.0):
            RLP quality weight.
        rlp_info_gain_clip (float, defaults to 5.0):
            Clipping range for the information-gain term.
        rlp_ntp_weight (float, defaults to 0.7):
            Next-token-prediction loss weight.
        rlp_target_reward_weight (float, defaults to 0.3):
            Target RLP reward weight (annealed 0 -> 0.3).
        rlp_warmup_steps (int, defaults to 1500):
            Warmup steps for the RLP reward weight (~10% of training).
        enable_quality_head (bool, defaults to True):
            Enable the quality head.
        quality_head_dim (int, defaults to 4):
            Output dimension of the quality head.
        enable_self_eval (bool, defaults to True):
            Enable the Self-Evaluation head (required when ``enable_rlp``).
        self_eval_dims (int, defaults to 4):
            Number of quality dimensions (factuality, coherence,
            completeness, specificity).
        enable_mtp_loss (bool, defaults to True):
            Enable the multi-token-prediction auxiliary loss.
        mtp_num_predictions (int, defaults to 4):
            Number of future-token predictions for the MTP loss.
        enable_magic_square (bool, defaults to True):
            Enable the magic-square component.

    Example:
        ```python
        from transformers import AutoConfig

        # Default configuration
        config = AETHERMicroConfig()

        # Custom configuration
        config = AETHERMicroConfig(
            vocab_size=64000,
            hidden_size=1024,
            num_hidden_layers=24,
            enable_wuxing=True,
            enable_hetero_moe=True,
            enable_latent_thought=True,
            enable_rlp=False,
        )

        # Save and load
        config.save_pretrained("./aether-micro-config")
        config = AutoConfig.from_pretrained("./aether-micro-config")
        ```
    """

    model_type = "aether_micro"

    def __init__(
        self,
        # --- Core transformer ---
        vocab_size: int = 64000,
        hidden_size: int = 1024,
        intermediate_size: int = 4096,
        num_hidden_layers: int = 24,
        num_attention_heads: int = 16,
        num_key_value_heads: int = 4,
        max_position_embeddings: int = 8192,
        rms_norm_eps: float = 1e-6,
        rope_theta: float = 10000.0,
        attention_dropout: float = 0.0,
        # --- Generation / special tokens ---
        use_cache: bool = True,
        pad_token_id: int = 0,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        tie_word_embeddings: bool = False,
        # --- Heterogeneous MoE ---
        enable_hetero_moe: bool = True,
        num_大_experts: int = 5,
        num_小_experts: int = 15,
        num_shared_experts: int = 2,
        大_intermediate_size: int = 2048,
        小_intermediate_size: int = 1024,
        shared_intermediate_size: int = 1536,
        top_k: int = 2,
        num_experts_per_tok: int = 2,
        # --- Wu-Xing router ---
        enable_wuxing: bool = True,
        enable_magic_init: bool = True,
        # --- Wu-Xing annealing ---
        enable_annealing: bool = True,
        alpha_start: float = 0.5,
        beta_start: float = 0.3,
        gamma_start: float = 0.2,
        alpha_end: float = 0.1,
        beta_end: float = 0.2,
        gamma_end: float = 0.7,
        # --- Latent Thought Loop ---
        enable_latent_thought: bool = True,
        num_latents: int = 8,
        latent_dim: int = 512,
        max_k: int = 2,
        # --- RLP ---
        enable_rlp: bool = False,
        rlp_quality_weight: float = 1.0,
        rlp_info_gain_clip: float = 5.0,
        rlp_ntp_weight: float = 0.7,
        rlp_target_reward_weight: float = 0.3,
        rlp_warmup_steps: int = 1500,
        # --- Quality head ---
        enable_quality_head: bool = True,
        quality_head_dim: int = 4,
        # --- Self-evaluation head ---
        enable_self_eval: bool = True,
        self_eval_dims: int = 4,
        # --- Multi-token prediction ---
        enable_mtp_loss: bool = True,
        mtp_num_predictions: int = 4,
        # --- Magic square ---
        enable_magic_square: bool = True,
        **kwargs,
    ):
        # Core transformer hyperparameters.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.max_position_embeddings = max_position_embeddings
        self.rms_norm_eps = rms_norm_eps
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        self.use_cache = use_cache

        # Heterogeneous MoE.
        self.enable_hetero_moe = enable_hetero_moe
        self.num_大_experts = num_大_experts
        self.num_小_experts = num_小_experts
        self.num_shared_experts = num_shared_experts
        self.大_intermediate_size = 大_intermediate_size
        self.小_intermediate_size = 小_intermediate_size
        self.shared_intermediate_size = shared_intermediate_size
        self.top_k = top_k
        self.num_experts_per_tok = num_experts_per_tok

        # Wu-Xing router and annealing schedule.
        self.enable_wuxing = enable_wuxing
        self.enable_magic_init = enable_magic_init
        self.enable_annealing = enable_annealing
        self.alpha_start = alpha_start
        self.beta_start = beta_start
        self.gamma_start = gamma_start
        self.alpha_end = alpha_end
        self.beta_end = beta_end
        self.gamma_end = gamma_end

        # Latent Thought Loop.
        self.enable_latent_thought = enable_latent_thought
        self.num_latents = num_latents
        self.latent_dim = latent_dim
        self.max_k = max_k

        # RLP (Reinforcement as Pretraining).
        self.enable_rlp = enable_rlp
        self.rlp_quality_weight = rlp_quality_weight
        self.rlp_info_gain_clip = rlp_info_gain_clip
        self.rlp_ntp_weight = rlp_ntp_weight
        self.rlp_target_reward_weight = rlp_target_reward_weight
        self.rlp_warmup_steps = rlp_warmup_steps

        # Auxiliary heads and losses.
        self.enable_quality_head = enable_quality_head
        self.quality_head_dim = quality_head_dim
        self.enable_self_eval = enable_self_eval
        self.self_eval_dims = self_eval_dims
        self.enable_mtp_loss = enable_mtp_loss
        self.mtp_num_predictions = mtp_num_predictions
        self.enable_magic_square = enable_magic_square

        # Forward the special-token ids and embedding tying to the base class
        # instead of patching them on afterwards: PretrainedConfig.__init__
        # pops these names from kwargs itself (tie_word_embeddings defaults to
        # True there), so setting them only as plain attributes beforehand
        # would be overwritten.  Note that explicitly named parameters can
        # never appear in **kwargs, so no manual filtering is needed.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
| |
|
| |
|
| | |
| | |
| | |
| |
|
def get_aether_micro_config_full():
    """Return the full AETHER-Micro config with every custom feature enabled.

    Follows the 0.5B validation strategy: 1024 hidden dim, 16 heads.
    """
    backbone = dict(
        vocab_size=64000,
        hidden_size=1024,
        intermediate_size=4096,
        num_hidden_layers=24,
        num_attention_heads=16,
        num_key_value_heads=4,
        max_position_embeddings=8192,
    )
    feature_switches = dict(
        enable_wuxing=True,
        enable_magic_init=True,
        enable_hetero_moe=True,
        enable_latent_thought=True,
        enable_rlp=True,
        enable_annealing=True,
    )
    expert_layout = dict(
        num_大_experts=5,
        num_小_experts=15,
        num_shared_experts=2,
        top_k=2,
    )
    latent_loop = dict(
        num_latents=8,
        latent_dim=512,
        max_k=2,
    )
    annealing_weights = dict(
        alpha_start=0.5,
        beta_start=0.3,
        gamma_start=0.2,
        alpha_end=0.1,
        beta_end=0.2,
        gamma_end=0.7,
    )
    return AETHERMicroConfig(
        **backbone,
        **feature_switches,
        **expert_layout,
        **latent_loop,
        **annealing_weights,
    )
| |
|
| |
|
def get_aether_micro_config_baseline():
    """Return the baseline config with all custom features disabled.

    Ablation Run 0 of the 0.5B validation strategy.
    """
    disabled_features = {
        flag: False
        for flag in (
            "enable_wuxing",
            "enable_magic_init",
            "enable_hetero_moe",
            "enable_latent_thought",
            "enable_rlp",
            "enable_annealing",
        )
    }
    return AETHERMicroConfig(
        vocab_size=64000,
        hidden_size=1024,
        intermediate_size=4096,
        num_hidden_layers=24,
        num_attention_heads=16,
        num_key_value_heads=4,
        max_position_embeddings=8192,
        # Homogeneous expert pool: no large experts, 20 small + 2 shared.
        num_大_experts=0,
        num_小_experts=20,
        num_shared_experts=2,
        top_k=2,
        **disabled_features,
    )
| |
|
| |
|
| | |
| | |
| | |
| |
|
def validate_aether_micro_config(config: "AETHERMicroConfig") -> bool:
    """
    Validate an AETHER-Micro configuration.

    Args:
        config: an ``AETHERMicroConfig`` instance (any object exposing the
            same attributes works, which keeps this helper easy to unit test).

    Returns:
        bool: True when every check passes.

    Raises:
        ValueError: on the first invalid setting found.
    """
    # Attention geometry: the per-head dimension must be an integer.
    if config.hidden_size % config.num_attention_heads != 0:
        raise ValueError(
            f"hidden_size ({config.hidden_size}) must be divisible by "
            f"num_attention_heads ({config.num_attention_heads})"
        )
    # GQA: query heads must be shared evenly across the K/V heads.
    if config.num_attention_heads % config.num_key_value_heads != 0:
        raise ValueError(
            f"num_attention_heads ({config.num_attention_heads}) must be "
            f"divisible by num_key_value_heads ({config.num_key_value_heads})"
        )

    # Annealing weights: both the warmup (*_start) and mature (*_end) triples
    # must be valid convex combinations.  (Previously only the *_start values
    # were checked.)
    if config.enable_annealing:
        for name in ("alpha_start", "beta_start", "gamma_start",
                     "alpha_end", "beta_end", "gamma_end"):
            value = getattr(config, name)
            if not (0 <= value <= 1):
                raise ValueError(f"{name} must be in [0, 1], got {value}")

        total_start = config.alpha_start + config.beta_start + config.gamma_start
        if abs(total_start - 1.0) > 1e-6:
            raise ValueError(
                f"alpha_start + beta_start + gamma_start must sum to 1.0, "
                f"got {total_start}"
            )
        total_end = config.alpha_end + config.beta_end + config.gamma_end
        if abs(total_end - 1.0) > 1e-6:
            raise ValueError(
                f"alpha_end + beta_end + gamma_end must sum to 1.0, "
                f"got {total_end}"
            )

    # Heterogeneous MoE: one large expert per Wu-Xing agent, small experts
    # split evenly across the five agents.
    if config.enable_hetero_moe:
        if config.num_大_experts != 5:
            raise ValueError(
                f"Heterogeneous MoE requires num_大_experts=5 (one per Wu-Xing Agent), "
                f"got {config.num_大_experts}"
            )
        if config.num_小_experts % 5 != 0:
            raise ValueError(
                f"num_小_experts must be divisible by 5 (for 5 Agents), "
                f"got {config.num_小_experts}"
            )

    # Latent Thought Loop: the latent space is a bottleneck, depth capped at 2.
    if config.enable_latent_thought:
        if config.latent_dim >= config.hidden_size:
            raise ValueError(
                f"latent_dim ({config.latent_dim}) must be less than "
                f"hidden_size ({config.hidden_size})"
            )
        if config.max_k < 0 or config.max_k > 2:
            raise ValueError(f"max_k must be in [0, 2], got {config.max_k}")

    # RLP: requires the self-eval head and a loss-weight budget <= 1.
    if config.enable_rlp:
        if not config.enable_self_eval:
            raise ValueError("RLP requires enable_self_eval=True")
        if config.rlp_ntp_weight + config.rlp_target_reward_weight > 1.0:
            raise ValueError(
                f"rlp_ntp_weight + rlp_target_reward_weight must be <= 1.0, "
                f"got {config.rlp_ntp_weight + config.rlp_target_reward_weight}"
            )

    return True
| |
|
| |
|
def _smoke_test() -> None:
    """Build, validate, save and reload both reference configs (manual check)."""
    # Local import: tempfile is only needed for this manual smoke test.
    import tempfile

    print("=" * 80)
    print("AETHER-Micro Configuration Test")
    print("=" * 80)
    print()

    # Full config: every custom feature enabled.
    config_full = get_aether_micro_config_full()
    print("✅ Full config created")
    print(f"   Model type: {config_full.model_type}")
    print(f"   Parameters: {config_full.vocab_size} vocab, {config_full.hidden_size} hidden")
    print(f"   Wu-Xing: {config_full.enable_wuxing}")
    print(f"   Hetero MoE: {config_full.enable_hetero_moe} (大{config_full.num_大_experts} + 小{config_full.num_小_experts})")
    print(f"   Latent Thought: {config_full.enable_latent_thought}")
    print(f"   RLP: {config_full.enable_rlp}")
    print()

    # Baseline config: all custom features disabled (Ablation Run 0).
    config_baseline = get_aether_micro_config_baseline()
    print("✅ Baseline config created")
    print(f"   All custom features: {config_baseline.enable_wuxing}")
    print()

    validate_aether_micro_config(config_full)
    print("✅ Full config validation passed")
    validate_aether_micro_config(config_baseline)
    print("✅ Baseline config validation passed")
    print()

    # Round-trip through save_pretrained / from_pretrained inside a temp dir.
    # (The original hard-coded /tmp, which is non-portable and left files
    # behind; TemporaryDirectory cleans up and works everywhere.)
    with tempfile.TemporaryDirectory(prefix="aether-micro-config-") as tmp_dir:
        config_full.save_pretrained(tmp_dir)
        print(f"✅ Config saved to {tmp_dir}")
        print()

        from transformers import AutoConfig
        config_loaded = AutoConfig.from_pretrained(tmp_dir, trust_remote_code=True)
        print(f"✅ Config loaded from {tmp_dir}")
        print(f"   Loaded model type: {config_loaded.model_type}")
        print()

    print("=" * 80)
    print("All tests passed!")
    print("=" * 80)


if __name__ == "__main__":
    _smoke_test()
| |
|