| """ |
| SmolLM2-360M-Instruct Architecture Analysis |
| For 8bit-threshold-computer LLM Integration Project |
| """ |
|
|
| import torch |
| from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig |
| from collections import defaultdict |
| import json |
|
|
| def analyze_smollm2(): |
| model_name = "HuggingFaceTB/SmolLM2-360M-Instruct" |
|
|
| print("=" * 80) |
| print("SmolLM2-360M-Instruct Architecture Analysis") |
| print("=" * 80) |
|
|
| |
| print("\n[1] Loading model configuration...") |
| config = AutoConfig.from_pretrained(model_name) |
| print(f"Config loaded: {type(config).__name__}") |
|
|
| |
| print("\n[2] Loading tokenizer...") |
| tokenizer = AutoTokenizer.from_pretrained(model_name) |
| print(f"Tokenizer loaded: {type(tokenizer).__name__}") |
|
|
| |
| print("\n[3] Loading model with output_hidden_states=True...") |
| model = AutoModelForCausalLM.from_pretrained( |
| model_name, |
| output_hidden_states=True, |
| torch_dtype=torch.float32 |
| ) |
| model.eval() |
| print(f"Model loaded: {type(model).__name__}") |
|
|
    # ------------------------------------------------------------------
    # Architecture census
    # ------------------------------------------------------------------
    print("\n" + "=" * 80)
    print("ARCHITECTURE CENSUS")
    print("=" * 80)

    print("\n--- Model Configuration ---")
    config_dict = config.to_dict()
    for key, value in sorted(config_dict.items()):
        print(f" {key}: {value}")

    print("\n--- Key Architecture Parameters ---")
    print(f" Model type: {config.model_type}")
    print(f" Vocabulary size: {config.vocab_size}")
    print(f" Hidden size: {config.hidden_size}")
    print(f" Intermediate size: {config.intermediate_size}")
    print(f" Number of hidden layers: {config.num_hidden_layers}")
    print(f" Number of attention heads: {config.num_attention_heads}")
    print(f" Number of KV heads: {getattr(config, 'num_key_value_heads', config.num_attention_heads)}")
    print(f" Head dimension: {config.hidden_size // config.num_attention_heads}")
    print(f" Max position embeddings: {config.max_position_embeddings}")
    print(f" RMS norm epsilon: {getattr(config, 'rms_norm_eps', 'N/A')}")
    print(f" Rope theta: {getattr(config, 'rope_theta', 'N/A')}")
    print(f" Tie word embeddings: {getattr(config, 'tie_word_embeddings', 'N/A')}")
|
|
    # ------------------------------------------------------------------
    # Weight inventory
    # ------------------------------------------------------------------
    print("\n" + "=" * 80)
    print("WEIGHT INVENTORY")
    print("=" * 80)

    total_params = 0
    param_groups = defaultdict(list)

    for name, param in model.named_parameters():
        total_params += param.numel()

        # Group each parameter tensor by where it lives in the network.
        # Names follow the pattern "model.layers.<n>.<submodule>.<weight>",
        # so the layer index is the third dotted component.
        if "embed_tokens" in name:
            group = "Embedding"
        elif "lm_head" in name:
            group = "LM Head"
        elif "norm" in name and "layers" not in name:
            group = "Final Norm"
        elif "layers" in name:
            layer_num = name.split(".")[2]
            if "self_attn" in name:
                group = f"Layer {layer_num} - Attention"
            elif "mlp" in name:
                group = f"Layer {layer_num} - MLP"
            elif "norm" in name:
                group = f"Layer {layer_num} - Norms"
            else:
                group = f"Layer {layer_num} - Other"
        else:
            group = "Other"

        param_groups[group].append({
            "name": name,
            "shape": tuple(param.shape),
            "numel": param.numel(),
            "dtype": str(param.dtype)
        })

    print(f"\n--- Total Parameters: {total_params:,} ---")
    print(f" ({total_params / 1e6:.2f}M parameters)")

    # Note: sorted() orders group names lexicographically, so "Layer 10" prints
    # before "Layer 2"; the per-group totals are unaffected.
    for group_name in sorted(param_groups.keys()):
        params = param_groups[group_name]
        group_total = sum(p["numel"] for p in params)
        print(f"\n### {group_name} ({group_total:,} params, {group_total/total_params*100:.2f}%)")
        for p in params:
            print(f" {p['name']}")
            print(f" Shape: {p['shape']}, Elements: {p['numel']:,}, Dtype: {p['dtype']}")
|
|
    # ------------------------------------------------------------------
    # Tokenization analysis
    # ------------------------------------------------------------------
    print("\n" + "=" * 80)
    print("TOKENIZATION ANALYSIS")
    print("=" * 80)

    test_input = "47 + 86"
    print(f"\n--- Test Input: '{test_input}' ---")

    tokens = tokenizer(test_input, return_tensors="pt")
    input_ids = tokens["input_ids"][0]

    print(f"\nInput IDs: {input_ids.tolist()}")
    print(f"Number of tokens: {len(input_ids)}")

    print("\nToken breakdown:")
    for i, token_id in enumerate(input_ids):
        token_str = tokenizer.decode([token_id])
        print(f" Position {i}: ID={token_id.item():5d}, Token='{token_str}'")

    print("\n--- Additional Tokenization Tests ---")
    test_cases = ["0", "1", "47", "86", "133", " + ", "="]
    for tc in test_cases:
        ids = tokenizer.encode(tc, add_special_tokens=False)
        decoded = [tokenizer.decode([i]) for i in ids]
        print(f" '{tc}' -> IDs: {ids}, Tokens: {decoded}")
|
|
    # ------------------------------------------------------------------
    # Hidden state analysis
    # ------------------------------------------------------------------
    print("\n" + "=" * 80)
    print("HIDDEN STATE ANALYSIS")
    print("=" * 80)

    print(f"\n--- Running inference on '{test_input}' ---")

    with torch.no_grad():
        outputs = model(**tokens)

    hidden_states = outputs.hidden_states
    print(f"\nNumber of hidden state outputs: {len(hidden_states)}")
    print("(This includes embedding output + each layer's output)")

    print("\nHidden state shapes at each layer:")
    for i, hs in enumerate(hidden_states):
        layer_name = "Embedding" if i == 0 else f"Layer {i-1}"
        print(f" {layer_name}: {tuple(hs.shape)}")
        if i == 0:
            print(f" (batch_size=1, seq_len={hs.shape[1]}, hidden_dim={hs.shape[2]})")

    print("\n--- Hidden State Statistics (per layer) ---")
    for i, hs in enumerate(hidden_states):
        layer_name = "Embedding" if i == 0 else f"Layer {i-1}"
        hs_flat = hs.view(-1)
        print(f" {layer_name}:")
        print(f" Mean: {hs_flat.mean().item():.6f}")
        print(f" Std: {hs_flat.std().item():.6f}")
        print(f" Min: {hs_flat.min().item():.6f}")
        print(f" Max: {hs_flat.max().item():.6f}")
|
|
    # ------------------------------------------------------------------
    # Model structure deep dive
    # ------------------------------------------------------------------
    print("\n" + "=" * 80)
    print("MODEL STRUCTURE DEEP DIVE")
    print("=" * 80)

    print("\n--- Model Architecture String ---")
    print(model)
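
    # Illustrative sketch (added): capturing an intermediate activation with a
    # forward hook, one way to tap the model for integration work without
    # modifying its code. The module path "model.layers.0.mlp" assumes the
    # usual Llama-style layout printed above; `captured` and `grab_output` are
    # names local to this sketch, and the block is skipped if the path is absent.
    captured = {}

    def grab_output(module, inputs, output):
        captured["layer0_mlp"] = output.detach()

    try:
        mlp0 = model.get_submodule("model.layers.0.mlp")
        handle = mlp0.register_forward_hook(grab_output)
        with torch.no_grad():
            model(**tokens)
        handle.remove()
        print("\n--- Forward-Hook Capture (sketch) ---")
        print(f" Captured layer-0 MLP output shape: {tuple(captured['layer0_mlp'].shape)}")
    except AttributeError:
        print("\n(Forward-hook sketch skipped: module path not found.)")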
|
|
    # ------------------------------------------------------------------
    # Summary export
    # ------------------------------------------------------------------
    summary = {
        "model_name": model_name,
        "total_params": total_params,
        "config": {
            "vocab_size": config.vocab_size,
            "hidden_size": config.hidden_size,
            "intermediate_size": config.intermediate_size,
            "num_hidden_layers": config.num_hidden_layers,
            "num_attention_heads": config.num_attention_heads,
            "num_kv_heads": getattr(config, 'num_key_value_heads', config.num_attention_heads),
            "head_dim": config.hidden_size // config.num_attention_heads,
            "max_position_embeddings": config.max_position_embeddings,
            "rms_norm_eps": getattr(config, 'rms_norm_eps', None),
            "rope_theta": getattr(config, 'rope_theta', None),
            "tie_word_embeddings": getattr(config, 'tie_word_embeddings', None),
        },
        "tokenization": {
            "test_input": test_input,
            "token_ids": input_ids.tolist(),
            "num_tokens": len(input_ids),
            "tokens": [tokenizer.decode([tid]) for tid in input_ids]
        },
        "hidden_states": {
            "num_outputs": len(hidden_states),
            "shape": list(hidden_states[0].shape)
        },
        "param_groups": {k: {"count": len(v), "total": sum(p["numel"] for p in v)} for k, v in param_groups.items()}
    }

    # Write the summary next to the rest of the project's integration artifacts.
    with open("D:/8bit-threshold-computer/llm_integration/smollm2_analysis.json", "w") as f:
        json.dump(summary, f, indent=2)
|
|
| print("\n" + "=" * 80) |
| print("Analysis complete. Summary saved to smollm2_analysis.json") |
| print("=" * 80) |
|
|
| return summary, model, tokenizer, hidden_states, param_groups |
|
|
if __name__ == "__main__":
    summary, model, tokenizer, hidden_states, param_groups = analyze_smollm2()
|
|