# Provenance: CharlesCNorton, "Move SmolLM2 analysis files into smollm2 subfolder" (commit 8a1465b)
"""
SmolLM2-360M-Instruct Architecture Analysis
For 8bit-threshold-computer LLM Integration Project
"""
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from collections import defaultdict
import json
def analyze_smollm2(
    model_name="HuggingFaceTB/SmolLM2-360M-Instruct",
    output_path="D:/8bit-threshold-computer/llm_integration/smollm2_analysis.json",
    test_input="47 + 86",
):
    """Run a full architecture analysis of a SmolLM2-style causal LM.

    Loads the config, tokenizer, and model, then prints an architecture
    census, a per-component weight inventory, a tokenization breakdown of
    ``test_input``, per-layer hidden-state statistics, and the model's
    structure string.  A machine-readable summary is also written to
    ``output_path`` as JSON.

    Args:
        model_name: Hugging Face model identifier to analyze.
        output_path: Destination for the JSON summary.
            NOTE(review): the default is an absolute Windows path inherited
            from the original script -- override it on other machines.
        test_input: Prompt used for the tokenization / hidden-state probes.

    Returns:
        Tuple ``(summary, model, tokenizer, hidden_states, param_groups)``.
    """
    banner = "=" * 80
    print(banner)
    print("SmolLM2-360M-Instruct Architecture Analysis")
    print(banner)

    # --- Load config / tokenizer / model (network I/O on first run) -------
    print("\n[1] Loading model configuration...")
    config = AutoConfig.from_pretrained(model_name)
    print(f"Config loaded: {type(config).__name__}")

    print("\n[2] Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print(f"Tokenizer loaded: {type(tokenizer).__name__}")

    print("\n[3] Loading model with output_hidden_states=True...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        output_hidden_states=True,
        torch_dtype=torch.float32,
    )
    model.eval()  # inference only: disables dropout and similar train-time behavior
    print(f"Model loaded: {type(model).__name__}")

    _print_architecture_census(config)

    total_params, param_groups = _inventory_weights(model)
    _print_weight_inventory(total_params, param_groups)

    tokens, input_ids = _tokenization_report(tokenizer, test_input)

    # --- Hidden-state probe ------------------------------------------------
    print("\n" + banner)
    print("HIDDEN STATE ANALYSIS")
    print(banner)
    print(f"\n--- Running inference on '{test_input}' ---")
    with torch.no_grad():  # no gradients needed; saves memory
        outputs = model(**tokens)
    hidden_states = outputs.hidden_states
    _hidden_state_report(hidden_states)

    # --- Structure dump ----------------------------------------------------
    print("\n" + banner)
    print("MODEL STRUCTURE DEEP DIVE")
    print(banner)
    print("\n--- Model Architecture String ---")
    print(model)

    summary = _build_summary(
        model_name, total_params, config, test_input,
        input_ids, tokenizer, hidden_states, param_groups,
    )

    # Save summary as JSON for downstream report generation.
    with open(output_path, "w") as f:
        json.dump(summary, f, indent=2)

    print("\n" + banner)
    # Report the actual destination: the original script hard-coded a
    # different path than the one it announced here.
    print(f"Analysis complete. Summary saved to {output_path}")
    print(banner)
    return summary, model, tokenizer, hidden_states, param_groups


def _print_architecture_census(config):
    """Print the full config dict plus the headline architecture numbers."""
    banner = "=" * 80
    print("\n" + banner)
    print("ARCHITECTURE CENSUS")
    print(banner)

    print("\n--- Model Configuration ---")
    for key, value in sorted(config.to_dict().items()):
        print(f" {key}: {value}")

    print("\n--- Key Architecture Parameters ---")
    print(f" Model type: {config.model_type}")
    print(f" Vocabulary size: {config.vocab_size}")
    print(f" Hidden size: {config.hidden_size}")
    print(f" Intermediate size: {config.intermediate_size}")
    print(f" Number of hidden layers: {config.num_hidden_layers}")
    print(f" Number of attention heads: {config.num_attention_heads}")
    # GQA models expose num_key_value_heads; fall back to MHA otherwise.
    print(f" Number of KV heads: {getattr(config, 'num_key_value_heads', config.num_attention_heads)}")
    print(f" Head dimension: {config.hidden_size // config.num_attention_heads}")
    print(f" Max position embeddings: {config.max_position_embeddings}")
    print(f" RMS norm epsilon: {getattr(config, 'rms_norm_eps', 'N/A')}")
    print(f" Rope theta: {getattr(config, 'rope_theta', 'N/A')}")
    print(f" Tie word embeddings: {getattr(config, 'tie_word_embeddings', 'N/A')}")


def _layer_number(param_name):
    """Extract the decoder-layer index from a name like
    'model.layers.12.self_attn.q_proj.weight'.

    Anchors on the 'layers.' substring rather than a fixed split index,
    so it survives extra name prefixes (e.g. wrapped/compiled models).
    """
    return param_name.partition("layers.")[2].split(".", 1)[0]


def _classify_param(name):
    """Map a parameter name to a human-readable component group label."""
    if "embed_tokens" in name:
        return "Embedding"
    if "lm_head" in name:
        return "LM Head"
    if "norm" in name and "layers" not in name:
        return "Final Norm"
    if "layers" in name:
        layer_num = _layer_number(name)
        if "self_attn" in name:
            return f"Layer {layer_num} - Attention"
        if "mlp" in name:
            return f"Layer {layer_num} - MLP"
        if "norm" in name:
            return f"Layer {layer_num} - Norms"
        return f"Layer {layer_num} - Other"
    return "Other"


def _inventory_weights(model):
    """Count all parameters and bucket them by architectural component.

    Returns:
        (total_params, param_groups) where param_groups maps a group label
        to a list of {name, shape, numel, dtype} dicts.
    """
    total_params = 0
    param_groups = defaultdict(list)
    for name, param in model.named_parameters():
        total_params += param.numel()
        param_groups[_classify_param(name)].append({
            "name": name,
            "shape": tuple(param.shape),
            "numel": param.numel(),
            "dtype": str(param.dtype),
        })
    return total_params, param_groups


def _group_sort_key(group_name):
    """Sort key: non-layer groups first (alphabetical), then per-layer
    groups in numeric order -- plain sorted() put 'Layer 10' before 'Layer 2'.
    """
    prefix = "Layer "
    if group_name.startswith(prefix):
        num, _, rest = group_name[len(prefix):].partition(" ")
        try:
            return (1, int(num), rest)
        except ValueError:
            pass  # unexpected label shape; fall through to alphabetical
    return (0, 0, group_name)


def _print_weight_inventory(total_params, param_groups):
    """Print the per-group parameter census with counts and percentages."""
    banner = "=" * 80
    print("\n" + banner)
    print("WEIGHT INVENTORY")
    print(banner)
    print(f"\n--- Total Parameters: {total_params:,} ---")
    print(f" ({total_params / 1e6:.2f}M parameters)")
    for group_name in sorted(param_groups, key=_group_sort_key):
        params = param_groups[group_name]
        group_total = sum(p["numel"] for p in params)
        print(f"\n### {group_name} ({group_total:,} params, {group_total/total_params*100:.2f}%)")
        for p in params:
            print(f" {p['name']}")
            print(f" Shape: {p['shape']}, Elements: {p['numel']:,}, Dtype: {p['dtype']}")


def _tokenization_report(tokenizer, test_input):
    """Print how test_input tokenizes, plus a few fixed probe strings.

    Returns:
        (tokens, input_ids): the BatchEncoding for test_input and its
        first-row input-id tensor, for reuse by the inference step.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("TOKENIZATION ANALYSIS")
    print(banner)

    print(f"\n--- Test Input: '{test_input}' ---")
    tokens = tokenizer(test_input, return_tensors="pt")
    input_ids = tokens["input_ids"][0]
    print(f"\nInput IDs: {input_ids.tolist()}")
    print(f"Number of tokens: {len(input_ids)}")
    print("\nToken breakdown:")
    for i, token_id in enumerate(input_ids):
        token_str = tokenizer.decode([token_id])
        print(f" Position {i}: ID={token_id.item():5d}, Token='{token_str}'")

    # Probe how digits and operators tokenize on their own.
    print("\n--- Additional Tokenization Tests ---")
    for tc in ["0", "1", "47", "86", "133", " + ", "="]:
        ids = tokenizer.encode(tc, add_special_tokens=False)
        decoded = [tokenizer.decode([i]) for i in ids]
        print(f" '{tc}' -> IDs: {ids}, Tokens: {decoded}")
    return tokens, input_ids


def _hidden_state_report(hidden_states):
    """Print shapes and basic statistics for every hidden-state tensor.

    hidden_states[0] is the embedding output; hidden_states[i] (i>0) is the
    output of decoder layer i-1.
    """
    print(f"\nNumber of hidden state outputs: {len(hidden_states)}")
    print("(This includes embedding output + each layer's output)")
    print("\nHidden state shapes at each layer:")
    for i, hs in enumerate(hidden_states):
        layer_name = "Embedding" if i == 0 else f"Layer {i-1}"
        print(f" {layer_name}: {tuple(hs.shape)}")
        if i == 0:
            print(f" (batch_size=1, seq_len={hs.shape[1]}, hidden_dim={hs.shape[2]})")

    print("\n--- Hidden State Statistics (per layer) ---")
    for i, hs in enumerate(hidden_states):
        layer_name = "Embedding" if i == 0 else f"Layer {i-1}"
        hs_flat = hs.view(-1)
        print(f" {layer_name}:")
        print(f" Mean: {hs_flat.mean().item():.6f}")
        print(f" Std: {hs_flat.std().item():.6f}")
        print(f" Min: {hs_flat.min().item():.6f}")
        print(f" Max: {hs_flat.max().item():.6f}")


def _build_summary(model_name, total_params, config, test_input,
                   input_ids, tokenizer, hidden_states, param_groups):
    """Assemble the JSON-serializable summary dict for report generation."""
    return {
        "model_name": model_name,
        "total_params": total_params,
        "config": {
            "vocab_size": config.vocab_size,
            "hidden_size": config.hidden_size,
            "intermediate_size": config.intermediate_size,
            "num_hidden_layers": config.num_hidden_layers,
            "num_attention_heads": config.num_attention_heads,
            "num_kv_heads": getattr(config, 'num_key_value_heads', config.num_attention_heads),
            "head_dim": config.hidden_size // config.num_attention_heads,
            "max_position_embeddings": config.max_position_embeddings,
            "rms_norm_eps": getattr(config, 'rms_norm_eps', None),
            "rope_theta": getattr(config, 'rope_theta', None),
            "tie_word_embeddings": getattr(config, 'tie_word_embeddings', None),
        },
        "tokenization": {
            "test_input": test_input,
            "token_ids": input_ids.tolist(),
            "num_tokens": len(input_ids),
            "tokens": [tokenizer.decode([tid]) for tid in input_ids],
        },
        "hidden_states": {
            "num_outputs": len(hidden_states),
            "shape": list(hidden_states[0].shape),
        },
        "param_groups": {
            k: {"count": len(v), "total": sum(p["numel"] for p in v)}
            for k, v in param_groups.items()
        },
    }
if __name__ == "__main__":
    # Script entry point: run the analysis once and keep all artifacts
    # bound at module level for interactive inspection (e.g. `python -i`).
    _results = analyze_smollm2()
    summary, model, tokenizer, hidden_states, param_groups = _results