"""
MINDI 1.5 Vision-Coder - Model Architecture
Loads Qwen/Qwen2.5-Coder-7B-Instruct with LoRA adapters.
Handles model initialization, LoRA application, save/load,
and parameter counting for the base LLM component.
"""
from __future__ import annotations
from pathlib import Path
from typing import Optional
import torch
from peft import LoraConfig, PeftModel, TaskType, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer
class MINDIArchitecture:
"""Qwen2.5-Coder-7B-Instruct with LoRA for MINDI 1.5 fine-tuning."""
DEFAULT_TARGET_MODULES: list[str] = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj",
]
def __init__(
self,
model_name: str = "Qwen/Qwen2.5-Coder-7B-Instruct",
device: Optional[str] = None,
cache_dir: Optional[Path] = None,
torch_dtype: torch.dtype = torch.bfloat16,
) -> None:
"""
Initialize the architecture wrapper.
Args:
model_name: HuggingFace model identifier.
device: Target device ('cuda', 'cpu', or None for auto).
cache_dir: Local directory for model weight cache.
torch_dtype: Data type for model weights.
"""
self.model_name = model_name
self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
self.cache_dir = Path(cache_dir) if cache_dir else Path("./checkpoints/base")
self.cache_dir.mkdir(parents=True, exist_ok=True)
self.torch_dtype = torch_dtype
self.model: Optional[AutoModelForCausalLM] = None
self.peft_model: Optional[PeftModel] = None
self.tokenizer: Optional[AutoTokenizer] = None
self._load_model()
def _load_model(self) -> None:
"""Load the base model and tokenizer from HuggingFace or cache."""
print(f"[MINDIArchitecture] Loading {self.model_name} ...")
if self.device == "cuda":
# Clear GPU state before loading
torch.cuda.empty_cache()
torch.cuda.synchronize()
print(f"[MINDIArchitecture] GPU cleared, loading to CPU first ...")
self.model = AutoModelForCausalLM.from_pretrained(
self.model_name,
torch_dtype=self.torch_dtype,
device_map=None,
trust_remote_code=True,
low_cpu_mem_usage=True,
)
param_count = sum(p.numel() for p in self.model.parameters())
print(f"[MINDIArchitecture] CPU load done ({param_count / 1e9:.2f}B params)")
if self.device == "cuda":
print(f"[MINDIArchitecture] Moving to CUDA ...")
self.model = self.model.to("cuda")
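            # Wait for the host-to-device copy to finish before reporting VRAM.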
torch.cuda.synchronize()
vram_gb = torch.cuda.memory_allocated() / (1024**3)
print(f"[MINDIArchitecture] CUDA transfer done ({vram_gb:.1f} GB VRAM)")
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_name,
trust_remote_code=True,
)
print(f"[MINDIArchitecture] Loaded on {self.device} "
f"({self._fmt_params(self._total_params())} params)")
def apply_lora(
self,
r: int = 64,
lora_alpha: int = 128,
lora_dropout: float = 0.05,
target_modules: Optional[list[str]] = None,
) -> PeftModel:
"""
Apply LoRA adapters to the base model.
Args:
r: LoRA rank.
lora_alpha: LoRA scaling factor.
lora_dropout: Dropout probability for LoRA layers.
target_modules: List of module names to apply LoRA to.
Returns:
The PEFT-wrapped model.
"""
if self.model is None:
raise RuntimeError("Base model not loaded.")
if target_modules is None:
target_modules = self.DEFAULT_TARGET_MODULES
lora_config = LoraConfig(
r=r,
lora_alpha=lora_alpha,
lora_dropout=lora_dropout,
target_modules=target_modules,
bias="none",
task_type=TaskType.CAUSAL_LM,
)
self.peft_model = get_peft_model(self.model, lora_config)
info = self.get_trainable_params()
print(f"[MINDIArchitecture] LoRA applied (r={r}, alpha={lora_alpha})")
print(f" Trainable: {info['trainable']:>14,} ({info['trainable_pct']:.2f}%)")
print(f" Frozen: {info['frozen']:>14,}")
print(f" Total: {info['total']:>14,}")
return self.peft_model
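    # Usage sketch (values are illustrative, not project defaults): a smaller
    # adapter for memory-constrained experiments.
    #   arch.apply_lora(r=16, lora_alpha=32, lora_dropout=0.1)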
def get_trainable_params(self) -> dict:
"""
Count trainable, frozen, and total parameters.
Returns:
Dictionary with 'trainable', 'frozen', 'total', 'trainable_pct'.
"""
model = self.peft_model or self.model
if model is None:
return {"trainable": 0, "frozen": 0, "total": 0, "trainable_pct": 0.0}
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
frozen = total - trainable
pct = 100.0 * trainable / total if total > 0 else 0.0
return {
"trainable": trainable,
"frozen": frozen,
"total": total,
"trainable_pct": round(pct, 4),
}
def print_model_info(self) -> None:
"""Print detailed model architecture and parameter information."""
model = self.peft_model or self.model
if model is None:
print("[MINDIArchitecture] No model loaded.")
return
info = self.get_trainable_params()
print()
print("=" * 60)
print(" MINDI 1.5 β€” Model Architecture Info")
print("=" * 60)
print(f" Base model: {self.model_name}")
print(f" Device: {self.device}")
print(f" Dtype: {self.torch_dtype}")
print(f" LoRA active: {self.peft_model is not None}")
print(f" Total params: {self._fmt_params(info['total'])}")
print(f" Trainable: {self._fmt_params(info['trainable'])} "
f"({info['trainable_pct']:.2f}%)")
print(f" Frozen: {self._fmt_params(info['frozen'])}")
if self.peft_model is not None:
config = self.peft_model.peft_config.get("default")
if config is not None:
print(f" LoRA rank: {config.r}")
print(f" LoRA alpha: {config.lora_alpha}")
print(f" LoRA dropout: {config.lora_dropout}")
print(f" Target modules: {config.target_modules}")
print("=" * 60)
print()
def save_lora(self, path: Optional[Path] = None) -> Path:
"""
Save LoRA adapter weights to disk.
Args:
path: Directory to save to. Defaults to checkpoints/lora.
Returns:
Path where weights were saved.
"""
if self.peft_model is None:
raise RuntimeError("No LoRA adapter to save. Call apply_lora() first.")
save_path = Path(path) if path else Path("./checkpoints/lora")
save_path.mkdir(parents=True, exist_ok=True)
self.peft_model.save_pretrained(str(save_path))
print(f"[MINDIArchitecture] LoRA saved to {save_path}")
return save_path
def load_lora(self, path: Path) -> PeftModel:
"""
Load LoRA adapter weights from disk.
Args:
path: Directory containing saved adapter weights.
Returns:
The PEFT-wrapped model with loaded adapter.
"""
path = Path(path)
if not path.exists():
raise FileNotFoundError(f"LoRA adapter not found: {path}")
if self.model is None:
raise RuntimeError("Base model not loaded.")
self.peft_model = PeftModel.from_pretrained(
self.model, str(path)
)
print(f"[MINDIArchitecture] LoRA loaded from {path}")
return self.peft_model
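    # Usage sketch (the path is illustrative): save the adapter after training,
    # then reload it onto a freshly loaded base model.
    #   adapter_dir = arch.save_lora(Path("./checkpoints/lora/run_001"))
    #   arch.load_lora(adapter_dir)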
def resize_embeddings(self, new_vocab_size: int) -> None:
"""Resize model embeddings for new special tokens."""
model = self.peft_model or self.model
if model is None:
raise RuntimeError("No model loaded.")
old_size = model.get_input_embeddings().weight.shape[0]
if new_vocab_size != old_size:
model.resize_token_embeddings(new_vocab_size)
print(f"[MINDIArchitecture] Resized embeddings: {old_size} β†’ {new_vocab_size}")
def get_model(self) -> AutoModelForCausalLM | PeftModel:
"""Return the active model (PEFT if LoRA applied, else base)."""
model = self.peft_model or self.model
if model is None:
raise RuntimeError("No model loaded.")
return model
# ── helpers ───────────────────────────────────────────────────
def _total_params(self) -> int:
model = self.peft_model or self.model
if model is None:
return 0
return sum(p.numel() for p in model.parameters())
@staticmethod
def _fmt_params(n: int) -> str:
if n >= 1_000_000_000:
return f"{n / 1_000_000_000:.2f}B"
if n >= 1_000_000:
return f"{n / 1_000_000:.2f}M"
if n >= 1_000:
return f"{n / 1_000:.1f}K"
return str(n)
# ── Test block ────────────────────────────────────────────────────────
if __name__ == "__main__":
print("=" * 60)
print(" MINDI 1.5 β€” Architecture Test")
print("=" * 60)
print()
# 1. Load base model
arch = MINDIArchitecture(
model_name="Qwen/Qwen2.5-Coder-7B-Instruct",
)
# 2. Apply LoRA
peft_model = arch.apply_lora(
r=64,
lora_alpha=128,
lora_dropout=0.05,
)
# 3. Print full info
arch.print_model_info()
# 4. Verify trainable params
info = arch.get_trainable_params()
assert info["trainable"] > 0, "No trainable parameters!"
assert info["frozen"] > info["trainable"], "More trainable than frozen β€” LoRA may not be applied!"
    # 5. Verify LoRA parameters exist
    lora_params = [name for name, _ in peft_model.named_parameters() if "lora_" in name]
    print(f" LoRA parameter tensors found: {len(lora_params)}")
    assert len(lora_params) > 0, "No LoRA parameters found!"
# 6. Quick forward pass test (small input)
print("\n Running forward pass test ...")
test_input = arch.tokenizer("Hello MINDI!", return_tensors="pt")
test_input = {k: v.to(arch.device) for k, v in test_input.items()}
with torch.no_grad():
        # Pass the input ids as labels so the forward pass also returns a loss.
        output = peft_model(**test_input, labels=test_input["input_ids"])
    print(f" Output logits shape: {output.logits.shape}")
    print(f" Loss: {output.loss.item():.4f}")
print("\n βœ“ All architecture tests passed!")
print("=" * 60)