|
|
|
|
|
""" |
|
|
Test FastVLM-7B with 8-bit quantization for limited RAM systems |
|
|
Following exact HuggingFace model card implementation |
|
|
""" |
|
|
|
|
|
import torch |
|
|
import psutil |
|
|
from PIL import Image |
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig |
|
|
|
|
|
def check_system(): |
|
|
"""Check system capabilities""" |
|
|
print("="*60) |
|
|
print("System Check") |
|
|
print("="*60) |
|
|
|
|
|
|
|
|
mem = psutil.virtual_memory() |
|
|
print(f"Total RAM: {mem.total / 1e9:.2f} GB") |
|
|
print(f"Available RAM: {mem.available / 1e9:.2f} GB") |
|
|
print(f"Used RAM: {mem.percent}%") |
|
|
|
|
|
|
|
|
if torch.cuda.is_available(): |
|
|
device = "cuda" |
|
|
print(f"GPU: {torch.cuda.get_device_name(0)}") |
|
|
elif torch.backends.mps.is_available(): |
|
|
device = "mps" |
|
|
print("Device: Apple Silicon MPS") |
|
|
else: |
|
|
device = "cpu" |
|
|
print("Device: CPU") |
|
|
|
|
|
print() |
|
|
return device, mem.available / 1e9 |
|
|
|
|
|
def test_fastvlm_quantized(): |
|
|
"""Test FastVLM-7B with quantization""" |
|
|
print("="*60) |
|
|
print("Testing FastVLM-7B with 8-bit Quantization") |
|
|
print("="*60) |
|
|
|
|
|
device, available_gb = check_system() |
|
|
|
|
|
|
|
|
MID = "apple/FastVLM-7B" |
|
|
IMAGE_TOKEN_INDEX = -200 |
|
|
|
|
|
print(f"\n1. Loading tokenizer from {MID}...") |
|
|
try: |
|
|
tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True) |
|
|
print(f" β Tokenizer loaded: {tok.__class__.__name__}") |
|
|
print(f" β Vocab size: {tok.vocab_size}") |
|
|
print(f" β IMAGE_TOKEN_INDEX = {IMAGE_TOKEN_INDEX}") |
|
|
except Exception as e: |
|
|
print(f" β Failed to load tokenizer: {e}") |
|
|
return False |
|
|
|
|
|
print(f"\n2. Configuring 8-bit quantization...") |
|
|
if available_gb < 12: |
|
|
print(f" Memory available: {available_gb:.2f} GB") |
|
|
print(" Using 8-bit quantization for memory efficiency") |
|
|
|
|
|
|
|
|
quantization_config = BitsAndBytesConfig( |
|
|
load_in_8bit=True, |
|
|
bnb_8bit_compute_dtype=torch.float16 if device != "cpu" else torch.float32, |
|
|
bnb_8bit_use_double_quant=True, |
|
|
bnb_8bit_quant_type="nf4" |
|
|
) |
|
|
|
|
|
model_kwargs = { |
|
|
"quantization_config": quantization_config, |
|
|
"trust_remote_code": True, |
|
|
"low_cpu_mem_usage": True |
|
|
} |
|
|
print(" Configuration: 8-bit NF4 quantization with double quantization") |
|
|
print(" Expected memory usage: ~7GB") |
|
|
else: |
|
|
print(f" Memory available: {available_gb:.2f} GB (sufficient for full precision)") |
|
|
model_kwargs = { |
|
|
"torch_dtype": torch.float16 if device != "cpu" else torch.float32, |
|
|
"device_map": "auto", |
|
|
"trust_remote_code": True, |
|
|
"low_cpu_mem_usage": True |
|
|
} |
|
|
print(" Configuration: Full precision") |
|
|
print(" Expected memory usage: ~14GB") |
|
|
|
|
|
print(f"\n3. Loading model from {MID}...") |
|
|
print(" This may take several minutes on first run...") |
|
|
|
|
|
try: |
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
MID, |
|
|
**model_kwargs |
|
|
) |
|
|
print(" β Model loaded successfully!") |
|
|
|
|
|
|
|
|
total_params = sum(p.numel() for p in model.parameters()) |
|
|
print(f" β Parameters: {total_params / 1e9:.2f}B") |
|
|
|
|
|
|
|
|
if hasattr(model, 'get_vision_tower'): |
|
|
print(" β Vision tower (FastViTHD) available") |
|
|
else: |
|
|
print(" β Vision tower not detected") |
|
|
|
|
|
print(f"\n4. Testing generation with IMAGE_TOKEN_INDEX...") |
|
|
|
|
|
|
|
|
messages = [ |
|
|
{"role": "user", "content": "<image>\nDescribe this image."} |
|
|
] |
|
|
|
|
|
|
|
|
rendered = tok.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) |
|
|
pre, post = rendered.split("<image>", 1) |
|
|
|
|
|
|
|
|
pre_ids = tok(pre, return_tensors="pt", add_special_tokens=False).input_ids |
|
|
post_ids = tok(post, return_tensors="pt", add_special_tokens=False).input_ids |
|
|
|
|
|
|
|
|
img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype) |
|
|
|
|
|
|
|
|
input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1) |
|
|
print(f" Input IDs shape: {input_ids.shape}") |
|
|
print(f" Image token inserted at position: {(input_ids == IMAGE_TOKEN_INDEX).nonzero()[0, 1].item()}") |
|
|
|
|
|
print("\nβ
SUCCESS: FastVLM-7B is properly configured!") |
|
|
print(f" - Model: {MID}") |
|
|
print(f" - IMAGE_TOKEN_INDEX: {IMAGE_TOKEN_INDEX}") |
|
|
print(f" - Quantization: {'8-bit' if available_gb < 12 else 'Full precision'}") |
|
|
print(f" - trust_remote_code: True") |
|
|
print(f" - Device: {device}") |
|
|
|
|
|
|
|
|
mem_after = psutil.virtual_memory() |
|
|
mem_used = (mem.total - mem_after.available) / 1e9 |
|
|
print(f"\n Memory used by model: ~{mem_used:.2f} GB") |
|
|
|
|
|
return True |
|
|
|
|
|
except RuntimeError as e: |
|
|
if "out of memory" in str(e).lower(): |
|
|
print("\nβ Out of Memory Error!") |
|
|
print("\nThe system does not have enough RAM even with 8-bit quantization.") |
|
|
print("Solutions:") |
|
|
print("1. Close other applications to free memory") |
|
|
print("2. Use apple/FastVLM-1.5B (smaller model)") |
|
|
print("3. Upgrade to 16GB+ RAM") |
|
|
print("4. Use cloud GPU services") |
|
|
else: |
|
|
print(f"\nβ Runtime Error: {e}") |
|
|
return False |
|
|
|
|
|
except ImportError as e: |
|
|
if "bitsandbytes" in str(e): |
|
|
print("\nβ bitsandbytes not installed properly") |
|
|
print("Run: pip install bitsandbytes") |
|
|
else: |
|
|
print(f"\nβ Import Error: {e}") |
|
|
return False |
|
|
|
|
|
except Exception as e: |
|
|
print(f"\nβ Error: {e}") |
|
|
import traceback |
|
|
traceback.print_exc() |
|
|
return False |
|
|
|
|
|
if __name__ == "__main__": |
|
|
print("FastVLM-7B Quantization Test") |
|
|
print("Using exact implementation from HuggingFace model card") |
|
|
print() |
|
|
|
|
|
success = test_fastvlm_quantized() |
|
|
|
|
|
if not success: |
|
|
print("\n" + "="*60) |
|
|
print("Hardware Requirements Not Met") |
|
|
print("="*60) |
|
|
print("\nFastVLM-7B requires one of:") |
|
|
print("β’ 14GB+ RAM for full precision") |
|
|
print("β’ 7-8GB RAM with 8-bit quantization") |
|
|
print("β’ GPU with 8GB+ VRAM") |
|
|
print("\nYour system has insufficient resources.") |
|
|
print("The code is correctly configured but needs more memory.") |