#!/usr/bin/env python3
"""
Test FastVLM-7B with 8-bit quantization on systems with limited RAM.
Follows the implementation from the HuggingFace model card.
"""

import torch
import psutil
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
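
# Typical dependencies: torch, transformers, psutil, pillow; plus bitsandbytes
# and accelerate for the 8-bit quantized loading path (CUDA GPU recommended).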

def check_system():
    """Check system capabilities"""
    print("="*60)
    print("System Check")
    print("="*60)
    
    # Memory check
    mem = psutil.virtual_memory()
    print(f"Total RAM: {mem.total / 1e9:.2f} GB")
    print(f"Available RAM: {mem.available / 1e9:.2f} GB")
    print(f"Used RAM: {mem.percent}%")
    
    # Device check
    if torch.cuda.is_available():
        device = "cuda"
        print(f"GPU: {torch.cuda.get_device_name(0)}")
    elif torch.backends.mps.is_available():
        device = "mps"
        print("Device: Apple Silicon MPS")
    else:
        device = "cpu"
        print("Device: CPU")
    
    print()
    return device, mem.available / 1e9

def test_fastvlm_quantized():
    """Test FastVLM-7B with quantization"""
    print("="*60)
    print("Testing FastVLM-7B with 8-bit Quantization")
    print("="*60)
    
    device, available_gb = check_system()
    
    # Model ID from HuggingFace
    MID = "apple/FastVLM-7B"
    IMAGE_TOKEN_INDEX = -200  # As specified in model card
    
    print(f"\n1. Loading tokenizer from {MID}...")
    try:
        tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
        print(f"   βœ“ Tokenizer loaded: {tok.__class__.__name__}")
        print(f"   βœ“ Vocab size: {tok.vocab_size}")
        print(f"   βœ“ IMAGE_TOKEN_INDEX = {IMAGE_TOKEN_INDEX}")
    except Exception as e:
        print(f"   βœ— Failed to load tokenizer: {e}")
        return False
    
    print(f"\n2. Configuring 8-bit quantization...")
    if available_gb < 12:
        print(f"   Memory available: {available_gb:.2f} GB")
        print("   Using 8-bit quantization for memory efficiency")
        
        # Configure 8-bit (LLM.int8) quantization.
        # Note: NF4 and double quantization are 4-bit options (bnb_4bit_*) and do
        # not apply to 8-bit loading, so only load_in_8bit is set here.
        # bitsandbytes typically requires a CUDA GPU; most builds do not support
        # CPU or Apple Silicon (MPS) inference.
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)

        model_kwargs = {
            "quantization_config": quantization_config,
            "device_map": "auto",
            "trust_remote_code": True,
            "low_cpu_mem_usage": True
        }
        print("   Configuration: 8-bit (LLM.int8) quantization")
        print("   Expected memory usage: ~7GB")
    else:
        print(f"   Memory available: {available_gb:.2f} GB (sufficient for full precision)")
        model_kwargs = {
            "torch_dtype": torch.float16 if device != "cpu" else torch.float32,
            "device_map": "auto",
            "trust_remote_code": True,
            "low_cpu_mem_usage": True
        }
        print("   Configuration: Full precision")
        print("   Expected memory usage: ~14GB")
    
    print(f"\n3. Loading model from {MID}...")
    print("   This may take several minutes on first run...")
    
    try:
        model = AutoModelForCausalLM.from_pretrained(
            MID,
            **model_kwargs
        )
        print("   βœ“ Model loaded successfully!")
        
        # Check model details
        total_params = sum(p.numel() for p in model.parameters())
        print(f"   βœ“ Parameters: {total_params / 1e9:.2f}B")
        
        # Check if vision tower is available
        if hasattr(model, 'get_vision_tower'):
            print("   βœ“ Vision tower (FastViTHD) available")
        else:
            print("   ⚠ Vision tower not detected")
        
        print(f"\n4. Building prompt input_ids with IMAGE_TOKEN_INDEX...")
        
        # Test message with image placeholder
        messages = [
            {"role": "user", "content": "<image>\nDescribe this image."}
        ]
        
        # Apply chat template
        rendered = tok.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
        pre, post = rendered.split("<image>", 1)
        
        # Tokenize parts
        pre_ids = tok(pre, return_tensors="pt", add_special_tokens=False).input_ids
        post_ids = tok(post, return_tensors="pt", add_special_tokens=False).input_ids
        
        # Create image token
        img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype)
        
        # Combine tokens
        input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1)
        print(f"   Input IDs shape: {input_ids.shape}")
        print(f"   Image token inserted at position: {(input_ids == IMAGE_TOKEN_INDEX).nonzero()[0, 1].item()}")
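
        # Optional full generation pass: a minimal sketch following the model
        # card's LLaVA-style API. It assumes a local image file (the path
        # "test_image.jpg" below is a placeholder) and enough free memory, so it
        # is left commented out in this configuration test.
        # img = Image.open("test_image.jpg").convert("RGB")
        # px = model.get_vision_tower().image_processor(images=img, return_tensors="pt")["pixel_values"]
        # px = px.to(model.device, dtype=model.dtype)
        # with torch.no_grad():
        #     out = model.generate(
        #         inputs=input_ids.to(model.device),
        #         pixel_values=px,
        #         max_new_tokens=64,
        #     )
        # print(tok.decode(out[0], skip_special_tokens=True))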
        
        print("\nβœ… SUCCESS: FastVLM-7B is properly configured!")
        print(f"   - Model: {MID}")
        print(f"   - IMAGE_TOKEN_INDEX: {IMAGE_TOKEN_INDEX}")
        print(f"   - Quantization: {'8-bit' if available_gb < 12 else 'Full precision'}")
        print(f"   - trust_remote_code: True")
        print(f"   - Device: {device}")
        
        # Approximate memory used by the model: available RAM before loading
        # minus available RAM now (the `mem` from check_system() is out of scope here)
        mem_after = psutil.virtual_memory()
        mem_used = available_gb - mem_after.available / 1e9
        print(f"\n   Memory used by model: ~{mem_used:.2f} GB")
        
        return True
        
    except RuntimeError as e:
        if "out of memory" in str(e).lower():
            print("\nβœ— Out of Memory Error!")
            print("\nThe system does not have enough RAM even with 8-bit quantization.")
            print("Solutions:")
            print("1. Close other applications to free memory")
            print("2. Use apple/FastVLM-1.5B (smaller model)")
            print("3. Upgrade to 16GB+ RAM")
            print("4. Use cloud GPU services")
        else:
            print(f"\nβœ— Runtime Error: {e}")
        return False
        
    except ImportError as e:
        if "bitsandbytes" in str(e):
            print("\nβœ— bitsandbytes not installed properly")
            print("Run: pip install bitsandbytes")
        else:
            print(f"\nβœ— Import Error: {e}")
        return False
        
    except Exception as e:
        print(f"\nβœ— Error: {e}")
        import traceback
        traceback.print_exc()
        return False

if __name__ == "__main__":
    print("FastVLM-7B Quantization Test")
    print("Using the implementation from the HuggingFace model card")
    print()
    
    success = test_fastvlm_quantized()
    
    if not success:
        print("\n" + "="*60)
        print("Hardware Requirements Not Met")
        print("="*60)
        print("\nFastVLM-7B requires one of:")
        print("β€’ 14GB+ RAM for full precision")
        print("β€’ 7-8GB RAM with 8-bit quantization")
        print("β€’ GPU with 8GB+ VRAM")
        print("\nYour system has insufficient resources.")
        print("The code is correctly configured but needs more memory.")