---
license: apache-2.0
language:
- vi
- en
tags:
- vision-language-model
- vlm
- qwen3
- fastvlm
- vietnamese
base_model: Qwen/Qwen3-0.6B
datasets:
- 5CD-AI/Viet-multimodal-open-r1-8k-verified
---
| |
# Belle-VLM: Vietnamese Vision Language Model

## Model Description

Belle-VLM is a Vision Language Model trained for Vietnamese multimodal reasoning tasks.

### Architecture
- **LLM Backbone**: Qwen3-0.6B
- **Vision Encoder**: FastViTHD (MobileCLIP)
- **Projector**: 2-layer MLP (3072 -> 1024)

### Training
- **Dataset**: 5CD-AI/Viet-multimodal-open-r1-8k-verified
- **Method**: LoRA fine-tuning
- **Steps**: 2
- **Learning Rate**: 2e-05

## Usage

```python
| !pip install -q transformers>=4.51.0 torch torchvision timm pillow requests datasets einops |
| !pip install -q open-clip-torch |
| !git clone https://github.com/Hert4/ml-fastvlm-v2.git |
| %cd ml-fastvlm-v2 |
| |
| import sys |
| import os |
| import torch |
| import requests |
| from PIL import Image |
| from io import BytesIO |
| |
| # Add path to ml-fastvlm-v2 |
| FASTVLM_PATH = "/kaggle/working/ml-fastvlm-v2" |
| if os.path.exists(FASTVLM_PATH): |
| sys.path.insert(0, FASTVLM_PATH) |
| else: |
| # Local path fallback |
| sys.path.insert(0, ".") |
| |
| |
| MODEL_PATH = "beyoru/Belle-VLM" |
| DEVICE = "cuda" if torch.cuda.is_available() else "cpu" |
| DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32 |
| |
| print(f"Device: {DEVICE}") |
| print(f"Dtype: {DTYPE}") |
| |
| from transformers import AutoTokenizer |
| from llava.model.language_model.llava_qwen import LlavaQwen3ForCausalLM |
| |
| print(f"\nLoading model from: {MODEL_PATH}") |
| |
| # Load tokenizer |
| tokenizer = AutoTokenizer.from_pretrained( |
| MODEL_PATH, |
| trust_remote_code=True, |
| use_fast=False |
| ) |
| if tokenizer.pad_token is None: |
| tokenizer.pad_token = tokenizer.eos_token |
| |
| # Load model |
| model = LlavaQwen3ForCausalLM.from_pretrained( |
| MODEL_PATH, |
| torch_dtype=DTYPE, |
| device_map="auto", |
| trust_remote_code=True, |
| ) |
| model.eval() |
| |
| # Setup vision tower |
| vision_tower = model.get_vision_tower() |
| if not vision_tower.is_loaded: |
| vision_tower.load_model() |
| vision_tower = vision_tower.to(device=model.device, dtype=DTYPE) |
| image_processor = vision_tower.image_processor |
| |
| print("Model loaded!") |
| |
| |
def debug_model():
    """Check if model components are properly loaded.

    Prints diagnostics for the module-level ``model`` and ``vision_tower``:
    mm_projector presence and weight statistics, vision-tower status, and
    the multimodal entries of the model config.

    Returns:
        bool: False if the mm_projector is missing, True otherwise.
    """
    print("\n" + "=" * 60)
    print("DEBUG: Checking model components")
    print("=" * 60)

    # 1. Check mm_projector
    if hasattr(model.model, 'mm_projector'):
        mm_proj = model.model.mm_projector
        print(f"[OK] mm_projector exists: {type(mm_proj)}")

        # Check weights
        if isinstance(mm_proj, torch.nn.Sequential):
            first_layer = mm_proj[0]
            w = first_layer.weight.float()  # Convert to float for stats
            print(f"  First layer shape: {w.shape}")
            print(f"  Weight mean: {w.mean().item():.6f}")
            print(f"  Weight std: {w.std().item():.6f}")
            print(f"  Weight range: [{w.min().item():.4f}, {w.max().item():.4f}]")

            # Heuristic: a very large std usually means random init,
            # i.e. the trained projector weights were not loaded.
            if w.std().item() > 0.3:
                print("  [WARNING] Std too high - might be random init!")
            else:
                print("  [OK] Weights look trained")
    else:
        print("[ERROR] mm_projector NOT FOUND!")
        return False

    # 2. Check vision tower
    print(f"\n[OK] Vision tower: {type(vision_tower).__name__}")
    print(f"  Is loaded: {vision_tower.is_loaded}")
    print(f"  Hidden size: {getattr(vision_tower, 'hidden_size', 'unknown')}")

    # 3. Check config
    print(f"\n[INFO] Config:")
    print(f"  mm_vision_tower: {getattr(model.config, 'mm_vision_tower', 'NOT SET')}")
    print(f"  mm_hidden_size: {getattr(model.config, 'mm_hidden_size', 'NOT SET')}")
    print(f"  mm_projector_type: {getattr(model.config, 'mm_projector_type', 'NOT SET')}")
    print(f"  LLM hidden_size: {model.config.hidden_size}")

    return True
| |
| |
def debug_image_encoding(image):
    """Test image encoding pipeline.

    Runs ``image`` through preprocessing, the vision tower and the
    mm_projector, printing tensor statistics at each stage and comparing
    the projected feature scale against the text embedding scale.

    Args:
        image: PIL Image to encode.

    Returns:
        torch.Tensor: projected image features (output of mm_projector).
    """
    from llava.mm_utils import process_images

    print("\n" + "=" * 60)
    print("DEBUG: Testing image encoding pipeline")
    print("=" * 60)

    # Process image
    image_tensor = process_images([image], image_processor, model.config)[0]
    image_tensor = image_tensor.unsqueeze(0).to(dtype=DTYPE, device=model.device)

    print(f"[1] Image tensor:")
    print(f"  Shape: {image_tensor.shape}")
    print(f"  Dtype: {image_tensor.dtype}")
    print(f"  Range: [{image_tensor.min().item():.3f}, {image_tensor.max().item():.3f}]")

    # Everything below is a read-only diagnostic pass, so no gradients.
    with torch.no_grad():
        vision_features = vision_tower(image_tensor)
        print(f"\n[2] Vision features (from vision tower):")
        print(f"  Shape: {vision_features.shape}")
        print(f"  Mean: {vision_features.float().mean().item():.6f}")
        print(f"  Std: {vision_features.float().std().item():.6f}")

        # Check for NaN/Inf
        if torch.isnan(vision_features).any():
            print("  [ERROR] Contains NaN!")
        if torch.isinf(vision_features).any():
            print("  [ERROR] Contains Inf!")

        # Project through mm_projector
        projected = model.model.mm_projector(vision_features)
        print(f"\n[3] Projected features (after mm_projector):")
        print(f"  Shape: {projected.shape}")
        print(f"  Mean: {projected.float().mean().item():.6f}")
        print(f"  Std: {projected.float().std().item():.6f}")

        # Check for NaN/Inf
        if torch.isnan(projected).any():
            print("  [ERROR] Contains NaN!")
        if torch.isinf(projected).any():
            print("  [ERROR] Contains Inf!")

        # Compare with text embeddings scale
        sample_ids = torch.tensor([[1, 2, 3]], device=model.device)
        text_embeds = model.model.embed_tokens(sample_ids)
        print(f"\n[4] Text embeddings (for comparison):")
        print(f"  Mean: {text_embeds.float().mean().item():.6f}")
        print(f"  Std: {text_embeds.float().std().item():.6f}")

    # A large scale mismatch between projected image features and text
    # embeddings tends to destabilize generation.
    proj_std = projected.float().std().item()
    text_std = text_embeds.float().std().item()
    ratio = proj_std / text_std if text_std > 0 else float('inf')
    print(f"\n[5] Scale ratio (projected/text): {ratio:.2f}")
    if ratio > 10 or ratio < 0.1:
        print("  [WARNING] Scale mismatch! May cause issues.")
    else:
        print("  [OK] Scales are similar")

    return projected
| |
| |
| |
from llava.conversation import conv_templates
from llava.mm_utils import process_images, tokenizer_image_token
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN


def chat(image, question, temperature=0.7, max_new_tokens=512, debug=False):
    """
    Chat with the model about an image.

    Args:
        image: PIL Image
        question: str
        temperature: float (0.0 = deterministic, higher = more random)
        max_new_tokens: int
        debug: bool - print debug info

    Returns:
        str: Model response
    """
    if debug:
        debug_image_encoding(image)

    # Process image
    image_tensor = process_images([image], image_processor, model.config)[0]

    # Build conversation: the image placeholder token goes before the question.
    conv = conv_templates["qwen_3"].copy()
    conv.append_message(conv.roles[0], f"{DEFAULT_IMAGE_TOKEN}\n{question}")
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    if debug:
        print(f"\n[DEBUG] Prompt:\n{prompt[:500]}...")

    # Tokenize; IMAGE_TOKEN_INDEX marks where image features get spliced in.
    input_ids = tokenizer_image_token(
        prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt'
    ).unsqueeze(0).to(model.device)

    if debug:
        print(f"\n[DEBUG] Input IDs shape: {input_ids.shape}")
        # Check for IMAGE_TOKEN_INDEX
        num_image_tokens = (input_ids == IMAGE_TOKEN_INDEX).sum().item()
        print(f"[DEBUG] Number of image tokens: {num_image_tokens}")

    # Generate (greedy when temperature == 0, sampling otherwise)
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor.unsqueeze(0).to(dtype=DTYPE, device=model.device),
            image_sizes=[image.size],
            do_sample=temperature > 0,
            temperature=temperature if temperature > 0 else None,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Clean up response (remove prompt echo if present)
    if "<|im_start|>" in response:
        # Find the assistant's response
        parts = response.split("<|im_start|>assistant")
        if len(parts) > 1:
            response = parts[-1].strip()

    return response
| |
| |
| |
if __name__ == "__main__":
    # Run debug checks
    debug_model()

    # Load test image
    print("\n" + "=" * 60)
    print("Loading test image...")
    print("=" * 60)

    url = "<your url here>"  # TODO: replace with a real image URL before running
    # timeout keeps the script from hanging forever on a dead URL
    image = Image.open(BytesIO(requests.get(url, timeout=30).content)).convert("RGB")
    print(f"Image size: {image.size}")

    # Debug image encoding
    debug_image_encoding(image)

    # Test chat (English + Vietnamese prompts)
    print("\n" + "=" * 60)
    print("Testing chat...")
    print("=" * 60)

    questions = [
        "What is in this image?",
        "Mô tả hình ảnh này",
        "Con vật trong ảnh là gì?",
    ]

    for q in questions:
        print(f"\nQ: {q}")
        response = chat(image, q, temperature=0.3, max_new_tokens=256)
        print(f"A: {response[:500]}...")
        print("-" * 40)
```

## Training Details

| Parameter | Value |
|-----------|-------|
| Base Model | Qwen/Qwen3-0.6B |
| Vision Tower | apple/MobileCLIP2-S4 |
| LoRA Rank | 4 |
| LoRA Alpha | 8 |
| Batch Size | 1 x 1 |
| Max Steps | 2 |

## Conclusion

> Limitations of MobileCLIP, together with the very short training run (2 steps), may result in a low-quality model.

## 🤝 Contribute

[ml-fastvlm-v2 on GitHub](https://github.com/Hert4/ml-fastvlm-v2)

## License

Apache 2.0
|
|