---
license: apache-2.0
language:
- vi
- en
tags:
- vision-language-model
- vlm
- qwen3
- fastvlm
- vietnamese
base_model: Qwen/Qwen3-0.6B
datasets:
- 5CD-AI/Viet-multimodal-open-r1-8k-verified
---

# Belle-VLM: Vietnamese Vision Language Model

## Model Description

Belle-VLM is a Vision Language Model trained for Vietnamese multimodal reasoning tasks.

### Architecture

- **LLM Backbone**: Qwen3-0.6B
- **Vision Encoder**: FastViTHD (MobileCLIP)
- **Projector**: 2-layer MLP (3072 -> 1024)

### Training

- **Dataset**: 5CD-AI/Viet-multimodal-open-r1-8k-verified
- **Method**: LoRA fine-tuning
- **Steps**: 2
- **Learning Rate**: 2e-05

## Usage

```python
!pip install -q "transformers>=4.51.0" torch torchvision timm pillow requests datasets einops
!pip install -q open-clip-torch
!git clone https://github.com/Hert4/ml-fastvlm-v2.git
%cd ml-fastvlm-v2

import sys
import os
import torch
import requests
from PIL import Image
from io import BytesIO

# Add path to ml-fastvlm-v2
FASTVLM_PATH = "/kaggle/working/ml-fastvlm-v2"
if os.path.exists(FASTVLM_PATH):
    sys.path.insert(0, FASTVLM_PATH)
else:
    # Local path fallback
    sys.path.insert(0, ".")

MODEL_PATH = "beyoru/Belle-VLM"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

print(f"Device: {DEVICE}")
print(f"Dtype: {DTYPE}")

from transformers import AutoTokenizer
from llava.model.language_model.llava_qwen import LlavaQwen3ForCausalLM

print(f"\nLoading model from: {MODEL_PATH}")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    use_fast=False
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load model
model = LlavaQwen3ForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=DTYPE,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()

# Setup vision tower
vision_tower = model.get_vision_tower()
if not vision_tower.is_loaded:
    vision_tower.load_model()
vision_tower = vision_tower.to(device=model.device, dtype=DTYPE)
image_processor = vision_tower.image_processor

print("Model loaded!")


def debug_model():
    """Check if model components are properly loaded."""
    print("\n" + "=" * 60)
    print("DEBUG: Checking model components")
    print("=" * 60)

    # 1. Check mm_projector
    if hasattr(model.model, 'mm_projector'):
        mm_proj = model.model.mm_projector
        print(f"[OK] mm_projector exists: {type(mm_proj)}")

        # Check weights
        if isinstance(mm_proj, torch.nn.Sequential):
            first_layer = mm_proj[0]
            w = first_layer.weight.float()  # Convert to float for stats
            print(f" First layer shape: {w.shape}")
            print(f" Weight mean: {w.mean().item():.6f}")
            print(f" Weight std: {w.std().item():.6f}")
            print(f" Weight range: [{w.min().item():.4f}, {w.max().item():.4f}]")

            # Check if weights look trained
            if w.std().item() > 0.3:
                print(" [WARNING] Std too high - might be random init!")
            else:
                print(" [OK] Weights look trained")
    else:
        print("[ERROR] mm_projector NOT FOUND!")
        return False

    # 2. Check vision tower
    print(f"\n[OK] Vision tower: {type(vision_tower).__name__}")
    print(f" Is loaded: {vision_tower.is_loaded}")
    print(f" Hidden size: {getattr(vision_tower, 'hidden_size', 'unknown')}")
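    # (Added illustrative check, not part of the original script: it assumes
    #  mm_projector is an nn.Sequential of Linear/GELU layers, e.g. a
    #  "mlp2x_gelu"-style projector, and is skipped otherwise.)
    # The card describes a 2-layer MLP projecting 3072-dim vision features to
    # the 1024-dim hidden size of Qwen3-0.6B; this prints the actual dims.
    if isinstance(model.model.mm_projector, torch.nn.Sequential):
        linears = [m for m in model.model.mm_projector if isinstance(m, torch.nn.Linear)]
        if linears:
            print(f" Projector dims: {linears[0].in_features} -> {linears[-1].out_features}"
                  f" (LLM hidden_size: {model.config.hidden_size})")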

    # 3. Check config
    print(f"\n[INFO] Config:")
    print(f" mm_vision_tower: {getattr(model.config, 'mm_vision_tower', 'NOT SET')}")
    print(f" mm_hidden_size: {getattr(model.config, 'mm_hidden_size', 'NOT SET')}")
    print(f" mm_projector_type: {getattr(model.config, 'mm_projector_type', 'NOT SET')}")
    print(f" LLM hidden_size: {model.config.hidden_size}")

    return True


def debug_image_encoding(image):
    """Test image encoding pipeline."""
    from llava.mm_utils import process_images

    print("\n" + "=" * 60)
    print("DEBUG: Testing image encoding pipeline")
    print("=" * 60)

    # Process image
    image_tensor = process_images([image], image_processor, model.config)[0]
    image_tensor = image_tensor.unsqueeze(0).to(dtype=DTYPE, device=model.device)

    print(f"[1] Image tensor:")
    print(f" Shape: {image_tensor.shape}")
    print(f" Dtype: {image_tensor.dtype}")
    print(f" Range: [{image_tensor.min().item():.3f}, {image_tensor.max().item():.3f}]")

    # Get vision features
    with torch.no_grad():
        vision_features = vision_tower(image_tensor)

    print(f"\n[2] Vision features (from vision tower):")
    print(f" Shape: {vision_features.shape}")
    print(f" Mean: {vision_features.float().mean().item():.6f}")
    print(f" Std: {vision_features.float().std().item():.6f}")

    # Check for NaN/Inf
    if torch.isnan(vision_features).any():
        print(" [ERROR] Contains NaN!")
    if torch.isinf(vision_features).any():
        print(" [ERROR] Contains Inf!")

    # Project through mm_projector
    projected = model.model.mm_projector(vision_features)

    print(f"\n[3] Projected features (after mm_projector):")
    print(f" Shape: {projected.shape}")
    print(f" Mean: {projected.float().mean().item():.6f}")
    print(f" Std: {projected.float().std().item():.6f}")

    # Check for NaN/Inf
    if torch.isnan(projected).any():
        print(" [ERROR] Contains NaN!")
    if torch.isinf(projected).any():
        print(" [ERROR] Contains Inf!")

    # Compare with text embeddings scale
    sample_ids = torch.tensor([[1, 2, 3]], device=model.device)
    text_embeds = model.model.embed_tokens(sample_ids)

    print(f"\n[4] Text embeddings (for comparison):")
    print(f" Mean: {text_embeds.float().mean().item():.6f}")
    print(f" Std: {text_embeds.float().std().item():.6f}")

    # Check scale match
    proj_std = projected.float().std().item()
    text_std = text_embeds.float().std().item()
    ratio = proj_std / text_std if text_std > 0 else float('inf')

    print(f"\n[5] Scale ratio (projected/text): {ratio:.2f}")
    if ratio > 10 or ratio < 0.1:
        print(" [WARNING] Scale mismatch! May cause issues.")
    else:
        print(" [OK] Scales are similar")

    return projected


from llava.conversation import conv_templates
from llava.mm_utils import process_images, tokenizer_image_token
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN


def chat(image, question, temperature=0.7, max_new_tokens=512, debug=False):
    """
    Chat with the model about an image.

    Args:
        image: PIL Image
        question: str
        temperature: float (0.0 = deterministic, higher = more random)
        max_new_tokens: int
        debug: bool - print debug info

    Returns:
        str: Model response
    """
    if debug:
        debug_image_encoding(image)

    # Process image
    image_tensor = process_images([image], image_processor, model.config)[0]

    # Build conversation
    conv = conv_templates["qwen_3"].copy()
    conv.append_message(conv.roles[0], f"{DEFAULT_IMAGE_TOKEN}\n{question}")
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    if debug:
        print(f"\n[DEBUG] Prompt:\n{prompt[:500]}...")

    # Tokenize
    input_ids = tokenizer_image_token(
        prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt'
    ).unsqueeze(0).to(model.device)

    if debug:
        print(f"\n[DEBUG] Input IDs shape: {input_ids.shape}")
        # Check for IMAGE_TOKEN_INDEX
        num_image_tokens = (input_ids == IMAGE_TOKEN_INDEX).sum().item()
        print(f"[DEBUG] Number of image tokens: {num_image_tokens}")

    # Generate
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor.unsqueeze(0).to(dtype=DTYPE, device=model.device),
            image_sizes=[image.size],
            do_sample=temperature > 0,
            temperature=temperature if temperature > 0 else None,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Clean up response (remove prompt echo if present)
    if "<|im_start|>" in response:
        # Find the assistant's response
        parts = response.split("<|im_start|>assistant")
        if len(parts) > 1:
            response = parts[-1].strip()

    return response


if __name__ == "__main__":
    # Run debug checks
    debug_model()

    # Load test image
    print("\n" + "=" * 60)
    print("Loading test image...")
    print("=" * 60)

    url = ""  # Set to a test image URL before running
    image = Image.open(BytesIO(requests.get(url).content)).convert("RGB")
    print(f"Image size: {image.size}")

    # Debug image encoding
    debug_image_encoding(image)

    # Test chat
    print("\n" + "=" * 60)
    print("Testing chat...")
    print("=" * 60)

    questions = [
        "What is in this image?",
        "Mô tả hình ảnh này",        # "Describe this image"
        "Con vật trong ảnh là gì?",  # "What animal is in the picture?"
    ]

    for q in questions:
        print(f"\nQ: {q}")
        response = chat(image, q, temperature=0.3, max_new_tokens=256)
        print(f"A: {response[:500]}...")
        print("-" * 40)
```

## Training Details

| Parameter | Value |
|-----------|-------|
| Base Model | Qwen/Qwen3-0.6B |
| Vision Tower | apple/MobileCLIP2-S4 |
| LoRA Rank | 4 |
| LoRA Alpha | 8 |
| Batch Size | 1 x 1 |
| Max Steps | 2 |

An illustrative LoRA configuration matching these values is sketched at the end of this card.

## Conclusion

> Limitations of the MobileCLIP vision encoder, or the very short training run, may result in a low-quality model.

## 🤝 Contribute

[![GitHub](https://img.shields.io/badge/GitHub-ml--fastvlm--v2-181717?style=for-the-badge&logo=github)](https://github.com/Hert4/ml-fastvlm-v2)

## License

Apache 2.0
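
## Appendix: Illustrative LoRA Configuration

The training script is not part of this card. The sketch below only illustrates how the hyperparameters from the Training Details table (rank 4, alpha 8, learning rate 2e-05, 2 steps) could be expressed with 🤗 PEFT; the target modules, dropout, and all other settings are assumptions, not the exact configuration used for Belle-VLM.

```python
# Illustrative sketch only — r/alpha taken from the Training Details table;
# target_modules and dropout are assumptions.
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B")

lora_cfg = LoraConfig(
    r=4,              # LoRA Rank (see table)
    lora_alpha=8,     # LoRA Alpha (see table)
    lora_dropout=0.05,                                         # assumption
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],   # assumption
    task_type="CAUSAL_LM",
)

peft_model = get_peft_model(base, lora_cfg)
peft_model.print_trainable_parameters()
# Per the card, fine-tuning then ran for 2 steps at learning rate 2e-05.
```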