---
license: apache-2.0
language:
- vi
- en
tags:
- vision-language-model
- vlm
- qwen3
- fastvlm
- vietnamese
base_model: Qwen/Qwen3-0.6B
datasets:
- 5CD-AI/Viet-multimodal-open-r1-8k-verified
---

# Belle-VLM: Vietnamese Vision Language Model

## Model Description

Belle-VLM is a Vision Language Model trained for Vietnamese multimodal reasoning tasks.

### Architecture

- **LLM Backbone**: Qwen3-0.6B
- **Vision Encoder**: FastViTHD (MobileCLIP)
- **Projector**: 2-layer MLP (3072 -> 1024)

### Training

- **Dataset**: 5CD-AI/Viet-multimodal-open-r1-8k-verified
- **Method**: LoRA fine-tuning
- **Steps**: 2
- **Learning Rate**: 2e-05

## Usage

```python
!pip install -q "transformers>=4.51.0" torch torchvision timm pillow requests datasets einops
!pip install -q open-clip-torch
!git clone https://github.com/Hert4/ml-fastvlm-v2.git
%cd ml-fastvlm-v2

import sys
import os
import torch
import requests
from PIL import Image
from io import BytesIO

# Add path to ml-fastvlm-v2
FASTVLM_PATH = "/kaggle/working/ml-fastvlm-v2"
if os.path.exists(FASTVLM_PATH):
    sys.path.insert(0, FASTVLM_PATH)
else:
    # Local path fallback
    sys.path.insert(0, ".")

MODEL_PATH = "beyoru/Belle-VLM"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

print(f"Device: {DEVICE}")
print(f"Dtype: {DTYPE}")

from transformers import AutoTokenizer
from llava.model.language_model.llava_qwen import LlavaQwen3ForCausalLM

print(f"\nLoading model from: {MODEL_PATH}")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    use_fast=False
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load model
model = LlavaQwen3ForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=DTYPE,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()

# Setup vision tower
vision_tower = model.get_vision_tower()
if not vision_tower.is_loaded:
    vision_tower.load_model()
vision_tower = vision_tower.to(device=model.device, dtype=DTYPE)
image_processor = vision_tower.image_processor

print("Model loaded!")


def debug_model():
    """Check if model components are properly loaded."""
    print("\n" + "=" * 60)
    print("DEBUG: Checking model components")
    print("=" * 60)

    # 1. Check mm_projector
    if hasattr(model.model, 'mm_projector'):
        mm_proj = model.model.mm_projector
        print(f"[OK] mm_projector exists: {type(mm_proj)}")

        # Check weights
        if isinstance(mm_proj, torch.nn.Sequential):
            first_layer = mm_proj[0]
            w = first_layer.weight.float()  # Convert to float for stats
            print(f" First layer shape: {w.shape}")
            print(f" Weight mean: {w.mean().item():.6f}")
            print(f" Weight std: {w.std().item():.6f}")
            print(f" Weight range: [{w.min().item():.4f}, {w.max().item():.4f}]")

            # Check if weights look trained
            if w.std().item() > 0.3:
                print(" [WARNING] Std too high - might be random init!")
            else:
                print(" [OK] Weights look trained")
    else:
        print("[ERROR] mm_projector NOT FOUND!")
        return False

    # 2. Check vision tower
    print(f"\n[OK] Vision tower: {type(vision_tower).__name__}")
    print(f" Is loaded: {vision_tower.is_loaded}")
    print(f" Hidden size: {getattr(vision_tower, 'hidden_size', 'unknown')}")
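    # (Added illustrative check, not part of the original script: it assumes
    #  mm_projector is an nn.Sequential of Linear/GELU layers, e.g. a
    #  "mlp2x_gelu"-style projector, and is skipped otherwise.)
    # The card describes a 2-layer MLP projecting 3072-dim vision features to
    # the 1024-dim hidden size of Qwen3-0.6B; this prints the actual dims.
    if isinstance(model.model.mm_projector, torch.nn.Sequential):
        linears = [m for m in model.model.mm_projector if isinstance(m, torch.nn.Linear)]
        if linears:
            print(f" Projector dims: {linears[0].in_features} -> {linears[-1].out_features}"
                  f" (LLM hidden_size: {model.config.hidden_size})")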

    # 3. Check config
    print(f"\n[INFO] Config:")
    print(f" mm_vision_tower: {getattr(model.config, 'mm_vision_tower', 'NOT SET')}")
    print(f" mm_hidden_size: {getattr(model.config, 'mm_hidden_size', 'NOT SET')}")
    print(f" mm_projector_type: {getattr(model.config, 'mm_projector_type', 'NOT SET')}")
    print(f" LLM hidden_size: {model.config.hidden_size}")

    return True


def debug_image_encoding(image):
    """Test image encoding pipeline."""
    from llava.mm_utils import process_images

    print("\n" + "=" * 60)
    print("DEBUG: Testing image encoding pipeline")
    print("=" * 60)

    # Process image
    image_tensor = process_images([image], image_processor, model.config)[0]
    image_tensor = image_tensor.unsqueeze(0).to(dtype=DTYPE, device=model.device)

    print(f"[1] Image tensor:")
    print(f" Shape: {image_tensor.shape}")
    print(f" Dtype: {image_tensor.dtype}")
    print(f" Range: [{image_tensor.min().item():.3f}, {image_tensor.max().item():.3f}]")

    # Get vision features
    with torch.no_grad():
        vision_features = vision_tower(image_tensor)

    print(f"\n[2] Vision features (from vision tower):")
    print(f" Shape: {vision_features.shape}")
    print(f" Mean: {vision_features.float().mean().item():.6f}")
    print(f" Std: {vision_features.float().std().item():.6f}")

    # Check for NaN/Inf
    if torch.isnan(vision_features).any():
        print(" [ERROR] Contains NaN!")
    if torch.isinf(vision_features).any():
        print(" [ERROR] Contains Inf!")

    # Project through mm_projector
    projected = model.model.mm_projector(vision_features)

    print(f"\n[3] Projected features (after mm_projector):")
    print(f" Shape: {projected.shape}")
    print(f" Mean: {projected.float().mean().item():.6f}")
    print(f" Std: {projected.float().std().item():.6f}")

    # Check for NaN/Inf
    if torch.isnan(projected).any():
        print(" [ERROR] Contains NaN!")
    if torch.isinf(projected).any():
        print(" [ERROR] Contains Inf!")

    # Compare with text embeddings scale
    sample_ids = torch.tensor([[1, 2, 3]], device=model.device)
    text_embeds = model.model.embed_tokens(sample_ids)

    print(f"\n[4] Text embeddings (for comparison):")
    print(f" Mean: {text_embeds.float().mean().item():.6f}")
    print(f" Std: {text_embeds.float().std().item():.6f}")

    # Check scale match
    proj_std = projected.float().std().item()
    text_std = text_embeds.float().std().item()
    ratio = proj_std / text_std if text_std > 0 else float('inf')

    print(f"\n[5] Scale ratio (projected/text): {ratio:.2f}")
    if ratio > 10 or ratio < 0.1:
        print(" [WARNING] Scale mismatch! May cause issues.")
    else:
        print(" [OK] Scales are similar")

    return projected


from llava.conversation import conv_templates
from llava.mm_utils import process_images, tokenizer_image_token
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN


def chat(image, question, temperature=0.7, max_new_tokens=512, debug=False):
    """
    Chat with the model about an image.

    Args:
        image: PIL Image
        question: str
        temperature: float (0.0 = deterministic, higher = more random)
        max_new_tokens: int
        debug: bool - print debug info

    Returns:
        str: Model response
    """
    if debug:
        debug_image_encoding(image)

    # Process image
    image_tensor = process_images([image], image_processor, model.config)[0]

    # Build conversation
    conv = conv_templates["qwen_3"].copy()
    conv.append_message(conv.roles[0], f"{DEFAULT_IMAGE_TOKEN}\n{question}")
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    if debug:
        print(f"\n[DEBUG] Prompt:\n{prompt[:500]}...")

    # Tokenize
    input_ids = tokenizer_image_token(
        prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt'
    ).unsqueeze(0).to(model.device)

    if debug:
        print(f"\n[DEBUG] Input IDs shape: {input_ids.shape}")
        # Check for IMAGE_TOKEN_INDEX
        num_image_tokens = (input_ids == IMAGE_TOKEN_INDEX).sum().item()
        print(f"[DEBUG] Number of image tokens: {num_image_tokens}")

    # Generate
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor.unsqueeze(0).to(dtype=DTYPE, device=model.device),
            image_sizes=[image.size],
            do_sample=temperature > 0,
            temperature=temperature if temperature > 0 else None,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Clean up response (remove prompt echo if present)
    if "<|im_start|>" in response:
        # Find the assistant's response
        parts = response.split("<|im_start|>assistant")
        if len(parts) > 1:
            response = parts[-1].strip()

    return response


if __name__ == "__main__":
    # Run debug checks
    debug_model()

    # Load test image
    print("\n" + "=" * 60)
    print("Loading test image...")
    print("=" * 60)

    url = ""  # Set to a test image URL before running
    image = Image.open(BytesIO(requests.get(url).content)).convert("RGB")
    print(f"Image size: {image.size}")

    # Debug image encoding
    debug_image_encoding(image)

    # Test chat
    print("\n" + "=" * 60)
    print("Testing chat...")
    print("=" * 60)

    questions = [
        "What is in this image?",
        "Mô tả hình ảnh này",        # "Describe this image"
        "Con vật trong ảnh là gì?",  # "What animal is in the picture?"
    ]

    for q in questions:
        print(f"\nQ: {q}")
        response = chat(image, q, temperature=0.3, max_new_tokens=256)
        print(f"A: {response[:500]}...")
        print("-" * 40)
```

## Training Details

| Parameter | Value |
|-----------|-------|
| Base Model | Qwen/Qwen3-0.6B |
| Vision Tower | apple/MobileCLIP2-S4 |
| LoRA Rank | 4 |
| LoRA Alpha | 8 |
| Batch Size | 1 x 1 |
| Max Steps | 2 |

An illustrative LoRA configuration matching these values is sketched at the end of this card.

## Conclusion

> Limitations of the MobileCLIP vision encoder, or the very short training run, may result in a low-quality model.

## 🤝 Contribute

[![GitHub](https://img.shields.io/badge/GitHub-ml--fastvlm--v2-181717?style=for-the-badge&logo=github)](https://github.com/Hert4/ml-fastvlm-v2)

## License

Apache 2.0
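
## Appendix: Illustrative LoRA Configuration

The training script is not part of this card. The sketch below only illustrates how the hyperparameters from the Training Details table (rank 4, alpha 8, learning rate 2e-05, 2 steps) could be expressed with 🤗 PEFT; the target modules, dropout, and all other settings are assumptions, not the exact configuration used for Belle-VLM.

```python
# Illustrative sketch only — r/alpha taken from the Training Details table;
# target_modules and dropout are assumptions.
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B")

lora_cfg = LoraConfig(
    r=4,              # LoRA Rank (see table)
    lora_alpha=8,     # LoRA Alpha (see table)
    lora_dropout=0.05,                                         # assumption
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],   # assumption
    task_type="CAUSAL_LM",
)

peft_model = get_peft_model(base, lora_cfg)
peft_model.print_trainable_parameters()
# Per the card, fine-tuning then ran for 2 steps at learning rate 2e-05.
```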