---
license: apache-2.0
language:
- vi
- en
tags:
- vision-language-model
- vlm
- qwen3
- fastvlm
- vietnamese
base_model: Qwen/Qwen3-0.6B
datasets:
- 5CD-AI/Viet-multimodal-open-r1-8k-verified
---
| |
# Belle-VLM: Vietnamese Vision Language Model

## Model Description

Belle-VLM is a Vision Language Model trained for Vietnamese multimodal reasoning tasks.

### Architecture
- **LLM Backbone**: Qwen3-0.6B
- **Vision Encoder**: FastViTHD (MobileCLIP)
- **Projector**: 2-layer MLP (3072 -> 1024)

### Training
- **Dataset**: 5CD-AI/Viet-multimodal-open-r1-8k-verified
- **Method**: LoRA fine-tuning
- **Steps**: 2
- **Learning Rate**: 2e-05

## Usage

```python
| !pip install -q transformers>=4.51.0 torch torchvision timm pillow requests datasets einops |
| !pip install -q open-clip-torch |
| !git clone https://github.com/Hert4/ml-fastvlm-v2.git |
| %cd ml-fastvlm-v2 |
| |
| import sys |
| import os |
| import torch |
| import requests |
| from PIL import Image |
| from io import BytesIO |
| |
| # Add path to ml-fastvlm-v2 |
| FASTVLM_PATH = "/kaggle/working/ml-fastvlm-v2" |
| if os.path.exists(FASTVLM_PATH): |
| sys.path.insert(0, FASTVLM_PATH) |
| else: |
| # Local path fallback |
| sys.path.insert(0, ".") |
| |
| |
| MODEL_PATH = "beyoru/Belle-VLM" |
| DEVICE = "cuda" if torch.cuda.is_available() else "cpu" |
| DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32 |
| |
| print(f"Device: {DEVICE}") |
| print(f"Dtype: {DTYPE}") |
| |
| from transformers import AutoTokenizer |
| from llava.model.language_model.llava_qwen import LlavaQwen3ForCausalLM |
| |
| print(f"\nLoading model from: {MODEL_PATH}") |
| |
| # Load tokenizer |
| tokenizer = AutoTokenizer.from_pretrained( |
| MODEL_PATH, |
| trust_remote_code=True, |
| use_fast=False |
| ) |
| if tokenizer.pad_token is None: |
| tokenizer.pad_token = tokenizer.eos_token |
| |
| # Load model |
| model = LlavaQwen3ForCausalLM.from_pretrained( |
| MODEL_PATH, |
| torch_dtype=DTYPE, |
| device_map="auto", |
| trust_remote_code=True, |
| ) |
| model.eval() |
| |
| # Setup vision tower |
| vision_tower = model.get_vision_tower() |
| if not vision_tower.is_loaded: |
| vision_tower.load_model() |
| vision_tower = vision_tower.to(device=model.device, dtype=DTYPE) |
| image_processor = vision_tower.image_processor |
| |
| print("Model loaded!") |
| |
| |
def debug_model():
    """Check if model components are properly loaded.

    Prints diagnostics for the module-level ``model`` and ``vision_tower``:
    mm_projector presence and weight statistics, vision-tower status, and
    the multimodal entries of the model config.

    Returns:
        bool: False if the mm_projector is missing, True otherwise.
    """
    print("\n" + "=" * 60)
    print("DEBUG: Checking model components")
    print("=" * 60)

    # 1. Check mm_projector
    if hasattr(model.model, 'mm_projector'):
        mm_proj = model.model.mm_projector
        print(f"[OK] mm_projector exists: {type(mm_proj)}")

        # Check weights
        if isinstance(mm_proj, torch.nn.Sequential):
            first_layer = mm_proj[0]
            w = first_layer.weight.float()  # Convert to float for stats
            print(f"  First layer shape: {w.shape}")
            print(f"  Weight mean: {w.mean().item():.6f}")
            print(f"  Weight std: {w.std().item():.6f}")
            print(f"  Weight range: [{w.min().item():.4f}, {w.max().item():.4f}]")

            # Heuristic: a very large std usually means random init,
            # i.e. the trained projector weights were not loaded.
            if w.std().item() > 0.3:
                print("  [WARNING] Std too high - might be random init!")
            else:
                print("  [OK] Weights look trained")
    else:
        print("[ERROR] mm_projector NOT FOUND!")
        return False

    # 2. Check vision tower
    print(f"\n[OK] Vision tower: {type(vision_tower).__name__}")
    print(f"  Is loaded: {vision_tower.is_loaded}")
    print(f"  Hidden size: {getattr(vision_tower, 'hidden_size', 'unknown')}")

    # 3. Check config
    print(f"\n[INFO] Config:")
    print(f"  mm_vision_tower: {getattr(model.config, 'mm_vision_tower', 'NOT SET')}")
    print(f"  mm_hidden_size: {getattr(model.config, 'mm_hidden_size', 'NOT SET')}")
    print(f"  mm_projector_type: {getattr(model.config, 'mm_projector_type', 'NOT SET')}")
    print(f"  LLM hidden_size: {model.config.hidden_size}")

    return True
| |
| |
def debug_image_encoding(image):
    """Test image encoding pipeline.

    Runs ``image`` through preprocessing, the vision tower and the
    mm_projector, printing tensor statistics at each stage and comparing
    the projected feature scale against the text embedding scale.

    Args:
        image: PIL Image to encode.

    Returns:
        torch.Tensor: projected image features (output of mm_projector).
    """
    from llava.mm_utils import process_images

    print("\n" + "=" * 60)
    print("DEBUG: Testing image encoding pipeline")
    print("=" * 60)

    # Process image
    image_tensor = process_images([image], image_processor, model.config)[0]
    image_tensor = image_tensor.unsqueeze(0).to(dtype=DTYPE, device=model.device)

    print(f"[1] Image tensor:")
    print(f"  Shape: {image_tensor.shape}")
    print(f"  Dtype: {image_tensor.dtype}")
    print(f"  Range: [{image_tensor.min().item():.3f}, {image_tensor.max().item():.3f}]")

    # Everything below is a read-only diagnostic pass, so no gradients.
    with torch.no_grad():
        vision_features = vision_tower(image_tensor)
        print(f"\n[2] Vision features (from vision tower):")
        print(f"  Shape: {vision_features.shape}")
        print(f"  Mean: {vision_features.float().mean().item():.6f}")
        print(f"  Std: {vision_features.float().std().item():.6f}")

        # Check for NaN/Inf
        if torch.isnan(vision_features).any():
            print("  [ERROR] Contains NaN!")
        if torch.isinf(vision_features).any():
            print("  [ERROR] Contains Inf!")

        # Project through mm_projector
        projected = model.model.mm_projector(vision_features)
        print(f"\n[3] Projected features (after mm_projector):")
        print(f"  Shape: {projected.shape}")
        print(f"  Mean: {projected.float().mean().item():.6f}")
        print(f"  Std: {projected.float().std().item():.6f}")

        # Check for NaN/Inf
        if torch.isnan(projected).any():
            print("  [ERROR] Contains NaN!")
        if torch.isinf(projected).any():
            print("  [ERROR] Contains Inf!")

        # Compare with text embeddings scale
        sample_ids = torch.tensor([[1, 2, 3]], device=model.device)
        text_embeds = model.model.embed_tokens(sample_ids)
        print(f"\n[4] Text embeddings (for comparison):")
        print(f"  Mean: {text_embeds.float().mean().item():.6f}")
        print(f"  Std: {text_embeds.float().std().item():.6f}")

    # A large scale mismatch between projected image features and text
    # embeddings tends to destabilize generation.
    proj_std = projected.float().std().item()
    text_std = text_embeds.float().std().item()
    ratio = proj_std / text_std if text_std > 0 else float('inf')
    print(f"\n[5] Scale ratio (projected/text): {ratio:.2f}")
    if ratio > 10 or ratio < 0.1:
        print("  [WARNING] Scale mismatch! May cause issues.")
    else:
        print("  [OK] Scales are similar")

    return projected
| |
| |
| |
from llava.conversation import conv_templates
from llava.mm_utils import process_images, tokenizer_image_token
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN


def chat(image, question, temperature=0.7, max_new_tokens=512, debug=False):
    """
    Chat with the model about an image.

    Args:
        image: PIL Image
        question: str
        temperature: float (0.0 = deterministic, higher = more random)
        max_new_tokens: int
        debug: bool - print debug info

    Returns:
        str: Model response
    """
    if debug:
        debug_image_encoding(image)

    # Process image
    image_tensor = process_images([image], image_processor, model.config)[0]

    # Build conversation: the image placeholder token goes before the question.
    conv = conv_templates["qwen_3"].copy()
    conv.append_message(conv.roles[0], f"{DEFAULT_IMAGE_TOKEN}\n{question}")
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    if debug:
        print(f"\n[DEBUG] Prompt:\n{prompt[:500]}...")

    # Tokenize; IMAGE_TOKEN_INDEX marks where image features get spliced in.
    input_ids = tokenizer_image_token(
        prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt'
    ).unsqueeze(0).to(model.device)

    if debug:
        print(f"\n[DEBUG] Input IDs shape: {input_ids.shape}")
        # Check for IMAGE_TOKEN_INDEX
        num_image_tokens = (input_ids == IMAGE_TOKEN_INDEX).sum().item()
        print(f"[DEBUG] Number of image tokens: {num_image_tokens}")

    # Generate (greedy when temperature == 0, sampling otherwise)
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor.unsqueeze(0).to(dtype=DTYPE, device=model.device),
            image_sizes=[image.size],
            do_sample=temperature > 0,
            temperature=temperature if temperature > 0 else None,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Clean up response (remove prompt echo if present)
    if "<|im_start|>" in response:
        # Find the assistant's response
        parts = response.split("<|im_start|>assistant")
        if len(parts) > 1:
            response = parts[-1].strip()

    return response
| |
| |
| |
if __name__ == "__main__":
    # Run debug checks
    debug_model()

    # Load test image
    print("\n" + "=" * 60)
    print("Loading test image...")
    print("=" * 60)

    url = "<your url here>"  # TODO: replace with a real image URL before running
    # timeout keeps the script from hanging forever on a dead URL
    image = Image.open(BytesIO(requests.get(url, timeout=30).content)).convert("RGB")
    print(f"Image size: {image.size}")

    # Debug image encoding
    debug_image_encoding(image)

    # Test chat (English + Vietnamese prompts)
    print("\n" + "=" * 60)
    print("Testing chat...")
    print("=" * 60)

    questions = [
        "What is in this image?",
        "Mô tả hình ảnh này",
        "Con vật trong ảnh là gì?",
    ]

    for q in questions:
        print(f"\nQ: {q}")
        response = chat(image, q, temperature=0.3, max_new_tokens=256)
        print(f"A: {response[:500]}...")
        print("-" * 40)
```

## Training Details

| Parameter | Value |
|-----------|-------|
| Base Model | Qwen/Qwen3-0.6B |
| Vision Tower | apple/MobileCLIP2-S4 |
| LoRA Rank | 4 |
| LoRA Alpha | 8 |
| Batch Size | 1 x 1 |
| Max Steps | 2 |

## Conclusion

> Limitations of MobileCLIP, together with the very short training run (2 steps), may result in a low-quality model.

## 🤝 Contribute

[ml-fastvlm-v2 on GitHub](https://github.com/Hert4/ml-fastvlm-v2)

## License

Apache 2.0
|
|