YAML Metadata Warning: empty or missing yaml metadata in repo card (https://huggingface.co/docs/hub/model-cards#model-card-metadata)
# Load a 4-bit-quantized Qwen3-VL-8B vision-language model plus its
# processor/tokenizer via Unsloth's FastVisionModel wrapper.
model, tokenizer = FastVisionModel.from_pretrained(
model_name = "unsloth/Qwen3-VL-8B-Instruct-unsloth-bnb-4bit",
max_seq_length = max_seq_length,  # NOTE(review): defined earlier in the full script — not visible in this chunk
load_in_4bit = True, # False for LoRA 16bit
fast_inference = False, # Enable vLLM fast inference
gpu_memory_utilization = 0.8, # Reduce if out of memory
)
# Attach LoRA adapters: only the language tower (attention + MLP) is made
# trainable; the vision encoder stays frozen.
model = FastVisionModel.get_peft_model(
model,
finetune_vision_layers = False, # False if not finetuning vision layers
finetune_language_layers = True, # False if not finetuning language layers
finetune_attention_modules = True, # False if not finetuning attention layers
finetune_mlp_modules = True, # False if not finetuning MLP layers
r = 16, # LoRA rank: the larger, the higher the accuracy, but might overfit
lora_alpha = 16, # Recommended alpha == r at least
lora_dropout = 0,
bias = "none",
random_state = 3407,  # fixed seed for reproducible adapter init
use_rslora = False, # We support rank stabilized LoRA
loftq_config = None, # And LoftQ
use_gradient_checkpointing = "unsloth", # Reduces memory usage
# target_modules = "all-linear", # Optional now! Can specify a list if needed
)
dataset = load_dataset("AI4Math/MathVista", split = "testmini")
# Reward functions
import re
def formatting_reward_func(completions, **kwargs):
    """Reward well-formed completions.

    A completion earns +1.0 for containing exactly one reasoning block and
    +1.0 for exactly one solution block, then loses 2.0 if at least half of
    its characters are "addCriterion" spam or newlines (a known Qwen VL
    RL degeneration — see
    https://docs.unsloth.ai/new/vision-reinforcement-learning-vlm-rl#qwen-2.5-vl-vision-rl-issues-and-quirks).

    Args:
        completions: list of generated completion strings.
        **kwargs: extra columns passed by the trainer; unused.

    Returns:
        list[float]: one score per completion, each in [-2.0, 2.0].
    """
    # `re` is already imported at module level; the original re-imported it
    # here redundantly.
    # NOTE(review): REASONING_*/SOLUTION_* markers are defined elsewhere in
    # this script and are assumed to contain no regex metacharacters —
    # otherwise they would need re.escape().
    thinking_pattern = f'{REASONING_START}(.*?){REASONING_END}'
    answer_pattern = f'{SOLUTION_START}(.*?){SOLUTION_END}'
    scores = []
    for completion in completions:
        score = 0.0
        thinking_matches = re.findall(thinking_pattern, completion, re.DOTALL)
        answer_matches = re.findall(answer_pattern, completion, re.DOTALL)
        if len(thinking_matches) == 1:
            score += 1.0
        if len(answer_matches) == 1:
            score += 1.0
        # Penalize on excessive addCriterion and newlines (guard avoids
        # division by zero on empty completions).
        if len(completion) != 0:
            removal = completion.replace("addCriterion", "").replace("\n", "")
            if (len(completion) - len(removal)) / len(completion) >= 0.5:
                score -= 2.0
        scores.append(score)
    return scores
def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    """Score each completion 2.0 when its single extracted solution block
    matches the reference answer exactly (newlines stripped), else 0.0.

    Also prints the first question/answer/response of the batch for
    eyeballing training progress.
    """
    answer_pattern = f'{SOLUTION_START}(.*?){SOLUTION_END}'
    extracted = [
        re.findall(answer_pattern, completion, re.DOTALL)
        for completion in completions
    ]
    q = prompts[0]
    print('-'*20, f"Question:\n{q}", f"\nAnswer:\n{answer[0]}", f"\nResponse:{completions[0]}")
    scores = []
    for found, expected in zip(extracted, answer):
        exact_match = len(found) == 1 and expected == found[0].replace('\n', '')
        scores.append(2.0 if exact_match else 0.0)
    return scores
from trl import GRPOConfig, GRPOTrainer
# GRPO training hyperparameters. With per-device batch 1 and
# gradient_accumulation_steps 4, the effective batch is 4 prompts per step.
training_args = GRPOConfig(
learning_rate = 5e-6,
adam_beta1 = 0.9,
adam_beta2 = 0.99,
weight_decay = 0.1,
warmup_ratio = 0.1,
lr_scheduler_type = "cosine",
optim = "adamw_8bit",  # 8-bit AdamW to cut optimizer memory
logging_steps = 1,
log_completions = False,
per_device_train_batch_size = 1,
gradient_accumulation_steps = 4, # Increase to 4 for smoother training
num_generations = 2, # Completions sampled per prompt; decrease if out of memory
max_prompt_length = 1024,
max_completion_length = 1024,
num_train_epochs = 0.6, # Set to 1 for a full training run
# max_steps = 60,
save_steps = 60,
max_grad_norm = 0.1,
report_to = "none", # Can use Weights & Biases
output_dir = "outputs",
# Below enables GSPO (sequence-level importance sampling instead of token-level):
importance_sampling_level = "sequence",
mask_truncated_completions = False,
loss_type = "dr_grpo",
)
# Build the GRPO trainer with both reward functions and launch training.
trainer = GRPOTrainer(
model = model,
args = training_args,
# Pass the processor to handle multimodal inputs
processing_class = tokenizer,
reward_funcs = [
formatting_reward_func,
correctness_reward_func,
],
# NOTE(review): only `dataset` is bound above (L25); `train_dataset` is
# presumably produced by a prompt-formatting step outside this excerpt —
# confirm before running this chunk standalone.
train_dataset = train_dataset,
)
trainer.train()
base_model: unsloth/Qwen3-VL-8B-Instruct-unsloth-bnb-4bit
tags:
- text-generation-inference
- transformers
- unsloth
- qwen3_vl
license: apache-2.0
language:
- en
# Uploaded finetuned model
- Developed by: xTronz
- License: apache-2.0
- Finetuned from model: unsloth/Qwen3-VL-8B-Instruct-unsloth-bnb-4bit

This qwen3_vl model was trained 2x faster with Unsloth and Hugging Face's TRL library.
Downloads last month: —

Inference Providers (new): this model isn't deployed by any Inference Provider. 🙋 Ask for provider support.
