---
base_model: unsloth/llama-3.2-11b-vision-instruct-bnb-4bit
tags:
- text-generation-inference
- transformers
- unsloth
- mllama
license: apache-2.0
language:
- en
---

# Uploaded model

- **Developed by:** pollitoconpapass
- **License:** apache-2.0
- **Finetuned from model:** unsloth/llama-3.2-11b-vision-instruct-bnb-4bit

This mllama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Hugging Face's TRL library.

# Implementation

```py
from datasets import load_dataset
from unsloth import FastVisionModel

# Load the fine-tuned model in 4-bit precision.
model, tokenizer = FastVisionModel.from_pretrained(
    # "unsloth/Llama-3.2-11B-Vision-Instruct",  # base model
    "pollitoconpapass/Llama-3.2-11B-Vision-Radiology-mini",
    load_in_4bit = True,                     # use 4bit to reduce memory use; False for 16bit LoRA
    use_gradient_checkpointing = "unsloth",  # True or "unsloth" for long context
)

# Attach LoRA adapters (only needed if you plan to fine-tune further).
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = True,  # False if not finetuning vision layers
    finetune_language_layers   = True,  # False if not finetuning language layers
    finetune_attention_modules = True,  # False if not finetuning attention layers
    finetune_mlp_modules       = True,  # False if not finetuning MLP layers

    r = 16,           # the larger, the higher the accuracy, but might overfit
    lora_alpha = 16,  # recommended: alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    use_rslora = False,   # rank-stabilized LoRA is supported
    loftq_config = None,  # and LoftQ
    # target_modules = "all-linear",  # optional now! Can specify a list if needed
)

# Load the radiology dataset and convert it to the chat format the model expects.
dataset = load_dataset("unsloth/Radiology_mini", split = "train")

instruction = "You are an expert radiographer. Describe accurately what you see in this image."

def convert_to_conversation(sample):
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text",  "text":  instruction},
                {"type": "image", "image": sample["image"]},
            ],
        },
        {
            "role": "assistant",
            "content": [
                {"type": "text", "text": sample["caption"]},
            ],
        },
    ]
    return {"messages": conversation}

converted_dataset = [convert_to_conversation(sample) for sample in dataset]

# Run inference on a sample image.
FastVisionModel.for_inference(model)  # enable inference mode

image = dataset[0]["image"]

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": instruction},
        ],
    }
]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer

text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(
    **inputs,
    streamer = text_streamer,
    max_new_tokens = 500,
    use_cache = True,
    temperature = 1.5,
    min_p = 0.1,
)
```
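
# Fine-tuning

The implementation above builds `converted_dataset` but stops short of the actual training step. For completeness, here is a minimal training sketch following Unsloth's recommended setup with TRL's `SFTTrainer` and `UnslothVisionDataCollator`. The hyperparameters below are illustrative assumptions, not the exact values used to train this model:

```py
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

FastVisionModel.for_training(model)  # switch back to training mode

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer),  # required for vision models
    train_dataset = converted_dataset,
    args = SFTConfig(
        per_device_train_batch_size = 2,  # illustrative; tune for your GPU
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 30,
        learning_rate = 2e-4,
        fp16 = not is_bf16_supported(),
        bf16 = is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",
        # Settings required for vision finetuning:
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 4,
        max_seq_length = 2048,
    ),
)
trainer.train()
```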
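
# Saving

After training, Unsloth can save either the LoRA adapters alone or the adapters merged into the base weights. A brief sketch; the repo name below is a placeholder, and merged saving for vision models is assumed to follow the same `merged_16bit` API Unsloth uses for its text models:

```py
# Save just the LoRA adapters (small; requires the base model at load time).
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

# Or merge the adapters into 16-bit weights and push to the Hub.
# "your-username/your-repo" is a placeholder.
model.push_to_hub_merged(
    "your-username/your-repo",
    tokenizer,
    save_method = "merged_16bit",
)
```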