---
base_model: unsloth/gemma-4-E2B-it
tags:
- text-generation-inference
- transformers
- unsloth
- gemma4
- trl
license: apache-2.0
language:
- en
---

## Run in Kaggle

```python
# =========================================================
# Install dependencies (Kaggle usually already has some)
# =========================================================
!pip install -q transformers peft accelerate bitsandbytes

# =========================================================
# Imports
# =========================================================
import torch
from transformers import AutoProcessor, AutoModelForCausalLM
from peft import PeftModel

# =========================================================
# Config
# =========================================================
BASE_MODEL = "google/gemma-4-E2B-it"
LORA_MODEL = "rahul7star/gemma_4_lora"

# =========================================================
# Load processor
# =========================================================
processor = AutoProcessor.from_pretrained(BASE_MODEL)

# =========================================================
# Load base model
# =========================================================
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,  # safer for Kaggle GPU
    device_map="auto",
)

# =========================================================
# Load LoRA adapter on top of base model
# =========================================================
model = PeftModel.from_pretrained(model, LORA_MODEL)

# optional: merge LoRA for faster inference
model = model.merge_and_unload()

print("Model + LoRA loaded successfully 🚀")


# =========================================================
# Inference function
# =========================================================
def generate_response(user_input: str) -> str:
    """Generate a single assistant reply for ``user_input``.

    The prompt is built with the processor's chat template (system + user
    turn, generation prompt appended, thinking disabled) and only the newly
    generated tokens are decoded, so the prompt is not echoed back.

    Args:
        user_input: The user's message.

    Returns:
        The decoded model reply with special tokens removed.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_input},
    ]

    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    inputs = processor(text=text, return_tensors="pt").to(model.device)
    # Remember the prompt length so the prompt can be sliced off the output.
    input_len = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            # temperature / top_p only apply when sampling is enabled,
            # so request it explicitly instead of relying on model defaults.
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )

    response = processor.decode(
        outputs[0][input_len:],
        skip_special_tokens=True,
    )

    return response


# =========================================================
# Test
# =========================================================
print(generate_response("Write a short joke about saving RAM."))
```

# Uploaded model

- **Developed by:** rahul7star
- **License:** apache-2.0
- **Finetuned from model:** unsloth/gemma-4-E2B-it

This gemma4 model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth)

[](https://github.com/unslothai/unsloth)