---
license: llama3.1
language:
- en
base_model: unsloth/llama-3-8b-bnb-4bit
---
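
This repository hosts a LoRA adapter for a medical chatbot, fine-tuned from `unsloth/Meta-Llama-3.1-8B-bnb-4bit` and, as the repository name suggests, trained on the `ruslanmv/ai-medical-chatbot` dataset. The snippet below loads the base model, attaches the adapter, and streams an answer to a sample patient question; it assumes `transformers`, `peft`, `unsloth`, and `torch` are installed.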
```python
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
from peft import PeftModel, PeftConfig
from unsloth import FastLanguageModel
import torch

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Load the PEFT (LoRA adapter) configuration
config = PeftConfig.from_pretrained("umairimran/medical_chatbot_model_trained_on_ruslanmv_ai-medical-chatbot")

# Load the base model from Hugging Face in float16
base_model = AutoModelForCausalLM.from_pretrained(
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    torch_dtype=torch.float16,
    device_map="auto",  # automatically map the model to available devices
)

# Apply the adapter weights to the base model
model = PeftModel.from_pretrained(base_model, "umairimran/medical_chatbot_model_trained_on_ruslanmv_ai-medical-chatbot")

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("unsloth/Meta-Llama-3.1-8B-bnb-4bit")

# Optimize the model for inference (this applies if using FastLanguageModel)
FastLanguageModel.for_inference(model)

# Prepare the input; no explicit .to("cuda") is needed because device_map="auto" handles placement
inputs = tokenizer(
    alpaca_prompt.format(
        "hello doctor, can you understand me? I want to know about the flu disease",  # instruction
        "I don't know how to treat the flu",  # input
        "",  # response - leave this blank for generation!
    ),
    return_tensors="pt",
)

# Set up the streamer so generated tokens are printed as they are produced
text_streamer = TextStreamer(tokenizer)

# Generate the output
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)
```
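
If you want the response as a string rather than streamed output, a minimal variant is sketched below. It reuses `model`, `tokenizer`, and `inputs` from the snippet above; slicing off the echoed prompt assumes the usual decoder-only output layout where `generate()` returns the prompt tokens followed by the new tokens.

```python
# Generate without a streamer and decode the result
output_ids = model.generate(**inputs, max_new_tokens=128)

# Drop the echoed prompt, keeping only newly generated tokens
# (assumes a single prompt in the batch)
prompt_length = inputs["input_ids"].shape[1]
response = tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)
print(response)
```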