HackWeasel committed on
Commit
e031ec9
·
verified ·
1 Parent(s): ea8faa2

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +46 -28
README.md CHANGED
@@ -26,61 +26,79 @@ Ask questions about movies which have been rated on IMDB
26
  Use the code below to get started with the model.
27
 
28
  ``` Python
29
- from peft import PeftModel
30
  from transformers import AutoModelForCausalLM, AutoTokenizer
31
  import torch
32
 
33
  # Set device
34
  device = "cuda" if torch.cuda.is_available() else "cpu"
35
 
36
- # Load tokenizer and models
37
- print("Loading models...")
38
- tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3.2-1b-instruct-bnb-4bit")
39
- base_model = AutoModelForCausalLM.from_pretrained("unsloth/llama-3.2-1b-instruct-bnb-4bit").to(device)
40
- model = PeftModel.from_pretrained(base_model, "HackWeasel/llama-3.2-1b-QLORA-IMDB").to(device)
41
- model.eval()
42
- print("Models loaded!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
- def generate_response(prompt, max_length=4096, temperature=0.7):
45
  with torch.no_grad():
46
- inputs = tokenizer(prompt, return_tensors="pt").to(device) # Move inputs to GPU
47
  outputs = model.generate(
48
  **inputs,
49
  max_length=max_length,
50
  temperature=temperature,
51
  do_sample=True,
 
 
 
52
  pad_token_id=tokenizer.eos_token_id
53
  )
54
  return tokenizer.decode(outputs[0], skip_special_tokens=True)
55
 
56
  def main():
 
 
 
 
 
57
  conversation_history = ""
58
  print("\nWelcome! Start chatting with the model (type 'quit' to exit)")
 
59
 
60
  while True:
61
- user_input = input("\nYou: ").strip()
62
- if user_input.lower() == 'quit':
63
- print("Goodbye!")
64
- break
65
-
66
- # Construct the prompt with conversation history
67
- if conversation_history:
68
- full_prompt = f"{conversation_history}\nHuman: {user_input}\nAssistant:"
69
- else:
70
- full_prompt = f"Human: {user_input}\nAssistant:"
71
-
72
  try:
73
- # Generate response
74
- response = generate_response(full_prompt)
 
 
75
 
76
- # Extract just the new response
77
- new_response = response.split("Assistant:")[-1].strip()
 
 
78
 
79
- # Update conversation history
 
80
  conversation_history = f"{conversation_history}\nHuman: {user_input}\nAssistant: {new_response}"
81
-
82
- # Print the response
83
  print("\nAssistant:", new_response)
 
84
  except Exception as e:
85
  print(f"An error occurred: {e}")
86
  print("Continuing conversation...")
 
26
  Use the code below to get started with the model.
27
 
28
  ``` Python
29
+ from peft import PeftModel, PeftConfig
30
  from transformers import AutoModelForCausalLM, AutoTokenizer
31
  import torch
32
 
33
  # Set device
34
  device = "cuda" if torch.cuda.is_available() else "cpu"
35
 
36
+ def load_model(base_model_id, adapter_model_id):
37
+ print("Loading models...")
38
+
39
+ # Load tokenizer
40
+ tokenizer = AutoTokenizer.from_pretrained(base_model_id)
41
+
42
+ # Load base model (using model's built-in quantization)
43
+ base_model = AutoModelForCausalLM.from_pretrained(
44
+ base_model_id,
45
+ device_map="auto",
46
+ low_cpu_mem_usage=True
47
+ )
48
+
49
+ # Load the PEFT model
50
+ model = PeftModel.from_pretrained(
51
+ base_model,
52
+ adapter_model_id,
53
+ device_map="auto"
54
+ )
55
+
56
+ model.eval()
57
+ print("Models loaded!")
58
+ return model, tokenizer
59
 
60
+ def generate_response(model, tokenizer, prompt, max_length=4096, temperature=0.7):
61
  with torch.no_grad():
62
+ inputs = tokenizer(prompt, return_tensors="pt").to(device)
63
  outputs = model.generate(
64
  **inputs,
65
  max_length=max_length,
66
  temperature=temperature,
67
  do_sample=True,
68
+ top_p=0.95,
69
+ top_k=40,
70
+ num_return_sequences=1,
71
  pad_token_id=tokenizer.eos_token_id
72
  )
73
  return tokenizer.decode(outputs[0], skip_special_tokens=True)
74
 
75
  def main():
76
+ model, tokenizer = load_model(
77
+ "unsloth/llama-3.2-1b-instruct-bnb-4bit",
78
+ "HackWeasel/llama-3.2-1b-QLORA-IMDB"
79
+ )
80
+
81
  conversation_history = ""
82
  print("\nWelcome! Start chatting with the model (type 'quit' to exit)")
83
+ print("Note: This model is fine-tuned on IMDB reviews data")
84
 
85
  while True:
 
 
 
 
 
 
 
 
 
 
 
86
  try:
87
+ user_input = input("\nYou: ").strip()
88
+ if user_input.lower() == 'quit':
89
+ print("Goodbye!")
90
+ break
91
 
92
+ if conversation_history:
93
+ full_prompt = f"{conversation_history}\nHuman: {user_input}\nAssistant:"
94
+ else:
95
+ full_prompt = f"Human: {user_input}\nAssistant:"
96
 
97
+ response = generate_response(model, tokenizer, full_prompt)
98
+ new_response = response.split("Assistant:")[-1].strip()
99
  conversation_history = f"{conversation_history}\nHuman: {user_input}\nAssistant: {new_response}"
 
 
100
  print("\nAssistant:", new_response)
101
+
102
  except Exception as e:
103
  print(f"An error occurred: {e}")
104
  print("Continuing conversation...")