Update README.md
README.md
CHANGED
@@ -154,29 +154,26 @@ While great efforts have been taken to clean the pretraining data, it is possible ...

Before:

Basic usage: [notebook](assets/basic_inference_llama_2_dolphin.ipynb)

Install and import the package dependencies:

```python
!pip install -q -U huggingface_hub peft transformers torch accelerate
```

```python
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
```

```python
from huggingface_hub import notebook_login
notebook_login()
```

Basic model loading:

```python
peft_model_id = "dfurman/llama-2-
config = PeftConfig.from_pretrained(peft_model_id)

bnb_config = BitsAndBytesConfig(
    # @@ -189,83 +186,42 @@
    # ... (unchanged quantization settings not shown in the diff)
)

model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=bnb_config,
    use_auth_token=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)
```
```python
def llama_generate(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    prompt: str,
    max_new_tokens: int = 128,
    temperature: float = 0.92,
) -> str:
    """
    Initialize the pipeline.

    Uses Hugging Face GenerationConfig defaults:
    https://huggingface.co/docs/transformers/v4.29.1/en/main_classes/text_generation#transformers.GenerationConfig

    Args:
        model (transformers.AutoModelForCausalLM): Llama 2 model for text generation
        tokenizer (transformers.AutoTokenizer): Tokenizer for the model
        prompt (str): Prompt for text generation
        max_new_tokens (int, optional): Max new tokens to generate after the prompt. Defaults to 128.
        temperature (float, optional): The value used to modulate the next token probabilities.
            Defaults to 0.92.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inputs = tokenizer(
        [prompt],
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(
        device
    )  # tokenize inputs, load on device

    # When running Torch modules in lower precision, it is best practice to use the torch.autocast context manager.
    with torch.autocast("cuda", dtype=torch.bfloat16):
        response = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            return_dict_in_generate=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    decoded_output = tokenizer.decode(
        response["sequences"][0],
        skip_special_tokens=True,
    )  # grab output in natural language

    return decoded_output[len(prompt) :]  # remove prompt from output
```

We can now generate text! For example:

```python
prompt = "You are a helpful assistant. Tell me a recipe for vegan banana bread.\n"

response = llama_generate(
    model,
    tokenizer,
    prompt,
    max_new_tokens=128,
    temperature=0.92,
)
print(response)
```
### Runtime tests
| runtime / 50 tokens (sec) | GPU | attn | torch dtype | VRAM (GB) |

After:

Basic usage: [notebook](assets/basic_inference_llama_2_dolphin.ipynb)
```python
!pip install -q -U huggingface_hub peft transformers torch accelerate
```

```python
from huggingface_hub import notebook_login
import torch
from peft import PeftModel, PeftConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)
notebook_login()
```
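
Outside a notebook, the same Hub authentication can be done with the `login()` helper from `huggingface_hub`; a minimal sketch (the token value below is a placeholder):

```python
from huggingface_hub import login

# Paste an access token from https://huggingface.co/settings/tokens.
# "hf_..." is a placeholder, not a real token.
login(token="hf_...")
```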
```python
peft_model_id = "dfurman/llama-2-13b-dolphin-peft"
config = PeftConfig.from_pretrained(peft_model_id)
bnb_config = BitsAndBytesConfig(
    # ... (unchanged quantization settings not shown in the diff)
)

model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=bnb_config,
    use_auth_token=True,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

model = PeftModel.from_pretrained(model, peft_model_id)

format_template = "You are a helpful assistant. {query}\n"
```
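
The quantization settings passed to `BitsAndBytesConfig` fall outside the diff hunks above, so they are not shown here. A typical 4-bit NF4 setup for a 13B model would look roughly like the sketch below; the parameter values are assumptions, not the README's actual settings:

```python
import torch
from transformers import BitsAndBytesConfig

# Assumed 4-bit NF4 quantization config (illustrative values only).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
```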
```python
# First, format the prompt
query = "Tell me a recipe for vegan banana bread."
prompt = format_template.format(query=query)
# Inference can be done using model.generate
print("\n\n*** Generate:")
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
with torch.autocast("cuda", dtype=torch.bfloat16):
    output = model.generate(
        input_ids=input_ids,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        return_dict_in_generate=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        repetition_penalty=1.2,
    )
print(tokenizer.decode(output["sequences"][0], skip_special_tokens=True))
```
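
The decoded text above includes the prompt. If only the completion is wanted, the prompt tokens can be sliced off before decoding, mirroring what the removed `llama_generate` helper did with string slicing; a small sketch using the variables defined above:

```python
# Keep only the tokens generated after the prompt, then decode.
new_tokens = output["sequences"][0][input_ids.shape[-1]:]
print(tokenizer.decode(new_tokens, skip_special_tokens=True))
```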
### Runtime tests
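
The figures below can be reproduced with a simple timing loop around `model.generate`; a rough sketch of the assumed methodology (not the exact benchmarking script):

```python
import time

# Time a single 50-token generation for the prompt defined above (assumed methodology).
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
start = time.time()
_ = model.generate(input_ids=input_ids, min_new_tokens=50, max_new_tokens=50)
print(f"runtime / 50 tokens (sec): {time.time() - start:.2f}")
```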
| runtime / 50 tokens (sec) | GPU | attn | torch dtype | VRAM (GB) |