Mahyar Najibi committed
Commit 771d259 · 1 Parent(s): 1186dc1

Updating generate_openelm.py and README.

Files changed:
- README.md (+6 -5)
- generate_openelm.py (+38 -42)
README.md
CHANGED

@@ -20,16 +20,17 @@ We have provided an example function to generate output from OpenELM models load…
 
 You can try the model by running the following command:
 ```
-python generate_openelm.py --…
+python generate_openelm.py --model apple/OpenELM-3B --hf_access_token [HF_ACCESS_TOKEN] --prompt 'Once upon a time there was' --generate_kwargs repetition_penalty=1.2
 ```
+Please refer to [this link](https://huggingface.co/docs/hub/security-tokens) to obtain your Hugging Face access token.
 
-Additional arguments to the …
+Additional arguments to the Hugging Face generate function can be passed via `generate_kwargs`. As an example, to speed up the inference, you can try [lookup token speculative generation](https://huggingface.co/docs/transformers/generation_strategies) by passing the `prompt_lookup_num_tokens` argument as follows:
 ```
-python generate_openelm.py --…
+python generate_openelm.py --model apple/OpenELM-3B --hf_access_token [HF_ACCESS_TOKEN] --prompt 'Once upon a time there was' --generate_kwargs repetition_penalty=1.2 prompt_lookup_num_tokens=10
 ```
-Alternatively, model-wise speculative generation can be also tried by passing a smaller model…
+Alternatively, model-wise speculative generation with an [assistive model](https://huggingface.co/blog/assisted-generation) can also be tried by passing a smaller model through the `assistant_model` argument, for example:
 ```
-python generate_openelm.py --…
+python generate_openelm.py --model apple/OpenELM-3B --hf_access_token [HF_ACCESS_TOKEN] --prompt 'Once upon a time there was' --generate_kwargs repetition_penalty=1.2 --assistant_model apple/OpenELM-270M
 ```
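The two speedups the updated README describes correspond to standard `transformers` `generate()` arguments. A minimal sketch of the equivalent direct calls; the checkpoints, the token placeholder, and the `max_length` value here are illustrative, not taken from this commit:

```python
# Sketch: the README's two decoding speedups, called directly via transformers.
# Assumes a transformers version that supports prompt lookup decoding and
# assisted generation; checkpoints and the 'hf_...' token are placeholders.
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf', token='hf_...')
model = AutoModelForCausalLM.from_pretrained(
    'apple/OpenELM-3B', trust_remote_code=True, token='hf_...'
)
inputs = tok('Once upon a time there was', return_tensors='pt')

# Lookup-token speculation: draft tokens by matching n-grams already in the prompt.
out = model.generate(**inputs, max_length=128, prompt_lookup_num_tokens=10)

# Model-wise speculation: a small draft model proposes, the large model verifies.
draft = AutoModelForCausalLM.from_pretrained(
    'apple/OpenELM-270M', trust_remote_code=True, token='hf_...'
)
out = model.generate(**inputs, max_length=128, assistant_model=draft)
print(tok.decode(out[0], skip_special_tokens=True))
```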
generate_openelm.py
CHANGED

@@ -12,11 +12,11 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 def generate(
     prompt: str,
     model: Union[str, AutoModelForCausalLM],
-    …
+    hf_access_token: str = None,
     tokenizer: Union[str, AutoTokenizer] = 'meta-llama/Llama-2-7b-hf',
     device: Optional[str] = None,
     max_length: int = 1024,
-    …
+    assistant_model: Optional[Union[str, AutoModelForCausalLM]] = None,
     generate_kwargs: Optional[dict] = None,
 ) -> str:
     """ Generates output given a prompt.
@@ -25,16 +25,16 @@ def generate(
     prompt: The string prompt.
     model: The LLM Model. If a string is passed, it should be the path to
         the hf converted checkpoint.
-    …
+    hf_access_token: Hugging face access token.
     tokenizer: Tokenizer instance. If model is set as a string path,
         the tokenizer will be loaded from the checkpoint.
     device: String representation of device to run the model on. If None
         and cuda available it would be set to cuda:0 else cpu.
     max_length: Maximum length of tokens, input prompt + generated tokens.
-    …
+    assistant_model: If set, this model will be used for
         speculative generation. If a string is passed, it should be the
         path to the hf converted checkpoint.
-    generate_kwargs: Extra kwargs passed to the generate function.
+    generate_kwargs: Extra kwargs passed to the hf generate function.

 Returns:
     output_text: output generated as a string.
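Taken together, the new keyword arguments would be exercised roughly as below; a sketch against this signature, with placeholder values:

```python
# Sketch: calling the updated generate() from Python.
# Values are placeholders; generate_kwargs mirrors the CLI's --generate_kwargs.
output_text, generation_time = generate(
    prompt='Once upon a time there was',
    model='apple/OpenELM-3B',
    hf_access_token='hf_...',               # required by the new checks below
    max_length=256,
    assistant_model='apple/OpenELM-270M',   # optional draft model
    generate_kwargs={'repetition_penalty': 1.2},
)
```

Note that the `-> str` return annotation is left unchanged even though `__main__` unpacks two return values, so it appears stale.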
@@ -42,9 +42,8 @@ def generate(

 Raises:
     ValueError: If device is set to CUDA but no CUDA device is detected.
-    …
-    …
-    ValueError: If hf_security_token is not specified.
+    ValueError: If tokenizer is not set.
+    ValueError: If hf_access_token is not specified.
 """
 if not device:
     if torch.cuda.is_available() and torch.cuda.device_count():
@@ -55,28 +54,22 @@ def generate(
     )
 else:
     device = 'cpu'
-    logging.warning(…
+    logging.warning(
+        (
+            'No CUDA device detected, using cpu, '
+            'expect slower speeds.'
+        )
+    )

 if 'cuda' in device and not torch.cuda.is_available():
     raise ValueError('CUDA device requested but no CUDA device detected.')

-if …
-    raise FileNotFoundError(f'Model checkpoint does not exist at {model}.')
-
-if (isinstance(speculative_model, str) and (
-        not speculative_model and not os.path.exists(speculative_model))):
-    raise FileNotFoundError(
-        (
-            'Speculative checkpoint path does not exist at '
-            f'{speculative_model}.'
-        )
-    )
-if not tokenizer and not isinstance(model, str):
+if not tokenizer:
     raise ValueError('Tokenizer is not set in the generate function.')

-if not …
+if not hf_access_token:
     raise ValueError((
-        'Hugging face …
+        'Hugging face access token needs to be specified. '
         'Please refer to https://huggingface.co/docs/hub/security-tokens'
         ' to obtain one.'
     )
@@ -92,16 +85,16 @@ def generate(
 if isinstance(tokenizer, str):
     tokenizer = AutoTokenizer.from_pretrained(
         tokenizer,
-        token=…
+        token=hf_access_token,
     )

 # Speculative mode
 draft_model = None
-if …
-    draft_model = …
-    if isinstance(…
+if assistant_model:
+    draft_model = assistant_model
+    if isinstance(assistant_model, str):
         draft_model = AutoModelForCausalLM.from_pretrained(
-            …
+            assistant_model,
             trust_remote_code=True
         )
     draft_model.to(device).eval()
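The code that consumes `draft_model` falls outside the hunks shown here; presumably it is forwarded to the HF generate call, since `transformers` accepts a loaded draft model via the `assistant_model` keyword. A sketch of that assumed downstream step:

```python
# Sketch (assumption): how draft_model is presumably handed to the HF
# generate call downstream of this hunk; transformers accepts a loaded
# model under the assistant_model keyword to enable assisted generation.
kwargs = dict(generate_kwargs or {})
if draft_model is not None:
    kwargs['assistant_model'] = draft_model

output_ids = model.generate(
    input_ids,              # tokenized prompt from the tokenizer above
    max_length=max_length,
    **kwargs,
)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
```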
@@ -161,22 +154,22 @@ def openelm_generate_parser():

 parser = argparse.ArgumentParser('OpenELM Generate Module')
 parser.add_argument(
-    '--…
-    dest='…
-    help='Path to the …
+    '--model',
+    dest='model',
+    help='Path to the hf converted model.',
     required=True,
     type=str,
 )
 parser.add_argument(
-    '--…
-    dest='…
-    help='…
+    '--hf_access_token',
+    dest='hf_access_token',
+    help='Hugging face access token, starting with "hf_".',
     type=str,
 )
 parser.add_argument(
     '--prompt',
     dest='prompt',
-    help='Prompt for LLM call.…
+    help='Prompt for LLM call.',
     default='',
     type=str,
 )
@@ -194,17 +187,20 @@ def openelm_generate_parser():
     type=int,
 )
 parser.add_argument(
-    '--…
-    dest='…
+    '--assistant_model',
+    dest='assistant_model',
     help=(
-        …
+        (
+            'If set, this is used as a draft model '
+            'for assisted speculative generation.'
+        )
     ),
     type=str,
 )
 parser.add_argument(
     '--generate_kwargs',
     dest='generate_kwargs',
-    help='…
+    help='Additional kwargs passed to the HF generate function.',
     type=str,
     nargs='*',
     action=KwargsParser,
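`KwargsParser` is referenced here but defined outside this diff. A minimal sketch of such an `argparse` action, assuming it collects repeated `key=value` tokens (e.g. `repetition_penalty=1.2 prompt_lookup_num_tokens=10`) into a dict with numeric coercion:

```python
import argparse

class KwargsParser(argparse.Action):
    """Sketch only: the real implementation is not shown in this diff.
    Collects repeated key=value tokens into a dict, coercing numerics."""

    def __call__(self, parser, namespace, values, option_string=None):
        kwargs = {}
        for token in values or []:
            key, sep, raw = token.partition('=')
            if not sep:
                parser.error(f'expected key=value, got {token!r}')
            # Try int, then float; fall back to the raw string.
            for cast in (int, float):
                try:
                    raw = cast(raw)
                    break
                except ValueError:
                    pass
            kwargs[key] = raw
        setattr(namespace, self.dest, kwargs)
```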
@@ -218,12 +214,12 @@ if __name__ == '__main__':

 output_text, genertaion_time = generate(
     prompt=prompt,
-    model=args.…
+    model=args.model,
     device=args.device,
     max_length=args.max_length,
-    …
+    assistant_model=args.assistant_model,
     generate_kwargs=args.generate_kwargs,
-    …
+    hf_access_token=args.hf_access_token,
 )

 print_txt = (
|