Commit 0ae4abd · Parent(s): ae1084a

Update README.md

README.md CHANGED
````diff
@@ -160,6 +160,10 @@ hub = {
     'HF_MODEL_ID':'amazon/MistralLite',
     'HF_TASK':'text-generation',
     'SM_NUM_GPUS':'1',
+    "MAX_INPUT_LENGTH": '16000',
+    "MAX_TOTAL_TOKENS": '16384',
+    "MAX_BATCH_PREFILL_TOKENS": '16384',
+    "MAX_BATCH_TOTAL_TOKENS": '16384',
 }
 
 model = HuggingFaceModel(
@@ -171,7 +175,8 @@ model = HuggingFaceModel(
 predictor = model.deploy(
     initial_instance_count=1,
     instance_type="ml.g5.2xlarge",
-    endpoint_name=model_name
+    endpoint_name=model_name,
+
 )
 ```
 
@@ -185,10 +190,10 @@ input_data = {
         "do_sample": False,
         "max_new_tokens": 400,
         "return_full_text": False,
-        "typical_p": 0.2,
-        "temperature":None,
-        "truncate":None,
-        "seed": 1,
+        #"typical_p": 0.2,
+        #"temperature":None,
+        #"truncate":None,
+        #"seed": 1,
     }
 }
 result = predictor.predict(input_data)[0]["generated_text"]
@@ -215,12 +220,12 @@ parameters = {
     "do_sample": False,
     "max_new_tokens": 400,
     "return_full_text": False,
-    "typical_p": 0.2,
-    "temperature":None,
-    "truncate":None,
-    "seed": 1,
-
-endpoint_name =
+    #"typical_p": 0.2,
+    #"temperature":None,
+    #"truncate":None,
+    #"seed": 1,
+}
+endpoint_name = predictor.endpoint_name
 prompt = "<|prompter|>What are the main challenges to support a long context for LLM?</s><|assistant|>"
 result = call_endpoint(client, prompt, endpoint_name, parameters)
 print(result)
@@ -236,7 +241,7 @@ Example Docker parameters:
 ```shell
 docker run -d --gpus all --shm-size 1g -p 443:80 -v $(pwd)/models:/data ghcr.io/huggingface/text-generation-inference:1.1.0 \
     --model-id amazon/MistralLite \
-    --max-input-length
+    --max-input-length 16000 \
     --max-total-tokens 16384 \
     --max-batch-prefill-tokens 16384 \
     --trust-remote-code
@@ -263,16 +268,17 @@ def invoke_tgi(prompt,
                print_stream=True,
                assist_role=True):
     if (assist_role):
-        prompt = f"<|prompter|>{prompt}
+        prompt = f"<|prompter|>{prompt}</s><|assistant|>"
     output = ""
     for response in tgi_client.generate_stream(
         prompt,
         do_sample=False,
         max_new_tokens=max_new_tokens,
-
-
-
-
+        return_full_text=False,
+        #temperature=None,
+        #truncate=None,
+        #seed=random_seed,
+        #typical_p=0.2,
     ):
         if hasattr(response, "token"):
             if not response.token.special:
````
For reference, the affected README snippets read as follows after this commit.

SageMaker deployment configuration (README lines 160-182):

```python
hub = {
    'HF_MODEL_ID':'amazon/MistralLite',
    'HF_TASK':'text-generation',
    'SM_NUM_GPUS':'1',
    "MAX_INPUT_LENGTH": '16000',
    "MAX_TOTAL_TOKENS": '16384',
    "MAX_BATCH_PREFILL_TOKENS": '16384',
    "MAX_BATCH_TOTAL_TOKENS": '16384',
}

model = HuggingFaceModel(
    # ... (lines 170-174 unchanged, not shown in this diff)

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.2xlarge",
    endpoint_name=model_name,
)
```
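Taken together with the setup code this diff does not touch, a self-contained deployment cell might look like the sketch below. The execution-role lookup, the `name_from_base` naming, and the `get_huggingface_llm_image_uri` call are assumptions for illustration; the environment keys, instance type, and `endpoint_name` argument come from the snippet above.

```python
import sagemaker
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

role = sagemaker.get_execution_role()                        # assumed: run from a SageMaker notebook/role
model_name = sagemaker.utils.name_from_base("MistralLite")   # hypothetical model/endpoint name

hub = {
    'HF_MODEL_ID':'amazon/MistralLite',
    'HF_TASK':'text-generation',
    'SM_NUM_GPUS':'1',
    "MAX_INPUT_LENGTH": '16000',
    "MAX_TOTAL_TOKENS": '16384',
    "MAX_BATCH_PREFILL_TOKENS": '16384',
    "MAX_BATCH_TOTAL_TOKENS": '16384',
}

model = HuggingFaceModel(
    name=model_name,
    env=hub,
    role=role,
    # assumed: TGI 1.1.0 serving container, matching the Docker example later in the README
    image_uri=get_huggingface_llm_image_uri("huggingface", version="1.1.0"),
)

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.2xlarge",
    endpoint_name=model_name,
)
```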
Inference parameters for `predictor.predict` (README lines 190-199):

```python
input_data = {
    # ...
        "do_sample": False,
        "max_new_tokens": 400,
        "return_full_text": False,
        #"typical_p": 0.2,
        #"temperature":None,
        #"truncate":None,
        #"seed": 1,
    }
}
result = predictor.predict(input_data)[0]["generated_text"]
```
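The hunk only shows the parameters portion of `input_data`. A complete request, assuming the standard TGI payload shape of `{"inputs": ..., "parameters": {...}}` and reusing the prompt template that appears later in this diff, would be:

```python
prompt = "<|prompter|>What are the main challenges to support a long context for LLM?</s><|assistant|>"

input_data = {
    "inputs": prompt,          # assumed key name; follows the standard TGI request format
    "parameters": {
        "do_sample": False,
        "max_new_tokens": 400,
        "return_full_text": False,
        # sampling-related options left commented out, as in the updated README
        #"typical_p": 0.2,
        #"temperature":None,
        #"truncate":None,
        #"seed": 1,
    },
}

result = predictor.predict(input_data)[0]["generated_text"]
print(result)
```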
Inference parameters for the boto3 `call_endpoint` path (README lines 220-231):

```python
parameters = {
    # ...
    "do_sample": False,
    "max_new_tokens": 400,
    "return_full_text": False,
    #"typical_p": 0.2,
    #"temperature":None,
    #"truncate":None,
    #"seed": 1,
}
endpoint_name = predictor.endpoint_name
prompt = "<|prompter|>What are the main challenges to support a long context for LLM?</s><|assistant|>"
result = call_endpoint(client, prompt, endpoint_name, parameters)
print(result)
```
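`client` and `call_endpoint` are referenced here but defined outside the changed lines. A plausible implementation against the SageMaker runtime API could look like the sketch below; the function body is an assumption, not the README's exact code.

```python
import json
import boto3

# assumed: `client` is a SageMaker runtime client
client = boto3.client("sagemaker-runtime")

def call_endpoint(client, prompt, endpoint_name, parameters):
    # wrap the prompt and generation parameters in the standard TGI request shape
    payload = {"inputs": prompt, "parameters": parameters}
    response = client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="application/json",
        Body=json.dumps(payload),
    )
    body = json.loads(response["Body"].read().decode("utf-8"))
    return body[0]["generated_text"]
```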
TGI container launch for local serving (README lines 241-247):

```shell
docker run -d --gpus all --shm-size 1g -p 443:80 -v $(pwd)/models:/data ghcr.io/huggingface/text-generation-inference:1.1.0 \
    --model-id amazon/MistralLite \
    --max-input-length 16000 \
    --max-total-tokens 16384 \
    --max-batch-prefill-tokens 16384 \
    --trust-remote-code
```
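Once the container is up, it can be exercised without the SageMaker SDK. The sketch below assumes plain HTTP on host port 443 (the `-p 443:80` mapping above) and uses TGI's standard `/generate` route; the URL and timeout are illustrative.

```python
import requests

TGI_URL = "http://127.0.0.1:443"  # assumed: host port from the -p 443:80 mapping, plain HTTP

payload = {
    "inputs": "<|prompter|>What are the main challenges to support a long context for LLM?</s><|assistant|>",
    "parameters": {
        "do_sample": False,
        "max_new_tokens": 400,
        "return_full_text": False,
    },
}

# /generate is TGI's non-streaming endpoint; it returns {"generated_text": ...}
response = requests.post(f"{TGI_URL}/generate", json=payload, timeout=120)
response.raise_for_status()
print(response.json()["generated_text"])
```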
Streaming client `invoke_tgi` (README lines 268-284):

```python
def invoke_tgi(prompt,
               # ...
               print_stream=True,
               assist_role=True):
    if (assist_role):
        prompt = f"<|prompter|>{prompt}</s><|assistant|>"
    output = ""
    for response in tgi_client.generate_stream(
        prompt,
        do_sample=False,
        max_new_tokens=max_new_tokens,
        return_full_text=False,
        #temperature=None,
        #truncate=None,
        #seed=random_seed,
        #typical_p=0.2,
    ):
        if hasattr(response, "token"):
            if not response.token.special:
                # ... (continues past the end of this hunk)
```