Commit 0ae4abd · Parent(s): ae1084a

Update README.md

README.md CHANGED
````diff
@@ -160,6 +160,10 @@ hub = {
     'HF_MODEL_ID':'amazon/MistralLite',
     'HF_TASK':'text-generation',
     'SM_NUM_GPUS':'1',
+    "MAX_INPUT_LENGTH": '16000',
+    "MAX_TOTAL_TOKENS": '16384',
+    "MAX_BATCH_PREFILL_TOKENS": '16384',
+    "MAX_BATCH_TOTAL_TOKENS": '16384',
 }
 
 model = HuggingFaceModel(
@@ -171,7 +175,8 @@ model = HuggingFaceModel(
 predictor = model.deploy(
     initial_instance_count=1,
     instance_type="ml.g5.2xlarge",
-    endpoint_name=model_name
+    endpoint_name=model_name,
+
 )
 ```
 
@@ -185,10 +190,10 @@ input_data = {
         "do_sample": False,
         "max_new_tokens": 400,
         "return_full_text": False,
-        "typical_p": 0.2,
-        "temperature":None,
-        "truncate":None,
-        "seed": 1,
+        #"typical_p": 0.2,
+        #"temperature":None,
+        #"truncate":None,
+        #"seed": 1,
     }
 }
 result = predictor.predict(input_data)[0]["generated_text"]
@@ -215,12 +220,12 @@ parameters = {
     "do_sample": False,
     "max_new_tokens": 400,
     "return_full_text": False,
-    "typical_p": 0.2,
-    "temperature":None,
-    "truncate":None,
-    "seed": 1,
-
-endpoint_name =
+    #"typical_p": 0.2,
+    #"temperature":None,
+    #"truncate":None,
+    #"seed": 1,
+}
+endpoint_name = predictor.endpoint_name
 prompt = "<|prompter|>What are the main challenges to support a long context for LLM?</s><|assistant|>"
 result = call_endpoint(client, prompt, endpoint_name, parameters)
 print(result)
@@ -236,7 +241,7 @@ Example Docker parameters:
 ```shell
 docker run -d --gpus all --shm-size 1g -p 443:80 -v $(pwd)/models:/data ghcr.io/huggingface/text-generation-inference:1.1.0 \
     --model-id amazon/MistralLite \
-    --max-input-length
+    --max-input-length 16000 \
     --max-total-tokens 16384 \
     --max-batch-prefill-tokens 16384 \
     --trust-remote-code
@@ -263,16 +268,17 @@ def invoke_tgi(prompt,
                print_stream=True,
                assist_role=True):
     if (assist_role):
-        prompt = f"<|prompter|>{prompt}
+        prompt = f"<|prompter|>{prompt}</s><|assistant|>"
     output = ""
     for response in tgi_client.generate_stream(
         prompt,
         do_sample=False,
         max_new_tokens=max_new_tokens,
-
-
-
-
+        return_full_text=False,
+        #temperature=None,
+        #truncate=None,
+        #seed=random_seed,
+        #typical_p=0.2,
     ):
         if hasattr(response, "token"):
             if not response.token.special:
````
For reference, the affected README snippets read as follows after this commit.

SageMaker deployment configuration (README lines 160-182):

```python
hub = {
    'HF_MODEL_ID':'amazon/MistralLite',
    'HF_TASK':'text-generation',
    'SM_NUM_GPUS':'1',
    "MAX_INPUT_LENGTH": '16000',
    "MAX_TOTAL_TOKENS": '16384',
    "MAX_BATCH_PREFILL_TOKENS": '16384',
    "MAX_BATCH_TOTAL_TOKENS": '16384',
}

model = HuggingFaceModel(
    # ... (lines 170-174 unchanged, not shown in this diff)

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.2xlarge",
    endpoint_name=model_name,
)
```
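Taken together with the setup code this diff does not touch, a self-contained deployment cell might look like the sketch below. The execution-role lookup, the `name_from_base` naming, and the `get_huggingface_llm_image_uri` call are assumptions for illustration; the environment keys, instance type, and `endpoint_name` argument come from the snippet above.

```python
import sagemaker
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

role = sagemaker.get_execution_role()                        # assumed: run from a SageMaker notebook/role
model_name = sagemaker.utils.name_from_base("MistralLite")   # hypothetical model/endpoint name

hub = {
    'HF_MODEL_ID':'amazon/MistralLite',
    'HF_TASK':'text-generation',
    'SM_NUM_GPUS':'1',
    "MAX_INPUT_LENGTH": '16000',
    "MAX_TOTAL_TOKENS": '16384',
    "MAX_BATCH_PREFILL_TOKENS": '16384',
    "MAX_BATCH_TOTAL_TOKENS": '16384',
}

model = HuggingFaceModel(
    name=model_name,
    env=hub,
    role=role,
    # assumed: TGI 1.1.0 serving container, matching the Docker example later in the README
    image_uri=get_huggingface_llm_image_uri("huggingface", version="1.1.0"),
)

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.2xlarge",
    endpoint_name=model_name,
)
```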
Inference parameters for `predictor.predict` (README lines 190-199):

```python
input_data = {
    # ...
        "do_sample": False,
        "max_new_tokens": 400,
        "return_full_text": False,
        #"typical_p": 0.2,
        #"temperature":None,
        #"truncate":None,
        #"seed": 1,
    }
}
result = predictor.predict(input_data)[0]["generated_text"]
```
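The hunk only shows the parameters portion of `input_data`. A complete request, assuming the standard TGI payload shape of `{"inputs": ..., "parameters": {...}}` and reusing the prompt template that appears later in this diff, would be:

```python
prompt = "<|prompter|>What are the main challenges to support a long context for LLM?</s><|assistant|>"

input_data = {
    "inputs": prompt,          # assumed key name; follows the standard TGI request format
    "parameters": {
        "do_sample": False,
        "max_new_tokens": 400,
        "return_full_text": False,
        # sampling-related options left commented out, as in the updated README
        #"typical_p": 0.2,
        #"temperature":None,
        #"truncate":None,
        #"seed": 1,
    },
}

result = predictor.predict(input_data)[0]["generated_text"]
print(result)
```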
Inference parameters for the boto3 `call_endpoint` path (README lines 220-231):

```python
parameters = {
    # ...
    "do_sample": False,
    "max_new_tokens": 400,
    "return_full_text": False,
    #"typical_p": 0.2,
    #"temperature":None,
    #"truncate":None,
    #"seed": 1,
}
endpoint_name = predictor.endpoint_name
prompt = "<|prompter|>What are the main challenges to support a long context for LLM?</s><|assistant|>"
result = call_endpoint(client, prompt, endpoint_name, parameters)
print(result)
```
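`client` and `call_endpoint` are referenced here but defined outside the changed lines. A plausible implementation against the SageMaker runtime API could look like the sketch below; the function body is an assumption, not the README's exact code.

```python
import json
import boto3

# assumed: `client` is a SageMaker runtime client
client = boto3.client("sagemaker-runtime")

def call_endpoint(client, prompt, endpoint_name, parameters):
    # wrap the prompt and generation parameters in the standard TGI request shape
    payload = {"inputs": prompt, "parameters": parameters}
    response = client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="application/json",
        Body=json.dumps(payload),
    )
    body = json.loads(response["Body"].read().decode("utf-8"))
    return body[0]["generated_text"]
```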
TGI container launch for local serving (README lines 241-247):

```shell
docker run -d --gpus all --shm-size 1g -p 443:80 -v $(pwd)/models:/data ghcr.io/huggingface/text-generation-inference:1.1.0 \
    --model-id amazon/MistralLite \
    --max-input-length 16000 \
    --max-total-tokens 16384 \
    --max-batch-prefill-tokens 16384 \
    --trust-remote-code
```
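Once the container is up, it can be exercised without the SageMaker SDK. The sketch below assumes plain HTTP on host port 443 (the `-p 443:80` mapping above) and uses TGI's standard `/generate` route; the URL and timeout are illustrative.

```python
import requests

TGI_URL = "http://127.0.0.1:443"  # assumed: host port from the -p 443:80 mapping, plain HTTP

payload = {
    "inputs": "<|prompter|>What are the main challenges to support a long context for LLM?</s><|assistant|>",
    "parameters": {
        "do_sample": False,
        "max_new_tokens": 400,
        "return_full_text": False,
    },
}

# /generate is TGI's non-streaming endpoint; it returns {"generated_text": ...}
response = requests.post(f"{TGI_URL}/generate", json=payload, timeout=120)
response.raise_for_status()
print(response.json()["generated_text"])
```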
Streaming client `invoke_tgi` (README lines 268-284):

```python
def invoke_tgi(prompt,
               # ...
               print_stream=True,
               assist_role=True):
    if (assist_role):
        prompt = f"<|prompter|>{prompt}</s><|assistant|>"
    output = ""
    for response in tgi_client.generate_stream(
        prompt,
        do_sample=False,
        max_new_tokens=max_new_tokens,
        return_full_text=False,
        #temperature=None,
        #truncate=None,
        #seed=random_seed,
        #typical_p=0.2,
    ):
        if hasattr(response, "token"):
            if not response.token.special:
                # ... (continues past the end of this hunk)
```