Update README.md
README.md CHANGED
````diff
@@ -90,12 +90,13 @@ from embedl.models.vllm import LLM
 
 model_id = "embedl/Llama-3.2-3B-Instruct-FlashHead-W4A16"
 
-
-
-
-
-
-
+if __name__ == "__main__":
+    sampling = SamplingParams(max_tokens=128, temperature=0.0)
+    llm = LLM(model=model_id, trust_remote_code=True)
+
+    prompt = "Write a haiku about coffee."
+    output = llm.generate([prompt], sampling)
+    print(output[0].outputs[0].text)
 ```
 
 ---
````
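After this change, the offline-generation example reads as the self-contained script below, assembled from the hunk's context and added lines. One caveat: the `SamplingParams` import is not visible in this hunk, so importing it from `vllm` (its home in upstream vLLM) is an assumption.

```python
# Offline generation with the quantized FlashHead checkpoint.
from vllm import SamplingParams  # assumed import; not shown in the hunk

from embedl.models.vllm import LLM

model_id = "embedl/Llama-3.2-3B-Instruct-FlashHead-W4A16"

if __name__ == "__main__":
    # Greedy decoding (temperature=0.0), capped at 128 new tokens.
    sampling = SamplingParams(max_tokens=128, temperature=0.0)
    llm = LLM(model=model_id, trust_remote_code=True)

    prompt = "Write a haiku about coffee."
    output = llm.generate([prompt], sampling)
    print(output[0].outputs[0].text)
```

The `if __name__ == "__main__":` guard is not cosmetic: vLLM can spawn worker processes that re-import the entry script, so an unguarded example may re-execute itself under spawn-based multiprocessing.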
````diff
@@ -110,11 +111,13 @@ import asyncio
 from embedl.models.vllm.demo import run_repl
 
 model_id = "embedl/Llama-3.2-3B-Instruct-FlashHead-W4A16"
-
-
-
+
+if __name__ == "__main__":
+    asyncio.run(
+        run_repl(
+            model=model_id
+        )
     )
-)
 ```
 ---
 
````
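The interactive demo example becomes the following script once the same guard is applied; every line here is taken from the hunk's context and added lines.

```python
# Interactive REPL demo for the FlashHead checkpoint.
import asyncio

from embedl.models.vllm.demo import run_repl

model_id = "embedl/Llama-3.2-3B-Instruct-FlashHead-W4A16"

if __name__ == "__main__":
    # run_repl is awaited via asyncio.run, so it must be a coroutine.
    asyncio.run(
        run_repl(
            model=model_id
        )
    )
```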