Spaces:
Sleeping
Sleeping
File size: 907 Bytes
# optimization.py
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

# 1. Use the pipeline API for simple, memory-efficient inference.
pipe = pipeline(
    "text-generation",
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    model_kwargs={
        # Half-precision compute; 4-bit quantized weights cut VRAM further.
        "torch_dtype": torch.float16,
        # Automatically place layers across available GPU(s)/CPU.
        "device_map": "auto",
        # FIX: passing load_in_4bit=True directly in model_kwargs is the
        # deprecated form in recent transformers releases; the supported way
        # is an explicit BitsAndBytesConfig via quantization_config.
        "quantization_config": BitsAndBytesConfig(load_in_4bit=True),
    },
    tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
)
# 2. Use vLLM for high-throughput batched inference (install: pip install vllm)
from vllm import LLM, SamplingParams

# FIX: the model id had a stray leading "m" ("mTinyLlama/TinyLlama-...")
# which would fail to resolve on the Hugging Face Hub.
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
sampling_params = SamplingParams(temperature=0.7, max_tokens=500)
outputs = llm.generate(["Hello, how are you?"], sampling_params)
# 3. Cache model responses so identical prompts skip the expensive pipeline call.
import hashlib  # NOTE(review): unused in this section — kept in case later code builds hash-based cache keys
from functools import lru_cache


@lru_cache(maxsize=1000)
def cached_generation(prompt: str, max_tokens: int = 500) -> str:
    """Return generated text for *prompt*, memoizing up to 1000 results.

    lru_cache keys on the (prompt, max_tokens) argument pair, which works
    because both are hashable. The cache is bounded (maxsize=1000) so it
    cannot grow without limit on an open-ended prompt space.
    """
    # FIX: removed trailing " |" scrape residue that made this line a SyntaxError.
    return pipe(prompt, max_new_tokens=max_tokens)[0]['generated_text']