Spaces:
Sleeping
Sleeping
# optimization.py
# 1. Use a transformers pipeline for simplicity: one object that bundles the
#    tokenizer + model + generation loop.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

# FIX: passing "load_in_4bit": True directly inside model_kwargs is deprecated
# in recent transformers releases — quantization options belong in a
# BitsAndBytesConfig passed as quantization_config. float16 is now expressed as
# the 4-bit *compute* dtype, which is what it effectively controlled before
# (with 4-bit weights, a top-level torch_dtype is otherwise ignored/ambiguous).
# NOTE(review): 4-bit loading requires the bitsandbytes package and a CUDA
# device — same runtime requirement as the original code.
pipe = pipeline(
    "text-generation",
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    model_kwargs={
        "device_map": "auto",  # let accelerate place layers on available devices
        "quantization_config": BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
        ),
    },
    tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
)
# 2. Use vLLM for high-throughput batched inference (install: pip install vllm).
from vllm import LLM, SamplingParams

# FIX: the model id had a stray leading "m" ("mTinyLlama/TinyLlama-...") which
# does not exist on the Hugging Face Hub and would fail to load.
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")

# Sampling knobs: mildly creative temperature, bounded response length.
sampling_params = SamplingParams(temperature=0.7, max_tokens=500)

# generate() takes a batch of prompts; here a single-element batch.
outputs = llm.generate(["Hello, how are you?"], sampling_params)
# 3. Cache model responses so repeated identical prompts skip a full
#    generation pass.
import hashlib  # kept from the original file; not used in this block
from functools import lru_cache


# FIX: the original imported lru_cache but never applied it, so the function
# named "cached_generation" cached nothing. The decorator memoizes on the
# (prompt, max_tokens) argument tuple — both hashable — and is bounded so the
# cache cannot grow without limit.
@lru_cache(maxsize=128)
def cached_generation(prompt: str, max_tokens: int = 500) -> str:
    """Generate text for *prompt* via the module-level ``pipe``, memoized.

    NOTE(review): sampling-based generation is only repeatable if the pipeline
    decodes deterministically (e.g. greedy); cached results are returned as-is
    regardless — confirm that is the intent.
    """
    return pipe(prompt, max_new_tokens=max_tokens)[0]['generated_text']