# optimization.py
"""Three inference-optimization strategies for a small chat LLM:
1) a quantized transformers pipeline, 2) vLLM high-throughput generation,
3) memoizing repeated prompts with lru_cache."""

import hashlib  # NOTE(review): currently unused — presumably intended for hashing cache keys; confirm or drop
from functools import lru_cache

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# 1. Use the transformers pipeline for simplicity.
# NOTE(review): load_in_4bit requires the bitsandbytes package, and newer
# transformers versions prefer quantization_config=BitsAndBytesConfig(
# load_in_4bit=True) over passing it in model_kwargs — TODO confirm the
# pinned transformers version before relying on this form.
pipe = pipeline(
    "text-generation",
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    model_kwargs={
        "torch_dtype": torch.float16,
        "device_map": "auto",
        "load_in_4bit": True,
    },
    tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
)

# 2. Use vLLM for high-throughput batched inference (install: pip install vllm)
from vllm import LLM, SamplingParams

# FIX: model id was misspelled "mTinyLlama/TinyLlama-1.1B-Chat-v1.0" — that
# repository does not exist on the Hugging Face Hub, so LLM() could never
# download the weights. Corrected to the real repo id used above.
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
sampling_params = SamplingParams(temperature=0.7, max_tokens=500)
outputs = llm.generate(["Hello, how are you?"], sampling_params)


# 3. Cache model responses so identical prompts skip generation entirely.
@lru_cache(maxsize=1000)
def cached_generation(prompt: str, max_tokens: int = 500) -> str:
    """Return the pipeline's generated text for *prompt*, memoized.

    lru_cache keys on (prompt, max_tokens). Generation with sampling is
    non-deterministic, so the cache pins whatever the first call produced
    for a given key — acceptable for a demo, but note it for production.
    """
    return pipe(prompt, max_new_tokens=max_tokens)[0]['generated_text']