Spaces:
Sleeping
Sleeping
# optimization.py
# 1. Use a transformers pipeline for simplicity: one object that bundles the
#    tokenizer + model + generation loop.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

# FIX: passing "load_in_4bit": True directly inside model_kwargs is deprecated
# in recent transformers releases — quantization options belong in a
# BitsAndBytesConfig passed as quantization_config. float16 is now expressed as
# the 4-bit *compute* dtype, which is what it effectively controlled before
# (with 4-bit weights, a top-level torch_dtype is otherwise ignored/ambiguous).
# NOTE(review): 4-bit loading requires the bitsandbytes package and a CUDA
# device — same runtime requirement as the original code.
pipe = pipeline(
    "text-generation",
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    model_kwargs={
        "device_map": "auto",  # let accelerate place layers on available devices
        "quantization_config": BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
        ),
    },
    tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
)
# 2. Use vLLM for high-throughput batched inference (install: pip install vllm).
from vllm import LLM, SamplingParams

# FIX: the model id had a stray leading "m" ("mTinyLlama/TinyLlama-...") which
# does not exist on the Hugging Face Hub and would fail to load.
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")

# Sampling knobs: mildly creative temperature, bounded response length.
sampling_params = SamplingParams(temperature=0.7, max_tokens=500)

# generate() takes a batch of prompts; here a single-element batch.
outputs = llm.generate(["Hello, how are you?"], sampling_params)
# 3. Cache model responses so repeated identical prompts skip a full
#    generation pass.
import hashlib  # kept from the original file; not used in this block
from functools import lru_cache


# FIX: the original imported lru_cache but never applied it, so the function
# named "cached_generation" cached nothing. The decorator memoizes on the
# (prompt, max_tokens) argument tuple — both hashable — and is bounded so the
# cache cannot grow without limit.
@lru_cache(maxsize=128)
def cached_generation(prompt: str, max_tokens: int = 500) -> str:
    """Generate text for *prompt* via the module-level ``pipe``, memoized.

    NOTE(review): sampling-based generation is only repeatable if the pipeline
    decodes deterministically (e.g. greedy); cached results are returned as-is
    regardless — confirm that is the intent.
    """
    return pipe(prompt, max_new_tokens=max_tokens)[0]['generated_text']