# Extracted page header: Spaces: Sleeping | File size: 2,420 Bytes | 3b4cee3
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from codeInsight.utils.config import load_config
import litserve as ls
class LLMApi(ls.LitAPI):
    """LitServe API that wraps a causal language model for prompt completion.

    NOTE(review): only ``setup`` and ``generate`` are defined in this class;
    LitServe's ``LitAPI`` normally also expects a ``predict`` method — confirm
    how requests are meant to reach ``generate`` before deploying.
    """

    def setup(self, device, config_path="config/model.yaml"):
        """Load the config, tokenizer and model, and place the model on *device*.

        Args:
            device: Device identifier; the model is moved to it unless it is
                ``"cpu"``.
            config_path: Path handed to ``load_config``.
        """
        self.config = load_config(config_path)
        self.dataset_config = self.config['dataset']
        model_name = self.config['paths']['final_model_repo']
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        if device != "cpu":
            self.model.to(device)
        # Inference only: switch off dropout and other train-time behaviour.
        self.model.eval()

    def _formet_prompt(self, prompt: str) -> str:
        """Wrap *prompt* in the chat template built from the dataset config.

        The layout is: system prompt, user token, the prompt, end token, a
        blank line, then the assistant token that cues the model to answer.
        """
        cfg = self.dataset_config
        return (
            f"{cfg['SYSTEM_PROMPT']}{cfg['USER_TOKEN']}{prompt}{cfg['END_TOKEN']}"
            f"\n\n{cfg['ASSISTANT_TOKEN']}"
        )

    def generate(
        self,
        prompt: str,
        max_length: int = 512,
        temperature: float = 0.2,
        top_p: float = 0.80,
    ) -> dict:
        """Sample a completion for *prompt* and return it in a response dict.

        Args:
            prompt: Raw user text; it is wrapped by ``_formet_prompt``.
            max_length: Upper bound on newly generated tokens (forwarded as
                ``max_new_tokens``).
            temperature: Sampling temperature forwarded to the model.
            top_p: Nucleus-sampling threshold forwarded to the model.

        Returns:
            ``{"response": text}`` on success, or ``{"error": message}`` if
            anything raised while tokenizing, generating or decoding.
        """
        try:
            cfg = self.dataset_config
            inputs = self.tokenizer(
                self._formet_prompt(prompt),
                return_tensors="pt",
            ).to(self.model.device)

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_length,
                    temperature=temperature,
                    top_p=top_p,
                    do_sample=True,
                    eos_token_id=self.tokenizer.convert_tokens_to_ids(cfg['END_TOKEN']),
                    pad_token_id=self.tokenizer.eos_token_id,
                )

            # The returned sequence starts with the prompt tokens. Decode only
            # the newly generated tail so the prompt can never leak into the
            # response, even when the marker tokens are special tokens that
            # ``skip_special_tokens=True`` removes from the decoded text.
            prompt_len = inputs["input_ids"].shape[1]
            completion = self.tokenizer.decode(
                outputs[0][prompt_len:],
                skip_special_tokens=True,
            )

            # Cut at the first marker the model may have emitted as plain text.
            for marker in (cfg['ASSISTANT_TOKEN'], cfg['END_TOKEN']):
                completion = completion.partition(marker)[0]

            return {"response": completion.strip()}
        except Exception as e:
            # API boundary: report the failure to the caller instead of raising.
            return {"error": str(e)}
def main():
    """Start a LitServe server hosting ``LLMApi`` with ``accelerator="auto"``."""
    server = ls.LitServer(LLMApi(), accelerator="auto")
    server.run()


if __name__ == "__main__":
    main()