def get_llm_response(repo, filename, model_type, gpu_layers, prompt):
    from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

    model_name_or_path = "TheBloke/meditron-7B-GPTQ"
    # To use a different branch, change revision
    # For example: revision="gptq-4bit-128g-actorder_True"
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                                 device_map="auto",
                                                 trust_remote_code=False,
                                                 revision="main")
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

    print("\n\n*** Generate:")
    # input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
    # output = model.generate(inputs=input_ids, temperature=0.7, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=512)
    # print(tokenizer.decode(output[0]))

    # Inference can also be done using transformers' pipeline
    print("*** Pipeline:")
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        top_k=40,
        repetition_penalty=1.1
    )
    # Build the ChatML-style prompt the model expects
    system_message = "You are an assistant"
    prompt_template = f'''<|im_start|>system
{system_message}<|im_end|>
<|im_start|>user
{prompt}<|im_end|>
<|im_start|>assistant
'''
    # 'generated_text' contains the prompt plus the completion;
    # pass return_full_text=False to the pipeline call to get only the completion.
    response = pipe(prompt_template)[0]['generated_text']
    print(response)
    return response
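
For context, here is a minimal usage sketch, not part of the original code. It assumes that only prompt actually influences the output, since the function hard-codes TheBloke/meditron-7B-GPTQ and never reads repo, filename, model_type, or gpu_layers; the argument values and the example question are illustrative placeholders.

# Hypothetical usage sketch: the first four arguments are placeholders,
# because get_llm_response ignores them and loads the hard-coded GPTQ model.
if __name__ == "__main__":
    answer = get_llm_response(
        repo="TheBloke/meditron-7B-GPTQ",  # placeholder, unused by the function
        filename=None,                     # placeholder, unused
        model_type="gptq",                 # placeholder, unused
        gpu_layers=0,                      # placeholder, unused
        prompt="What are the common symptoms of iron deficiency?",
    )
    print(answer)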