# Inference example for the GPTQ-quantized Merak-7B Mini Orca (Indonesian) model,
# using AutoGPTQ together with the transformers tokenizer and pipeline.
from transformers import AutoTokenizer, pipeline, logging
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

model_name_or_path = "asyafiqe/Merak-7B-v3-Mini-Orca-Indo-GPTQ"
model_basename = "Merak-7B-v3-Mini-Orca-Indo-GPTQ"
use_triton = False

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

# Load the quantized weights onto the first GPU. quantize_config=None tells
# AutoGPTQ to read the quantization settings shipped with the checkpoint.
model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
                                           model_basename=model_basename,
                                           use_safetensors=True,
                                           trust_remote_code=True,
                                           device="cuda:0",
                                           use_triton=use_triton,
                                           quantize_config=None)
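
# BaseQuantizeConfig is imported above but unused, because quantize_config=None
# makes AutoGPTQ read quantize_config.json from the checkpoint. If that file
# were missing, the settings could be supplied explicitly. A minimal sketch;
# the bits/group_size/desc_act values below are illustrative assumptions, not
# necessarily what this checkpoint actually uses:
#
# quantize_config = BaseQuantizeConfig(bits=4, group_size=128, desc_act=False)
# model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
#                                            model_basename=model_basename,
#                                            use_safetensors=True,
#                                            device="cuda:0",
#                                            quantize_config=quantize_config)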
def predict(prompt):
    # Example prompt: "Buat rencana untuk menghemat listrik di rumah"
    # ("Make a plan to save electricity at home")
    #
    # The system message is kept in Indonesian to match the model's training
    # prompts. In English: "You are an AI assistant. You will be given a task.
    # You must produce a detailed and long answer."
    system_message = "Anda adalah asisten AI. Anda akan diberi tugas. Anda harus menghasilkan jawaban yang rinci dan panjang.\n"
    prompt_template = f'''SYSTEM: {system_message}
USER: {prompt}
ASSISTANT: '''

    print("\n\n*** Generate:")
    input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
    # do_sample=True is required for temperature to take effect; without it,
    # generate() falls back to greedy decoding and ignores sampling parameters.
    output = model.generate(inputs=input_ids, do_sample=True, temperature=0.7, max_new_tokens=512)
    print(tokenizer.decode(output[0]))
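
    # decode(output[0]) returns the prompt plus the completion. To keep only
    # the newly generated text, slice off the prompt tokens first; a small
    # variant, not part of the original snippet:
    #
    # new_tokens = output[0][input_ids.shape[1]:]
    # print(tokenizer.decode(new_tokens, skip_special_tokens=True))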
    # Inference can also be done using transformers' pipeline.
    # Suppress spurious transformers errors printed when using a pipeline
    # with an AutoGPTQ model.
    logging.set_verbosity(logging.CRITICAL)

    print("*** Pipeline:")
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        do_sample=True,  # enable sampling so temperature/top_p apply
        temperature=0.7,
        top_p=0.95,
        repetition_penalty=1.15
    )
    result = pipe(prompt_template)[0]['generated_text']
    return result
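
# Example usage, reusing the sample prompt quoted inside predict(). This call
# is an addition for illustration; the original snippet only defines the function.
if __name__ == "__main__":
    print(predict("Buat rencana untuk menghemat listrik di rumah"))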