"""Gradio chat demo for the ``richardcsuwandi/llama2-javanese`` fine-tuned model."""

import gradio as gr
import torch
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM

# Load the fine-tuned model and tokenizer
model_name = "richardcsuwandi/llama2-javanese"
model = AutoPeftModelForCausalLM.from_pretrained(
    model_name,
    device_map='cpu',
    offload_folder='./',
    torch_dtype=torch.bfloat16,
)

# Merge adapter with base
model = model.merge_and_unload()
model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"


def _sampling_kwargs(temperature, top_p):
    """Return the sampling-related ``generate`` kwargs, or ``{}`` for greedy decoding."""
    if temperature is None and top_p is None:
        return {}
    if temperature is not None and temperature <= 0:
        # transformers rejects non-positive temperatures; fall back to greedy.
        return {}
    kwargs = {"do_sample": True}
    if temperature is not None:
        kwargs["temperature"] = float(temperature)
    if top_p is not None:
        kwargs["top_p"] = float(top_p)
    return kwargs


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature=None,
    top_p=None,
):
    """Generate a single chat reply for ``message``.

    Args:
        message: The latest user message.
        history: Previous (user, assistant) turns supplied by Gradio. It is
            not included in the prompt; every reply is generated from
            ``system_message`` and ``message`` only.
        system_message: System instruction placed at the start of the prompt.
        max_tokens: Maximum number of tokens to generate for the reply.
        temperature: Optional sampling temperature. ``None`` (the default)
            keeps deterministic decoding.
        top_p: Optional nucleus-sampling threshold. ``None`` (the default)
            leaves it unset.

    Returns:
        The decoded reply text, without the prompt and without special tokens.
    """
    # Format the input text
    # NOTE(review): the bare "<>" markers look like Llama 2 "<<SYS>>" /
    # "<</SYS>>" tags that lost their content; left unchanged because the
    # template used during fine-tuning is not visible here - confirm.
    input_text = f"[INST] <> {system_message} <> {message} [/INST]"

    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    input_ids = inputs['input_ids']

    generation_kwargs = {
        # Budget applies to the reply only; max_length would also count the
        # prompt and could leave no room for an answer.
        "max_new_tokens": int(max_tokens),
        "repetition_penalty": 1.2,
        "pad_token_id": tokenizer.pad_token_id,
    }
    generation_kwargs.update(_sampling_kwargs(temperature, top_p))

    # Generate response
    output_sequences = model.generate(
        input_ids=input_ids,
        attention_mask=inputs['attention_mask'],
        **generation_kwargs,
    )

    # Decode the generated response, skipping the echoed prompt tokens
    input_length = input_ids.shape[1]
    generated_text = tokenizer.decode(
        output_sequences[0][input_length:],
        skip_special_tokens=True,
    )
    return generated_text


# Only the system message and token budget are exposed in the UI; respond()
# falls back to its defaults for temperature and top_p.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(
            value="Sampeyan minangka chatbot umum sing tansah mangsuli nganggo basa Jawa.",
            label="System message",
        ),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max tokens"),
    ],
)


if __name__ == "__main__":
    demo.launch(share=True)