import os

import gradio as gr
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Read the Hugging Face token from the environment (required if the repos are private)
hf_token = os.getenv("HF_TOKEN")

# Base model and PEFT adapter repos
base_model = "google/gemma-2b-it"
adapter_model = "FadQ/gemma-2b-diary-consultaton-chatbot"

# Recent peft/transformers/accelerate versions are required for compatibility;
# pin them in requirements.txt rather than upgrading at runtime, since a
# `pip install` here cannot affect modules that are already imported above.

# Resolve the device once and reuse it everywhere; fp16 is only safe on GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

# Load the base model, making sure its weights are fully materialized
# (no meta tensors) before the adapter is applied
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=dtype,
    low_cpu_mem_usage=True,
    token=hf_token,
)

# Place the model explicitly; mixing device_map="auto" with a later .to() call
# raises an error on accelerate-dispatched models
model = model.to(device)

# Apply the PEFT adapter on top of the fully loaded base model
model = PeftModel.from_pretrained(model, adapter_model, token=hf_token)
model.eval()

# Load the tokenizer that matches the base model
tokenizer = AutoTokenizer.from_pretrained(base_model, token=hf_token)

# Optional text-generation pipeline; predict() below calls model.generate directly
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if device == "cuda" else -1,
)


def predict(input_text):
    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    with torch.no_grad():
        # max_new_tokens bounds only the generated text, so long prompts are
        # not silently starved of output the way max_length would allow
        output = model.generate(**inputs, max_new_tokens=150)
    return tokenizer.decode(output[0], skip_special_tokens=True)


# Build the Gradio interface
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(label="Input Text"),
    outputs=gr.Textbox(label="Generated Response"),
)

if __name__ == "__main__":
    demo.launch()
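
# --- Optional sketch (an assumption, not part of the original app) ----------
# google/gemma-2b-it is instruction-tuned, so wrapping user input in the Gemma
# chat template usually produces better replies than feeding raw text. A
# minimal sketch of an alternative predict(), assuming the tokenizer ships
# with the standard Gemma chat template (transformers >= 4.34); predict_chat
# is a hypothetical name:
#
# def predict_chat(input_text):
#     messages = [{"role": "user", "content": input_text}]
#     prompt_ids = tokenizer.apply_chat_template(
#         messages, add_generation_prompt=True, return_tensors="pt"
#     ).to(device)
#     with torch.no_grad():
#         output = model.generate(prompt_ids, max_new_tokens=150)
#     # Decode only the newly generated tokens, not the echoed prompt
#     return tokenizer.decode(output[0, prompt_ids.shape[-1]:], skip_special_tokens=True)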