import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from peft import PeftModel  # Needed to load the LoRA adapter weights on top of the base model

# --- Configuration ---
# 1. Base Llama 2 model used for fine-tuning
BASE_MODEL = "aboonaji/llama2finetune-v2"
# 2. The newly published adapter model on the Hub
ADAPTER_MODEL = "dynamodenis254/dynamo-denis-llama2finetune-medical"


# --- Model Loading ---
# This function loads the model and runs only once when the app starts.
def load_model():
    """Loads the base model and applies the fine-tuned adapter weights."""
    print(f"Loading base model: {BASE_MODEL}")

    # Check for GPU availability (not needed: device_map="auto" handles placement)
    # device = "cuda" if torch.cuda.is_available() else "cpu"
    # print(f"Using device: {device}")

    # === FIX: Define the 4-bit quantization configuration ===
    # This dramatically reduces memory usage, which resolves the 'offload_dir' error.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        # bfloat16 is recommended for Llama models on modern GPUs (A100, V100, T4)
        bnb_4bit_compute_dtype=torch.bfloat16,
        llm_int8_enable_fp32_cpu_offload=True,
    )

    # Load the base model (trust_remote_code=True is required for custom Llama repos)
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=bnb_config,
        dtype=torch.float16,  # Use half precision for faster GPU inference
        device_map="auto",
        trust_remote_code=True,
        offload_folder="./offload_base",  # Folder for disk offloading, since we are on free CPU hardware rather than GPU
    )

    # === CRITICAL FIX: Prevent the 'multiple adapters' warning/crash ===
    # If the base model repo contains an old PEFT config, it is loaded into the
    # model's internal _peft_config attribute. We must delete it before loading
    # the new adapter to prevent conflicts.
    if hasattr(base_model, "_peft_config"):
        print("Cleaning up potentially conflicting _peft_config from the base model.")
        del base_model._peft_config
    # ====================================================================

    # Load the PEFT (LoRA) adapter weights on top of the base model.
    # Note: PeftModel.from_pretrained expects `offload_folder` (not `offload_dir`).
    model = PeftModel.from_pretrained(
        base_model,
        ADAPTER_MODEL,
        offload_folder="./offload_peft",  # Folder for disk offloading, since we are on free CPU hardware rather than GPU
    )

    # Get the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # Create the Hugging Face pipeline for easy text generation
    generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        # device=0 if device == "cuda" else -1  # Not needed: device_map="auto" already places the model
    )
    print("Model and Tokenizer loaded successfully.")
    return generator


# Load the model outside the prediction function so it runs only once
generator = load_model()


# --- Prediction Function ---
def generate_response(prompt, max_new_tokens=256, temperature=0.7):
    """Generates text using the fine-tuned model."""
    # Llama models often work best with a system prompt structure
    system_prompt = "You are a specialized medical assistant. Provide concise and accurate information."
    formatted_prompt = f"### System:\n{system_prompt}\n\n### User:\n{prompt}\n\n### Assistant:\n"

    try:
        # Run the generation pipeline
        result = generator(
            formatted_prompt,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=True,
            return_full_text=False,  # Only return the generated part of the response
        )
        # Extract the text and clean up any potential trailing newlines
        generated_text = result[0]["generated_text"].strip()
        return generated_text
    except Exception as e:
        return f"An error occurred during generation: {e}"


# --- Gradio Interface Setup ---
iface = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(
            lines=4,
            label="Medical Query (e.g., 'What are the symptoms of type 2 diabetes?')",
            placeholder="Enter your medical question...",
        ),
        gr.Slider(
            minimum=32,
            maximum=1024,
            step=32,
            value=256,
            label="Max Response Length",
            info="Controls the length of the generated answer.",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            step=0.1,
            value=0.7,
            label="Creativity (Temperature)",
            info="Higher temperature means more creative/risky answers.",
        ),
    ],
    outputs=gr.Textbox(lines=10, label="Fine-Tuned Medical Assistant Response"),
    title="⚕️ Medical Llama 2 Fine-Tune Demo By Denis Mbugua (dynamodenis254)",
    description="This demo uses a Llama 2 model fine-tuned on medical data. Enter a query and observe the specialized response.",
    theme="soft",
)

# Launch is handled automatically by Hugging Face Spaces
if __name__ == "__main__":
    iface.launch()