import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from peft import PeftModel  # Needed to load the LoRA adapter weights

# --- Configuration ---
# 1. Base Llama 2 model used for fine-tuning
BASE_MODEL = "aboonaji/llama2finetune-v2"
# 2. The newly published adapter model on the Hub
ADAPTER_MODEL = "dynamodenis254/dynamo-denis-llama2finetune-medical"


# --- Model Loading ---
# This function loads the model and runs only once when the app starts.
def load_model():
    """Loads the base model and applies the fine-tuned adapter weights."""
    print(f"Loading base model: {BASE_MODEL}")

    # 4-bit quantization configuration: dramatically reduces memory usage,
    # which resolves the 'offload_dir' error hit when the full-precision
    # model spills to disk.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        # bfloat16 is supported on Ampere and newer GPUs (e.g. A100);
        # older cards such as the T4 or V100 only support torch.float16.
        bnb_4bit_compute_dtype=torch.bfloat16,
        # Allow modules that do not fit on the GPU to sit on the CPU in fp32.
        llm_int8_enable_fp32_cpu_offload=True,
    )
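
    # Hedged fallback (an addition, not in the original app): bitsandbytes
    # 4-bit quantization generally requires a CUDA GPU, so on a CPU-only
    # Space the quantized load itself can fail at startup. Under that
    # assumption, skip quantization entirely when no GPU is visible:
    if not torch.cuda.is_available():
        print("No CUDA GPU detected; loading without 4-bit quantization.")
        bnb_config = None  # from_pretrained treats None as "no quantization"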

    # Load the base model (trust_remote_code=True in case the repo ships custom code)
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,  # Match the 4-bit compute dtype above
        device_map="auto",
        trust_remote_code=True,
        offload_folder="./offload_base",  # Disk offload target when weights exceed GPU/CPU RAM
    )

    # Prevent a 'multiple adapters' warning/crash: if the base model repo
    # contains an old PEFT config, it is loaded into the model's internal
    # _peft_config attribute and conflicts with the adapter loaded below,
    # so delete it first.
    if hasattr(base_model, "_peft_config"):
        print("Cleaning up conflicting _peft_config from the base model.")
        del base_model._peft_config

    # Load the PEFT (LoRA) adapter weights on top of the base model.
    model = PeftModel.from_pretrained(
        base_model,
        ADAPTER_MODEL,
        offload_folder="./offload_peft",  # PEFT expects 'offload_folder', mirroring transformers
    )

    # Load the tokenizer. Llama 2 ships without a pad token, so reuse EOS.
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # Create the Hugging Face pipeline for text generation. No explicit
    # `device` argument: placement is already handled by device_map="auto".
    generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
    )

    print("Model and tokenizer loaded successfully.")
    return generator


# Load the model outside the prediction function so it runs only once.
generator = load_model()
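
# Optional smoke test (a hedged addition, not part of the original Space):
# uncomment to confirm the pipeline produces output before the UI is built.
# print(generator("### User:\nWhat is hypertension?\n\n### Assistant:\n",
#                 max_new_tokens=32)[0]["generated_text"])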


# --- Prediction Function ---
def generate_response(prompt, max_new_tokens=256, temperature=0.7):
    """Generates text using the fine-tuned model."""
    # Llama models often respond best to a system-prompt structure.
    system_prompt = "You are a specialized medical assistant. Provide concise and accurate information."
    formatted_prompt = f"### System:\n{system_prompt}\n\n### User:\n{prompt}\n\n### Assistant:\n"
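
    # For illustration, the query "What causes anemia?" expands to:
    #   ### System:
    #   You are a specialized medical assistant. Provide concise and accurate information.
    #
    #   ### User:
    #   What causes anemia?
    #
    #   ### Assistant: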

    try:
        # Run the generation pipeline.
        result = generator(
            formatted_prompt,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=True,
            return_full_text=False,  # Only return the newly generated text
        )
        # Extract the text and strip leading/trailing whitespace.
        generated_text = result[0]["generated_text"].strip()
        return generated_text
    except Exception as e:
        return f"An error occurred during generation: {e}"

# --- Gradio Interface Setup ---
iface = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(
            lines=4,
            label="Medical Query (e.g., 'What are the symptoms of type 2 diabetes?')",
            placeholder="Enter your medical question...",
        ),
        gr.Slider(
            minimum=32, maximum=1024, step=32, value=256,
            label="Max Response Length",
            info="Controls the length of the generated answer.",
        ),
        gr.Slider(
            minimum=0.1, maximum=1.0, step=0.1, value=0.7,
            label="Creativity (Temperature)",
            info="Higher temperature means more creative/risky answers.",
        ),
    ],
    outputs=gr.Textbox(lines=10, label="Fine-Tuned Medical Assistant Response"),
    title="⚕️ Medical Llama 2 Fine-Tune Demo By Denis Mbugua (dynamodenis254)",
    description="This demo uses a Llama 2 model fine-tuned on medical data. Enter a query and observe the specialized response.",
    theme="soft",
)

# Launch is handled automatically by Hugging Face Spaces.
if __name__ == "__main__":
    iface.launch()
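
# To run locally (assumed environment: Python 3.10+ with a CUDA GPU):
#   pip install gradio torch transformers peft bitsandbytes accelerate
#   python app.py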