from unsloth import FastLanguageModel, is_bfloat16_supported
from vllm import SamplingParams
from huggingface_hub import snapshot_download

# Load the 4-bit quantized base model with Unsloth's vLLM-backed fast
# inference enabled. gpu_memory_utilization=0.5 leaves headroom for the
# LoRA adapter and KV cache on the same GPU.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="iimran/Qwen2.5-3B-R1-MedicalReasoner",
    load_in_4bit=True,
    fast_inference=True,
    gpu_memory_utilization=0.5,
)

# Attach a PEFT/LoRA configuration whose rank and target modules match the
# adapter we are about to download (rank 64, all attention + MLP projections).
lora_rank = 64
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=lora_rank,
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

# Download the trained LoRA adapter weights from the Hugging Face Hub.
lora_path = snapshot_download("iimran/Qwen2.5-3B-R1-MedicalReasoner-lora-adapter")
print("LoRA adapter downloaded to:", lora_path)

# BUG FIX: load_lora() returns a vLLM LoRARequest that must be forwarded to
# fast_generate(). The original code discarded this return value and passed
# lora_request=None below, so generation silently ran on the BASE weights
# and the downloaded adapter was never applied.
lora_request = model.load_lora(lora_path)

# Prompt template: the model was trained to emit its chain of thought inside
# <reasoning> tags followed by the final answer inside <answer> tags.
SYSTEM_PROMPT = (
    "Respond in the following format:\n"
    "<reasoning>\n"
    "...\n"
    "</reasoning>\n"
    "<answer>\n"
    "...\n"
    "</answer>"
)
USER_PROMPT = (
    "In the context of disseminated intravascular coagulation (DIC), "
    "which blood component is expected to show an increase due to the excessive breakdown of fibrin?"
)

# Render the chat template to a plain string (tokenize=False) and append the
# assistant generation prompt so the model starts its reply immediately.
text = tokenizer.apply_chat_template(
    [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT},
    ],
    tokenize=False,
    add_generation_prompt=True,
)

# Near-greedy decoding: low temperature for reproducible medical answers,
# generous max_tokens so the <reasoning> block is not truncated.
sampling_params = SamplingParams(
    temperature=0.1,
    top_p=0.95,
    max_tokens=4096,
)

outputs = model.fast_generate(
    text,
    sampling_params=sampling_params,
    lora_request=lora_request,  # was None — the adapter was being ignored
)
# vLLM returns a list of RequestOutput objects; take the first completion
# of the first (and only) request.
print(outputs[0].outputs[0].text)