iimran
/

Qwen2.5-3B-R1-MedicalReasoner

Text Generation

text-generation-inference

Model card Files Files and versions

iimran committed on Apr 10, 2025

Commit

1f2fe79

·

verified ·

1 Parent(s): efdbd62

Update README.md

Files changed (1) hide show

README.md +38 -10

README.md CHANGED Viewed

@@ -36,30 +36,58 @@ tags:
 Below is an example of how to use the model for inference; alternatively, refer to inference.py in the Files section:
 ```python
 from vllm import SamplingParams
-# Prepare a structured prompt:
 text = tokenizer.apply_chat_template(
     [
-        {"role": "system", "content": "Respond in the following format:\n<reasoning>\n...\n</reasoning>\n<answer>\n...\n</answer>"},
-        {"role": "user", "content": "In the context of disseminated intravascular coagulation (DIC), which blood component is expected to increase due to the excessive breakdown of fibrin?"}
     ],
     tokenize=False,
     add_generation_prompt=True
 )
-# Define sampling parameters:
 sampling_params = SamplingParams(
-    temperature=0.8,
     top_p=0.95,
     max_tokens=4096,
 )
-# Generate and print the output:
 outputs = model.fast_generate(
     text,
     sampling_params=sampling_params,
-    lora_request=None  # Use None if the LoRA adapter is already loaded
 )
 print(outputs[0].outputs[0].text)
 ```

 Below is an example of how to use the model for inference; alternatively, refer to inference.py in the Files section:
 ```python
+from unsloth import FastLanguageModel, is_bfloat16_supported
 from vllm import SamplingParams
+from huggingface_hub import snapshot_download
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name="iimran/Qwen2.5-3B-R1-MedicalReasoner",
+    load_in_4bit=True,
+    fast_inference=True,
+    gpu_memory_utilization=0.5
+)
+lora_rank = 64
+model = FastLanguageModel.get_peft_model(
+    model,
+    r=lora_rank,
+    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
+                    "gate_proj", "up_proj", "down_proj"],
+    lora_alpha=lora_rank,
+    use_gradient_checkpointing="unsloth",
+    random_state=3407,
+)
+lora_path = snapshot_download("iimran/Qwen2.5-3B-R1-MedicalReasoner-lora-adapter")
+print("LoRA adapter downloaded to:", lora_path)
+model.load_lora(lora_path)
+SYSTEM_PROMPT = (
+    "Respond in the following format:\n"
+    "<reasoning>\n"
+    "...\n"
+    "</reasoning>\n"
+    "<answer>\n"
+    "...\n"
+    "</answer>"
+)
+USER_PROMPT = (
+    "In the context of disseminated intravascular coagulation (DIC), "
+    "which blood component is expected to show an increase due to the excessive breakdown of fibrin?"
+)
 text = tokenizer.apply_chat_template(
     [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": USER_PROMPT},
     ],
     tokenize=False,
     add_generation_prompt=True
 )
 sampling_params = SamplingParams(
+    temperature=0.1,
     top_p=0.95,
     max_tokens=4096,
 )
 outputs = model.fast_generate(
     text,
     sampling_params=sampling_params,
+    lora_request=None
 )
 print(outputs[0].outputs[0].text)
 ```