iimran committed on
Commit
1f2fe79
·
verified ·
1 Parent(s): efdbd62

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +38 -10
README.md CHANGED
@@ -36,30 +36,58 @@ tags:
36
  Below is an example of how to use the model for inference or refer to inference.py in files section:
37
 
38
  ```python
 
39
  from vllm import SamplingParams
40
-
41
- # Prepare a structured prompt:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  text = tokenizer.apply_chat_template(
43
  [
44
- {"role": "system", "content": "Respond in the following format:\n<reasoning>\n...\n</reasoning>\n<answer>\n...\n</answer>"},
45
- {"role": "user", "content": "In the context of disseminated intravascular coagulation (DIC), which blood component is expected to increase due to the excessive breakdown of fibrin?"}
46
  ],
47
  tokenize=False,
48
  add_generation_prompt=True
49
  )
50
-
51
- # Define sampling parameters:
52
  sampling_params = SamplingParams(
53
- temperature=0.8,
54
  top_p=0.95,
55
  max_tokens=4096,
56
  )
57
-
58
- # Generate and print the output:
59
  outputs = model.fast_generate(
60
  text,
61
  sampling_params=sampling_params,
62
- lora_request=None # Use None if the LoRA adapter is already loaded
63
  )
64
  print(outputs[0].outputs[0].text)
65
  ```
 
36
  Below is an example of how to use the model for inference or refer to inference.py in files section:
37
 
38
  ```python
39
+ from unsloth import FastLanguageModel, is_bfloat16_supported
40
  from vllm import SamplingParams
41
+ from huggingface_hub import snapshot_download
42
+ model, tokenizer = FastLanguageModel.from_pretrained(
43
+ model_name="iimran/Qwen2.5-3B-R1-MedicalReasoner",
44
+ load_in_4bit=True,
45
+ fast_inference=True,
46
+ gpu_memory_utilization=0.5
47
+ )
48
+ lora_rank = 64
49
+ model = FastLanguageModel.get_peft_model(
50
+ model,
51
+ r=lora_rank,
52
+ target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
53
+ "gate_proj", "up_proj", "down_proj"],
54
+ lora_alpha=lora_rank,
55
+ use_gradient_checkpointing="unsloth",
56
+ random_state=3407,
57
+ )
58
+ lora_path = snapshot_download("iimran/Qwen2.5-3B-R1-MedicalReasoner-lora-adapter")
59
+ print("LoRA adapter downloaded to:", lora_path)
60
+ model.load_lora(lora_path)
61
+ SYSTEM_PROMPT = (
62
+ "Respond in the following format:\n"
63
+ "<reasoning>\n"
64
+ "...\n"
65
+ "</reasoning>\n"
66
+ "<answer>\n"
67
+ "...\n"
68
+ "</answer>"
69
+ )
70
+ USER_PROMPT = (
71
+ "In the context of disseminated intravascular coagulation (DIC), "
72
+ "which blood component is expected to show an increase due to the excessive breakdown of fibrin?"
73
+ )
74
  text = tokenizer.apply_chat_template(
75
  [
76
+ {"role": "system", "content": SYSTEM_PROMPT},
77
+ {"role": "user", "content": USER_PROMPT},
78
  ],
79
  tokenize=False,
80
  add_generation_prompt=True
81
  )
 
 
82
  sampling_params = SamplingParams(
83
+ temperature=0.1,
84
  top_p=0.95,
85
  max_tokens=4096,
86
  )
 
 
87
  outputs = model.fast_generate(
88
  text,
89
  sampling_params=sampling_params,
90
+ lora_request=None
91
  )
92
  print(outputs[0].outputs[0].text)
93
  ```