Jn-Huang committed on
Commit 89babab · 1 Parent(s): eaaeae1

Switch to vLLM for faster inference with lazy loading and multi-turn fix

Files changed (4)
  1. app.py +52 -46
  2. app_transformers.py +111 -0
  3. app_vllm.py +117 -0
  4. requirements.txt +1 -0
app.py CHANGED
@@ -1,77 +1,83 @@
-# app.py
+# app_vllm.py - Faster inference using vLLM
 import os
-import torch
 import spaces
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from vllm import LLM, SamplingParams
+from vllm.lora.request import LoRARequest
+from transformers import AutoTokenizer
 
 HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
 
 BASE_MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 PEFT_MODEL_ID = "befm/Be.FM-8B"
 
-USE_PEFT = True
-try:
-    from peft import PeftModel, PeftConfig  # noqa
-except Exception:
-    USE_PEFT = False
-    print("[WARN] 'peft' not installed; running base model only.")
-
-def load_model_and_tokenizer():
+def load_model():
     if HF_TOKEN is None:
         raise RuntimeError(
             "HF_TOKEN is not set. Add it in Space → Settings → Secrets. "
             "Also ensure your account has access to the gated base model."
         )
-    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-    tok = AutoTokenizer.from_pretrained(BASE_MODEL_ID, token=HF_TOKEN)
-    if tok.pad_token is None:
-        tok.pad_token = tok.eos_token
-
-    base = AutoModelForCausalLM.from_pretrained(
-        BASE_MODEL_ID,
-        device_map="auto" if torch.cuda.is_available() else None,
-        torch_dtype=dtype,
-        token=HF_TOKEN,
-    )
-
-    if USE_PEFT:
-        try:
-            _ = PeftConfig.from_pretrained(PEFT_MODEL_ID, token=HF_TOKEN)
-            model = PeftModel.from_pretrained(base, PEFT_MODEL_ID, token=HF_TOKEN)
-            print(f"[INFO] Loaded PEFT adapter: {PEFT_MODEL_ID}")
-            return model, tok
-        except Exception as e:
-            print(f"[WARN] Failed to load PEFT adapter: {e}")
-            return base, tok
-    return base, tok
-
-model, tokenizer = load_model_and_tokenizer()
-DEVICE = model.device
+
+    # Initialize vLLM with PEFT support
+    llm = LLM(
+        model=BASE_MODEL_ID,
+        tokenizer=BASE_MODEL_ID,
+        enable_lora=True,
+        max_lora_rank=64,
+        dtype="float16",
+        gpu_memory_utilization=0.9,
+        trust_remote_code=True,
+    )
+
+    print(f"[INFO] vLLM loaded base model: {BASE_MODEL_ID}")
+
+    # Load PEFT adapter
+    lora_request = LoRARequest(
+        lora_name="befm",
+        lora_int_id=1,
+        lora_path=PEFT_MODEL_ID,
+    )
+    print(f"[INFO] PEFT adapter prepared: {PEFT_MODEL_ID}")
+
+    return llm, lora_request
+
+# Lazy load model and tokenizer
+_llm = None
+_lora_request = None
+_tokenizer = None
+
+def get_model_and_tokenizer():
+    global _llm, _lora_request, _tokenizer
+    if _llm is None:
+        _llm, _lora_request = load_model()
+        _tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, token=HF_TOKEN)
+    return _llm, _lora_request, _tokenizer
 
 @spaces.GPU
-@torch.inference_mode()
 def generate_response(messages, max_new_tokens=512, temperature=0.7, top_p=0.9) -> str:
+    llm, lora_request, tokenizer = get_model_and_tokenizer()
+
     # Apply Llama 3.1 chat template
     prompt = tokenizer.apply_chat_template(
         messages,
         tokenize=False,
         add_generation_prompt=True
     )
-    enc = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
-    enc = {k: v.to(DEVICE) for k, v in enc.items()}
 
-    input_length = enc['input_ids'].shape[1]
-    out = model.generate(
-        **enc,
-        max_new_tokens=max_new_tokens,
-        do_sample=True,
+    sampling_params = SamplingParams(
         temperature=temperature,
         top_p=top_p,
-        pad_token_id=tokenizer.eos_token_id,
+        max_tokens=max_new_tokens,
     )
-    # Decode only the newly generated tokens
-    return tokenizer.decode(out[0][input_length:], skip_special_tokens=True)
+
+    # Generate with vLLM
+    outputs = llm.generate(
+        prompts=[prompt],
+        sampling_params=sampling_params,
+        lora_request=lora_request,
+    )
+
+    return outputs[0].outputs[0].text
 
 def chat_fn(message, history, system_prompt, max_new_tokens, temperature, top_p):
     # Build conversation in Llama 3.1 chat format
@@ -103,8 +109,8 @@ demo = gr.ChatInterface(
         gr.Slider(0.1, 1.5, value=0.7, step=0.05, label="temperature"),
         gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="top_p"),
     ],
-    title="Be.FM-8B (PEFT) on Meta-Llama-3.1-8B-Instruct",
-    description="Chat interface using Meta-Llama-3.1-8B-Instruct with PEFT adapter befm/Be.FM-8B."
+    title="Be.FM-8B (vLLM) - Fast Inference",
+    description="Chat interface using vLLM for optimized inference with Meta-Llama-3.1-8B-Instruct and PEFT adapter befm/Be.FM-8B."
 )
 
 if __name__ == "__main__":
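
Note on the "multi-turn fix" in the commit message: chat_fn rebuilds the full message list on every turn (system prompt, prior history as OpenAI-style dicts, then the new user message) and re-renders it through the Llama 3.1 chat template, so context survives across turns. A minimal sketch of that flow, assuming Gradio's "messages"-format history; the example history and system prompt here are made up, and the tokenizer is the same gated one the app uses, so HF_TOKEN must grant access to meta-llama/Meta-Llama-3.1-8B-Instruct:

# Sketch only: how one multi-turn exchange is flattened into a single prompt.
import os
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B-Instruct", token=os.getenv("HF_TOKEN")
)

history = [  # hypothetical prior turns, in Gradio "messages" format
    {"role": "user", "content": "Hello!"},
    {"role": "assistant", "content": "Hi, how can I help?"},
]
messages = [{"role": "system", "content": "You are Be.FM assistant."}]
messages.extend(history)
messages.append({"role": "user", "content": "Summarize our chat."})

# add_generation_prompt=True appends the assistant header tokens, so the
# model continues as the assistant instead of extending the user turn.
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)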
app_transformers.py ADDED
@@ -0,0 +1,111 @@
+# app.py
+import os
+import torch
+import spaces
+import gradio as gr
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
+
+BASE_MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+PEFT_MODEL_ID = "befm/Be.FM-8B"
+
+USE_PEFT = True
+try:
+    from peft import PeftModel, PeftConfig  # noqa
+except Exception:
+    USE_PEFT = False
+    print("[WARN] 'peft' not installed; running base model only.")
+
+def load_model_and_tokenizer():
+    if HF_TOKEN is None:
+        raise RuntimeError(
+            "HF_TOKEN is not set. Add it in Space → Settings → Secrets. "
+            "Also ensure your account has access to the gated base model."
+        )
+    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+    tok = AutoTokenizer.from_pretrained(BASE_MODEL_ID, token=HF_TOKEN)
+    if tok.pad_token is None:
+        tok.pad_token = tok.eos_token
+
+    base = AutoModelForCausalLM.from_pretrained(
+        BASE_MODEL_ID,
+        device_map="auto" if torch.cuda.is_available() else None,
+        torch_dtype=dtype,
+        token=HF_TOKEN,
+    )
+
+    if USE_PEFT:
+        try:
+            _ = PeftConfig.from_pretrained(PEFT_MODEL_ID, token=HF_TOKEN)
+            model = PeftModel.from_pretrained(base, PEFT_MODEL_ID, token=HF_TOKEN)
+            print(f"[INFO] Loaded PEFT adapter: {PEFT_MODEL_ID}")
+            return model, tok
+        except Exception as e:
+            print(f"[WARN] Failed to load PEFT adapter: {e}")
+            return base, tok
+    return base, tok
+
+model, tokenizer = load_model_and_tokenizer()
+DEVICE = model.device
+
+@spaces.GPU
+@torch.inference_mode()
+def generate_response(messages, max_new_tokens=512, temperature=0.7, top_p=0.9) -> str:
+    # Apply Llama 3.1 chat template
+    prompt = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+    enc = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
+    enc = {k: v.to(DEVICE) for k, v in enc.items()}
+
+    input_length = enc['input_ids'].shape[1]
+    out = model.generate(
+        **enc,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        temperature=temperature,
+        top_p=top_p,
+        pad_token_id=tokenizer.eos_token_id,
+    )
+    # Decode only the newly generated tokens
+    return tokenizer.decode(out[0][input_length:], skip_special_tokens=True)
+
+def chat_fn(message, history, system_prompt, max_new_tokens, temperature, top_p):
+    # Build conversation in Llama 3.1 chat format
+    messages = []
+    if system_prompt:
+        messages.append({"role": "system", "content": system_prompt})
+
+    # History is already in dict format: [{"role": "user", "content": "..."}, ...]
+    for msg in (history or []):
+        messages.append(msg)
+
+    if message:
+        messages.append({"role": "user", "content": message})
+
+    reply = generate_response(
+        messages,
+        max_new_tokens=max_new_tokens,
+        temperature=temperature,
+        top_p=top_p,
+    )
+    return reply
+
+demo = gr.ChatInterface(
+    fn=lambda message, history, system_prompt, max_new_tokens, temperature, top_p:
+        chat_fn(message, history, system_prompt, max_new_tokens, temperature, top_p),
+    additional_inputs=[
+        gr.Textbox(label="System prompt (optional)", placeholder="You are Be.FM assistant...", lines=2),
+        gr.Slider(16, 2048, value=512, step=16, label="max_new_tokens"),
+        gr.Slider(0.1, 1.5, value=0.7, step=0.05, label="temperature"),
+        gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="top_p"),
+    ],
+    title="Be.FM-8B (PEFT) on Meta-Llama-3.1-8B-Instruct",
+    description="Chat interface using Meta-Llama-3.1-8B-Instruct with PEFT adapter befm/Be.FM-8B."
+)
+
+if __name__ == "__main__":
+    demo.launch()
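
One detail worth preserving from this kept Transformers path: model.generate returns the prompt tokens followed by the completion in a single tensor, which is why generate_response slices out[0][input_length:] before decoding. A self-contained sketch of that slicing; sshleifer/tiny-gpt2 is a stand-in chosen only so the snippet runs without the gated Llama weights:

# Sketch: why generate_response decodes out[0][input_length:].
# Decoding the full row would repeat the prompt in the reply.
from transformers import AutoTokenizer, AutoModelForCausalLM

tok = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")

enc = tok("Hello there", return_tensors="pt")
input_length = enc["input_ids"].shape[1]
out = model.generate(**enc, max_new_tokens=8, pad_token_id=tok.eos_token_id)

full = tok.decode(out[0], skip_special_tokens=True)                   # prompt + reply
reply = tok.decode(out[0][input_length:], skip_special_tokens=True)   # reply only
print(repr(full))
print(repr(reply))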
app_vllm.py ADDED
@@ -0,0 +1,117 @@
+# app_vllm.py - Faster inference using vLLM
+import os
+import spaces
+import gradio as gr
+from vllm import LLM, SamplingParams
+from vllm.lora.request import LoRARequest
+from transformers import AutoTokenizer
+
+HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
+
+BASE_MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+PEFT_MODEL_ID = "befm/Be.FM-8B"
+
+def load_model():
+    if HF_TOKEN is None:
+        raise RuntimeError(
+            "HF_TOKEN is not set. Add it in Space → Settings → Secrets. "
+            "Also ensure your account has access to the gated base model."
+        )
+
+    # Initialize vLLM with PEFT support
+    llm = LLM(
+        model=BASE_MODEL_ID,
+        tokenizer=BASE_MODEL_ID,
+        enable_lora=True,
+        max_lora_rank=64,
+        dtype="float16",
+        gpu_memory_utilization=0.9,
+        trust_remote_code=True,
+    )
+
+    print(f"[INFO] vLLM loaded base model: {BASE_MODEL_ID}")
+
+    # Load PEFT adapter
+    lora_request = LoRARequest(
+        lora_name="befm",
+        lora_int_id=1,
+        lora_path=PEFT_MODEL_ID,
+    )
+    print(f"[INFO] PEFT adapter prepared: {PEFT_MODEL_ID}")
+
+    return llm, lora_request
+
+# Lazy load model and tokenizer
+_llm = None
+_lora_request = None
+_tokenizer = None
+
+def get_model_and_tokenizer():
+    global _llm, _lora_request, _tokenizer
+    if _llm is None:
+        _llm, _lora_request = load_model()
+        _tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, token=HF_TOKEN)
+    return _llm, _lora_request, _tokenizer
+
+@spaces.GPU
+def generate_response(messages, max_new_tokens=512, temperature=0.7, top_p=0.9) -> str:
+    llm, lora_request, tokenizer = get_model_and_tokenizer()
+
+    # Apply Llama 3.1 chat template
+    prompt = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+
+    sampling_params = SamplingParams(
+        temperature=temperature,
+        top_p=top_p,
+        max_tokens=max_new_tokens,
+    )
+
+    # Generate with vLLM
+    outputs = llm.generate(
+        prompts=[prompt],
+        sampling_params=sampling_params,
+        lora_request=lora_request,
+    )
+
+    return outputs[0].outputs[0].text
+
+def chat_fn(message, history, system_prompt, max_new_tokens, temperature, top_p):
+    # Build conversation in Llama 3.1 chat format
+    messages = []
+    if system_prompt:
+        messages.append({"role": "system", "content": system_prompt})
+
+    # History is already in dict format: [{"role": "user", "content": "..."}, ...]
+    for msg in (history or []):
+        messages.append(msg)
+
+    if message:
+        messages.append({"role": "user", "content": message})
+
+    reply = generate_response(
+        messages,
+        max_new_tokens=max_new_tokens,
+        temperature=temperature,
+        top_p=top_p,
+    )
+    return reply
+
+demo = gr.ChatInterface(
+    fn=lambda message, history, system_prompt, max_new_tokens, temperature, top_p:
+        chat_fn(message, history, system_prompt, max_new_tokens, temperature, top_p),
+    additional_inputs=[
+        gr.Textbox(label="System prompt (optional)", placeholder="You are Be.FM assistant...", lines=2),
+        gr.Slider(16, 2048, value=512, step=16, label="max_new_tokens"),
+        gr.Slider(0.1, 1.5, value=0.7, step=0.05, label="temperature"),
+        gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="top_p"),
+    ],
+    title="Be.FM-8B (vLLM) - Fast Inference",
+    description="Chat interface using vLLM for optimized inference with Meta-Llama-3.1-8B-Instruct and PEFT adapter befm/Be.FM-8B."
+)
+
+if __name__ == "__main__":
+    demo.launch()
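
Design note: vLLM applies the LoRA adapter per request rather than merging it into the base weights, so one resident engine can serve requests with or without the adapter (or with several adapters). A minimal offline sketch of that pattern, assuming a CUDA GPU, vLLM installed, and an HF token with access to the gated base model; snapshot_download is used here because LoRARequest's lora_path expects a local adapter directory in many vLLM versions, which may also be worth checking against lora_path=PEFT_MODEL_ID in the file above:

# Sketch, not the app itself: one vLLM engine, adapter chosen per call.
from huggingface_hub import snapshot_download
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# Fetch the adapter to a local directory first.
adapter_dir = snapshot_download("befm/Be.FM-8B")

llm = LLM(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    enable_lora=True,
    max_lora_rank=64,
    dtype="float16",
)
params = SamplingParams(temperature=0.7, top_p=0.9, max_tokens=64)
adapter = LoRARequest("befm", 1, adapter_dir)

# Same engine, with and without the adapter applied:
base_out = llm.generate(["Hello!"], params)
lora_out = llm.generate(["Hello!"], params, lora_request=adapter)
print(base_out[0].outputs[0].text)
print(lora_out[0].outputs[0].text)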
requirements.txt CHANGED
@@ -3,3 +3,4 @@ transformers>=4.30.0
 peft>=0.4.0
 spaces
 accelerate
+vllm>=0.6.0