humair025 committed on
Commit
44cb2fc
·
verified ·
1 Parent(s): 3e6a4f1

Update soprano/backends/lmdeploy.py

Browse files
Files changed (1) hide show
  1. soprano/backends/lmdeploy.py +26 -13
soprano/backends/lmdeploy.py CHANGED
@@ -8,26 +8,37 @@ class LMDeployModel(BaseModel):
8
  device='cuda',
9
  cache_size_mb=100,
10
  **kwargs):
11
- assert device == 'cuda', "lmdeploy only supports cuda devices, consider changing device or using a different backend instead."
12
- cache_size_ratio = cache_size_mb * 1024**2 / torch.cuda.get_device_properties('cuda').total_memory
13
- backend_config = TurbomindEngineConfig(cache_max_entry_count=cache_size_ratio)
14
- print("Loaded config.")
15
- self.pipeline = pipeline('ekwek/Soprano-80M',
16
- log_level='ERROR',
17
- backend_config=backend_config)
18
- print("Loaded pipeline.")
 
 
 
 
 
 
 
 
 
19
 
20
  def infer(self,
21
  prompts,
22
  top_p=0.95,
23
  temperature=0.3,
24
  repetition_penalty=1.2):
25
- gen_config=GenerationConfig(output_last_hidden_state='generation',
 
26
  do_sample=True,
27
  top_p=top_p,
28
  temperature=temperature,
29
  repetition_penalty=repetition_penalty,
30
- max_new_tokens=512)
 
31
  responses = self.pipeline(prompts, gen_config=gen_config)
32
  res = []
33
  for response in responses:
@@ -42,15 +53,17 @@ class LMDeployModel(BaseModel):
42
  top_p=0.95,
43
  temperature=0.3,
44
  repetition_penalty=1.2):
45
- gen_config=GenerationConfig(output_last_hidden_state='generation',
 
46
  do_sample=True,
47
  top_p=top_p,
48
  temperature=temperature,
49
  repetition_penalty=repetition_penalty,
50
- max_new_tokens=512)
 
51
  responses = self.pipeline.stream_infer([prompt], gen_config=gen_config)
52
  for response in responses:
53
  yield {
54
  'finish_reason': response.finish_reason,
55
  'hidden_state': response.last_hidden_state
56
- }
 
8
  device='cuda',
9
  cache_size_mb=100,
10
  **kwargs):
11
+ # LMDeploy supports both CUDA and CPU
12
+ self.device = device
13
+
14
+ if device == 'cuda':
15
+ # Original CUDA implementation with cache size optimization
16
+ cache_size_ratio = cache_size_mb * 1024**2 / torch.cuda.get_device_properties('cuda').total_memory
17
+ backend_config = TurbomindEngineConfig(cache_max_entry_count=cache_size_ratio)
18
+ self.pipeline = pipeline('ekwek/Soprano-80M',
19
+ log_level='ERROR',
20
+ backend_config=backend_config)
21
+ elif device == 'cpu':
22
+ # CPU implementation - TurbomindEngineConfig not needed
23
+ # LMDeploy will automatically use CPU inference
24
+ self.pipeline = pipeline('ekwek/Soprano-80M',
25
+ log_level='ERROR')
26
+ else:
27
+ raise ValueError(f"Unsupported device: {device}. Must be 'cuda' or 'cpu'.")
28
 
29
  def infer(self,
30
  prompts,
31
  top_p=0.95,
32
  temperature=0.3,
33
  repetition_penalty=1.2):
34
+ gen_config = GenerationConfig(
35
+ output_last_hidden_state='generation',
36
  do_sample=True,
37
  top_p=top_p,
38
  temperature=temperature,
39
  repetition_penalty=repetition_penalty,
40
+ max_new_tokens=512
41
+ )
42
  responses = self.pipeline(prompts, gen_config=gen_config)
43
  res = []
44
  for response in responses:
 
53
  top_p=0.95,
54
  temperature=0.3,
55
  repetition_penalty=1.2):
56
+ gen_config = GenerationConfig(
57
+ output_last_hidden_state='generation',
58
  do_sample=True,
59
  top_p=top_p,
60
  temperature=temperature,
61
  repetition_penalty=repetition_penalty,
62
+ max_new_tokens=512
63
+ )
64
  responses = self.pipeline.stream_infer([prompt], gen_config=gen_config)
65
  for response in responses:
66
  yield {
67
  'finish_reason': response.finish_reason,
68
  'hidden_state': response.last_hidden_state
69
+ }