Update soprano/backends/lmdeploy.py
soprano/backends/lmdeploy.py  CHANGED  (+26 -13)
@@ -8,26 +8,37 @@ class LMDeployModel(BaseModel):
                  device='cuda',
                  cache_size_mb=100,
                  **kwargs):
-
-
-
-
-
-
-                                 backend_config=
-
+        # LMDeploy supports both CUDA and CPU
+        self.device = device
+
+        if device == 'cuda':
+            # Original CUDA implementation with cache size optimization
+            cache_size_ratio = cache_size_mb * 1024**2 / torch.cuda.get_device_properties('cuda').total_memory
+            backend_config = TurbomindEngineConfig(cache_max_entry_count=cache_size_ratio)
+            self.pipeline = pipeline('ekwek/Soprano-80M',
+                                     log_level='ERROR',
+                                     backend_config=backend_config)
+        elif device == 'cpu':
+            # CPU implementation - TurbomindEngineConfig not needed
+            # LMDeploy will automatically use CPU inference
+            self.pipeline = pipeline('ekwek/Soprano-80M',
+                                     log_level='ERROR')
+        else:
+            raise ValueError(f"Unsupported device: {device}. Must be 'cuda' or 'cpu'.")
 
     def infer(self,
               prompts,
               top_p=0.95,
               temperature=0.3,
               repetition_penalty=1.2):
-        gen_config=GenerationConfig(
+        gen_config = GenerationConfig(
+            output_last_hidden_state='generation',
             do_sample=True,
             top_p=top_p,
             temperature=temperature,
             repetition_penalty=repetition_penalty,
-            max_new_tokens=512
+            max_new_tokens=512
+        )
         responses = self.pipeline(prompts, gen_config=gen_config)
         res = []
         for response in responses:
@@ -42,15 +53,17 @@ class LMDeployModel(BaseModel):
               top_p=0.95,
               temperature=0.3,
               repetition_penalty=1.2):
-        gen_config=GenerationConfig(
+        gen_config = GenerationConfig(
+            output_last_hidden_state='generation',
             do_sample=True,
             top_p=top_p,
             temperature=temperature,
             repetition_penalty=repetition_penalty,
-            max_new_tokens=512
+            max_new_tokens=512
+        )
         responses = self.pipeline.stream_infer([prompt], gen_config=gen_config)
         for response in responses:
             yield {
                 'finish_reason': response.finish_reason,
                 'hidden_state': response.last_hidden_state
-            }
+            }
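For reference, a minimal usage sketch of the updated backend (not part of the commit). It assumes LMDeployModel is importable from soprano.backends.lmdeploy and that the constructor takes only the keyword arguments visible in this diff; the real signature may include additional parameters not shown here.

# Usage sketch only -- the import path and argument list below are assumptions
# based on what this diff shows, not a confirmed API.
from soprano.backends.lmdeploy import LMDeployModel

# CUDA path: cache_size_mb is converted to a fraction of total GPU memory,
# e.g. 100 MB on a 24 GB card gives cache_max_entry_count of roughly 0.004.
gpu_model = LMDeployModel(device='cuda', cache_size_mb=100)

# New CPU path: no TurbomindEngineConfig; LMDeploy falls back to CPU inference.
cpu_model = LMDeployModel(device='cpu')

# Any other device string now raises ValueError.
results = cpu_model.infer(['Hello from Soprano'],
                          top_p=0.95,
                          temperature=0.3,
                          repetition_penalty=1.2)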