Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -31,7 +31,13 @@ from loguru import logger
 from fish_speech.i18n import i18n
 from fish_speech.inference_engine import TTSInferenceEngine
 from fish_speech.models.dac.inference import load_model as load_decoder_model
-from fish_speech.models.text2semantic.inference import launch_thread_safe_queue
+from fish_speech.models.text2semantic.inference import (
+    launch_thread_safe_queue,
+    load_model as load_llama_model,
+    generate_long,
+    GenerateRequest,
+    WrappedGenerateResponse
+)
 from tools.webui.inference import get_inference_wrapper
 from fish_speech.utils.schema import ServeTTSRequest
 
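Reviewer note: the widened import pulls in the low-level generation API used by the new worker below. Judging purely from how the names are used in this diff (`req.request`, `req.response_queue`, and `WrappedGenerateResponse(status=..., response=...)`), the two message types behave roughly like this sketch; the real definitions live in `fish_speech.models.text2semantic.inference`, so treat the field layout as inferred, not authoritative:

# Inferred sketch of the queue message types; not the fish_speech source.
from dataclasses import dataclass
from queue import Queue
from typing import Any, Literal


@dataclass
class GenerateRequest:
    request: dict          # kwargs forwarded to generate_long (text, top_p, ...)
    response_queue: Queue  # receives one WrappedGenerateResponse per chunk


@dataclass
class WrappedGenerateResponse:
    status: Literal["success", "error"]
    response: Any  # a generated chunk on success, the raised exception on error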
@@ -63,10 +69,10 @@ The model running in this WebUI is OpenAudio S1 Mini.
 TEXTBOX_PLACEHOLDER = """Put your text here. 在此处输入文本."""
 
 try:
-
     GPU_DECORATOR = spaces.GPU
+    IS_SPACES = True
 except ImportError:
-
+    IS_SPACES = False
     def GPU_DECORATOR(func):
         def wrapper(*args, **kwargs):
             return func(*args, **kwargs)
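Reviewer note: the `try`/`except ImportError` above is a standard import guard. On Hugging Face Spaces the `spaces` package is installed and `spaces.GPU` schedules each decorated call onto a ZeroGPU device; everywhere else the fallback makes `GPU_DECORATOR` a pass-through. A minimal self-contained illustration of the same pattern (`heavy_fn` is a made-up name, and the `return wrapper` shown here falls outside the hunk above):

try:
    import spaces  # present on Hugging Face Spaces

    GPU_DECORATOR = spaces.GPU
    IS_SPACES = True
except ImportError:
    IS_SPACES = False

    def GPU_DECORATOR(func):
        # Pass-through fallback for local runs without the spaces package
        def wrapper(*args, **kwargs):
            return func(*args, **kwargs)

        return wrapper


@GPU_DECORATOR
def heavy_fn(x):
    return x * 2  # stand-in for GPU-bound work


print(heavy_fn(21))  # prints 42 in both environments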
@@ -263,24 +269,69 @@ def parse_args():
 
     return parser.parse_args()
 
+class SynchronousLlamaWorker:
+    def __init__(self, checkpoint_path, precision, compile):
+        self.model, self.decode_one_token = load_llama_model(
+            checkpoint_path, "cpu", precision, compile=compile
+        )
+
+    def put(self, req: GenerateRequest):
+        request_args = req.request
+        response_queue = req.response_queue
+
+        # Move model to CUDA for inference
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.model.to(device)
+
+        # Setup caches
+        with torch.device(device):
+            self.model.setup_caches(
+                max_batch_size=1,
+                max_seq_len=self.model.config.max_seq_len,
+                dtype=next(self.model.parameters()).dtype,
+            )
+
+        request_args['device'] = device
+
+        try:
+            for chunk in generate_long(
+                model=self.model, decode_one_token=self.decode_one_token, **request_args
+            ):
+                response_queue.put(
+                    WrappedGenerateResponse(status="success", response=chunk)
+                )
+        except Exception as e:
+            response_queue.put(WrappedGenerateResponse(status="error", response=e))
 
 if __name__ == "__main__":
     args = parse_args()
     args.precision = torch.half if args.half else torch.bfloat16
 
     logger.info("Loading Llama model...")
-    llama_queue = launch_thread_safe_queue(
-        checkpoint_path=args.llama_checkpoint_path,
-        device=args.device,
-        precision=args.precision,
-        compile=args.compile,
-    )
+
+    # If running in a Spaces environment, we use a synchronous worker and lazy loading
+    if IS_SPACES:
+        llama_queue = SynchronousLlamaWorker(
+            checkpoint_path=args.llama_checkpoint_path,
+            precision=args.precision,
+            compile=args.compile,
+        )
+        device = "cpu"
+    else:
+        llama_queue = launch_thread_safe_queue(
+            checkpoint_path=args.llama_checkpoint_path,
+            device=args.device,
+            precision=args.precision,
+            compile=args.compile,
+        )
+        device = args.device
+
     logger.info("Llama model loaded, loading VQ-GAN model...")
 
     decoder_model = load_decoder_model(
         config_name=args.decoder_config_name,
         checkpoint_path=args.decoder_checkpoint_path,
-        device=args.device,
+        device=device,
     )
 
     logger.info("Decoder model loaded, warming up...")
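Design note: `SynchronousLlamaWorker` deliberately mirrors the `put()` interface of the object returned by `launch_thread_safe_queue`, so the inference engine can drive either one unchanged. The difference is that generation runs inline, inside the GPU-decorated request where ZeroGPU actually attaches a CUDA device, rather than on a background thread, and the model is loaded on CPU once and moved to CUDA lazily per request. A rough sketch of the shared calling convention (the request kwargs are illustrative; the real call site lives inside the engine):

from queue import Queue

# llama_queue may be a SynchronousLlamaWorker or the thread-safe queue;
# both accept a GenerateRequest via .put().
response_queue: Queue = Queue()
llama_queue.put(
    GenerateRequest(
        request={"text": "Hello world.", "max_new_tokens": 1024},  # illustrative
        response_queue=response_queue,
    )
)

result = response_queue.get()  # one WrappedGenerateResponse per generated chunk
if result.status == "error":
    raise result.response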
@@ -294,25 +345,38 @@ if __name__ == "__main__":
     )
 
     # Dry run to check if the model is loaded correctly and avoid the first-time latency
-    list(
-        inference_engine.inference(
-            ServeTTSRequest(
-                text="Hello world.",
-                references=[],
-                reference_id=None,
-                max_new_tokens=1024,
-                chunk_length=200,
-                top_p=0.7,
-                repetition_penalty=1.5,
-                temperature=0.7,
-                format="wav",
+    # Skip dry run in Spaces to avoid using quota or crashing due to GPU access in main process
+    if not IS_SPACES:
+        list(
+            inference_engine.inference(
+                ServeTTSRequest(
+                    text="Hello world.",
+                    references=[],
+                    reference_id=None,
+                    max_new_tokens=1024,
+                    chunk_length=200,
+                    top_p=0.7,
+                    repetition_penalty=1.5,
+                    temperature=0.7,
+                    format="wav",
+                )
             )
         )
-    )
 
     logger.info("Warming up done, launching the web UI...")
 
     inference_fct = get_inference_wrapper(inference_engine)
 
-    app = build_app(inference_fct, args.theme)
+    # Decorate the inference function with GPU access if in Spaces
+    if IS_SPACES:
+        @GPU_DECORATOR
+        def gpu_inference_wrapper(*args, **kwargs):
+            decoder_model.to("cuda")
+            return inference_fct(*args, **kwargs)
+
+        final_inference_fct = gpu_inference_wrapper
+    else:
+        final_inference_fct = inference_fct
+
+    app = build_app(final_inference_fct, args.theme)
     app.queue(api_open=True).launch(show_error=True, show_api=True)
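Reviewer note: the remaining changes all follow from how ZeroGPU (the "Running on Zero" badge above) works: the main process never holds a GPU, and a CUDA device is attached only while a `spaces.GPU`-decorated function executes. That is why the warm-up dry run is skipped on Spaces (it would run undecorated in the main process) and why both the decoder and the Llama model are moved to CUDA inside the decorated call. A minimal standalone ZeroGPU app following the same lifecycle (toy model, illustrative names):

import gradio as gr
import spaces  # ZeroGPU helper; this sketch assumes it runs on a Space
import torch

model = torch.nn.Linear(4, 4)  # loaded on CPU in the main process


@spaces.GPU  # a CUDA device exists only inside this call
def predict(text: str) -> str:
    model.to("cuda")  # lazy move, mirroring decoder_model.to("cuda") above
    x = torch.randn(1, 4, device="cuda")
    return f"{text}: {model(x).norm().item():.4f}"


gr.Interface(predict, gr.Textbox(), gr.Textbox()).launch()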