Spaces:
Running
on
Zero
Running
on
Zero
fix log
Browse files
app.py
CHANGED
|
@@ -36,6 +36,7 @@ MODEL_PATH = os.getenv("MODEL_PATH", os.path.join(MODELS_DIR, DEFAULT_MODEL))
|
|
| 36 |
VOCODER_PATH = os.getenv("VOCODER_PATH", None)
|
| 37 |
USE_GPU = os.getenv("USE_GPU", "false").lower() == "true"
|
| 38 |
SAMPLE_RATE = 22050
|
|
|
|
| 39 |
|
| 40 |
|
| 41 |
def get_available_models():
|
|
@@ -93,8 +94,9 @@ def process_japanese_text(text: str):
|
|
| 93 |
phonemes = phonemes.replace(" ", "")
|
| 94 |
phonemes = phonemes.replace("pau", " ")
|
| 95 |
|
| 96 |
-
|
| 97 |
-
|
|
|
|
| 98 |
|
| 99 |
# Text to sequence
|
| 100 |
sequence = text_to_sequence(phonemes)
|
|
@@ -136,7 +138,8 @@ class ONNXModelManager:
|
|
| 136 |
|
| 137 |
def _load_model(self):
|
| 138 |
"""Load ONNX model(s)"""
|
| 139 |
-
|
|
|
|
| 140 |
self.model = ort.InferenceSession(self.model_path, providers=self.providers)
|
| 141 |
|
| 142 |
model_inputs = self.model.get_inputs()
|
|
@@ -145,12 +148,14 @@ class ONNXModelManager:
|
|
| 145 |
self.is_multi_speaker = len(model_inputs) == 4
|
| 146 |
self.has_vocoder_embedded = model_outputs[0].name == "wav"
|
| 147 |
|
| 148 |
-
|
| 149 |
-
|
|
|
|
| 150 |
|
| 151 |
# Load external vocoder if needed
|
| 152 |
if not self.has_vocoder_embedded and self.vocoder_path:
|
| 153 |
-
|
|
|
|
| 154 |
self.vocoder = ort.InferenceSession(self.vocoder_path, providers=self.providers)
|
| 155 |
|
| 156 |
def synthesize(
|
|
@@ -204,7 +209,8 @@ def get_model_manager(model_name: str) -> ONNXModelManager:
|
|
| 204 |
model_path = os.path.join(MODELS_DIR, model_name)
|
| 205 |
|
| 206 |
if model_name not in model_managers:
|
| 207 |
-
|
|
|
|
| 208 |
model_managers[model_name] = ONNXModelManager(
|
| 209 |
model_path=model_path,
|
| 210 |
vocoder_path=VOCODER_PATH,
|
|
@@ -216,10 +222,12 @@ def get_model_manager(model_name: str) -> ONNXModelManager:
|
|
| 216 |
|
| 217 |
|
| 218 |
# Pre-load all available models
|
| 219 |
-
|
|
|
|
| 220 |
for model_name in get_available_models():
|
| 221 |
get_model_manager(model_name)
|
| 222 |
-
|
|
|
|
| 223 |
|
| 224 |
# ============================================================================
|
| 225 |
# Gradio Interface Functions
|
|
@@ -274,9 +282,10 @@ def synthesise(
|
|
| 274 |
audio_duration_sec = len(audio) / SAMPLE_RATE
|
| 275 |
rtf = inference_time / audio_duration_sec
|
| 276 |
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
|
|
|
| 280 |
|
| 281 |
# Save to temporary file
|
| 282 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
|
|
@@ -445,4 +454,4 @@ if __name__ == "__main__":
|
|
| 445 |
server_port=7860,
|
| 446 |
share=False,
|
| 447 |
show_error=True
|
| 448 |
-
)
|
|
|
|
| 36 |
VOCODER_PATH = os.getenv("VOCODER_PATH", None)
|
| 37 |
USE_GPU = os.getenv("USE_GPU", "false").lower() == "true"
|
| 38 |
SAMPLE_RATE = 22050
|
| 39 |
+
DEBUG = os.getenv("DEBUG", "false").lower() == "true"
|
| 40 |
|
| 41 |
|
| 42 |
def get_available_models():
|
|
|
|
| 94 |
phonemes = phonemes.replace(" ", "")
|
| 95 |
phonemes = phonemes.replace("pau", " ")
|
| 96 |
|
| 97 |
+
if DEBUG:
|
| 98 |
+
print(f"Input: {text}")
|
| 99 |
+
print(f"Phonemes: {phonemes}")
|
| 100 |
|
| 101 |
# Text to sequence
|
| 102 |
sequence = text_to_sequence(phonemes)
|
|
|
|
| 138 |
|
| 139 |
def _load_model(self):
|
| 140 |
"""Load ONNX model(s)"""
|
| 141 |
+
if DEBUG:
|
| 142 |
+
print(f"Loading model from {self.model_path} with providers {self.providers}")
|
| 143 |
self.model = ort.InferenceSession(self.model_path, providers=self.providers)
|
| 144 |
|
| 145 |
model_inputs = self.model.get_inputs()
|
|
|
|
| 148 |
self.is_multi_speaker = len(model_inputs) == 4
|
| 149 |
self.has_vocoder_embedded = model_outputs[0].name == "wav"
|
| 150 |
|
| 151 |
+
if DEBUG:
|
| 152 |
+
print(f"Model loaded: multi_speaker={self.is_multi_speaker}, "
|
| 153 |
+
f"vocoder_embedded={self.has_vocoder_embedded}")
|
| 154 |
|
| 155 |
# Load external vocoder if needed
|
| 156 |
if not self.has_vocoder_embedded and self.vocoder_path:
|
| 157 |
+
if DEBUG:
|
| 158 |
+
print(f"Loading external vocoder from {self.vocoder_path}")
|
| 159 |
self.vocoder = ort.InferenceSession(self.vocoder_path, providers=self.providers)
|
| 160 |
|
| 161 |
def synthesize(
|
|
|
|
| 209 |
model_path = os.path.join(MODELS_DIR, model_name)
|
| 210 |
|
| 211 |
if model_name not in model_managers:
|
| 212 |
+
if DEBUG:
|
| 213 |
+
print(f"Loading new model: {model_name}")
|
| 214 |
model_managers[model_name] = ONNXModelManager(
|
| 215 |
model_path=model_path,
|
| 216 |
vocoder_path=VOCODER_PATH,
|
|
|
|
| 222 |
|
| 223 |
|
| 224 |
# Pre-load all available models
|
| 225 |
+
if DEBUG:
|
| 226 |
+
print("Pre-loading all models for ZeroGPU...")
|
| 227 |
for model_name in get_available_models():
|
| 228 |
get_model_manager(model_name)
|
| 229 |
+
if DEBUG:
|
| 230 |
+
print("All models loaded.")
|
| 231 |
|
| 232 |
# ============================================================================
|
| 233 |
# Gradio Interface Functions
|
|
|
|
| 282 |
audio_duration_sec = len(audio) / SAMPLE_RATE
|
| 283 |
rtf = inference_time / audio_duration_sec
|
| 284 |
|
| 285 |
+
if DEBUG:
|
| 286 |
+
print(f"Inference time: {inference_time:.3f}s, "
|
| 287 |
+
f"Audio duration: {audio_duration_sec:.3f}s, "
|
| 288 |
+
f"RTF: {rtf:.3f}")
|
| 289 |
|
| 290 |
# Save to temporary file
|
| 291 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
|
|
|
|
| 454 |
server_port=7860,
|
| 455 |
share=False,
|
| 456 |
show_error=True
|
| 457 |
+
)
|