Update app.py
Browse files
app.py
CHANGED
|
@@ -29,6 +29,11 @@ WHISPER_FRAME_RATE = 50
|
|
| 29 |
MODEL_ID = os.getenv("MODEL_ID", "AdoCleanCode/LAVCO-v3")
|
| 30 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
# Global model and tokenizer (loaded once)
|
| 33 |
model = None
|
| 34 |
tokenizer = None
|
|
@@ -208,29 +213,47 @@ class LAVCOModel(nn.Module):
|
|
| 208 |
with open(config_path, "r") as f:
|
| 209 |
config = json.load(f)
|
| 210 |
|
| 211 |
-
|
|
|
|
|
|
|
| 212 |
self.llasa = AutoModelForCausalLM.from_pretrained(
|
| 213 |
llasa_path,
|
| 214 |
trust_remote_code=True,
|
| 215 |
torch_dtype=torch.bfloat16,
|
| 216 |
)
|
| 217 |
self.hidden_size = self.llasa.config.hidden_size
|
|
|
|
|
|
|
| 218 |
|
| 219 |
-
print(f"📥 Loading Whisper encoder from {config['whisper_model']}...")
|
|
|
|
| 220 |
whisper_full = WhisperModel.from_pretrained(config["whisper_model"])
|
| 221 |
self.whisper = whisper_full.encoder
|
| 222 |
self.whisper_dim = self.whisper.config.d_model
|
| 223 |
del whisper_full
|
|
|
|
|
|
|
| 224 |
|
| 225 |
-
print(f"📥 Loading XCodec2 from {config['xcodec_model']}...")
|
|
|
|
| 226 |
self.xcodec = XCodec2Model.from_pretrained(config["xcodec_model"])
|
| 227 |
self.xcodec.eval()
|
|
|
|
|
|
|
| 228 |
|
|
|
|
|
|
|
| 229 |
self.whisper_processor = WhisperFeatureExtractor.from_pretrained(config["whisper_model"])
|
|
|
|
|
|
|
| 230 |
|
|
|
|
|
|
|
| 231 |
proj_state = torch.load(proj_path, map_location="cpu")
|
| 232 |
self.projection = nn.Linear(self.whisper_dim, self.hidden_size)
|
| 233 |
self.projection.load_state_dict(proj_state)
|
|
|
|
|
|
|
| 234 |
|
| 235 |
self.u_start_id = config.get("u_start_id")
|
| 236 |
self.u_end_id = config.get("u_end_id")
|
|
@@ -359,14 +382,26 @@ def load_model():
|
|
| 359 |
global model, tokenizer
|
| 360 |
|
| 361 |
if model is None:
|
| 362 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 363 |
model = LAVCOModel(MODEL_ID, device=DEVICE)
|
|
|
|
| 364 |
model = model.to(DEVICE)
|
| 365 |
model.eval()
|
| 366 |
-
|
| 367 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
|
|
|
| 368 |
model.set_special_token_ids(tokenizer)
|
| 369 |
-
|
|
|
|
|
|
|
|
|
|
| 370 |
|
| 371 |
return model, tokenizer
|
| 372 |
|
|
@@ -536,12 +571,14 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
|
|
| 536 |
source_audio = gr.Audio(
|
| 537 |
label="Source Audio (content to convert)",
|
| 538 |
type="filepath",
|
| 539 |
-
sources=["upload", "microphone"]
|
|
|
|
| 540 |
)
|
| 541 |
reference_audio = gr.Audio(
|
| 542 |
label="Reference Audio (target voice)",
|
| 543 |
type="filepath",
|
| 544 |
-
sources=["upload", "microphone"]
|
|
|
|
| 545 |
)
|
| 546 |
|
| 547 |
with gr.Column():
|
|
@@ -604,7 +641,11 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
|
|
| 604 |
### π How to Use
|
| 605 |
|
| 606 |
1. **Upload or record** your source audio (the speech you want to convert)
|
|
|
|
|
|
|
| 607 |
2. **Upload or record** your reference audio (the voice you want to mimic)
|
|
|
|
|
|
|
| 608 |
3. Adjust generation parameters if needed (defaults work well)
|
| 609 |
4. Click **Convert Voice** and wait for the result
|
| 610 |
|
|
@@ -612,6 +653,7 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
|
|
| 612 |
|
| 613 |
- Keep audio clips under 30 seconds for best results
|
| 614 |
- Reference audio should be clear speech (1+ seconds recommended)
|
|
|
|
| 615 |
- Higher repetition penalty helps avoid repetitive outputs
|
| 616 |
- Lower temperature = more stable, higher = more creative
|
| 617 |
""")
|
|
@@ -631,4 +673,28 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
|
|
| 631 |
)
|
| 632 |
|
| 633 |
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 634 |
demo.launch(share=False)
|
|
|
|
| 29 |
MODEL_ID = os.getenv("MODEL_ID", "AdoCleanCode/LAVCO-v3")
|
| 30 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 31 |
|
| 32 |
+
# Default audio files (will be in examples/ directory)
|
| 33 |
+
EXAMPLES_DIR = "examples"
|
| 34 |
+
DEFAULT_SOURCE_AUDIO = os.path.join(EXAMPLES_DIR, "default_source.wav") if os.path.exists(os.path.join(EXAMPLES_DIR, "default_source.wav")) else None
|
| 35 |
+
DEFAULT_REFERENCE_AUDIO = os.path.join(EXAMPLES_DIR, "default_reference.wav") if os.path.exists(os.path.join(EXAMPLES_DIR, "default_reference.wav")) else None
|
| 36 |
+
|
| 37 |
# Global model and tokenizer (loaded once)
|
| 38 |
model = None
|
| 39 |
tokenizer = None
|
|
|
|
| 213 |
with open(config_path, "r") as f:
|
| 214 |
config = json.load(f)
|
| 215 |
|
| 216 |
+
import sys
|
| 217 |
+
print(f"📥 Loading LLASA from {llasa_path}...", flush=True)
|
| 218 |
+
sys.stdout.flush()
|
| 219 |
self.llasa = AutoModelForCausalLM.from_pretrained(
|
| 220 |
llasa_path,
|
| 221 |
trust_remote_code=True,
|
| 222 |
torch_dtype=torch.bfloat16,
|
| 223 |
)
|
| 224 |
self.hidden_size = self.llasa.config.hidden_size
|
| 225 |
+
print(f" ✅ LLASA loaded (hidden_size={self.hidden_size})", flush=True)
|
| 226 |
+
sys.stdout.flush()
|
| 227 |
|
| 228 |
+
print(f"📥 Loading Whisper encoder from {config['whisper_model']}...", flush=True)
|
| 229 |
+
sys.stdout.flush()
|
| 230 |
whisper_full = WhisperModel.from_pretrained(config["whisper_model"])
|
| 231 |
self.whisper = whisper_full.encoder
|
| 232 |
self.whisper_dim = self.whisper.config.d_model
|
| 233 |
del whisper_full
|
| 234 |
+
print(f" ✅ Whisper loaded (dim={self.whisper_dim})", flush=True)
|
| 235 |
+
sys.stdout.flush()
|
| 236 |
|
| 237 |
+
print(f"📥 Loading XCodec2 from {config['xcodec_model']}...", flush=True)
|
| 238 |
+
sys.stdout.flush()
|
| 239 |
self.xcodec = XCodec2Model.from_pretrained(config["xcodec_model"])
|
| 240 |
self.xcodec.eval()
|
| 241 |
+
print(f" ✅ XCodec2 loaded", flush=True)
|
| 242 |
+
sys.stdout.flush()
|
| 243 |
|
| 244 |
+
print(f"📥 Loading Whisper processor...", flush=True)
|
| 245 |
+
sys.stdout.flush()
|
| 246 |
self.whisper_processor = WhisperFeatureExtractor.from_pretrained(config["whisper_model"])
|
| 247 |
+
print(f" ✅ Whisper processor loaded", flush=True)
|
| 248 |
+
sys.stdout.flush()
|
| 249 |
|
| 250 |
+
print(f"📥 Loading projection layer...", flush=True)
|
| 251 |
+
sys.stdout.flush()
|
| 252 |
proj_state = torch.load(proj_path, map_location="cpu")
|
| 253 |
self.projection = nn.Linear(self.whisper_dim, self.hidden_size)
|
| 254 |
self.projection.load_state_dict(proj_state)
|
| 255 |
+
print(f" ✅ Projection layer loaded", flush=True)
|
| 256 |
+
sys.stdout.flush()
|
| 257 |
|
| 258 |
self.u_start_id = config.get("u_start_id")
|
| 259 |
self.u_end_id = config.get("u_end_id")
|
|
|
|
| 382 |
global model, tokenizer
|
| 383 |
|
| 384 |
if model is None:
|
| 385 |
+
import sys
|
| 386 |
+
import time
|
| 387 |
+
|
| 388 |
+
print(f"📥 Loading model: {MODEL_ID}", flush=True)
|
| 389 |
+
sys.stdout.flush()
|
| 390 |
+
|
| 391 |
+
start_time = time.time()
|
| 392 |
+
print(" β Loading LAVCO model components...", flush=True)
|
| 393 |
model = LAVCOModel(MODEL_ID, device=DEVICE)
|
| 394 |
+
print(f" β Moving model to {DEVICE}...", flush=True)
|
| 395 |
model = model.to(DEVICE)
|
| 396 |
model.eval()
|
| 397 |
+
print(f" β Loading tokenizer...", flush=True)
|
| 398 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
| 399 |
+
print(f" β Setting special tokens...", flush=True)
|
| 400 |
model.set_special_token_ids(tokenizer)
|
| 401 |
+
|
| 402 |
+
elapsed = time.time() - start_time
|
| 403 |
+
print(f"✅ Model loaded in {elapsed:.1f}s", flush=True)
|
| 404 |
+
sys.stdout.flush()
|
| 405 |
|
| 406 |
return model, tokenizer
|
| 407 |
|
|
|
|
| 571 |
source_audio = gr.Audio(
|
| 572 |
label="Source Audio (content to convert)",
|
| 573 |
type="filepath",
|
| 574 |
+
sources=["upload", "microphone"],
|
| 575 |
+
value=DEFAULT_SOURCE_AUDIO
|
| 576 |
)
|
| 577 |
reference_audio = gr.Audio(
|
| 578 |
label="Reference Audio (target voice)",
|
| 579 |
type="filepath",
|
| 580 |
+
sources=["upload", "microphone"],
|
| 581 |
+
value=DEFAULT_REFERENCE_AUDIO
|
| 582 |
)
|
| 583 |
|
| 584 |
with gr.Column():
|
|
|
|
| 641 |
### π How to Use
|
| 642 |
|
| 643 |
1. **Upload or record** your source audio (the speech you want to convert)
|
| 644 |
+
- Click the microphone icon to record directly from your microphone
|
| 645 |
+
- Or upload an audio file (WAV, MP3, etc.)
|
| 646 |
2. **Upload or record** your reference audio (the voice you want to mimic)
|
| 647 |
+
- Click the microphone icon to record the target voice
|
| 648 |
+
- Or upload a reference audio file
|
| 649 |
3. Adjust generation parameters if needed (defaults work well)
|
| 650 |
4. Click **Convert Voice** and wait for the result
|
| 651 |
|
|
|
|
| 653 |
|
| 654 |
- Keep audio clips under 30 seconds for best results
|
| 655 |
- Reference audio should be clear speech (1+ seconds recommended)
|
| 656 |
+
- When recording, speak clearly and minimize background noise
|
| 657 |
- Higher repetition penalty helps avoid repetitive outputs
|
| 658 |
- Lower temperature = more stable, higher = more creative
|
| 659 |
""")
|
|
|
|
| 673 |
)
|
| 674 |
|
| 675 |
if __name__ == "__main__":
|
| 676 |
+
import sys
|
| 677 |
+
print("=" * 60, flush=True)
|
| 678 |
+
print("π Starting LAVCO Gradio App", flush=True)
|
| 679 |
+
print("=" * 60, flush=True)
|
| 680 |
+
print(f"Device: {DEVICE}", flush=True)
|
| 681 |
+
print(f"Model: {MODEL_ID}", flush=True)
|
| 682 |
+
sys.stdout.flush()
|
| 683 |
+
|
| 684 |
+
# Pre-load model at startup (so first user doesn't wait)
|
| 685 |
+
print("\n⏳ Pre-loading model (this may take a few minutes)...", flush=True)
|
| 686 |
+
sys.stdout.flush()
|
| 687 |
+
try:
|
| 688 |
+
load_model()
|
| 689 |
+
print("✅ Model ready! Starting Gradio interface...", flush=True)
|
| 690 |
+
sys.stdout.flush()
|
| 691 |
+
except Exception as e:
|
| 692 |
+
print(f"⚠️ Model pre-loading failed: {e}", flush=True)
|
| 693 |
+
print(" Model will load on first use instead.", flush=True)
|
| 694 |
+
import traceback
|
| 695 |
+
traceback.print_exc()
|
| 696 |
+
sys.stdout.flush()
|
| 697 |
+
|
| 698 |
+
print("\nπ Launching web interface...", flush=True)
|
| 699 |
+
sys.stdout.flush()
|
| 700 |
demo.launch(share=False)
|