AAdonis committed on
Commit
5517bcd
·
verified ·
1 Parent(s): fd7c103

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -8
app.py CHANGED
@@ -29,6 +29,11 @@ WHISPER_FRAME_RATE = 50
29
  MODEL_ID = os.getenv("MODEL_ID", "AdoCleanCode/LAVCO-v3")
30
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
31
 
 
 
 
 
 
32
  # Global model and tokenizer (loaded once)
33
  model = None
34
  tokenizer = None
@@ -208,29 +213,47 @@ class LAVCOModel(nn.Module):
208
  with open(config_path, "r") as f:
209
  config = json.load(f)
210
 
211
- print(f"πŸ“₯ Loading LLASA from {llasa_path}...")
 
 
212
  self.llasa = AutoModelForCausalLM.from_pretrained(
213
  llasa_path,
214
  trust_remote_code=True,
215
  torch_dtype=torch.bfloat16,
216
  )
217
  self.hidden_size = self.llasa.config.hidden_size
 
 
218
 
219
- print(f"πŸ“₯ Loading Whisper encoder from {config['whisper_model']}...")
 
220
  whisper_full = WhisperModel.from_pretrained(config["whisper_model"])
221
  self.whisper = whisper_full.encoder
222
  self.whisper_dim = self.whisper.config.d_model
223
  del whisper_full
 
 
224
 
225
- print(f"πŸ“₯ Loading XCodec2 from {config['xcodec_model']}...")
 
226
  self.xcodec = XCodec2Model.from_pretrained(config["xcodec_model"])
227
  self.xcodec.eval()
 
 
228
 
 
 
229
  self.whisper_processor = WhisperFeatureExtractor.from_pretrained(config["whisper_model"])
 
 
230
 
 
 
231
  proj_state = torch.load(proj_path, map_location="cpu")
232
  self.projection = nn.Linear(self.whisper_dim, self.hidden_size)
233
  self.projection.load_state_dict(proj_state)
 
 
234
 
235
  self.u_start_id = config.get("u_start_id")
236
  self.u_end_id = config.get("u_end_id")
@@ -359,14 +382,26 @@ def load_model():
359
  global model, tokenizer
360
 
361
  if model is None:
362
- print(f"πŸ“₯ Loading model: {MODEL_ID}")
 
 
 
 
 
 
 
363
  model = LAVCOModel(MODEL_ID, device=DEVICE)
 
364
  model = model.to(DEVICE)
365
  model.eval()
366
-
367
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 
368
  model.set_special_token_ids(tokenizer)
369
- print("βœ… Model loaded")
 
 
 
370
 
371
  return model, tokenizer
372
 
@@ -536,12 +571,14 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
536
  source_audio = gr.Audio(
537
  label="Source Audio (content to convert)",
538
  type="filepath",
539
- sources=["upload", "microphone"]
 
540
  )
541
  reference_audio = gr.Audio(
542
  label="Reference Audio (target voice)",
543
  type="filepath",
544
- sources=["upload", "microphone"]
 
545
  )
546
 
547
  with gr.Column():
@@ -604,7 +641,11 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
604
  ### πŸ“– How to Use
605
 
606
  1. **Upload or record** your source audio (the speech you want to convert)
 
 
607
  2. **Upload or record** your reference audio (the voice you want to mimic)
 
 
608
  3. Adjust generation parameters if needed (defaults work well)
609
  4. Click **Convert Voice** and wait for the result
610
 
@@ -612,6 +653,7 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
612
 
613
  - Keep audio clips under 30 seconds for best results
614
  - Reference audio should be clear speech (1+ seconds recommended)
 
615
  - Higher repetition penalty helps avoid repetitive outputs
616
  - Lower temperature = more stable, higher = more creative
617
  """)
@@ -631,4 +673,28 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
631
  )
632
 
633
  if __name__ == "__main__":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
634
  demo.launch(share=False)
 
29
  MODEL_ID = os.getenv("MODEL_ID", "AdoCleanCode/LAVCO-v3")
30
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
31
 
32
+ # Default audio files (will be in examples/ directory)
33
+ EXAMPLES_DIR = "examples"
34
+ DEFAULT_SOURCE_AUDIO = os.path.join(EXAMPLES_DIR, "default_source.wav") if os.path.exists(os.path.join(EXAMPLES_DIR, "default_source.wav")) else None
35
+ DEFAULT_REFERENCE_AUDIO = os.path.join(EXAMPLES_DIR, "default_reference.wav") if os.path.exists(os.path.join(EXAMPLES_DIR, "default_reference.wav")) else None
36
+
37
  # Global model and tokenizer (loaded once)
38
  model = None
39
  tokenizer = None
 
213
  with open(config_path, "r") as f:
214
  config = json.load(f)
215
 
216
+ import sys
217
+ print(f"πŸ“₯ Loading LLASA from {llasa_path}...", flush=True)
218
+ sys.stdout.flush()
219
  self.llasa = AutoModelForCausalLM.from_pretrained(
220
  llasa_path,
221
  trust_remote_code=True,
222
  torch_dtype=torch.bfloat16,
223
  )
224
  self.hidden_size = self.llasa.config.hidden_size
225
+ print(f" βœ… LLASA loaded (hidden_size={self.hidden_size})", flush=True)
226
+ sys.stdout.flush()
227
 
228
+ print(f"πŸ“₯ Loading Whisper encoder from {config['whisper_model']}...", flush=True)
229
+ sys.stdout.flush()
230
  whisper_full = WhisperModel.from_pretrained(config["whisper_model"])
231
  self.whisper = whisper_full.encoder
232
  self.whisper_dim = self.whisper.config.d_model
233
  del whisper_full
234
+ print(f" βœ… Whisper loaded (dim={self.whisper_dim})", flush=True)
235
+ sys.stdout.flush()
236
 
237
+ print(f"πŸ“₯ Loading XCodec2 from {config['xcodec_model']}...", flush=True)
238
+ sys.stdout.flush()
239
  self.xcodec = XCodec2Model.from_pretrained(config["xcodec_model"])
240
  self.xcodec.eval()
241
+ print(f" βœ… XCodec2 loaded", flush=True)
242
+ sys.stdout.flush()
243
 
244
+ print(f"πŸ“₯ Loading Whisper processor...", flush=True)
245
+ sys.stdout.flush()
246
  self.whisper_processor = WhisperFeatureExtractor.from_pretrained(config["whisper_model"])
247
+ print(f" βœ… Whisper processor loaded", flush=True)
248
+ sys.stdout.flush()
249
 
250
+ print(f"πŸ“₯ Loading projection layer...", flush=True)
251
+ sys.stdout.flush()
252
  proj_state = torch.load(proj_path, map_location="cpu")
253
  self.projection = nn.Linear(self.whisper_dim, self.hidden_size)
254
  self.projection.load_state_dict(proj_state)
255
+ print(f" βœ… Projection layer loaded", flush=True)
256
+ sys.stdout.flush()
257
 
258
  self.u_start_id = config.get("u_start_id")
259
  self.u_end_id = config.get("u_end_id")
 
382
  global model, tokenizer
383
 
384
  if model is None:
385
+ import sys
386
+ import time
387
+
388
+ print(f"πŸ“₯ Loading model: {MODEL_ID}", flush=True)
389
+ sys.stdout.flush()
390
+
391
+ start_time = time.time()
392
+ print(" β†’ Loading LAVCO model components...", flush=True)
393
  model = LAVCOModel(MODEL_ID, device=DEVICE)
394
+ print(f" β†’ Moving model to {DEVICE}...", flush=True)
395
  model = model.to(DEVICE)
396
  model.eval()
397
+ print(f" β†’ Loading tokenizer...", flush=True)
398
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
399
+ print(f" β†’ Setting special tokens...", flush=True)
400
  model.set_special_token_ids(tokenizer)
401
+
402
+ elapsed = time.time() - start_time
403
+ print(f"βœ… Model loaded in {elapsed:.1f}s", flush=True)
404
+ sys.stdout.flush()
405
 
406
  return model, tokenizer
407
 
 
571
  source_audio = gr.Audio(
572
  label="Source Audio (content to convert)",
573
  type="filepath",
574
+ sources=["upload", "microphone"],
575
+ value=DEFAULT_SOURCE_AUDIO
576
  )
577
  reference_audio = gr.Audio(
578
  label="Reference Audio (target voice)",
579
  type="filepath",
580
+ sources=["upload", "microphone"],
581
+ value=DEFAULT_REFERENCE_AUDIO
582
  )
583
 
584
  with gr.Column():
 
641
  ### 📖 How to Use
642
 
643
  1. **Upload or record** your source audio (the speech you want to convert)
644
+ - Click the microphone icon to record directly from your microphone
645
+ - Or upload an audio file (WAV, MP3, etc.)
646
  2. **Upload or record** your reference audio (the voice you want to mimic)
647
+ - Click the microphone icon to record the target voice
648
+ - Or upload a reference audio file
649
  3. Adjust generation parameters if needed (defaults work well)
650
  4. Click **Convert Voice** and wait for the result
651
 
 
653
 
654
  - Keep audio clips under 30 seconds for best results
655
  - Reference audio should be clear speech (1+ seconds recommended)
656
+ - When recording, speak clearly and minimize background noise
657
  - Higher repetition penalty helps avoid repetitive outputs
658
  - Lower temperature = more stable, higher = more creative
659
  """)
 
673
  )
674
 
675
  if __name__ == "__main__":
676
+ import sys
677
+ print("=" * 60, flush=True)
678
+ print("πŸš€ Starting LAVCO Gradio App", flush=True)
679
+ print("=" * 60, flush=True)
680
+ print(f"Device: {DEVICE}", flush=True)
681
+ print(f"Model: {MODEL_ID}", flush=True)
682
+ sys.stdout.flush()
683
+
684
+ # Pre-load model at startup (so first user doesn't wait)
685
+ print("\n⏳ Pre-loading model (this may take a few minutes)...", flush=True)
686
+ sys.stdout.flush()
687
+ try:
688
+ load_model()
689
+ print("βœ… Model ready! Starting Gradio interface...", flush=True)
690
+ sys.stdout.flush()
691
+ except Exception as e:
692
+ print(f"⚠️ Model pre-loading failed: {e}", flush=True)
693
+ print(" Model will load on first use instead.", flush=True)
694
+ import traceback
695
+ traceback.print_exc()
696
+ sys.stdout.flush()
697
+
698
+ print("\n🌐 Launching web interface...", flush=True)
699
+ sys.stdout.flush()
700
  demo.launch(share=False)