Spaces:

AAdonis
/

LAVCO

Sleeping

App Files Files Community

AAdonis committed on Jan 26

Commit

5517bcd

verified ·

1 Parent(s): fd7c103

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -8

app.py CHANGED Viewed

@@ -29,6 +29,11 @@ WHISPER_FRAME_RATE = 50
 MODEL_ID = os.getenv("MODEL_ID", "AdoCleanCode/LAVCO-v3")
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # Global model and tokenizer (loaded once)
 model = None
 tokenizer = None
@@ -208,29 +213,47 @@ class LAVCOModel(nn.Module):
         with open(config_path, "r") as f:
             config = json.load(f)
-        print(f"📥 Loading LLASA from {llasa_path}...")
         self.llasa = AutoModelForCausalLM.from_pretrained(
             llasa_path,
             trust_remote_code=True,
             torch_dtype=torch.bfloat16,
         )
         self.hidden_size = self.llasa.config.hidden_size
-        print(f"📥 Loading Whisper encoder from {config['whisper_model']}...")
         whisper_full = WhisperModel.from_pretrained(config["whisper_model"])
         self.whisper = whisper_full.encoder
         self.whisper_dim = self.whisper.config.d_model
         del whisper_full
-        print(f"📥 Loading XCodec2 from {config['xcodec_model']}...")
         self.xcodec = XCodec2Model.from_pretrained(config["xcodec_model"])
         self.xcodec.eval()
         self.whisper_processor = WhisperFeatureExtractor.from_pretrained(config["whisper_model"])
         proj_state = torch.load(proj_path, map_location="cpu")
         self.projection = nn.Linear(self.whisper_dim, self.hidden_size)
         self.projection.load_state_dict(proj_state)
         self.u_start_id = config.get("u_start_id")
         self.u_end_id = config.get("u_end_id")
@@ -359,14 +382,26 @@ def load_model():
     global model, tokenizer
     if model is None:
-        print(f"📥 Loading model: {MODEL_ID}")
         model = LAVCOModel(MODEL_ID, device=DEVICE)
         model = model.to(DEVICE)
         model.eval()
         tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
         model.set_special_token_ids(tokenizer)
-        print("✅ Model loaded")
     return model, tokenizer
@@ -536,12 +571,14 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
             source_audio = gr.Audio(
                 label="Source Audio (content to convert)",
                 type="filepath",
-                sources=["upload", "microphone"]
             )
             reference_audio = gr.Audio(
                 label="Reference Audio (target voice)",
                 type="filepath",
-                sources=["upload", "microphone"]
             )
         with gr.Column():
@@ -604,7 +641,11 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
     ### 📖 How to Use
     1. **Upload or record** your source audio (the speech you want to convert)
     2. **Upload or record** your reference audio (the voice you want to mimic)
     3. Adjust generation parameters if needed (defaults work well)
     4. Click **Convert Voice** and wait for the result
@@ -612,6 +653,7 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
     - Keep audio clips under 30 seconds for best results
     - Reference audio should be clear speech (1+ seconds recommended)
     - Higher repetition penalty helps avoid repetitive outputs
     - Lower temperature = more stable, higher = more creative
     """)
@@ -631,4 +673,28 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
     )
 if __name__ == "__main__":
     demo.launch(share=False)

 MODEL_ID = os.getenv("MODEL_ID", "AdoCleanCode/LAVCO-v3")
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Default audio files (will be in examples/ directory)
+EXAMPLES_DIR = "examples"
+DEFAULT_SOURCE_AUDIO = os.path.join(EXAMPLES_DIR, "default_source.wav") if os.path.exists(os.path.join(EXAMPLES_DIR, "default_source.wav")) else None
+DEFAULT_REFERENCE_AUDIO = os.path.join(EXAMPLES_DIR, "default_reference.wav") if os.path.exists(os.path.join(EXAMPLES_DIR, "default_reference.wav")) else None
 # Global model and tokenizer (loaded once)
 model = None
 tokenizer = None
         with open(config_path, "r") as f:
             config = json.load(f)
+        import sys
+        print(f"📥 Loading LLASA from {llasa_path}...", flush=True)
+        sys.stdout.flush()
         self.llasa = AutoModelForCausalLM.from_pretrained(
             llasa_path,
             trust_remote_code=True,
             torch_dtype=torch.bfloat16,
         )
         self.hidden_size = self.llasa.config.hidden_size
+        print(f"  ✅ LLASA loaded (hidden_size={self.hidden_size})", flush=True)
+        sys.stdout.flush()
+        print(f"📥 Loading Whisper encoder from {config['whisper_model']}...", flush=True)
+        sys.stdout.flush()
         whisper_full = WhisperModel.from_pretrained(config["whisper_model"])
         self.whisper = whisper_full.encoder
         self.whisper_dim = self.whisper.config.d_model
         del whisper_full
+        print(f"  ✅ Whisper loaded (dim={self.whisper_dim})", flush=True)
+        sys.stdout.flush()
+        print(f"📥 Loading XCodec2 from {config['xcodec_model']}...", flush=True)
+        sys.stdout.flush()
         self.xcodec = XCodec2Model.from_pretrained(config["xcodec_model"])
         self.xcodec.eval()
+        print(f"  ✅ XCodec2 loaded", flush=True)
+        sys.stdout.flush()
+        print(f"📥 Loading Whisper processor...", flush=True)
+        sys.stdout.flush()
         self.whisper_processor = WhisperFeatureExtractor.from_pretrained(config["whisper_model"])
+        print(f"  ✅ Whisper processor loaded", flush=True)
+        sys.stdout.flush()
+        print(f"📥 Loading projection layer...", flush=True)
+        sys.stdout.flush()
         proj_state = torch.load(proj_path, map_location="cpu")
         self.projection = nn.Linear(self.whisper_dim, self.hidden_size)
         self.projection.load_state_dict(proj_state)
+        print(f"  ✅ Projection layer loaded", flush=True)
+        sys.stdout.flush()
         self.u_start_id = config.get("u_start_id")
         self.u_end_id = config.get("u_end_id")
     global model, tokenizer
     if model is None:
+        import sys
+        import time
+        print(f"📥 Loading model: {MODEL_ID}", flush=True)
+        sys.stdout.flush()
+        start_time = time.time()
+        print("  → Loading LAVCO model components...", flush=True)
         model = LAVCOModel(MODEL_ID, device=DEVICE)
+        print(f"  → Moving model to {DEVICE}...", flush=True)
         model = model.to(DEVICE)
         model.eval()
+        print(f"  → Loading tokenizer...", flush=True)
         tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+        print(f"  → Setting special tokens...", flush=True)
         model.set_special_token_ids(tokenizer)
+        elapsed = time.time() - start_time
+        print(f"✅ Model loaded in {elapsed:.1f}s", flush=True)
+        sys.stdout.flush()
     return model, tokenizer
             source_audio = gr.Audio(
                 label="Source Audio (content to convert)",
                 type="filepath",
+                sources=["upload", "microphone"],
+                value=DEFAULT_SOURCE_AUDIO
             )
             reference_audio = gr.Audio(
                 label="Reference Audio (target voice)",
                 type="filepath",
+                sources=["upload", "microphone"],
+                value=DEFAULT_REFERENCE_AUDIO
             )
         with gr.Column():
     ### 📖 How to Use
     1. **Upload or record** your source audio (the speech you want to convert)
+       - Click the microphone icon to record directly from your microphone
+       - Or upload an audio file (WAV, MP3, etc.)
     2. **Upload or record** your reference audio (the voice you want to mimic)
+       - Click the microphone icon to record the target voice
+       - Or upload a reference audio file
     3. Adjust generation parameters if needed (defaults work well)
     4. Click **Convert Voice** and wait for the result
     - Keep audio clips under 30 seconds for best results
     - Reference audio should be clear speech (1+ seconds recommended)
+    - When recording, speak clearly and minimize background noise
     - Higher repetition penalty helps avoid repetitive outputs
     - Lower temperature = more stable, higher = more creative
     """)
     )
 if __name__ == "__main__":
+    import sys
+    print("=" * 60, flush=True)
+    print("🚀 Starting LAVCO Gradio App", flush=True)
+    print("=" * 60, flush=True)
+    print(f"Device: {DEVICE}", flush=True)
+    print(f"Model: {MODEL_ID}", flush=True)
+    sys.stdout.flush()
+    # Pre-load model at startup (so first user doesn't wait)
+    print("\n⏳ Pre-loading model (this may take a few minutes)...", flush=True)
+    sys.stdout.flush()
+    try:
+        load_model()
+        print("✅ Model ready! Starting Gradio interface...", flush=True)
+        sys.stdout.flush()
+    except Exception as e:
+        print(f"⚠️ Model pre-loading failed: {e}", flush=True)
+        print("   Model will load on first use instead.", flush=True)
+        import traceback
+        traceback.print_exc()
+        sys.stdout.flush()
+    print("\n🌐 Launching web interface...", flush=True)
+    sys.stdout.flush()
     demo.launch(share=False)