# LAVCO / app.py
# AAdonis — "Update app.py" (commit f993740, verified)
"""
LAVCO Gradio App for HuggingFace Spaces
A beautiful web interface for voice conversion using LAVCO (Llasa-VC).
"""
import os
import re
import tempfile
import gradio as gr
import torch
import torch.nn as nn
import numpy as np
import soundfile as sf
import librosa
from typing import List, Optional, Dict, Tuple
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
WhisperModel,
WhisperFeatureExtractor,
)
# --- Constants ---
# Both XCodec2 and the Whisper encoder produce 50 frames per second of audio,
# so token counts convert to seconds by dividing by 50.
XCODEC2_FRAME_RATE = 50
WHISPER_FRAME_RATE = 50

# --- Model configuration ---
# MODEL_ID can be overridden via an environment variable (e.g. a Space setting).
MODEL_ID = os.getenv("MODEL_ID", "AdoCleanCode/LAVCO-v3")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Default audio files (will be in examples/ directory)
EXAMPLES_DIR = "examples"
DEFAULT_SOURCE_PATH = os.path.join(EXAMPLES_DIR, "sample1_source.wav")
DEFAULT_REFERENCE_PATH = os.path.join(EXAMPLES_DIR, "sample1_reference.wav")

# Check if files exist and use absolute paths, so the Gradio Examples widget
# resolves them regardless of the working directory. Missing files are only
# warned about — the app still runs without examples.
DEFAULT_SOURCE_AUDIO = None
DEFAULT_REFERENCE_AUDIO = None
if os.path.exists(DEFAULT_SOURCE_PATH):
    DEFAULT_SOURCE_AUDIO = os.path.abspath(DEFAULT_SOURCE_PATH)
    print(f"βœ… Found default source audio: {DEFAULT_SOURCE_AUDIO}", flush=True)
else:
    print(f"⚠️ Default source audio not found: {DEFAULT_SOURCE_PATH}", flush=True)
if os.path.exists(DEFAULT_REFERENCE_PATH):
    DEFAULT_REFERENCE_AUDIO = os.path.abspath(DEFAULT_REFERENCE_PATH)
    print(f"βœ… Found default reference audio: {DEFAULT_REFERENCE_AUDIO}", flush=True)
else:
    print(f"⚠️ Default reference audio not found: {DEFAULT_REFERENCE_PATH}", flush=True)

# Global model and tokenizer, populated once by load_model() and reused by
# every request.
model = None
tokenizer = None
class SpeechOnlyLogitsProcessor:
    """Logits processor that restricts decoding to XCodec2 speech tokens.

    A boolean vocabulary mask is built once at construction time: only tokens
    of the form ``<|s_<digits>|>`` plus the custom EOS id are allowed; every
    other token's score is forced to ``-inf`` at each decoding step.
    """

    # Matches XCodec2 speech tokens such as <|s_0|> ... <|s_65535|>.
    _SPEECH_TOKEN_RE = re.compile(r"^<\|s_\d+\|>$")

    def __init__(self, tokenizer, eos_id: int):
        self.allowed = torch.zeros(len(tokenizer), dtype=torch.bool)
        for token, token_id in tokenizer.get_vocab().items():
            if self._SPEECH_TOKEN_RE.match(token):
                self.allowed[token_id] = True
        # The generation-end token must stay reachable so decoding can stop.
        self.allowed[eos_id] = True

    def __call__(self, input_ids, scores):
        # Cache the mask on the scores' device: the original moved it on every
        # decoding step, which is a needless host->device copy per token.
        if self.allowed.device != scores.device:
            self.allowed = self.allowed.to(scores.device)
        return scores.masked_fill(~self.allowed, float("-inf"))
def apply_repetition_penalty(logits: torch.Tensor, generated_ids: List[int], penalty: float = 1.2, window: int = 5):
    """Penalize tokens repeated within the most recent generation window.

    A token seen ``k > 1`` times in the last ``window`` generated ids gets its
    logit scaled by ``penalty ** (k - 1)`` — divided when positive, multiplied
    when negative — so both signs move toward "less likely".  ``logits`` is
    modified in place and also returned.
    """
    # Penalty disabled, or not enough history to repeat anything: no-op.
    if penalty == 1.0 or len(generated_ids) < 2:
        return logits

    # Count occurrences within the trailing window. A plain negative slice
    # already yields the whole list when it is shorter than the window.
    occurrences: Dict[int, int] = {}
    for tid in generated_ids[-window:]:
        occurrences[tid] = occurrences.get(tid, 0) + 1

    for tid, seen in occurrences.items():
        if seen <= 1:
            continue
        scale = penalty ** (seen - 1)
        # Dividing positive logits / multiplying negative ones both reduce
        # the token's post-softmax probability.
        if logits[0, tid] > 0:
            logits[0, tid] /= scale
        else:
            logits[0, tid] *= scale
    return logits
def sample_with_temperature_and_top_p(logits: torch.Tensor, temperature: float = 1.0, top_p: float = 0.9):
    """Draw one token id from ``logits`` via temperature + nucleus sampling.

    Temperature rescales the logits before the softmax; nucleus (top-p)
    filtering then zeroes every token outside the smallest probability mass
    reaching ``top_p`` (always keeping at least the most likely token) and
    renormalizes before sampling with ``torch.multinomial``.
    """
    scaled = logits if temperature == 1.0 else logits / temperature
    probs = torch.softmax(scaled, dim=-1)

    if top_p < 1.0:
        # Sort descending so a cumulative sum identifies the nucleus.
        ordered, order = torch.sort(probs, descending=True, dim=-1)
        cumprobs = torch.cumsum(ordered, dim=-1)
        drop = cumprobs > top_p
        # Shift right by one so the token that crosses the threshold is kept,
        # and never drop the single most likely token.
        drop[..., 1:] = drop[..., :-1].clone()
        drop[..., 0] = False
        # Map the sorted-position mask back onto vocabulary positions.
        drop_vocab = drop.scatter(1, order, drop)
        probs = probs.masked_fill(drop_vocab, 0.0)
        probs = probs / probs.sum(dim=-1, keepdim=True)

    return torch.multinomial(probs, num_samples=1).item()
def greedy_generate_with_embeds(
    model,
    inputs_embeds: torch.Tensor,
    embed_layer,
    logits_processor,
    max_new_tokens: int,
    eos_token_id: int,
    pad_token_id: int = 0,
    verbose: bool = False,
    tokenizer=None,
    temperature: float = 1.0,
    repetition_penalty: float = 1.2,
    top_p: float = 0.9,
    repetition_window: int = 5,
) -> List[int]:
    """KV-cache enabled greedy generation starting from inputs_embeds.

    Manual token-by-token decoding loop: the full prompt embeddings are fed
    once to prime the KV cache, after which each step feeds only the newly
    generated token's embedding plus ``past_key_values``.

    Args:
        model: Causal LM exposing the HF forward interface
            (``inputs_embeds``, ``past_key_values``, ``use_cache``).
        inputs_embeds: Prompt embeddings, shape (1, prompt_len, hidden).
        embed_layer: Embedding layer used to embed each generated token id.
        logits_processor: Callable ``(input_ids, scores) -> scores`` applied
            before sampling (here it masks everything but speech tokens + EOS).
        max_new_tokens: Hard cap on the number of generated tokens.
        eos_token_id: Decoding stops once this id is produced; it is included
            in the returned list.
        pad_token_id, verbose, tokenizer: Unused in this body; accepted only
            for interface compatibility with callers.
        temperature, top_p: When both equal 1.0 decoding is pure argmax,
            otherwise ``sample_with_temperature_and_top_p`` is used.
        repetition_penalty, repetition_window: Forwarded to
            ``apply_repetition_penalty`` at every step.

    Returns:
        List of generated token ids (length <= max_new_tokens).
    """
    device = inputs_embeds.device
    generated = []
    past_key_values = None
    cur_embeds = inputs_embeds
    # The logits processor signature expects input_ids; these dummy ids mirror
    # the sequence length only — their values are never inspected.
    dummy_input_ids = torch.zeros(1, inputs_embeds.shape[1], dtype=torch.long, device=device)
    # --- Step 0: run the whole prompt once to prime the KV cache. ---
    with torch.no_grad():
        outputs = model(
            inputs_embeds=cur_embeds,
            use_cache=True,
            return_dict=True,
        )
    # Only the last position's logits matter for the next-token decision.
    logits = outputs.logits[:, -1, :]
    past_key_values = outputs.past_key_values
    logits = logits_processor(dummy_input_ids, logits)
    logits = apply_repetition_penalty(logits, generated, repetition_penalty, repetition_window)
    if temperature == 1.0 and top_p == 1.0:
        next_token_id = torch.argmax(logits, dim=-1).item()
    else:
        next_token_id = sample_with_temperature_and_top_p(logits, temperature, top_p)
    generated.append(next_token_id)
    if next_token_id == eos_token_id:
        return generated
    # --- Steps 1..max_new_tokens-1: feed one new token embedding per step. ---
    for step in range(1, max_new_tokens):
        new_token_embed = embed_layer(torch.tensor([[next_token_id]], device=device))
        with torch.no_grad():
            outputs = model(
                inputs_embeds=new_token_embed,
                past_key_values=past_key_values,
                use_cache=True,
                return_dict=True,
            )
        logits = outputs.logits[:, -1, :]
        past_key_values = outputs.past_key_values
        # Keep the dummy ids growing in lockstep with the generated sequence.
        dummy_input_ids = torch.cat([
            dummy_input_ids,
            torch.tensor([[next_token_id]], device=device)
        ], dim=1)
        logits = logits_processor(dummy_input_ids, logits)
        logits = apply_repetition_penalty(logits, generated, repetition_penalty, repetition_window)
        if temperature == 1.0 and top_p == 1.0:
            next_token_id = torch.argmax(logits, dim=-1).item()
        else:
            next_token_id = sample_with_temperature_and_top_p(logits, temperature, top_p)
        generated.append(next_token_id)
        if next_token_id == eos_token_id:
            break
    return generated
class LAVCOModel(nn.Module):
    """LAVCO model for voice conversion.

    Composite of four parts assembled in ``__init__``:
      * ``llasa``      - causal LM (bfloat16) that generates speech tokens,
      * ``whisper``    - frozen Whisper *encoder* used as content encoder,
      * ``projection`` - linear map from Whisper features to LLASA hidden size,
      * ``xcodec``     - frozen XCodec2 codec for waveform <-> token conversion.
    """

    def __init__(self, load_dir_or_repo: str, device: str = "cuda", cache_dir: str = None):
        """Load all sub-models from a local directory or a HuggingFace repo.

        Args:
            load_dir_or_repo: Local checkpoint dir (containing
                ``llasa_vc_config.json``, ``projection.pt`` and ``llasa/``)
                or a HuggingFace repo id with the same layout.
            device: Accepted but not used here — moving to a device is the
                caller's responsibility (see load_model()).
            cache_dir: Optional HF hub cache directory.
        """
        super().__init__()
        import json
        from huggingface_hub import hf_hub_download, snapshot_download
        from xcodec2.modeling_xcodec2 import XCodec2Model
        # A path that exists on disk is treated as a local checkpoint;
        # anything else is assumed to be a HuggingFace repo id.
        is_local = os.path.isdir(load_dir_or_repo)
        if is_local:
            config_path = os.path.join(load_dir_or_repo, "llasa_vc_config.json")
            proj_path = os.path.join(load_dir_or_repo, "projection.pt")
            llasa_path = os.path.join(load_dir_or_repo, "llasa")
        else:
            print(f"πŸ“₯ Downloading from HuggingFace: {load_dir_or_repo}")
            config_path = hf_hub_download(
                repo_id=load_dir_or_repo,
                filename="llasa_vc_config.json",
                cache_dir=cache_dir,
            )
            proj_path = hf_hub_download(
                repo_id=load_dir_or_repo,
                filename="projection.pt",
                cache_dir=cache_dir,
            )
            # snapshot_download returns the snapshot root; the LLASA weights
            # live in its "llasa" subdirectory.
            llasa_path = snapshot_download(
                repo_id=load_dir_or_repo,
                allow_patterns=["llasa/*"],
                cache_dir=cache_dir,
            )
            llasa_path = os.path.join(llasa_path, "llasa")
        with open(config_path, "r") as f:
            config = json.load(f)
        import sys
        print(f"πŸ“₯ Loading LLASA from {llasa_path}...", flush=True)
        sys.stdout.flush()
        self.llasa = AutoModelForCausalLM.from_pretrained(
            llasa_path,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
        )
        self.hidden_size = self.llasa.config.hidden_size
        print(f" βœ… LLASA loaded (hidden_size={self.hidden_size})", flush=True)
        sys.stdout.flush()
        print(f"πŸ“₯ Loading Whisper encoder from {config['whisper_model']}...", flush=True)
        sys.stdout.flush()
        # Only the encoder half of Whisper is needed; drop the rest so the
        # decoder weights can be garbage-collected.
        whisper_full = WhisperModel.from_pretrained(config["whisper_model"])
        self.whisper = whisper_full.encoder
        self.whisper_dim = self.whisper.config.d_model
        del whisper_full
        print(f" βœ… Whisper loaded (dim={self.whisper_dim})", flush=True)
        sys.stdout.flush()
        print(f"πŸ“₯ Loading XCodec2 from {config['xcodec_model']}...", flush=True)
        sys.stdout.flush()
        self.xcodec = XCodec2Model.from_pretrained(config["xcodec_model"])
        self.xcodec.eval()
        print(f" βœ… XCodec2 loaded", flush=True)
        sys.stdout.flush()
        print(f"πŸ“₯ Loading Whisper processor...", flush=True)
        sys.stdout.flush()
        self.whisper_processor = WhisperFeatureExtractor.from_pretrained(config["whisper_model"])
        print(f" βœ… Whisper processor loaded", flush=True)
        sys.stdout.flush()
        print(f"πŸ“₯ Loading projection layer...", flush=True)
        sys.stdout.flush()
        # NOTE(review): weights_only=False deserializes arbitrary pickle data;
        # acceptable only because the checkpoint source is trusted.
        proj_state = torch.load(proj_path, map_location="cpu", weights_only=False)
        self.projection = nn.Linear(self.whisper_dim, self.hidden_size)
        self.projection.load_state_dict(proj_state)
        print(f" βœ… Projection layer loaded", flush=True)
        sys.stdout.flush()
        # Special-token ids from the config; set_special_token_ids() may
        # overwrite these from the tokenizer afterwards.
        self.u_start_id = config.get("u_start_id")
        self.u_end_id = config.get("u_end_id")
        self.g_start_id = config["g_start_id"]
        self.g_end_id = config["g_end_id"]
        self.pad_id = config["pad_id"]
        # Freeze the pretrained audio components; they are inference-only.
        for param in self.whisper.parameters():
            param.requires_grad = False
        self.whisper.eval()
        for param in self.xcodec.parameters():
            param.requires_grad = False
        self.xcodec.eval()

    def set_special_token_ids(self, tokenizer):
        """Set special token IDs and instruction text embeddings."""
        self.tokenizer = tokenizer
        self.u_start_id = tokenizer.convert_tokens_to_ids("<|SPEECH_UNDERSTANDING_START|>")
        self.u_end_id = tokenizer.convert_tokens_to_ids("<|SPEECH_UNDERSTANDING_END|>")
        self.g_start_id = tokenizer.convert_tokens_to_ids("<|SPEECH_GENERATION_START|>")
        self.g_end_id = tokenizer.convert_tokens_to_ids("<|SPEECH_GENERATION_END|>")
        self.pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
        # Instruction-template fragments; generate() embeds these around the
        # source content and reference speaker tokens.
        prefix_text = "Convert "
        middle_text = " into speech using this speaker: "
        self.prefix_ids = tokenizer(prefix_text, add_special_tokens=False, return_tensors="pt")["input_ids"]
        self.middle_ids = tokenizer(middle_text, add_special_tokens=False, return_tensors="pt")["input_ids"]

    def _tokenizer_ids_to_xcodec_codes(self, tokenizer_ids: torch.Tensor) -> torch.Tensor:
        """Convert LLASA tokenizer IDs back to raw XCodec2 codes (0-65535).

        Any token that is not of the form ``<|s_<int>|>`` (or fails to parse)
        maps to code 0.  Input and output are both shaped (batch, seq_len).
        """
        batch_size, seq_len = tokenizer_ids.shape
        xcodec_codes = torch.zeros_like(tokenizer_ids)
        for i in range(batch_size):
            tokens = self.tokenizer.convert_ids_to_tokens(tokenizer_ids[i].tolist())
            for j, tok in enumerate(tokens):
                if tok and tok.startswith("<|s_") and tok.endswith("|>"):
                    try:
                        # "<|s_123|>" -> 123
                        code = int(tok[4:-2])
                        xcodec_codes[i, j] = code
                    except ValueError:
                        xcodec_codes[i, j] = 0
                else:
                    xcodec_codes[i, j] = 0
        return xcodec_codes

    def generate(
        self,
        wav_or_mel: np.ndarray,
        ref_ids: torch.Tensor,
        ref_length: int,
        max_new_tokens: int = 2000,
        tokenizer=None,
        temperature: float = 1.0,
        repetition_penalty: float = 1.2,
        top_p: float = 0.9,
        repetition_window: int = 5,
        verbose: bool = False,
    ) -> List[int]:
        """Generate voice conversion tokens.

        Args:
            wav_or_mel: Source audio as a 1-D 16 kHz waveform (duration is
                computed as len/16000, so a mel input would be mishandled —
                despite the name, callers pass waveforms).
            ref_ids: Reference-speaker LLASA token ids, shape (1, ref_len).
            ref_length: Number of valid positions in ``ref_ids``.
            tokenizer: When given, constrained manual decoding is used;
                otherwise falls back to unconstrained HF ``generate``.
            Remaining args: sampling knobs forwarded to the decode loop.

        Returns:
            List of generated token ids.
        """
        device = ref_ids.device
        model_dtype = next(self.llasa.parameters()).dtype
        # Whisper features for the source audio.
        # NOTE(review): no torch.no_grad() here — the Whisper/projection
        # forward tracks gradients unless the caller wraps this; confirm
        # whether that is intentional (costs memory only).
        mel = self.whisper_processor(wav_or_mel, sampling_rate=16000, return_tensors="pt").input_features.to(device)
        whisper_out = self.whisper(mel).last_hidden_state
        # Keep only frames covering the actual audio duration, capped at 1500
        # (presumably Whisper's fixed 30 s window at 50 frames/s).
        audio_dur = len(wav_or_mel) / 16000
        num_frames = min(int(audio_dur * WHISPER_FRAME_RATE), 1500)
        soft_tokens = self.projection(whisper_out[:, :num_frames]).to(model_dtype)
        embed_layer = self.llasa.get_input_embeddings()
        # Prompt layout (all as embeddings):
        #   "Convert " [soft content tokens] " into speech using this speaker: "
        #   <U_START> [reference speech tokens] <U_END> <G_START> -> decode
        prefix_emb = embed_layer(self.prefix_ids.to(device))
        middle_emb = embed_layer(self.middle_ids.to(device))
        u_start_emb = embed_layer(torch.tensor([[self.u_start_id]], device=device))
        u_end_emb = embed_layer(torch.tensor([[self.u_end_id]], device=device))
        g_start_emb = embed_layer(torch.tensor([[self.g_start_id]], device=device))
        ref_embeds = embed_layer(ref_ids[:, :ref_length])
        inputs_embeds = torch.cat([
            prefix_emb,
            soft_tokens,
            middle_emb,
            u_start_emb,
            ref_embeds,
            u_end_emb,
            g_start_emb,
        ], dim=1).to(model_dtype)
        if tokenizer is not None:
            # Constrained manual decoding: only speech tokens + EOS allowed.
            logits_processor = SpeechOnlyLogitsProcessor(tokenizer, self.g_end_id)
            generated = greedy_generate_with_embeds(
                model=self.llasa,
                inputs_embeds=inputs_embeds,
                embed_layer=embed_layer,
                logits_processor=logits_processor,
                max_new_tokens=max_new_tokens,
                eos_token_id=self.g_end_id,
                pad_token_id=self.pad_id,
                verbose=verbose,
                tokenizer=tokenizer,
                temperature=temperature,
                repetition_penalty=repetition_penalty,
                top_p=top_p,
                repetition_window=repetition_window,
            )
            return generated
        else:
            # Fallback: plain HF greedy generation without the speech-token mask.
            outputs = self.llasa.generate(
                inputs_embeds=inputs_embeds,
                max_new_tokens=max_new_tokens,
                pad_token_id=self.pad_id,
                eos_token_id=self.g_end_id,
                do_sample=False,
            )
            return outputs[0].tolist()
def load_model():
    """Return the process-wide (model, tokenizer) pair, loading on first call.

    Subsequent calls are cheap: loaded objects are cached in the module
    globals ``model`` and ``tokenizer``.
    """
    global model, tokenizer
    # Fast path: already loaded.
    if model is not None:
        return model, tokenizer

    import sys
    import time

    print(f"πŸ“₯ Loading model: {MODEL_ID}", flush=True)
    sys.stdout.flush()
    start_time = time.time()

    print(" β†’ Loading LAVCO model components...", flush=True)
    model = LAVCOModel(MODEL_ID, device=DEVICE)
    print(f" β†’ Moving model to {DEVICE}...", flush=True)
    model = model.to(DEVICE)
    model.eval()

    print(f" β†’ Loading tokenizer...", flush=True)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    print(f" β†’ Setting special tokens...", flush=True)
    model.set_special_token_ids(tokenizer)

    print(f"βœ… Model loaded in {time.time() - start_time:.1f}s", flush=True)
    sys.stdout.flush()
    return model, tokenizer
def extract_xcodec2_from_generated(tokenizer, token_ids: list) -> list:
    """Extract raw XCodec2 code ids from generated LLASA token ids.

    Converts the whole sequence to token strings in a single tokenizer call
    (the original made one ``convert_ids_to_tokens`` call per id), then parses
    the integer out of every ``<|s_<code>|>`` speech token. Anything else
    (EOS, padding, malformed tokens) is silently skipped.

    Args:
        tokenizer: HF tokenizer exposing ``convert_ids_to_tokens``.
        token_ids: Generated token ids (possibly empty).

    Returns:
        List of integer XCodec2 codes, in generation order.
    """
    xcodec2_ids = []
    # Batch conversion: one vocabulary pass instead of len(token_ids) calls.
    for token in tokenizer.convert_ids_to_tokens(token_ids):
        if token and token.startswith("<|s_") and token.endswith("|>"):
            try:
                xcodec2_ids.append(int(token[4:-2]))
            except ValueError:
                pass
    return xcodec2_ids
def convert_voice(source_audio, reference_audio, temperature, repetition_penalty, top_p, repetition_window, max_tokens, progress=gr.Progress()):
    """Convert source voice to reference voice using LAVCO.

    Gradio callback. Pipeline:
      1. load model/tokenizer (cached module globals),
      2. load both audio files at 16 kHz,
      3. XCodec2-encode the reference (speaker) and the source (content),
      4. decode the source codes back to audio so the content encoder sees a
         codec round-tripped waveform,
      5. autoregressively generate speech tokens and decode them to audio,
      6. write a temp WAV and return (path, status message).

    Returns:
        (output_path | None, status string). Any exception is caught and
        reported in the status text rather than raised (keeps the UI alive).
    """
    if source_audio is None:
        return None, "❌ Please provide source audio"
    if reference_audio is None:
        return None, "❌ Please provide reference audio"
    try:
        progress(0.1, desc="Loading model...")
        model, tokenizer = load_model()
        progress(0.2, desc="Loading audio files...")
        # Gradio may hand back either a plain filepath or a tuple whose
        # second element is the path.
        if isinstance(source_audio, tuple):
            source_path = source_audio[1]
        else:
            source_path = source_audio
        if isinstance(reference_audio, tuple):
            reference_path = reference_audio[1]
        else:
            reference_path = reference_audio
        # Resample to 16 kHz — the rate both Whisper and XCodec2 expect here.
        source_wav = librosa.load(source_path, sr=16000)[0].astype(np.float32)
        reference_wav = librosa.load(reference_path, sr=16000)[0].astype(np.float32)
        progress(0.4, desc="Encoding audio...")
        with torch.no_grad():
            xcodec_device = next(model.xcodec.parameters()).device
            # Reference: waveform -> XCodec2 codes -> "<|s_N|>" string -> LLASA ids.
            ref_tensor_audio = torch.from_numpy(reference_wav).float().unsqueeze(0).to(xcodec_device)
            ref_codes = model.xcodec.encode_code(input_waveform=ref_tensor_audio)
            if isinstance(ref_codes, torch.Tensor):
                ref_codes_np = ref_codes.cpu().numpy()
            else:
                ref_codes_np = np.array(ref_codes)
            ref_xcodec_ids = ref_codes_np.flatten().astype(int).tolist()
            ref_token_str = "".join([f"<|s_{rid}|>" for rid in ref_xcodec_ids])
            ref_tokenizer_ids = tokenizer(ref_token_str, add_special_tokens=False)["input_ids"]
            ref_ids = torch.tensor(ref_tokenizer_ids, dtype=torch.long, device=DEVICE).unsqueeze(0)
            ref_length = len(ref_tokenizer_ids)
            # Source: same encoding path as the reference.
            source_tensor_audio = torch.from_numpy(source_wav).float().unsqueeze(0).to(xcodec_device)
            source_codes = model.xcodec.encode_code(input_waveform=source_tensor_audio)
            if isinstance(source_codes, torch.Tensor):
                source_codes_np = source_codes.cpu().numpy()
            else:
                source_codes_np = np.array(source_codes)
            source_xcodec_ids = source_codes_np.flatten().astype(int).tolist()
            source_token_str = "".join([f"<|s_{rid}|>" for rid in source_xcodec_ids])
            source_tokenizer_ids = tokenizer(source_token_str, add_special_tokens=False)["input_ids"]
            seedvc_ids = torch.tensor(source_tokenizer_ids, dtype=torch.long, device=DEVICE).unsqueeze(0)
            seedvc_length = len(source_tokenizer_ids)
            # Round-trip the source through XCodec2 (encode -> decode),
            # presumably so the content input matches codec-processed audio
            # seen in training — TODO confirm against the training pipeline.
            xcodec_codes = model._tokenizer_ids_to_xcodec_codes(seedvc_ids)
            codes = xcodec_codes.unsqueeze(1).to(xcodec_device)
            wav = model.xcodec.decode_code(codes)
            if len(wav.shape) == 3:
                wav = wav.squeeze(1)
            # Trim to the duration implied by the token count (50 tokens/s).
            num_samples_audio = int(seedvc_length / XCODEC2_FRAME_RATE * 16000)
            num_samples_audio = min(num_samples_audio, wav.shape[-1])
            source_wav_processed = wav[0, :num_samples_audio].cpu().numpy()
        progress(0.7, desc="Generating voice conversion...")
        # Build kwargs defensively via introspection so this callback keeps
        # working against model versions whose generate() lacks some knobs.
        import inspect
        gen_sig = inspect.signature(model.generate)
        gen_params = gen_sig.parameters
        gen_kwargs = {
            'max_new_tokens': max_tokens,
            'tokenizer': tokenizer,
            'verbose': False,
        }
        if 'temperature' in gen_params:
            gen_kwargs['temperature'] = temperature
        if 'repetition_penalty' in gen_params:
            gen_kwargs['repetition_penalty'] = repetition_penalty
        if 'top_p' in gen_params:
            gen_kwargs['top_p'] = top_p
        if 'repetition_window' in gen_params:
            gen_kwargs['repetition_window'] = repetition_window
        # NOTE(review): this call is outside torch.no_grad(), and generate()
        # does not wrap its Whisper forward either — confirm whether gradient
        # tracking here is intentional (it only costs memory at inference).
        generated_token_ids = model.generate(
            source_wav_processed,
            ref_ids,
            ref_length,
            **gen_kwargs
        )
        progress(0.9, desc="Decoding audio...")
        gen_xcodec_ids = extract_xcodec2_from_generated(tokenizer, generated_token_ids)
        if not gen_xcodec_ids:
            return None, "❌ No audio tokens generated!"
        # Decode generated codes to a waveform; the decoder's output rank
        # varies, so squeeze down to a 1-D array for soundfile.
        codes = torch.tensor(gen_xcodec_ids, device=xcodec_device).unsqueeze(0).unsqueeze(0)
        output_wav = model.xcodec.decode_code(codes)
        if len(output_wav.shape) == 3:
            output_wav = output_wav[0, 0, :].cpu().numpy()
        elif len(output_wav.shape) == 2:
            output_wav = output_wav[0, :].cpu().numpy()
        else:
            output_wav = output_wav.cpu().numpy()
        # delete=False: Gradio must be able to serve the file after we return.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            sf.write(tmp_file.name, output_wav, 16000)
            output_path = tmp_file.name
        progress(1.0, desc="Complete!")
        return output_path, f"βœ… Generated {len(gen_xcodec_ids)} tokens ({len(gen_xcodec_ids)/XCODEC2_FRAME_RATE:.2f}s)"
    except Exception as e:
        # Report the full traceback to the UI status box instead of raising.
        import traceback
        error_msg = f"❌ Error: {str(e)}\n{traceback.format_exc()}"
        return None, error_msg
# Custom CSS for beautiful UI (".main-header" styles the gradient banner
# injected via the first gr.Markdown below).
css = """
.gradio-container {
font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
}
.main-header {
text-align: center;
padding: 2rem 0;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
border-radius: 10px;
margin-bottom: 2rem;
}
"""
# Create Gradio interface. Layout: header banner, a row with two columns
# (audio inputs | sampling parameters), the convert button, an output row,
# and usage notes; wiring at the bottom connects the button to convert_voice.
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
<div class="main-header">
<h1>🎀 LAVCO: Voice Conversion</h1>
<p>Convert speech to match any reference voice using semantic/acoustic interleaving</p>
</div>
""")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### πŸ“₯ Input Audio")
            # type="filepath": the callback receives a path, not raw samples.
            source_audio = gr.Audio(
                label="Source Audio (content to convert)",
                type="filepath",
                sources=["upload", "microphone"]
            )
            reference_audio = gr.Audio(
                label="Reference Audio (target voice)",
                type="filepath",
                sources=["upload", "microphone"]
            )
            # Add examples if default files exist
            if DEFAULT_SOURCE_AUDIO and DEFAULT_REFERENCE_AUDIO:
                gr.Examples(
                    examples=[[DEFAULT_SOURCE_AUDIO, DEFAULT_REFERENCE_AUDIO]],
                    inputs=[source_audio, reference_audio],
                    label="πŸ“ Example Audio Files (Click to load)",
                )
        with gr.Column():
            gr.Markdown("### βš™οΈ Generation Parameters")
            # Sliders map 1:1 onto convert_voice's sampling arguments.
            temperature = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.1,
                label="Temperature",
                info="Higher = more diverse, lower = more deterministic"
            )
            repetition_penalty = gr.Slider(
                minimum=1.0,
                maximum=2.0,
                value=1.3,
                step=0.1,
                label="Repetition Penalty",
                info="Penalize repeated tokens (1.0 = off)"
            )
            top_p = gr.Slider(
                minimum=0.5,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top-P (Nucleus Sampling)",
                info="Sample from top P probability mass"
            )
            repetition_window = gr.Slider(
                minimum=3,
                maximum=10,
                value=5,
                step=1,
                label="Repetition Window",
                info="Look at last N tokens for repetition"
            )
            max_tokens = gr.Slider(
                minimum=100,
                maximum=2000,
                value=2000,
                step=100,
                label="Max Tokens",
                info="Maximum tokens to generate"
            )
    convert_btn = gr.Button("🎯 Convert Voice", variant="primary", size="lg")
    with gr.Row():
        output_audio = gr.Audio(
            label="Converted Audio",
            type="filepath",
            autoplay=True
        )
        status_text = gr.Textbox(
            label="Status",
            interactive=False
        )
    gr.Markdown("""
### πŸ“– How to Use
1. **Upload or record** your source audio (the speech you want to convert)
- Click the microphone icon to record directly from your microphone
- Or upload an audio file (WAV, MP3, etc.)
2. **Upload or record** your reference audio (the voice you want to mimic)
- Click the microphone icon to record the target voice
- Or upload a reference audio file
3. Adjust generation parameters if needed (defaults work well)
4. Click **Convert Voice** and wait for the result
### πŸ’‘ Tips
- Keep audio clips under 30 seconds for best results
- Reference audio should be clear speech (1+ seconds recommended)
- When recording, speak clearly and minimize background noise
- Higher repetition penalty helps avoid repetitive outputs
- Lower temperature = more stable, higher = more creative
""")
    # Wire the button: inputs in the exact positional order convert_voice expects.
    convert_btn.click(
        fn=convert_voice,
        inputs=[
            source_audio,
            reference_audio,
            temperature,
            repetition_penalty,
            top_p,
            repetition_window,
            max_tokens,
        ],
        outputs=[output_audio, status_text]
    )
if __name__ == "__main__":
    import sys
    # Startup banner + environment summary.
    print("=" * 60, flush=True)
    print("πŸš€ Starting LAVCO Gradio App", flush=True)
    print("=" * 60, flush=True)
    print(f"Device: {DEVICE}", flush=True)
    print(f"Model: {MODEL_ID}", flush=True)
    print(f"\nπŸ“ Checking for default audio files...", flush=True)
    print(f" Examples directory: {os.path.abspath(EXAMPLES_DIR)}", flush=True)
    print(f" Source audio: {DEFAULT_SOURCE_AUDIO or 'Not found'}", flush=True)
    print(f" Reference audio: {DEFAULT_REFERENCE_AUDIO or 'Not found'}", flush=True)
    sys.stdout.flush()
    # Pre-load model at startup (so first user doesn't wait)
    print("\n⏳ Pre-loading model (this may take a few minutes)...", flush=True)
    sys.stdout.flush()
    try:
        load_model()
        print("βœ… Model ready! Starting Gradio interface...", flush=True)
        sys.stdout.flush()
    except Exception as e:
        # Keep the app alive even if pre-loading fails; load_model() will be
        # retried lazily on the first conversion request.
        print(f"⚠️ Model pre-loading failed: {e}", flush=True)
        print(" Model will load on first use instead.", flush=True)
        import traceback
        traceback.print_exc()
        sys.stdout.flush()
    print("\n🌐 Launching web interface...", flush=True)
    sys.stdout.flush()
    demo.launch(
        server_name="0.0.0.0",  # Listen on all network interfaces
        server_port=7860,  # The default port HF expects
        share=False  # Don't create a public share link (HF handles this)
    )