import os
import torch
import gradio as gr
import soundfile as sf
import numpy as np
import torchaudio
from omnivoice import OmniVoice

# 🔥 FIX torchaudio.load globally
def safe_load(path, *args, **kwargs):
    data, sr = sf.read(path)
    data = torch.tensor(data).float()
    
    if data.ndim > 1:
        data = data.mean(axis=1)
    
    return data.unsqueeze(0), sr

torchaudio.load = safe_load

# Cache fix
os.environ["HF_HOME"] = "/tmp/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
os.environ["HF_HUB_CACHE"] = "/tmp/hf_cache"

device = "cuda" if torch.cuda.is_available() else "cpu"

# 🚀 LOAD MODEL ONCE
print("🚀 Loading model...")
model = OmniVoice.from_pretrained(
    "k2-fsa/OmniVoice",
    device_map=device,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32
)
print("✅ Model loaded")

# 🎯 FUNCTION
def clone_voice(audio_file, text, lang, ref_text):

    # Load audio safely
    waveform, sr = sf.read(audio_file)
    waveform = torch.tensor(waveform).float()

    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    waveform = waveform.unsqueeze(0)

    # Resample
    if sr != 24000:
        resampler = torchaudio.transforms.Resample(sr, 24000)
        waveform = resampler(waveform)

    temp_audio = "temp.wav"
    sf.write(temp_audio, waveform.squeeze().cpu().numpy(), 24000)

    # Text
    final_text = f"[{lang}] {text}"

    # Generate
    audio = model.generate(
        text=final_text,
        ref_audio=temp_audio,
        ref_text=ref_text if ref_text else None,
        language=lang
    )

    # Fix output
    if isinstance(audio, list):
        audio = audio[0]

    if not isinstance(audio, torch.Tensor):
        audio = torch.tensor(audio)

    if audio.dim() == 1:
        audio = audio.unsqueeze(0)

    # Save output
    output_file = "output.wav"
    sf.write(output_file, audio.squeeze().cpu().numpy(), 24000)

    return output_file


# 🎨 UI
demo = gr.Interface(
    fn=clone_voice,
    inputs=[
        gr.Audio(type="filepath", label="Upload Voice Sample"),
        gr.Textbox(label="Enter Text"),
        gr.Textbox(label="Language Code (en, hi, etc)"),
        gr.Textbox(label="Reference Text (optional)")
    ],
    outputs=gr.Audio(label="Generated Voice"),
    title="🎤 OmniVoice Cloner",
    description="Upload voice → enter text → language → generate cloned speech"
)

demo.launch()