import io
import scipy.io.wavfile as wavf
from fastapi import FastAPI, Response
from pydantic import BaseModel
import gradio as gr
from transformers import pipeline, AutoTokenizer, VitsModel  # VitsModel kept for compatibility fallback
import torch
import uvicorn
import numpy as np
import copy  # For potential TTS voice embeddings if you add custom inference

# ================================================
# IMPORTANT SETUP NOTES FOR VibeVoice 0.5B (Realtime TTS)
# ================================================
# 1. The VibeVoice-Realtime-0.5B is NOT a standard VITS model.
#    It uses a custom Qwen2.5-0.5B + diffusion head + streaming processor.
# 2. To use it properly:
#    git clone https://github.com/microsoft/VibeVoice.git
#    cd VibeVoice
#    pip install -e ".[streamingtts]"   # quoted so shells such as zsh do not glob the brackets
# 3. The full inference code (including streaming & voice presets) is in:
#    demo/realtime_model_inference_from_file.py
#    demo/vibevoice_realtime_demo.py
# 4. For now, this script keeps the old VITS-style TTS as a fallback.
#    Replace the synthesize_speech() function with the custom logic from the repo
#    once installed (see the snippets in the comments below).
# 5. VibeVoice-ASR works out-of-the-box with the standard Transformers pipeline.

# Build the speech-to-text pipeline once at import time so every request reuses it.
# NOTE: trust_remote_code=True lets Transformers execute model code shipped in the
# Hub repository; only keep it enabled for repositories you trust.
print("Loading VibeVoice-ASR (for recording classes → instant text)...")
asr_pipeline = pipeline(
    task="automatic-speech-recognition",
    model="microsoft/VibeVoice-ASR",
    trust_remote_code=True,
)
print("✅ VibeVoice-ASR loaded successfully!")

# TTS Model (0.5B Realtime) - placeholder loading
# Uncomment and adapt the lines below AFTER installing the VibeVoice package:
# from transformers import AutoModelForCausalLM
# from VibeVoice.tokenizer import VibeVoiceTextTokenizerFast  # custom from repo
# from VibeVoice.model import VibeVoiceStreamingProcessor, VibeVoiceStreamingForConditionalGenerationInference
#
# MODEL_PATH = "microsoft/VibeVoice-Realtime-0.5B"
# processor = VibeVoiceStreamingProcessor.from_pretrained(MODEL_PATH)
# model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
#     MODEL_PATH, trust_remote_code=True
# )
# # Example voice preset (Carter, Emma, etc.)
# # voice_embeddings = torch.load("demo/voices/streaming_model/carter.pt", map_location="cpu")

model_name = "microsoft/VibeVoice-Realtime-0.5B"  # requested 0.5B realtime checkpoint
print(f"Loading TTS model: {model_name} (note: custom inference required for full realtime streaming)")
# The fallback load below uses the generic tokenizer and the VITS model class, which the
# setup notes at the top of this file say will not work for this checkpoint until the
# official VibeVoice classes are swapped in. The failure is caught here (module import is
# the top-level boundary) so that a broken TTS model does not prevent the ASR tab and the
# rest of the server from starting; synthesize_speech() reports the problem per request.
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = VitsModel.from_pretrained(model_name)
except Exception as load_error:  # loaders raise many unrelated exception types
    tokenizer = None
    model = None
    # Keep the cause: the name bound by "except ... as" is deleted when the handler exits.
    _tts_load_error = load_error
    print(f"WARNING: TTS model could not be loaded; TTS requests will fail: {load_error!r}")
else:
    _tts_load_error = None
    print("✅ TTS model loaded (fallback mode)")

# ================================================
# Core TTS Synthesis Function (0.5B model)
# ================================================
def synthesize_speech(text: str):
    """
    Synthesize ``text`` with the fallback TTS model and return it as WAV data.

    TODO: Replace this entire function with the official VibeVoice realtime inference
    once you have installed the package (see the setup notes at the top of this file).

    Example skeleton from the repo (adapt as needed):

    inputs = processor(text, return_tensors="pt")
    # Add voice preset embeddings here if desired
    with torch.no_grad():
        output = model.generate(
            **inputs,
            tokenizer=processor.tokenizer,
            cfg_scale=1.5,
            ddpm_steps=10,   # lower = faster, higher = better quality
            verbose=False
        )
    # output will contain waveform or latent → decode to 24kHz audio

    Args:
        text: The text to speak.

    Returns:
        A ``(wav_bytes, sample_rate, audio_data)`` tuple: the complete WAV file as
        bytes, the sampling rate taken from ``model.config.sampling_rate``, and the
        squeezed waveform as a NumPy array.

    Raises:
        RuntimeError: If the TTS model or tokenizer failed to load at startup.
    """
    if model is None or tokenizer is None:
        # Fail with an actionable message instead of "'NoneType' object is not callable".
        raise RuntimeError(
            f"TTS model {model_name!r} is not available; loading failed at startup: "
            f"{_tts_load_error!r}"
        ) from _tts_load_error

    # Current fallback (will work only until you replace with custom code)
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():  # inference only, no gradients needed
        waveform = model(**inputs).waveform
    # .cpu() is a no-op on CPU and keeps .numpy() valid if the model is moved to a GPU.
    audio_data = waveform.squeeze().cpu().numpy()
    sample_rate = model.config.sampling_rate

    # Encode into an in-memory WAV file so callers can return it without touching disk.
    wav_io = io.BytesIO()
    wavf.write(wav_io, sample_rate, audio_data)
    return wav_io.getvalue(), sample_rate, audio_data


# ================================================
# Core ASR Function (Record classes → instant text)
# ================================================
def transcribe_audio(audio):
    """Transcribe a recorded clip (class/lecture audio) and return the text.

    Args:
        audio: ``None`` when nothing was recorded, otherwise a
            ``(sample_rate, samples)`` tuple as produced by ``gr.Audio(type="numpy")``.

    Returns:
        The ``"text"`` field of the ASR pipeline result, or ``"No audio recorded."``
        when there is no audio to transcribe.
    """
    if audio is None:
        return "No audio recorded."
    sr, data = audio

    samples = np.asarray(data)
    if samples.size == 0:
        # An empty recording has nothing to transcribe; treat it like no recording.
        return "No audio recorded."

    if np.issubdtype(samples.dtype, np.signedinteger):
        # Integer PCM (e.g. int16) must be scaled to [-1.0, 1.0); a bare float cast
        # would leave values in the +/-32768 range that the model does not expect.
        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)

    if samples.ndim > 1:
        # Multi-channel input is (samples, channels); the pipeline accepts mono only.
        samples = samples.mean(axis=1)

    result = asr_pipeline({"sampling_rate": sr, "raw": samples})
    return result["text"]


# ================================================
# FastAPI Setup + API Endpoint (kept exactly as requested)
# ================================================
# ASGI application object; the REST route below is registered on it, and the title
# given here is what the generated API docs display.
app = FastAPI(title="VibeVoice 0.5B Realtime TTS + ASR API")

# Request body for POST /api/tts. Documented with comments rather than a class
# docstring so the generated schema stays unchanged.
class TTSRequest(BaseModel):
    # Text to be converted to speech.
    text: str

@app.post("/api/tts", summary="Generate Speech from Text (0.5B model)")
async def api_tts(request: TTSRequest):
    """
    POST {"text": "your text here"}
    Returns downloadable WAV (maintains original API).
    """
    # Imported locally so this handler carries its own dependency.
    from fastapi.concurrency import run_in_threadpool

    # Model inference is blocking and CPU-heavy. Calling it directly inside an
    # "async def" handler would stall the event loop (and every other request,
    # including the Gradio UI) for the whole synthesis, so run it in a worker thread.
    wav_bytes, _, _ = await run_in_threadpool(synthesize_speech, request.text)
    return Response(content=wav_bytes, media_type="audio/wav")


# ================================================
# Gradio Interface (now with BOTH features in tabs)
# ================================================
# Two-tab Gradio UI: one tab records audio and shows the transcription, the other
# turns typed text into speech. Component handlers call the module-level
# transcribe_audio() and synthesize_speech() functions.
with gr.Blocks(title="VibeVoice 0.5B • Record Classes + TTS", theme=gr.themes.Soft()) as ui:
    gr.Markdown("# ⚡ VibeVoice 0.5B Realtime TTS + ASR\nRecord class lectures → get text instantly. Text → speech in one click.")

    with gr.Tabs():
        # ====================== ASR TAB ======================
        with gr.Tab("🎤 Record Class → Instant Text"):
            gr.Markdown("**Record your class/lecture audio → copy the transcribed text instantly**")
            audio_input = gr.Audio(
                # The label offers uploading as well, so both sources must be enabled.
                sources=["microphone", "upload"],
                type="numpy",
                label="Record Audio (or upload)",
                waveform_options=gr.WaveformOptions(waveform_color="#4F46E5")
            )
            transcribe_btn = gr.Button("📝 Transcribe Now", variant="primary", size="large")
            text_output = gr.Textbox(label="Transcribed Text", lines=8, show_copy_button=True)

            transcribe_btn.click(
                fn=transcribe_audio,
                inputs=audio_input,
                outputs=text_output
            )

        # ====================== TTS TAB ======================
        with gr.Tab("🔊 Text → Speech (0.5B Realtime)"):
            gr.Markdown("**Type text → generate natural speech** (uses VibeVoice-Realtime-0.5B)")
            text_input = gr.Textbox(
                label="Enter Text",
                lines=4,
                placeholder="Type or paste your script here...",
                value="Hello, this is a test of Microsoft's new VibeVoice 0.5B real-time TTS."
            )
            tts_btn = gr.Button("🔊 Generate Speech", variant="primary", size="large")
            audio_output = gr.Audio(label="Generated Speech", type="numpy")

            tts_btn.click(
                # synthesize_speech returns (wav_bytes, rate, audio); Gradio wants (rate, audio).
                fn=lambda t: synthesize_speech(t)[1:3],
                inputs=text_input,
                outputs=audio_output
            )

    # The JSON Content-Type header is required: without it curl sends form-encoded
    # data and the endpoint rejects the request body.
    gr.Markdown(
        "**How to use the API**: `curl -X POST http://localhost:7860/api/tts -H 'Content-Type: application/json' -d '{\"text\":\"Hello world\"}' --output speech.wav`\n\n"
        "Full realtime streaming & voice presets available in the official VibeVoice repo."
    )

# Mount Gradio onto FastAPI (keeps original behavior).
# The UI is served at the root path; `app` is rebound to the object returned by
# mount_gradio_app, which is what uvicorn serves below.
app = gr.mount_gradio_app(app, ui, path="/")

# Run the server only when executed as a script (not when imported).
# host="0.0.0.0" listens on all network interfaces; port 7860 matches the curl
# example shown in the UI.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)