import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, SpeechT5Processor, SpeechT5ForTextToSpeech
import torch
import soundfile as sf

# --------------------------
# 1. ASR (speech to text)
# --------------------------
# Speech recognizer built from the Whisper "small" checkpoint.
# device=-1 is the pipeline convention for running on the CPU.
asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    device=-1,
)

# --------------------------
# 2. Language Model (LLM) - more reliable
# --------------------------
# Checkpoint shared by the tokenizer and the seq2seq model below.
llm_model_id = "google/flan-t5-base"

# Load the weights first, then pin the model to the CPU explicitly.
llm_model = AutoModelForSeq2SeqLM.from_pretrained(llm_model_id)
llm_model = llm_model.to("cpu")

tokenizer = AutoTokenizer.from_pretrained(llm_model_id)

def ask_llm(prompt, max_new_tokens=200):
    """Generate a text reply to ``prompt`` with the module-level seq2seq model.

    Args:
        prompt: Text handed to the tokenizer as the model input.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens removed. Sampling is enabled, so repeated calls with the
        same prompt may return different text.
    """
    # Tokenize, then move the tensors to wherever the model lives.
    encoded = tokenizer(prompt, return_tensors="pt")
    encoded = encoded.to(llm_model.device)

    sampling_options = {"do_sample": True, "top_k": 50, "top_p": 0.95}

    # Gradients are not needed for generation.
    with torch.no_grad():
        generated = llm_model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            **sampling_options,
        )

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# --------------------------
# 3. TTS (text-to-speech) using SpeechT5
# --------------------------
# The processor and the acoustic model are loaded from the same checkpoint.
_TTS_CHECKPOINT = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(_TTS_CHECKPOINT)
tts_model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_CHECKPOINT)

# One 512-dimensional speaker vector, drawn at random on every start-up.
# NOTE(review): SpeechT5 is presumably meant to be conditioned on a real
# x-vector speaker embedding; a random vector is a placeholder and the voice
# will differ between runs - confirm and replace with a proper embedding.
speaker_embedding = torch.randn((1, 512))

# HiFi-GAN vocoder, created on first use by _get_vocoder().
_vocoder = None


def _get_vocoder():
    """Return the shared SpeechT5 HiFi-GAN vocoder, loading it on first call."""
    global _vocoder
    if _vocoder is None:
        # Function-scope import keeps this block self-contained; transformers
        # is already a dependency of this file.
        from transformers import SpeechT5HifiGan

        _vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    return _vocoder


def text_to_speech(text, out_path="output.wav"):
    """Synthesize ``text`` with SpeechT5 and write it to a 16 kHz audio file.

    Args:
        text: Text to speak.
        out_path: Destination path of the audio file (overwritten if present).

    Returns:
        ``out_path``, so the caller can hand the file to the UI.
    """
    inputs = processor(text=text, return_tensors="pt")

    # Without a vocoder, generate_speech returns a mel spectrogram rather
    # than a waveform, and writing that to disk does not produce playable
    # speech. Passing the HiFi-GAN vocoder yields the actual audio samples.
    speech = tts_model.generate_speech(
        inputs["input_ids"],
        speaker_embedding,
        vocoder=_get_vocoder(),
    )

    sf.write(out_path, speech.numpy(), 16000)
    return out_path

# --------------------------
# 4. Full pipeline function
# --------------------------
def full_pipeline(audio_file):
    """Run one assistant turn: transcribe audio, generate a reply, speak it.

    Args:
        audio_file: Path of the recorded or uploaded audio as delivered by
            the Gradio audio input; falsy when nothing was provided.

    Returns:
        A ``(message, audio_path)`` tuple. ``message`` is the conversation
        text or an error description; ``audio_path`` is the synthesized
        reply file, or ``None`` whenever no reply audio was produced.
    """
    if not audio_file:
        return "No audio input detected.", None

    # Each stage has its own handler so the UI shows which step failed
    # instead of surfacing a raw traceback.
    try:
        result = asr(audio_file, chunk_length_s=30, stride_length_s=[5, 5])
    except Exception as e:
        return f"ASR error: {e}", None

    # Strip surrounding whitespace and stop early on silence or noise, so an
    # empty prompt is never sent to the language model.
    user_text = (result.get("text") or "").strip()
    if not user_text:
        return "No speech recognized in the audio.", None

    # NOTE(review): the instruction prefix is Persian; confirm the chosen LLM
    # and TTS checkpoints actually support Persian text.
    try:
        llm_response = ask_llm(f"پاسخ بده به زبان ساده: {user_text}")
    except Exception as e:
        return f"Assistant generation error: {e}", None

    # Nothing to synthesize if the model produced an empty reply.
    if not llm_response or not llm_response.strip():
        return f"User said: {user_text}\nAssistant: (no response generated)", None

    try:
        audio_path = text_to_speech(llm_response, "response.wav")
    except Exception as e:
        return f"TTS error: {e}", None

    return f"User said: {user_text}\nAssistant: {llm_response}", audio_path

# --------------------------
# 5. Gradio Interface
# --------------------------
# Input widget: gives full_pipeline a file path for the recording or upload.
_audio_input = gr.Audio(type="filepath", label="Record or upload audio")

# Output widgets, in the order full_pipeline returns its two values.
_result_widgets = [
    gr.Textbox(label="Conversation"),
    gr.Audio(label="TTS Response"),
]

iface = gr.Interface(
    fn=full_pipeline,
    inputs=_audio_input,
    outputs=_result_widgets,
    title="Persian Voice Assistant (Reliable LLM)",
    description="ASR → Flan-T5-Base → TTS",
)

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()