# stt.py
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import sounddevice as sd
import numpy as np
import scipy.io.wavfile as wav

# -------------------
# 1️⃣ Detect GPU
# -------------------
use_cuda = torch.cuda.is_available()
device_index = 0 if use_cuda else -1
device_str = "cuda" if use_cuda else "cpu"
dtype = torch.float16 if use_cuda else torch.float32

# -------------------
# 2️⃣ Load Whisper model from Hugging Face
# -------------------
hub_id = "Muhammadidrees/WispherVOICE"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    hub_id,
    torch_dtype=dtype,
    trust_remote_code=True
)
# Move the model explicitly: combining device_map="auto" with the pipeline's
# `device` argument conflicts, because accelerate-dispatched models cannot be
# moved again by the pipeline.
model.to(device_str)
processor = AutoProcessor.from_pretrained(hub_id, trust_remote_code=True)

# -------------------
# 3️⃣ Set up ASR pipeline
# -------------------
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=dtype,
    device=device_index
)
print("🎧 Whisper pipeline ready using Muhammadidrees/WispherVOICE.")

# -------------------
# 4️⃣ Record & Transcribe Function
# -------------------
def record_and_transcribe(duration=5, samplerate=16000, filename="mic_input.wav") -> str:
    """
    Record audio from the microphone, save it as a WAV file,
    and return the transcribed text using Whisper.
    """
    # 1️⃣ Record audio from the default input device
    print(f"🎙️ Recording for {duration} seconds...")
    audio = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=1, dtype="float32")
    sd.wait()
    audio = np.squeeze(audio)

    # 2️⃣ Save as 16-bit PCM WAV (clip first to avoid int16 overflow)
    wav.write(filename, samplerate, (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16))
    print(f"✅ Recording saved as {filename}")

    # 3️⃣ Transcribe the saved file
    result = pipe(filename)
    text = result["text"]
    print(f"📝 Transcribed text: {text}")
    return text
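
# -------------------
# 5️⃣ Example usage (sketch)
# -------------------
# A minimal sketch of how the function above could be called, assuming a
# working default microphone and that the model downloads successfully;
# the 5-second duration is only an illustrative value.
if __name__ == "__main__":
    transcript = record_and_transcribe(duration=5)
    print(f"Final transcript: {transcript}")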