# stt.py
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import sounddevice as sd
import numpy as np
import scipy.io.wavfile as wav

# -------------------
# 1️⃣ Detect GPU
# -------------------
use_cuda = torch.cuda.is_available()
device_index = 0 if use_cuda else -1
device_str = "cuda" if use_cuda else "cpu"
dtype = torch.float16 if use_cuda else torch.float32

# -------------------
# 2️⃣ Load Whisper model from Hugging Face
# -------------------
hub_id = "Muhammadidrees/WispherVOICE"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    hub_id,
    torch_dtype=dtype,
    trust_remote_code=True
)
# Move the model explicitly: combining device_map="auto" with the pipeline's
# `device` argument conflicts, because accelerate-dispatched models cannot be
# moved again by the pipeline.
model.to(device_str)
processor = AutoProcessor.from_pretrained(hub_id, trust_remote_code=True)

# -------------------
# 3️⃣ Set up ASR pipeline
# -------------------
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=dtype,
    device=device_index
)
print("🎧 Whisper pipeline ready using Muhammadidrees/WispherVOICE.")

# -------------------
# 4️⃣ Record & Transcribe Function
# -------------------
def record_and_transcribe(duration=5, samplerate=16000, filename="mic_input.wav") -> str:
    """
    Record audio from the microphone, save it as a WAV file,
    and return the transcribed text using Whisper.
    """
    # 1️⃣ Record audio from the default input device
    print(f"🎙️ Recording for {duration} seconds...")
    audio = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=1, dtype="float32")
    sd.wait()
    audio = np.squeeze(audio)

    # 2️⃣ Save as 16-bit PCM WAV (clip first to avoid int16 overflow)
    wav.write(filename, samplerate, (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16))
    print(f"✅ Recording saved as {filename}")

    # 3️⃣ Transcribe the saved file
    result = pipe(filename)
    text = result["text"]
    print(f"📝 Transcribed text: {text}")
    return text
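
# -------------------
# 5️⃣ Example usage (sketch)
# -------------------
# A minimal sketch of how the function above could be called, assuming a
# working default microphone and that the model downloads successfully;
# the 5-second duration is only an illustrative value.
if __name__ == "__main__":
    transcript = record_and_transcribe(duration=5)
    print(f"Final transcript: {transcript}")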