Spaces:
Sleeping
Sleeping
| # stt.py | |
| import torch | |
| from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline | |
| import sounddevice as sd | |
| import numpy as np | |
| import scipy.io.wavfile as wav | |
# -------------------
# 1️⃣ Pick the compute device
# -------------------
# CUDA availability drives all three settings below: the integer device
# index, the device name string, and the tensor dtype (float16 when CUDA
# is available, float32 otherwise).
use_cuda = torch.cuda.is_available()
if use_cuda:
    device_index, device_str, dtype = 0, "cuda", torch.float16
else:
    device_index, device_str, dtype = -1, "cpu", torch.float32
# -------------------
# 2️⃣ Load the Whisper checkpoint and its processor from Hugging Face
# -------------------
hub_id = "Muhammadidrees/WispherVOICE"

# NOTE(review): trust_remote_code=True permits Python code shipped in the
# Hub repository to run locally -- confirm this repository is trusted.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    hub_id,
    trust_remote_code=True,
    device_map="auto",  # placement (GPU if available) is chosen at load time
    torch_dtype=dtype,
)
processor = AutoProcessor.from_pretrained(hub_id, trust_remote_code=True)
# -------------------
# 3️⃣ Setup ASR pipeline
# -------------------
# The model above is loaded with device_map="auto", so its device placement
# is already decided. Passing an explicit `device` to the pipeline as well
# conflicts with that placement (transformers rejects the combination in
# several releases), so the pipeline is left to run on whatever device the
# model already occupies.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=dtype,
)
print("🎧 Whisper pipeline ready using Muhammadidrees/WispherVOICE.")
# -------------------
# 4️⃣ Record & Transcribe Function
# -------------------
def record_and_transcribe(duration=5, samplerate=16000, filename="mic_input.wav") -> str:
    """Record audio from the microphone, save it as a WAV file, and transcribe it.

    Args:
        duration: Recording length in seconds.
        samplerate: Sampling rate in Hz, used for both the recording and the
            WAV file.
        filename: Path the 16-bit PCM WAV recording is written to.

    Returns:
        The ``"text"`` field of the result returned by the module-level
        ``pipe`` for the saved recording.

    Raises:
        ValueError: If ``duration * samplerate`` does not yield at least one
            frame to record.
    """
    # Validate before touching the audio device so bad arguments fail fast.
    frame_count = int(duration * samplerate)
    if frame_count <= 0:
        raise ValueError(
            "duration * samplerate must yield at least one frame, "
            f"got duration={duration!r}, samplerate={samplerate!r}"
        )

    # 1️⃣ Record audio
    print(f"🎙️ Recording for {duration} seconds...")
    audio = sd.rec(frame_count, samplerate=samplerate, channels=1, dtype="float32")
    sd.wait()  # block until the recording buffer has been filled
    # Flatten the single-channel buffer to 1-D; unlike np.squeeze this keeps
    # a one-frame recording as a length-1 array instead of a 0-d scalar.
    audio = np.reshape(audio, -1)

    # 2️⃣ Save as WAV
    # Clip to [-1, 1] before scaling: float samples outside that range would
    # otherwise overflow int16 and wrap around to the opposite sign.
    pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    wav.write(filename, samplerate, pcm)
    print(f"✅ Recording saved as {filename}")

    # 3️⃣ Transcribe
    result = pipe(filename)
    text = result["text"]
    print(f"📝 Transcribed text: {text}")
    return text