# RiayatechChatDoctor / PaitentVoiceToText.py
# Author: Muhammadidrees — commit 0849418 (verified), 1.97 kB
# (Hugging Face file-viewer chrome — "raw / history / blame" — converted to
# comments so the file remains valid Python.)
# stt.py
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import sounddevice as sd
import numpy as np
import scipy.io.wavfile as wav
# -------------------
# 1️⃣ Detect GPU
# -------------------
# Decide the compute target once at import time; everything downstream
# (model load, pipeline) reuses these four values.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device_index = 0            # first CUDA device for the HF pipeline
    device_str = "cuda"
    dtype = torch.float16       # half precision is safe on GPU
else:
    device_index = -1           # -1 == CPU in the transformers pipeline API
    device_str = "cpu"
    dtype = torch.float32       # full precision on CPU
# -------------------
# 2️⃣ Load Whisper model from Hugging Face
# -------------------
hub_id = "Muhammadidrees/WispherVOICE"

# Model and processor come from the same hub repo. device_map="auto" lets
# accelerate place the weights (GPU when available, otherwise CPU);
# trust_remote_code is required because the repo ships custom code.
_model_kwargs = dict(
    torch_dtype=dtype,
    device_map="auto",  # automatically assigns to GPU if available
    trust_remote_code=True,
)
model = AutoModelForSpeechSeq2Seq.from_pretrained(hub_id, **_model_kwargs)
processor = AutoProcessor.from_pretrained(hub_id, trust_remote_code=True)
# -------------------
# 3️⃣ Setup ASR pipeline
# -------------------
# Wrap the model + processor into a ready-to-call speech-recognition
# pipeline; `pipe(path_to_wav)` returns a dict with a "text" key.
# NOTE(review): the model above is loaded with device_map="auto" while the
# pipeline is also handed an explicit `device=` index — recent transformers
# releases warn or raise on that combination; confirm against the installed
# transformers version.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=dtype,
    device=device_index
)
print("🎧 Whisper pipeline ready using Muhammadidrees/WispherVOICE.")
# -------------------
# 4️⃣ Record & Transcribe Function
# -------------------
def record_and_transcribe(duration=5, samplerate=16000, filename="mic_input.wav") -> str:
    """
    Record audio from the microphone, save it as a WAV file,
    and return the transcribed text using Whisper.

    Args:
        duration: Recording length in seconds.
        samplerate: Sampling rate in Hz (Whisper models expect 16 kHz).
        filename: Path of the WAV file to write before transcription.

    Returns:
        The transcribed text from the Whisper pipeline.
    """
    # 1️⃣ Record mono float32 audio from the default input device.
    print(f"🎙️ Recording for {duration} seconds...")
    audio = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=1, dtype="float32")
    sd.wait()  # block until the recording buffer is completely filled
    audio = np.squeeze(audio)

    # 2️⃣ Save as 16-bit PCM WAV. Clip to [-1, 1] first: without clipping,
    # samples at or above full scale wrap around when cast to int16.
    pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    wav.write(filename, samplerate, pcm)
    # Bug fix: the original printed the literal "(unknown)" instead of the
    # actual file name (broken f-string placeholder).
    print(f"✅ Recording saved as {filename}")

    # 3️⃣ Transcribe the saved file via the module-level pipeline.
    result = pipe(filename)
    text = result["text"]
    print(f"📝 Transcribed text: {text}")
    return text