# RiayatechChatDoctor / PaitentVoiceToText.py
# Author: Muhammadidrees — commit 0849418 (verified), 1.97 kB
# (Hugging Face file-viewer chrome — "raw / history / blame" — converted to
# comments so the file remains valid Python.)
# stt.py
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import sounddevice as sd
import numpy as np
import scipy.io.wavfile as wav
# -------------------
# 1️⃣ Detect GPU
# -------------------
# Decide the compute target once at import time; everything downstream
# (model load, pipeline) reuses these four values.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device_index = 0            # first CUDA device for the HF pipeline
    device_str = "cuda"
    dtype = torch.float16       # half precision is safe on GPU
else:
    device_index = -1           # -1 == CPU in the transformers pipeline API
    device_str = "cpu"
    dtype = torch.float32       # full precision on CPU
# -------------------
# 2️⃣ Load Whisper model from Hugging Face
# -------------------
hub_id = "Muhammadidrees/WispherVOICE"

# Model and processor come from the same hub repo. device_map="auto" lets
# accelerate place the weights (GPU when available, otherwise CPU);
# trust_remote_code is required because the repo ships custom code.
_model_kwargs = dict(
    torch_dtype=dtype,
    device_map="auto",  # automatically assigns to GPU if available
    trust_remote_code=True,
)
model = AutoModelForSpeechSeq2Seq.from_pretrained(hub_id, **_model_kwargs)
processor = AutoProcessor.from_pretrained(hub_id, trust_remote_code=True)
# -------------------
# 3️⃣ Setup ASR pipeline
# -------------------
# Wrap the model + processor into a ready-to-call speech-recognition
# pipeline; `pipe(path_to_wav)` returns a dict with a "text" key.
# NOTE(review): the model above is loaded with device_map="auto" while the
# pipeline is also handed an explicit `device=` index — recent transformers
# releases warn or raise on that combination; confirm against the installed
# transformers version.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=dtype,
    device=device_index
)
print("🎧 Whisper pipeline ready using Muhammadidrees/WispherVOICE.")
# -------------------
# 4️⃣ Record & Transcribe Function
# -------------------
def record_and_transcribe(duration=5, samplerate=16000, filename="mic_input.wav") -> str:
    """
    Record audio from the microphone, save it as a WAV file,
    and return the transcribed text using Whisper.

    Args:
        duration: Recording length in seconds.
        samplerate: Sampling rate in Hz (Whisper models expect 16 kHz).
        filename: Path of the WAV file to write before transcription.

    Returns:
        The transcribed text from the Whisper pipeline.
    """
    # 1️⃣ Record mono float32 audio from the default input device.
    print(f"🎙️ Recording for {duration} seconds...")
    audio = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=1, dtype="float32")
    sd.wait()  # block until the recording buffer is completely filled
    audio = np.squeeze(audio)

    # 2️⃣ Save as 16-bit PCM WAV. Clip to [-1, 1] first: without clipping,
    # samples at or above full scale wrap around when cast to int16.
    pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    wav.write(filename, samplerate, pcm)
    # Bug fix: the original printed the literal "(unknown)" instead of the
    # actual file name (broken f-string placeholder).
    print(f"✅ Recording saved as {filename}")

    # 3️⃣ Transcribe the saved file via the module-level pipeline.
    result = pipe(filename)
    text = result["text"]
    print(f"📝 Transcribed text: {text}")
    return text