Muhammadidrees committed on
Commit
4921c24
·
verified ·
1 Parent(s): e1b1f98

Update PaitentVoiceToText.py

Browse files
Files changed (1) hide show
  1. PaitentVoiceToText.py +20 -31
PaitentVoiceToText.py CHANGED
@@ -1,34 +1,27 @@
1
- # stt.py
2
  import torch
3
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
4
- import sounddevice as sd
5
- import numpy as np
6
- import scipy.io.wavfile as wav
7
 
8
  # -------------------
9
  # 1️⃣ Detect GPU
10
  # -------------------
11
  use_cuda = torch.cuda.is_available()
12
  device_index = 0 if use_cuda else -1
13
- device_str = "cuda" if use_cuda else "cpu"
14
  dtype = torch.float16 if use_cuda else torch.float32
15
 
16
  # -------------------
17
- # 2️⃣ Load Whisper model from Hugging Face
18
  # -------------------
19
  hub_id = "Muhammadidrees/WispherVOICE"
20
-
21
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
22
  hub_id,
23
  torch_dtype=dtype,
24
- device_map="auto", # automatically assigns to GPU if available
25
  trust_remote_code=True
26
  )
27
  processor = AutoProcessor.from_pretrained(hub_id, trust_remote_code=True)
28
 
29
- # -------------------
30
- # 3️⃣ Setup ASR pipeline
31
- # -------------------
32
  pipe = pipeline(
33
  "automatic-speech-recognition",
34
  model=model,
@@ -38,30 +31,26 @@ pipe = pipeline(
38
  device=device_index
39
  )
40
 
41
- print("🎧 Whisper pipeline ready using Muhammadidrees/WispherVOICE.")
42
 
43
 
44
  # -------------------
45
- # 4️⃣ Record & Transcribe Function
46
  # -------------------
47
- def record_and_transcribe(duration=5, samplerate=16000, filename="mic_input.wav") -> str:
48
- """
49
- Record audio from the microphone, save it as a WAV file,
50
- and return the transcribed text using Whisper.
51
- """
52
- # 1️⃣ Record audio
53
- print(f"🎙️ Recording for {duration} seconds...")
54
- audio = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=1, dtype="float32")
55
- sd.wait()
56
- audio = np.squeeze(audio)
57
 
58
- # 2️⃣ Save as WAV
59
- wav.write(filename, samplerate, (audio * 32767).astype(np.int16))
60
- print(f"✅ Recording saved as {filename}")
61
 
62
- # 3️⃣ Transcribe
63
- result = pipe(filename)
64
- text = result["text"]
65
- print(f"📝 Transcribed text: {text}")
 
 
 
 
66
 
67
- return text
 
 
1
+ # stt_gradio.py
2
  import torch
3
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
4
+ import gradio as gr
 
 
5
 
6
  # -------------------
7
  # 1️⃣ Detect GPU
8
  # -------------------
9
  use_cuda = torch.cuda.is_available()
10
  device_index = 0 if use_cuda else -1
 
11
  dtype = torch.float16 if use_cuda else torch.float32
12
 
13
  # -------------------
14
+ # 2️⃣ Load Whisper model
15
  # -------------------
16
  hub_id = "Muhammadidrees/WispherVOICE"
 
17
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
18
  hub_id,
19
  torch_dtype=dtype,
20
+ device_map="auto",
21
  trust_remote_code=True
22
  )
23
  processor = AutoProcessor.from_pretrained(hub_id, trust_remote_code=True)
24
 
 
 
 
25
  pipe = pipeline(
26
  "automatic-speech-recognition",
27
  model=model,
 
31
  device=device_index
32
  )
33
 
34
+ print("🎧 Whisper pipeline ready.")
35
 
36
 
37
  # -------------------
38
+ # 3️⃣ Function to Transcribe Uploaded/Recorded Audio
39
  # -------------------
40
+ def transcribe(audio):
41
+ # audio is a file path string (gr.Audio below uses type="filepath"), not a (sr, data) tuple
42
+ result = pipe(audio)
43
+ return result["text"]
 
 
 
 
 
 
44
 
 
 
 
45
 
46
+ # -------------------
47
+ # 4️⃣ Gradio Interface
48
+ # -------------------
49
+ demo = gr.Interface(
50
+ fn=transcribe,
51
+ inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
52
+ outputs="text"
53
+ )
54
 
55
+ if __name__ == "__main__":
56
+ demo.launch()