Muhammadidrees committed on
Commit 0849418 · verified
1 Parent(s): b1a4c93

Update PaitentVoiceToText.py

Files changed (1)
  1. PaitentVoiceToText.py +67 -70
PaitentVoiceToText.py CHANGED
@@ -1,70 +1,67 @@
- # stt.py
- import torch
- from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
- import sounddevice as sd
- import numpy as np
- import scipy.io.wavfile as wav
-
- save_dir = r"C:\Users\JAY\Downloads\model\OpenAIWhisper"
-
- # Detect GPU
- use_cuda = torch.cuda.is_available()
- device_index = 0 if use_cuda else -1
- device_str = "cuda" if use_cuda else "cpu"
- dtype = torch.float16 if use_cuda else torch.float32
-
- # Load model
- try:
-     model = AutoModelForSpeechSeq2Seq.from_pretrained(
-         save_dir,
-         torch_dtype=dtype,
-         low_cpu_mem_usage=True,
-         use_safetensors=True,
-         local_files_only=True
-     ).to(device_str)
-     processor = AutoProcessor.from_pretrained(save_dir, local_files_only=True)
- except Exception as e:
-     print("Warning: Local model load failed, falling back to online model:", e)
-     hub_id = "openai/whisper-small"
-     model = AutoModelForSpeechSeq2Seq.from_pretrained(
-         hub_id,
-         torch_dtype=dtype,
-         low_cpu_mem_usage=True,
-         use_safetensors=True,
-     ).to(device_str)
-     processor = AutoProcessor.from_pretrained(hub_id)
-
- pipe = pipeline(
-     "automatic-speech-recognition",
-     model=model,
-     tokenizer=processor.tokenizer,
-     feature_extractor=processor.feature_extractor,
-     torch_dtype=dtype,
-     device=device_index
- )
-
- print("Whisper pipeline ready.")
-
-
- def record_and_transcribe(duration=5, samplerate=16000, filename="mic_input.wav") -> str:
-     """
-     Record audio from the microphone, save it as a WAV file,
-     and return the transcribed text using Whisper.
-     """
-     # 1️⃣ Record audio
-     print(f"🎙️ Recording for {duration} seconds...")
-     audio = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=1, dtype="float32")
-     sd.wait()
-     audio = np.squeeze(audio)
-
-     # 2️⃣ Save as WAV
-     wav.write(filename, samplerate, (audio * 32767).astype(np.int16))
-     print(f"✅ Recording saved as {filename}")
-
-     # 3️⃣ Transcribe
-     result = pipe(filename)
-     text = result["text"]
-     print(f"📝 Transcribed text: {text}")
-
-     return text
-
+ # stt.py
+ import torch
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+ import sounddevice as sd
+ import numpy as np
+ import scipy.io.wavfile as wav
+
+ # -------------------
+ # 1️⃣ Detect GPU
+ # -------------------
+ use_cuda = torch.cuda.is_available()
+ device_index = 0 if use_cuda else -1
+ device_str = "cuda" if use_cuda else "cpu"
+ dtype = torch.float16 if use_cuda else torch.float32
+
+ # -------------------
+ # 2️⃣ Load Whisper model from Hugging Face
+ # -------------------
+ hub_id = "Muhammadidrees/WispherVOICE"
+
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
+     hub_id,
+     torch_dtype=dtype,
+     device_map="auto",  # automatically assigns to GPU if available
+     trust_remote_code=True
+ )
+ processor = AutoProcessor.from_pretrained(hub_id, trust_remote_code=True)
+
+ # -------------------
+ # 3️⃣ Setup ASR pipeline
+ # -------------------
+ pipe = pipeline(
+     "automatic-speech-recognition",
+     model=model,
+     tokenizer=processor.tokenizer,
+     feature_extractor=processor.feature_extractor,
+     torch_dtype=dtype,
+     device=device_index
+ )
+
+ print("🎧 Whisper pipeline ready using Muhammadidrees/WispherVOICE.")
+
+
+ # -------------------
+ # 4️⃣ Record & Transcribe Function
+ # -------------------
+ def record_and_transcribe(duration=5, samplerate=16000, filename="mic_input.wav") -> str:
+     """
+     Record audio from the microphone, save it as a WAV file,
+     and return the transcribed text using Whisper.
+     """
+     # 1️⃣ Record audio
+     print(f"🎙️ Recording for {duration} seconds...")
+     audio = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=1, dtype="float32")
+     sd.wait()
+     audio = np.squeeze(audio)
+
+     # 2️⃣ Save as WAV
+     wav.write(filename, samplerate, (audio * 32767).astype(np.int16))
+     print(f"✅ Recording saved as {filename}")
+
+     # 3️⃣ Transcribe
+     result = pipe(filename)
+     text = result["text"]
+     print(f"📝 Transcribed text: {text}")
+
+     return text
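
For reference, a minimal usage sketch (not part of the commit) of how the updated module might be called from another script. It assumes PaitentVoiceToText.py is importable from the working directory and a microphone is available; the explicit duration=5 simply mirrors the function's default.

# usage_example.py — hypothetical caller, assumes PaitentVoiceToText.py is on the import path
from PaitentVoiceToText import record_and_transcribe

if __name__ == "__main__":
    # Importing the module loads the Whisper model and builds the pipeline once.
    # Record 5 seconds from the default microphone, save mic_input.wav,
    # and print the transcript returned by the pipeline.
    transcript = record_and_transcribe(duration=5)
    print("Final transcript:", transcript)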