Spaces:
Sleeping
Sleeping
File size: 5,768 Bytes
aa8e154 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 | """
modules/stt.py — Speech-to-Text
Groq Whisper API (whisper-large-v3-turbo) — fast cloud transcription.
Improvements:
- Energy-based silence detection — skips API call on near-silent audio.
- Interview-context prompt for better accuracy.
- Hallucination suppression for common Whisper artifacts.
- Explicit English language hint.
"""
import os
import tempfile
import numpy as np
from dotenv import load_dotenv
from groq import Groq
# Run python-dotenv's loader before the API key is read from the environment.
load_dotenv()

# The Groq client needs a key; refuse to import the module without one.
GROQ_API_KEY = os.environ.get('GROQ_API_KEY')
if not GROQ_API_KEY:
    raise ValueError("GROQ_API_KEY not set in environment.")
client = Groq(api_key=GROQ_API_KEY)

# Recording / model settings.
SAMPLE_RATE = 16_000    # passed to the recorder as `samplerate` and written to WAV headers
RECORD_SECONDS = 45     # default recording length, in seconds
MODEL = "whisper-large-v3-turbo"

# RMS energy threshold: audio below this is treated as silence (API call skipped).
SILENCE_THRESHOLD = 0.002

# Context prompt that primes Whisper for interview-style speech.
INTERVIEW_PROMPT = (
    "This is a professional job interview. The candidate is speaking in English "
    + "about their technical skills, work experience, and career goals."
)

# Phrases Whisper commonly hallucinates on near-silent audio; a transcript
# that matches one of these (after lower-casing) is discarded.
HALLUCINATIONS = {
    '',
    '.',
    'you',
    'thanks.',
    'thank you.',
    'thank you very much.',
    'thanks for watching.',
    'thank you for watching.',
    'bye.',
    'bye bye.',
    'subscribe.',
    'please subscribe.',
}
def _is_silent(audio_np: np.ndarray) -> bool:
"""Return True if audio RMS energy is below the silence threshold."""
rms = float(np.sqrt(np.mean(audio_np.astype(np.float32) ** 2)))
print(f'[STT] Audio RMS energy: {rms:.5f} (threshold: {SILENCE_THRESHOLD})')
return rms < SILENCE_THRESHOLD
def _record_audio(duration: int = RECORD_SECONDS) -> np.ndarray:
    """Record single-channel audio with sounddevice and return it as a 1-D array.

    Args:
        duration: Recording length in seconds.

    Returns:
        Flattened float32 sample array recorded at SAMPLE_RATE.

    Raises:
        ImportError: If the sounddevice package cannot be imported.
    """
    # Imported inside the function, so sounddevice is only required when
    # recording is actually requested.
    try:
        import sounddevice as sd
    except ImportError as exc:
        # Chain the original error so the real import failure stays visible.
        raise ImportError(
            "sounddevice not installed. Run: pip install sounddevice"
        ) from exc

    print(f'[STT] Recording for {duration}s...')
    frame_count = int(duration * SAMPLE_RATE)
    audio = sd.rec(
        frame_count,
        samplerate=SAMPLE_RATE,
        channels=1,
        dtype='float32',
    )
    sd.wait()  # wait for the recording to finish before using the buffer
    print('[STT] Recording complete.')
    # sd.rec was asked for one channel; collapse the result to one dimension.
    return audio.flatten()
def _numpy_to_wav_bytes(audio_np: np.ndarray, sample_rate: int = SAMPLE_RATE) -> bytes:
    """Encode float audio samples as an in-memory mono 16-bit PCM WAV file.

    Args:
        audio_np: Float samples, nominally in [-1.0, 1.0]. The array is
            flattened; NaN becomes 0 and out-of-range values are clipped so
            the int16 conversion cannot overflow.
        sample_rate: Sample rate written into the WAV header.

    Returns:
        The complete WAV file: a 44-byte RIFF header followed by the samples.
    """
    import struct

    num_channels = 1
    bits_per_sample = 16
    block_align = num_channels * bits_per_sample // 8   # bytes per sample frame
    byte_rate = sample_rate * block_align

    samples = np.asarray(audio_np).ravel()
    # Casting an out-of-range or NaN float to int16 wraps or is undefined,
    # which turns loud peaks into full-scale clicks; saturate instead.
    samples = np.clip(np.nan_to_num(samples, nan=0.0), -1.0, 1.0)
    # WAV sample data is little-endian regardless of the host byte order.
    payload = (samples * 32767).astype('<i2').tobytes()
    data_size = len(payload)

    header = struct.pack(
        '<4sI4s4sIHHIIHH4sI',
        b'RIFF', 36 + data_size, b'WAVE',
        b'fmt ', 16,            # fmt chunk id and its size
        1,                      # audio format tag 1 = PCM
        num_channels,
        sample_rate,
        byte_rate,
        block_align,
        bits_per_sample,
        b'data', data_size,
    )
    return header + payload
def transcribe(audio_path: str | None = None, duration: int = RECORD_SECONDS) -> str:
    """
    Transcribe speech to text via the Groq Whisper API.

    Audio is read from ``audio_path`` when one is given, otherwise it is
    recorded from the microphone. Audio judged silent is not sent to the API,
    and transcripts matching a known Whisper hallucination are discarded.

    Args:
        audio_path: Path to an existing audio file. If None, records from mic.
        duration: Seconds to record if audio_path is None.

    Returns:
        Transcribed text string (empty string on silence, suppressed
        hallucination, or any failure; errors are printed, not raised).
    """
    try:
        if audio_path is not None:
            # -- Silence check on file ------------------------------------
            # Best-effort only: if the file cannot be decoded locally (for
            # example soundfile is missing or the format is unsupported),
            # report why and let Groq try the raw file anyway.
            try:
                import soundfile as sf
                audio_np, _ = sf.read(audio_path, dtype='float32')
                if audio_np.ndim > 1:
                    # Average the channels so the RMS is taken over one signal.
                    audio_np = audio_np.mean(axis=1)
                file_is_silent = _is_silent(audio_np)
            except Exception as check_err:
                print(f'[STT] Silence check skipped ({check_err}); sending file as-is.')
                file_is_silent = False

            if file_is_silent:
                print('[STT] Audio is silent β skipping transcription.')
                return ''

            with open(audio_path, 'rb') as f:
                audio_bytes = f.read()
            filename = os.path.basename(audio_path)
        else:
            # -- Live mic recording ---------------------------------------
            audio_np = _record_audio(duration)
            if _is_silent(audio_np):
                print('[STT] Recorded audio is silent β skipping transcription.')
                return ''
            audio_bytes = _numpy_to_wav_bytes(audio_np)
            filename = 'recording.wav'

        response = client.audio.transcriptions.create(
            model=MODEL,
            file=(filename, audio_bytes),
            response_format='text',
            language='en',              # force English; avoids language misdetection
            prompt=INTERVIEW_PROMPT,    # context conditioning for accuracy
        )

        # The result is handled both as a plain string and as an object with
        # a `.text` attribute; a missing or None text becomes ''.
        if isinstance(response, str):
            raw_text = response
        else:
            raw_text = getattr(response, 'text', None) or ''
        text = raw_text.strip()

        # -- Hallucination suppression ------------------------------------
        if text.lower() in HALLUCINATIONS:
            print(f'[STT] Suppressed hallucination: "{text}"')
            return ''

        print(f'[STT] Transcribed ({len(text)} chars): "{text[:100]}"')
        return text
    except Exception as e:
        # Top-level boundary: callers get '' instead of an exception.
        print(f'[STT] Groq transcription error: {e}')
        return ''