Commit 1f41a8a (1 parent: 330157f)
update whisper
backend/services/interview_engine.py
CHANGED

@@ -7,6 +7,7 @@ from langchain_groq import ChatGroq
 import logging
 import tempfile
 import shutil
+import torch
 
 # Initialize models
 chat_groq_api = os.getenv("GROQ_API_KEY")
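The added torch import is what the device probe in load_whisper_model() below relies on. A quick standalone check of what that probe reports, assuming a PyTorch install (this snippet is illustrative, not part of the commit):

import torch

# False on CPU-only builds/hardware; True only when a CUDA-enabled
# PyTorch build can see a usable GPU.
print(torch.cuda.is_available())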
@@ -25,7 +26,7 @@ def load_whisper_model():
     global whisper_model
     if whisper_model is None:
         try:
-            device = "cuda" if
+            device = "cuda" if torch.cuda.is_available() else "cpu"
             compute_type = "float16" if device == "cuda" else "int8"
             whisper_model = WhisperModel("base", device=device, compute_type=compute_type)
             logging.info(f"Whisper model loaded on {device} with {compute_type}")
@@ -175,59 +176,30 @@ def convert_webm_to_wav(webm_path, wav_path):
     return None
 
 def whisper_stt(audio_path):
-    """Speech-to-text using Faster-Whisper with better error handling"""
     try:
         if not audio_path or not os.path.exists(audio_path):
             logging.error(f"Audio file does not exist: {audio_path}")
             return ""
-
-
-        file_size = os.path.getsize(audio_path)
-        if file_size == 0:
+
+        if os.path.getsize(audio_path) == 0:
             logging.error(f"Audio file is empty: {audio_path}")
             return ""
-
-
-
-
-
-
-
-
-
-        else:
-            logging.warning("Could not convert WebM to WAV, trying with original file")
-
-        model = load_whisper_model()
-
-        # Add timeout and better error handling
-        try:
-            segments, info = model.transcribe(
-                audio_path,
-                language="en",  # Specify language for better performance
-                task="transcribe",
-                vad_filter=True,  # Voice activity detection
-                vad_parameters=dict(min_silence_duration_ms=500)
-            )
-
-            transcript_parts = []
-            for segment in segments:
-                if hasattr(segment, 'text') and segment.text.strip():
-                    transcript_parts.append(segment.text.strip())
-
-            transcript = " ".join(transcript_parts)
-
-            if transcript:
-                logging.info(f"Transcription successful: '{transcript[:100]}...'")
-            else:
-                logging.warning("No speech detected in audio file")
-
-            return transcript.strip()
-
-        except Exception as e:
-            logging.error(f"Error during transcription: {e}")
+
+        # Convert WebM to WAV using ffmpeg (ensure ffmpeg is available)
+        converted_path = audio_path.replace(".webm", ".wav")
+        subprocess.run([
+            "ffmpeg", "-y", "-i", audio_path, "-ar", "16000", "-ac", "1", converted_path
+        ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
+        if not os.path.exists(converted_path) or os.path.getsize(converted_path) == 0:
+            logging.error(f"Conversion failed or produced empty file: {converted_path}")
             return ""
-
+
+        model = load_whisper_model()
+        segments, _ = model.transcribe(converted_path)
+        transcript = " ".join(segment.text for segment in segments)
+        return transcript.strip()
+
     except Exception as e:
         logging.error(f"Error in STT: {e}")
         return ""
backend/templates/interview.html
CHANGED

@@ -695,7 +695,10 @@
         delete options.mimeType;
     }
 
-    this.mediaRecorder = new MediaRecorder(stream,
+    this.mediaRecorder = new MediaRecorder(stream, {
+        mimeType: 'audio/webm;codecs=opus'
+    });
+
     this.audioChunks = [];
 
     this.mediaRecorder.ondataavailable = (event) => {

@@ -757,7 +760,8 @@
     console.log('Processing', this.audioChunks.length, 'audio chunks');
 
     // Create blob from audio chunks
-    const audioBlob = new Blob(this.audioChunks, { type: 'audio/webm
+    const audioBlob = new Blob(this.audioChunks, { type: 'audio/webm' });
+    formData.append('audio', audioBlob, 'recording.webm');
 
     console.log('Created audio blob:', audioBlob.size, 'bytes');
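The frontend now uploads the recorded blob as recording.webm under the form field 'audio'. The matching server route is not part of this commit; below is a hypothetical Flask sketch of what it might look like, reusing whisper_stt() from the Python diff above (route name, import path, and response shape are all assumptions):

import os
import tempfile
from flask import Flask, request, jsonify

# whisper_stt is the function updated in this commit; the import path is a guess
# based on the file location backend/services/interview_engine.py.
from services.interview_engine import whisper_stt

app = Flask(__name__)

@app.route("/api/transcribe", methods=["POST"])  # route name is illustrative
def transcribe_endpoint():
    uploaded = request.files.get("audio")  # field name matches formData.append('audio', ...)
    if uploaded is None:
        return jsonify({"error": "missing 'audio' field"}), 400
    # Persist the upload to a temp .webm so whisper_stt can convert and read it.
    fd, path = tempfile.mkstemp(suffix=".webm")
    os.close(fd)
    uploaded.save(path)
    try:
        return jsonify({"transcript": whisper_stt(path)})
    finally:
        os.remove(path)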