Kunjan Shah committed
Commit · 1dd415e · 1 Parent(s): e1c31bf
Added Whisper Speech-To-Text

Files changed:
- core/input_comp_gen.py +38 -31
- core/speech_converter.py +39 -75
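
For context: this commit swaps the Google speech_recognition backend for OpenAI's open-source whisper package, which transcribes locally. A minimal sketch of the whisper API the new code relies on ("answer.wav" is a placeholder path; transcribing from a path requires ffmpeg on the host):

    import whisper

    # Download/load the "base" checkpoint; smaller checkpoints are faster,
    # larger ones are more accurate.
    model = whisper.load_model("base", device="cpu")

    # transcribe() accepts a file path and returns a dict holding the full
    # text, per-segment timestamps, and the detected language.
    # fp16=False silences the half-precision warning on CPU.
    result = model.transcribe("answer.wav", fp16=False)
    print(result["text"])
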
core/input_comp_gen.py
CHANGED
@@ -3,10 +3,13 @@ import json
 from typing import Dict, Any, List, Tuple
 from model import generate_response
 from utils import FileProcessor
-from speech_converter import audio_to_text, text_to_audio
+from speech_converter import audio_to_text, text_to_audio, load_model
 import tempfile
 import os
+import torch
 
+# Initialize the model through speech_converter
+whisper_model = load_model()
 
 def generate_question(resume_text, job_desc_text, job_role):
     system_prompt = "You are an experienced technical interviewer and JSON generator. You create professional interview questions and respond only with valid JSON arrays. Never include explanations, markdown formatting, or any text outside the JSON array."
@@ -221,6 +224,7 @@ if st.session_state.processing_complete and st.session_state.questions:
     col1, col2 = st.columns(2)
     with col1:
         st.markdown("**👤 Your Answer:**")
+        # Check if answer already submitted
        current_answer = st.session_state.user_answers.get(qn, "")
        answer_submitted = qn in st.session_state.user_answers
 
@@ -240,39 +244,42 @@ if st.session_state.processing_complete and st.session_state.questions:
        st.markdown("🎤 **Record your answer:**")
        audio_file = st.audio_input("Record audio", key=f"audio_input_{qn}")
 
-        # Process audio if available
+        # Process audio if available using Whisper
        if audio_file is not None:
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
-                tmp_file.write(audio_file.read())
-                tmp_file_path = tmp_file.name
-
-            # Convert audio to text using speech_converter
-            transcribed_text = audio_to_text(tmp_file_path)
-
-            # Clean up temporary file
-            os.unlink(tmp_file_path)
-
-            with st.sidebar:
-                st.error(
+            try:
+                # Save audio to temporary file
+                with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+                    # Read bytes from UploadedFile object
+                    audio_file.seek(0)  # Reset file pointer
+                    audio_data = audio_file.read()
+
+                    # Write raw audio data to temporary file
+                    tmp_file.write(audio_data)
+                    tmp_file_path = tmp_file.name
+
+                # Convert audio to text using Whisper directly
+                result = whisper_model.transcribe(tmp_file_path)
+                transcribed_text = result["text"]
+
+                # Clean up temporary file
+                os.unlink(tmp_file_path)
+
+                if transcribed_text:
+                    st.session_state[f"transcribed_text_{qn}"] = transcribed_text
+                    with st.sidebar:
+                        st.success("✅ Audio transcribed successfully!")
+                else:
+                    with st.sidebar:
+                        st.error("❌ Could not understand the audio. Please try again.")
+
+            except Exception as e:
+                with st.sidebar:
+                    st.error(f"❌ Audio processing error: {str(e)}")
+                if 'tmp_file_path' in locals():
+                    try:
+                        os.unlink(tmp_file_path)
+                    except:
+                        pass
 
        # Text area with transcribed text or manual input
        initial_text = st.session_state.get(f"transcribed_text_{qn}", current_answer)
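
Condensed, the new flow above is: record with st.audio_input, spool the returned bytes to a temporary WAV file (Whisper's transcribe() wants a path), transcribe, then delete the file. A self-contained sketch of that pipeline under illustrative names (the widget label and session-state key are not the app's exact ones):

    import os
    import tempfile

    import streamlit as st
    import whisper

    @st.cache_resource
    def get_model():
        # Load once per server process, not on every Streamlit rerun
        return whisper.load_model("base", device="cpu")

    audio = st.audio_input("Record audio")  # UploadedFile or None
    if audio is not None:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            audio.seek(0)           # rewind before reading the recording bytes
            tmp.write(audio.read())
            tmp_path = tmp.name
        try:
            text = get_model().transcribe(tmp_path, fp16=False)["text"]
            st.session_state["transcribed_text"] = text
        finally:
            os.unlink(tmp_path)     # remove the temp file on every code path

A try/finally like this guarantees cleanup even when transcription raises; the committed code approximates the same guarantee with the 'tmp_file_path' in locals() fallback in its except branch.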
core/speech_converter.py
CHANGED
@@ -1,88 +1,52 @@
-import sounddevice as sd
-import speech_recognition as sr
-import wavio as wv
-import os
-import time
+import whisper
+import streamlit as st
 import pyttsx3
+import os
+import torch
+
+# Set PyTorch settings to avoid thread/loop errors
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+
+@st.cache_resource(show_spinner="Loading speech recognition model...")
+def load_model():
+    """Load Whisper model with optimized settings"""
+    try:
+        # Use CPU device and weights_only to avoid torch serialization issues
+        return whisper.load_model(
+            "base",
+            device="cpu",
+            download_root="models",
+            in_memory=True
+        )
+    except Exception as e:
+        print(f"Model loading error: {e}")
+        return None
 
 def audio_to_text(audio_file_path=None):
-    """Converts speech to text from an audio file or the microphone"""
-    r = sr.Recognizer()
-    text = None
+    """Converts audio file to text using Whisper"""
+    model = load_model()
+    if model is None:
+        return None
 
     try:
         if audio_file_path:
-            with sr.AudioFile(audio_file_path) as source:
-                audio = r.record(source)
-        else:
-            # Original microphone capture logic
-            freq = 44100
-            duration = 180
-            channels = 1
-            recording_file = "recording.wav"
-
-            print("🎤 You can answer now")
-            print(f"Recording will automatically stop after {duration} seconds")
-
-            recording = sd.rec(int(duration*freq), samplerate=freq, channels=channels)
-
-            for i in range(duration):
-                time.sleep(1)
-                seconds_left = duration - i - 1
-                print(f"⏱️ {seconds_left} seconds remaining...", end="\r")
-
-            print("Thank you for your answer")
-            sd.wait()
-            wv.write(recording_file, recording, freq, sampwidth=2)
-
-            with sr.AudioFile(recording_file) as source:
-                audio = r.record(source)
-
-            # Clean up recording file
-            if os.path.exists(recording_file):
-                os.remove(recording_file)
-
-        text = r.recognize_google(audio)
-        return text
-
-    except sr.UnknownValueError:
-        print("Sorry, I couldn't understand what you said. Please try again.")
-        return None
-    except sr.RequestError as e:
-        print(f"Speech recognition service error: {e}")
+            result = model.transcribe(audio_file_path, fp16=False)
+            return result["text"]
         return None
+
     except Exception as e:
         print(f"Audio processing error: {e}")
         return None
 
-def text_to_audio(text, rate=125, voice_idx=1):
-    """Converts text to speech using pyttsx3"""
-    try:
-        engine = pyttsx3.init()
-        voices = engine.getProperty("voices")
-        engine.setProperty("rate", rate)
-        engine.setProperty("voice", voices[voice_idx].id)
-        engine.say(text)
-        engine.runAndWait()
-    except Exception as e:
-        print(f"Text-to-speech error: {str(e)}")
-
+def text_to_audio(text):
+    """Converts text to speech using pyttsx3"""
+    engine = pyttsx3.init()
+    voices = engine.getProperty("voices")
+    engine.setProperty("rate", 125)
+    engine.setProperty("voice", voices[1].id)
+    engine.say(text)
+    engine.runAndWait()
 
 if __name__ == "__main__":
-    text_to_audio("""The key to effective software development lies in balancing technical excellence with practical solutions.
-In my experience at TechSolutions,
-I implemented this philosophy by optimizing database queries which reduced load times by 40%.
-I'm passionate about clean code and proper documentation, which has helped my teams maintain
-systems efficiently over time. I'm excited to bring these skills to your cloud-based applications.""")
+    text_to_audio("Test speech conversion")