Kunjan Shah committed on
Commit
1dd415e
·
1 Parent(s): e1c31bf

Added Whisper Speech-To-Text

Browse files
Files changed (2) hide show
  1. core/input_comp_gen.py +38 -31
  2. core/speech_converter.py +39 -75
core/input_comp_gen.py CHANGED
@@ -3,10 +3,13 @@ import json
3
  from typing import Dict, Any, List, Tuple
4
  from model import generate_response
5
  from utils import FileProcessor
6
- from speech_converter import audio_to_text, text_to_audio
7
  import tempfile
8
  import os
 
9
 
 
 
10
 
11
  def generate_question(resume_text, job_desc_text, job_role):
12
  system_prompt = "You are an experienced technical interviewer and JSON generator. You create professional interview questions and respond only with valid JSON arrays. Never include explanations, markdown formatting, or any text outside the JSON array."
@@ -221,6 +224,7 @@ if st.session_state.processing_complete and st.session_state.questions:
221
  col1, col2 = st.columns(2)
222
  with col1:
223
  st.markdown("**👤 Your Answer:**")
 
224
  current_answer = st.session_state.user_answers.get(qn, "")
225
  answer_submitted = qn in st.session_state.user_answers
226
 
@@ -240,39 +244,42 @@ if st.session_state.processing_complete and st.session_state.questions:
240
  st.markdown("🎤 **Record your answer:**")
241
  audio_file = st.audio_input("Record audio", key=f"audio_input_{qn}")
242
 
243
- # Process audio if available
244
  if audio_file is not None:
245
- #with st.spinner("🎧 Converting speech to text..."):
246
- try:
247
- # Save audio to temporary file
248
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
249
- # Read bytes from UploadedFile object
250
- audio_file.seek(0) # Reset file pointer
251
- tmp_file.write(audio_file.read())
252
- tmp_file_path = tmp_file.name
253
-
254
- # Convert audio to text using speech_converter
255
- transcribed_text = audio_to_text(tmp_file_path)
256
-
257
- # Clean up temporary file
258
- os.unlink(tmp_file_path)
259
 
260
- if transcribed_text:
261
- st.session_state[f"transcribed_text_{qn}"] = transcribed_text
262
- with st.sidebar:
263
- st.success("✅ Audio transcribed successfully!")
264
- else:
265
- with st.sidebar:
266
- st.error("❌ Could not understand the audio. Please try again.")
267
-
268
- except Exception as e:
 
 
 
 
 
 
 
269
  with st.sidebar:
270
- st.error(f"❌ Audio processing error: {str(e)}")
271
- if 'tmp_file_path' in locals():
272
- try:
273
- os.unlink(tmp_file_path)
274
- except:
275
- pass
 
 
 
 
276
 
277
  # Text area with transcribed text or manual input
278
  initial_text = st.session_state.get(f"transcribed_text_{qn}", current_answer)
 
3
  from typing import Dict, Any, List, Tuple
4
  from model import generate_response
5
  from utils import FileProcessor
6
+ from speech_converter import audio_to_text, text_to_audio, load_model
7
  import tempfile
8
  import os
9
+ import torch
10
 
11
+ # Initialize the model through speech_converter
12
+ whisper_model = load_model()
13
 
14
  def generate_question(resume_text, job_desc_text, job_role):
15
  system_prompt = "You are an experienced technical interviewer and JSON generator. You create professional interview questions and respond only with valid JSON arrays. Never include explanations, markdown formatting, or any text outside the JSON array."
 
224
  col1, col2 = st.columns(2)
225
  with col1:
226
  st.markdown("**👤 Your Answer:**")
227
+ # Check if answer already submitted
228
  current_answer = st.session_state.user_answers.get(qn, "")
229
  answer_submitted = qn in st.session_state.user_answers
230
 
 
244
  st.markdown("🎤 **Record your answer:**")
245
  audio_file = st.audio_input("Record audio", key=f"audio_input_{qn}")
246
 
247
+ # Process audio if available using Whisper
248
  if audio_file is not None:
249
+ try:
250
+ # Save audio to temporary file
251
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
252
+ # Read bytes from UploadedFile object
253
+ audio_file.seek(0) # Reset file pointer
254
+ audio_data = audio_file.read()
 
 
 
 
 
 
 
 
255
 
256
+ # Write raw audio data to temporary file
257
+ tmp_file.write(audio_data)
258
+ tmp_file_path = tmp_file.name
259
+
260
+ # Convert audio to text using Whisper directly
261
+ result = whisper_model.transcribe(tmp_file_path)
262
+ transcribed_text = result["text"]
263
+
264
+ # Clean up temporary file
265
+ os.unlink(tmp_file_path)
266
+
267
+ if transcribed_text:
268
+ st.session_state[f"transcribed_text_{qn}"] = transcribed_text
269
+ with st.sidebar:
270
+ st.success("✅ Audio transcribed successfully!")
271
+ else:
272
  with st.sidebar:
273
+ st.error("❌ Could not understand the audio. Please try again.")
274
+
275
+ except Exception as e:
276
+ with st.sidebar:
277
+ st.error(f"❌ Audio processing error: {str(e)}")
278
+ if 'tmp_file_path' in locals():
279
+ try:
280
+ os.unlink(tmp_file_path)
281
+ except:
282
+ pass
283
 
284
  # Text area with transcribed text or manual input
285
  initial_text = st.session_state.get(f"transcribed_text_{qn}", current_answer)
core/speech_converter.py CHANGED
@@ -1,88 +1,52 @@
1
- import speech_recognition as sr
2
- import sounddevice as sd
3
- import wavio as wv
4
- import os
5
- import time
6
  import pyttsx3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  def audio_to_text(audio_file_path=None):
9
- """
10
- Converts audio file to text using speech recognition.
11
- If audio_file_path is provided, uses that file. Otherwise captures from microphone.
12
- Returns the transcribed text or None if error.
13
- """
14
- r = sr.Recognizer()
15
- text = None
16
 
17
  try:
18
  if audio_file_path:
19
- # Use provided audio file
20
- with sr.AudioFile(audio_file_path) as source:
21
- audio = r.record(source)
22
- else:
23
- # Original microphone capture logic
24
- freq = 44100
25
- duration = 180
26
- channels = 1
27
- recording_file = "recording.wav"
28
-
29
- print("🎤 You can answer now")
30
- print(f"Recording will automatically stop after {duration} seconds")
31
-
32
- recording = sd.rec(int(duration*freq), samplerate=freq, channels=channels)
33
-
34
- for i in range(duration):
35
- time.sleep(1)
36
- seconds_left = duration - i - 1
37
- print(f"⏱️ {seconds_left} seconds remaining...", end="\r")
38
-
39
- print("Thank you for your answer")
40
- sd.wait()
41
- wv.write(recording_file, recording, freq, sampwidth=2)
42
-
43
- with sr.AudioFile(recording_file) as source:
44
- audio = r.record(source)
45
-
46
- # Clean up recording file
47
- if os.path.exists(recording_file):
48
- os.remove(recording_file)
49
-
50
- text = r.recognize_google(audio)
51
- return text
52
-
53
- except sr.UnknownValueError:
54
- print("Sorry, I couldn't understand what you said. Please try again.")
55
- return None
56
- except sr.RequestError as e:
57
- print(f"Speech recognition service error: {e}")
58
  return None
 
59
  except Exception as e:
60
  print(f"Audio processing error: {e}")
61
  return None
62
 
63
- def text_to_audio(text: str, rate: int = 125, voice_idx: int = 0) -> None:
64
- """
65
- Converts text to speech using pyttsx3
66
- Args:
67
- text: Text to convert to speech
68
- rate: Speech rate (default: 125)
69
- voice_idx: Voice index to use (default: 1 for female voice)
70
- """
71
- try:
72
- engine = pyttsx3.init()
73
- voices = engine.getProperty("voices")
74
- engine.setProperty("rate", rate)
75
- engine.setProperty("voice", voices[voice_idx].id)
76
- engine.say(text)
77
- engine.runAndWait()
78
- except Exception as e:
79
- print(f"Text-to-speech error: {str(e)}")
80
-
81
 
82
  if __name__ == "__main__":
83
- audio_to_text()
84
- text_to_audio("""The key to effective software development lies in balancing technical excellence with practical solutions.
85
- In my experience at TechSolutions,
86
- I implemented this philosophy by optimizing database queries which reduced load times by 40%.
87
- I'm passionate about clean code and proper documentation, which has helped my teams maintain
88
- systems efficiently over time. I'm excited to bring these skills to your cloud-based applications.""")
 
1
+ import whisper
2
+ import streamlit as st
 
 
 
3
  import pyttsx3
4
+ import os
5
+ import torch
6
+
7
+ # Set PyTorch settings to avoid thread/loop errors
8
+ torch.set_num_threads(1)
9
+ torch.set_num_interop_threads(1)
10
+
11
@st.cache_resource(show_spinner="Loading speech recognition model...")
def load_model():
    """Load and cache the Whisper "base" speech-recognition model.

    Cached with st.cache_resource so the model is loaded once per
    Streamlit server process and shared across sessions/reruns.

    Returns:
        The loaded Whisper model, or None when loading fails.
    """
    try:
        # Pin inference to CPU and keep the checkpoint in memory;
        # weights are downloaded into the local "models" directory.
        return whisper.load_model(
            "base",
            device="cpu",
            download_root="models",
            in_memory=True,
        )
    except Exception as e:
        # Degrade gracefully — callers are expected to handle None.
        print(f"Model loading error: {e}")
        return None
25
 
26
def audio_to_text(audio_file_path=None):
    """Transcribe an audio file to text using Whisper.

    Args:
        audio_file_path: Path to the audio file to transcribe. When
            omitted or falsy, no transcription is attempted.

    Returns:
        The transcript string, or None when no path was given, the
        model could not be loaded, or transcription failed.
    """
    model = load_model()
    if model is None:
        return None

    try:
        # Guard clause: nothing to do without an input file.
        if not audio_file_path:
            return None
        # fp16=False keeps inference in float32, which is required on CPU.
        transcription = model.transcribe(audio_file_path, fp16=False)
        return transcription["text"]

    except Exception as e:
        print(f"Audio processing error: {e}")
        return None
41
 
42
def text_to_audio(text, rate=125, voice_idx=1):
    """Convert text to speech and play it using pyttsx3.

    Fixes two regressions versus the previous implementation: the
    hardcoded ``voices[1].id`` raised IndexError on systems exposing
    fewer than two voices, and all error handling had been dropped.
    The one-argument call used elsewhere (``text_to_audio(text)``)
    keeps its exact behavior via the defaults.

    Args:
        text: Text to speak.
        rate: Speech rate in words per minute (default 125).
        voice_idx: Index into the installed voices (default 1). Falls
            back to the engine's default voice when out of range.
    """
    try:
        engine = pyttsx3.init()
        engine.setProperty("rate", rate)
        voices = engine.getProperty("voices")
        # Only select a specific voice if the index actually exists;
        # otherwise keep the engine's default instead of crashing.
        if voices and 0 <= voice_idx < len(voices):
            engine.setProperty("voice", voices[voice_idx].id)
        engine.say(text)
        engine.runAndWait()
    except Exception as e:
        # Best-effort playback: report rather than crash the app.
        print(f"Text-to-speech error: {e}")
 
 
 
 
 
 
 
 
 
 
50
 
51
if __name__ == "__main__":
    # Manual smoke test for the text-to-speech path.
    text_to_audio("Test speech conversion")