Kunjan Shah committed on
Commit
1dd415e
·
1 Parent(s): e1c31bf

Added Whisper Speech-To-Text

Browse files
Files changed (2) hide show
  1. core/input_comp_gen.py +38 -31
  2. core/speech_converter.py +39 -75
core/input_comp_gen.py CHANGED
@@ -3,10 +3,13 @@ import json
3
  from typing import Dict, Any, List, Tuple
4
  from model import generate_response
5
  from utils import FileProcessor
6
- from speech_converter import audio_to_text, text_to_audio
7
  import tempfile
8
  import os
 
9
 
 
 
10
 
11
  def generate_question(resume_text, job_desc_text, job_role):
12
  system_prompt = "You are an experienced technical interviewer and JSON generator. You create professional interview questions and respond only with valid JSON arrays. Never include explanations, markdown formatting, or any text outside the JSON array."
@@ -221,6 +224,7 @@ if st.session_state.processing_complete and st.session_state.questions:
221
  col1, col2 = st.columns(2)
222
  with col1:
223
  st.markdown("**👤 Your Answer:**")
 
224
  current_answer = st.session_state.user_answers.get(qn, "")
225
  answer_submitted = qn in st.session_state.user_answers
226
 
@@ -240,39 +244,42 @@ if st.session_state.processing_complete and st.session_state.questions:
240
  st.markdown("🎤 **Record your answer:**")
241
  audio_file = st.audio_input("Record audio", key=f"audio_input_{qn}")
242
 
243
- # Process audio if available
244
  if audio_file is not None:
245
- #with st.spinner("🎧 Converting speech to text..."):
246
- try:
247
- # Save audio to temporary file
248
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
249
- # Read bytes from UploadedFile object
250
- audio_file.seek(0) # Reset file pointer
251
- tmp_file.write(audio_file.read())
252
- tmp_file_path = tmp_file.name
253
-
254
- # Convert audio to text using speech_converter
255
- transcribed_text = audio_to_text(tmp_file_path)
256
-
257
- # Clean up temporary file
258
- os.unlink(tmp_file_path)
259
 
260
- if transcribed_text:
261
- st.session_state[f"transcribed_text_{qn}"] = transcribed_text
262
- with st.sidebar:
263
- st.success("✅ Audio transcribed successfully!")
264
- else:
265
- with st.sidebar:
266
- st.error("❌ Could not understand the audio. Please try again.")
267
-
268
- except Exception as e:
 
 
 
 
 
 
 
269
  with st.sidebar:
270
- st.error(f"❌ Audio processing error: {str(e)}")
271
- if 'tmp_file_path' in locals():
272
- try:
273
- os.unlink(tmp_file_path)
274
- except:
275
- pass
 
 
 
 
276
 
277
  # Text area with transcribed text or manual input
278
  initial_text = st.session_state.get(f"transcribed_text_{qn}", current_answer)
 
3
  from typing import Dict, Any, List, Tuple
4
  from model import generate_response
5
  from utils import FileProcessor
6
+ from speech_converter import audio_to_text, text_to_audio, load_model
7
  import tempfile
8
  import os
9
+ import torch
10
 
11
+ # Initialize the model through speech_converter
12
+ whisper_model = load_model()
13
 
14
  def generate_question(resume_text, job_desc_text, job_role):
15
  system_prompt = "You are an experienced technical interviewer and JSON generator. You create professional interview questions and respond only with valid JSON arrays. Never include explanations, markdown formatting, or any text outside the JSON array."
 
224
  col1, col2 = st.columns(2)
225
  with col1:
226
  st.markdown("**👤 Your Answer:**")
227
+ # Check if answer already submitted
228
  current_answer = st.session_state.user_answers.get(qn, "")
229
  answer_submitted = qn in st.session_state.user_answers
230
 
 
244
  st.markdown("🎤 **Record your answer:**")
245
  audio_file = st.audio_input("Record audio", key=f"audio_input_{qn}")
246
 
247
+ # Process audio if available using Whisper
248
  if audio_file is not None:
249
+ try:
250
+ # Save audio to temporary file
251
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
252
+ # Read bytes from UploadedFile object
253
+ audio_file.seek(0) # Reset file pointer
254
+ audio_data = audio_file.read()
 
 
 
 
 
 
 
 
255
 
256
+ # Write raw audio data to temporary file
257
+ tmp_file.write(audio_data)
258
+ tmp_file_path = tmp_file.name
259
+
260
+ # Convert audio to text using Whisper directly
261
+ result = whisper_model.transcribe(tmp_file_path)
262
+ transcribed_text = result["text"]
263
+
264
+ # Clean up temporary file
265
+ os.unlink(tmp_file_path)
266
+
267
+ if transcribed_text:
268
+ st.session_state[f"transcribed_text_{qn}"] = transcribed_text
269
+ with st.sidebar:
270
+ st.success("✅ Audio transcribed successfully!")
271
+ else:
272
  with st.sidebar:
273
+ st.error("❌ Could not understand the audio. Please try again.")
274
+
275
+ except Exception as e:
276
+ with st.sidebar:
277
+ st.error(f"❌ Audio processing error: {str(e)}")
278
+ if 'tmp_file_path' in locals():
279
+ try:
280
+ os.unlink(tmp_file_path)
281
+ except:
282
+ pass
283
 
284
  # Text area with transcribed text or manual input
285
  initial_text = st.session_state.get(f"transcribed_text_{qn}", current_answer)
core/speech_converter.py CHANGED
@@ -1,88 +1,52 @@
1
- import speech_recognition as sr
2
- import sounddevice as sd
3
- import wavio as wv
4
- import os
5
- import time
6
  import pyttsx3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  def audio_to_text(audio_file_path=None):
9
- """
10
- Converts audio file to text using speech recognition.
11
- If audio_file_path is provided, uses that file. Otherwise captures from microphone.
12
- Returns the transcribed text or None if error.
13
- """
14
- r = sr.Recognizer()
15
- text = None
16
 
17
  try:
18
  if audio_file_path:
19
- # Use provided audio file
20
- with sr.AudioFile(audio_file_path) as source:
21
- audio = r.record(source)
22
- else:
23
- # Original microphone capture logic
24
- freq = 44100
25
- duration = 180
26
- channels = 1
27
- recording_file = "recording.wav"
28
-
29
- print("🎤 You can answer now")
30
- print(f"Recording will automatically stop after {duration} seconds")
31
-
32
- recording = sd.rec(int(duration*freq), samplerate=freq, channels=channels)
33
-
34
- for i in range(duration):
35
- time.sleep(1)
36
- seconds_left = duration - i - 1
37
- print(f"⏱️ {seconds_left} seconds remaining...", end="\r")
38
-
39
- print("Thank you for your answer")
40
- sd.wait()
41
- wv.write(recording_file, recording, freq, sampwidth=2)
42
-
43
- with sr.AudioFile(recording_file) as source:
44
- audio = r.record(source)
45
-
46
- # Clean up recording file
47
- if os.path.exists(recording_file):
48
- os.remove(recording_file)
49
-
50
- text = r.recognize_google(audio)
51
- return text
52
-
53
- except sr.UnknownValueError:
54
- print("Sorry, I couldn't understand what you said. Please try again.")
55
- return None
56
- except sr.RequestError as e:
57
- print(f"Speech recognition service error: {e}")
58
  return None
 
59
  except Exception as e:
60
  print(f"Audio processing error: {e}")
61
  return None
62
 
63
- def text_to_audio(text: str, rate: int = 125, voice_idx: int = 0) -> None:
64
- """
65
- Converts text to speech using pyttsx3
66
- Args:
67
- text: Text to convert to speech
68
- rate: Speech rate (default: 125)
69
- voice_idx: Voice index to use (default: 1 for female voice)
70
- """
71
- try:
72
- engine = pyttsx3.init()
73
- voices = engine.getProperty("voices")
74
- engine.setProperty("rate", rate)
75
- engine.setProperty("voice", voices[voice_idx].id)
76
- engine.say(text)
77
- engine.runAndWait()
78
- except Exception as e:
79
- print(f"Text-to-speech error: {str(e)}")
80
-
81
 
82
  if __name__ == "__main__":
83
- audio_to_text()
84
- text_to_audio("""The key to effective software development lies in balancing technical excellence with practical solutions.
85
- In my experience at TechSolutions,
86
- I implemented this philosophy by optimizing database queries which reduced load times by 40%.
87
- I'm passionate about clean code and proper documentation, which has helped my teams maintain
88
- systems efficiently over time. I'm excited to bring these skills to your cloud-based applications.""")
 
1
+ import whisper
2
+ import streamlit as st
 
 
 
3
  import pyttsx3
4
+ import os
5
+ import torch
6
+
7
+ # Set PyTorch settings to avoid thread/loop errors
8
+ torch.set_num_threads(1)
9
+ torch.set_num_interop_threads(1)
10
+
11
@st.cache_resource(show_spinner="Loading speech recognition model...")
def load_model():
    """Load and cache the Whisper "base" speech-recognition model.

    Cached with st.cache_resource so the model is loaded once per
    Streamlit server process and shared across sessions/reruns.

    Returns:
        The loaded Whisper model, or None when loading fails.
    """
    try:
        # Pin inference to CPU and keep the checkpoint in memory;
        # weights are downloaded into the local "models" directory.
        return whisper.load_model(
            "base",
            device="cpu",
            download_root="models",
            in_memory=True,
        )
    except Exception as e:
        # Degrade gracefully — callers are expected to handle None.
        print(f"Model loading error: {e}")
        return None
25
 
26
def audio_to_text(audio_file_path=None):
    """Transcribe an audio file to text using Whisper.

    Args:
        audio_file_path: Path to the audio file to transcribe. When
            omitted or falsy, no transcription is attempted.

    Returns:
        The transcript string, or None when no path was given, the
        model could not be loaded, or transcription failed.
    """
    model = load_model()
    if model is None:
        return None

    try:
        # Guard clause: nothing to do without an input file.
        if not audio_file_path:
            return None
        # fp16=False keeps inference in float32, which is required on CPU.
        transcription = model.transcribe(audio_file_path, fp16=False)
        return transcription["text"]

    except Exception as e:
        print(f"Audio processing error: {e}")
        return None
41
 
42
def text_to_audio(text, rate=125, voice_idx=1):
    """Convert text to speech and play it using pyttsx3.

    Fixes two regressions versus the previous implementation: the
    hardcoded ``voices[1].id`` raised IndexError on systems exposing
    fewer than two voices, and all error handling had been dropped.
    The one-argument call used elsewhere (``text_to_audio(text)``)
    keeps its exact behavior via the defaults.

    Args:
        text: Text to speak.
        rate: Speech rate in words per minute (default 125).
        voice_idx: Index into the installed voices (default 1). Falls
            back to the engine's default voice when out of range.
    """
    try:
        engine = pyttsx3.init()
        engine.setProperty("rate", rate)
        voices = engine.getProperty("voices")
        # Only select a specific voice if the index actually exists;
        # otherwise keep the engine's default instead of crashing.
        if voices and 0 <= voice_idx < len(voices):
            engine.setProperty("voice", voices[voice_idx].id)
        engine.say(text)
        engine.runAndWait()
    except Exception as e:
        # Best-effort playback: report rather than crash the app.
        print(f"Text-to-speech error: {e}")
 
 
 
 
 
 
 
 
 
 
50
 
51
if __name__ == "__main__":
    # Manual smoke test for the text-to-speech path.
    text_to_audio("Test speech conversion")