Spaces:

Subbu1304
/

voice_recognize

Runtime error

App Files Files Community

Subbu1304 committed on Feb 4, 2025

Commit

ec3b8b9

verified ·

1 Parent(s): 47f1f46

Update app.py

Browse files

Files changed (1) hide show

app.py +91 -47

app.py CHANGED Viewed

@@ -1,49 +1,93 @@
-import gradio as gr
-import speech_recognition as sr
-def recognize_speech():
-    # Initialize recognizer
-    recognizer = sr.Recognizer()
-    # Microphone input
-    with sr.Microphone() as source:
-        print("Say something...")
-        recognizer.adjust_for_ambient_noise(source)
-        audio = recognizer.listen(source)
     try:
-        # Recognize speech using Google Web Speech API
-        text = recognizer.recognize_google(audio)
-        print("You said: " + text)
-        return text
-    except sr.UnknownValueError:
-        print("Sorry, I could not understand the audio.")
-        return "Sorry, I could not understand."
-    except sr.RequestError:
-        print("Could not request results from Google Speech Recognition service.")
-        return "Could not request results."
-def voice_login_interface():
-    # Step 1: Welcome message
-    welcome_text = "Welcome to Briyani Hub."
-    # Step 2: Voice input for name
-    name_prompt = "Please enter your name letter by letter."
-    name_input = recognize_speech()
-    # Step 3: Voice input for Gmail
-    gmail_prompt = "Please enter your Gmail."
-    gmail_input = recognize_speech()
-    return f"{welcome_text}\n{name_prompt} {name_input}\n{gmail_prompt} {gmail_input}"
-# Create the Gradio interface
-iface = gr.Interface(fn=voice_login_interface,
-                     inputs=[],
-                     outputs="text",
-                     live=True,
-                     title="Briyani Hub Voice Login",
-                     description="A voice-assisted login page for Briyani Hub")
-iface.launch()

+import torch
+from flask import Flask, render_template, request, jsonify
+import os
+from transformers import pipeline
+from gtts import gTTS
+from pydub import AudioSegment
+from pydub.silence import detect_nonsilent
+from waitress import serve
+app = Flask(__name__)
+# Prefer the GPU when torch reports CUDA support; otherwise stay on the CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# whisper-small is chosen for speed. The pipeline is given a device index:
+# 0 for the first GPU, -1 for CPU.
+asr_model = pipeline(
+    "automatic-speech-recognition",
+    model="openai/whisper-small",
+    device=0 if device == "cuda" else -1,
+)
+def generate_audio_prompt(text, filename):
+    """Synthesise *text* to English speech with gTTS and save it under ``static``.
+
+    Args:
+        text: The sentence to speak.
+        filename: Name of the file to write; it is joined onto ``static``.
+    """
+    # gTTS writes straight to the given path, which fails with
+    # FileNotFoundError when the folder is absent (for example a checkout
+    # with no committed static files), so make sure it exists first.
+    os.makedirs("static", exist_ok=True)
+    tts = gTTS(text=text, lang="en")
+    tts.save(os.path.join("static", filename))
+# Voice prompts, synthesised at import time: each entry is written to static/<key>.mp3 by generate_audio_prompt.
+prompts = {
+    "welcome": "Welcome to Biryani Hub.",
+    "ask_name": "Tell me your name.",
+    "ask_email": "Please provide your email address.",
+    "thank_you": "Thank you for registration."
+}
+for key, text in prompts.items():
+    generate_audio_prompt(text, f"{key}.mp3")  # one gTTS call per prompt, every time the module is loaded
+# Spoken phrase -> symbol table. NOTE(review): nothing in the visible code reads this mapping; confirm where it is applied.
+SYMBOL_MAPPING = {
+    "at the rate": "@",
+    "at": "@",  # also the first word of "at the rate"; presumably the longer phrase must be matched first - verify against the consumer
+    "dot": ".",
+    "underscore": "_",
+    "hash": "#",
+    "plus": "+",
+    "dash": "-",
+    "comma": ",",
+    "space": " "
+}
+def convert_to_wav(input_path, output_path):
+    """Re-encode the audio at *input_path* as 16 kHz mono WAV at *output_path*.
+
+    Args:
+        input_path: Path of the audio file to read.
+        output_path: Path of the WAV file to write.
+
+    Raises:
+        Exception: If the audio cannot be read or exported. The underlying
+            error is attached as ``__cause__``.
+    """
     try:
+        audio = AudioSegment.from_file(input_path)
+        audio = audio.set_frame_rate(16000).set_channels(1)  # Convert to 16kHz, mono
+        # export() hands back the open output file; close it here instead of
+        # leaving the handle for the garbage collector.
+        audio.export(output_path, format="wav").close()
+    except Exception as e:
+        # Chain explicitly so the original decoder/encoder traceback is kept.
+        raise Exception(f"Audio conversion failed: {e}") from e
+def is_silent_audio(audio_path):
+    """Return True when no non-silent stretch is found in the WAV at *audio_path*.
+
+    The silence threshold is relative to the clip itself: 16 dB below its
+    own ``dBFS`` level, with a minimum silence length of 500 ms.
+    """
+    segment = AudioSegment.from_wav(audio_path)
+    threshold = segment.dBFS - 16
+    speech_ranges = detect_nonsilent(
+        segment, min_silence_len=500, silence_thresh=threshold
+    )
+    # An empty list of ranges means nothing rose above the threshold.
+    return not speech_ranges
+@app.route("/")
+def index():
+    """Render the ``index.html`` template for the site root."""
+    return render_template("index.html")
+@app.route("/transcribe", methods=["POST"])
+def transcribe():
+    """Transcribe an uploaded audio clip with the Whisper ASR pipeline.
+
+    Expects a multipart upload in the ``audio`` form field.
+
+    Returns:
+        ``{"text": ...}`` on success; a JSON ``error`` with status 400 when
+        the upload is missing or no speech is detected; a JSON ``error``
+        with status 500 when conversion or recognition raises.
+    """
+    import tempfile  # function-scope import: the module does not import tempfile
+
+    if "audio" not in request.files:
+        return jsonify({"error": "No audio file provided"}), 400
+    audio_file = request.files["audio"]
+    # Give every request its own scratch directory. Fixed names inside the
+    # publicly served static folder let concurrent requests overwrite each
+    # other's audio, left uploads downloadable, and were never cleaned up.
+    with tempfile.TemporaryDirectory() as work_dir:
+        # Same base names as before, so extension-based format detection is unchanged.
+        input_audio_path = os.path.join(work_dir, "temp_input.wav")
+        output_audio_path = os.path.join(work_dir, "temp.wav")
+        audio_file.save(input_audio_path)
+        try:
+            # Convert to WAV
+            convert_to_wav(input_audio_path, output_audio_path)
+            # Check for silence
+            if is_silent_audio(output_audio_path):
+                return jsonify({"error": "No speech detected. Please try again."}), 400
+            # Use Whisper ASR model for transcription
+            result = asr_model(output_audio_path, generate_kwargs={"language": "en"})
+            # NOTE: str.capitalize() also lower-cases everything after the first character.
+            transcribed_text = result["text"].strip().capitalize()
+            return jsonify({"text": transcribed_text})
+        except Exception as e:
+            return jsonify({"error": f"Speech recognition error: {str(e)}"}), 500
+# Entry point when run as a script: serve the Flask app with waitress on all interfaces, port 7860.
+if __name__ == "__main__":
+    serve(app, host="0.0.0.0", port=7860)