# MultiAgenticAI / models/speech_interface.py
# Hugging Face listing: Chaitanya895 - "Update models/speech_interface.py"
# (commit dc0c0aa, verified), 26.6 kB
import io
import logging
import os
import re
import tempfile
import threading
import time
import uuid
from typing import Dict, Iterable, List, Optional

import speech_recognition as sr
import streamlit as st
from gtts import gTTS

try:
    import pyttsx3
except Exception:
    # Offline TTS is optional; the class checks for None before using it.
    pyttsx3 = None

try:
    from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration
    import av
    WEBRTC_AVAILABLE = True
except Exception:
    # Browser microphone capture is optional.
    WEBRTC_AVAILABLE = False
# Set up logging: configure the root logger (timestamp - level - message at
# INFO and above) and create the logger used throughout this module.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class SpeechInterface:
    """
    Comprehensive speech interface supporting multiple languages for farmers.

    Speech input uses the native microphone through ``speech_recognition``
    when it can be opened, and otherwise browser capture through WebRTC when
    that package was importable.  Speech output uses gTTS (online) with
    pyttsx3 (offline) as fallback.  All user feedback is rendered through
    Streamlit.
    """

    # st.session_state slot that keeps browser-captured audio between reruns.
    _WEBRTC_STATE_KEY = "speech_interface_webrtc_capture"

    # Affirmative / negative answers recognised for the crop-rotation question.
    _YES_WORDS = ('yes', 'haan', 'ಹೌದು', 'అవును', 'ஆம்', 'അതെ', 'हाँ', 'oui', 'sí')
    _NO_WORDS = ('no', 'nahi', 'ಇಲ್ಲ', 'లేదు', 'இல்லை', 'ഇല്ല', 'नहीं', 'non')

    # (canonical value, keywords) pairs, checked in order.
    _CROP_KEYWORDS = (
        ('Grains', ('grain', 'grains', 'अनाज', 'ಧಾನ್ಯ', 'ధాన్యం', 'தானியம்', 'ധാന്യം')),
        ('Vegetables', ('vegetable', 'vegetables', 'सब्जी', 'ತರಕಾರಿ', 'కూరగాయలు', 'காய்கறி', 'പച്ചക്കറി')),
        ('Fruits', ('fruit', 'fruits', 'फल', 'ಹಣ್ಣು', 'పండు', 'பழம்', 'പഴം')),
    )
    _SOIL_KEYWORDS = (
        ('Loamy', ('loamy', 'loam', 'दोमट', 'ಲೋಮಿ', 'లోమి', 'லோமி', 'ലോമി')),
        ('Sandy', ('sandy', 'sand', 'बालू', 'ಮರಳು', 'ఇసుక', 'மணல்', 'മണൽ')),
        ('Clay', ('clay', 'चिकनी', 'ಕ್ಲೇ', 'క్లే', 'களிமண்', 'കളിമണ്ണ്')),
    )

    def __init__(self):
        """Set up the recognizer, try to open the microphone and the offline TTS engine."""
        self.recognizer = sr.Recognizer()
        self.microphone = None
        self.pyaudio_available = False
        self.gtts_available = True  # gTTS is imported; availability may depend on network
        self.browser_mic_available = False
        self.tts_engine = None

        # Try to initialize microphone with error handling.  On any failure
        # the defaults above (no microphone, PyAudio unavailable) stay in place.
        try:
            self.microphone = sr.Microphone()
            self.pyaudio_available = True
            logger.info("PyAudio and microphone initialized successfully")
        except AttributeError as e:
            if "PyAudio" in str(e):
                logger.warning("PyAudio not available. Voice input will be disabled.")
            else:
                logger.error(f"Microphone initialization error: {e}")
        except Exception as e:
            logger.error(f"Unexpected error initializing microphone: {e}")

        # Language mapping for speech recognition and synthesis
        self.language_codes = {
            'English': 'en',
            'Hindi': 'hi',
            'Telugu': 'te',
            'Kannada': 'kn',
            'Tamil': 'ta',
            'Malayalam': 'ml',
            'Marathi': 'mr',
            'Bengali': 'bn',
            'Gujarati': 'gu',
            'Punjabi': 'pa',
            'Urdu': 'ur',
            'French': 'fr',
            'Spanish': 'es',
        }

        # Initialize text-to-speech engine; only keep it if it was fully configured.
        if pyttsx3 is not None:
            try:
                engine = pyttsx3.init()
                engine.setProperty('rate', 150)  # Speed of speech
                engine.setProperty('volume', 0.9)  # Volume level
                self.tts_engine = engine
            except Exception as e:
                logger.warning(f"Could not initialize TTS engine: {e}")

    def has_tts(self) -> bool:
        """Return True if any TTS option (gTTS or pyttsx3) is available."""
        return bool(self.tts_engine) or self.gtts_available

    def speech_to_text(self, language: str = 'English', timeout: int = 5) -> Optional[str]:
        """
        Convert speech to text using microphone input.

        Args:
            language: Display name of the language; unknown names fall back to 'en'.
            timeout: Seconds to wait for speech to start on the native microphone.

        Returns:
            The recognised text, or None when nothing usable was captured
            (the reason is shown to the user through Streamlit).
        """
        # Prefer native mic if available, else offer WebRTC capture
        if not (self.pyaudio_available and self.microphone is not None):
            if WEBRTC_AVAILABLE:
                return self._speech_to_text_webrtc(language)
            st.error("❌ Microphone not available. PyAudio not installed and WebRTC not available.")
            return None

        try:
            with self.microphone as source:
                # Adjust for ambient noise
                self.recognizer.adjust_for_ambient_noise(source, duration=0.5)
                st.info("🎤 Listening... Speak now!")
                audio = self.recognizer.listen(source, timeout=timeout, phrase_time_limit=10)

            st.info("🔄 Processing your speech...")
            language_code = self.language_codes.get(language, 'en')
            text = self.recognizer.recognize_google(audio, language=language_code)
            st.success(f"✅ Heard: {text}")
            return text
        except sr.WaitTimeoutError:
            st.warning("⏰ No speech detected. Please try again.")
        except sr.UnknownValueError:
            st.error("❌ Could not understand the speech. Please speak clearly.")
        except sr.RequestError as e:
            st.error(f"❌ Speech recognition service error: {e}")
        except Exception as e:
            st.error(f"❌ Unexpected error: {e}")
        return None

    def _speech_to_text_webrtc(self, language: str = 'English') -> Optional[str]:
        """
        Capture audio in-browser using WebRTC and run recognition on buffered audio.

        Returns the recognised text after the user presses the stop button,
        otherwise None.
        """
        st.info("🎤 Using browser microphone (WebRTC)")
        rtc_configuration = RTCConfiguration({
            "iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]
        })

        # Streamlit re-executes the script on every interaction, so a plain
        # local buffer would be empty again on the rerun triggered by the stop
        # button.  The buffer therefore lives in session state; the callback
        # closes over the objects themselves and never touches st.session_state.
        if self._WEBRTC_STATE_KEY not in st.session_state:
            st.session_state[self._WEBRTC_STATE_KEY] = {"frames": [], "sample_rate": 48000}
        capture = st.session_state[self._WEBRTC_STATE_KEY]
        frames: List[bytes] = capture["frames"]

        def recv_audio(frame: av.AudioFrame):
            samples = frame.to_ndarray()
            channels = len(frame.layout.channels)
            # Downmix to mono int16 expected by Recognizer.
            if samples.ndim > 1:
                if samples.shape[0] == 1 and channels > 1:
                    # Packed layout: one row with the channels interleaved.
                    samples = samples.reshape(-1, channels).mean(axis=1)
                else:
                    # Planar layout: one row per channel.
                    samples = samples.mean(axis=0)
            capture["sample_rate"] = frame.sample_rate
            frames.append(samples.astype('int16').tobytes())
            return frame

        ctx = webrtc_streamer(
            key="webrtc-audio",
            mode=WebRtcMode.SENDONLY,
            audio_receiver_size=1024,
            rtc_configuration=rtc_configuration,
            media_stream_constraints={"audio": True, "video": False},
            async_processing=False,
            audio_frame_callback=recv_audio,
        )
        self.browser_mic_available = bool(ctx and ctx.state.playing)

        if st.button("🛑 Stop & Transcribe", help="Stop capture and transcribe the recorded audio"):
            # Take what has been buffered so far and drop exactly that much,
            # so chunks appended concurrently by the callback are not lost.
            chunks = list(frames)
            del frames[:len(chunks)]
            if not chunks:
                st.warning("No audio captured")
                return None

            # Build an AudioData object for speech_recognition (2 bytes = int16).
            audio_data = sr.AudioData(b"".join(chunks), capture["sample_rate"], 2)
            try:
                language_code = self.language_codes.get(language, 'en')
                text = self.recognizer.recognize_google(audio_data, language=language_code)
                st.success(f"✅ Heard: {text}")
                return text
            except sr.UnknownValueError:
                st.error("❌ Could not understand the speech.")
            except sr.RequestError as e:
                st.error(f"❌ Speech recognition service error: {e}")
        return None

    def _speak_offline(self, text: str) -> None:
        """Speak *text* through the pyttsx3 engine (blocks until finished)."""
        self.tts_engine.say(text)
        self.tts_engine.runAndWait()

    def _speak_online(self, text: str, language: str) -> bool:
        """Synthesise *text* with gTTS and render a player plus download button.

        Falls back to the offline engine when synthesis fails.
        """
        language_code = self.language_codes.get(language, 'en')
        try:
            # Synthesise straight into memory: no temp file to clean up or leak.
            buffer = io.BytesIO()
            gTTS(text=text, lang=language_code, slow=False).write_to_fp(buffer)
            audio_bytes = buffer.getvalue()
        except Exception as e:
            logger.error(f"Error with gTTS file handling: {e}")
            # Fallback to pyttsx3
            if self.tts_engine:
                self._speak_offline(text)
                return True
            return False

        # Play audio in Streamlit
        st.audio(audio_bytes, format='audio/mp3')
        st.info("🔊 Audio generated! If you don't hear anything, check your speakers/headphones and browser audio settings.")
        # Provide download option as fallback
        st.download_button(
            label="⬇️ Download Audio File",
            data=audio_bytes,
            file_name=f"recommendation_audio_{uuid.uuid4().hex[:8]}.mp3",
            mime="audio/mp3",
            help="Download the audio file to play it in your preferred audio player",
        )
        return True

    def text_to_speech(self, text: str, language: str = 'English', use_gtts: bool = True) -> bool:
        """
        Convert text to speech using either gTTS (online) or pyttsx3 (offline).

        Args:
            text: Text to speak; empty, blank or None text is rejected with a warning.
            language: Display name of the language; unknown names fall back to 'en'.
            use_gtts: Use gTTS when True, the offline engine otherwise.

        Returns:
            True when audio was produced, False otherwise.
        """
        if not text or not text.strip():
            st.warning("⚠️ No text provided to speak")
            return False

        try:
            if use_gtts:
                # Online, better quality, supports more languages
                return self._speak_online(text, language)

            if self.tts_engine:
                # Offline, limited language support
                try:
                    self._speak_offline(text)
                    st.info("🔊 Playing audio using offline TTS engine...")
                    return True
                except Exception as e:
                    logger.error(f"Error with pyttsx3: {e}")
                    st.error(f"❌ Offline TTS error: {e}")
                    return False

            st.warning("⚠️ No TTS engine available. Please check your internet connection for online TTS or install offline TTS dependencies.")
            return False
        except Exception as e:
            logger.error(f"Error in text-to-speech: {e}")
            st.error(f"❌ Text-to-speech error: {e}")
            return False

    def create_voice_input_widget(self, label: str, language: str = 'English',
                                  key: str = None, help_text: str = None) -> Optional[str]:
        """
        Create a voice input widget for Streamlit.

        Renders a text input next to a "Voice" button.  A successful capture
        is stored in ``st.session_state["<key>_voice_result"]`` and the script
        is rerun.

        Returns:
            The current value of the text input (not the voice result).
        """
        text_key = f"{key}_text" if key else "text_input"
        voice_result_key = f"{key}_voice_result" if key else "voice_result"

        col1, col2 = st.columns([3, 1])

        # Initialize session state for voice result
        if voice_result_key not in st.session_state:
            st.session_state[voice_result_key] = ""

        with col1:
            text_input = st.text_input(label, key=text_key, help=help_text)

        captured = None
        with col2:
            if st.button("🎤 Voice", key=f"{key}_voice_btn" if key else None, help="Click to speak"):
                if not self.is_voice_available():
                    st.error("❌ Microphone not available")
                    return text_input

                with st.spinner("Preparing microphone..."):
                    time.sleep(1)  # Give time for microphone to initialize

                try:
                    captured = self.speech_to_text(language)
                except Exception as e:
                    st.error(f"❌ Voice input error: {e}")
                    logger.error(f"Voice input error: {e}")
                else:
                    if captured:
                        # Store voice input in session state
                        st.session_state[voice_result_key] = captured
                        st.success(f"✅ Voice input: {captured}")
                        st.info(f"🎤 Voice input captured: **{captured}**")
                        st.info("💡 Copy this text and paste it into the input field above")
                    else:
                        st.warning("⚠️ No voice input detected")

        if captured:
            # Rerun outside the try block so the rerun control-flow exception
            # can never be swallowed by the generic handler above.
            st.rerun()

        # Show voice result if available
        if st.session_state.get(voice_result_key):
            st.info(f"🎤 Last voice input: **{st.session_state[voice_result_key]}**")
        return text_input

    def create_voice_output_button(self, text: str, language: str = 'English',
                                   button_text: str = "🔊 Listen", key: str = None):
        """
        Create a voice output button for Streamlit that speaks *text* when clicked.
        """
        if not st.button(button_text, key=f"{key}_speak" if key else None, help="Click to hear the text"):
            return
        if not text or not text.strip():
            st.warning("⚠️ No text to speak")
            return

        with st.spinner("Generating speech..."):
            success = self.text_to_speech(text, language)
        if success:
            st.success("✅ Audio generated successfully!")
        else:
            st.error("❌ Failed to generate audio. Please try again.")

    @staticmethod
    def _first_number(text: str) -> Optional[float]:
        """Return the first decimal number found in *text*, or None."""
        match = re.search(r'\d+\.?\d*', text)
        if match is None:
            return None
        try:
            return float(match.group())
        except ValueError:
            return None

    @staticmethod
    def _contains_keyword(text: str, keywords: Iterable[str]) -> bool:
        """Return True if *text* contains any of *keywords*.

        Purely alphanumeric keywords must match as whole words, so "no" does
        not match inside "know".  Keywords containing combining marks (most
        Indic-script words) are matched as substrings because ``\\b`` does not
        treat those marks as word characters.
        """
        for keyword in keywords:
            if keyword.isalnum():
                if re.search(rf"\b{re.escape(keyword)}\b", text):
                    return True
            elif keyword in text:
                return True
        return False

    @staticmethod
    def _parse_yes_no(text: str) -> Optional[bool]:
        """Return True/False for a recognised yes/no answer, None otherwise."""
        lowered = text.lower()
        if SpeechInterface._contains_keyword(lowered, SpeechInterface._YES_WORDS):
            return True
        if SpeechInterface._contains_keyword(lowered, SpeechInterface._NO_WORDS):
            return False
        return None

    @staticmethod
    def _match_category(text: str, table) -> Optional[str]:
        """Return the first canonical value in *table* whose keyword occurs in *text*."""
        lowered = text.lower()
        for value, keywords in table:
            if any(keyword in lowered for keyword in keywords):
                return value
        return None

    def create_voice_interface_for_sustainability(self, language: str = 'English') -> Dict:
        """
        Create voice interface specifically for sustainability tracker.

        Returns:
            Dict with any of 'water_score' (float), 'fertilizer_use' (float)
            and 'rotation' (bool) that could be parsed from the inputs.
        """
        water_usage_text = self.create_voice_input_widget(
            "💧 Water Usage (ML/ha) - Voice Input",
            language=language,
            key="water_voice",
            help_text="Speak the water usage amount",
        )
        fertilizer_usage_text = self.create_voice_input_widget(
            "🧪 Fertilizer Usage (tons/ha) - Voice Input",
            language=language,
            key="fertilizer_voice",
            help_text="Speak the fertilizer usage amount",
        )
        rotation_text = self.create_voice_input_widget(
            "🔄 Crop Rotation (Yes/No) - Voice Input",
            language=language,
            key="rotation_voice",
            help_text="Say 'Yes' or 'No' for crop rotation",
        )

        # Use voice input if available, otherwise use text input
        water_usage_text = st.session_state.get("water_voice_voice_result", "") or water_usage_text
        fertilizer_usage_text = st.session_state.get("fertilizer_voice_voice_result", "") or fertilizer_usage_text
        rotation_text = st.session_state.get("rotation_voice_voice_result", "") or rotation_text

        data = {}
        if water_usage_text:
            amount = self._first_number(water_usage_text)
            if amount is None:
                st.warning("Could not parse water usage from voice input")
            else:
                data['water_score'] = amount

        if fertilizer_usage_text:
            amount = self._first_number(fertilizer_usage_text)
            if amount is None:
                st.warning("Could not parse fertilizer usage from voice input")
            else:
                data['fertilizer_use'] = amount

        if rotation_text:
            answer = self._parse_yes_no(rotation_text)
            if answer is not None:
                data['rotation'] = answer
        return data

    def create_voice_interface_for_farm_details(self, language: str = 'English') -> Dict:
        """
        Create voice interface for farm details input.

        Returns:
            Dict with any of 'land_size' (int, first whole number spoken),
            'crop_preference' and 'soil_type' that could be parsed.
        """
        farm_size_text = self.create_voice_input_widget(
            "🌾 Farm Size (hectares) - Voice Input",
            language=language,
            key="farm_size_voice",
            help_text="Speak the farm size in hectares",
        )
        crop_preference_text = self.create_voice_input_widget(
            "🌱 Crop Preference - Voice Input",
            language=language,
            key="crop_preference_voice",
            help_text="Speak your crop preference (Grains, Vegetables, Fruits)",
        )
        soil_type_text = self.create_voice_input_widget(
            "🗺️ Soil Type - Voice Input",
            language=language,
            key="soil_type_voice",
            help_text="Speak the soil type (Loamy, Sandy, Clay)",
        )

        # Use voice input if available, otherwise use text input
        farm_size_text = st.session_state.get("farm_size_voice_voice_result", "") or farm_size_text
        crop_preference_text = st.session_state.get("crop_preference_voice_voice_result", "") or crop_preference_text
        soil_type_text = st.session_state.get("soil_type_voice_voice_result", "") or soil_type_text

        data = {}

        # Debug: Show what was captured
        if farm_size_text or crop_preference_text or soil_type_text:
            st.info(f"🎤 Voice inputs captured: Farm Size='{farm_size_text}', Crop='{crop_preference_text}', Soil='{soil_type_text}'")

        if farm_size_text:
            match = re.search(r'\d+', farm_size_text)
            if match:
                data['land_size'] = int(match.group())
                st.success(f"✅ Parsed land size: {data['land_size']} hectares")
            else:
                st.warning("Could not find numbers in farm size voice input")

        if crop_preference_text:
            crop = self._match_category(crop_preference_text, self._CROP_KEYWORDS)
            if crop is None:
                st.warning(f"Could not recognize crop preference from: '{crop_preference_text}'")
            else:
                data['crop_preference'] = crop
                st.success(f"✅ Parsed crop preference: {data['crop_preference']}")

        if soil_type_text:
            soil = self._match_category(soil_type_text, self._SOIL_KEYWORDS)
            if soil is None:
                st.warning(f"Could not recognize soil type from: '{soil_type_text}'")
            else:
                data['soil_type'] = soil
                st.success(f"✅ Parsed soil type: {data['soil_type']}")
        return data

    def create_voice_help_system(self, language: str = 'English'):
        """
        Create a voice help system for farmers.

        Renders four buttons that read out help texts.  Languages without a
        translation here get the English text, spoken as English.
        """
        st.markdown("### 🎤 Voice Help System")
        help_texts = {
            'English': {
                'welcome': "Welcome to the Sustainable Farming AI Platform. You can use voice commands to interact with the system.",
                'farm_details': "To enter farm details, speak your farm size, crop preference, and soil type.",
                'sustainability': "To log sustainability data, speak your water usage, fertilizer usage, and whether you practice crop rotation.",
                'recommendations': "Click the generate recommendation button to get AI-powered farming advice based on your inputs."
            },
            'Hindi': {
                'welcome': "सस्टेनेबल फार्मिंग AI प्लेटफॉर्म में आपका स्वागत है। आप सिस्टम के साथ बातचीत करने के लिए आवाज कमांड का उपयोग कर सकते हैं।",
                'farm_details': "फार्म विवरण दर्ज करने के लिए, अपने फार्म का आकार, फसल पसंद और मिट्टी का प्रकार बोलें।",
                'sustainability': "सस्टेनेबलिटी डेटा लॉग करने के लिए, अपने पानी के उपयोग, उर्वरक के उपयोग और क्या आप फसल चक्रण का अभ्यास करते हैं, बोलें।",
                'recommendations': "अपने इनपुट के आधार पर AI-संचालित खेती सलाह प्राप्त करने के लिए सिफारिश बटन पर क्लिक करें।"
            },
            'Telugu': {
                'welcome': "సస్టైనబుల్ ఫార్మింగ్ AI ప్లాట్‌ఫారమ్‌కు స్వాగతం. మీరు సిస్టమ్‌తో ఇంటరాక్ట్ చేయడానికి వాయిస్ కమాండ్‌లను ఉపయోగించవచ్చు.",
                'farm_details': "ఫార్మ్ వివరాలను నమోదు చేయడానికి, మీ ఫార్మ్ పరిమాణం, పంట ప్రాధాన్యత మరియు నేల రకాన్ని మాట్లాడండి.",
                'sustainability': "సస్టైనబిలిటీ డేటాను లాగ్ చేయడానికి, మీ నీటి వినియోగం, ఎరువు వినియోగం మరియు మీరు పంట మార్పిడిని అభ్యసిస్తున్నారా అని మాట్లాడండి.",
                'recommendations': "మీ ఇన్‌పుట్‌ల ఆధారంగా AI-ఆధారిత వ్యవసాయ సలహా పొందడానికి సిఫారసు బటన్‌పై క్లిక్ చేయండి."
            }
        }

        # When no translation exists the English text is used, so it must
        # also be spoken with the English voice rather than the requested one.
        spoken_language = language if language in help_texts else 'English'
        help_data = help_texts[spoken_language]

        button_rows = (
            (("🔊 Listen to Welcome", "help_welcome", 'welcome'),
             ("🔊 Listen to Farm Details Help", "help_farm", 'farm_details')),
            (("🔊 Listen to Sustainability Help", "help_sustainability", 'sustainability'),
             ("🔊 Listen to Recommendations Help", "help_recommendations", 'recommendations')),
        )
        for row in button_rows:
            for column, (caption, widget_key, topic) in zip(st.columns(2), row):
                with column:
                    if st.button(caption, key=widget_key):
                        self.text_to_speech(help_data[topic], spoken_language)

    def get_supported_languages(self) -> List[str]:
        """
        Get list of supported languages (display names, in mapping order).
        """
        return list(self.language_codes)

    def is_voice_available(self) -> bool:
        """
        Check if microphone input is available: either the native microphone
        (PyAudio + microphone) or an active browser microphone via WebRTC.
        """
        if self.pyaudio_available and self.microphone is not None:
            return True
        return bool(WEBRTC_AVAILABLE and self.browser_mic_available)