Spaces:

MCP-1st-Birthday
/

AI-RADIO

Sleeping

File size: 8,457 Bytes

"""Voice Input Service for Speech Recognition"""
import io
import re
from typing import Optional, Callable

import speech_recognition as sr

class VoiceInputService:
    """Service for handling voice input and speech recognition.

    Wraps a ``speech_recognition`` recognizer/microphone pair and turns a
    transcribed utterance into a structured song request.

    Attributes (all set in ``__init__``):
        recognizer: ``sr.Recognizer`` used for listening and transcription.
        microphone: ``sr.Microphone``, or ``None`` if it could not be created.
        available: ``True`` only if the microphone was opened and calibrated.
    """

    # Phrases removed from a request to isolate the actual query. Only the
    # first phrase (in this order) that occurs is removed, so overlapping
    # phrases are listed longest first.
    _ACTION_PHRASES = (
        "i want to hear", "i want to", "want to hear",
        "i'd like to hear", "i would like to hear",
        "play", "put on", "listen to", "i want",
        "can you", "please", "i'd like", "i would like",
    )
    _PLAY_MARKERS = ("play", "put on", "listen to", "want to hear")
    _SKIP_MARKERS = ("skip", "next", "change")
    # Standalone words dropped from the query after the action phrase is gone.
    _FILLER_WORDS = frozenset({"i", "a", "an", "the"})
    _KNOWN_GENRES = (
        "pop", "rock", "jazz", "classical", "electronic", "hip-hop",
        "hip hop", "country", "indie", "rap", "blues", "folk",
    )
    # A word directly in front of one of these is treated as a custom genre.
    _GENRE_INDICATORS = frozenset({"music", "song", "track", "tune"})
    # Words that are never a genre: action/filler words, plus determiners and
    # skip verbs so "skip this song" does not yield the genre "this".
    _NON_GENRE_WORDS = frozenset({
        "i", "want", "to", "hear", "play", "put", "on", "listen", "some",
        "a", "an", "the", "me", "my",
        "this", "that", "any", "another", "more", "next", "skip", "change",
    })
    # "some <word>" / "any <word>" marks <word> as a likely genre.
    _GENRE_LEAD_INS = frozenset({"some", "any"})
    _MOODS = (
        "happy", "sad", "energetic", "calm", "relaxed", "focused",
        "upbeat", "chill",
    )

    def __init__(self):
        """Initialize the recognizer and try to open the default microphone.

        Failures are reported on stdout and leave ``available`` as ``False``
        instead of raising, so the service can still be constructed on
        machines without audio input.
        """
        self.recognizer = sr.Recognizer()
        self.microphone = None
        self.available = False

        try:
            # Try to initialize microphone (requires pyaudio)
            self.microphone = sr.Microphone()
            # Adjust for ambient noise
            with self.microphone as source:
                self.recognizer.adjust_for_ambient_noise(source, duration=0.5)
        except (OSError, AttributeError) as e:
            # SpeechRecognition reports a missing PyAudio install as
            # AttributeError, so it gets the same install hint as OSError.
            print(f"Warning: Could not initialize microphone: {e}")
            print("Voice input will not be available")
            print("To enable voice input, install PortAudio:")
            print("  macOS: brew install portaudio")
            print("  Linux: sudo apt-get install portaudio19-dev")
            print("  Then: pip install pyaudio")
        except Exception as e:
            print(f"Warning: Could not initialize microphone: {e}")
            print("Voice input will not be available")
        else:
            self.available = True

    def listen_and_recognize(self, timeout: int = 5, phrase_time_limit: int = 10) -> Optional[str]:
        """
        Listen to microphone and recognize speech

        Args:
            timeout: Maximum time to wait for speech to start
            phrase_time_limit: Maximum time for a phrase

        Returns:
            Recognized text, or None if the microphone is unavailable or
            any listening/recognition error occurs (errors are printed).
        """
        if not self.available or not self.microphone:
            return None

        try:
            with self.microphone as source:
                print("Listening... Speak now!")
                audio = self.recognizer.listen(
                    source,
                    timeout=timeout,
                    phrase_time_limit=phrase_time_limit
                )

            print("Processing speech...")
            # Use Google's free speech recognition API
            text = self.recognizer.recognize_google(audio)
            print(f"Recognized: {text}")
            return text

        except sr.WaitTimeoutError:
            print("No speech detected within timeout")
            return None
        except sr.UnknownValueError:
            print("Could not understand audio")
            return None
        except sr.RequestError as e:
            print(f"Error with speech recognition service: {e}")
            return None
        except Exception as e:
            print(f"Error during voice recognition: {e}")
            return None

    @staticmethod
    def _phrase_regex(phrase):
        """Return regex source matching *phrase* only as whole word(s)."""
        return r"(?<!\w)" + re.escape(phrase) + r"(?!\w)"

    @classmethod
    def _has_phrase(cls, text, phrase):
        """Return True if *phrase* occurs in *text* as whole word(s)."""
        return re.search(cls._phrase_regex(phrase), text) is not None

    @classmethod
    def _remove_phrase(cls, text, phrase):
        """Remove whole-word occurrences of *phrase* and collapse whitespace."""
        return " ".join(re.sub(cls._phrase_regex(phrase), " ", text).split())

    @classmethod
    def _strip_action_phrase(cls, text_lower):
        """Remove the first matching action phrase (only one) from the text."""
        for phrase in cls._ACTION_PHRASES:
            if cls._has_phrase(text_lower, phrase):
                return cls._remove_phrase(text_lower, phrase)
        return " ".join(text_lower.split())

    @classmethod
    def _detect_action(cls, text_lower):
        """Return "skip" for a skip request without a play phrase, else "play"."""
        # Substring matching is kept on purpose so inflected forms such as
        # "skipping" or "playing" still count.
        wants_play = any(marker in text_lower for marker in cls._PLAY_MARKERS)
        wants_skip = any(marker in text_lower for marker in cls._SKIP_MARKERS)
        return "skip" if wants_skip and not wants_play else "play"

    @classmethod
    def _detect_genre(cls, text_lower, words):
        """Return a known genre, a heuristically guessed one, or None."""
        for genre in cls._KNOWN_GENRES:
            if cls._has_phrase(text_lower, genre):
                return genre

        for i, word in enumerate(words):
            if word in cls._NON_GENRE_WORDS or word in cls._GENRE_INDICATORS:
                continue
            next_is_indicator = (
                i + 1 < len(words) and words[i + 1] in cls._GENRE_INDICATORS
            )
            if next_is_indicator:
                # "<word> music" / "<word> song": the word is likely a genre.
                if len(word) > 2:
                    return word
            elif (
                len(word) > 3
                and word[0].isalpha()
                and i > 0
                and words[i - 1] in cls._GENRE_LEAD_INS
            ):
                # "some <word>" / "any <word>": likely a genre as well.
                return word
        return None

    def process_song_request(self, recognized_text: str) -> dict:
        """
        Process a song request from recognized speech

        Keyword matching is done on whole words, so "popular" is not read
        as the genre "pop" and "baby" does not count as the separator "by".

        Args:
            recognized_text: Text recognized from speech. ``None`` or blank
                text yields a default "play" request with no details.

        Returns:
            Dictionary with the keys "original_text", "action" ("play" or
            "skip"), "song", "artist", "genre" and "mood". Details that
            could not be extracted are None.
        """
        request = {
            "original_text": recognized_text,
            "action": None,
            "song": None,
            "artist": None,
            "genre": None,
            "mood": None
        }

        # Nothing to parse: return the default action instead of crashing.
        if not recognized_text or not recognized_text.strip():
            request["action"] = "play"
            request["song"] = recognized_text
            return request

        text_lower = recognized_text.lower()
        # Query without the action phrase; articles are kept here so an
        # artist such as "the beatles" survives intact.
        stripped_text = self._strip_action_phrase(text_lower)
        cleaned_text = " ".join(
            word for word in stripped_text.split()
            if word not in self._FILLER_WORDS
        )

        request["action"] = self._detect_action(text_lower)

        # Genre and mood are detected before the song text is built so they
        # can be removed from it below.
        request["genre"] = self._detect_genre(text_lower, cleaned_text.split())
        request["mood"] = next(
            (mood for mood in self._MOODS if self._has_phrase(text_lower, mood)),
            None,
        )

        # "<song> by <artist>": split on the last standalone "by", because
        # song titles contain "by" more often than artist names do.
        before, separator, after = f" {stripped_text} ".rpartition(" by ")
        artist = after.strip()
        if separator and artist:
            request["song"] = before.strip() or None
            request["artist"] = artist
            return request

        # No artist given: the cleaned text is the song/query, minus the
        # genre/mood words that were already extracted.
        song_text = cleaned_text if cleaned_text else recognized_text
        trimmed = song_text
        for tag in (request["genre"], request["mood"]):
            if tag:
                trimmed = self._remove_phrase(trimmed, tag)
        trimmed = " ".join(trimmed.split())
        # If only genre/mood words were said, keep them as the query.
        request["song"] = trimmed if trimmed else song_text
        return request