""" Voice Processing Service for the AI Chatbot with Reusable Intelligence Handles the cleaning and intent extraction from raw voice-to-text strings """ import asyncio import json from typing import Dict, Optional, Tuple from dataclasses import dataclass import uuid from datetime import datetime @dataclass class VoiceProcessingResult: """Data class for voice processing results""" cleaned_text: str extracted_intent: str confidence_score: float processing_time: float original_audio_path: Optional[str] = None class VoiceProcessingService: """Service class for processing voice input and extracting intent""" def __init__(self): # In a real implementation, this would initialize speech recognition models # For now, we'll simulate processing pass async def process_voice_input(self, raw_text: str, audio_path: Optional[str] = None) -> VoiceProcessingResult: """ Process raw voice-to-text input to clean text and extract intent """ start_time = datetime.now() # Clean the raw text cleaned_text = await self._clean_text(raw_text) # Extract intent from the cleaned text extracted_intent, confidence_score = await self._extract_intent(cleaned_text) end_time = datetime.now() processing_time = (end_time - start_time).total_seconds() return VoiceProcessingResult( cleaned_text=cleaned_text, extracted_intent=extracted_intent, confidence_score=confidence_score, processing_time=processing_time, original_audio_path=audio_path ) async def _clean_text(self, raw_text: str) -> str: """ Clean raw voice-to-text output Removes filler words, corrects common speech-to-text errors """ # Remove common filler words and normalize cleaned = raw_text.lower().strip() # Common speech-to-text corrections corrections = { "umm": "", "uh": "", "uhh": "", "ah": "", "like": "", "you know": "", "right": "", "okay": "", "so": "", } for word, replacement in corrections.items(): cleaned = cleaned.replace(word, replacement) # Remove extra whitespace cleaned = ' '.join(cleaned.split()) # Capitalize first letter if cleaned: cleaned = cleaned[0].upper() + cleaned[1:] if len(cleaned) > 1 else cleaned.upper() return cleaned async def _extract_intent(self, text: str) -> Tuple[str, float]: """ Extract intent from cleaned text with confidence score """ text_lower = text.lower() # Define common intents and their keywords intents = { "task_add": { "keywords": ["add", "create", "make", "new", "task", "kam", "bnao", "shamil"], "confidence_boost_keywords": ["add task", "create task", "kam shamil"] }, "task_list": { "keywords": ["list", "show", "display", "dikhao", "list karo", "kya hai"], "confidence_boost_keywords": ["show tasks", "list tasks", "kam dikhao"] }, "task_complete": { "keywords": ["complete", "done", "finish", "hogaya", "ho gaya", "khatam"], "confidence_boost_keywords": ["mark done", "complete task", "kam khatam"] }, "task_delete": { "keywords": ["delete", "remove", "delete", "hatado", "nikalo", "khatam"], "confidence_boost_keywords": ["delete task", "remove task", "kam hatao"] }, "greeting": { "keywords": ["hello", "hi", "hey", "helo", "kese ho", "kaia hal", "assalam"], "confidence_boost_keywords": ["hello there", "hi there", "helo"] }, "question": { "keywords": ["what", "how", "why", "kya", "kese", "kyun", "kaia"], "confidence_boost_keywords": ["what is", "how to", "kya hai", "kese"] }, "affirmation": { "keywords": ["yes", "yeah", "sure", "jeee", "haan", "jaroor", "ji"], "confidence_boost_keywords": ["yes please", "sure thing", "haan ji"] }, "negation": { "keywords": ["no", "nope", "nahi", "mat", "mtlb", "nahe", "nai"], "confidence_boost_keywords": ["no thanks", "no please", "nahi chahiye"] } } best_intent = "unknown" best_confidence = 0.0 for intent, config in intents.items(): confidence = 0 # Score based on regular keywords for keyword in config["keywords"]: if keyword in text_lower: confidence += 1 # Boost score for specific phrases for phrase in config["confidence_boost_keywords"]: if phrase in text_lower: confidence += 2 # Higher weight for specific phrases # Calculate confidence as percentage of matched keywords if confidence > 0: # Normalize based on the length of the input text confidence_ratio = min(confidence / len(text_lower.split()), 1.0) final_confidence = min(confidence_ratio * 2, 1.0) # Boost slightly but cap at 1.0 if final_confidence > best_confidence: best_confidence = final_confidence best_intent = intent # Set a minimum confidence threshold if best_confidence < 0.1: best_intent = "unknown" best_confidence = 0.0 return best_intent, best_confidence async def validate_voice_input(self, raw_text: str) -> bool: """ Validate if the voice input is usable """ if not raw_text or len(raw_text.strip()) == 0: return False # Check if text is just noise or common meaningless phrases invalid_phrases = [ "noise", "background", "static", "garbage", "unintelligible", "inaudible", "unclear", "", " ", "\n", "\t" ] cleaned = raw_text.strip().lower() if cleaned in invalid_phrases: return False # Check if it's mostly repeated characters (indicating poor quality) if len(set(cleaned)) < 3 and len(cleaned) > 10: return False return True # Singleton instance voice_processing_service = VoiceProcessingService() def get_voice_processing_service() -> VoiceProcessingService: """Get the singleton voice processing service instance""" return voice_processing_service