"""
Voice Processing Service for the AI Chatbot with Reusable Intelligence
Handles the cleaning and intent extraction from raw voice-to-text strings
"""
import asyncio
import json
from typing import Dict, Optional, Tuple
from dataclasses import dataclass
import uuid
from datetime import datetime
@dataclass
class VoiceProcessingResult:
    """Data class for voice processing results"""
    # Normalized transcript: filler words removed, whitespace collapsed,
    # first letter capitalized.
    cleaned_text: str
    # Best-matching intent label (e.g. "task_add", "greeting");
    # "unknown" when no intent clears the confidence threshold.
    extracted_intent: str
    # Heuristic confidence for the extracted intent, in [0.0, 1.0].
    confidence_score: float
    # Wall-clock seconds spent in process_voice_input.
    processing_time: float
    # Path to the source audio file, when the caller supplied one.
    original_audio_path: Optional[str] = None
class VoiceProcessingService:
"""Service class for processing voice input and extracting intent"""
def __init__(self):
# In a real implementation, this would initialize speech recognition models
# For now, we'll simulate processing
pass
async def process_voice_input(self, raw_text: str, audio_path: Optional[str] = None) -> VoiceProcessingResult:
"""
Process raw voice-to-text input to clean text and extract intent
"""
start_time = datetime.now()
# Clean the raw text
cleaned_text = await self._clean_text(raw_text)
# Extract intent from the cleaned text
extracted_intent, confidence_score = await self._extract_intent(cleaned_text)
end_time = datetime.now()
processing_time = (end_time - start_time).total_seconds()
return VoiceProcessingResult(
cleaned_text=cleaned_text,
extracted_intent=extracted_intent,
confidence_score=confidence_score,
processing_time=processing_time,
original_audio_path=audio_path
)
async def _clean_text(self, raw_text: str) -> str:
"""
Clean raw voice-to-text output
Removes filler words, corrects common speech-to-text errors
"""
# Remove common filler words and normalize
cleaned = raw_text.lower().strip()
# Common speech-to-text corrections
corrections = {
"umm": "",
"uh": "",
"uhh": "",
"ah": "",
"like": "",
"you know": "",
"right": "",
"okay": "",
"so": "",
}
for word, replacement in corrections.items():
cleaned = cleaned.replace(word, replacement)
# Remove extra whitespace
cleaned = ' '.join(cleaned.split())
# Capitalize first letter
if cleaned:
cleaned = cleaned[0].upper() + cleaned[1:] if len(cleaned) > 1 else cleaned.upper()
return cleaned
async def _extract_intent(self, text: str) -> Tuple[str, float]:
"""
Extract intent from cleaned text with confidence score
"""
text_lower = text.lower()
# Define common intents and their keywords
intents = {
"task_add": {
"keywords": ["add", "create", "make", "new", "task", "kam", "bnao", "shamil"],
"confidence_boost_keywords": ["add task", "create task", "kam shamil"]
},
"task_list": {
"keywords": ["list", "show", "display", "dikhao", "list karo", "kya hai"],
"confidence_boost_keywords": ["show tasks", "list tasks", "kam dikhao"]
},
"task_complete": {
"keywords": ["complete", "done", "finish", "hogaya", "ho gaya", "khatam"],
"confidence_boost_keywords": ["mark done", "complete task", "kam khatam"]
},
"task_delete": {
"keywords": ["delete", "remove", "delete", "hatado", "nikalo", "khatam"],
"confidence_boost_keywords": ["delete task", "remove task", "kam hatao"]
},
"greeting": {
"keywords": ["hello", "hi", "hey", "helo", "kese ho", "kaia hal", "assalam"],
"confidence_boost_keywords": ["hello there", "hi there", "helo"]
},
"question": {
"keywords": ["what", "how", "why", "kya", "kese", "kyun", "kaia"],
"confidence_boost_keywords": ["what is", "how to", "kya hai", "kese"]
},
"affirmation": {
"keywords": ["yes", "yeah", "sure", "jeee", "haan", "jaroor", "ji"],
"confidence_boost_keywords": ["yes please", "sure thing", "haan ji"]
},
"negation": {
"keywords": ["no", "nope", "nahi", "mat", "mtlb", "nahe", "nai"],
"confidence_boost_keywords": ["no thanks", "no please", "nahi chahiye"]
}
}
best_intent = "unknown"
best_confidence = 0.0
for intent, config in intents.items():
confidence = 0
# Score based on regular keywords
for keyword in config["keywords"]:
if keyword in text_lower:
confidence += 1
# Boost score for specific phrases
for phrase in config["confidence_boost_keywords"]:
if phrase in text_lower:
confidence += 2 # Higher weight for specific phrases
# Calculate confidence as percentage of matched keywords
if confidence > 0:
# Normalize based on the length of the input text
confidence_ratio = min(confidence / len(text_lower.split()), 1.0)
final_confidence = min(confidence_ratio * 2, 1.0) # Boost slightly but cap at 1.0
if final_confidence > best_confidence:
best_confidence = final_confidence
best_intent = intent
# Set a minimum confidence threshold
if best_confidence < 0.1:
best_intent = "unknown"
best_confidence = 0.0
return best_intent, best_confidence
async def validate_voice_input(self, raw_text: str) -> bool:
"""
Validate if the voice input is usable
"""
if not raw_text or len(raw_text.strip()) == 0:
return False
# Check if text is just noise or common meaningless phrases
invalid_phrases = [
"noise", "background", "static", "garbage", "unintelligible",
"inaudible", "unclear", "", " ", "\n", "\t"
]
cleaned = raw_text.strip().lower()
if cleaned in invalid_phrases:
return False
# Check if it's mostly repeated characters (indicating poor quality)
if len(set(cleaned)) < 3 and len(cleaned) > 10:
return False
return True
# Module-level singleton shared by every consumer of this service.
voice_processing_service = VoiceProcessingService()


def get_voice_processing_service() -> VoiceProcessingService:
    """Return the shared module-level VoiceProcessingService instance."""
    return voice_processing_service