| """ |
| Quality Control Module for Audio Intelligence System |
| |
| This module implements quality checks and model selection strategies |
| to ensure the system only demonstrates its best capabilities. |
| """ |
|
|
import logging
import re
from typing import Any, Dict, List, Optional, Tuple
|
|
| logger = logging.getLogger(__name__) |
|
|
class QualityController:
    """
    Controls quality of transcription and translation to avoid
    misleading results in demonstrations.

    Provides script-based validation of language detection, heuristic
    scoring of transcription quality, per-language translation strategy
    selection, and demo-oriented filtering of segment lists.
    """

    def __init__(self):
        # Languages this system is known to handle well. `opus_mt` flags
        # availability of an OPUS-MT model; `quality` is our own rating.
        self.reliable_languages = {
            'hi': {'name': 'Hindi', 'opus_mt': True, 'quality': 'high'},
            'ja': {'name': 'Japanese', 'opus_mt': True, 'quality': 'high'},
            'fr': {'name': 'French', 'opus_mt': True, 'quality': 'high'},
            'en': {'name': 'English', 'opus_mt': True, 'quality': 'high'},
            'ur': {'name': 'Urdu', 'opus_mt': True, 'quality': 'medium'},
            'bn': {'name': 'Bengali', 'opus_mt': True, 'quality': 'medium'},
        }

        # Regexes that indicate degenerate transcriber output.
        # Fix: the third pattern was `^(.{1,3}\s*){20,}`, which matches
        # essentially ANY string of 20+ characters (`.{1,3}` consumes
        # arbitrary chars and `\s*` is optional) and therefore rejected
        # every normal-length transcript. It now requires whitespace-
        # separated tiny tokens, the evident intent.
        # NOTE(review): the second pattern's characters look mojibake-encoded
        # (likely Devanagari read with the wrong codec) — confirm intent;
        # kept byte-identical here.
        self.poor_quality_patterns = [
            r'^(.+?)\1{4,}',        # same chunk repeated 5+ times from the start
            r'^(ΰ€€ΰ₯\s*){10,}',
            r'^(\S{1,3}\s+){20,}',  # 20+ whitespace-separated 1-3 char tokens
        ]
        # Compile once: these are applied per-segment in a hot path.
        self._compiled_patterns = [re.compile(p) for p in self.poor_quality_patterns]

    def validate_language_detection(self, text: str, detected_lang: str) -> Tuple[str, float]:
        """
        Validate language detection and return corrected language with confidence.

        Uses Unicode script-block counts as a sanity check on the upstream
        detector: predominantly Devanagari text is forced to Hindi,
        predominantly Arabic-script text to Urdu, and Latin-script text is
        only trusted for languages we support.

        Args:
            text: The transcribed text to inspect.
            detected_lang: Language code reported by the upstream detector.

        Returns:
            Tuple[str, float]: (corrected_language, confidence)
        """
        clean_text = text.strip()

        # Per-script character counts (counted over ALL characters,
        # including combining marks).
        devanagari_chars = sum(1 for char in clean_text if '\u0900' <= char <= '\u097F')
        arabic_chars = sum(1 for char in clean_text if '\u0600' <= char <= '\u06FF')
        latin_chars = sum(1 for char in clean_text if char.isascii() and char.isalpha())

        total_chars = sum(1 for c in clean_text if c.isalpha())

        if total_chars == 0:
            # Nothing alphabetic to reason about: keep the detector's answer
            # but with minimal confidence.
            return detected_lang, 0.1

        # NOTE: numerators include non-alpha combining marks while the
        # denominator is alpha-only, so ratios can exceed 1.0; harmless
        # because every threshold below is < 1.
        devanagari_ratio = devanagari_chars / total_chars
        arabic_ratio = arabic_chars / total_chars
        latin_ratio = latin_chars / total_chars

        # Strongly single-script text overrides the detector outright.
        if devanagari_ratio > 0.8:
            return 'hi', 0.95
        elif arabic_ratio > 0.8:
            return 'ur', 0.9
        elif latin_ratio > 0.9:
            # Only trust the detector for Latin-script languages we support.
            if detected_lang in ['en', 'fr']:
                return detected_lang, 0.8
            return 'en', 0.7

        # Mixed-script text: lean toward the dominant script.
        if devanagari_ratio > 0.5:
            return 'hi', 0.7
        elif arabic_ratio > 0.5:
            return 'ur', 0.7

        # Common misdetections of Devanagari text as CJK/Thai/Korean.
        if detected_lang in ['zh', 'th', 'ko'] and devanagari_ratio > 0.2:
            return 'hi', 0.6

        return detected_lang, 0.5

    def assess_transcription_quality(self, text: str) -> Dict[str, Any]:
        """
        Assess the quality of transcribed text.

        Heuristics: length, word-level repetition, known degenerate
        patterns, and the proportion of alphabetic characters.

        Args:
            text: Transcribed text to score.

        Returns:
            Dict with keys 'text' (stripped input), 'quality_score'
            (0.0-1.0), 'issues' (list of issue tags), and
            'recommendation' ('accept' | 'filter' | 'reject').
        """
        clean_text = text.strip()
        words = clean_text.split()

        assessment = {
            'text': clean_text,
            'quality_score': 1.0,
            'issues': [],
            'recommendation': 'accept'
        }

        # Very short transcripts are rarely meaningful.
        if len(clean_text) < 5:
            assessment['quality_score'] *= 0.3
            assessment['issues'].append('very_short')

        if not words:
            assessment['quality_score'] = 0.0
            assessment['issues'].append('empty')
            assessment['recommendation'] = 'reject'
            return assessment

        # A low unique/total word ratio signals decoder looping.
        unique_words = set(words)
        repetition_ratio = len(unique_words) / len(words)

        if repetition_ratio < 0.3:
            assessment['quality_score'] *= 0.2
            assessment['issues'].append('highly_repetitive')
            assessment['recommendation'] = 'filter'
        elif repetition_ratio < 0.5:
            assessment['quality_score'] *= 0.6
            assessment['issues'].append('repetitive')

        # Known degenerate-output patterns (compiled once in __init__).
        for pattern in self._compiled_patterns:
            if pattern.match(clean_text):
                assessment['quality_score'] *= 0.1
                assessment['issues'].append('pattern_match')
                assessment['recommendation'] = 'reject'
                break

        # Mostly non-alphabetic text is likely garbled.
        alpha_ratio = sum(1 for c in clean_text if c.isalpha()) / max(1, len(clean_text))
        if alpha_ratio < 0.5:
            assessment['quality_score'] *= 0.4
            assessment['issues'].append('garbled')

        # Final recommendation from the accumulated score.
        if assessment['quality_score'] < 0.2:
            assessment['recommendation'] = 'reject'
        elif assessment['quality_score'] < 0.5:
            assessment['recommendation'] = 'filter'

        return assessment

    def should_process_language(self, language: str) -> bool:
        """
        Determine if we should process this language based on our capabilities.
        """
        return language in self.reliable_languages

    def get_best_translation_strategy(self, source_lang: str, target_lang: str) -> Dict[str, Any]:
        """
        Get the best translation strategy for the language pair.

        Args:
            source_lang: Source language code.
            target_lang: Target language code (currently unused; kept for
                interface stability).

        Returns:
            Dict with 'method', 'confidence', and 'explanation'.
        """
        strategy = {
            'method': 'hybrid',
            'confidence': 0.5,
            'explanation': 'Standard hybrid approach'
        }

        if source_lang not in self.reliable_languages:
            # Unknown languages fall back to the external API entirely.
            strategy['method'] = 'google_only'
            strategy['confidence'] = 0.6
            strategy['explanation'] = f'Language {source_lang} not in reliable set, using Google API'
        elif self.reliable_languages[source_lang]['quality'] == 'high':
            strategy['confidence'] = 0.9
            strategy['explanation'] = f'High quality support for {source_lang}'

        return strategy

    def filter_results_for_demo(self, segments: List) -> List:
        """
        Filter results to show only high-quality segments for demo purposes.

        'accept' segments pass through unchanged; 'filter' segments are
        kept but marked and their transcription confidence is halved;
        'reject' segments are dropped.

        Args:
            segments: Segment objects exposing `original_text` and
                `confidence_transcription` attributes (mutated in place).

        Returns:
            List: Surviving segments, in original order.
        """
        filtered_segments = []

        for segment in segments:
            quality = self.assess_transcription_quality(segment.original_text)

            if quality['recommendation'] == 'accept':
                filtered_segments.append(segment)
            elif quality['recommendation'] == 'filter':
                # Keep borderline segments, but flag them visibly and
                # reduce the reported confidence.
                segment.original_text = f"[Filtered] {segment.original_text}"
                segment.confidence_transcription *= 0.5
                filtered_segments.append(segment)
            # 'reject' segments are silently dropped.

        # Lazy %-args avoid building the message when INFO is disabled.
        # NOTE(review): "β" looks like a mojibake arrow ("→"); kept as-is
        # to preserve the exact log text.
        logger.info("Quality filter: %d β %d segments", len(segments), len(filtered_segments))
        return filtered_segments
|
|
| |
# Module-level singleton shared by importers of this module.
quality_controller = QualityController()
|
|
|
|
|
|