Spaces:
Running
Running
| """Input validation module for audio and script files.""" | |
| import subprocess | |
| from pathlib import Path | |
| from typing import Dict, Union | |
| from config import MIN_WORDS_PER_MINUTE, MAX_WORDS_PER_MINUTE, MISMATCH_THRESHOLD | |
def validate_inputs(audio_path: Union[str, Path], script_path: Union[str, Path]) -> Dict:
    """Validate audio and script files before processing.

    Runs pre-flight checks: both files exist and are non-empty, the script
    decodes as UTF-8 and contains text, and the speaking rate implied by the
    script word count and the audio duration is compared against the
    configured words-per-minute bounds.

    Args:
        audio_path: Path to the audio file.
        script_path: Path to the script text file. Every non-empty line is
            treated as one sentence.

    Returns:
        A dict with the keys ``audio_duration_sec``, ``sentence_count``,
        ``word_count`` and ``warnings`` (a list of human-readable strings;
        empty when nothing looks suspicious).

    Raises:
        FileNotFoundError: If either path does not exist.
        ValueError: If either file is empty, or the script is not valid
            UTF-8 or contains no text.
        RuntimeError: If the audio duration cannot be determined.
    """
    audio_path = Path(audio_path)
    script_path = Path(script_path)
    warnings = []

    # Check audio file exists and is non-empty
    if not audio_path.exists():
        raise FileNotFoundError(f"Audio file not found: {audio_path}")
    if audio_path.stat().st_size == 0:
        raise ValueError(f"Audio file is empty: {audio_path}")

    # Check script file exists and is non-empty
    if not script_path.exists():
        raise FileNotFoundError(f"Script file not found: {script_path}")
    if script_path.stat().st_size == 0:
        raise ValueError(f"Script file is empty: {script_path}")

    # Validate script encoding and content.
    # 'utf-8-sig' decodes plain UTF-8 unchanged but drops a leading byte-order
    # mark. With plain 'utf-8' the BOM survives as U+FEFF, which str.strip()
    # does not remove, so it would be counted as text (and as a word).
    try:
        with open(script_path, 'r', encoding='utf-8-sig') as f:
            script_content = f.read()
    except UnicodeDecodeError as e:
        raise ValueError(f"Script file must be UTF-8 encoded: {script_path}") from e

    if not script_content.strip():
        raise ValueError(f"Script file contains no text: {script_path}")

    # Parse script into sentences (non-empty lines). The check above
    # guarantees at least one line survives this filter.
    sentences = [line.strip() for line in script_content.splitlines() if line.strip()]

    # Count whitespace-separated tokens; this treats Arabic words, Latin
    # words and numbers alike. str.split() never yields empty tokens.
    word_count = sum(len(sentence.split()) for sentence in sentences)

    # Get audio duration using ffprobe
    try:
        audio_duration_sec = _get_audio_duration(audio_path)
    except Exception as e:
        raise RuntimeError(f"Failed to analyze audio duration: {e}") from e

    # Validate the duration/word-count ratio. A warning is only issued when
    # the rate is outside [MIN, MAX] by more than MISMATCH_THRESHOLD relative
    # to the violated bound (the threshold is scaled by 100 to compare it
    # with a percentage).
    threshold_pct = MISMATCH_THRESHOLD * 100
    if audio_duration_sec > 0:
        words_per_minute = (word_count / audio_duration_sec) * 60
        if words_per_minute < MIN_WORDS_PER_MINUTE:
            pct_diff = ((MIN_WORDS_PER_MINUTE - words_per_minute) / MIN_WORDS_PER_MINUTE) * 100
            if pct_diff > threshold_pct:
                warnings.append(
                    f"Script may be too short for audio duration. "
                    f"Expected ≥{MIN_WORDS_PER_MINUTE} words/min, got {words_per_minute:.1f} "
                    f"({pct_diff:.1f}% below minimum)"
                )
        elif words_per_minute > MAX_WORDS_PER_MINUTE:
            pct_diff = ((words_per_minute - MAX_WORDS_PER_MINUTE) / MAX_WORDS_PER_MINUTE) * 100
            if pct_diff > threshold_pct:
                warnings.append(
                    f"Script may be too long for audio duration. "
                    f"Expected ≤{MAX_WORDS_PER_MINUTE} words/min, got {words_per_minute:.1f} "
                    f"({pct_diff:.1f}% above maximum)"
                )
    else:
        # Zero, negative or NaN duration: the rate cannot be computed, so say
        # so instead of silently skipping the sanity check.
        warnings.append(
            f"Audio duration is not positive ({audio_duration_sec} sec); "
            f"skipped the words-per-minute check"
        )

    return {
        "audio_duration_sec": audio_duration_sec,
        "sentence_count": len(sentences),
        "word_count": word_count,
        "warnings": warnings
    }
def _get_audio_duration(audio_path: Path) -> float:
    """Get audio duration in seconds using ffprobe.

    Args:
        audio_path: Path to the audio file to probe.

    Returns:
        The container-level duration printed by ffprobe, parsed as a float.

    Raises:
        RuntimeError: If ffprobe is not installed, exits with a non-zero
            status, or prints something that cannot be parsed as a number.
    """
    # "-v error" instead of "-v quiet": quiet suppresses every diagnostic, so
    # a failing ffprobe would leave stderr empty and the error below would
    # carry no reason at all.
    cmd = [
        "ffprobe", "-v", "error", "-show_entries",
        "format=duration", "-of", "csv=p=0", str(audio_path)
    ]
    try:
        # errors="replace": stderr may now echo the file name; never let an
        # undecodable byte in that diagnostic mask the real failure.
        result = subprocess.run(
            cmd, capture_output=True, text=True, errors="replace", check=True
        )
        duration_str = result.stdout.strip()
        if not duration_str:
            # Routed through the ValueError handler below on purpose so both
            # "empty" and "unparseable" share one error prefix.
            raise ValueError("ffprobe returned empty duration")
        return float(duration_str)
    except subprocess.CalledProcessError as e:
        detail = (e.stderr or "").strip() or f"exit code {e.returncode}"
        raise RuntimeError(f"ffprobe failed: {detail}") from e
    except ValueError as e:
        raise RuntimeError(f"Failed to parse audio duration: {e}") from e
    except FileNotFoundError as e:
        # Raised by subprocess when the ffprobe executable itself is missing.
        raise RuntimeError(
            "ffprobe not found. Please install ffmpeg: "
            "brew install ffmpeg (macOS) or visit https://ffmpeg.org/"
        ) from e