jcudit HF Staff commited on
Commit
3ff2f18
·
1 Parent(s): 0456b70

fix: also correct lib/ in gitignore to only exclude root-level, add src/lib package

Browse files
.gitignore CHANGED
@@ -10,8 +10,8 @@ dist/
10
  downloads/
11
  eggs/
12
  .eggs/
13
- lib/
14
- lib64/
15
  parts/
16
  sdist/
17
  var/
 
10
  downloads/
11
  eggs/
12
  .eggs/
13
+ /lib/
14
+ /lib64/
15
  parts/
16
  sdist/
17
  var/
src/lib/__init__.py ADDED
File without changes
src/lib/audio_io.py ADDED
@@ -0,0 +1,703 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Audio I/O utilities: Read, write, and validate audio files.
3
+
4
+ Handles m4a and wav formats with format validation and error handling.
5
+ """
6
+
7
+ import logging
8
+ from pathlib import Path
9
+ from typing import Optional, Tuple
10
+
11
+ import numpy as np
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class AudioIOError(Exception):
    """Raised for any audio read/write/validation failure in this module."""
20
+
21
+
22
def read_audio(file_path: str, target_sr: Optional[int] = None) -> Tuple[np.ndarray, int]:
    """
    Read an audio file and return its waveform and sample rate.

    WAV (and other soundfile-supported formats) are read directly; m4a/aac/mp4
    fall back to an FFmpeg conversion through a temporary WAV. Stereo input is
    downmixed to mono.

    Args:
        file_path: Path to audio file.
        target_sr: Target sample rate (resamples if different), None = keep original.

    Returns:
        Tuple of (audio_array, sample_rate):
        - audio_array: 1D numpy array of audio samples (float32, mono).
        - sample_rate: Sample rate in Hz.

    Raises:
        AudioIOError: If the file is missing, unreadable, or FFmpeg is
            unavailable / fails during the m4a fallback.
    """
    import subprocess
    import tempfile

    import soundfile as sf

    file_path = Path(file_path)

    if not file_path.exists():
        raise AudioIOError(f"Audio file not found: {file_path}")

    try:
        # Try reading directly with soundfile
        audio, sr = sf.read(str(file_path), dtype="float32")

    except Exception as e:
        # If the format is not an FFmpeg-convertible container, surface the
        # original read error wrapped in our module exception.
        if file_path.suffix.lower() not in [".m4a", ".aac", ".mp4"]:
            raise AudioIOError(f"Failed to read audio file {file_path}: {str(e)}")

        logger.debug(f"Converting {file_path.suffix} to WAV for reading...")

        # Create temporary WAV file (delete=False so FFmpeg can write to it)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
            tmp_wav_path = tmp_wav.name

        try:
            # Convert M4A to WAV using FFmpeg (mono, at the requested rate)
            target_rate = target_sr if target_sr else 44100
            cmd = [
                "ffmpeg",
                "-i",
                str(file_path),
                "-ar",
                str(target_rate),
                "-ac",
                "1",  # Mono
                "-y",  # Overwrite
                tmp_wav_path,
            ]

            subprocess.run(cmd, capture_output=True, text=True, check=True)

            # Read the converted WAV file
            audio, sr = sf.read(tmp_wav_path, dtype="float32")

            logger.debug(f"Converted and read {file_path.name} via FFmpeg")

        except FileNotFoundError:
            # FIX: previously a missing FFmpeg binary leaked a raw
            # FileNotFoundError; wrap it like the sibling converters do.
            raise AudioIOError(
                "FFmpeg not found. Please install FFmpeg: https://ffmpeg.org/download.html"
            )
        except subprocess.CalledProcessError as conv_err:
            # FIX: wrap conversion failures so callers only need to catch
            # AudioIOError (consistent with write_audio / convert_m4a_to_wav).
            raise AudioIOError(f"FFmpeg conversion failed for {file_path}: {conv_err.stderr}")
        finally:
            # Clean up temporary file
            if Path(tmp_wav_path).exists():
                Path(tmp_wav_path).unlink()

    # Convert stereo to mono if needed (in case FFmpeg didn't do it)
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    # Resample if target sample rate specified and not already done
    if target_sr is not None and sr != target_sr:
        audio = resample_audio(audio, sr, target_sr)
        sr = target_sr

    logger.debug(f"Read audio: {file_path.name} ({len(audio) / sr:.1f}s at {sr}Hz)")
    return audio, sr
104
+
105
+
106
def write_audio(
    file_path: str, audio: np.ndarray, sample_rate: int, format: Optional[str] = None
) -> None:
    """
    Write audio array to file.

    WAV and other soundfile-supported formats are written directly;
    m4a/aac/mp4 are staged as a temporary WAV and encoded with FFmpeg.

    Args:
        file_path: Output file path.
        audio: Audio array (1D numpy array, float32).
        sample_rate: Sample rate in Hz.
        format: Audio format ('wav', 'm4a', etc.), auto-detected from extension if None.

    Raises:
        AudioIOError: If file cannot be written.
    """
    import subprocess
    import tempfile

    import soundfile as sf

    file_path = Path(file_path)
    file_path.parent.mkdir(parents=True, exist_ok=True)

    # Collapse any extra dimensions so a (n, 1) array writes as mono.
    if audio.ndim > 1:
        audio = audio.squeeze()

    # Fall back to the file extension when no explicit format was given.
    if format is None:
        format = file_path.suffix.lstrip(".")

    try:
        if format.lower() not in ["m4a", "aac", "mp4"]:
            # soundfile handles WAV/FLAC/etc. natively.
            sf.write(str(file_path), audio, sample_rate, format=format)
            logger.debug(
                f"Wrote audio: {file_path.name} ({len(audio) / sample_rate:.1f}s at {sample_rate}Hz)"
            )
            return

        logger.debug(f"Converting to {format.upper()} via FFmpeg...")

        # Stage the samples in a temporary WAV for FFmpeg to encode.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
            tmp_wav_path = tmp_wav.name

        try:
            sf.write(tmp_wav_path, audio, sample_rate, format="wav")

            # AAC-in-MP4 caps the sample rate at 48kHz.
            output_sr = min(sample_rate, 48000)
            bitrate = "192k"  # Good quality for voice

            encode_cmd = [
                "ffmpeg",
                "-i",
                tmp_wav_path,
                "-ar",
                str(output_sr),
                "-b:a",
                bitrate,
                "-c:a",
                "aac",
                "-y",  # Overwrite
                str(file_path),
            ]
            subprocess.run(encode_cmd, capture_output=True, text=True, check=True)

            logger.debug(
                f"Wrote audio: {file_path.name} ({len(audio) / sample_rate:.1f}s at {output_sr}Hz, {bitrate})"
            )
        finally:
            # Remove the staging WAV whether or not the encode succeeded.
            if Path(tmp_wav_path).exists():
                Path(tmp_wav_path).unlink()

    except subprocess.CalledProcessError as e:
        raise AudioIOError(f"FFmpeg conversion failed for {file_path}: {e.stderr}")
    except Exception as e:
        raise AudioIOError(f"Failed to write audio file {file_path}: {str(e)}")
193
+
194
+
195
def validate_audio_file(file_path: str, min_duration: float = 0.1) -> Tuple[bool, Optional[str]]:
    """
    Validate that file is a readable audio file with comprehensive checks.

    Checks existence, non-emptiness, extension, readable metadata
    (ffprobe for m4a/aac/mp4, soundfile otherwise), and minimum duration.

    Args:
        file_path: Path to audio file.
        min_duration: Minimum duration in seconds (default: 0.1).

    Returns:
        Tuple of (is_valid, error_message):
        - is_valid: True if file is valid audio.
        - error_message: Description of validation failure, None if valid.
    """
    try:
        file_path = Path(file_path)

        # Basic filesystem checks first: they are cheap and conclusive.
        if not file_path.exists():
            return False, f"File not found: {file_path}"

        if file_path.stat().st_size == 0:
            return False, f"File is empty: {file_path}"

        # Extension whitelist.
        valid_extensions = {".m4a", ".wav", ".mp3", ".flac", ".ogg", ".aac", ".mp4"}
        if file_path.suffix.lower() not in valid_extensions:
            return (
                False,
                f"Unsupported format: {file_path.suffix}. Supported formats: {', '.join(valid_extensions)}",
            )

        import subprocess

        import soundfile as sf

        try:
            if file_path.suffix.lower() in [".m4a", ".aac", ".mp4"]:
                # Compressed containers: soundfile cannot open them, ask ffprobe.
                probe = subprocess.run(
                    [
                        "ffprobe",
                        "-v",
                        "error",
                        "-show_entries",
                        "format=duration,bit_rate:stream=codec_name,sample_rate,channels",
                        "-of",
                        "json",
                        str(file_path),
                    ],
                    capture_output=True,
                    text=True,
                    check=True,
                )

                import json

                probe_data = json.loads(probe.stdout)

                if "format" not in probe_data or "duration" not in probe_data["format"]:
                    return False, "Invalid audio file: Cannot read metadata"

                duration = float(probe_data["format"]["duration"])
                if duration < min_duration:
                    return False, f"Audio too short: {duration:.2f}s (minimum: {min_duration}s)"
            else:
                # WAV and friends: soundfile reads the header directly.
                meta = sf.info(str(file_path))

                if meta.samplerate <= 0:
                    return False, f"Invalid sample rate: {meta.samplerate}"

                if meta.frames <= 0:
                    return False, "No audio frames in file"

                duration = meta.frames / meta.samplerate
                if duration < min_duration:
                    return False, f"Audio too short: {duration:.2f}s (minimum: {min_duration}s)"

        except FileNotFoundError:
            return False, "FFmpeg/FFprobe not found. Please install FFmpeg for M4A support."
        except subprocess.CalledProcessError as e:
            return False, f"Cannot read audio metadata: {e.stderr}"
        except Exception as e:
            return False, f"Invalid audio file: {str(e)}"

        return True, None

    except Exception as e:
        return False, f"Validation error: {str(e)}"
288
+
289
+
290
def get_audio_duration(file_path: str) -> float:
    """
    Get duration of audio file in seconds.

    Uses ffprobe for m4a/aac/mp4 containers (which soundfile cannot open)
    and soundfile header metadata for everything else.

    Args:
        file_path: Path to audio file.

    Returns:
        Duration in seconds.

    Raises:
        AudioIOError: If file cannot be read.
    """
    try:
        suffix = Path(file_path).suffix.lower()

        if suffix not in [".m4a", ".aac", ".mp4"]:
            # Header-based duration for soundfile-supported formats.
            import soundfile as sf

            meta = sf.info(str(file_path))
            return meta.frames / meta.samplerate

        # Compressed container: ask ffprobe for the format-level duration.
        import subprocess

        probe = subprocess.run(
            [
                "ffprobe",
                "-v",
                "error",
                "-show_entries",
                "format=duration",
                "-of",
                "default=noprint_wrappers=1:nokey=1",
                str(file_path),
            ],
            capture_output=True,
            text=True,
            check=True,
        )
        return float(probe.stdout.strip())

    except Exception as e:
        raise AudioIOError(f"Failed to get audio duration for {file_path}: {str(e)}")
333
+
334
+
335
def get_audio_info(file_path: str) -> dict:
    """
    Get detailed information about audio file.

    NOTE(review): relies solely on soundfile, so compressed containers
    (m4a/aac) it cannot open will raise — unlike get_audio_duration,
    which has an ffprobe path. Confirm intended for wav-only inputs.

    Args:
        file_path: Path to audio file.

    Returns:
        Dictionary with keys: duration, sample_rate, channels, format,
        subtype, frames.

    Raises:
        AudioIOError: If file cannot be read.
    """
    try:
        import soundfile as sf

        meta = sf.info(str(file_path))

        return {
            "duration": meta.frames / meta.samplerate,
            "sample_rate": meta.samplerate,
            "channels": meta.channels,
            "format": meta.format,
            "subtype": meta.subtype,
            "frames": meta.frames,
        }

    except Exception as e:
        raise AudioIOError(f"Failed to get audio info for {file_path}: {str(e)}")
364
+
365
+
366
def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    """
    Resample audio to target sample rate.

    Args:
        audio: Audio array.
        orig_sr: Original sample rate.
        target_sr: Target sample rate.

    Returns:
        Resampled audio array; the input array is returned unchanged when
        the rates already match.

    Raises:
        AudioIOError: If resampling fails (e.g. librosa is unavailable).
    """
    # FIX: short-circuit BEFORE importing librosa. Previously the identity
    # case (orig_sr == target_sr) still required librosa to be installed.
    if orig_sr == target_sr:
        return audio

    try:
        import librosa

        return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)

    except Exception as e:
        raise AudioIOError(f"Failed to resample audio: {str(e)}")
389
+
390
+
391
def normalize_audio(audio: np.ndarray, target_db: float = -20.0) -> np.ndarray:
    """
    Normalize audio to a target RMS level expressed in dB.

    Args:
        audio: Audio array.
        target_db: Target level in dB (default: -20dB).

    Returns:
        Normalized audio array; silent input is returned unchanged.
    """
    current_rms = np.sqrt(np.mean(audio**2))

    # Silence cannot be scaled to a target level — return it untouched.
    if current_rms == 0:
        return audio

    # Convert the dB target to a linear RMS amplitude, then apply the gain.
    desired_rms = 10 ** (target_db / 20)
    scaled = audio * (desired_rms / current_rms)

    # If the gain pushed samples past full scale, pull back just under it.
    peak = np.abs(scaled).max()
    return scaled / peak * 0.99 if peak > 1.0 else scaled
421
+
422
+
423
def extract_segment(
    audio: np.ndarray, sample_rate: int, start_time: float, end_time: float
) -> np.ndarray:
    """
    Extract a time-bounded segment from an audio array.

    Args:
        audio: Audio array.
        sample_rate: Sample rate in Hz.
        start_time: Start time in seconds.
        end_time: End time in seconds.

    Returns:
        Audio segment array (empty if the requested range is out of bounds).
    """
    # Convert times to sample indices, clamped into [0, len(audio)].
    first = max(0, int(start_time * sample_rate))
    last = min(len(audio), int(end_time * sample_rate))
    return audio[first:last]
446
+
447
+
448
def split_audio_chunks(
    audio: np.ndarray, sample_rate: int, chunk_duration: float, overlap: float = 0.0
) -> list:
    """
    Split audio into (optionally overlapping) chunks for processing.

    Args:
        audio: Audio array.
        sample_rate: Sample rate in Hz.
        chunk_duration: Chunk duration in seconds.
        overlap: Overlap between chunks in seconds (must be < chunk_duration).

    Returns:
        List of (chunk_audio, start_time, end_time) tuples. The final chunk
        may be shorter than chunk_duration.

    Raises:
        ValueError: If overlap >= chunk_duration (the window would never
            advance).
    """
    chunk_samples = int(chunk_duration * sample_rate)
    overlap_samples = int(overlap * sample_rate)
    step_samples = chunk_samples - overlap_samples

    # FIX: the original looped forever when overlap >= chunk_duration
    # (non-positive step). Fail fast with a clear error instead.
    if step_samples <= 0:
        raise ValueError(
            f"overlap ({overlap}s) must be smaller than chunk_duration ({chunk_duration}s)"
        )

    chunks = []
    position = 0

    while position < len(audio):
        chunk_end = min(position + chunk_samples, len(audio))
        chunk = audio[position:chunk_end]

        start_time = position / sample_rate
        end_time = chunk_end / sample_rate

        chunks.append((chunk, start_time, end_time))

        position += step_samples

        # Stop once the final (possibly partial) chunk reached the end.
        if chunk_end >= len(audio):
            break

    return chunks
486
+
487
+
488
+ # ===== M4A/WAV Conversion Utilities (T007-T008) =====
489
+
490
+
491
def convert_m4a_to_wav(
    input_path: str, output_path: Optional[str] = None, sample_rate: int = 16000
) -> str:
    """
    Convert M4A/AAC audio file to WAV format using FFmpeg.

    This is required for pyannote.audio processing which expects WAV input.

    Args:
        input_path: Path to input M4A/AAC file.
        output_path: Path for output WAV file (auto-generated if None).
        sample_rate: Target sample rate in Hz (default: 16000 for pyannote).

    Returns:
        Path to converted WAV file.

    Raises:
        AudioIOError: If conversion fails or FFmpeg is not available.
    """
    import subprocess
    from pathlib import Path

    source = Path(input_path)

    if not source.exists():
        raise AudioIOError(f"Input file not found: {source}")

    # Default destination: same name with a .wav suffix.
    destination = source.with_suffix(".wav") if output_path is None else Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    try:
        # Decode to mono WAV at the requested rate.
        subprocess.run(
            [
                "ffmpeg",
                "-i",
                str(source),
                "-ar",
                str(sample_rate),  # Resample to target rate
                "-ac",
                "1",  # Convert to mono
                "-y",  # Overwrite output
                str(destination),
            ],
            capture_output=True,
            text=True,
            check=True,
        )

        logger.info(f"Converted {source.name} to WAV at {sample_rate}Hz")
        return str(destination)

    except FileNotFoundError:
        raise AudioIOError(
            "FFmpeg not found. Please install FFmpeg: https://ffmpeg.org/download.html"
        )
    except subprocess.CalledProcessError as e:
        raise AudioIOError(f"FFmpeg conversion failed: {e.stderr}")
552
+
553
+
554
def convert_wav_to_m4a(
    input_path: str, output_path: str, sample_rate: int = 44100, bitrate: str = "192k"
) -> str:
    """
    Convert WAV audio file to M4A/AAC format using FFmpeg.

    Used for exporting final processed audio in M4A format.

    Args:
        input_path: Path to input WAV file.
        output_path: Path for output M4A file.
        sample_rate: Target sample rate in Hz (default: 44100, max 48000 for M4A).
        bitrate: Target bitrate (default: "192k").

    Returns:
        Path to converted M4A file.

    Raises:
        AudioIOError: If conversion fails or FFmpeg is not available.
    """
    import subprocess
    from pathlib import Path

    source = Path(input_path)
    destination = Path(output_path)

    if not source.exists():
        raise AudioIOError(f"Input file not found: {source}")

    # M4A/AAC tops out at 48kHz; clamp and warn rather than fail.
    if sample_rate > 48000:
        logger.warning(f"Sample rate {sample_rate}Hz exceeds M4A limit, using 48000Hz")
        sample_rate = 48000

    destination.parent.mkdir(parents=True, exist_ok=True)

    try:
        subprocess.run(
            [
                "ffmpeg",
                "-i",
                str(source),
                "-ar",
                str(sample_rate),  # Resample to target rate
                "-b:a",
                bitrate,  # Set bitrate
                "-c:a",
                "aac",  # Use AAC codec
                "-y",  # Overwrite output
                str(destination),
            ],
            capture_output=True,
            text=True,
            check=True,
        )

        logger.info(f"Converted {source.name} to M4A at {sample_rate}Hz, {bitrate}")
        return str(destination)

    except FileNotFoundError:
        raise AudioIOError(
            "FFmpeg not found. Please install FFmpeg: https://ffmpeg.org/download.html"
        )
    except subprocess.CalledProcessError as e:
        raise AudioIOError(f"FFmpeg conversion failed: {e.stderr}")
618
+
619
+
620
+ # ===== Audio Quality Validation (T009) =====
621
+
622
+
623
def validate_audio_quality(
    audio: np.ndarray, sample_rate: int, file_path: Optional[str] = None
) -> dict:
    """
    Validate audio quality and return metrics.

    Heuristics checked:
    - SNR estimate: 10th vs 90th percentile of |amplitude|
    - Clipping: fraction of samples above 0.99 full scale
    - RMS energy: too-quiet detection
    - Duration minimums

    Args:
        audio: Audio array.
        sample_rate: Sample rate in Hz.
        file_path: Optional file path for logging.

    Returns:
        Dictionary with quality metrics and validation results:
        snr_db, is_clipped, clipping_ratio, rms_energy, is_too_quiet,
        duration, is_valid, warnings.
    """
    warnings = []
    duration = len(audio) / sample_rate

    # SNR estimate: robust "noise floor" (10th percentile of magnitude)
    # against a robust "signal peak" (90th percentile); epsilon avoids /0.
    magnitudes = np.abs(audio)
    noise_floor = np.percentile(magnitudes, 10)
    signal_peak = np.percentile(magnitudes, 90)
    snr_db = 20 * np.log10(signal_peak / (noise_floor + 1e-10))

    if snr_db < 15:
        warnings.append(f"Low SNR ({snr_db:.1f} dB < 15 dB)")

    # Clipping: more than 1% of samples near full scale counts as clipped.
    clipping_threshold = 0.99
    clipping_ratio = np.sum(magnitudes > clipping_threshold) / len(audio)
    is_clipped = clipping_ratio > 0.01
    if is_clipped:
        warnings.append(f"Audio has clipping ({clipping_ratio * 100:.1f}% of samples)")

    # Energy: very low RMS usually means a near-silent recording.
    rms_energy = np.sqrt(np.mean(audio**2))
    is_too_quiet = rms_energy < 0.01
    if is_too_quiet:
        warnings.append(f"Audio is too quiet (RMS: {rms_energy:.4f})")

    if duration < 1.0:
        warnings.append(f"Audio is very short ({duration:.1f}s)")

    # Overall verdict combines all hard thresholds.
    is_valid = (
        snr_db >= 10  # Minimum acceptable SNR
        and not is_clipped
        and not is_too_quiet
        and duration > 0.5
    )

    metrics = {
        "duration": duration,
        "warnings": warnings,
        "snr_db": float(snr_db),
        "is_clipped": is_clipped,
        "clipping_ratio": float(clipping_ratio),
        "rms_energy": float(rms_energy),
        "is_too_quiet": is_too_quiet,
        "is_valid": is_valid,
    }

    file_desc = f" for {file_path}" if file_path else ""
    if is_valid:
        logger.debug(f"Audio quality validation passed{file_desc}")
    else:
        logger.warning(
            f"Audio quality validation failed{file_desc}: " + ", ".join(warnings)
        )

    return metrics
src/lib/format_converter.py ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Audio format converter: m4a ↔ wav conversion, sample rate normalization.
3
+
4
+ Converts between m4a (compressed) and wav (lossless) formats.
5
+ Normalizes to 48kHz/24-bit for processing, outputs as m4a/192kbps for final.
6
+ """
7
+
8
+ import logging
9
+ import tempfile
10
+ from pathlib import Path
11
+ from typing import Optional
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class FormatConversionError(Exception):
    """Raised when an audio format conversion or normalization fails."""
20
+
21
+
22
def m4a_to_wav(
    input_path: str,
    output_path: Optional[str] = None,
    target_sr: int = 48000,
    target_bit_depth: int = 24,
) -> str:
    """
    Convert m4a to wav format with normalization.

    Args:
        input_path: Path to input m4a file.
        output_path: Path to output wav file (temp file if None).
        target_sr: Target sample rate in Hz (default: 48000).
        target_bit_depth: Target bit depth (default: 24).

    Returns:
        Path to output wav file.

    Raises:
        FormatConversionError: If conversion fails.
    """
    try:
        from pydub import AudioSegment

        src = Path(input_path)

        if not src.exists():
            raise FormatConversionError(f"Input file not found: {src}")

        # Default to a temp-dir destination when none was given.
        if output_path is None:
            dst = Path(tempfile.gettempdir()) / f"{src.stem}_temp.wav"
        else:
            dst = Path(output_path)
            dst.parent.mkdir(parents=True, exist_ok=True)

        # Decode, then normalize rate / channels / sample width.
        segment = AudioSegment.from_file(str(src), format="m4a")
        segment = (
            segment.set_frame_rate(target_sr)
            .set_channels(1)  # Mono
            .set_sample_width(target_bit_depth // 8)  # Bytes (24-bit = 3 bytes)
        )
        segment.export(str(dst), format="wav")

        logger.debug(f"Converted m4a to wav: {src.name} -> {dst.name}")
        return str(dst)

    except FormatConversionError:
        raise
    except Exception as e:
        raise FormatConversionError(f"Failed to convert m4a to wav: {str(e)}")
77
+
78
+
79
def wav_to_m4a(
    input_path: str, output_path: str, bitrate: str = "192k", sample_rate: int = 48000
) -> str:
    """
    Convert wav to m4a format.

    Args:
        input_path: Path to input wav file.
        output_path: Path to output m4a file.
        bitrate: AAC bitrate (default: "192k").
        sample_rate: Sample rate in Hz (default: 48000).

    Returns:
        Path to output m4a file.

    Raises:
        FormatConversionError: If conversion fails.
    """
    try:
        from pydub import AudioSegment

        src = Path(input_path)
        dst = Path(output_path)

        if not src.exists():
            raise FormatConversionError(f"Input file not found: {src}")

        dst.parent.mkdir(parents=True, exist_ok=True)

        # Load and resample to the requested rate.
        segment = AudioSegment.from_file(str(src), format="wav").set_frame_rate(sample_rate)

        # m4a is AAC audio inside an mp4 container.
        segment.export(
            str(dst),
            format="mp4",
            codec="aac",
            bitrate=bitrate,
            parameters=["-profile:a", "aac_low"],
        )

        logger.debug(f"Converted wav to m4a: {src.name} -> {dst.name}")
        return str(dst)

    except FormatConversionError:
        raise
    except Exception as e:
        raise FormatConversionError(f"Failed to convert wav to m4a: {str(e)}")
131
+
132
+
133
def normalize_to_intermediate(input_path: str, output_path: Optional[str] = None) -> str:
    """
    Normalize any audio format to intermediate wav format (48kHz/24-bit/mono).

    This is the standard intermediate format for all processing.

    Args:
        input_path: Path to input audio file (m4a, wav, mp3, etc.).
        output_path: Path to output wav file (temp file if None).

    Returns:
        Path to normalized wav file.

    Raises:
        FormatConversionError: If normalization fails.
    """
    try:
        from pydub import AudioSegment

        src = Path(input_path)

        if not src.exists():
            raise FormatConversionError(f"Input file not found: {src}")

        # Default to a temp-dir destination when none was given.
        if output_path is None:
            dst = Path(tempfile.gettempdir()) / f"{src.stem}_normalized.wav"
        else:
            dst = Path(output_path)
            dst.parent.mkdir(parents=True, exist_ok=True)

        # pydub needs the container format; infer it from the extension.
        segment = AudioSegment.from_file(str(src), format=src.suffix.lstrip("."))

        # Intermediate processing spec: 48kHz, mono, 24-bit (3-byte samples).
        segment = segment.set_frame_rate(48000).set_channels(1).set_sample_width(3)
        segment.export(str(dst), format="wav")

        logger.debug(f"Normalized to intermediate: {src.name} -> {dst.name}")
        return str(dst)

    except FormatConversionError:
        raise
    except Exception as e:
        raise FormatConversionError(f"Failed to normalize audio: {str(e)}")
186
+
187
+
188
def convert_to_final_output(input_path: str, output_path: str, format: str = "m4a") -> str:
    """
    Convert intermediate wav to final output format.

    Final output is m4a with AAC 192kbps, 48kHz, mono; "wav" is a plain copy.

    Args:
        input_path: Path to input wav file.
        output_path: Path to output file.
        format: Output format (default: "m4a").

    Returns:
        Path to output file.

    Raises:
        FormatConversionError: If conversion fails or the format is unsupported.
    """
    if format == "m4a":
        # Delegate AAC encoding to the wav->m4a converter.
        return wav_to_m4a(input_path, output_path, bitrate="192k", sample_rate=48000)

    if format == "wav":
        # Already in the requested format: copy bytes as-is.
        import shutil

        destination = Path(output_path)
        destination.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(input_path, destination)
        return str(destination)

    raise FormatConversionError(f"Unsupported output format: {format}")
217
+
218
+
219
def batch_convert(
    input_files: list, output_dir: str, output_format: str = "m4a", progress_callback=None
) -> list:
    """
    Convert multiple files to output format.

    Args:
        input_files: List of input file paths.
        output_dir: Output directory.
        output_format: Output format (default: "m4a").
        progress_callback: Optional callback(index, total, filename).

    Returns:
        List of output file paths.

    Raises:
        FormatConversionError: If any conversion fails.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    output_files = []
    total = len(input_files)

    for i, input_file in enumerate(input_files):
        input_path = Path(input_file)

        # Generate output filename
        output_name = f"{input_path.stem}.{output_format}"
        output_path = output_dir / output_name

        if progress_callback:
            progress_callback(i + 1, total, input_path.name)

        # Convert to intermediate wav, then to the final format.
        intermediate = normalize_to_intermediate(str(input_path))
        try:
            final = convert_to_final_output(intermediate, str(output_path), output_format)
        finally:
            # FIX: clean up the intermediate WAV even when the final
            # conversion raises, so failed batches don't leak temp files.
            Path(intermediate).unlink(missing_ok=True)

        output_files.append(final)

    return output_files
263
+
264
+
265
def get_conversion_info(input_path: str) -> dict:
    """
    Get information about required conversion.

    Args:
        input_path: Path to input file.

    Returns:
        Dictionary with conversion details, or {"error": ...} on failure.
    """
    try:
        from pydub import AudioSegment

        source = Path(input_path)

        if not source.exists():
            return {"error": "File not found"}

        # Load the audio just to inspect its properties.
        source_format = source.suffix.lstrip(".")
        segment = AudioSegment.from_file(str(source), format=source_format)

        # Target intermediate spec: 48kHz / mono / 24-bit (3-byte samples).
        matches_target = (
            segment.frame_rate == 48000 and segment.channels == 1 and segment.sample_width == 3
        )

        return {
            "current_format": source_format,
            "current_sample_rate": segment.frame_rate,
            "current_channels": segment.channels,
            "current_sample_width": segment.sample_width,
            "duration_seconds": len(segment) / 1000.0,  # pydub lengths are in ms
            "needs_conversion": not matches_target,
            "target_format": "wav (intermediate) -> m4a (final)",
            "target_sample_rate": 48000,
            "target_channels": 1,
            "target_bit_depth": 24,
        }

    except Exception as e:
        return {"error": str(e)}
304
+
305
+
306
def estimate_output_size(input_path: str, output_format: str = "m4a") -> int:
    """
    Estimate output file size in bytes.

    Args:
        input_path: Path to input file.
        output_format: Output format.

    Returns:
        Estimated file size in bytes (0 when unknown or unsupported).
    """
    try:
        info = get_conversion_info(input_path)
        if "error" in info:
            return 0

        seconds = info["duration_seconds"]

        if output_format == "m4a":
            # AAC 192kbps = 192 * 1000 / 8 bytes per second
            return int(seconds * (192 * 1000 / 8))
        if output_format == "wav":
            # 48kHz * 3 bytes (24-bit) * 1 channel
            return int(seconds * 48000 * 3)
        return 0

    except Exception:
        return 0
src/lib/gpu_utils.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """GPU resource management utilities for ZeroGPU compatibility.
2
+
3
+ This module provides utilities for managing GPU resources, including model device
4
+ transfers, cache management, and context managers for automatic cleanup.
5
+ """
6
+
7
+ import logging
8
+ import time
9
+ from contextlib import contextmanager
10
+ from typing import Any, Optional
11
+
12
+ import torch
13
+
14
+ from src.config.gpu_config import GPUConfig
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
def acquire_gpu(model: torch.nn.Module, device: str = "cuda") -> bool:
    """Move a model onto the requested device.

    Args:
        model: PyTorch model to move to GPU
        device: Target device (default: "cuda")

    Returns:
        bool: True if successful, False otherwise
    """
    began = time.time()
    try:
        model.to(torch.device(device))
        took = time.time() - began
        logger.debug(f"Model {model.__class__.__name__} moved to {device} in {took:.3f}s")
        return True
    except Exception as e:
        # Invalid device strings or OOM land here; caller decides what to do.
        logger.error(f"Failed to move model to {device}: {e}")
        return False
40
+
41
+
42
def release_gpu(model: torch.nn.Module, clear_cache: bool = True) -> bool:
    """Return a model to the CPU and optionally clear the CUDA cache.

    Args:
        model: PyTorch model to move to CPU
        clear_cache: Whether to clear CUDA cache after moving

    Returns:
        bool: True if successful, False otherwise
    """
    try:
        began = time.time()
        model.to(torch.device("cpu"))

        # Cache clearing is opt-in twice: the per-call flag AND global config.
        wants_clear = clear_cache and GPUConfig.ENABLE_CACHE_CLEARING
        if wants_clear and torch.cuda.is_available():
            torch.cuda.empty_cache()

        took = time.time() - began
        if took > GPUConfig.CLEANUP_TIMEOUT:
            logger.warning(
                f"GPU cleanup took {took:.3f}s, exceeding {GPUConfig.CLEANUP_TIMEOUT}s limit"
            )
        else:
            logger.debug(f"GPU released in {took:.3f}s")
        return True

    except Exception as e:
        logger.error(f"Failed to release GPU: {e}")
        return False
72
+
73
+
74
@contextmanager
def gpu_context(model: torch.nn.Module, device: str = "cuda"):
    """Context manager for automatic GPU resource management.

    Acquires GPU on entry and releases it on exit, even if an exception occurs.

    Args:
        model: PyTorch model to manage
        device: Target GPU device (default: "cuda")

    Yields:
        torch.nn.Module: The model on the GPU device (or unchanged on failure)

    Example:
        >>> with gpu_context(my_model) as model:
        ...     result = model(input_data)
    """
    acquired = False
    try:
        acquired = acquire_gpu(model, device)
        if not acquired:
            # Bug fix: nn.Module has no `.device` attribute, so the old
            # `model.device` raised AttributeError here and turned a soft
            # acquisition failure into a crash. Report the device of the
            # first parameter instead.
            try:
                current_device = next(model.parameters()).device
            except StopIteration:
                # Parameter-less module: device is not meaningful.
                current_device = "cpu"
            logger.warning(f"Failed to acquire GPU, model remains on {current_device}")
        yield model
    finally:
        # Only release what we actually acquired.
        if acquired:
            release_gpu(model, clear_cache=True)
100
+
101
+
102
def move_to_device(data: Any, device: torch.device) -> Any:
    """Recursively move tensors to the specified device.

    Handles nested structures like lists, tuples, and dicts; anything that
    is not a tensor or container is returned unchanged.

    Args:
        data: Data to move (tensor, list, tuple, dict, or other)
        device: Target device

    Returns:
        Data with all tensors moved to the device
    """
    if isinstance(data, torch.Tensor):
        return data.to(device)
    if isinstance(data, dict):
        return {key: move_to_device(value, device) for key, value in data.items()}
    if isinstance(data, (list, tuple)):
        moved = [move_to_device(element, device) for element in data]
        # Preserve the original container type.
        return moved if isinstance(data, list) else tuple(moved)
    return data
124
+
125
+
126
def get_gpu_memory_info() -> Optional[dict]:
    """Get current GPU memory usage information.

    Returns:
        dict: Memory information with 'allocated_gb' and 'reserved_gb'
        (rounded to 2 decimals), or None if CUDA is unavailable or the
        query fails.
    """
    if not torch.cuda.is_available():
        return None

    try:
        bytes_per_gb = 1024**3
        return {
            "allocated_gb": round(torch.cuda.memory_allocated() / bytes_per_gb, 2),
            "reserved_gb": round(torch.cuda.memory_reserved() / bytes_per_gb, 2),
        }
    except Exception as e:
        logger.error(f"Failed to get GPU memory info: {e}")
        return None
145
+
146
+
147
def log_gpu_usage(operation: str):
    """Log current GPU memory usage for a specific operation.

    No-op when GPU memory information is unavailable.

    Args:
        operation: Description of the operation being performed
    """
    stats = get_gpu_memory_info()
    if not stats:
        return
    logger.info(
        f"[{operation}] GPU Memory - Allocated: {stats['allocated_gb']}GB, "
        f"Reserved: {stats['reserved_gb']}GB"
    )
src/lib/memory_optimizer.py ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Memory optimization utilities.
3
+
4
+ Provides utilities for processing large audio files (>1 hour) efficiently
5
+ without running out of memory.
6
+ """
7
+
8
+ import gc
9
+ import logging
10
+ from pathlib import Path
11
+ from typing import Iterator, List, Optional, Tuple
12
+
13
+ import numpy as np
14
+
15
+ from src.lib.audio_io import AudioIOError, read_audio
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
class AudioChunker:
    """
    Utility for processing large audio files in chunks.

    Allows processing audio files that are too large to fit in memory
    by streaming them in manageable, optionally overlapping chunks.
    """

    def __init__(self, chunk_duration: float = 60.0, overlap: float = 5.0):
        """
        Initialize audio chunker.

        Args:
            chunk_duration: Duration of each chunk in seconds (default: 60s)
            overlap: Overlap between chunks in seconds (default: 5s)

        Raises:
            ValueError: If overlap >= chunk_duration. Bug fix: previously
                such a configuration made the iteration step zero or
                negative, causing an infinite loop in iter_chunks().
        """
        if overlap >= chunk_duration:
            raise ValueError(
                f"overlap ({overlap}s) must be smaller than "
                f"chunk_duration ({chunk_duration}s)"
            )
        self.chunk_duration = chunk_duration
        self.overlap = overlap

        logger.debug(f"AudioChunker initialized (chunk: {chunk_duration}s, overlap: {overlap}s)")

    def iter_chunks(
        self, file_path: str, target_sr: int = 16000
    ) -> Iterator[Tuple[np.ndarray, int, float, float]]:
        """
        Iterate over audio file in chunks.

        Args:
            file_path: Path to audio file
            target_sr: Target sample rate

        Yields:
            Tuples of (audio_chunk, sample_rate, start_time, end_time)

        Raises:
            AudioIOError: If file cannot be read or chunking fails
        """
        try:
            # Read full audio (we'll optimize this for truly large files later)
            audio, sr = read_audio(file_path, target_sr=target_sr)
            total_duration = len(audio) / sr

            logger.info(
                f"Processing {Path(file_path).name} in chunks "
                f"(duration: {total_duration:.1f}s, chunk size: {self.chunk_duration}s)"
            )

            # Calculate chunk parameters. __init__ guarantees
            # overlap < chunk_duration, so step_samples is positive and the
            # loop below always terminates.
            chunk_samples = int(self.chunk_duration * sr)
            overlap_samples = int(self.overlap * sr)
            step_samples = chunk_samples - overlap_samples

            position = 0
            chunk_idx = 0

            while position < len(audio):
                # Extract chunk (last chunk may be shorter than chunk_samples)
                chunk_start = position
                chunk_end = min(position + chunk_samples, len(audio))
                chunk = audio[chunk_start:chunk_end]

                # Calculate time boundaries
                start_time = chunk_start / sr
                end_time = chunk_end / sr

                logger.debug(
                    f"Chunk {chunk_idx}: {start_time:.1f}s - {end_time:.1f}s "
                    f"({len(chunk) / sr:.1f}s)"
                )

                yield chunk, sr, start_time, end_time

                # Move to next chunk
                position += step_samples
                chunk_idx += 1

                # Force garbage collection between chunks to bound peak memory
                gc.collect()

            logger.info(f"Processed {chunk_idx} chunks")

        except Exception as e:
            logger.error(f"Failed to process chunks: {e}")
            # Chain the original exception for easier debugging.
            raise AudioIOError(f"Chunking failed: {e}") from e

    def process_file_in_chunks(
        self, file_path: str, processor_func, target_sr: int = 16000, **processor_kwargs
    ) -> List:
        """
        Process audio file in chunks with custom processor function.

        Failed chunks are logged and skipped; processing continues with the
        next chunk.

        Args:
            file_path: Path to audio file
            processor_func: Function to process each chunk
                Should accept (audio, sr, start_time, end_time, **kwargs)
            target_sr: Target sample rate
            **processor_kwargs: Additional arguments for processor function

        Returns:
            List of processing results from each successfully processed chunk

        Example:
            >>> def detect_segments(audio, sr, start_time, end_time):
            ...     # Process audio chunk
            ...     return segments
            >>>
            >>> chunker = AudioChunker(chunk_duration=60.0)
            >>> results = chunker.process_file_in_chunks(
            ...     "long_file.m4a",
            ...     detect_segments
            ... )
        """
        results = []

        for chunk, sr, start_time, end_time in self.iter_chunks(file_path, target_sr):
            try:
                result = processor_func(chunk, sr, start_time, end_time, **processor_kwargs)
                results.append(result)

            except Exception as e:
                logger.error(f"Chunk processing failed at {start_time:.1f}s: {e}")
                # Continue with next chunk
                continue

        return results
145
+
146
+
147
class MemoryMonitor:
    """
    Monitor and manage memory usage during processing.

    Uses psutil when available; degrades to a no-op monitor otherwise.
    """

    def __init__(self, max_memory_mb: Optional[float] = None):
        """
        Initialize memory monitor.

        Args:
            max_memory_mb: Maximum memory usage in MB (None = no limit)
        """
        self.max_memory_mb = max_memory_mb

        try:
            import os

            import psutil

            self.process = psutil.Process(os.getpid())
            self.psutil_available = True
        except ImportError:
            # Monitoring becomes a no-op; limits always pass.
            logger.warning("psutil not available, memory monitoring disabled")
            self.psutil_available = False

    def get_current_memory_mb(self) -> float:
        """
        Get current memory usage (RSS) in MB.

        Returns:
            Memory usage in MB, or 0 if unavailable
        """
        if not self.psutil_available:
            return 0.0

        try:
            return self.process.memory_info().rss / 1024 / 1024
        except Exception:
            return 0.0

    def check_memory_limit(self) -> bool:
        """
        Check if memory usage is below limit.

        Returns:
            True if within limit (or no limit set), False if exceeded
        """
        if self.max_memory_mb is None:
            return True

        current_mb = self.get_current_memory_mb()

        if current_mb > self.max_memory_mb:
            logger.warning(
                f"Memory limit exceeded: {current_mb:.1f}MB > {self.max_memory_mb:.1f}MB"
            )
            return False

        return True

    def force_cleanup(self):
        """Force garbage collection and CUDA cache cleanup."""
        gc.collect()

        # Bug fix: CUDA cache clearing was previously gated on
        # psutil_available, which is unrelated to torch. Attempt it whenever
        # torch is importable.
        try:
            import torch

            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                logger.debug("Cleared CUDA cache")
        except ImportError:
            pass

        logger.debug("Forced garbage collection")
222
+
223
+
224
def optimize_for_large_files(audio_duration: float) -> dict:
    """
    Get optimization recommendations for large files.

    Args:
        audio_duration: Duration of audio file in seconds

    Returns:
        Dictionary with optimization parameters (chunking, GC frequency,
        batch size), tiered by file duration.
    """
    # Thresholds
    LARGE_FILE_THRESHOLD = 3600  # 1 hour
    VERY_LARGE_FILE_THRESHOLD = 7200  # 2 hours

    # Defaults for files under one hour: no chunking needed.
    config = {
        "use_chunking": False,
        "chunk_duration": 60.0,
        "chunk_overlap": 5.0,
        "force_gc_frequency": 10,  # Force GC every N chunks
        "recommended_batch_size": 32,
    }

    if audio_duration > VERY_LARGE_FILE_THRESHOLD:
        # Very large file (>2 hours): smaller chunks, smaller batches, more GC.
        config["use_chunking"] = True
        config["chunk_duration"] = 30.0
        config["chunk_overlap"] = 3.0
        config["force_gc_frequency"] = 5
        config["recommended_batch_size"] = 16
        logger.info(
            f"Large file detected ({audio_duration / 3600:.1f}h), "
            "using aggressive memory optimization"
        )
    elif audio_duration > LARGE_FILE_THRESHOLD:
        # Large file (>1 hour): enable chunking with moderate settings.
        config["use_chunking"] = True
        config["chunk_duration"] = 60.0
        config["chunk_overlap"] = 5.0
        config["force_gc_frequency"] = 10
        config["recommended_batch_size"] = 24
        logger.info(
            f"Large file detected ({audio_duration / 3600:.1f}h), using memory optimization"
        )

    return config
278
+
279
+
280
def estimate_memory_requirements(
    audio_duration: float, sample_rate: int = 16000, num_models: int = 3, safety_factor: float = 2.0
) -> float:
    """
    Estimate memory requirements for processing.

    Args:
        audio_duration: Duration in seconds
        sample_rate: Sample rate in Hz
        num_models: Number of ML models to load
        safety_factor: Safety multiplier (default: 2.0)

    Returns:
        Estimated memory requirement in MB
    """
    bytes_per_sample = 4  # float32 samples
    audio_mb = (audio_duration * sample_rate * bytes_per_sample) / 1024 / 1024

    # Model overhead (rough estimate, ~500MB per model)
    model_mb = num_models * 500

    # Processing overhead: intermediate buffers, embeddings, etc.
    processing_mb = audio_mb * 2

    total_mb = (audio_mb + model_mb + processing_mb) * safety_factor

    logger.debug(
        f"Estimated memory: audio={audio_mb:.1f}MB, "
        f"models={model_mb:.1f}MB, processing={processing_mb:.1f}MB, "
        f"total={total_mb:.1f}MB (with {safety_factor}x safety factor)"
    )

    return total_mb
src/lib/metadata_logger.py ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Processing metadata logging utility.
3
+
4
+ Tracks and logs processing metadata for all workflows including timing,
5
+ resource usage, and processing statistics.
6
+ """
7
+
8
+ import json
9
+ import logging
10
+ import os
11
+ import time
12
+ from dataclasses import asdict, dataclass, field
13
+ from datetime import datetime
14
+ from pathlib import Path
15
+ from typing import Any, Dict, Optional
16
+
17
+ import psutil
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
@dataclass
class ProcessingMetadata:
    """
    Metadata record for a single processing job.

    Captures identification, input/output files, timing, resource usage,
    workflow-specific statistics, configuration, and final status.
    """

    # Job identification
    job_id: str
    workflow: str  # 'separation', 'extraction', 'denoising'
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())

    # Input/Output
    input_files: list = field(default_factory=list)
    output_files: list = field(default_factory=list)

    # Timing (seconds)
    start_time: Optional[float] = None
    end_time: Optional[float] = None
    processing_time: Optional[float] = None

    # Resource usage
    peak_memory_mb: float = 0.0
    avg_cpu_percent: float = 0.0

    # Processing statistics (workflow-specific)
    statistics: Dict[str, Any] = field(default_factory=dict)

    # Configuration
    configuration: Dict[str, Any] = field(default_factory=dict)

    # Status
    status: str = "pending"  # pending, running, completed, failed
    error_message: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return this record as a plain dictionary."""
        return asdict(self)

    def to_json(self) -> str:
        """Return this record as a pretty-printed JSON string."""
        return json.dumps(asdict(self), indent=2)
+
66
+
67
class MetadataLogger:
    """
    Logger for processing metadata.

    Tracks timing, resource usage, and statistics for processing jobs.
    """

    def __init__(self, output_dir: Optional[Path] = None):
        """
        Initialize metadata logger.

        Args:
            output_dir: Directory to save metadata logs (default: ./metadata_logs)
        """
        self.output_dir = output_dir or Path("./metadata_logs")
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Only one job is tracked at a time; start_job() replaces this.
        self.current_metadata: Optional[ProcessingMetadata] = None
        self.process = psutil.Process(os.getpid())

        # Resource tracking
        self._start_memory = 0.0  # RSS in MB at job start
        self._cpu_samples = []  # per-update_progress CPU% samples

        logger.debug(f"Metadata logger initialized (output: {self.output_dir})")

    def start_job(
        self, job_id: str, workflow: str, input_files: list, configuration: Dict[str, Any]
    ) -> ProcessingMetadata:
        """
        Start tracking a new processing job.

        Args:
            job_id: Unique job identifier
            workflow: Workflow name ('separation', 'extraction', 'denoising')
            input_files: List of input file paths
            configuration: Job configuration parameters

        Returns:
            ProcessingMetadata object for this job
        """
        # NOTE(review): any previously running job is silently replaced here
        # without being saved — confirm callers always complete/fail first.
        self.current_metadata = ProcessingMetadata(
            job_id=job_id,
            workflow=workflow,
            input_files=[str(f) for f in input_files],
            configuration=configuration,
            start_time=time.time(),
            status="running",
        )

        # Initialize resource tracking
        self._start_memory = self.process.memory_info().rss / 1024 / 1024  # MB
        self._cpu_samples = []

        logger.info(f"Started tracking job: {job_id} ({workflow})")
        return self.current_metadata

    def update_progress(self, statistics: Dict[str, Any]):
        """
        Update job statistics during processing.

        Args:
            statistics: Current processing statistics
        """
        if self.current_metadata is None:
            logger.warning("No active job to update")
            return

        self.current_metadata.statistics.update(statistics)

        # Track resources: keep the highest RSS seen so far.
        current_memory = self.process.memory_info().rss / 1024 / 1024  # MB
        self.current_metadata.peak_memory_mb = max(
            self.current_metadata.peak_memory_mb, current_memory
        )

        # Sample CPU usage (interval=0.1 blocks ~100ms per call).
        try:
            cpu_percent = self.process.cpu_percent(interval=0.1)
            self._cpu_samples.append(cpu_percent)
        except Exception:
            # CPU sampling is best-effort; never fail the job for it.
            pass

    def complete_job(
        self, output_files: list, final_statistics: Optional[Dict[str, Any]] = None
    ) -> ProcessingMetadata:
        """
        Mark job as completed and finalize metadata.

        Args:
            output_files: List of output file paths
            final_statistics: Final processing statistics

        Returns:
            Completed ProcessingMetadata object

        Raises:
            ValueError: If no job is currently being tracked
        """
        if self.current_metadata is None:
            raise ValueError("No active job to complete")

        self.current_metadata.end_time = time.time()
        self.current_metadata.processing_time = (
            self.current_metadata.end_time - self.current_metadata.start_time
        )
        self.current_metadata.output_files = [str(f) for f in output_files]
        self.current_metadata.status = "completed"

        # Update final statistics
        if final_statistics:
            self.current_metadata.statistics.update(final_statistics)

        # Calculate average CPU usage over the collected samples
        if self._cpu_samples:
            self.current_metadata.avg_cpu_percent = sum(self._cpu_samples) / len(self._cpu_samples)

        # Save metadata (must happen before current_metadata is cleared)
        self._save_metadata()

        logger.info(
            f"Completed job: {self.current_metadata.job_id} "
            f"(time: {self.current_metadata.processing_time:.2f}s, "
            f"memory: {self.current_metadata.peak_memory_mb:.2f}MB)"
        )

        completed_metadata = self.current_metadata
        self.current_metadata = None
        return completed_metadata

    def fail_job(self, error_message: str) -> ProcessingMetadata:
        """
        Mark job as failed.

        Args:
            error_message: Error description

        Returns:
            Failed ProcessingMetadata object

        Raises:
            ValueError: If no job is currently being tracked
        """
        if self.current_metadata is None:
            raise ValueError("No active job to fail")

        self.current_metadata.end_time = time.time()
        self.current_metadata.processing_time = (
            self.current_metadata.end_time - self.current_metadata.start_time
        )
        self.current_metadata.status = "failed"
        self.current_metadata.error_message = error_message

        # Save metadata
        self._save_metadata()

        logger.error(f"Failed job: {self.current_metadata.job_id} - {error_message}")

        failed_metadata = self.current_metadata
        self.current_metadata = None
        return failed_metadata

    def _save_metadata(self):
        """Save the current job's metadata to a JSON file (best-effort)."""
        if self.current_metadata is None:
            return

        try:
            # Create filename from workflow and job ID; reruns of the same
            # job_id overwrite the previous file.
            filename = f"{self.current_metadata.workflow}_{self.current_metadata.job_id}.json"
            filepath = self.output_dir / filename

            # Write metadata
            with open(filepath, "w") as f:
                f.write(self.current_metadata.to_json())

            logger.debug(f"Saved metadata: {filepath}")

        except Exception as e:
            # Persisting metadata must not break the processing pipeline.
            logger.error(f"Failed to save metadata: {e}")

    def get_job_history(self, workflow: Optional[str] = None) -> list:
        """
        Get processing history for completed jobs.

        Args:
            workflow: Filter by workflow name (None = all workflows)

        Returns:
            List of ProcessingMetadata dictionaries, newest first
        """
        history = []

        try:
            for metadata_file in self.output_dir.glob("*.json"):
                # Filter by workflow if specified. Filenames are
                # "<workflow>_<job_id>.json", so a prefix match suffices as
                # long as no workflow name is a prefix of another.
                if workflow and not metadata_file.stem.startswith(workflow):
                    continue

                with open(metadata_file) as f:
                    metadata = json.load(f)
                    history.append(metadata)

            # Sort by timestamp (newest first)
            history.sort(key=lambda x: x.get("timestamp", ""), reverse=True)

        except Exception as e:
            logger.error(f"Failed to load job history: {e}")

        return history

    def get_statistics_summary(self, workflow: str) -> Dict[str, Any]:
        """
        Get aggregated statistics for a workflow.

        Args:
            workflow: Workflow name

        Returns:
            Dictionary with aggregated statistics (job counts, success rate,
            and timing/memory aggregates when completed jobs exist)
        """
        history = self.get_job_history(workflow=workflow)

        if not history:
            return {
                "total_jobs": 0,
                "completed_jobs": 0,
                "failed_jobs": 0,
            }

        completed = [j for j in history if j["status"] == "completed"]
        failed = [j for j in history if j["status"] == "failed"]

        summary = {
            "total_jobs": len(history),
            "completed_jobs": len(completed),
            "failed_jobs": len(failed),
            "success_rate": len(completed) / len(history) if history else 0.0,
        }

        if completed:
            # Only include jobs with a recorded (truthy) value for each metric.
            processing_times = [j["processing_time"] for j in completed if j.get("processing_time")]
            memory_usage = [j["peak_memory_mb"] for j in completed if j.get("peak_memory_mb")]

            if processing_times:
                summary["avg_processing_time"] = sum(processing_times) / len(processing_times)
                summary["min_processing_time"] = min(processing_times)
                summary["max_processing_time"] = max(processing_times)

            if memory_usage:
                summary["avg_memory_mb"] = sum(memory_usage) / len(memory_usage)
                summary["peak_memory_mb"] = max(memory_usage)

        return summary
+
316
+
317
+ # Global metadata logger instance
318
+ _global_logger: Optional[MetadataLogger] = None
319
+
320
+
321
+ def get_metadata_logger(output_dir: Optional[Path] = None) -> MetadataLogger:
322
+ """
323
+ Get global metadata logger instance.
324
+
325
+ Args:
326
+ output_dir: Directory to save metadata logs
327
+
328
+ Returns:
329
+ MetadataLogger instance
330
+ """
331
+ global _global_logger
332
+
333
+ if _global_logger is None:
334
+ _global_logger = MetadataLogger(output_dir=output_dir)
335
+
336
+ return _global_logger
src/lib/quality_metrics.py ADDED
@@ -0,0 +1,397 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Audio quality metrics: SNR, STOI, PESQ calculation functions.
3
+
4
+ Provides objective quality measurements for audio extraction validation.
5
+ """
6
+
7
+ import logging
8
+ from typing import Optional, Tuple
9
+
10
+ import numpy as np
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class QualityMetricsError(Exception):
    """Raised when an audio quality metric cannot be computed."""
19
+
20
+
21
def calculate_snr(clean_signal: np.ndarray, noisy_signal: np.ndarray) -> float:
    """
    Calculate Signal-to-Noise Ratio (SNR) in dB.

    Measures the ratio of signal power to noise power.
    Higher values indicate cleaner audio.

    Args:
        clean_signal: Clean reference signal
        noisy_signal: Signal with noise

    Returns:
        SNR in dB (inf if signals are identical, -inf if the reference
        is silent)

    Raises:
        QualityMetricsError: If the calculation fails
    """
    try:
        # Truncate to a common length so elementwise subtraction is valid.
        min_len = min(len(clean_signal), len(noisy_signal))
        clean = clean_signal[:min_len]
        noisy = noisy_signal[:min_len]

        # Noise is whatever the noisy signal adds on top of the reference.
        noise = noisy - clean

        signal_power = np.mean(clean**2)
        noise_power = np.mean(noise**2)

        # Edge case: no noise at all (identical signals)
        if noise_power == 0:
            return float("inf")

        # Edge case: silent reference
        if signal_power == 0:
            return float("-inf")

        # Bug fix: cast to a plain Python float so the return value matches
        # the annotation instead of leaking np.float64.
        return float(10 * np.log10(signal_power / noise_power))

    except Exception as e:
        # Chain the original exception for easier debugging.
        raise QualityMetricsError(f"Failed to calculate SNR: {str(e)}") from e
66
+
67
+
68
def calculate_snr_segmental(
    signal: np.ndarray, sample_rate: int, frame_length_ms: int = 20
) -> float:
    """
    Calculate segmental SNR for signal without clean reference.

    Useful when you don't have a clean reference - estimates SNR by
    averaging per-frame power (in dB) over half-overlapping frames.

    Args:
        signal: Audio signal
        sample_rate: Sample rate in Hz
        frame_length_ms: Frame length in milliseconds

    Returns:
        Segmental SNR in dB (0.0 when the signal is too short for a frame
        or every frame is silent)

    Raises:
        QualityMetricsError: If the calculation fails
    """
    try:
        frame_len = int(sample_rate * frame_length_ms / 1000)
        hop = frame_len // 2  # 50% overlap between consecutive frames

        # Per-frame power, lazily evaluated.
        frame_powers = (
            np.mean(signal[start : start + frame_len] ** 2)
            for start in range(0, len(signal) - frame_len, hop)
        )
        # Silent frames (zero power) are excluded from the average.
        per_frame_db = [10 * np.log10(power) for power in frame_powers if power > 0]

        if not per_frame_db:
            return 0.0

        return np.mean(per_frame_db)

    except Exception as e:
        raise QualityMetricsError(f"Failed to calculate segmental SNR: {str(e)}")
106
+
107
+
108
def calculate_stoi(
    clean_signal: np.ndarray, degraded_signal: np.ndarray, sample_rate: int, extended: bool = True
) -> float:
    """
    Calculate Short-Time Objective Intelligibility (STOI) score.

    Measures speech intelligibility on a 0-1 scale (higher = better).
    Extended STOI (e-STOI) is better for intermediate quality levels.

    Args:
        clean_signal: Clean reference signal
        degraded_signal: Degraded signal to evaluate
        sample_rate: Sample rate in Hz
        extended: Use extended STOI (default: True)

    Returns:
        STOI score (0-1)

    Raises:
        QualityMetricsError: If calculation fails
    """
    try:
        from pystoi import stoi

        # Trim both signals to the shorter length so they align sample-for-sample.
        common = min(len(clean_signal), len(degraded_signal))
        return stoi(
            clean_signal[:common], degraded_signal[:common], sample_rate, extended=extended
        )

    except Exception as e:
        raise QualityMetricsError(f"Failed to calculate STOI: {str(e)}")
144
+
145
+
146
def calculate_pesq(
    reference_signal: np.ndarray, degraded_signal: np.ndarray, sample_rate: int, mode: str = "wb"
) -> float:
    """
    Calculate Perceptual Evaluation of Speech Quality (PESQ) score.

    Correlates with human perception of quality. Range: -0.5 to 4.5
    (higher = better). PESQ is only defined at 16 kHz (wideband) or
    8 kHz (narrowband).

    Args:
        reference_signal: Reference (clean) signal
        degraded_signal: Degraded signal to evaluate
        sample_rate: Sample rate in Hz (must be 8000 or 16000)
        mode: 'wb' (wideband, 16kHz) or 'nb' (narrowband, 8kHz)

    Returns:
        PESQ score

    Raises:
        QualityMetricsError: If calculation fails or sample rate is invalid
    """
    try:
        from pesq import pesq

        # Trim both signals to the shorter length.
        common = min(len(reference_signal), len(degraded_signal))
        reference = reference_signal[:common]
        degraded = degraded_signal[:common]

        # Validate the sample rate against the selected PESQ mode.
        if mode == "wb" and sample_rate != 16000:
            raise QualityMetricsError(
                f"Wideband PESQ requires 16kHz sample rate, got {sample_rate}Hz. "
                "Resample before calling this function."
            )
        elif mode == "nb" and sample_rate != 8000:
            raise QualityMetricsError(
                f"Narrowband PESQ requires 8kHz sample rate, got {sample_rate}Hz. "
                "Resample before calling this function."
            )

        return pesq(sample_rate, reference, degraded, mode)

    except QualityMetricsError:
        # Validation errors pass through unwrapped.
        raise
    except Exception as e:
        raise QualityMetricsError(f"Failed to calculate PESQ: {str(e)}")
195
+
196
+
197
def calculate_pesq_with_resampling(
    reference_signal: np.ndarray, degraded_signal: np.ndarray, sample_rate: int, mode: str = "wb"
) -> float:
    """
    Calculate PESQ with automatic resampling to required sample rate.

    Args:
        reference_signal: Reference signal
        degraded_signal: Degraded signal
        sample_rate: Current sample rate
        mode: 'wb' (wideband, 16kHz) or 'nb' (narrowband, 8kHz)

    Returns:
        PESQ score

    Raises:
        QualityMetricsError: If calculation fails
    """
    try:
        from pesq import pesq
        from scipy.signal import resample

        # Trim both signals to the shorter length.
        common = min(len(reference_signal), len(degraded_signal))
        reference = reference_signal[:common]
        degraded = degraded_signal[:common]

        # Resample both signals to the rate the selected PESQ mode expects.
        required_sr = 16000 if mode == "wb" else 8000
        if sample_rate != required_sr:
            new_len = int(len(reference) * required_sr / sample_rate)
            reference = resample(reference, new_len)
            degraded = resample(degraded, new_len)

        return pesq(required_sr, reference, degraded, mode)

    except Exception as e:
        raise QualityMetricsError(f"Failed to calculate PESQ with resampling: {str(e)}")
237
+
238
+
239
def validate_extraction_quality(
    original_signal: np.ndarray,
    extracted_signal: np.ndarray,
    sample_rate: int,
    snr_threshold: float = 20.0,
    stoi_threshold: float = 0.75,
    pesq_threshold: float = 2.5,
) -> dict:
    """
    Validate extraction quality against thresholds.

    Calculates all three metrics and checks if they meet minimum thresholds.

    Args:
        original_signal: Original (noisy) signal
        extracted_signal: Extracted (cleaned) signal
        sample_rate: Sample rate in Hz
        snr_threshold: Minimum SNR in dB (default: 20)
        stoi_threshold: Minimum STOI score (default: 0.75)
        pesq_threshold: Minimum PESQ score (default: 2.5)

    Returns:
        Dictionary with metrics and pass/fail status
    """
    results = {
        "snr": None,
        "snr_pass": False,
        "stoi": None,
        "stoi_pass": False,
        "pesq": None,
        "pesq_pass": False,
        "overall_pass": False,
    }

    try:
        # Each metric: (result key, zero-arg computation, minimum threshold).
        metric_runs = (
            ("snr", lambda: calculate_snr(original_signal, extracted_signal), snr_threshold),
            (
                "stoi",
                lambda: calculate_stoi(
                    original_signal, extracted_signal, sample_rate, extended=True
                ),
                stoi_threshold,
            ),
            (
                "pesq",
                lambda: calculate_pesq_with_resampling(
                    original_signal, extracted_signal, sample_rate, mode="wb"
                ),
                pesq_threshold,
            ),
        )

        # A failed metric is logged and left as None/False rather than aborting
        # the whole validation.
        for name, compute, threshold in metric_runs:
            try:
                value = compute()
            except Exception as e:
                logger.warning(f"{name.upper()} calculation failed: {e}")
            else:
                results[name] = value
                results[f"{name}_pass"] = value >= threshold

        # Overall pass requires every individual metric to have passed.
        results["overall_pass"] = (
            results["snr_pass"] and results["stoi_pass"] and results["pesq_pass"]
        )

    except Exception as e:
        logger.error(f"Quality validation failed: {e}")

    return results
310
+
311
+
312
def get_quality_label(metric_name: str, value: float) -> str:
    """
    Get quality label for a metric value.

    Args:
        metric_name: Metric name ('snr', 'stoi', 'pesq')
        value: Metric value

    Returns:
        Quality label string
    """
    # (strict lower bound, label) pairs per metric, checked best-first; the
    # fallback label applies when the value fails every bound.
    scales = {
        "snr": ((40, "Excellent"), (30, "Very Good"), (20, "Good"), (10, "Fair")),
        "stoi": ((0.9, "Excellent"), (0.8, "Very Good"), (0.7, "Good"), (0.6, "Fair")),
        "pesq": ((3.5, "Excellent"), (3.0, "Good"), (2.5, "Fair"), (2.0, "Poor")),
    }
    fallbacks = {"snr": "Poor", "stoi": "Poor", "pesq": "Bad"}

    if metric_name not in scales:
        return "Unknown"

    for lower_bound, label in scales[metric_name]:
        if value > lower_bound:
            return label
    return fallbacks[metric_name]
360
+
361
+
362
def generate_quality_report(metrics: dict) -> str:
    """
    Generate human-readable quality report.

    Args:
        metrics: Dictionary from validate_extraction_quality()

    Returns:
        Formatted report string
    """
    lines = ["=== Voice Extraction Quality Report ===", ""]

    # (metric key, value formatter) for each report row; metrics that were
    # never computed (None) are omitted from the report.
    rows = (
        ("snr", "SNR: {:.2f} dB"),
        ("stoi", "STOI: {:.3f}"),
        ("pesq", "PESQ: {:.2f}"),
    )
    for key, value_fmt in rows:
        value = metrics[key]
        if value is None:
            continue
        verdict = "PASS" if metrics[f"{key}_pass"] else "FAIL"
        grade = get_quality_label(key, value)
        lines.append(f"{value_fmt.format(value)} [{verdict}] - {grade}")

    overall = "PASS" if metrics["overall_pass"] else "FAIL"
    lines.append("")
    lines.append(f"Overall Quality: [{overall}]")

    return "\n".join(lines)