Spaces:
Paused
Paused
fix: correct gitignore to only exclude root-level models directory, not src/models package
Browse files- .gitignore +3 -3
- src/models/__init__.py +0 -0
- src/models/audio_segment.py +130 -0
- src/models/processing_job.py +320 -0
- src/models/speaker_profile.py +62 -0
- src/models/voice_profile.py +152 -0
.gitignore
CHANGED
|
@@ -41,8 +41,8 @@ env/
|
|
| 41 |
.env.*
|
| 42 |
!.env.example
|
| 43 |
|
| 44 |
-
# Models directory (HuggingFace cache)
|
| 45 |
-
models/
|
| 46 |
*.pt
|
| 47 |
*.pth
|
| 48 |
*.bin
|
|
@@ -79,4 +79,4 @@ tmp/
|
|
| 79 |
# Planning
|
| 80 |
specs
|
| 81 |
.specify
|
| 82 |
-
CLAUDE.md
|
|
|
|
| 41 |
.env.*
|
| 42 |
!.env.example
|
| 43 |
|
| 44 |
+
# Models directory (HuggingFace cache) - only at root level
|
| 45 |
+
/models/
|
| 46 |
*.pt
|
| 47 |
*.pth
|
| 48 |
*.bin
|
|
|
|
| 79 |
# Planning
|
| 80 |
specs
|
| 81 |
.specify
|
| 82 |
+
CLAUDE.md
|
src/models/__init__.py
ADDED
|
File without changes
|
src/models/audio_segment.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Audio Segment Model
|
| 3 |
+
|
| 4 |
+
Represents a contiguous portion of audio with speaker and timing information.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
from enum import Enum
|
| 9 |
+
from typing import List, Optional
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class SegmentType(Enum):
    """Classification labels for portions of an audio stream."""

    SPEECH = "speech"
    NONVERBAL = "nonverbal"
    SILENCE = "silence"
    OVERLAP = "overlap"  # Multiple speakers talking simultaneously


@dataclass
class AudioSegment:
    """
    A contiguous span of audio attributed to a single speaker.

    Attributes:
        start_time: Segment start in seconds (must be >= 0)
        end_time: Segment end in seconds (must exceed start_time)
        speaker_id: Identifier of the speaker heard in this span
        confidence: Speaker-identification certainty in [0.0, 1.0]
        segment_type: What kind of audio the span contains
        audio_file: Optional path to the source recording
    """

    start_time: float
    end_time: float
    speaker_id: str
    confidence: float = 1.0
    segment_type: SegmentType = SegmentType.SPEECH
    audio_file: Optional[str] = None

    def __post_init__(self):
        """Reject segments with impossible timing or out-of-range confidence."""
        if self.start_time < 0:
            raise ValueError(f"Start time cannot be negative: {self.start_time}")

        if self.end_time <= self.start_time:
            raise ValueError(
                f"End time ({self.end_time}) must be after start time ({self.start_time})"
            )

        if not 0.0 <= self.confidence <= 1.0:
            raise ValueError(f"Confidence must be between 0.0 and 1.0, got {self.confidence}")

    @property
    def duration(self) -> float:
        """Length of the segment in seconds."""
        return self.end_time - self.start_time

    def overlaps_with(self, other: "AudioSegment") -> bool:
        """Return True when this segment shares any time with *other*."""
        # Two intervals overlap iff each one starts before the other ends.
        return self.start_time < other.end_time and other.start_time < self.end_time

    def contains_time(self, time: float) -> bool:
        """Return True when *time* lies within this segment (endpoints inclusive)."""
        return self.start_time <= time and time <= self.end_time

    def __repr__(self) -> str:
        return (
            f"AudioSegment("
            f"speaker='{self.speaker_id}', "
            f"time={self.start_time:.2f}-{self.end_time:.2f}s, "
            f"duration={self.duration:.2f}s, "
            f"confidence={self.confidence:.2f}, "
            f"type={self.segment_type.value})"
        )


class SegmentCollection:
    """
    A group of audio segments with filtering and summary helpers.

    Supports len/iteration/indexing and produces new collections when
    filtered or sorted (the underlying list is never mutated).
    """

    def __init__(self, segments: List[AudioSegment]):
        """Wrap the given segment list."""
        self.segments = segments

    def __len__(self) -> int:
        """Number of segments held."""
        return len(self.segments)

    def __iter__(self):
        """Iterate segments in stored order."""
        return iter(self.segments)

    def __getitem__(self, index):
        """Index into the underlying segment list."""
        return self.segments[index]

    @property
    def total_duration(self) -> float:
        """Sum of all segment durations in seconds."""
        return sum(item.duration for item in self.segments)

    def filter_by_speaker(self, speaker_id: str) -> "SegmentCollection":
        """Return a new collection with only *speaker_id*'s segments."""
        return SegmentCollection(
            [item for item in self.segments if item.speaker_id == speaker_id]
        )

    def filter_by_type(self, segment_type: SegmentType) -> "SegmentCollection":
        """Return a new collection with only segments of *segment_type*."""
        return SegmentCollection(
            [item for item in self.segments if item.segment_type == segment_type]
        )

    def sort_by_time(self) -> "SegmentCollection":
        """Return a new collection ordered by segment start time."""
        return SegmentCollection(sorted(self.segments, key=lambda item: item.start_time))

    def get_speakers(self) -> List[str]:
        """Unique speaker IDs present in the collection (unordered)."""
        return list({item.speaker_id for item in self.segments})

    def average_confidence(self) -> float:
        """Mean confidence over all segments; 0.0 for an empty collection."""
        if not self.segments:
            return 0.0
        return sum(item.confidence for item in self.segments) / len(self.segments)
|
src/models/processing_job.py
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ProcessingJob data model: Batch configuration and execution tracking.
|
| 3 |
+
|
| 4 |
+
Represents a voice extraction job with configuration and state.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from dataclasses import dataclass, field
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
from enum import Enum
|
| 10 |
+
from typing import List, Literal, Optional
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class ExtractionMode(Enum):
    """Which kinds of audio a job should extract."""

    SPEECH = "speech"
    NONVERBAL = "nonverbal"
    BOTH = "both"


class JobStatus(Enum):
    """Lifecycle states of a processing job."""

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"


@dataclass
class ProcessingJob:
    """
    Batch voice-extraction job: configuration, live progress, and results.

    Holds the requested inputs and options, tracks per-file success and
    failure as processing advances, and aggregates duration statistics.
    """

    # Input configuration
    reference_file: str
    input_files: List[str]
    output_dir: str

    # Processing options
    extraction_mode: ExtractionMode = ExtractionMode.SPEECH
    apply_denoising: bool = False
    vad_threshold: float = 0.5
    quality_threshold_enabled: bool = True

    # Job state (timestamps stored as ISO-format strings)
    status: JobStatus = JobStatus.PENDING
    job_id: Optional[str] = None
    created_at: Optional[str] = None
    started_at: Optional[str] = None
    completed_at: Optional[str] = None

    # Progress tracking
    total_files: int = 0
    files_processed: int = 0
    files_failed: int = 0
    current_file: Optional[str] = None

    # Results
    output_files: List[str] = field(default_factory=list)
    failed_files: List[dict] = field(default_factory=list)  # each entry: {"file", "error"}

    # Statistics (seconds)
    total_input_duration: float = 0.0
    total_extracted_duration: float = 0.0
    total_processing_time: float = 0.0

    def __post_init__(self):
        """Derive job_id, created_at, and total_files when not supplied."""
        if self.job_id is None:
            # A timestamp-based ID keeps jobs naturally sortable by creation time.
            stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            self.job_id = f"job_{stamp}"

        if self.created_at is None:
            self.created_at = datetime.now().isoformat()

        # Always reflects the actual input list, even for deserialized jobs.
        self.total_files = len(self.input_files)

    @property
    def progress_percentage(self) -> float:
        """
        Job progress as a percentage.

        Returns:
            Progress in [0, 100]; 0.0 when the job has no files.
        """
        if self.total_files == 0:
            return 0.0
        return (self.files_processed / self.total_files) * 100

    @property
    def success_rate(self) -> float:
        """
        Fraction of processed files that succeeded, as a percentage.

        Returns:
            Success rate in [0, 100]; 0.0 before anything was processed.
        """
        done = self.files_processed
        if done == 0:
            return 0.0
        return ((done - self.files_failed) / done) * 100

    @property
    def extraction_yield(self) -> float:
        """
        Extracted audio as a percentage of input audio.

        Returns:
            Yield in [0, 100]; 0.0 when no input duration was recorded.
        """
        if self.total_input_duration == 0:
            return 0.0
        return (self.total_extracted_duration / self.total_input_duration) * 100

    @property
    def is_complete(self) -> bool:
        """True once the job reached a terminal state."""
        return self.status in (JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED)

    @property
    def is_running(self) -> bool:
        """True while the job is actively processing."""
        return self.status == JobStatus.RUNNING

    def start(self):
        """Transition to RUNNING and record the start timestamp."""
        self.status = JobStatus.RUNNING
        self.started_at = datetime.now().isoformat()

    def complete(self):
        """Transition to COMPLETED, record the end timestamp, and compute wall time."""
        self.status = JobStatus.COMPLETED
        self.completed_at = datetime.now().isoformat()

        if self.started_at and self.completed_at:
            begun = datetime.fromisoformat(self.started_at)
            ended = datetime.fromisoformat(self.completed_at)
            self.total_processing_time = (ended - begun).total_seconds()

    def fail(self, error: str):
        """
        Transition to FAILED and record a job-level error entry.

        Args:
            error: Description of what broke the job as a whole
        """
        self.status = JobStatus.FAILED
        self.completed_at = datetime.now().isoformat()
        # The sentinel file name "JOB" marks job-level (not per-file) failures.
        self.failed_files.append({"file": "JOB", "error": error})

    def cancel(self):
        """Transition to CANCELLED and record the end timestamp."""
        self.status = JobStatus.CANCELLED
        self.completed_at = datetime.now().isoformat()

    def add_success(
        self, input_file: str, output_file: str, input_duration: float, extracted_duration: float
    ):
        """
        Record one successfully processed file.

        Args:
            input_file: Input file path
            output_file: Output file path
            input_duration: Input file duration in seconds
            extracted_duration: Extracted audio duration in seconds
        """
        self.files_processed += 1
        self.output_files.append(output_file)
        self.total_input_duration += input_duration
        self.total_extracted_duration += extracted_duration

    def add_failure(self, input_file: str, error: str):
        """
        Record one failed file.

        Args:
            input_file: Input file path that failed
            error: Error message
        """
        # A failed file still counts toward files_processed (see success_rate).
        self.files_processed += 1
        self.files_failed += 1
        self.failed_files.append({"file": input_file, "error": error})

    def update_progress(self, current_file: str):
        """
        Note which file is currently being processed.

        Args:
            current_file: Currently processing file path
        """
        self.current_file = current_file

    def get_summary(self) -> dict:
        """
        Summary statistics for reporting.

        Returns:
            Dictionary of status, counts, percentages, and timestamps
        """
        return {
            "job_id": self.job_id,
            "status": self.status.value,
            "extraction_mode": self.extraction_mode.value,
            "apply_denoising": self.apply_denoising,
            "total_files": self.total_files,
            "files_processed": self.files_processed,
            "files_succeeded": self.files_processed - self.files_failed,
            "files_failed": self.files_failed,
            "progress_percentage": self.progress_percentage,
            "success_rate": self.success_rate,
            "total_input_duration": self.total_input_duration,
            "total_extracted_duration": self.total_extracted_duration,
            "extraction_yield": self.extraction_yield,
            "total_processing_time": self.total_processing_time,
            "created_at": self.created_at,
            "started_at": self.started_at,
            "completed_at": self.completed_at,
        }

    def to_dict(self) -> dict:
        """Serialize the full job state (plus a computed summary) to a dict."""
        return {
            "job_id": self.job_id,
            "reference_file": self.reference_file,
            "input_files": self.input_files,
            "output_dir": self.output_dir,
            "extraction_mode": self.extraction_mode.value,
            "apply_denoising": self.apply_denoising,
            "vad_threshold": self.vad_threshold,
            "quality_threshold_enabled": self.quality_threshold_enabled,
            "status": self.status.value,
            "created_at": self.created_at,
            "started_at": self.started_at,
            "completed_at": self.completed_at,
            "total_files": self.total_files,
            "files_processed": self.files_processed,
            "files_failed": self.files_failed,
            "current_file": self.current_file,
            "output_files": self.output_files,
            "failed_files": self.failed_files,
            "total_input_duration": self.total_input_duration,
            "total_extracted_duration": self.total_extracted_duration,
            "total_processing_time": self.total_processing_time,
            "summary": self.get_summary(),
        }

    @classmethod
    def from_dict(cls, data: dict) -> "ProcessingJob":
        """Rebuild a job from a dict produced by to_dict()."""
        payload = data.copy()

        # Enum fields round-trip as their string values.
        mode = payload.get("extraction_mode")
        if isinstance(mode, str):
            payload["extraction_mode"] = ExtractionMode(mode)

        state = payload.get("status")
        if isinstance(state, str):
            payload["status"] = JobStatus(state)

        # "summary" is computed, not a constructor argument.
        payload.pop("summary", None)

        return cls(**payload)

    def generate_report(self) -> str:
        """
        Build a human-readable job report.

        Returns:
            Multi-line formatted report string
        """
        lines = ["=== Voice Extraction Job Report ===", ""]

        lines.extend(
            [
                f"Job ID: {self.job_id}",
                f"Status: {self.status.value.upper()}",
                f"Mode: {self.extraction_mode.value}",
                f"Denoising: {'Enabled' if self.apply_denoising else 'Disabled'}",
                "",
                f"Files Processed: {self.files_processed}/{self.total_files}",
                f"Success Rate: {self.success_rate:.1f}%",
                f"Progress: {self.progress_percentage:.1f}%",
                "",
                f"Input Duration: {self.total_input_duration / 60:.1f} minutes",
                f"Extracted Duration: {self.total_extracted_duration / 60:.1f} minutes",
                f"Extraction Yield: {self.extraction_yield:.1f}%",
            ]
        )

        if self.total_processing_time > 0:
            lines.append(f"Processing Time: {self.total_processing_time / 60:.1f} minutes")

        if self.files_failed > 0:
            lines.append("")
            lines.append(f"Failed Files ({self.files_failed}):")
            for failure in self.failed_files[:5]:  # Show first 5
                lines.append(f"  - {failure['file']}: {failure['error']}")

            if len(self.failed_files) > 5:
                lines.append(f"  ... and {len(self.failed_files) - 5} more")

        return "\n".join(lines)
|
src/models/speaker_profile.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Speaker Profile Model
|
| 3 |
+
|
| 4 |
+
Represents a speaker's voice characteristics extracted from audio,
|
| 5 |
+
used for identification and matching.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from dataclasses import dataclass
|
| 9 |
+
from typing import Optional
|
| 10 |
+
|
| 11 |
+
import numpy as np
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass
class SpeakerProfile:
    """
    Voice characteristics of one speaker, extracted from a reference clip.

    Attributes:
        speaker_id: Unique identifier (e.g., "SPEAKER_00", "SPEAKER_01")
        embedding: Numerical representation of voice characteristics
        source_audio: Path to the audio file this profile was extracted from
        start_time: Start time in source audio (seconds)
        end_time: End time in source audio (seconds)
        confidence: Quality/reliability metric (0.0-1.0)
    """

    speaker_id: str
    embedding: np.ndarray
    source_audio: str
    start_time: float
    end_time: float
    confidence: float = 1.0

    def __post_init__(self):
        """Reject empty embeddings, bad time ranges, and too-short clips."""
        if self.embedding is None or len(self.embedding) == 0:
            raise ValueError("Embedding vector cannot be empty")

        if self.end_time <= self.start_time:
            raise ValueError(
                f"End time ({self.end_time}) must be after start time ({self.start_time})"
            )

        if not 0.0 <= self.confidence <= 1.0:
            raise ValueError(f"Confidence must be between 0.0 and 1.0, got {self.confidence}")

        # Very short clips produce unreliable embeddings, so refuse them.
        span = self.end_time - self.start_time
        if span < 3.0:
            raise ValueError(f"Duration ({span}s) is too short (minimum 3 seconds recommended)")

    @property
    def duration(self) -> float:
        """Length of the source span in seconds."""
        return self.end_time - self.start_time

    def __repr__(self) -> str:
        return (
            f"SpeakerProfile(speaker_id='{self.speaker_id}', "
            f"duration={self.duration:.2f}s, "
            f"confidence={self.confidence:.2f})"
        )
|
src/models/voice_profile.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VoiceProfile data model: Reference embedding and speaker identification.
|
| 3 |
+
|
| 4 |
+
Represents a target voice profile extracted from reference audio clip.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from dataclasses import dataclass, field
|
| 8 |
+
from typing import Optional
|
| 9 |
+
|
| 10 |
+
import numpy as np
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass
class VoiceProfile:
    """
    Voice profile representing a target speaker.

    Contains an embedding vector plus metadata for voice identification,
    with helpers for similarity matching and .npz persistence.

    Attributes:
        speaker_id: Unique speaker identifier
        embedding: 1-D embedding vector (512-dim from pyannote per the field comment)
        reference_file: Path of the reference clip the embedding came from
        reference_duration: Reference clip length in seconds
        embedding_quality: Confidence score in [0, 1]
        num_speech_segments: Number of speech segments used for the embedding
        sample_rate: Audio sample rate in Hz (default 16000)
        created_at: Optional ISO-format creation timestamp
    """

    # Core identification
    speaker_id: str
    embedding: np.ndarray  # 512-dimensional vector from pyannote

    # Source information
    reference_file: str
    reference_duration: float  # seconds

    # Quality metrics
    embedding_quality: float = 1.0  # 0-1 score indicating embedding confidence
    num_speech_segments: int = 0

    # Metadata
    sample_rate: int = 16000
    created_at: Optional[str] = None

    def __post_init__(self):
        """Validate voice profile after initialization.

        Raises:
            ValueError: If the embedding is not 1-D or quality is out of range.
        """
        if self.embedding.ndim != 1:
            raise ValueError("Embedding must be 1-dimensional vector")

        if self.embedding_quality < 0 or self.embedding_quality > 1:
            raise ValueError("Embedding quality must be between 0 and 1")

    def similarity(self, other_embedding: np.ndarray) -> float:
        """
        Calculate cosine similarity with another embedding.

        Args:
            other_embedding: Another voice embedding vector

        Returns:
            Similarity score (0-1, higher = more similar)
        """
        # Lazy import keeps scipy optional until similarity is actually needed.
        from scipy.spatial.distance import cosine

        # Cosine similarity = 1 - cosine distance; clamp to [0, 1] since
        # raw cosine similarity ranges over [-1, 1].
        similarity = 1 - cosine(self.embedding, other_embedding)
        return max(0.0, min(1.0, similarity))

    def matches(self, other_embedding: np.ndarray, threshold: float = 0.7) -> bool:
        """
        Check if another embedding matches this voice profile.

        Args:
            other_embedding: Voice embedding to compare
            threshold: Similarity threshold for match (default: 0.7)

        Returns:
            True if embeddings match above threshold
        """
        return self.similarity(other_embedding) >= threshold

    def to_dict(self) -> dict:
        """
        Convert voice profile to dictionary (embedding as a plain list).

        Returns:
            Dictionary representation
        """
        return {
            "speaker_id": self.speaker_id,
            "embedding": self.embedding.tolist(),
            "reference_file": self.reference_file,
            "reference_duration": self.reference_duration,
            "embedding_quality": self.embedding_quality,
            "num_speech_segments": self.num_speech_segments,
            "sample_rate": self.sample_rate,
            "created_at": self.created_at,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "VoiceProfile":
        """
        Create voice profile from dictionary.

        Args:
            data: Dictionary representation (as produced by to_dict)

        Returns:
            VoiceProfile instance
        """
        data = data.copy()
        data["embedding"] = np.array(data["embedding"])
        return cls(**data)

    def save(self, file_path: str):
        """
        Save voice profile to file.

        Args:
            file_path: Output file path (.npz format; np.savez appends the
                .npz suffix if not already present)
        """
        # NOTE: previously re-imported numpy here; the module-level import
        # already provides np, so the redundant local import was removed.
        np.savez(
            file_path,
            speaker_id=self.speaker_id,
            embedding=self.embedding,
            reference_file=self.reference_file,
            reference_duration=self.reference_duration,
            embedding_quality=self.embedding_quality,
            num_speech_segments=self.num_speech_segments,
            sample_rate=self.sample_rate,
            # np.savez cannot store None; empty string round-trips to None in load().
            created_at=self.created_at or "",
        )

    @classmethod
    def load(cls, file_path: str) -> "VoiceProfile":
        """
        Load voice profile from file.

        Args:
            file_path: Input file path (.npz format)

        Returns:
            VoiceProfile instance
        """
        # allow_pickle=True is needed for the stored string fields; only load
        # files produced by save() — pickled archives from untrusted sources
        # are unsafe.
        data = np.load(file_path, allow_pickle=True)

        return cls(
            speaker_id=str(data["speaker_id"]),
            embedding=data["embedding"],
            reference_file=str(data["reference_file"]),
            reference_duration=float(data["reference_duration"]),
            embedding_quality=float(data["embedding_quality"]),
            num_speech_segments=int(data["num_speech_segments"]),
            sample_rate=int(data["sample_rate"]),
            created_at=str(data["created_at"]) if data["created_at"] else None,
        )
|