Spaces:
Paused
Paused
fix: correct gitignore to only exclude root-level models directory, not src/models package
Browse files- .gitignore +3 -3
- src/models/__init__.py +0 -0
- src/models/audio_segment.py +130 -0
- src/models/processing_job.py +320 -0
- src/models/speaker_profile.py +62 -0
- src/models/voice_profile.py +152 -0
.gitignore
CHANGED
|
@@ -41,8 +41,8 @@ env/
|
|
| 41 |
.env.*
|
| 42 |
!.env.example
|
| 43 |
|
| 44 |
-
# Models directory (HuggingFace cache)
|
| 45 |
-
models/
|
| 46 |
*.pt
|
| 47 |
*.pth
|
| 48 |
*.bin
|
|
@@ -79,4 +79,4 @@ tmp/
|
|
| 79 |
# Planning
|
| 80 |
specs
|
| 81 |
.specify
|
| 82 |
-
CLAUDE.md
|
|
|
|
| 41 |
.env.*
|
| 42 |
!.env.example
|
| 43 |
|
| 44 |
+
# Models directory (HuggingFace cache) - only at root level
|
| 45 |
+
/models/
|
| 46 |
*.pt
|
| 47 |
*.pth
|
| 48 |
*.bin
|
|
|
|
| 79 |
# Planning
|
| 80 |
specs
|
| 81 |
.specify
|
| 82 |
+
CLAUDE.md
|
src/models/__init__.py
ADDED
|
File without changes
|
src/models/audio_segment.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Audio Segment Model
|
| 3 |
+
|
| 4 |
+
Represents a contiguous portion of audio with speaker and timing information.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
from enum import Enum
|
| 9 |
+
from typing import List, Optional
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class SegmentType(Enum):
    """Classification labels for portions of an audio stream."""

    SPEECH = "speech"
    NONVERBAL = "nonverbal"
    SILENCE = "silence"
    OVERLAP = "overlap"  # Multiple speakers talking simultaneously


@dataclass
class AudioSegment:
    """
    A contiguous span of audio attributed to a single speaker.

    Attributes:
        start_time: Segment start in seconds (must be >= 0)
        end_time: Segment end in seconds (must exceed start_time)
        speaker_id: Identifier of the speaker heard in this span
        confidence: Speaker-identification certainty in [0.0, 1.0]
        segment_type: What kind of audio the span contains
        audio_file: Optional path to the source recording
    """

    start_time: float
    end_time: float
    speaker_id: str
    confidence: float = 1.0
    segment_type: SegmentType = SegmentType.SPEECH
    audio_file: Optional[str] = None

    def __post_init__(self):
        """Reject segments with impossible timing or out-of-range confidence."""
        if self.start_time < 0:
            raise ValueError(f"Start time cannot be negative: {self.start_time}")

        if self.end_time <= self.start_time:
            raise ValueError(
                f"End time ({self.end_time}) must be after start time ({self.start_time})"
            )

        if not 0.0 <= self.confidence <= 1.0:
            raise ValueError(f"Confidence must be between 0.0 and 1.0, got {self.confidence}")

    @property
    def duration(self) -> float:
        """Length of the segment in seconds."""
        return self.end_time - self.start_time

    def overlaps_with(self, other: "AudioSegment") -> bool:
        """Return True when this segment shares any time with *other*."""
        # Two intervals overlap iff each one starts before the other ends.
        return self.start_time < other.end_time and other.start_time < self.end_time

    def contains_time(self, time: float) -> bool:
        """Return True when *time* lies within this segment (endpoints inclusive)."""
        return self.start_time <= time and time <= self.end_time

    def __repr__(self) -> str:
        return (
            f"AudioSegment("
            f"speaker='{self.speaker_id}', "
            f"time={self.start_time:.2f}-{self.end_time:.2f}s, "
            f"duration={self.duration:.2f}s, "
            f"confidence={self.confidence:.2f}, "
            f"type={self.segment_type.value})"
        )


class SegmentCollection:
    """
    A group of audio segments with filtering and summary helpers.

    Supports len/iteration/indexing and produces new collections when
    filtered or sorted (the underlying list is never mutated).
    """

    def __init__(self, segments: List[AudioSegment]):
        """Wrap the given segment list."""
        self.segments = segments

    def __len__(self) -> int:
        """Number of segments held."""
        return len(self.segments)

    def __iter__(self):
        """Iterate segments in stored order."""
        return iter(self.segments)

    def __getitem__(self, index):
        """Index into the underlying segment list."""
        return self.segments[index]

    @property
    def total_duration(self) -> float:
        """Sum of all segment durations in seconds."""
        return sum(item.duration for item in self.segments)

    def filter_by_speaker(self, speaker_id: str) -> "SegmentCollection":
        """Return a new collection with only *speaker_id*'s segments."""
        return SegmentCollection(
            [item for item in self.segments if item.speaker_id == speaker_id]
        )

    def filter_by_type(self, segment_type: SegmentType) -> "SegmentCollection":
        """Return a new collection with only segments of *segment_type*."""
        return SegmentCollection(
            [item for item in self.segments if item.segment_type == segment_type]
        )

    def sort_by_time(self) -> "SegmentCollection":
        """Return a new collection ordered by segment start time."""
        return SegmentCollection(sorted(self.segments, key=lambda item: item.start_time))

    def get_speakers(self) -> List[str]:
        """Unique speaker IDs present in the collection (unordered)."""
        return list({item.speaker_id for item in self.segments})

    def average_confidence(self) -> float:
        """Mean confidence over all segments; 0.0 for an empty collection."""
        if not self.segments:
            return 0.0
        return sum(item.confidence for item in self.segments) / len(self.segments)
|
src/models/processing_job.py
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ProcessingJob data model: Batch configuration and execution tracking.
|
| 3 |
+
|
| 4 |
+
Represents a voice extraction job with configuration and state.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from dataclasses import dataclass, field
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
from enum import Enum
|
| 10 |
+
from typing import List, Literal, Optional
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class ExtractionMode(Enum):
    """Which kinds of audio a job should extract."""

    SPEECH = "speech"
    NONVERBAL = "nonverbal"
    BOTH = "both"


class JobStatus(Enum):
    """Lifecycle states of a processing job."""

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"


@dataclass
class ProcessingJob:
    """
    Batch voice-extraction job: configuration, live progress, and results.

    Holds the requested inputs and options, tracks per-file success and
    failure as processing advances, and aggregates duration statistics.
    """

    # Input configuration
    reference_file: str
    input_files: List[str]
    output_dir: str

    # Processing options
    extraction_mode: ExtractionMode = ExtractionMode.SPEECH
    apply_denoising: bool = False
    vad_threshold: float = 0.5
    quality_threshold_enabled: bool = True

    # Job state (timestamps stored as ISO-format strings)
    status: JobStatus = JobStatus.PENDING
    job_id: Optional[str] = None
    created_at: Optional[str] = None
    started_at: Optional[str] = None
    completed_at: Optional[str] = None

    # Progress tracking
    total_files: int = 0
    files_processed: int = 0
    files_failed: int = 0
    current_file: Optional[str] = None

    # Results
    output_files: List[str] = field(default_factory=list)
    failed_files: List[dict] = field(default_factory=list)  # each entry: {"file", "error"}

    # Statistics (seconds)
    total_input_duration: float = 0.0
    total_extracted_duration: float = 0.0
    total_processing_time: float = 0.0

    def __post_init__(self):
        """Derive job_id, created_at, and total_files when not supplied."""
        if self.job_id is None:
            # A timestamp-based ID keeps jobs naturally sortable by creation time.
            stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            self.job_id = f"job_{stamp}"

        if self.created_at is None:
            self.created_at = datetime.now().isoformat()

        # Always reflects the actual input list, even for deserialized jobs.
        self.total_files = len(self.input_files)

    @property
    def progress_percentage(self) -> float:
        """
        Job progress as a percentage.

        Returns:
            Progress in [0, 100]; 0.0 when the job has no files.
        """
        if self.total_files == 0:
            return 0.0
        return (self.files_processed / self.total_files) * 100

    @property
    def success_rate(self) -> float:
        """
        Fraction of processed files that succeeded, as a percentage.

        Returns:
            Success rate in [0, 100]; 0.0 before anything was processed.
        """
        done = self.files_processed
        if done == 0:
            return 0.0
        return ((done - self.files_failed) / done) * 100

    @property
    def extraction_yield(self) -> float:
        """
        Extracted audio as a percentage of input audio.

        Returns:
            Yield in [0, 100]; 0.0 when no input duration was recorded.
        """
        if self.total_input_duration == 0:
            return 0.0
        return (self.total_extracted_duration / self.total_input_duration) * 100

    @property
    def is_complete(self) -> bool:
        """True once the job reached a terminal state."""
        return self.status in (JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED)

    @property
    def is_running(self) -> bool:
        """True while the job is actively processing."""
        return self.status == JobStatus.RUNNING

    def start(self):
        """Transition to RUNNING and record the start timestamp."""
        self.status = JobStatus.RUNNING
        self.started_at = datetime.now().isoformat()

    def complete(self):
        """Transition to COMPLETED, record the end timestamp, and compute wall time."""
        self.status = JobStatus.COMPLETED
        self.completed_at = datetime.now().isoformat()

        if self.started_at and self.completed_at:
            begun = datetime.fromisoformat(self.started_at)
            ended = datetime.fromisoformat(self.completed_at)
            self.total_processing_time = (ended - begun).total_seconds()

    def fail(self, error: str):
        """
        Transition to FAILED and record a job-level error entry.

        Args:
            error: Description of what broke the job as a whole
        """
        self.status = JobStatus.FAILED
        self.completed_at = datetime.now().isoformat()
        # The sentinel file name "JOB" marks job-level (not per-file) failures.
        self.failed_files.append({"file": "JOB", "error": error})

    def cancel(self):
        """Transition to CANCELLED and record the end timestamp."""
        self.status = JobStatus.CANCELLED
        self.completed_at = datetime.now().isoformat()

    def add_success(
        self, input_file: str, output_file: str, input_duration: float, extracted_duration: float
    ):
        """
        Record one successfully processed file.

        Args:
            input_file: Input file path
            output_file: Output file path
            input_duration: Input file duration in seconds
            extracted_duration: Extracted audio duration in seconds
        """
        self.files_processed += 1
        self.output_files.append(output_file)
        self.total_input_duration += input_duration
        self.total_extracted_duration += extracted_duration

    def add_failure(self, input_file: str, error: str):
        """
        Record one failed file.

        Args:
            input_file: Input file path that failed
            error: Error message
        """
        # A failed file still counts toward files_processed (see success_rate).
        self.files_processed += 1
        self.files_failed += 1
        self.failed_files.append({"file": input_file, "error": error})

    def update_progress(self, current_file: str):
        """
        Note which file is currently being processed.

        Args:
            current_file: Currently processing file path
        """
        self.current_file = current_file

    def get_summary(self) -> dict:
        """
        Summary statistics for reporting.

        Returns:
            Dictionary of status, counts, percentages, and timestamps
        """
        return {
            "job_id": self.job_id,
            "status": self.status.value,
            "extraction_mode": self.extraction_mode.value,
            "apply_denoising": self.apply_denoising,
            "total_files": self.total_files,
            "files_processed": self.files_processed,
            "files_succeeded": self.files_processed - self.files_failed,
            "files_failed": self.files_failed,
            "progress_percentage": self.progress_percentage,
            "success_rate": self.success_rate,
            "total_input_duration": self.total_input_duration,
            "total_extracted_duration": self.total_extracted_duration,
            "extraction_yield": self.extraction_yield,
            "total_processing_time": self.total_processing_time,
            "created_at": self.created_at,
            "started_at": self.started_at,
            "completed_at": self.completed_at,
        }

    def to_dict(self) -> dict:
        """Serialize the full job state (plus a computed summary) to a dict."""
        return {
            "job_id": self.job_id,
            "reference_file": self.reference_file,
            "input_files": self.input_files,
            "output_dir": self.output_dir,
            "extraction_mode": self.extraction_mode.value,
            "apply_denoising": self.apply_denoising,
            "vad_threshold": self.vad_threshold,
            "quality_threshold_enabled": self.quality_threshold_enabled,
            "status": self.status.value,
            "created_at": self.created_at,
            "started_at": self.started_at,
            "completed_at": self.completed_at,
            "total_files": self.total_files,
            "files_processed": self.files_processed,
            "files_failed": self.files_failed,
            "current_file": self.current_file,
            "output_files": self.output_files,
            "failed_files": self.failed_files,
            "total_input_duration": self.total_input_duration,
            "total_extracted_duration": self.total_extracted_duration,
            "total_processing_time": self.total_processing_time,
            "summary": self.get_summary(),
        }

    @classmethod
    def from_dict(cls, data: dict) -> "ProcessingJob":
        """Rebuild a job from a dict produced by to_dict()."""
        payload = data.copy()

        # Enum fields round-trip as their string values.
        mode = payload.get("extraction_mode")
        if isinstance(mode, str):
            payload["extraction_mode"] = ExtractionMode(mode)

        state = payload.get("status")
        if isinstance(state, str):
            payload["status"] = JobStatus(state)

        # "summary" is computed, not a constructor argument.
        payload.pop("summary", None)

        return cls(**payload)

    def generate_report(self) -> str:
        """
        Build a human-readable job report.

        Returns:
            Multi-line formatted report string
        """
        lines = ["=== Voice Extraction Job Report ===", ""]

        lines.extend(
            [
                f"Job ID: {self.job_id}",
                f"Status: {self.status.value.upper()}",
                f"Mode: {self.extraction_mode.value}",
                f"Denoising: {'Enabled' if self.apply_denoising else 'Disabled'}",
                "",
                f"Files Processed: {self.files_processed}/{self.total_files}",
                f"Success Rate: {self.success_rate:.1f}%",
                f"Progress: {self.progress_percentage:.1f}%",
                "",
                f"Input Duration: {self.total_input_duration / 60:.1f} minutes",
                f"Extracted Duration: {self.total_extracted_duration / 60:.1f} minutes",
                f"Extraction Yield: {self.extraction_yield:.1f}%",
            ]
        )

        if self.total_processing_time > 0:
            lines.append(f"Processing Time: {self.total_processing_time / 60:.1f} minutes")

        if self.files_failed > 0:
            lines.append("")
            lines.append(f"Failed Files ({self.files_failed}):")
            for failure in self.failed_files[:5]:  # Show first 5
                lines.append(f"  - {failure['file']}: {failure['error']}")

            if len(self.failed_files) > 5:
                lines.append(f"  ... and {len(self.failed_files) - 5} more")

        return "\n".join(lines)
|
src/models/speaker_profile.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Speaker Profile Model
|
| 3 |
+
|
| 4 |
+
Represents a speaker's voice characteristics extracted from audio,
|
| 5 |
+
used for identification and matching.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from dataclasses import dataclass
|
| 9 |
+
from typing import Optional
|
| 10 |
+
|
| 11 |
+
import numpy as np
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass
class SpeakerProfile:
    """
    Voice characteristics of one speaker, extracted from a reference clip.

    Attributes:
        speaker_id: Unique identifier (e.g., "SPEAKER_00", "SPEAKER_01")
        embedding: Numerical representation of voice characteristics
        source_audio: Path to the audio file this profile was extracted from
        start_time: Start time in source audio (seconds)
        end_time: End time in source audio (seconds)
        confidence: Quality/reliability metric (0.0-1.0)
    """

    speaker_id: str
    embedding: np.ndarray
    source_audio: str
    start_time: float
    end_time: float
    confidence: float = 1.0

    def __post_init__(self):
        """Reject empty embeddings, bad time ranges, and too-short clips."""
        if self.embedding is None or len(self.embedding) == 0:
            raise ValueError("Embedding vector cannot be empty")

        if self.end_time <= self.start_time:
            raise ValueError(
                f"End time ({self.end_time}) must be after start time ({self.start_time})"
            )

        if not 0.0 <= self.confidence <= 1.0:
            raise ValueError(f"Confidence must be between 0.0 and 1.0, got {self.confidence}")

        # Very short clips produce unreliable embeddings, so refuse them.
        span = self.end_time - self.start_time
        if span < 3.0:
            raise ValueError(f"Duration ({span}s) is too short (minimum 3 seconds recommended)")

    @property
    def duration(self) -> float:
        """Length of the source span in seconds."""
        return self.end_time - self.start_time

    def __repr__(self) -> str:
        return (
            f"SpeakerProfile(speaker_id='{self.speaker_id}', "
            f"duration={self.duration:.2f}s, "
            f"confidence={self.confidence:.2f})"
        )
|
src/models/voice_profile.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VoiceProfile data model: Reference embedding and speaker identification.
|
| 3 |
+
|
| 4 |
+
Represents a target voice profile extracted from reference audio clip.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from dataclasses import dataclass, field
|
| 8 |
+
from typing import Optional
|
| 9 |
+
|
| 10 |
+
import numpy as np
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass
class VoiceProfile:
    """
    Voice profile representing a target speaker.

    Contains an embedding vector plus metadata for voice identification,
    with helpers for similarity matching and .npz persistence.

    Attributes:
        speaker_id: Unique speaker identifier
        embedding: 1-D embedding vector (512-dim from pyannote per the field comment)
        reference_file: Path of the reference clip the embedding came from
        reference_duration: Reference clip length in seconds
        embedding_quality: Confidence score in [0, 1]
        num_speech_segments: Number of speech segments used for the embedding
        sample_rate: Audio sample rate in Hz (default 16000)
        created_at: Optional ISO-format creation timestamp
    """

    # Core identification
    speaker_id: str
    embedding: np.ndarray  # 512-dimensional vector from pyannote

    # Source information
    reference_file: str
    reference_duration: float  # seconds

    # Quality metrics
    embedding_quality: float = 1.0  # 0-1 score indicating embedding confidence
    num_speech_segments: int = 0

    # Metadata
    sample_rate: int = 16000
    created_at: Optional[str] = None

    def __post_init__(self):
        """Validate voice profile after initialization.

        Raises:
            ValueError: If the embedding is not 1-D or quality is out of range.
        """
        if self.embedding.ndim != 1:
            raise ValueError("Embedding must be 1-dimensional vector")

        if self.embedding_quality < 0 or self.embedding_quality > 1:
            raise ValueError("Embedding quality must be between 0 and 1")

    def similarity(self, other_embedding: np.ndarray) -> float:
        """
        Calculate cosine similarity with another embedding.

        Args:
            other_embedding: Another voice embedding vector

        Returns:
            Similarity score (0-1, higher = more similar)
        """
        # Lazy import keeps scipy optional until similarity is actually needed.
        from scipy.spatial.distance import cosine

        # Cosine similarity = 1 - cosine distance; clamp to [0, 1] since
        # raw cosine similarity ranges over [-1, 1].
        similarity = 1 - cosine(self.embedding, other_embedding)
        return max(0.0, min(1.0, similarity))

    def matches(self, other_embedding: np.ndarray, threshold: float = 0.7) -> bool:
        """
        Check if another embedding matches this voice profile.

        Args:
            other_embedding: Voice embedding to compare
            threshold: Similarity threshold for match (default: 0.7)

        Returns:
            True if embeddings match above threshold
        """
        return self.similarity(other_embedding) >= threshold

    def to_dict(self) -> dict:
        """
        Convert voice profile to dictionary (embedding as a plain list).

        Returns:
            Dictionary representation
        """
        return {
            "speaker_id": self.speaker_id,
            "embedding": self.embedding.tolist(),
            "reference_file": self.reference_file,
            "reference_duration": self.reference_duration,
            "embedding_quality": self.embedding_quality,
            "num_speech_segments": self.num_speech_segments,
            "sample_rate": self.sample_rate,
            "created_at": self.created_at,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "VoiceProfile":
        """
        Create voice profile from dictionary.

        Args:
            data: Dictionary representation (as produced by to_dict)

        Returns:
            VoiceProfile instance
        """
        data = data.copy()
        data["embedding"] = np.array(data["embedding"])
        return cls(**data)

    def save(self, file_path: str):
        """
        Save voice profile to file.

        Args:
            file_path: Output file path (.npz format; np.savez appends the
                .npz suffix if not already present)
        """
        # NOTE: previously re-imported numpy here; the module-level import
        # already provides np, so the redundant local import was removed.
        np.savez(
            file_path,
            speaker_id=self.speaker_id,
            embedding=self.embedding,
            reference_file=self.reference_file,
            reference_duration=self.reference_duration,
            embedding_quality=self.embedding_quality,
            num_speech_segments=self.num_speech_segments,
            sample_rate=self.sample_rate,
            # np.savez cannot store None; empty string round-trips to None in load().
            created_at=self.created_at or "",
        )

    @classmethod
    def load(cls, file_path: str) -> "VoiceProfile":
        """
        Load voice profile from file.

        Args:
            file_path: Input file path (.npz format)

        Returns:
            VoiceProfile instance
        """
        # allow_pickle=True is needed for the stored string fields; only load
        # files produced by save() — pickled archives from untrusted sources
        # are unsafe.
        data = np.load(file_path, allow_pickle=True)

        return cls(
            speaker_id=str(data["speaker_id"]),
            embedding=data["embedding"],
            reference_file=str(data["reference_file"]),
            reference_duration=float(data["reference_duration"]),
            embedding_quality=float(data["embedding_quality"]),
            num_speech_segments=int(data["num_speech_segments"]),
            sample_rate=int(data["sample_rate"]),
            created_at=str(data["created_at"]) if data["created_at"] else None,
        )
|