File size: 4,569 Bytes
0456b70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
"""
VoiceProfile data model: Reference embedding and speaker identification.

Represents a target voice profile extracted from a reference audio clip.
"""

from dataclasses import dataclass, field
from typing import Optional

import numpy as np


@dataclass
class VoiceProfile:
    """
    Voice profile representing a target speaker.

    Holds a 1-D embedding vector plus metadata about the reference audio it
    was derived from, and offers cosine-similarity matching as well as
    dict / ``.npz`` (de)serialization.

    Raises:
        ValueError: On construction, if the embedding is not 1-dimensional or
            ``embedding_quality`` is outside [0, 1] (NaN is rejected too).
    """

    # Core identification
    speaker_id: str
    # 1-D vector (original note: 512-dimensional, from pyannote); the size is
    # not enforced here, only the dimensionality.
    embedding: np.ndarray

    # Source information
    reference_file: str
    reference_duration: float  # seconds

    # Quality metrics
    embedding_quality: float = 1.0  # 0-1 score indicating embedding confidence
    num_speech_segments: int = 0

    # Metadata
    sample_rate: int = 16000
    created_at: Optional[str] = None

    def __post_init__(self):
        """Validate voice profile after initialization."""
        # Accept plain sequences as well; np.asarray returns an existing
        # ndarray unchanged (no copy), so array callers see no difference.
        self.embedding = np.asarray(self.embedding)
        if self.embedding.ndim != 1:
            raise ValueError("Embedding must be 1-dimensional vector")

        # Chained comparison so that NaN (for which every comparison is False)
        # is rejected instead of slipping through a `< 0 or > 1` test.
        if not (0 <= self.embedding_quality <= 1):
            raise ValueError("Embedding quality must be between 0 and 1")

    def similarity(self, other_embedding: np.ndarray) -> float:
        """
        Calculate cosine similarity with another embedding.

        Args:
            other_embedding: Another voice embedding vector. Singleton
                dimensions are squeezed away before comparison.

        Returns:
            Similarity score clamped to [0, 1] (higher = more similar).
            Returns 0.0 when the cosine is undefined, i.e. when either vector
            has zero norm or contains non-finite values.

        Raises:
            ValueError: If the two embeddings do not have the same length.
        """
        reference = np.asarray(self.embedding, dtype=np.float64)
        candidate = np.atleast_1d(
            np.squeeze(np.asarray(other_embedding, dtype=np.float64))
        )
        if candidate.shape != reference.shape:
            raise ValueError(
                "Embeddings must have the same shape: "
                f"{reference.shape} vs {candidate.shape}"
            )

        norm_product = float(np.linalg.norm(reference) * np.linalg.norm(candidate))
        # A zero or non-finite norm makes the cosine undefined (NaN). Clamping
        # NaN with min()/max() would yield 1.0, i.e. a spurious perfect match,
        # so treat the undefined case explicitly as "no similarity".
        if norm_product == 0.0 or not np.isfinite(norm_product):
            return 0.0

        score = float(np.dot(reference, candidate)) / norm_product
        if not np.isfinite(score):
            return 0.0
        return max(0.0, min(1.0, score))  # Clamp to [0, 1]

    def matches(self, other_embedding: np.ndarray, threshold: float = 0.7) -> bool:
        """
        Check if another embedding matches this voice profile.

        Args:
            other_embedding: Voice embedding to compare
            threshold: Similarity threshold for match (default: 0.7)

        Returns:
            True if the similarity is at or above the threshold
        """
        return self.similarity(other_embedding) >= threshold

    def to_dict(self) -> dict:
        """
        Convert voice profile to dictionary.

        Returns:
            Dictionary representation; the embedding is converted to a plain
            list via ``ndarray.tolist()``.
        """
        return {
            "speaker_id": self.speaker_id,
            "embedding": self.embedding.tolist(),
            "reference_file": self.reference_file,
            "reference_duration": self.reference_duration,
            "embedding_quality": self.embedding_quality,
            "num_speech_segments": self.num_speech_segments,
            "sample_rate": self.sample_rate,
            "created_at": self.created_at,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "VoiceProfile":
        """
        Create voice profile from dictionary.

        Args:
            data: Dictionary representation, as produced by ``to_dict``.
                The input dictionary is not modified.

        Returns:
            VoiceProfile instance
        """
        data = data.copy()  # shallow copy so the caller's dict keeps its list
        data["embedding"] = np.array(data["embedding"])
        return cls(**data)

    def save(self, file_path: str) -> None:
        """
        Save voice profile to file.

        Args:
            file_path: Output file path (.npz format). ``np.savez`` appends
                the ``.npz`` suffix when the path does not already end in it.
        """
        np.savez(
            file_path,
            speaker_id=self.speaker_id,
            embedding=self.embedding,
            reference_file=self.reference_file,
            reference_duration=self.reference_duration,
            embedding_quality=self.embedding_quality,
            num_speech_segments=self.num_speech_segments,
            sample_rate=self.sample_rate,
            # A missing timestamp is stored as "" and mapped back to None on load.
            created_at=self.created_at or "",
        )

    @classmethod
    def load(cls, file_path: str) -> "VoiceProfile":
        """
        Load voice profile from file.

        Args:
            file_path: Input file path (.npz format)

        Returns:
            VoiceProfile instance
        """
        # SECURITY NOTE(review): allow_pickle=True lets a crafted .npz execute
        # arbitrary code while loading. It is kept for backward compatibility
        # with existing files; only load profiles from trusted sources, and
        # consider allow_pickle=False once no legacy object arrays remain.
        #
        # The context manager closes the underlying archive handle, which
        # np.load otherwise leaves open until garbage collection.
        with np.load(file_path, allow_pickle=True) as data:
            # Unwrap the 0-d array to a Python object before testing it, so
            # the None-vs-timestamp decision does not depend on NumPy's
            # truthiness rules for string arrays.
            raw_created_at = data["created_at"].item()

            return cls(
                speaker_id=str(data["speaker_id"]),
                embedding=data["embedding"],
                reference_file=str(data["reference_file"]),
                reference_duration=float(data["reference_duration"]),
                embedding_quality=float(data["embedding_quality"]),
                num_speech_segments=int(data["num_speech_segments"]),
                sample_rate=int(data["sample_rate"]),
                created_at=str(raw_created_at) if raw_created_at else None,
            )