Training in progress - step 1000

Browse files

Files changed (7) hide show

alignment.py +56 -40
asr_config.py +2 -0
asr_modeling.py +3 -3
chat_template.jinja +94 -89
diarization.py +35 -8
tokenizer.json +2 -2
tokenizer_config.json +0 -0

alignment.py CHANGED Viewed

@@ -3,6 +3,11 @@
 import numpy as np
 import torch
 def _get_device() -> str:
     """Get best available device for non-transformers models."""
@@ -65,6 +70,11 @@ class ForcedAligner:
         trellis = torch.full((num_frames + 1, num_tokens + 1), -float("inf"))
         trellis[0, 0] = 0
         for t in range(num_frames):
             for j in range(num_tokens + 1):
                 # Stay: emit blank and stay at j tokens
@@ -80,7 +90,7 @@ class ForcedAligner:
     @staticmethod
     def _backtrack(
         trellis: torch.Tensor, emission: torch.Tensor, tokens: list[int], blank_id: int = 0
-    ) -> list[tuple[int, float, float]]:
         """Backtrack through trellis to find optimal forced monotonic alignment.
         Guarantees:
@@ -88,7 +98,8 @@ class ForcedAligner:
         - Strictly monotonic: each token's frames come after previous token's
         - No frame skipping or token teleporting
-        Returns list of (token_id, start_frame, end_frame) for each token.
         """
         num_frames = emission.size(0)
         num_tokens = len(tokens)
@@ -102,13 +113,18 @@ class ForcedAligner:
             # Alignment failed - fall back to uniform distribution
             frames_per_token = num_frames / num_tokens
             return [
-                (tokens[i], i * frames_per_token, (i + 1) * frames_per_token)
                 for i in range(num_tokens)
             ]
         # Backtrack: find where each token transition occurred
-        # path[i] = frame where token i was first emitted
-        token_frames: list[list[int]] = [[] for _ in range(num_tokens)]
         t = num_frames
         j = num_tokens
@@ -120,38 +136,40 @@ class ForcedAligner:
             if move_score >= stay_score:
                 # Token j-1 was emitted at frame t-1
-                token_frames[j - 1].insert(0, t - 1)
                 j -= 1
             # Always decrement time (monotonic)
             t -= 1
         # Handle any remaining tokens at the start (edge case)
         while j > 0:
-            token_frames[j - 1].insert(0, 0)
             j -= 1
-        # Convert to spans
-        token_spans: list[tuple[int, float, float]] = []
-        for token_idx, frames in enumerate(token_frames):
-            if not frames:
                 # Token never emitted - assign minimal span after previous
                 if token_spans:
                     prev_end = token_spans[-1][2]
-                    frames = [int(prev_end)]
                 else:
-                    frames = [0]
             token_id = tokens[token_idx]
             start_frame = float(min(frames))
             end_frame = float(max(frames)) + 1.0
-            token_spans.append((token_id, start_frame, end_frame))
-        return token_spans
-    # Offset compensation for Wav2Vec2-BASE systematic bias (in seconds)
-    # Calibrated on librispeech-alignments dataset
-    START_OFFSET = 0.06  # Subtract from start times (shift earlier)
-    END_OFFSET = -0.03  # Add to end times (shift later)
     @classmethod
     def align(
@@ -229,26 +247,28 @@ class ForcedAligner:
         frame_duration = 320 / cls._bundle.sample_rate
         # Apply separate offset compensation for start/end (Wav2Vec2 systematic bias)
-        start_offset = cls.START_OFFSET
-        end_offset = cls.END_OFFSET
         # Group aligned tokens into words based on pipe separator
         words = text.split()
         word_timestamps = []
-        current_word_start = None
-        current_word_end = None
         word_idx = 0
         separator_id = dictionary.get("|", dictionary.get(" ", 0))
-        for token_id, start_frame, end_frame in alignment_path:
             if token_id == separator_id:  # Word separator
                 if (
-                    current_word_start is not None
-                    and current_word_end is not None
                     and word_idx < len(words)
                 ):
-                    start_time = max(0.0, current_word_start * frame_duration - start_offset)
-                    end_time = max(0.0, current_word_end * frame_duration - end_offset)
                     word_timestamps.append(
                         {
                             "word": words[word_idx],
@@ -257,21 +277,17 @@ class ForcedAligner:
                         }
                     )
                     word_idx += 1
-                current_word_start = None
-                current_word_end = None
             else:
-                if current_word_start is None:
-                    current_word_start = start_frame
-                current_word_end = end_frame
         # Don't forget the last word
-        if (
-            current_word_start is not None
-            and current_word_end is not None
-            and word_idx < len(words)
-        ):
-            start_time = max(0.0, current_word_start * frame_duration - start_offset)
-            end_time = max(0.0, current_word_end * frame_duration - end_offset)
             word_timestamps.append(
                 {
                     "word": words[word_idx],

 import numpy as np
 import torch
+# Offset compensation for Wav2Vec2-BASE systematic bias (in seconds)
+# Calibrated on librispeech-alignments dataset (n=25, MAE=48ms)
+START_OFFSET = 0.04  # Subtract from start times (shift earlier)
+END_OFFSET = -0.04  # Subtracted from end times; the value is negative, so ends shift later
 def _get_device() -> str:
     """Get best available device for non-transformers models."""
         trellis = torch.full((num_frames + 1, num_tokens + 1), -float("inf"))
         trellis[0, 0] = 0
+        # Force alignment to use all tokens by preventing staying in blank
+        # at the end when there are still tokens to emit
+        if num_tokens > 1:
+            trellis[-num_tokens + 1 :, 0] = float("inf")
         for t in range(num_frames):
             for j in range(num_tokens + 1):
                 # Stay: emit blank and stay at j tokens
     @staticmethod
     def _backtrack(
         trellis: torch.Tensor, emission: torch.Tensor, tokens: list[int], blank_id: int = 0
+    ) -> list[tuple[int, float, float, float]]:
         """Backtrack through trellis to find optimal forced monotonic alignment.
         Guarantees:
         - Strictly monotonic: each token's frames come after previous token's
         - No frame skipping or token teleporting
+        Returns list of (token_id, start_frame, end_frame, peak_frame) for each token.
+        The peak_frame is the frame with highest emission probability for that token.
         """
         num_frames = emission.size(0)
         num_tokens = len(tokens)
             # Alignment failed - fall back to uniform distribution
             frames_per_token = num_frames / num_tokens
             return [
+                (
+                    tokens[i],
+                    i * frames_per_token,
+                    (i + 1) * frames_per_token,
+                    (i + 0.5) * frames_per_token,
+                )
                 for i in range(num_tokens)
             ]
         # Backtrack: find where each token transition occurred
+        # Store (frame, emission_score) for each token
+        token_frames: list[list[tuple[int, float]]] = [[] for _ in range(num_tokens)]
         t = num_frames
         j = num_tokens
             if move_score >= stay_score:
                 # Token j-1 was emitted at frame t-1
+                # Store frame and its emission probability
+                emit_prob = emission[t - 1, tokens[j - 1]].exp().item()
+                token_frames[j - 1].insert(0, (t - 1, emit_prob))
                 j -= 1
             # Always decrement time (monotonic)
             t -= 1
         # Handle any remaining tokens at the start (edge case)
         while j > 0:
+            token_frames[j - 1].insert(0, (0, 0.0))
             j -= 1
+        # Convert to spans with peak frame
+        token_spans: list[tuple[int, float, float, float]] = []
+        for token_idx, frames_with_scores in enumerate(token_frames):
+            if not frames_with_scores:
                 # Token never emitted - assign minimal span after previous
                 if token_spans:
                     prev_end = token_spans[-1][2]
+                    frames_with_scores = [(int(prev_end), 0.0)]
                 else:
+                    frames_with_scores = [(0, 0.0)]
             token_id = tokens[token_idx]
+            frames = [f for f, _ in frames_with_scores]
             start_frame = float(min(frames))
             end_frame = float(max(frames)) + 1.0
+            # Find peak frame (highest emission probability)
+            peak_frame, _ = max(frames_with_scores, key=lambda x: x[1])
+            token_spans.append((token_id, start_frame, end_frame, float(peak_frame)))
+        return token_spans
     @classmethod
     def align(
         frame_duration = 320 / cls._bundle.sample_rate
         # Apply separate offset compensation for start/end (Wav2Vec2 systematic bias)
+        start_offset = START_OFFSET
+        end_offset = END_OFFSET
         # Group aligned tokens into words based on pipe separator
+        # Use peak emission frame for more accurate word boundaries
         words = text.split()
         word_timestamps = []
+        first_char_peak = None
+        last_char_peak = None
         word_idx = 0
         separator_id = dictionary.get("|", dictionary.get(" ", 0))
+        for token_id, _start_frame, _end_frame, peak_frame in alignment_path:
             if token_id == separator_id:  # Word separator
                 if (
+                    first_char_peak is not None
+                    and last_char_peak is not None
                     and word_idx < len(words)
                 ):
+                    # Use peak frames for word boundaries
+                    start_time = max(0.0, first_char_peak * frame_duration - start_offset)
+                    end_time = max(0.0, (last_char_peak + 1) * frame_duration - end_offset)
                     word_timestamps.append(
                         {
                             "word": words[word_idx],
                         }
                     )
                     word_idx += 1
+                first_char_peak = None
+                last_char_peak = None
             else:
+                if first_char_peak is None:
+                    first_char_peak = peak_frame
+                last_char_peak = peak_frame
         # Don't forget the last word
+        if first_char_peak is not None and last_char_peak is not None and word_idx < len(words):
+            start_time = max(0.0, first_char_peak * frame_duration - start_offset)
+            end_time = max(0.0, (last_char_peak + 1) * frame_duration - end_offset)
             word_timestamps.append(
                 {
                     "word": words[word_idx],

asr_config.py CHANGED Viewed

@@ -64,6 +64,7 @@ class ASRConfig(transformers.PretrainedConfig):
         lora_target_modules: Optional[list] = None,  # Default: all linear layers
         freeze_projector: bool = False,  # True for Stage 2 (LoRA-only training)
         do_sample: bool = False,
         temperature: Optional[float] = None,
         top_p: Optional[float] = None,
         top_k: Optional[int] = None,
@@ -174,6 +175,7 @@ class ASRConfig(transformers.PretrainedConfig):
         )
         self.use_cache = use_cache if use_cache is not None else generation_defaults["use_cache"]
         self.do_sample = do_sample
         self.temperature = temperature
         self.top_p = top_p
         self.top_k = top_k

         lora_target_modules: Optional[list] = None,  # Default: all linear layers
         freeze_projector: bool = False,  # True for Stage 2 (LoRA-only training)
         do_sample: bool = False,
+        enable_thinking: bool = False,  # Enable Qwen3 thinking mode for omni models
         temperature: Optional[float] = None,
         top_p: Optional[float] = None,
         top_k: Optional[int] = None,
         )
         self.use_cache = use_cache if use_cache is not None else generation_defaults["use_cache"]
         self.do_sample = do_sample
+        self.enable_thinking = enable_thinking
         self.temperature = temperature
         self.top_p = top_p
         self.top_k = top_k

asr_modeling.py CHANGED Viewed

@@ -582,7 +582,7 @@ class ASRModel(PreTrainedModel, GenerationMixin):
                 tokenize=True,
                 add_generation_prompt=True,
                 return_tensors="pt",
-                enable_thinking=False,  # Disable Qwen3 thinking mode for ASR
             )
             input_ids = chat_result.input_ids.to(device)
@@ -665,7 +665,7 @@ class ASRModel(PreTrainedModel, GenerationMixin):
             tokenize=True,
             add_generation_prompt=True,
             return_tensors="pt",
-            enable_thinking=False,  # Disable Qwen3 thinking mode for ASR
         )
         input_ids = chat_result.input_ids.to(device)
@@ -764,7 +764,7 @@ class ASRModel(PreTrainedModel, GenerationMixin):
             tokenize=True,
             add_generation_prompt=True,
             return_tensors="pt",
-            enable_thinking=False,  # Disable Qwen3 thinking mode for ASR
         ).to(device)
         if input_ids.dim() == 1:

                 tokenize=True,
                 add_generation_prompt=True,
                 return_tensors="pt",
+                enable_thinking=getattr(self.config, "enable_thinking", False),
             )
             input_ids = chat_result.input_ids.to(device)
             tokenize=True,
             add_generation_prompt=True,
             return_tensors="pt",
+            enable_thinking=getattr(self.config, "enable_thinking", False),
         )
         input_ids = chat_result.input_ids.to(device)
             tokenize=True,
             add_generation_prompt=True,
             return_tensors="pt",
+            enable_thinking=getattr(self.config, "enable_thinking", False),
         ).to(device)
         if input_ids.dim() == 1:

chat_template.jinja CHANGED Viewed

@@ -1,89 +1,94 @@
-{%- if tools %}
-    {{- '<|im_start|>system\n' }}
-    {%- if messages[0].role == 'system' %}
-        {{- messages[0].content + '\n\n' }}
-    {%- endif %}
-    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
-    {%- for tool in tools %}
-        {{- "\n" }}
-        {{- tool | tojson }}
-    {%- endfor %}
-    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
-{%- else %}
-    {%- if messages[0].role == 'system' %}
-        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
-    {%- endif %}
-{%- endif %}
-{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
-{%- for message in messages[::-1] %}
-    {%- set index = (messages|length - 1) - loop.index0 %}
-    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
-        {%- set ns.multi_step_tool = false %}
-        {%- set ns.last_query_index = index %}
-    {%- endif %}
-{%- endfor %}
-{%- for message in messages %}
-    {%- if message.content is string %}
-        {%- set content = message.content %}
-    {%- else %}
-        {%- set content = '' %}
-    {%- endif %}
-    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
-        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
-    {%- elif message.role == "assistant" %}
-        {%- set reasoning_content = '' %}
-        {%- if message.reasoning_content is string %}
-            {%- set reasoning_content = message.reasoning_content %}
-        {%- else %}
-            {%- if '</think>' in content %}
-                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
-                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
-            {%- endif %}
-        {%- endif %}
-        {%- if loop.index0 > ns.last_query_index %}
-            {%- if loop.last or (not loop.last and reasoning_content) %}
-                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
-            {%- else %}
-                {{- '<|im_start|>' + message.role + '\n' + content }}
-            {%- endif %}
-        {%- else %}
-            {{- '<|im_start|>' + message.role + '\n' + content }}
-        {%- endif %}
-        {%- if message.tool_calls %}
-            {%- for tool_call in message.tool_calls %}
-                {%- if (loop.first and content) or (not loop.first) %}
-                    {{- '\n' }}
-                {%- endif %}
-                {%- if tool_call.function %}
-                    {%- set tool_call = tool_call.function %}
-                {%- endif %}
-                {{- '<tool_call>\n{"name": "' }}
-                {{- tool_call.name }}
-                {{- '", "arguments": ' }}
-                {%- if tool_call.arguments is string %}
-                    {{- tool_call.arguments }}
-                {%- else %}
-                    {{- tool_call.arguments | tojson }}
-                {%- endif %}
-                {{- '}\n</tool_call>' }}
-            {%- endfor %}
-        {%- endif %}
-        {{- '<|im_end|>\n' }}
-    {%- elif message.role == "tool" %}
-        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
-            {{- '<|im_start|>user' }}
-        {%- endif %}
-        {{- '\n<tool_response>\n' }}
-        {{- content }}
-        {{- '\n</tool_response>' }}
-        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
-            {{- '<|im_end|>\n' }}
-        {%- endif %}
-    {%- endif %}
-{%- endfor %}
-{%- if add_generation_prompt %}
-    {{- '<|im_start|>assistant\n' }}
-    {%- if true %}
-        {{- '<think>\n\n</think>\n\n' }}
-    {%- endif %}
-{%- endif %}

+{# ───── defaults ───── #}
+{%- if enable_thinking is not defined -%}
+{%- set enable_thinking = true -%}
+{%- endif -%}
+{# ───── reasoning mode ───── #}
+{%- if enable_thinking -%}
+  {%- set reasoning_mode = "/think" -%}
+{%- else -%}
+  {%- set reasoning_mode = "/no_think" -%}
+{%- endif -%}
+{# ───── header (system message) ───── #}
+{{- "<|im_start|>system\n" -}}
+{%- if messages[0].role == "system" -%}
+  {%- set system_message = messages[0].content -%}
+  {%- if "/no_think" in system_message -%}
+    {%- set reasoning_mode = "/no_think" -%}
+  {%- elif "/think" in system_message -%}
+    {%- set reasoning_mode = "/think" -%}
+  {%- endif -%}
+  {%- set custom_instructions = system_message.replace("/no_think", "").replace("/think", "").rstrip() -%}
+{%- endif -%}
+{%- if "/system_override" in system_message -%}
+  {{- custom_instructions.replace("/system_override", "").rstrip() -}}
+  {{- "<|im_end|>\n" -}}
+{%- else -%}
+  {{- "## Metadata\n\n" -}}
+  {{- "Knowledge Cutoff Date: June 2025\n" -}}
+  {%- set today = strftime_now("%d %B %Y") -%}
+  {{- "Today Date: " ~ today ~ "\n" -}}
+  {{- "Reasoning Mode: " + reasoning_mode + "\n\n" -}}
+  {{- "## Custom Instructions\n\n" -}}
+  {%- if custom_instructions -%}
+    {{- custom_instructions + "\n\n" -}}
+  {%- elif reasoning_mode == "/think" -%}
+    {{- "You are a helpful AI assistant named SmolLM, trained by Hugging Face. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracking, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: <think> Thought section </think> Solution section. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion.\n\n" -}}
+  {%- else -%}
+    {{- "You are a helpful AI assistant named SmolLM, trained by Hugging Face.\n\n" -}}
+  {%- endif -%}
+  {%- if xml_tools or python_tools or tools -%}
+    {{- "### Tools\n\n" -}}
+    {%- if xml_tools or tools -%}
+      {%- if tools -%}
+        {%- set xml_tools = tools -%}
+      {%- endif -%}
+      {%- set ns = namespace(xml_tool_string="You may call one or more functions to assist with the user query.\nYou are provided with function signatures within <tools></tools> XML tags:\n\n<tools>\n") -%}
+      {%- for tool in xml_tools[:] -%} {# The slicing makes sure that xml_tools is a list #}
+        {%- set ns.xml_tool_string = ns.xml_tool_string ~ (tool | string) ~ "\n" -%}
+      {%- endfor -%}
+      {%- set xml_tool_string = ns.xml_tool_string + "</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>" -%}
+      {{- xml_tool_string -}}
+    {%- endif -%}
+    {%- if python_tools -%}
+      {%- set ns = namespace(python_tool_string="When you send a message containing Python code between '<code>' and '</code>' tags, it will be executed in a stateful Jupyter notebook environment, and you will then be given the output to continued reasoning in an agentic loop.\n\nYou can use the following tools in your python code like regular functions:\n<tools>\n") -%}
+      {%- for tool in python_tools[:] -%} {# The slicing makes sure that python_tools is a list #}
+        {%- set ns.python_tool_string = ns.python_tool_string ~ (tool | string) ~ "\n" -%}
+      {%- endfor -%}
+      {%- set python_tool_string = ns.python_tool_string + "</tools>\n\nThe state persists between code executions: so variables that you define in one step are still available thereafter." -%}
+      {{- python_tool_string -}}
+    {%- endif -%}
+    {{- "\n\n" -}}
+    {{- "<|im_end|>\n" -}}
+  {%- endif -%}
+{%- endif -%}
+{# ───── main loop ───── #}
+{%- for message in messages -%}
+    {%- set content = message.content if message.content is string else "" -%}
+    {%- if message.role == "user" -%}
+        {{ "<|im_start|>" + message.role + "\n"  + content + "<|im_end|>\n" }}
+    {%- elif message.role == "assistant" -%}
+        {% generation %}
+        {%- if reasoning_mode == "/think" -%}
+            {{ "<|im_start|>assistant\n" + content.lstrip("\n") + "<|im_end|>\n" }}
+        {%- else -%}
+            {{ "<|im_start|>assistant\n" + "<think>\n\n</think>\n" + content.lstrip("\n") + "<|im_end|>\n" }}
+        {%- endif -%}
+        {% endgeneration %}
+    {%- elif message.role == "tool" -%}
+    {{ "<|im_start|>" + "user\n"  + content + "<|im_end|>\n" }}
+    {%- endif -%}
+{%- endfor -%}
+{# ───── generation prompt ───── #}
+{%- if add_generation_prompt -%}
+    {%- if reasoning_mode == "/think" -%}
+        {{ "<|im_start|>assistant\n" }}
+    {%- else -%}
+        {{ "<|im_start|>assistant\n" + "<think>\n\n</think>\n"  }}
+    {%- endif -%}
+{%- endif -%}

diarization.py CHANGED Viewed

@@ -91,20 +91,47 @@ class SpectralCluster:
     def get_spec_embs(
         self, laplacian: np.ndarray, k_oracle: int | None = None
     ) -> tuple[np.ndarray, int]:
-        """Extract spectral embeddings from Laplacian."""
         lambdas, eig_vecs = scipy.linalg.eigh(laplacian)
-        if k_oracle is not None:
-            num_of_spk = k_oracle
-        else:
-            lambda_gap_list = self.get_eigen_gaps(
-                lambdas[self.min_num_spks - 1 : self.max_num_spks + 1]
-            )
-            num_of_spk = np.argmax(lambda_gap_list) + self.min_num_spks
         emb = eig_vecs[:, :num_of_spk]
         return emb, num_of_spk
     def cluster_embs(self, emb: np.ndarray, k: int) -> np.ndarray:
         """Cluster spectral embeddings using k-means."""
         _, labels, _ = k_means(emb, k, n_init=10)

     def get_spec_embs(
         self, laplacian: np.ndarray, k_oracle: int | None = None
     ) -> tuple[np.ndarray, int]:
+        """Extract spectral embeddings from Laplacian.
+        When k_oracle is None, uses the eigengap heuristic to estimate the number of clusters:
+        The number of clusters k is chosen where the gap between consecutive
+        eigenvalues is largest, indicating a transition from "cluster" eigenvalues
+        (near 0) to "noise" eigenvalues.
+        """
         lambdas, eig_vecs = scipy.linalg.eigh(laplacian)
+        num_of_spk = k_oracle if k_oracle is not None else self._estimate_num_speakers(lambdas)
         emb = eig_vecs[:, :num_of_spk]
         return emb, num_of_spk
+    def _estimate_num_speakers(self, lambdas: np.ndarray) -> int:
+        """Estimate number of speakers using refined eigengap heuristic.
+        For spectral clustering, we look for the largest gap in eigenvalues.
+        The eigenvalues corresponding to clusters are close to 0, and there
+        should be a significant jump to the remaining eigenvalues.
+        """
+        # Consider eigenvalues at indices 1..max_num_spks inclusive (skip the first, it's always ~0).
+        # The slice end is exclusive, so max_idx is max_num_spks + 1, capped at len(lambdas).
+        max_idx = min(self.max_num_spks + 1, len(lambdas))
+        relevant_lambdas = lambdas[1:max_idx]  # Skip first eigenvalue
+        if len(relevant_lambdas) < 2:
+            return self.min_num_spks
+        # Compute absolute gaps (not ratios - ratios are unstable near 0)
+        gaps = np.diff(relevant_lambdas)
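+        # Find the largest gap. gaps[i] = relevant_lambdas[i + 1] - relevant_lambdas[i]
+        # = lambdas[i + 2] - lambdas[i + 1], so gap index i corresponds to k = i + 2 speakers:
+        # add 1 to convert from gap index to count, and 1 more for the skipped first eigenvalue.
+        # Consequently the unclamped estimate is never below 2.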
+        max_gap_idx = int(np.argmax(gaps))
+        num_of_spk = max_gap_idx + 2  # +1 for gap->count, +1 for skipped eigenvalue
+        # Clamp between min and max
+        return max(self.min_num_spks, min(num_of_spk, self.max_num_spks))
     def cluster_embs(self, emb: np.ndarray, k: int) -> np.ndarray:
         """Cluster spectral embeddings using k-means."""
         _, labels, _ = k_means(emb, k, n_init=10)

tokenizer.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:33b674fb8444e2553eae8f1b261093371920a28ef75b5c18f4deb3f9217ed0ba
-size 11422834

 version https://git-lfs.github.com/spec/v1
+oid sha256:d4aeaf198f783cbf58d8cd59812baac429ffe49147bf9648f6618de20b8d4a4c
+size 17209003

tokenizer_config.json CHANGED Viewed

Binary files a/tokenizer_config.json and b/tokenizer_config.json differ