Update model weights after training (epoch 7, loss 4.6721)

Browse files

Files changed (7) hide show

cross_attention.safetensors +1 -1
llm.safetensors +2 -2
model.safetensors.index.json +4 -1
modeling_xoron.py +408 -104
streaming_state.json +14 -8
trainer_state.json +3 -3
training_state.pt +2 -2

cross_attention.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:15e348f1b98e8cc48f633f80a818a98a727f8a95e3794d3d7496c7c67d319c21
 size 174191400

 version https://git-lfs.github.com/spec/v1
+oid sha256:8cde5e1fb540a32b44b78415f2dcf2f037489d8b119a0f33b68a49811e6b8b50
 size 174191400

llm.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3888d6f2029add98a6540daf90a2fffaf8b2c0420fca1b401042a37ae56f957f
-size 1506832040

 version https://git-lfs.github.com/spec/v1
+oid sha256:3b5ffb3061f8b427f852d024bd147adf53062939c8d92babd677d4ceacf953a8
+size 1506836434

model.safetensors.index.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "metadata": {
-    "total_size": 7309254542,
     "format": "components"
   },
   "weight_map": {
@@ -696,6 +696,9 @@
     "llm.model.layers.11.mlp.shared_expert.down_proj.lora_B": "llm.safetensors",
     "llm.model.layers.11.mlp.shared_expert.down_proj.linear.weight": "llm.safetensors",
     "llm.model.norm.weight": "llm.safetensors",
     "llm.lm_head.weight": "llm.safetensors",
     "vision_encoder.vision_model.vision_model.embeddings.patch_embedding.weight": "vision_encoder.safetensors",
     "vision_encoder.vision_model.vision_model.embeddings.patch_embedding.bias": "vision_encoder.safetensors",

 {
   "metadata": {
+    "total_size": 7309258640,
     "format": "components"
   },
   "weight_map": {
     "llm.model.layers.11.mlp.shared_expert.down_proj.lora_B": "llm.safetensors",
     "llm.model.layers.11.mlp.shared_expert.down_proj.linear.weight": "llm.safetensors",
     "llm.model.norm.weight": "llm.safetensors",
+    "llm.model.thought_gate.weight": "llm.safetensors",
+    "llm.model.thought_gate.bias": "llm.safetensors",
+    "llm.model.thought_layernorm.weight": "llm.safetensors",
     "llm.lm_head.weight": "llm.safetensors",
     "vision_encoder.vision_model.vision_model.embeddings.patch_embedding.weight": "vision_encoder.safetensors",
     "vision_encoder.vision_model.vision_model.embeddings.patch_embedding.bias": "vision_encoder.safetensors",

modeling_xoron.py CHANGED Viewed

@@ -9122,6 +9122,13 @@ class MoELlamaModel (nn .Module ):
         self .num_moe_layers =sum (1 for layer in self .layers if layer .is_moe_layer )
         self ._init_weights ()
     def _init_weights (self ):
@@ -9147,6 +9154,7 @@ class MoELlamaModel (nn .Module ):
     output_hidden_states :bool =False ,
     return_dict :bool =True ,
     cache_position :Optional [torch .Tensor ]=None ,
     )->Union [Tuple ,MoELlamaModelOutput ]:
         if inputs_embeds is None :
@@ -9207,6 +9215,39 @@ class MoELlamaModel (nn .Module ):
             if output_attentions and attn_weights is not None :
                 all_attentions =all_attentions +(attn_weights ,)
         hidden_states =self .norm (hidden_states )
         if output_hidden_states :
@@ -9315,6 +9356,7 @@ class MoELlamaForCausalLM (nn .Module ):
     output_hidden_states :bool =False ,
     return_dict :bool =True ,
     cache_position :Optional [torch .Tensor ]=None ,
     **kwargs ,
     )->Union [Tuple ,CausalLMOutput ]:
@@ -9329,6 +9371,7 @@ class MoELlamaForCausalLM (nn .Module ):
         output_hidden_states =output_hidden_states ,
         return_dict =True ,
         cache_position =cache_position ,
         )
         hidden_states =outputs .last_hidden_state
@@ -9379,12 +9422,14 @@ class MoELlamaForCausalLM (nn .Module ):
     pad_token_id :Optional [int ]=None ,
     eos_token_id :Optional [int ]=None ,
     attention_mask :Optional [torch .Tensor ]=None ,
     **kwargs ,
     )->torch .Tensor :
         batch_size =input_ids .shape [0 ]
         device =input_ids .device
         past_key_values =None
         if attention_mask is None :
             attention_mask =torch .ones_like (input_ids )
@@ -9396,7 +9441,10 @@ class MoELlamaForCausalLM (nn .Module ):
             attention_mask =attention_mask ,
             )
-            outputs =self .forward (**model_inputs ,use_cache =True ,return_dict =True )
             next_token_logits =outputs .logits [:,-1 ,:]
@@ -9660,67 +9708,85 @@ class XoronMultimodalModel (nn .Module ):
     def apply_model_parallel (self ,device_map :Dict [str ,str ]):
-        """Apply Model Parallelism by placing components on different devices."""
-        self .device_map =device_map
-        device_values =[v for v in device_map .values ()if isinstance (v ,str )]
-        self ._model_parallel =len (set (device_values ))>1
-        if not self ._model_parallel :
             logger .info ("  ℹ️ Single device - no model parallelism needed")
-            return self
-        logger .info ("Applying Model Parallelism...")
-        self .vision_encoder =self .vision_encoder .to (device_map ['vision_encoder'])
-        logger .info (f"  ✅ Vision encoder -> {device_map ['vision_encoder']}")
-        self .video_encoder =self .video_encoder .to (device_map ['video_encoder'])
-        logger .info (f"  ✅ Video encoder -> {device_map ['video_encoder']}")
-        self .audio_encoder =self .audio_encoder .to (device_map ['audio_encoder'])
-        logger .info (f"  ✅ Audio encoder -> {device_map ['audio_encoder']}")
-        self .audio_decoder =self .audio_decoder .to (device_map ['audio_decoder'])
-        logger .info (f"  ✅ Audio decoder -> {device_map ['audio_decoder']}")
-        if hasattr (self ,'waveform_decoder')and self .waveform_decoder is not None :
-            waveform_device =device_map .get ('waveform_decoder',device_map ['audio_decoder'])
-            self .waveform_decoder =self .waveform_decoder .to (waveform_device )
-            logger .info (f"  ✅ Waveform decoder -> {waveform_device }")
-        self .projector =self .projector .to (device_map ['projector'])
-        logger .info (f"  ✅ Projector -> {device_map ['projector']}")
-        self .audio_projector =self .audio_projector .to (device_map ['audio_projector'])
-        logger .info (f"  ✅ Audio projector -> {device_map ['audio_projector']}")
-        self .llm =self .llm .to (device_map ['llm'])
-        logger .info (f"  ✅ LLM -> {device_map ['llm']}")
-        if self .cross_attention_layers is not None :
-            self .cross_attention_layers =self .cross_attention_layers .to (device_map ['cross_attention'])
-            logger .info (f"  ✅ Cross-attention -> {device_map ['cross_attention']}")
-        if self .generator is not None :
-            self .generator =self .generator .to (device_map ['generator'])
-            logger .info (f"  ✅ Image generator -> {device_map ['generator']}")
-        if self .video_generator is not None :
-            self .video_generator =self .video_generator .to (device_map ['video_generator'])
-            logger .info (f"  ✅ Video generator -> {device_map ['video_generator']}")
-        marker_device =device_map ['modality_markers']
-        self .image_start =nn .Parameter (self .image_start .data .to (marker_device ))
-        self .image_end =nn .Parameter (self .image_end .data .to (marker_device ))
-        self .video_start =nn .Parameter (self .video_start .data .to (marker_device ))
-        self .video_end =nn .Parameter (self .video_end .data .to (marker_device ))
-        self .audio_start =nn .Parameter (self .audio_start .data .to (marker_device ))
-        self .audio_end =nn .Parameter (self .audio_end .data .to (marker_device ))
-        logger .info (f"  ✅ Modality markers -> {marker_device }")
         logger .info ("Model Parallelism applied successfully!")
         return self
@@ -10113,63 +10179,297 @@ class XoronMultimodalModel (nn .Module ):
     def listen_and_respond (
     self ,
     audio_waveform :torch .Tensor ,
-    max_new_tokens :int =256 ,
     speaker_embedding :torch .Tensor =None ,
-    )->torch .Tensor :
-        """
-        Full Speech-to-Speech: Listen to audio, generate text response, speak it back.
-        This is the main conversational method - you speak to it, it responds with voice.
-        Args:
-            audio_waveform: [B, T_audio] input audio (what you said)
-            max_new_tokens: Maximum tokens to generate for response
-            speaker_embedding: Optional speaker embedding for response voice
-        Returns:
-            response_audio: [B, T_response] audio waveform of the model's response
         """
-        device =audio_waveform .device
-        audio_embeds =self .listen (audio_waveform )
-        batch_size =audio_waveform .shape [0 ]
-        dummy_input =torch .zeros (batch_size ,1 ,dtype =torch .long ,device =device )
-        outputs =self .forward (
-        input_ids =dummy_input ,
-        audio_features =audio_waveform ,
-        )
-        response_embeds =outputs .get ('hidden_states',outputs .get ('last_hidden_state'))
-        if response_embeds is not None :
-            mel ,durations ,_ ,_ =self .audio_decoder (
             response_embeds ,
             speaker_embedding =speaker_embedding ,
-            )
-            mel_features =mel .transpose (1 ,2 )
-            if not hasattr (self ,'_mel_to_hidden'):
-                self ._mel_to_hidden =nn .Linear (80 ,self .config .hidden_size ).to (device )
-            audio_features =self ._mel_to_hidden (mel_features )
-            response_audio =self .waveform_decoder (audio_features )
-            return response_audio
-        return torch .zeros (batch_size ,16000 ,device =device )
     def merge_lora_weights (self ):
         """Merge LoRA weights into main weights for inference."""
@@ -11120,6 +11420,10 @@ XoronForCausalLM.register_for_auto_class("AutoModelForCausalLM")
         model =cls (config ,device_map =device_map )
         components_json =os .path .join (path ,"components.json")
         model_path =os .path .join (path ,"model.safetensors")
@@ -11127,7 +11431,7 @@ XoronForCausalLM.register_for_auto_class("AutoModelForCausalLM")
             logger .info ("Loading from component-based format...")
             model ._load_components (path ,strict =strict )
-            model .lora_applied =lora_was_applied
         elif os .path .exists (model_path ):
             logger .info ("Loading weights from safetensors...")
@@ -11143,7 +11447,7 @@ XoronForCausalLM.register_for_auto_class("AutoModelForCausalLM")
                 model .load_state_dict (checkpoint_state_dict ,strict =False )
                 logger .info ("Loaded weights from checkpoint")
-            model .lora_applied =lora_was_applied
         else :
             pytorch_path =os .path .join (path ,"pytorch_model.bin")
@@ -11154,7 +11458,7 @@ XoronForCausalLM.register_for_auto_class("AutoModelForCausalLM")
                 model .load_state_dict (checkpoint_state_dict ,strict =False )
                 logger .info ("Loaded weights from checkpoint")
-                model .lora_applied =lora_was_applied
             else :
                 raise FileNotFoundError (f"No model weights found at {path }")

         self .num_moe_layers =sum (1 for layer in self .layers if layer .is_moe_layer )
+        # ── Coconut: Continuous Thought components ──
+        # Learned gate controls how much recurrent thought vs original input
+        # to retain at each thinking step. Sigmoid output in [0,1].
+        self .thought_gate = nn .Linear (config .hidden_size , 1 , bias =True )
+        nn .init .constant_ (self .thought_gate .bias , -2.0 )  # Initialize gate biased toward original (sigmoid(-2)≈0.12)
+        self .thought_layernorm = LlamaRMSNorm (config .hidden_size , eps =config .rms_norm_eps )
         self ._init_weights ()
     def _init_weights (self ):
     output_hidden_states :bool =False ,
     return_dict :bool =True ,
     cache_position :Optional [torch .Tensor ]=None ,
+    thinking_depth :int =0 ,
     )->Union [Tuple ,MoELlamaModelOutput ]:
         if inputs_embeds is None :
             if output_attentions and attn_weights is not None :
                 all_attentions =all_attentions +(attn_weights ,)
+        # ── Coconut: Continuous Thought Loop ──
+        # After the normal pass, loop hidden states back through the
+        # transformer layers for extra computation in latent space.
+        # No tokens are decoded — pure continuous reasoning.
+        if thinking_depth > 0 :
+            original_hidden = hidden_states .clone ()
+            thought_position_ids = torch .arange (
+                seq_len , device =hidden_states .device
+            ).unsqueeze (0 ).expand (batch_size , -1 )
+            for thought_step in range (thinking_depth ):
+                # Normalize before re-entering the layers
+                hidden_states = self .thought_layernorm (hidden_states )
+                # Run through all layers again (no cache — full re-computation)
+                for layer in self .layers :
+                    hidden_states , _ , _ , step_aux = layer (
+                        hidden_states =hidden_states ,
+                        attention_mask =None ,  # Self-attend freely in thought space
+                        position_ids =thought_position_ids ,
+                        past_key_value =None ,
+                        output_attentions =False ,
+                        use_cache =False ,
+                    )
+                    if step_aux is not None :
+                        total_aux_loss = total_aux_loss + step_aux
+                # Gated residual: blend thought with original
+                # gate ∈ [0,1], initialized small so early training
+                # stays close to original behavior
+                gate = torch .sigmoid (self .thought_gate (hidden_states ))
+                hidden_states = gate * hidden_states + (1.0 - gate ) * original_hidden
         hidden_states =self .norm (hidden_states )
         if output_hidden_states :
     output_hidden_states :bool =False ,
     return_dict :bool =True ,
     cache_position :Optional [torch .Tensor ]=None ,
+    thinking_depth :int =0 ,
     **kwargs ,
     )->Union [Tuple ,CausalLMOutput ]:
         output_hidden_states =output_hidden_states ,
         return_dict =True ,
         cache_position =cache_position ,
+        thinking_depth =thinking_depth ,
         )
         hidden_states =outputs .last_hidden_state
     pad_token_id :Optional [int ]=None ,
     eos_token_id :Optional [int ]=None ,
     attention_mask :Optional [torch .Tensor ]=None ,
+    thinking_depth :int =0 ,
     **kwargs ,
     )->torch .Tensor :
         batch_size =input_ids .shape [0 ]
         device =input_ids .device
         past_key_values =None
+        is_prefill =True  # Deep thinking only on first pass (full context)
         if attention_mask is None :
             attention_mask =torch .ones_like (input_ids )
             attention_mask =attention_mask ,
             )
+            # Apply thinking depth only on prefill, not per-token steps
+            current_depth = thinking_depth if is_prefill else 0
+            outputs =self .forward (**model_inputs ,use_cache =True ,return_dict =True ,thinking_depth =current_depth )
+            is_prefill =False
             next_token_logits =outputs .logits [:,-1 ,:]
     def apply_model_parallel(self, device_map: Dict[str, Any]):
         """Place model components on devices according to ``device_map``.

         Recognised ``device_map`` keys:
             'training_gpus': list of device strings used for sharding
                 (defaults to ``['cuda:0']``; an empty list falls back to
                 ``[primary]``).
             'primary': device used for the modality markers
                 (defaults to ``'cuda:0'``).
             '<component>': ``'cpu'`` keeps that component on the CPU; any
                 other value shards it across ``training_gpus``. A component
                 that is missing from the map is treated as ``'cpu'``.
             'modality_markers': ``'cpu'`` keeps the markers on the CPU; any
                 other value (or no value) puts them on ``primary``.

         Sharding is round-robin over the largest ``nn.ModuleList`` that is a
         direct child of the component; everything else in the component goes
         to the first training GPU. Components without such a list are moved
         whole to the first training GPU.

         Returns:
             ``self``, so the call can be chained.
         """
         self.device_map = device_map
         primary = device_map.get('primary', 'cuda:0')
         # An explicitly empty list would otherwise cause a modulo-by-zero below.
         training_gpus = device_map.get('training_gpus', ['cuda:0']) or [primary]
         uses_cpu = any(
             value == 'cpu' for value in device_map.values() if isinstance(value, str)
         )
         if len(training_gpus) <= 1 and not uses_cpu:
             # Keep the flag defined on the early-return path as well.
             self._model_parallel = False
             logger.info("  ℹ️ Single device - no model parallelism needed")
             return self
         self._model_parallel = True
         logger.info("Applying Model Parallelism (layer sharding)...")

         def _shard_module(module, name, gpus):
             """Spread ``module``'s largest direct ModuleList across ``gpus``."""
             # named_children() only yields registered submodules, so no
             # arbitrary properties are evaluated while probing. Sorting by
             # name keeps tie-breaking between equally long lists deterministic.
             candidates = sorted(
                 (
                     (child_name, child)
                     for child_name, child in module.named_children()
                     if isinstance(child, nn.ModuleList) and len(child) > 0
                 ),
                 key=lambda item: item[0],
             )
             if not candidates:
                 # No layers to shard: the whole module lives on the first GPU.
                 module.to(gpus[0])
                 logger.info(f"  ✅ {name} -> {gpus[0]}")
                 return
             _, layers = max(candidates, key=lambda item: len(item[1]))
             for i, layer in enumerate(layers):
                 layer.to(gpus[i % len(gpus)])
             # Everything outside the sharded list goes to the first GPU.
             # Module.to() moves buffers as well as parameters.
             for child in module.children():
                 if child is not layers:
                     child.to(gpus[0])
             own_tensors = list(module.parameters(recurse=False))
             own_tensors.extend(module.buffers(recurse=False))
             for tensor in own_tensors:
                 tensor.data = tensor.data.to(gpus[0])
             logger.info(f"  ✅ {name}: {len(layers)} layers sharded across {gpus}")

         # device_map key -> attribute name on this model.
         component_attrs = {
             'vision_encoder': 'vision_encoder',
             'video_encoder': 'video_encoder',
             'audio_encoder': 'audio_encoder',
             'audio_decoder': 'audio_decoder',
             'waveform_decoder': 'waveform_decoder',
             'projector': 'projector',
             'audio_projector': 'audio_projector',
             'llm': 'llm',
             'cross_attention': 'cross_attention_layers',
             'generator': 'generator',
             'video_generator': 'video_generator',
         }
         for comp_name, attr_name in component_attrs.items():
             comp = getattr(self, attr_name, None)
             if comp is None:
                 continue
             if device_map.get(comp_name, 'cpu') == 'cpu':
                 comp.to('cpu')
                 logger.info(f"  ❄️ {comp_name} -> cpu (frozen)")
             else:
                 _shard_module(comp, comp_name, training_gpus)

         # Modality markers: CPU only when explicitly requested, else primary.
         if device_map.get('modality_markers', primary) == 'cpu':
             marker_device = 'cpu'
         else:
             marker_device = primary
         marker_names = (
             'image_start', 'image_end',
             'video_start', 'video_end',
             'audio_start', 'audio_end',
         )
         for marker_name in marker_names:
             marker = getattr(self, marker_name, None)
             if marker is not None:
                 # Move the storage in place so the Parameter object, its
                 # requires_grad flag and any optimizer reference stay valid.
                 marker.data = marker.data.to(marker_device)
         logger.info(f"  ✅ Modality markers -> {marker_device}")
         logger.info("Model Parallelism applied successfully!")
         return self
     def listen_and_respond(
         self,
         audio_waveform: torch.Tensor,
         tokenizer=None,
         max_new_tokens: int = 512,
         speaker_embedding: torch.Tensor = None,
         temperature: float = 0.7,
         top_p: float = 0.9,
         tool_executor=None,
         available_tools: list = None,
         system_prompt: str = None,
         max_tool_calls: int = 5,
     ) -> Dict[str, Any]:
         """Agentic speech-to-speech: listen, generate (optionally with tools), speak.

         Pipeline:
             1. Encode the input audio and wrap it in the audio start/end markers.
             2. Optionally prepend a system prompt (with tool descriptions) and
                append the assistant prompt; both need ``tokenizer``.
             3. Generate. With both ``tokenizer`` and ``tool_executor`` given,
                tokens are sampled one at a time and every completed
                ``<|tool_call|>...<|/tool_call|>`` span is parsed, executed and
                its result fed back into the context. Otherwise
                ``self.llm.generate`` is used and no tool calls are detected.
             4. Strip tool-call / tool-result markup to get the speaking text.
             5. Synthesize a waveform through the audio decoder and the
                waveform decoder's ``stream_decode``.

         Args:
             audio_waveform: Input audio; the first dimension is the batch.
             tokenizer: Tokenizer exposing ``encode``, ``decode``,
                 ``batch_decode`` and ``eos_token_id``. Needed for prompts,
                 text output and tool calling.
             max_new_tokens: Maximum number of sampled tokens (injected tool
                 results are not counted).
             speaker_embedding: Optional speaker embedding passed to the audio
                 decoder.
             temperature: Sampling temperature; ``<= 0`` selects greedy decoding
                 in the tool-calling path.
             top_p: Nucleus sampling threshold.
             tool_executor: ``Callable(tool_name, args) -> str``. May provide
                 ``get_tool_prompt()`` to describe its tools.
             available_tools: Tool definitions for the system prompt, used
                 when ``tool_executor`` has no ``get_tool_prompt``.
             system_prompt: Optional system prompt override.
             max_tool_calls: Generation stops after this many tool calls.

         Returns:
             Dict with:
                 'waveform': output of ``waveform_decoder.stream_decode``.
                 'text': full generated text, including tool markup and results.
                 'speaking_text': text with tool markup, results and the EOS
                     marker removed.
                 'token_ids': generated token IDs (``[1, T]`` in the
                     tool-calling path, as returned by ``generate`` otherwise).
                 'mel': mel spectrogram returned by the audio decoder.
                 'tool_calls': list of ``{'name', 'arguments', 'result'}`` dicts.

         Raises:
             ValueError: If tool calling is requested with a batch size other
                 than 1 (the token-by-token loop decodes a single sequence).
         """
         import json as _json

         tool_call_start_token = "<|tool_call|>"
         tool_call_end_token = "<|/tool_call|>"
         fn_name_start = "<|function_name|>"
         fn_name_end = "<|/function_name|>"
         fn_args_start = "<|function_args|>"
         fn_args_end = "<|/function_args|>"
         tool_result_start = "<|tool_result|>"
         tool_result_end = "<|/tool_result|>"
         eos_token = "<|eos|>"

         batch_size = audio_waveform.shape[0]
         llm_device = self.get_llm_device()
         embed_tokens = self.llm.model.embed_tokens

         def embed_text(text):
             """Return (token ids, batch-expanded 3-D embeddings) for ``text``."""
             ids = tokenizer.encode(text, return_tensors="pt").to(llm_device)
             embeds = embed_tokens(ids)
             if embeds.dim() == 2:
                 embeds = embeds.unsqueeze(0)
             # Keep the batch dimension so the result can be concatenated with
             # the 3-D audio embeddings along the sequence axis.
             return ids, embeds.expand(batch_size, -1, -1)

         # ── 1. Listen: encode input audio and wrap with markers ──
         audio_embeds = self.encode_audio(audio_waveform).to(llm_device)
         audio_start = self.audio_start.expand(batch_size, -1, -1).to(llm_device)
         audio_end = self.audio_end.expand(batch_size, -1, -1).to(llm_device)

         # ── 2. Build context: [system prompt] + audio + [assistant prompt] ──
         context_parts = []
         if tokenizer is not None and (system_prompt or tool_executor):
             sys_text = system_prompt or "You are Xoron, an intelligent voice assistant. You can use tools to help the user."
             if tool_executor and hasattr(tool_executor, 'get_tool_prompt'):
                 sys_text = sys_text + "\n\n" + tool_executor.get_tool_prompt()
             elif available_tools:
                 from utils.tool_executor import format_tools_for_prompt
                 sys_text = sys_text + "\n\n" + format_tools_for_prompt(available_tools)
             context_parts.append(embed_text("<|system|>" + sys_text + "<|/system|>")[1])
         context_parts.extend([audio_start, audio_embeds, audio_end])
         if tokenizer is not None:
             context_parts.append(embed_text("<|assistant|>")[1])
         input_embeds = torch.cat(context_parts, dim=1)

         # ── 3. Generation ──
         tool_calls_made = []
         generated_text = ""
         if tool_executor is None or tokenizer is None:
             # Plain generation: no tool-call detection.
             generated_ids = self.llm.generate(
                 inputs_embeds=input_embeds,
                 max_new_tokens=max_new_tokens,
                 do_sample=True,
                 temperature=temperature,
                 top_p=top_p,
                 use_cache=True,
             )
             if generated_ids.dim() == 1:
                 generated_ids = generated_ids.unsqueeze(0)
             if tokenizer is not None:
                 generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
         else:
             if batch_size != 1:
                 raise ValueError(
                     "listen_and_respond supports tool calling only for batch size 1, "
                     f"got {batch_size}"
                 )
             id_pieces = []
             # Every embedding fed so far, kept so the context can be rebuilt
             # after a tool result is injected.
             embed_pieces = [input_embeds]
             current_embeds = input_embeds
             past_key_values = None
             scan_from = 0  # text before this index was already checked for tool calls
             total_tokens = 0
             while total_tokens < max_new_tokens:
                 outputs = self.llm(
                     inputs_embeds=current_embeds,
                     past_key_values=past_key_values,
                     use_cache=True,
                 )
                 past_key_values = outputs.past_key_values
                 next_token = self._sample_token_top_p(
                     outputs.logits[:, -1, :], temperature, top_p
                 )
                 total_tokens += 1
                 id_pieces.append(next_token.reshape(-1))
                 token_text = tokenizer.decode(next_token[0], skip_special_tokens=False)
                 generated_text = generated_text + token_text
                 if eos_token in token_text or next_token.item() == tokenizer.eos_token_id:
                     break

                 token_embeds = embed_tokens(next_token)
                 embed_pieces.append(token_embeds)
                 current_embeds = token_embeds

                 # ── Tool call detection (only in text not yet handled) ──
                 call_start = generated_text.find(tool_call_start_token, scan_from)
                 if call_start == -1:
                     continue
                 call_end = generated_text.find(tool_call_end_token, call_start)
                 if call_end == -1:
                     continue  # call still being generated

                 call_text = generated_text[call_start:call_end]
                 tool_name = self._extract_tagged_span(call_text, fn_name_start, fn_name_end) or ""
                 tool_args = {}
                 args_str = self._extract_tagged_span(call_text, fn_args_start, fn_args_end)
                 if args_str is not None:
                     try:
                         tool_args = _json.loads(args_str)
                     except (ValueError, RecursionError):
                         # Not valid JSON: hand the raw text to the tool.
                         tool_args = {"raw": args_str}

                 if tool_name:
                     tool_result = str(tool_executor(tool_name, tool_args))
                 else:
                     tool_result = "[error]: Failed to parse tool call"
                 tool_calls_made.append({
                     "name": tool_name,
                     "arguments": tool_args,
                     "result": tool_result,
                 })

                 result_str = tool_result_start + tool_result + tool_result_end
                 result_ids, result_embeds = embed_text(result_str)
                 id_pieces.append(result_ids.reshape(-1))
                 generated_text = generated_text + result_str
                 scan_from = len(generated_text)
                 if len(tool_calls_made) >= max_tool_calls:
                     break
                 # Re-run the model over the whole context plus the tool
                 # result with a fresh cache, so nothing said so far is lost.
                 embed_pieces.append(result_embeds)
                 current_embeds = torch.cat(embed_pieces, dim=1)
                 embed_pieces = [current_embeds]
                 past_key_values = None

             if id_pieces:
                 generated_ids = torch.cat(id_pieces, dim=0).unsqueeze(0)
             else:
                 generated_ids = torch.empty((1, 0), dtype=torch.long, device=llm_device)

         # ── 4. Extract speaking text (strip tool call/result markup) ──
         speaking_text = self._strip_tagged_blocks(
             generated_text, tool_call_start_token, tool_call_end_token
         )
         speaking_text = self._strip_tagged_blocks(
             speaking_text, tool_result_start, tool_result_end
         )
         speaking_text = speaking_text.replace(eos_token, "").strip()

         # ── 5. Speak: embed → mel → stream_decode → waveform ──
         speech_ids = generated_ids
         if tool_calls_made and speaking_text:
             # Do not voice tool markup or raw tool output: speak the cleaned
             # text instead of the raw generated IDs.
             spoken_ids = tokenizer.encode(speaking_text, return_tensors="pt")
             if spoken_ids.numel() > 0:
                 speech_ids = spoken_ids
         response_embeds = embed_tokens(speech_ids.to(llm_device))
         mel, _durations, _, _ = self.audio_decoder(
             response_embeds,
             speaker_embedding=speaker_embedding,
         )
         mel_features = mel.transpose(1, 2)
         if not hasattr(self, '_mel_to_hidden'):
             # NOTE(review): this projection is created on first use with
             # default initialization; confirm trained weights are loaded
             # into it elsewhere.
             self._mel_to_hidden = nn.Linear(80, self.config.hidden_size).to(mel.device)
         audio_features = self._mel_to_hidden(mel_features)
         waveform = self.waveform_decoder.stream_decode(audio_features)
         return {
             'waveform': waveform,
             'text': generated_text,
             'speaking_text': speaking_text,
             'token_ids': generated_ids,
             'mel': mel,
             'tool_calls': tool_calls_made,
         }

     @staticmethod
     def _sample_token_top_p(logits, temperature, top_p):
         """Pick one token per row of ``logits`` ([B, V]); returns ``[B, 1]`` ids."""
         if temperature <= 0:
             return logits.argmax(dim=-1, keepdim=True)
         logits = logits / temperature
         if top_p >= 1.0:
             return torch.multinomial(torch.softmax(logits, dim=-1), num_samples=1)
         sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
         sorted_probs = torch.softmax(sorted_logits, dim=-1)
         # Drop a token once the probability mass before it already reaches top_p.
         drop = (torch.cumsum(sorted_probs, dim=-1) - sorted_probs) >= top_p
         drop[..., 0] = False  # always keep the most likely token
         kept_logits = sorted_logits.masked_fill(drop, float('-inf'))
         choice = torch.multinomial(torch.softmax(kept_logits, dim=-1), num_samples=1)
         return sorted_indices.gather(-1, choice)

     @staticmethod
     def _extract_tagged_span(text, start_tag, end_tag):
         """Return the stripped text between the first tag pair, or None if absent."""
         start = text.find(start_tag)
         if start == -1:
             return None
         start += len(start_tag)
         end = text.find(end_tag, start)
         if end == -1:
             return None
         return text[start:end].strip()

     @staticmethod
     def _strip_tagged_blocks(text, start_tag, end_tag):
         """Remove every complete ``start_tag ... end_tag`` block from ``text``."""
         kept = []
         cursor = 0
         while True:
             start = text.find(start_tag, cursor)
             if start == -1:
                 break
             end = text.find(end_tag, start + len(start_tag))
             if end == -1:
                 break  # unterminated block: leave the remainder untouched
             kept.append(text[cursor:start])
             cursor = end + len(end_tag)
         kept.append(text[cursor:])
         return "".join(kept)
     def merge_lora_weights (self ):
         """Merge LoRA weights into main weights for inference."""
         model =cls (config ,device_map =device_map )
+        if lora_was_applied:
+            logger .info ("Checkpoint has LoRA weights. Applying LoRA structure before loading...")
+            model .apply_lora ()
         components_json =os .path .join (path ,"components.json")
         model_path =os .path .join (path ,"model.safetensors")
             logger .info ("Loading from component-based format...")
             model ._load_components (path ,strict =strict )
+            model .lora_applied =False  # Always allow fresh LoRA application (checkpoint has merged weights)
         elif os .path .exists (model_path ):
             logger .info ("Loading weights from safetensors...")
                 model .load_state_dict (checkpoint_state_dict ,strict =False )
                 logger .info ("Loaded weights from checkpoint")
+            model .lora_applied =False  # Always allow fresh LoRA application (checkpoint has merged weights)
         else :
             pytorch_path =os .path .join (path ,"pytorch_model.bin")
                 model .load_state_dict (checkpoint_state_dict ,strict =False )
                 logger .info ("Loaded weights from checkpoint")
+                model .lora_applied =False  # Always allow fresh LoRA application (checkpoint has merged weights)
             else :
                 raise FileNotFoundError (f"No model weights found at {path }")

streaming_state.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
-  "epoch": 135,
-  "unique_samples": 350,
-  "total_yields": 700,
   "dataset_positions": {
     "WebSight": 386,
     "ScienceQA": 364,
@@ -30,7 +30,7 @@
     "NoRobots": 800,
     "Synth-LanguageSetup": 200,
     "Function-Calling-ChatML": 200,
-    "Synth-CoT": 550,
     "Python-Code-18k": 200,
     "Code-Feedback": 200,
     "HumanEval-CPP": 164,
@@ -97,7 +97,9 @@
     "Cosmopedia-OpenStax": 600,
     "MedMCQA": 650,
     "Medical-Reasoning-SFT-Mega": 650,
-    "Medical-O1-Reasoning-EN": 650
   },
   "modality_positions": {
     "text": {
@@ -154,7 +156,10 @@
       "Synth-FactCheck": 550,
       "Synth-ConfidenceLevel": 550,
       "Synth-Citation": 550,
-      "Synth-Uncertainty": 550
     },
     "image": {
       "WebSight": 386,
@@ -179,10 +184,11 @@
     "audio": {}
   },
   "modality_counts": {
-    "text": 350,
     "image": 0,
     "video": 0,
-    "audio": 0
   },
   "last_modality": null
 }

 {
+  "epoch": 148,
+  "unique_samples": 150,
+  "total_yields": 300,
   "dataset_positions": {
     "WebSight": 386,
     "ScienceQA": 364,
     "NoRobots": 800,
     "Synth-LanguageSetup": 200,
     "Function-Calling-ChatML": 200,
+    "Synth-CoT": 900,
     "Python-Code-18k": 200,
     "Code-Feedback": 200,
     "HumanEval-CPP": 164,
     "Cosmopedia-OpenStax": 600,
     "MedMCQA": 650,
     "Medical-Reasoning-SFT-Mega": 650,
+    "Medical-O1-Reasoning-EN": 650,
+    "OpenThoughts-114k": 350,
+    "Bespoke-Stratos-17k": 350
   },
   "modality_positions": {
     "text": {
       "Synth-FactCheck": 550,
       "Synth-ConfidenceLevel": 550,
       "Synth-Citation": 550,
+      "Synth-Uncertainty": 550,
+      "OpenThoughts-114k": 350,
+      "Bespoke-Stratos-17k": 350,
+      "Synth-CoT": 900
     },
     "image": {
       "WebSight": 386,
     "audio": {}
   },
   "modality_counts": {
+    "text": 0,
     "image": 0,
     "video": 0,
+    "audio": 0,
+    "reasoning": 150
   },
   "last_modality": null
 }

trainer_state.json CHANGED Viewed

@@ -1,14 +1,14 @@
 {
   "best_model_checkpoint": "/kaggle/working/xoron-final",
-  "best_metric": 6.622317645549774,
   "epoch": 7,
   "epochs_completed": 7,
-  "global_step": 301,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [],
   "logging_steps": 50,
-  "max_steps": 301,
   "num_train_epochs": 7,
   "total_flos": 0,
   "train_batch_size": 1,

 {
   "best_model_checkpoint": "/kaggle/working/xoron-final",
+  "best_metric": 4.672067043383916,
   "epoch": 7,
   "epochs_completed": 7,
+  "global_step": 126,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [],
   "logging_steps": 50,
+  "max_steps": 126,
   "num_train_epochs": 7,
   "total_flos": 0,
   "train_batch_size": 1,

training_state.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a89e9a0652c7c060ae5d2f1211f9a8ce9e301009c1282faa827cfb44a01e4db3
-size 1514912171

 version https://git-lfs.github.com/spec/v1
+oid sha256:681e7dfcad848ba9070f7c69e68e4517dd623592df17ce9d5e050436701a3611
+size 1514916733