Spaces:

mclemcrew
/

CoMix-Demo

Sleeping

App Files Files Community

mclemcrew committed on Mar 25, 2025

Commit

d11be63

1 Parent(s): ed7ce23

updates

Browse files

Files changed (1) hide show

app.py +27 -14

app.py CHANGED Viewed

@@ -128,19 +128,29 @@ def generate_response(audio_path, message, chat_history=None):
                     conversation.append({"role": "assistant", "content": bot_msg})
         if audio_path:
-            # Match the format that was working in the original code
-            # Create a placeholder audio URL to ensure the <|AUDIO|> token is generated
-            conversation.append({
-                "role": "user",
-                "content": [
-                    {"type": "audio", "audio_url": "placeholder_audio_url"},
-                    {"type": "text", "text": message}
-                ]
-            })
             audio_data = process_audio_file(audio_path)
             if audio_data is not None:
                 audios.append(audio_data)
         else:
             conversation.append({"role": "user", "content": message})
@@ -158,12 +168,17 @@ def generate_response(audio_path, message, chat_history=None):
             truncation=True
         )
         # Move inputs to the same device as model
         device = next(model.parameters()).device
         inputs = {k: v.to(device) for k, v in inputs.items()}
-        logger.info(f"Processor output: {inputs}")
         log_gpu_memory("Before generation")
         logger.info("Generating response")
@@ -178,8 +193,6 @@ def generate_response(audio_path, message, chat_history=None):
                 pad_token_id=processor.tokenizer.pad_token_id
             )
-        logger.info(f"Model output: {output}")
         generated_text = processor.batch_decode(
             output[:, inputs["input_ids"].shape[1]:],
             skip_special_tokens=True

                     conversation.append({"role": "assistant", "content": bot_msg})
         if audio_path:
+            # Process the audio first to ensure it's valid
             audio_data = process_audio_file(audio_path)
             if audio_data is not None:
+                # Log audio properties to verify it's loaded correctly
+                logger.info(f"Audio data loaded: length={len(audio_data)}, dtype={audio_data.dtype}, "
+                           f"min={np.min(audio_data)}, max={np.max(audio_data)}, "
+                           f"contains_nan={np.isnan(audio_data).any()}")
+                # Store audio data for processing
                 audios.append(audio_data)
+                # Create conversation entry with audio
+                conversation.append({
+                    "role": "user",
+                    "content": [
+                        {"type": "audio", "audio_url": "placeholder_audio_url"},
+                        {"type": "text", "text": message}
+                    ]
+                })
+            else:
+                logger.error("Failed to process audio file, continuing without audio")
+                conversation.append({"role": "user", "content": message})
         else:
             conversation.append({"role": "user", "content": message})
             truncation=True
         )
+        # Verify audio was included in inputs
+        logger.info(f"Inputs keys: {inputs.keys()}")
+        if 'audio_features' in inputs:
+            logger.info(f"Audio features shape: {inputs['audio_features'].shape}")
+        else:
+            logger.warning("No audio_features in processor output!")
         # Move inputs to the same device as model
         device = next(model.parameters()).device
         inputs = {k: v.to(device) for k, v in inputs.items()}
         log_gpu_memory("Before generation")
         logger.info("Generating response")
                 pad_token_id=processor.tokenizer.pad_token_id
             )
         generated_text = processor.batch_decode(
             output[:, inputs["input_ids"].shape[1]:],
             skip_special_tokens=True